#### Imports

In [4]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.utils import class_weight
from sklearn.model_selection import train_test_split
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping

from helpers import *


#### Helpers

In [5]:
def loadDataset(path):
    return pd.read_csv(path)

def getMatchResult(row, labels):
    if row[labels[0]] > row[labels[1]]:
        return 'H'
    elif row[labels[0]] < row[labels[1]]:
        return 'A'
    else:
        return 'D'

def predictMatch(home, away):
    # Encode home and away teams separately using their encoders
    home_encoded = le_home_encoder.transform([[home]]).toarray()
    away_encoded = le_away_encoder.transform([[away]]).toarray()
    
    # Combine the team encoding
    team_input = np.hstack((home_encoded, away_encoded))
    
    # Compute numeric features
    home_shots = home_avg_shots.get(home, 0)
    away_shots = away_avg_shots.get(away, 0)
    home_corners = home_avg_corners.get(home, 0)
    away_corners = away_avg_corners.get(away, 0)

    numeric_input = scaler.transform([[home_shots, away_shots, home_corners, away_corners]])
    
    # Combine both team and numeric inputs
    input_data = np.hstack((team_input, numeric_input))

    # Predict
    pred_probs = model.predict(input_data)[0]
    label_order = le_target.transform(['H', 'D', 'A']) 

    # Output
    print(f"\n{home} vs {away}")
    print(f"Home win: {pred_probs[label_order[0]]*100:.2f}%")
    print(f"Draw:     {pred_probs[label_order[1]]*100:.2f}%")
    print(f"Away win: {pred_probs[label_order[2]]*100:.2f}%")


#### Parameters & Dataset

In [6]:
# dataset_path = "datasets/worldcup/matches_1930_2022.csv"
dataset_path = "datasets/laliga/combined_data_laliga.csv"
dataset = dataset_path.split('/')[1]

test_ratio = 0.4
labels = getColumnLabels(dataset)

df = loadDataset(dataset_path)
df['result'] = df.apply(getMatchResult, args=(labels[0], ), axis=1)

#### Encoding

In [7]:
home_avg_shots = df.groupby("Home Team")["Home Team Total Shots"].mean()
away_avg_shots = df.groupby("Away Team")["Away Team Total Shots"].mean()
home_avg_corners = df.groupby("Home Team")["Home Team Corners"].mean()
away_avg_corners = df.groupby("Away Team")["Away Team Corners"].mean()

df['home_avg_shots'] = df["Home Team"].map(home_avg_shots)
df['away_avg_shots'] = df["Away Team"].map(away_avg_shots)
df['home_avg_corners'] = df["Home Team"].map(home_avg_corners)
df['away_avg_corners'] = df["Away Team"].map(away_avg_corners)

# === Prepare Features ===
team_features = df[["home_avg_shots", "away_avg_shots", "home_avg_corners", "away_avg_corners"]].values
scaler = StandardScaler()
scaled_team_features = scaler.fit_transform(team_features)

le_home_encoder = OneHotEncoder(handle_unknown='ignore')
le_home_encoded = le_home_encoder.fit(df[[labels[1][0]]])
le_home = le_home_encoded.transform(df[[labels[1][0]]]).toarray()

le_away_encoder = OneHotEncoder(handle_unknown='ignore')
le_away_encoded = le_away_encoder.fit(df[[labels[1][1]]])
le_away = le_away_encoded.transform(df[[labels[1][1]]]).toarray()

# Combine all features
X = np.hstack((le_home, le_away, scaled_team_features))

# Encode targets
le_target = LabelEncoder()
Y = le_target.fit_transform(df['result'])

# === Train-test Split ===
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.4, random_state=42)


#### Model definition

In [8]:
model = Sequential()
model.add(Dense(128, activation='sigmoid', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.3))
model.add(Dense(64, activation='sigmoid'))
model.add(Dropout(0.3))
model.add(Dense(32, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(3, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


#### Model compilation and fitting

In [9]:
# es_callback = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=le_target.classes_,
    y=df['result']
)

class_weight_dict = dict(zip(le_target.transform(le_target.classes_), class_weights))

model.compile(optimizer='adam', 
              loss='sparse_categorical_crossentropy', 
              metrics=['accuracy'])

model.fit(X_train, Y_train, epochs = 100, 
          batch_size = 16, validation_split=0.5,
          class_weight=class_weight_dict
          )

Epoch 1/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 31ms/step - accuracy: 0.3540 - loss: 1.1653 - val_accuracy: 0.2682 - val_loss: 1.0960
Epoch 2/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.3418 - loss: 1.1328 - val_accuracy: 0.2682 - val_loss: 1.1144
Epoch 3/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.3588 - loss: 1.1215 - val_accuracy: 0.4950 - val_loss: 1.0706
Epoch 4/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.3481 - loss: 1.1167 - val_accuracy: 0.5113 - val_loss: 1.0639
Epoch 5/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.3913 - loss: 1.0777 - val_accuracy: 0.4123 - val_loss: 1.0764
Epoch 6/100
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.3450 - loss: 1.1209 - val_accuracy: 0.4085 - val_loss: 1.0750
Epoch 7/100
[1m50/50[0m [

<keras.src.callbacks.history.History at 0x238f5bcadb0>

#### Prediction

In [15]:
predictMatch("BARCELONA", "REAL MADRID")
predictMatch("REAL MADRID", "BARCELONA")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step

BARCELONA vs REAL MADRID
Home win: 79.60%
Draw:     12.07%
Away win: 8.33%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 92ms/step





REAL MADRID vs BARCELONA
Home win: 47.06%
Draw:     18.40%
Away win: 34.54%


#### Tests & Data validation

In [16]:
df['result'].value_counts(normalize=True)

result
H    0.456391
A    0.287218
D    0.256391
Name: proportion, dtype: float64