In [1]:
import csv
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

import pickle

from constants.constants import *
import json

# Embeds graphs in Jupyter notebook (instead of pop-ups)
%matplotlib inline

# pd.set_option('display.max_columns', None) # show all columns

Train and compare several classifiers (Random Forest, Logistic Regression, SVM, and others) on the Boruta-selected features to predict match results (win, draw, or loss).

### Note: If the Boruta feature list or the pickled preprocessor does not exist, please run feature_engineering.ipynb first

In [3]:
# Feature names kept by the Boruta selection step.
with open(BORUTA_FEATURES_PATH, 'r') as features_file:
    selected_features = json.load(features_file)

# SECURITY: pickle.load executes arbitrary code while deserialising; only load
# a processor.pkl that this project produced itself.
with open('data/machine_learning/pkl/boruta/processor.pkl', 'rb') as processor_file:
    preprocessor = pickle.load(processor_file)

# The preprocessed frames still contain team details, so they are narrowed to
# the selected feature columns before being handed to a model.
(
    df_train_preprocessed,
    df_test_preprocessed,
    y_train_encoded,
    y_test_encoded,
) = preprocessor.get_processed_data()
df_train_preprocessed_features = df_train_preprocessed[selected_features]
df_test_preprocessed_features = df_test_preprocessed[selected_features]

In [4]:
# Keep the two identifying columns plus the selected features, in that order,
# so the feature columns line up with the training frame.
prediction_columns = ['team', 'opponent', *selected_features]
prediction_df = pd.read_csv('data/machine_learning/2023_predict.csv')[prediction_columns]

### Helper Function

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, X_train, y_train, X_test, y_test, df_2023, target_encoder=None):
    """Fit a classifier, print hold-out metrics, and export its predictions for ``df_2023``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and encoded targets.
        X_test, y_test: Hold-out features and encoded targets used for the metrics.
        df_2023: Frame with ``team``, ``opponent`` and ``is_home`` columns; every
            column other than ``team`` and ``opponent`` is passed to ``model.predict``.
        target_encoder: Object whose ``inverse_transform`` maps encoded predictions
            back to result labels. Defaults to the notebook-level
            ``preprocessor.target_encoder``.

    Raises:
        KeyError: If ``df_2023`` lacks one of the required columns.

    Side effects:
        Prints the metrics, displays the prediction table and writes it to
        ``./data/machine_learning/predictions/<ModelClassName>.csv``.
    """
    from pathlib import Path  # local import: the notebook's import cell does not provide it

    # Fail before the (potentially slow) fit rather than after it.
    missing = [col for col in ('team', 'opponent', 'is_home') if col not in df_2023.columns]
    if missing:
        raise KeyError(f"df_2023 is missing required columns: {missing}")

    if target_encoder is None:
        # Backward-compatible default: the encoder carried by the global preprocessor.
        target_encoder = preprocessor.target_encoder

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    model_name = model.__class__.__name__

    accuracy = accuracy_score(y_test, predictions)
    # zero_division=1 substitutes 1 wherever a per-class score is undefined
    # (e.g. a class that is never predicted), which can lift the weighted averages.
    precision = precision_score(y_test, predictions, average='weighted', zero_division=1)
    recall = recall_score(y_test, predictions, average='weighted', zero_division=1)
    f1 = f1_score(y_test, predictions, average='weighted', zero_division=1)

    print(f"Model: {model_name}")
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)

    # NOTE(review): df_2023 is fed to the model as-is; confirm its feature columns
    # went through the same preprocessing as X_train.
    predictions_2023 = model.predict(df_2023.drop(['team', 'opponent'], axis=1))
    predicted_labels_2023 = target_encoder.inverse_transform(predictions_2023)

    # Work on a copy so the caller's frame does not gain a prediction column.
    df_2023_copy = df_2023.copy()
    df_2023_copy['predicted_result'] = predicted_labels_2023
    final_df = df_2023_copy[['team', 'opponent', 'is_home', 'predicted_result']]
    display(final_df)

    # Create the output directory on demand so a fresh checkout does not fail
    # on the write after the model has already been trained.
    output_path = Path(f"./data/machine_learning/predictions/{model_name}.csv")
    output_path.parent.mkdir(parents=True, exist_ok=True)
    final_df.to_csv(output_path, index=False)

### Random Forest Classifier

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Seeded so repeated runs give the same forest.
evaluate_model(
    RandomForestClassifier(random_state=42),
    X_train=df_train_preprocessed_features,
    y_train=y_train_encoded,
    X_test=df_test_preprocessed_features,
    y_test=y_test_encoded,
    df_2023=prediction_df,
)

Model: RandomForestClassifier
Accuracy: 0.456811797752809
Precision: 0.4453594198549805
Recall: 0.456811797752809
F1 Score: 0.4491029543119585


Unnamed: 0,team,opponent,is_home,predicted_result
0,Burnley,Manchester City,1,W
1,Arsenal,Nottingham,1,W
2,Bournemouth,West Ham,1,W
3,Sheffield Utd,Crystal Palace,1,W
4,Brighton,Luton,1,W
...,...,...,...,...
3497,Clermont,Lorient,0,W
3498,Strasbourg,Lyon,0,L
3499,PSG,Metz,0,W
3500,Nantes,Monaco,0,L


### Logistic Regression

In [7]:
from sklearn.linear_model import LogisticRegression

# max_iter is raised well above the default to give the solver room to converge.
evaluate_model(
    LogisticRegression(random_state=42, max_iter=10000),
    X_train=df_train_preprocessed_features,
    y_train=y_train_encoded,
    X_test=df_test_preprocessed_features,
    y_test=y_test_encoded,
    df_2023=prediction_df,
)

Model: LogisticRegression
Accuracy: 0.5105337078651685
Precision: 0.47482294747384823
Recall: 0.5105337078651685
F1 Score: 0.44431977538841344


Unnamed: 0,team,opponent,is_home,predicted_result
0,Burnley,Manchester City,1,L
1,Arsenal,Nottingham,1,W
2,Bournemouth,West Ham,1,L
3,Sheffield Utd,Crystal Palace,1,L
4,Brighton,Luton,1,W
...,...,...,...,...
3497,Clermont,Lorient,0,L
3498,Strasbourg,Lyon,0,L
3499,PSG,Metz,0,W
3500,Nantes,Monaco,0,L


### Support Vector Machines

In [8]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
evaluate_model(
    SVC(kernel='linear', random_state=42),
    X_train=df_train_preprocessed_features,
    y_train=y_train_encoded,
    X_test=df_test_preprocessed_features,
    y_test=y_test_encoded,
    df_2023=prediction_df,
)

Model: SVC
Accuracy: 0.5101825842696629
Precision: 0.6334582712456719
Recall: 0.5101825842696629
F1 Score: 0.4366782059958706


Unnamed: 0,team,opponent,is_home,predicted_result
0,Burnley,Manchester City,1,L
1,Arsenal,Nottingham,1,W
2,Bournemouth,West Ham,1,D
3,Sheffield Utd,Crystal Palace,1,L
4,Brighton,Luton,1,W
...,...,...,...,...
3497,Clermont,Lorient,0,L
3498,Strasbourg,Lyon,0,L
3499,PSG,Metz,0,W
3500,Nantes,Monaco,0,L


### Decision Tree

In [9]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree, seeded for repeatable splits.
evaluate_model(
    DecisionTreeClassifier(random_state=42),
    X_train=df_train_preprocessed_features,
    y_train=y_train_encoded,
    X_test=df_test_preprocessed_features,
    y_test=y_test_encoded,
    df_2023=prediction_df,
)

Model: DecisionTreeClassifier
Accuracy: 0.39325842696629215
Precision: 0.3974035576622274
Recall: 0.39325842696629215
F1 Score: 0.3951429017120973


Unnamed: 0,team,opponent,is_home,predicted_result
0,Burnley,Manchester City,1,W
1,Arsenal,Nottingham,1,W
2,Bournemouth,West Ham,1,W
3,Sheffield Utd,Crystal Palace,1,W
4,Brighton,Luton,1,W
...,...,...,...,...
3497,Clermont,Lorient,0,D
3498,Strasbourg,Lyon,0,D
3499,PSG,Metz,0,D
3500,Nantes,Monaco,0,D


### K-Neighbours

In [10]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with library default settings.
evaluate_model(
    KNeighborsClassifier(),
    X_train=df_train_preprocessed_features,
    y_train=y_train_encoded,
    X_test=df_test_preprocessed_features,
    y_test=y_test_encoded,
    df_2023=prediction_df,
)

Model: KNeighborsClassifier
Accuracy: 0.4336376404494382
Precision: 0.4404733987958723
Recall: 0.4336376404494382
F1 Score: 0.43548007213087137


Unnamed: 0,team,opponent,is_home,predicted_result
0,Burnley,Manchester City,1,L
1,Arsenal,Nottingham,1,W
2,Bournemouth,West Ham,1,L
3,Sheffield Utd,Crystal Palace,1,L
4,Brighton,Luton,1,W
...,...,...,...,...
3497,Clermont,Lorient,0,L
3498,Strasbourg,Lyon,0,L
3499,PSG,Metz,0,W
3500,Nantes,Monaco,0,L


### Naive Bayes

In [11]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with library default settings.
evaluate_model(
    GaussianNB(),
    X_train=df_train_preprocessed_features,
    y_train=y_train_encoded,
    X_test=df_test_preprocessed_features,
    y_test=y_test_encoded,
    df_2023=prediction_df,
)

Model: GaussianNB
Accuracy: 0.4469803370786517
Precision: 0.4758330031381299
Recall: 0.4469803370786517
F1 Score: 0.4556111638571228


Unnamed: 0,team,opponent,is_home,predicted_result
0,Burnley,Manchester City,1,L
1,Arsenal,Nottingham,1,W
2,Bournemouth,West Ham,1,L
3,Sheffield Utd,Crystal Palace,1,L
4,Brighton,Luton,1,W
...,...,...,...,...
3497,Clermont,Lorient,0,L
3498,Strasbourg,Lyon,0,L
3499,PSG,Metz,0,W
3500,Nantes,Monaco,0,L


### Gradient Boosting

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient-boosted trees, seeded for repeatable results.
evaluate_model(
    GradientBoostingClassifier(random_state=42),
    X_train=df_train_preprocessed_features,
    y_train=y_train_encoded,
    X_test=df_test_preprocessed_features,
    y_test=y_test_encoded,
    df_2023=prediction_df,
)

Model: GradientBoostingClassifier
Accuracy: 0.5158005617977528
Precision: 0.48195014647256845
Recall: 0.5158005617977528
F1 Score: 0.4672849819976799


Unnamed: 0,team,opponent,is_home,predicted_result
0,Burnley,Manchester City,1,L
1,Arsenal,Nottingham,1,L
2,Bournemouth,West Ham,1,L
3,Sheffield Utd,Crystal Palace,1,L
4,Brighton,Luton,1,L
...,...,...,...,...
3497,Clermont,Lorient,0,L
3498,Strasbourg,Lyon,0,L
3499,PSG,Metz,0,L
3500,Nantes,Monaco,0,L


### Extra Trees

In [13]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble, seeded for repeatable results.
evaluate_model(
    ExtraTreesClassifier(random_state=42),
    X_train=df_train_preprocessed_features,
    y_train=y_train_encoded,
    X_test=df_test_preprocessed_features,
    y_test=y_test_encoded,
    df_2023=prediction_df,
)

Model: ExtraTreesClassifier
Accuracy: 0.4666432584269663
Precision: 0.449857110445794
Recall: 0.4666432584269663
F1 Score: 0.45577893851144613


Unnamed: 0,team,opponent,is_home,predicted_result
0,Burnley,Manchester City,1,W
1,Arsenal,Nottingham,1,L
2,Bournemouth,West Ham,1,L
3,Sheffield Utd,Crystal Palace,1,W
4,Brighton,Luton,1,L
...,...,...,...,...
3497,Clermont,Lorient,0,L
3498,Strasbourg,Lyon,0,L
3499,PSG,Metz,0,L
3500,Nantes,Monaco,0,L


In [14]:
# Load the teams table.
# NOTE(review): main_df is not referenced by any later cell visible in this notebook.
main_df = pd.read_csv('./data/teams/teams.csv')

In [15]:
# Per-team season statistics and raw match scores for the sequence model below.
# NOTE(review): the recorded run failed here with FileNotFoundError for
# train_teams.csv, so every later cell was left unexecuted.
encoded_df = pd.read_csv("./data/machine_learning/train_teams.csv", encoding='utf-8')
scores_df = pd.read_csv("./data/scores/scores.csv", encoding='utf-8')

FileNotFoundError: [Errno 2] No such file or directory: './data/machine_learning/train_teams.csv'

In [None]:
# Display the team statistics frame for a visual sanity check.
encoded_df

In [None]:
# Inspect the avg_age column on its own.
encoded_df["avg_age"]

In [None]:
# Distinct values of the per-season squad count (non-null `squad` entries per season_start_year).
encoded_df.groupby('season_start_year').squad.count().unique()

In [None]:
print(f"Initial Dataframe is {len(scores_df)} rows")

# Every fixture is emitted twice, once from each side's point of view, so each
# row reads as "team vs opponent" with an is_home flag.
home_df = scores_df.assign(
    team=scores_df['home_team'],
    opponent=scores_df['away_team'],
    is_home=1,
    score=scores_df['home_score'],
    opponent_score=scores_df['away_score'],
)
away_df = scores_df.assign(
    team=scores_df['away_team'],
    opponent=scores_df['home_team'],
    is_home=0,
    score=scores_df['away_score'],
    opponent_score=scores_df['home_score'],
)

# Stack both perspectives and keep only the columns the later cells rely on.
score_columns = [
    'time', 'team', 'opponent', 'is_home', 'score', 'opponent_score',
    'season_start_year', 'season_end_year',
]
final_scores_df = pd.concat([home_df, away_df], ignore_index=True)[score_columns]

final_scores_df

In [None]:
# Two prefixed copies of the team statistics: one for the row's team, one for its opponent.
teams_perf_df = encoded_df.add_prefix('team_')
teams_vs_perf_df = encoded_df.add_prefix('opponent_')

# Left joins keep every match row even when no statistics are found.
# NOTE(review): the match's season_start_year is joined to the statistics'
# season_end_year - presumably to attach the previous season's figures; confirm
# this offset is intended.
extended_df = (
    final_scores_df
    .merge(
        teams_perf_df,
        how='left',
        left_on=['team', 'season_start_year'],
        right_on=['team_squad', 'team_season_end_year'],
    )
    .merge(
        teams_vs_perf_df,
        how='left',
        left_on=['opponent', 'season_start_year'],
        right_on=['opponent_squad', 'opponent_season_end_year'],
    )
)
extended_df.shape

In [None]:
# Sanity check: two prefixed copies of the statistics plus the score columns
# should account for every column of the merged frame.
n_perf_columns = len(encoded_df.columns)
n_score_columns = len(final_scores_df.columns)
n_expected_columns = n_perf_columns * 2 + n_score_columns

print(f'Original Performance DF has {n_perf_columns} columns')
print(f'Original Scores DF has {n_score_columns} columns')
print(f'Final Dataframe should have {n_expected_columns} columns')
print(f'Final Dataframe has {len(extended_df.columns)} columns')

In [None]:
# Order rows by season, then team, then match time, so each team's matches in a
# season sit together in time order for the sequence windows built later.
sort_keys = ['season_start_year', 'team', 'time']
df_sorted = extended_df.sort_values(sort_keys)
df_sorted

In [None]:
# Spot check a single fixture pairing in the merged frame.
extended_df.loc[(extended_df['team'] == 'Heidenheim') & (extended_df['opponent'] == 'Werder Bremen')]

In [None]:
# List the rows that contain at least one missing value, showing the
# identifying columns plus only those columns that have gaps.
null_mask = df_sorted.isnull()
null_cols = df_sorted.columns[null_mask.any()]
id_columns = ['team', 'opponent', 'is_home', 'season_start_year']
df_sorted.loc[null_mask.any(axis=1), [*id_columns, *null_cols]]

In [None]:
# Identifier and join-key columns that must not reach the model as features.
cols_to_drop = ['opponent', 'time', 'season_end_year', 'team_squad', 'team_season_start_year', 'opponent_squad', 'opponent_season_start_year', 'team_season_end_year', 'opponent_season_end_year']

# Rebind instead of dropping in place so the cell can be re-run: the in-place
# version raised KeyError on a second run because the columns were already gone.
# errors='ignore' is what makes the re-run safe; the trade-off is that a
# misspelled column name is skipped silently.
df_sorted = df_sorted.drop(columns=cols_to_drop, errors='ignore')
df_sorted

In [None]:
# Rescale the is_home flag.
# NOTE(review): `scaler` is not defined in any cell visible in this notebook; unless
# it comes from `from constants.constants import *`, this raises NameError on a
# fresh kernel. Define the intended scaler explicitly before this line.
df_sorted['is_home'] = scaler.fit_transform(df_sorted[['is_home']])

In [None]:
SEQUENCE_LENGTH = 3

# Slide a window of SEQUENCE_LENGTH consecutive rows over each (season, team)
# group; groups shorter than the window contribute nothing.
sequences = []
for _, team_season in df_sorted.groupby(['season_start_year', 'team']):
    n_windows = len(team_season) - SEQUENCE_LENGTH + 1
    sequences.extend(
        team_season.iloc[start:start + SEQUENCE_LENGTH] for start in range(n_windows)
    )

In [None]:
# Targets are the two score columns; everything else except the grouping keys is a feature.
label_columns = ['score', 'opponent_score']
non_feature_columns = {'team', 'season_start_year', *label_columns}
feature_columns = [column for column in df_sorted.columns if column not in non_feature_columns]
N_FEATURES = len(feature_columns)
print(N_FEATURES)

In [None]:
from sklearn.model_selection import train_test_split

# X holds one feature window per sequence; y holds only the final row's
# [score, opponent_score] pair, i.e. the result of the window's last match.
X = np.array([window[feature_columns].values for window in sequences], dtype='float32')
y = np.array([window[label_columns].values[-1] for window in sequences])

# Verify the shape of X
print("Shape of X:", X.shape)
print("Expected shape: (num_samples, SEQUENCE_LENGTH, N_FEATURES)")

# Verify the shape of y
print("Shape of y:", y.shape)
print("Expected shape: (num_samples, 2)")

# shuffle=False keeps the existing order, so the test set is the trailing 20% of sequences.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# LSTM over the match window, dropout, then a two-unit head that predicts the
# goals for both teams. Each weight matrix gets its own L1+L2 penalty.
model = Sequential([
    LSTM(
        50,
        activation='relu',
        input_shape=(SEQUENCE_LENGTH, N_FEATURES),
        kernel_regularizer=l1_l2(l1=0.01, l2=0.01),
        recurrent_regularizer=l1_l2(l1=0.01, l2=0.01),
    ),
    Dropout(0.2),
    Dense(2, activation='relu', kernel_regularizer=l1_l2(l1=0.01, l2=0.01)),
])
model.compile(optimizer='adam', loss='mse')

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model

# This saves a PNG image of your model architecture to a file
# (lstm_model.png, written relative to the current working directory).
plot_model(model, to_file='lstm_model.png', show_shapes=True, show_layer_names=True)

In [None]:
import datetime
from tensorflow.keras.callbacks import TensorBoard

# Each training run logs into its own timestamped folder under logs/fit/.
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"logs/fit/{run_stamp}"
tensorboard_callback = TensorBoard(histogram_freq=1, log_dir=log_dir)

In [None]:
# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 20% of the training data is held out by Keras for validation.
history = model.fit(
    X_train,
    y_train,
    epochs=100,
    validation_split=0.2,
    callbacks=[early_stopping, tensorboard_callback],
)

In [None]:
# !tensorboard --logdir logs/fit --bind_all

In [None]:
# Round the model's continuous score predictions to the nearest whole number of goals.
y_pred = np.rint(model.predict(X_test)).astype(int)

In [None]:
def outcomes_from_scores(scores):
    """Convert (team_score, opponent_score) pairs into 'win' / 'draw' / 'lose' labels.

    Args:
        scores: Iterable of two-element rows, e.g. an array of shape (n_samples, 2),
            where the first value is the team's score and the second the opponent's.

    Returns:
        A list with one label per row, from the team's point of view.
    """
    outcomes = []
    for team_score, opponent_score in scores:
        if team_score > opponent_score:
            label = 'win'
        elif team_score == opponent_score:
            label = 'draw'
        else:
            label = 'lose'
        outcomes.append(label)
    return outcomes

# Turn both the predicted and the actual score pairs into outcome labels so they
# can be compared as a classification problem.
y_pred_outcomes = outcomes_from_scores(y_pred)
y_test_outcomes = outcomes_from_scores(y_test)

In [None]:
# Assuming y_pred and y_test are numpy arrays of the scores
# Side-by-side view of predicted vs. actual scores and the outcomes derived from them.
# Assumes y_pred and y_test are numpy arrays of the scores.
comparison_df = pd.DataFrame({
    'Predicted Team Score': y_pred[:, 0],
    'Predicted Opponent Score': y_pred[:, 1],
    'Actual Team Score': y_test[:, 0],
    'Actual Opponent Score': y_test[:, 1],
    'Predicted Outcome': y_pred_outcomes,
    'Actual Outcome': y_test_outcomes
})

# Sample for inspection. The size is capped because sample(10) raises ValueError
# on a frame with fewer than 10 rows, and the seed keeps the displayed rows
# stable between runs.
sample_size = min(10, len(comparison_df))
display(comparison_df.sample(sample_size, random_state=42))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# A single label order drives both the matrix rows/columns and the tick labels.
outcome_labels = ["win", "draw", "lose"]
conf_matrix = confusion_matrix(y_test_outcomes, y_pred_outcomes, labels=outcome_labels)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
    xticklabels=outcome_labels,
    yticklabels=outcome_labels,
)

# Axis labels, title, and tick size
label_font = {'size':'14'}
ax.set_xlabel('Predicted labels', fontdict=label_font)
ax.set_ylabel('True labels', fontdict=label_font)
ax.set_title('Confusion Matrix', fontdict={'size':'16'})
ax.tick_params(axis='both', which='major', labelsize=12)

plt.show()

In [None]:
# Calculate the accuracy of the derived outcome labels (win/draw/lose), i.e. the
# share of test rows whose predicted outcome equals the actual outcome.
accuracy = accuracy_score(y_test_outcomes, y_pred_outcomes)
print("Accuracy:", accuracy)

In [None]:
# Classification report for the derived outcome labels, listed in win/draw/lose order.
print("Classification Report:")
print(classification_report(y_test_outcomes, y_pred_outcomes, labels=['win', 'draw', 'lose']))

In [None]:
# model = Sequential()
# # Assuming M is the number of features after preprocessing and Dense layers
# model.add(Dense(64, activation='relu', input_shape=(38, M)))  # Dense layer example
# model.add(LSTM(128, return_sequences=False))  # LSTM layer
# model.add(Dense(3, activation='softmax'))  # Output layer for 3 classes: win/loss/draw

# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Pretty-print TEAM_COLUMNS_DICT_COMBINED.
# NOTE(review): the name is not defined in this notebook; presumably it is supplied
# by `from constants.constants import *` - confirm.
print(json.dumps(TEAM_COLUMNS_DICT_COMBINED, indent=2))