# Model Selection

### Overview

We explore and select models for predicting whether a match ends with more than 2.5 goals (the `Over2.5` target).

### Environment Setup

Create the conda environment using the `aifootballpredictions_notebooks.yml` file in the `conda` folder:

```bash
conda env create -f conda/aifootballpredictions_notebooks.yml
conda activate aifootballpredictions_notebooks
```

#### Run the notebook on your local GPU

To enable GPU support, follow the official [TensorFlow pip installation guide](https://www.tensorflow.org/install/pip#windows-native).

In [87]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.experimental import enable_halving_search_cv # noqa
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, HalvingGridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, f1_score, roc_auc_score
from sklearn.pipeline import Pipeline
import xgboost as xgb
import numpy as np
import tensorflow as tf
from tensorflow.python.client import device_lib
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [7]:
# List every device TensorFlow can use (CPU and any GPU) to verify the local setup.
# `device_lib` is imported from the `tensorflow.python` namespace rather than the public `tf.*` API.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 7869017430287257553
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 1734606848
locality {
  bus_id: 1
  links {
  }
}
incarnation: 15568512601144704986
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.6"
xla_global_id: 416903419
]


In [8]:
# Check if the installed TensorFlow build was compiled with CUDA support
# (this reports on the build itself; GPU availability is checked in the cells below)
tf.test.is_built_with_cuda()

True

In [9]:
# Count the physical GPUs that TensorFlow can see
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [10]:
# Show the physical GPU devices themselves (name and device type)
print(tf.config.list_physical_devices('GPU'))

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [11]:
# Read the preprocessed match data; the path is relative to the notebook's working directory
uk_data = pd.read_csv('../data/processed/E0_merged_preprocessed.csv')

In [12]:
# Preview the first 5 rows to sanity-check the loaded columns and values
uk_data.head()

Unnamed: 0,Date,Div,Time,HomeTeam,AwayTeam,FTR,HTR,Referee,Season,Last5HomeOver2.5Perc,...,HomeOver2.5Perc,AvgLast5AwayGoalsConceded,AvgLast5HomeGoalsScored,AwayOver2.5Perc,AvgLast5HomeGoalsConceded,AvgLast5AwayGoalsScored,PC<2.5,B365C>2.5,HR,Over2.5
0,2022-08-05,E0,20:00,Crystal Palace,Arsenal,A,A,A Taylor,2022/2023,0.0,...,42.11,0.0,0.0,47.37,2.0,2.0,1.78,2.1,0,0
1,2022-08-20,E0,17:30,Bournemouth,Arsenal,A,A,C Pawson,2022/2023,50.0,...,47.37,0.0,1.0,47.37,1.5,2.5,2.1,1.72,0,1
2,2022-09-04,E0,16:30,Man United,Arsenal,H,H,P Tierney,2022/2023,100.0,...,57.89,1.0,2.0,47.37,1.33,2.0,2.18,1.72,0,1
3,2022-09-18,E0,12:00,Brentford,Arsenal,A,A,D Coote,2022/2023,75.0,...,47.37,0.75,2.5,47.37,1.5,2.25,2.13,1.72,0,1
4,2022-10-16,E0,14:00,Leeds,Arsenal,A,A,C Kavanagh,2022/2023,40.0,...,63.16,0.6,1.2,47.37,0.6,2.0,2.39,1.61,0,0


In [37]:
# Target vector: the 'Over2.5' column of every match
y = uk_data['Over2.5'].values

# Feature matrix: all numeric columns except the target. Non-numeric columns
# (such as 'Date' or the team names) are left out by the dtype filter itself.
# NOTE(review): the numeric columns include 'HST', 'AST' and 'HR'; if these are
# statistics of the match being predicted they would not be known before
# kick-off — confirm they are intended as predictors.
numerical_columns = uk_data.select_dtypes(include='number').columns
feature_columns = numerical_columns.drop('Over2.5')
X = uk_data.loc[:, feature_columns].values

In [38]:
# Inspect the numeric columns found in the data (the target 'Over2.5' is still part of this index)
numerical_columns

Index(['Last5HomeOver2.5Perc', 'Last5AwayOver2.5Perc', 'HST', 'AST',
       'HomeOver2.5Perc', 'AvgLast5AwayGoalsConceded',
       'AvgLast5HomeGoalsScored', 'AwayOver2.5Perc',
       'AvgLast5HomeGoalsConceded', 'AvgLast5AwayGoalsScored', 'PC<2.5',
       'B365C>2.5', 'HR', 'Over2.5'],
      dtype='object')

In [39]:
# Inspect the target vector
y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0,

In [40]:
# Inspect the feature matrix
X

array([[  0.  ,   0.  ,   2.  , ...,   1.78,   2.1 ,   0.  ],
       [ 50.  ,  50.  ,   1.  , ...,   2.1 ,   1.72,   0.  ],
       [100.  ,  66.67,   6.  , ...,   2.18,   1.72,   0.  ],
       ...,
       [ 60.  ,  60.  ,   6.  , ...,   2.  ,   1.85,   0.  ],
       [ 40.  ,  60.  ,   9.  , ...,   1.92,   1.95,   0.  ],
       [ 80.  ,  60.  ,  12.  , ...,   3.57,   1.25,   0.  ]])

### Nested Cross Validation

In [42]:
def create_dnn_model(input_dim: int, dropout_rate: float = 0.5) -> tf.keras.Model:
    """
    Build and compile a feed-forward network for binary classification.

    The network stacks three Dense blocks (128, 64 and 32 ReLU units), each
    followed by batch normalization and dropout, and ends with a single
    sigmoid unit.

    Parameters:
    ----------
    input_dim : int
        Number of input features.
    dropout_rate : float, optional
        Rate passed to every Dropout layer (default is 0.5).

    Returns:
    -------
    tf.keras.Model
        The model, compiled with Adam (learning rate 0.001), binary
        cross-entropy loss and the accuracy metric.
    """
    model = Sequential()

    for position, units in enumerate((128, 64, 32)):
        # Only the first Dense layer declares the size of the input
        input_kwargs = {'input_dim': input_dim} if position == 0 else {}
        model.add(Dense(units, activation='relu', **input_kwargs))
        model.add(BatchNormalization())
        model.add(Dropout(dropout_rate))

    # Single sigmoid unit: output is read as the probability of the positive class
    model.add(Dense(1, activation='sigmoid'))

    optimizer = Adam(learning_rate=0.001)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model


In [52]:
def create_lstm_model(input_shape: tuple, dropout_rate: float = 0.5) -> tf.keras.Model:
    """
    Creates a stacked LSTM model for binary classification.

    Two LSTM layers (128 and 64 units) are followed by a 32-unit Dense layer
    and a single sigmoid output unit. Each of the three hidden blocks is
    followed by batch normalization and dropout.

    Parameters:
    ----------
    input_shape : tuple
        The shape of one input sample, (timesteps, features).
    dropout_rate : float, optional
        The dropout rate to be used in Dropout layers to prevent overfitting (default is 0.5).

    Returns:
    -------
    tf.keras.Model
        A compiled LSTM model ready for training.
    """
    model = Sequential()

    # First LSTM layer: must return the whole sequence, because the next LSTM
    # layer expects a 3D (batch, timesteps, features) input
    model.add(LSTM(128, activation='relu', return_sequences=True, input_shape=input_shape))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))

    # Second LSTM layer: returns only its final output, collapsing the time axis.
    # Its input shape is inferred from the previous layer.
    model.add(LSTM(64, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))

    # Dense hidden layer
    model.add(Dense(32, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))

    # Output layer
    model.add(Dense(1, activation='sigmoid'))

    # Compile the model (verbosity is an option of fit/evaluate/predict, not of compile)
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    return model


In [88]:
# Each candidate model is declared as a pair: an estimator carrying its fixed
# settings, and the grid of hyperparameter values to search over.

# Logistic Regression
lr_model = LogisticRegression(solver='liblinear')
lr_param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)

# K-Nearest Neighbors
knn_model = KNeighborsClassifier()
knn_param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Support Vector Machine (probability=True so that predict_proba is available)
svm_model = SVC(probability=True)
svm_param_grid = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf', 'poly', 'sigmoid'],
    gamma=['scale', 'auto'],
    degree=[2, 3, 4, 5],
    class_weight=[None, 'balanced'],
)

# Random Forest (seeded)
rf_model = RandomForestClassifier(random_state=42)
# Only the number of trees and the maximum tree depth are searched;
# min_samples_split, min_samples_leaf, max_features, bootstrap and
# class_weight are currently not part of the grid.
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7, 9],
)

# XGBoost: histogram tree method on the CUDA device, log-loss as evaluation metric
xgb_model = xgb.XGBClassifier(tree_method="hist", device="cuda", eval_metric='logloss')
xgb_param_grid = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[3, 5, 7, 9],
)

# Deep Neural Network, sized to the number of feature columns in X
# NOTE(review): the 'Neural Network' entry is commented out in the model
# comparison of this notebook, so this model and its grid are not evaluated there.
dnn_model = create_dnn_model(input_dim=X.shape[1])
dnn_param_grid = dict(
    batch_size=[32, 64],
    epochs=[10, 20],
    dropout_rate=[0.3, 0.5],
)

In [89]:
# Scorers that wrap the metric functions so they can be passed as `scoring=`.
# Accuracy, precision and F1 are computed from the predicted class labels.
accuracy_scorer = make_scorer(accuracy_score)
precision_scorer = make_scorer(precision_score)
f1_scorer = make_scorer(f1_score)

# ROC AUC is computed from predicted probabilities instead of class labels.
# NOTE(review): newer scikit-learn releases deprecate `needs_proba` in favour
# of `response_method` — confirm against the version pinned in the conda env.
roc_auc_scorer = make_scorer(roc_auc_score, needs_proba=True)



In [None]:
# Outer loop: 5-fold cross-validation, used to estimate generalization performance
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Inner loop: 3-fold cross-validation for hyperparameter tuning
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Combine the models and hyperparameters into a dictionary
# (the LSTM and Neural Network entries are disabled)
models = {
    'XGBoost': (xgb_model, xgb_param_grid),
    #'LSTM': (lstm_model, lstm_param_grid),
    #'Neural Network': (dnn_model, dnn_param_grid),
    'Logistic Regression': (lr_model, lr_param_grid),
    'KNN': (knn_model, knn_param_grid),
    'SVM': (svm_model, svm_param_grid),
    'Random Forest': (rf_model, rf_param_grid),
}

results = {}      # model name -> array of outer-fold scores
best_params = {}  # model name -> best hyperparameters found on the full dataset

metric_name = accuracy_scorer._score_func.__name__

for model_name, (model, param_grid) in models.items():
    print(f"Evaluating {model_name}...")

    # Successive-halving grid search over the inner folds. The search subsamples
    # the data between iterations, so it is seeded like the two splitters above
    # to make the whole procedure repeatable from run to run.
    grid_search = HalvingGridSearchCV(
        estimator=model,
        param_grid=param_grid,
        cv=inner_cv,
        scoring=accuracy_scorer,
        random_state=42,
        verbose=0,
    )

    # Perform nested cross-validation: the search is re-run inside every outer
    # training fold and scored on the matching held-out fold
    nested_cv_score = cross_val_score(grid_search, X, y, cv=outer_cv, scoring=accuracy_scorer)

    # Fit the search on the whole dataset to get the best parameters
    grid_search.fit(X, y)

    # Store the results and best parameters
    results[model_name] = nested_cv_score
    best_params[model_name] = grid_search.best_params_

    print(f"{model_name} - {metric_name}: {np.mean(nested_cv_score):.4f} ± {np.std(nested_cv_score):.4f}")
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

In [91]:
# Show the best parameters for each model
for model_name, params in best_params.items():
    print(f"Best parameters for {model_name}: {params}")

Best parameters for XGBoost: {'max_depth': 3, 'n_estimators': 50}
Best parameters for Logistic Regression: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'uniform'}
Best parameters for SVM: {'C': 10, 'class_weight': None, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
Best parameters for Random Forest: {'max_depth': 7, 'n_estimators': 200}


In [92]:
# Compare models
print(f"\nModel Comparison {accuracy_scorer._score_func.__name__}:")
for model_name, scores in results.items():
    print(f"{model_name}: {np.mean(scores):.4f} ± {np.std(scores):.4f}")


Model Comparison accuracy_score:
XGBoost: 0.7674 ± 0.0411
Logistic Regression: 0.8125 ± 0.0218
KNN: 0.7514 ± 0.0286
SVM: 0.8125 ± 0.0306
Random Forest: 0.7699 ± 0.0291


- First Run hyperparameters selection
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best parameters for Logistic Regression: {'C': 0.1, 'penalty': 'l2'}
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best parameters for SVM: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

Model Comparison accuracy_score:
XGBoost: 0.7740 ± 0.0378
Logistic Regression: 0.8125 ± 0.0196
KNN: 0.7634 ± 0.0317
SVM: 0.8058 ± 0.0308

- Second Run hyperparameters selection
Best parameters for XGBoost: {'colsample_bytree': 0.6, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200}
Best parameters for Logistic Regression: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best parameters for SVM: {'C': 10, 'class_weight': 'balanced', 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}

Model Comparison accuracy_score:
XGBoost: 0.7740 ± 0.0267
Logistic Regression: 0.7966 ± 0.0281
KNN: 0.7713 ± 0.0328
SVM: 0.8125 ± 0.0233

- Third Run
Best parameters for XGBoost: {'max_depth': 3, 'n_estimators': 50}
Best parameters for Logistic Regression: {'C': 1, 'penalty': 'l1', 'solver': 'liblinear'}
Best parameters for KNN: {'metric': 'manhattan', 'n_neighbors': 5, 'weights': 'uniform'}
Best parameters for SVM: {'C': 10, 'class_weight': None, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
Best parameters for Random Forest: {'max_depth': 7, 'n_estimators': 200}

Model Comparison accuracy_score:
XGBoost: 0.7674 ± 0.0411
Logistic Regression: 0.8125 ± 0.0218
KNN: 0.7514 ± 0.0286
SVM: 0.8125 ± 0.0306
Random Forest: 0.7699 ± 0.0291

### Ensemble learning

In [93]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import operator

In [96]:
class MajorityVoteClassifier(ClassifierMixin, BaseEstimator):
    """
    Ensemble classifier that combines several classifiers by (weighted) voting.

    Parameters:
    ----------
    classifiers : list
        The classifiers to combine. They are not modified: `fit` trains a
        clone of each one.
    vote : str, optional
        'classlabel' (default) predicts the class with the largest (weighted)
        number of votes among the members' predicted labels; 'probability'
        predicts the class with the largest (weighted) average predicted
        probability.
    weights : sequence of numbers, optional
        One weight per classifier. With None (default) all classifiers count
        equally.

    Attributes set by `fit`:
    ----------
    labelenc_ : LabelEncoder
        Maps the original class labels to 0..n_classes-1 and back.
    classes_ : array
        The original class labels.
    classifiers_ : list
        The fitted clones of `classifiers`.
    """

    # The mixin is listed before BaseEstimator, as scikit-learn recommends for
    # custom estimators, so that the classifier-specific behaviour takes precedence.

    def __init__(self, classifiers, vote='classlabel', weights=None):
        self.classifiers = classifiers
        # Auto-generated unique names, used as prefixes in get_params(deep=True)
        self.named_classifiers = {
            key: value for key, value in _name_estimators(classifiers)
        }
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        """
        Fit a clone of every classifier on X and the label-encoded y.

        Raises:
        ------
        ValueError
            If `vote` is not 'probability' or 'classlabel', or if the number
            of weights differs from the number of classifiers.

        Returns:
        -------
        self
        """
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'; got (vote=%r)"
                             % self.vote)

        # `is not None` rather than truthiness, so array-like weights can be checked too
        if self.weights is not None and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal;'
                             ' got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        # Encode labels as 0..n_classes-1 so that predictions can be counted
        # with np.bincount / np.argmax in predict
        self.labelenc_ = LabelEncoder()
        self.labelenc_.fit(y)
        self.classes_ = self.labelenc_.classes_

        encoded_y = self.labelenc_.transform(y)
        self.classifiers_ = [clone(clf).fit(X, encoded_y) for clf in self.classifiers]
        return self

    def predict(self, X):
        """
        Predict class labels for X, expressed in the original label values.
        """
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote
            # One row per sample, one column per classifier
            predictions = np.asarray([clf.predict(X) for clf in self.classifiers_]).T
            maj_vote = np.apply_along_axis(
                lambda x: np.argmax(np.bincount(x, weights=self.weights)),
                axis=1,
                arr=predictions)
        # Map the encoded winners back to the original labels
        return self.labelenc_.inverse_transform(maj_vote)

    def predict_proba(self, X):
        """
        Return the (weighted) average of the classifiers' predicted class probabilities.

        Every fitted classifier must provide `predict_proba`.
        """
        probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
        return np.average(probas, axis=0, weights=self.weights)

    def get_params(self, deep=True):
        """
        Return the estimator's parameters.

        With deep=False only the constructor parameters are returned. With
        deep=True the result additionally contains every named classifier and
        its parameters under '<name>__<param>' keys, which allows nested
        parameters to be addressed (for example in a grid search).
        """
        out = super().get_params(deep=False)
        if not deep:
            return out

        out.update(self.named_classifiers)
        for name, step in self.named_classifiers.items():
            for key, value in step.get_params(deep=True).items():
                out['%s__%s' % (name, key)] = value
        return out


In [109]:
# Define the best models with their best hyperparameters.
# Each one is rebuilt from a clone of the estimator that was actually searched,
# so its fixed settings (e.g. SVC's probability=True, the random forest's
# random_state, XGBoost's tree method and device) are kept next to the tuned values.
best_models = {
    model_name: clone(base_model).set_params(**best_params[model_name])
    for model_name, (base_model, _) in models.items()
}

# Majority-vote ensemble of the tuned scikit-learn models (XGBoost is not included)
mv_clf = MajorityVoteClassifier(classifiers=[best_models["Logistic Regression"],
                                best_models["KNN"],
                                best_models["SVM"],
                                best_models["Random Forest"]])

clf_labels = ['Logistic Regression', 'KNN', 'SVM', 'Random Forest', 'Majority Voting']

# NOTE(review): clf_labels has five entries but all_clf holds only the ensemble;
# if the two are paired element-wise later on, the ensemble would be reported
# under the first label — confirm which classifiers are meant to be listed here.
all_clf = [ mv_clf]