In [182]:
%reset

In [183]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier, Perceptron, PassiveAggressiveClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestCentroid, KNeighborsClassifier

import pandas as pd
from tqdm import tqdm
import pickle
import category_encoders as ce

In [184]:
df_all = pd.read_csv('../data/for_train/train_df.csv')
df_league_0 = pd.read_csv('../data/for_train/df_league_0.csv')
df_league_1 = pd.read_csv('../data/for_train/df_league_1.csv')
df_league_2 = pd.read_csv('../data/for_train/df_league_2.csv')
df_league_3 = pd.read_csv('../data/for_train/df_league_3.csv')


In [185]:
# class_names = ['Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H']
# 
# df = df_league_0.copy()
# 
# df['Match_Result'] = df[class_names].idxmax(axis=1)
# 
# enc = ce.OneHotEncoder().fit(df['Match_Result'].astype(str))
# y_onehot = enc.transform(df['Match_Result'].astype(str))
# print("One-hot encoded labels:")
# print(y_onehot)
# 
# names = y_onehot.columns
# 
# for class_ in names:
#     enc_target = ce.TargetEncoder(smoothing=0)
#     transformed_color = enc_target.fit_transform(df['Match_Result'], y_onehot[class_])
#     print(f"\nTarget encoding for {class_}:")
#     print(transformed_color)

In [174]:
def target_encode_multiclass(X,y): #X,y are pandas df and series
    y=y.astype(str)   #convert to string to onehot encode
    enc=ce.OneHotEncoder().fit(y)
    y_onehot=enc.transform(y)
    class_names=y_onehot.columns  #names of onehot encoded columns
    X_obj=X.select_dtypes('object') #separate categorical columns
    X=X.select_dtypes(exclude='object')
    for class_ in class_names:

        enc=ce.TargetEncoder()
        enc.fit(X_obj,y_onehot[class_]) #convert all categorical 
        temp=enc.transform(X_obj)       #columns for class_
        temp.columns=[str(x)+'_'+str(class_) for x in temp.columns]
        X=pd.concat([X,temp],axis=1)    #add to original dataset

    return X

In [175]:
df = df_league_0.copy()
class_names = ['Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H']
df['Match_Result'] = df[class_names].idxmax(axis=1).astype(object)
X = df.drop(columns=['Match_Result'] + class_names, axis=1)
y = df['Match_Result']
X_encoded = target_encode_multiclass(X, y)



In [176]:
# result_columns = ['Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H']
# df_league_0['Match_Result'] = df_league_0[result_columns].idxmax(axis=1)
df.dtypes

League                         int64
Match_Date                   float64
Full_Time_Home_Team_Goals    float64
Full_Time_Away_Team_Goals    float64
DayOfWeek                    float64
                              ...   
AwayTeam_Waregem               int64
AwayTeam_Groningen             int64
AwayTeam_Ajax                  int64
AwayTeam_Monza                 int64
Match_Result                  object
Length: 975, dtype: object

In [177]:
columns_to_drop = ['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals']
df_league_0.drop(columns=columns_to_drop, axis=1, inplace=True)

In [178]:
# label_encoder = LabelEncoder()
# df_league_0['Match_Result'] = label_encoder.fit_transform(df_league_0['Match_Result'])

In [179]:
def run_classification_pipeline(df, df_name, target, model_name, model, param_dist, test_size=0.3, random_state=42):
    # Split data into train and test
    

    train_split = df[df['Season_2122'] != 1]
    test_split = df[df['Season_2122'] == 1]
    
    # X = train_split.drop(columns=[target] + columns_to_drop, axis=1)
    # y = test_split[target]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size, random_state=random_state)

    X_train = train_split.drop(columns=target, axis=1)
    y_train = train_split[target]
    
    X_test = test_split.drop(columns=target, axis=1)
    y_test = test_split[target]

    # Define TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # Perform random search
    random_search = RandomizedSearchCV(model, param_dist, n_iter=5, cv=tscv, scoring='accuracy', random_state=random_state)
    random_search.fit(X_train, y_train)

    # Evaluate model
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Save the model to a file
    with open(f'../models/classification/{df_name}_{model_name}.pkl', 'wb') as file:
        pickle.dump(best_model, file)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division='warn')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    confusion = confusion_matrix(y_test, y_pred)

    # Save results
    result = {
        'Model': model_name,
        'Best Parameters': random_search.best_params_ if param_dist else "No hyperparameter tuning",
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        # 'Confusion Matrix': confusion
    }

    return pd.DataFrame([result])


In [180]:
# Define models and hyperparameter distributions for classification
classification_models = [
    ('Gaussian Process', GaussianProcessClassifier(multi_class='one_vs_rest')),
    ('Linear SVM', LinearSVC(multi_class='ovr')),
    ('Logistic Regression', LogisticRegression(multi_class='ovr')),
    ('Logistic Regression CV', LogisticRegressionCV(multi_class='ovr')),
    ('SGD Classifier', SGDClassifier()),
    ('Passive Aggressive Classifier', PassiveAggressiveClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('K Neighbors Classifier', KNeighborsClassifier()),
    ('Neural Network Classification', MLPClassifier())
]

classification_param_dist = {
    'Gaussian Process': {},  # No hyperparameters specified

    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'max_iter': [50, 100, 200],
    },

    'Linear SVM': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'loss': ['hinge', 'squared_hinge'],
        'dual': [True, False],
        'max_iter': [1000, 2000, 3000],
    },

    'Logistic Regression CV': {'Cs': [1, 3, 5],
                               'max_iter': [100, 500, 1000],
                               'cv': [3, 5, 10]},

    'SGD Classifier': {'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                       'max_iter': [100, 500, 1000],
                       'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']},


    'Passive Aggressive Classifier': {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                                      'max_iter': [100, 500, 1000],
                                      'loss': ['hinge', 'squared_hinge']},

    'Random Forest': {'n_estimators': [10, 50, 100],
                      'max_depth': [None, 10, 20],
                      'min_samples_split': [2, 5, 10],
                      'max_features': ['sqrt', 'log2', None]},

    'XGBoost': {'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'min_child_weight': [1, 3, 5]},

    'K Neighbors Classifier': {'n_neighbors': [3, 5, 10, 20],
                               'weights': ['uniform', 'distance'],
                               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                               'leaf_size': [30, 50, 100],
                               'p': [1, 2]},
    
    'Neural Network Classification': {'hidden_layer_sizes': [(50, 50), (100, 50, 25)],
                                      'activation': ['relu', 'tanh'],
                                      'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                                      'max_iter': [200]}
}


In [181]:
dfs = [
    # (df_encoded, 'df_encoded')
    # (df_all, 'df_all'),
    (df_league_0, 'df_league_0'),
    # (df_league_1, 'df_league_1'),
    # (df_league_2, 'df_league_2'),
    # (df_league_3, 'df_league_3')
]

target = ['Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H']


# Run the pipeline for each classification model
for df, df_name in dfs:
    result_df = pd.DataFrame()
    for model_name, model in tqdm(classification_models, desc=f'Processing dataframe {df_name}'):
        result = run_classification_pipeline(df, df_name, target, model_name, model, classification_param_dist[model_name], test_size=0.3, random_state=42)
        result_df = pd.concat([result_df, result], ignore_index=True)
        display(result_df)
    print('\nDataframe: ', df_name)
    display(result_df.sort_values(by='F1 Score', ascending=False))


Processing dataframe df_league_0:   0%|          | 0/10 [00:00<?, ?it/s]


ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/gaussian_process/_gpc.py", line 703, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1163, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1184, in _check_y
    y = column_or_1d(y, warn=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1245, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (386, 3) instead.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/gaussian_process/_gpc.py", line 703, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1163, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1184, in _check_y
    y = column_or_1d(y, warn=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1245, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (769, 3) instead.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/gaussian_process/_gpc.py", line 703, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1163, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1184, in _check_y
    y = column_or_1d(y, warn=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1245, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (1152, 3) instead.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/gaussian_process/_gpc.py", line 703, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1163, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1184, in _check_y
    y = column_or_1d(y, warn=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1245, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (1535, 3) instead.

--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/gaussian_process/_gpc.py", line 703, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1163, in check_X_y
    y = _check_y(y, multi_output=multi_output, y_numeric=y_numeric, estimator=estimator)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1184, in _check_y
    y = column_or_1d(y, warn=True)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py", line 1245, in column_or_1d
    raise ValueError(
ValueError: y should be a 1d array, got an array of shape (1918, 3) instead.
