In [10]:
%reset

In [11]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier, PassiveAggressiveClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

import pandas as pd
from tqdm import tqdm
import pickle

In [12]:
df_all = pd.read_csv('../data/for_train/train_df.csv')
df_league_0 = pd.read_csv('../data/for_train/df_league_0.csv')
df_league_1 = pd.read_csv('../data/for_train/df_league_1.csv')
df_league_2 = pd.read_csv('../data/for_train/df_league_2.csv')
df_league_3 = pd.read_csv('../data/for_train/df_league_3.csv')


In [13]:
class_names = ['Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H']
df_all['Match_Result'] = df_all[class_names].idxmax(axis=1).astype(object)
df_league_0['Match_Result'] = df_league_0[class_names].idxmax(axis=1).astype(object)
df_league_1['Match_Result'] = df_league_1[class_names].idxmax(axis=1).astype(object)
df_league_2['Match_Result'] = df_league_2[class_names].idxmax(axis=1).astype(object)
df_league_3['Match_Result'] = df_league_3[class_names].idxmax(axis=1).astype(object)

In [14]:
label_encoder = LabelEncoder()
df_all['Match_Result'] = label_encoder.fit_transform(df_all['Match_Result'])
df_league_0['Match_Result'] = label_encoder.fit_transform(df_league_0['Match_Result'])
df_league_1['Match_Result'] = label_encoder.fit_transform(df_league_1['Match_Result'])
df_league_2['Match_Result'] = label_encoder.fit_transform(df_league_2['Match_Result'])
df_league_3['Match_Result'] = label_encoder.fit_transform(df_league_3['Match_Result'])


In [15]:
columns_to_drop = ['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals']
df_all.drop(columns=class_names + columns_to_drop, axis=1, inplace=True)
df_league_0.drop(columns=class_names + columns_to_drop, axis=1, inplace=True)
df_league_1.drop(columns=class_names + columns_to_drop, axis=1, inplace=True)
df_league_2.drop(columns=class_names + columns_to_drop, axis=1, inplace=True)
df_league_3.drop(columns=class_names + columns_to_drop, axis=1, inplace=True)

In [16]:
def run_classification_pipeline(df, df_name, target, model_name, model, param_dist, test_size=0.3, random_state=42):
    # Split data into train and test
    df = df.sort_values(by='Match_Date', ascending=True)
    
    train_split = df[df['Season_2122'] != 1]
    test_split = df[df['Season_2122'] == 1]
    
    # X = train_split.drop(columns=[target] + columns_to_drop, axis=1)
    # y = test_split[target]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size, random_state=random_state)

    X_train = train_split.drop(columns=target, axis=1)
    y_train = train_split[target]
    
    X_test = test_split.drop(columns=target, axis=1)
    y_test = test_split[target]

    # Define TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # Perform random search
    random_search = RandomizedSearchCV(model, param_dist, n_iter=5, cv=tscv, scoring='recall', random_state=random_state)
    random_search.fit(X_train, y_train)

    # Evaluate model
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Save the model to a file
    with open(f'../models/classification/{df_name}_{model_name}.pkl', 'wb') as file:
        pickle.dump(best_model, file)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division='warn')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    # confusion = confusion_matrix(y_test, y_pred)

    # Save results
    result = {
        'Model': model_name,
        'Best Parameters': random_search.best_params_ if param_dist else "No hyperparameter tuning",
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        # 'Confusion Matrix': confusion
    }

    return pd.DataFrame([result])


In [17]:
# Define models and hyperparameter distributions for classification
classification_models = [
    # ('Linear SVM', LinearSVC(multi_class='ovr')),
    # ('Logistic Regression', LogisticRegression(multi_class='ovr')),
    # ('Logistic Regression CV', LogisticRegressionCV(multi_class='ovr')),
    ('SGD Classifier', SGDClassifier()),
    ('Passive Aggressive Classifier', PassiveAggressiveClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('K Neighbors Classifier', KNeighborsClassifier()),
    ('Neural Network Classification', MLPClassifier())
]

classification_param_dist = {
    'Linear SVM': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'penalty': ['l2'],
        'loss': ['hinge', 'squared_hinge'],
        'max_iter': [1000, 2000, 3000],
    },

    'Logistic Regression': {
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'max_iter': [50, 100, 200, 500],
    },

    'Logistic Regression CV': {'Cs': [1, 3, 5],
                               'max_iter': [100, 500, 1000],
                               'cv': [3, 5, 10]},

    'SGD Classifier': {'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                       'max_iter': [100, 500, 1000],
                       'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']},


    'Passive Aggressive Classifier': {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                                      'max_iter': [100, 500, 1000],
                                      'loss': ['hinge', 'squared_hinge']},

    'Random Forest': {'n_estimators': [10, 50, 100],
                      'max_depth': [None, 10, 20],
                      'min_samples_split': [2, 5, 10],
                      'max_features': ['sqrt', 'log2', None]},

    'XGBoost': {'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'min_child_weight': [1, 3, 5]},

    'K Neighbors Classifier': {'n_neighbors': [3, 5, 10, 20],
                               'weights': ['uniform', 'distance'],
                               'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                               'leaf_size': [30, 50, 100],
                               'p': [1, 2]},
    
    'Neural Network Classification': {'hidden_layer_sizes': [(50, 50), (100, 50, 25)],
                                      'activation': ['relu', 'tanh'],
                                      'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                                      'max_iter': [200]}
}


In [18]:
dfs = [
    (df_all, 'df_all'),
    (df_league_0, 'df_league_0'),
    (df_league_1, 'df_league_1'),
    (df_league_2, 'df_league_2'),
    (df_league_3, 'df_league_3')
]

# target = ['Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H']
target = 'Match_Result'

# Run the pipeline for each classification model
for df, df_name in dfs:
    result_df = pd.DataFrame()
    for model_name, model in tqdm(classification_models, desc=f'Processing dataframe {df_name}'):
        result = run_classification_pipeline(df, df_name, target, model_name, model, classification_param_dist[model_name], test_size=0.3, random_state=42)
        result_df = pd.concat([result_df, result], ignore_index=True)
        display(result_df)
    print('\nDataframe: ', df_name)
    display(result_df.sort_values(by='F1 Score', ascending=False))


Traceback (most recent call last):
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 211, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/Users/maksim/

InvalidParameterError: The 'loss' parameter of SGDClassifier must be a str among {'modified_huber', 'log_loss', 'perceptron', 'squared_hinge', 'hinge', 'epsilon_insensitive', 'squared_error', 'squared_epsilon_insensitive', 'huber'}. Got 'log' instead.