In [46]:
%reset

In [47]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

import pandas as pd
from tqdm import tqdm
import pickle


In [48]:
df_all = pd.read_csv('../data/for_train/train_df.csv')
df_league_0 = pd.read_csv('../data/for_train/df_league_0.csv')
df_league_1 = pd.read_csv('../data/for_train/df_league_1.csv')
df_league_2 = pd.read_csv('../data/for_train/df_league_2.csv')
df_league_3 = pd.read_csv('../data/for_train/df_league_3.csv')


In [49]:
result_columns = ['Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H']
df_all['Match_Result'] = df_all[result_columns].idxmax(axis=1)


In [50]:
df_all.drop(columns=result_columns, axis=1, inplace=True)

In [51]:
label_encoder = LabelEncoder()
df_all['Match_Result'] = label_encoder.fit_transform(df_all['Match_Result'])

In [52]:
# df_encoded = pd.get_dummies(df_all, columns=['Match_Result'], prefix='Match_Result')

In [53]:
# def encode_result_column(df):
#     result_columns = ['Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H']
#     encoding_scheme = {'Full_Time_Result_A': 1, 'Full_Time_Result_D': 2, 'Full_Time_Result_H': 0}
#     df['Encoded_Result'] = df[result_columns].idxmax(axis=1)
#     df['Encoded_Result'] = df['Encoded_Result'].map(encoding_scheme)
#     return df
# 
# # Dataframes to apply the function
# dfs_to_encode = [
#     (df_all, 'df_all'),
#     (df_league_0, 'df_league_0'),
#     (df_league_1, 'df_league_1'),
#     (df_league_2, 'df_league_2'),
#     (df_league_3, 'df_league_3')
# ]
# 
# # Apply the function to each dataframe
# df_all = encode_result_column(df_all)
# df_league_0 = encode_result_column(df_league_0)
# df_league_1 = encode_result_column(df_league_1)
# df_league_2 = encode_result_column(df_league_2)
# df_league_3 = encode_result_column(df_league_3)



In [54]:
def run_classification_pipeline(df, df_name, target, model_name, model, param_dist, test_size=0.3, random_state=42):
    # Split data into train and test
    # columns_to_drop = ['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals',
    #                    'Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H']

    columns_to_drop = ['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals']

    X = df.drop(columns=[target] + columns_to_drop, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size, random_state=random_state)

    # Define TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # Perform random search
    random_search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=tscv, scoring='accuracy', random_state=random_state)
    random_search.fit(X_train, y_train)

    # Evaluate model
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Save the model to a file
    with open(f'../models/classification/{df_name}_{model_name}.pkl', 'wb') as file:
        pickle.dump(best_model, file)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    confusion = confusion_matrix(y_test, y_pred)

    # Save results
    result = {
        'Model': model_name,
        'Best Parameters': random_search.best_params_ if param_dist else "No hyperparameter tuning",
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        # 'Confusion Matrix': confusion
    }

    return pd.DataFrame([result])


In [55]:
# Define models and hyperparameter distributions for classification
classification_models = [
    ('Logistic Regression', LogisticRegression()),
    ('Random Forest', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('Neural Network Classification', MLPClassifier())
]

classification_param_dist = {
    'Logistic Regression': {'C': [0.001, 0.01, 0.1, 1, 10, 100]},

    'Random Forest': {
        'n_estimators': [10, 50, 100],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10],
        'max_features': ['sqrt', 'log2', None]},

    'XGBoost': {'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'min_child_weight': [1, 3, 5]},

    'Neural Network Classification': {
        'hidden_layer_sizes': [(50, 50), (100, 50, 25)],
        'activation': ['relu', 'tanh'],
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'max_iter': [200]}

}


dfs = [
    # (df_encoded, 'df_encoded')
    (df_all, 'df_all'),
    # (df_league_0, 'df_league_0'),
    # (df_league_1, 'df_league_1'),
    # (df_league_2, 'df_league_2'),
    # (df_league_3, 'df_league_3')
]

target = 'Match_Result'


# Run the pipeline for each classification model
for df, df_name in dfs:
    result_df = pd.DataFrame()
    for model_name, model in tqdm(classification_models, desc=f'Processing dataframe {df_name}'):
        result = run_classification_pipeline(df, df_name, target, model_name, model, classification_param_dist[model_name], test_size=0.3, random_state=42)
        result_df = pd.concat([result_df, result], ignore_index=True)
    print('\nDataframe: ', df_name)
    display(result_df.sort_values(by='Accuracy', ascending=False))


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Processing dataframe df_all: 100%|██████████| 4/4 [20:50<00:00, 312.52s/it]


Dataframe:  df_all





Unnamed: 0,Model,Best Parameters,Accuracy,Precision,Recall,F1 Score
2,XGBoost,"{'n_estimators': 100, 'min_child_weight': 1, '...",0.559185,0.525743,0.559185,0.506055
1,Random Forest,"{'n_estimators': 50, 'min_samples_split': 5, '...",0.55036,0.515892,0.55036,0.518075
0,Logistic Regression,{'C': 0.001},0.430774,0.185566,0.430774,0.259393
3,Neural Network Classification,"{'max_iter': 200, 'hidden_layer_sizes': (50, 5...",0.430774,0.185566,0.430774,0.259393
