# Regression task
## Before start

### TODO steps:

1. read csv  

2. write pipeline: split data, train model, predict, evaluate  

3. models: 
    LinearRegression(), 
    RandomForestRegressor(), 
    GradientBoostingRegressor() (xgboost), 
    SVR(), 
    neural network

4. hyperparameter tuning: 
    GridSearchCV, 
    RandomizedSearchCV

5. dont forget about time series cross validation

## Import libraries

In [11]:
%reset

In [12]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor


from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



## Read data

In [13]:
df_all = pd.read_csv('../data/for_train/train_df.csv')
df_league_0 = pd.read_csv('../data/for_train/df_league_0.csv')
df_league_1 = pd.read_csv('../data/for_train/df_league_1.csv')
df_league_2 = pd.read_csv('../data/for_train/df_league_2.csv')
df_league_3 = pd.read_csv('../data/for_train/df_league_3.csv')


In [14]:
# Create target
df_all['Full_Time_Total_Goals'] = df_all['Full_Time_Home_Team_Goals'] + df_all['Full_Time_Away_Team_Goals']
df_league_0['Full_Time_Total_Goals'] = df_league_0['Full_Time_Home_Team_Goals'] + df_league_0['Full_Time_Away_Team_Goals']
df_league_1['Full_Time_Total_Goals'] = df_league_1['Full_Time_Home_Team_Goals'] + df_league_1['Full_Time_Away_Team_Goals']
df_league_2['Full_Time_Total_Goals'] = df_league_2['Full_Time_Home_Team_Goals'] + df_league_2['Full_Time_Away_Team_Goals']
df_league_3['Full_Time_Total_Goals'] = df_league_3['Full_Time_Home_Team_Goals'] + df_league_3['Full_Time_Away_Team_Goals']

## Pipeline

In [15]:
def run_pipeline(df, df_name, target, model_name, model, param_dist, test_size=0.3, random_state=42):
    # Split data into train and test
    columns_to_drop = ['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals',
                       'Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H',]

    X = df.drop(columns=[target] + columns_to_drop, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Define TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # Perform random search
    random_search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=tscv, scoring='neg_mean_squared_error', random_state=random_state)
    random_search.fit(X_train, y_train)

    # Evaluate model
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Save the model to a file
    with open(f'../models/{df_name}_{model_name}.pkl', 'wb') as file:
        pickle.dump(best_model, file)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Save results
    result = {
        'Model': model_name,
        'Best Parameters': random_search.best_params_ if param_dist else "No hyperparameter tuning",
        'MSE': mse,
        'MAE': mae,
        'R2': r2
    }

    return pd.DataFrame([result])


In [16]:
# Define models and hyperparameter distributions
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor()),
    ('XGBoost', XGBRegressor())
]

param_dist = {
    'Linear Regression': {'fit_intercept': [True, False]},

    'Random Forest': {'n_estimators': [10, 50, 100],
                      'max_depth': [None, 10, 20],
                      'min_samples_split': [2, 5, 10]},


    'XGBoost': {'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'min_child_weight': [1, 3, 5]},

}


dfs = [
    (df_all, 'df_all'),
    (df_league_0, 'df_league_0'),
    (df_league_1, 'df_league_1'),
    (df_league_2, 'df_league_2'),
    (df_league_3, 'df_league_3')
]

target = 'Full_Time_Total_Goals'

# Run the pipeline for each model
for df, df_name in dfs:
    result_df = pd.DataFrame()
    for model_name, model in tqdm(models, desc=f'Processing dataframe {df_name}'):
        result = run_pipeline(df, df_name, target, model_name, model, param_dist[model_name], test_size=0.3, random_state=42)
        result_df = pd.concat([result_df, result], ignore_index=True)
    print('\nDataframe: ', df_name)
    display(result_df.sort_values(by='MSE', ascending=True))

# Sort results by MSE
# result_df.sort_values(by='MSE', ascending=True)


Processing dataframe df_all: 100%|██████████| 1/1 [04:48<00:00, 288.37s/it]


Dataframe:  df_all





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Neural Network Regression,"{'max_iter': 200, 'learning_rate_init': 0.1, '...",8.059147,2.363497,-1.96934


Processing dataframe df_league_0: 100%|██████████| 1/1 [00:56<00:00, 56.64s/it]


Dataframe:  df_league_0





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Neural Network Regression,"{'max_iter': 200, 'learning_rate_init': 0.1, '...",2.521295,1.270696,-0.002012


Processing dataframe df_league_1: 100%|██████████| 1/1 [02:59<00:00, 179.20s/it]


Dataframe:  df_league_1





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Neural Network Regression,"{'max_iter': 100, 'learning_rate_init': 0.01, ...",134.835373,11.483359,-48.668488


Processing dataframe df_league_2: 100%|██████████| 1/1 [02:00<00:00, 120.81s/it]


Dataframe:  df_league_2





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Neural Network Regression,"{'max_iter': 200, 'learning_rate_init': 0.1, '...",2.621736,1.29412,-0.020702


Processing dataframe df_league_3: 100%|██████████| 1/1 [01:04<00:00, 64.83s/it]


Dataframe:  df_league_3





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Neural Network Regression,"{'max_iter': 100, 'learning_rate_init': 0.01, ...",309152700.0,17574.915207,-114042100.0


In [17]:
df_all[target].describe()

count    32862.000000
mean         2.645944
std          1.637276
min          0.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         13.000000
Name: Full_Time_Total_Goals, dtype: float64