# Regression task
## Before you start

### TODO steps:

1. read csv  

2. write pipeline: split data, train model, predict, evaluate  

3. models: 
    LinearRegression(), 
    RandomForestRegressor(), 
    GradientBoostingRegressor() (xgboost), 
    SVR(), 
    neural network

4. hyperparameter tuning: 
    GridSearchCV, 
    RandomizedSearchCV

5. Don't forget about time-series cross-validation (`TimeSeriesSplit`)

## Import libraries

In [9]:
# IPython magic: clears the user-defined names from the interactive namespace
# so the notebook starts from a clean state (without -f it asks for confirmation).
%reset

In [10]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import PoissonRegressor, TweedieRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit, ARDRegression



from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

import warnings

# Notebook-wide switch: suppress every warning category (use with caution,
# this also hides convergence and deprecation warnings from the libraries).
warnings.simplefilter("ignore")

# Show floats with exactly three decimals when pandas renders a frame/series.
pd.set_option('display.float_format', '{:.3f}'.format)


## Read data

In [20]:
# Load the combined training frame and the four per-league frames in one pass;
# the unpacking order matches the order of the paths below.
df_all, df_league_0, df_league_1, df_league_2, df_league_3 = map(
    pd.read_csv,
    (
        '../data/for_train/train_df.csv',
        '../data/for_train/df_league_0.csv',
        '../data/for_train/df_league_1.csv',
        '../data/for_train/df_league_2.csv',
        '../data/for_train/df_league_3.csv',
    ),
)


In [21]:
# Create target: total goals per match = home goals + away goals, stored as int.
# The column is added in place on each of the five loaded frames.
for _frame in (df_all, df_league_0, df_league_1, df_league_2, df_league_3):
    _total_goals = _frame['Full_Time_Home_Team_Goals'] + _frame['Full_Time_Away_Team_Goals']
    _frame['Full_Time_Total_Goals'] = _total_goals.astype(int)

In [13]:
# df_league_0['League'].value_counts()

## Pipeline

In [14]:
def run_pipeline(df, df_name, target, model_name, model, param_dist, test_size=0.3, random_state=42,
                 *, n_iter=50, n_splits=5, model_dir='../models/regression'):
    """Tune, evaluate and persist one regressor on one dataframe.

    Steps:
      1. Sort the rows by ``Match_Date`` and drop every ``HomeTeam_*`` /
         ``AwayTeam_*`` column.
      2. Split by season flag: rows with ``Season_2122 == 1`` form the test
         set, all remaining rows form the training set.
      3. Run ``RandomizedSearchCV`` (scored by negative MAE) with a
         ``TimeSeriesSplit`` on the training set.
      4. Predict the test set with the best estimator, round the predictions
         to integers and compute MSE / MAE / R2 on the rounded values.
      5. Pickle the best estimator to ``{model_dir}/{df_name}_{model_name}.pkl``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Match_Date``, ``Season_2122``, ``target`` and the
        columns listed in ``leakage_columns`` below.
    df_name : str
        Label of the dataframe; used in the pickle file name.
    target : str
        Name of the column to predict.
    model_name : str
        Label of the model; used in the pickle file name and the result row.
    model : estimator
        Unfitted scikit-learn compatible regressor.
    param_dist : dict
        Parameter distributions for the random search. An empty dict means
        the estimator is evaluated with its current parameters only.
    test_size : float
        Unused. Kept only so existing calls keep working; the split is made
        by season flag, not by fraction.
    random_state : int
        Seed for the random search's parameter sampling.
    n_iter : int, keyword-only
        Number of parameter settings sampled by the search.
    n_splits : int, keyword-only
        Number of ``TimeSeriesSplit`` folds.
    model_dir : str, keyword-only
        Directory the pickled estimator is written to (created if missing).

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``Model``, ``Best Parameters``, ``MSE``,
        ``MAE`` and ``R2``.

    Raises
    ------
    ValueError
        If either the training or the test split is empty.
    """
    import os  # function-local: the notebook's import cell does not import os

    # Columns removed from the feature matrix. The two goal columns add up to
    # the total-goals target built earlier in the notebook, so they would leak it.
    # NOTE(review): the Full_Time_Result_* columns look like one-hot match
    # results (also post-match information) and the last two look like
    # reference dummy levels -- confirm against the feature-engineering step.
    leakage_columns = ['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals',
                       'Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H',
                       'has_favorite_None', 'Time_kick_off_bin_Morning']
    test_flag = 'Season_2122'

    # Chronological order is required for TimeSeriesSplit to be meaningful.
    df = df.sort_values(by='Match_Date', ascending=True)

    team_columns = df.columns[df.columns.str.startswith('HomeTeam_') | df.columns.str.startswith('AwayTeam_')]
    df = df.drop(columns=team_columns)

    # Season-based hold-out: flagged rows are the test set, the rest is train.
    in_test_season = df[test_flag] == 1
    train_df = df[~in_test_season]
    test_df = df[in_test_season]

    # Fail fast with a clear message instead of an opaque sklearn error after
    # (or in the middle of) a long hyperparameter search.
    if train_df.empty or test_df.empty:
        raise ValueError(
            f"Cannot split '{df_name}' on {test_flag}: "
            f"{len(train_df)} training rows and {len(test_df)} test rows."
        )

    non_feature_columns = [target] + leakage_columns
    X_train = train_df.drop(columns=non_feature_columns)
    y_train = train_df[target]
    X_test = test_df.drop(columns=non_feature_columns)
    y_test = test_df[target]

    # Define TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Perform random search
    random_search = RandomizedSearchCV(model, param_dist, n_iter=n_iter, cv=tscv,
                                       scoring='neg_mean_absolute_error',
                                       random_state=random_state, verbose=0)
    random_search.fit(X_train, y_train)

    # Evaluate model
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Round the predicted values to integers (the target is a goal count)
    y_pred_rounded = y_pred.round(decimals=0).astype(int)

    # Save the model to a file; make sure the directory exists so a finished
    # search is not lost to a FileNotFoundError.
    os.makedirs(model_dir, exist_ok=True)
    with open(f'{model_dir}/{df_name}_{model_name}.pkl', 'wb') as file:
        pickle.dump(best_model, file)

    # Calculate metrics on rounded predictions
    mse = mean_squared_error(y_test, y_pred_rounded)
    mae = mean_absolute_error(y_test, y_pred_rounded)
    r2 = r2_score(y_test, y_pred_rounded)

    # Save results
    result = {
        'Model': model_name,
        'Best Parameters': random_search.best_params_ if param_dist else "No hyperparameter tuning",
        'MSE': mse,
        'MAE': mae,
        'R2': r2
    }

    return pd.DataFrame([result])


In [15]:
# Define models and hyperparameter distributions
#
# `models` is a list of (label, unfitted estimator) pairs; `param_dist` maps
# each label to the search space handed to RandomizedSearchCV. An empty dict
# means "evaluate the estimator as configured, no tuning".

# Seed for the estimators that are stochastic themselves, so repeated runs
# (and the pickled models) are reproducible. The value matches the
# random_state the driver loop passes to run_pipeline for the search.
MODEL_SEED = 4

models = [
    ('Linear Regression', LinearRegression()),
    # random_state only matters for the 'sag' / 'saga' solvers in the grid below.
    ('Ridge Regression', Ridge(random_state=MODEL_SEED)),
    ('Lasso Regression', Lasso()),
    ('Random Forest Regression', RandomForestRegressor(random_state=MODEL_SEED)),
    ('XGBoost Regression', XGBRegressor(random_state=MODEL_SEED)),
    ('ARD Regression', ARDRegression()),
    # Fixed-parameter ensemble of the four tuned model families (not tuned itself).
    ('Voting Regression', VotingRegressor(estimators=[
        ('ridge', Ridge(solver='cholesky', fit_intercept=False, alpha=10)),
        ('lasso', Lasso(tol=0.0001, max_iter=500, fit_intercept=True, alpha=0.001)),
        ('xgb', XGBRegressor(n_estimators=100, min_child_weight=3, max_depth=3, learning_rate=0.1,
                             random_state=MODEL_SEED)),
        ('ard', ARDRegression(tol=0.1, max_iter=200, lambda_2=1e-10, lambda_1=1e-08, alpha_2=1e-09, alpha_1=1e-08)),
    ])),
]

param_dist = {
    'Linear Regression': {'fit_intercept': [True, False]},

    'Ridge Regression': {'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
                         'fit_intercept': [True, False],
                         'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
                         'max_iter': [10, 50, 100, 500, 1000]},

    'Lasso Regression': {'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
                         'fit_intercept': [True, False],
                         'max_iter': [10, 50, 100, 500, 1000],
                         'tol': [1e-4, 1e-3, 1e-2, 1e-1, 1]},

    'Random Forest Regression': {'n_estimators': [10, 50, 100],
                                 'max_depth': [None, 10, 20],
                                 'min_samples_split': [2, 5, 10]},

    'XGBoost Regression': {
        'n_estimators': [50, 100, 300],
        'max_depth': [2, 3, 5, 10, 15],
        'learning_rate': [0.05, 0.1, 0.15, 0.20],
        'min_child_weight': [1, 2, 3, 4]
    },

    # NOTE: no 'Orthogonal Matching Pursuit' entry exists in `models`, so this
    # grid is currently unused; it is kept for when that model is added back.
    'Orthogonal Matching Pursuit': {'n_nonzero_coefs': [None, 5, 10, 20],
                                    'tol': [1e-4, 1e-3, 1e-2, 1e-1, 1]},

    'ARD Regression': {'max_iter': [100, 200, 300],
                       'tol': [1e-4, 1e-3, 1e-2, 1e-1, 1],
                       'alpha_1': [1e-10, 1e-9, 1e-8, 1e-7, 1e-6],
                       'alpha_2': [1e-10, 1e-9, 1e-8, 1e-7, 1e-6],
                       'lambda_1': [1e-10, 1e-9, 1e-8, 1e-7, 1e-6],
                       'lambda_2': [1e-10, 1e-9, 1e-8, 1e-7, 1e-6]},

    # Evaluated as configured above; nothing to search over.
    'Voting Regression': {}
}


In [16]:

# Name of the column every model is trained to predict.
target = 'Full_Time_Total_Goals'

# (frame, label) pairs the experiment is run on.
dfs = [
    (df_all, 'df_all'),
    (df_league_0, 'df_league_0'),
    (df_league_1, 'df_league_1'),
    (df_league_2, 'df_league_2'),
    (df_league_3, 'df_league_3')
]

# Collects one (label, results table) pair per frame.
final_results = []

# For every frame: tune and evaluate each model, then show the models ranked by MAE.
for df, df_name in dfs:
    progress = tqdm(models, desc=f'Processing dataframe {df_name}')
    per_model_rows = [
        run_pipeline(df, df_name, target, name, estimator, param_dist[name], test_size=0.3, random_state=4)
        for name, estimator in progress
    ]
    # Leading empty frame keeps the result identical to accumulating the rows
    # one by one onto an initially empty DataFrame.
    result_df = pd.concat([pd.DataFrame(), *per_model_rows], ignore_index=True)
    final_results.append((df_name, result_df))
    print('\nDataframe: ', df_name)
    display(result_df.sort_values(by='MAE', ascending=True))



Processing dataframe df_league_0: 100%|██████████| 7/7 [01:05<00:00,  9.31s/it]


Dataframe:  df_league_0





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
4,XGBoost Regression,"{'n_estimators': 100, 'min_child_weight': 1, '...",2.177,1.17,0.168
5,ARD Regression,"{'tol': 0.1, 'max_iter': 300, 'lambda_2': 1e-0...",2.23,1.175,0.148
2,Lasso Regression,"{'tol': 0.01, 'max_iter': 500, 'fit_intercept'...",2.253,1.179,0.139
6,Voting Regression,No hyperparameter tuning,2.228,1.18,0.148
1,Ridge Regression,"{'solver': 'cholesky', 'max_iter': 10, 'fit_in...",2.255,1.187,0.138
0,Linear Regression,{'fit_intercept': True},2.25,1.192,0.14
3,Random Forest Regression,"{'n_estimators': 100, 'min_samples_split': 10,...",2.405,1.211,0.081


Processing dataframe df_league_1: 100%|██████████| 7/7 [05:53<00:00, 50.50s/it]


Dataframe:  df_league_1





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
5,ARD Regression,"{'tol': 1, 'max_iter': 100, 'lambda_2': 1e-06,...",2.252,1.141,0.182
2,Lasso Regression,"{'tol': 1, 'max_iter': 1000, 'fit_intercept': ...",2.254,1.141,0.182
1,Ridge Regression,"{'solver': 'svd', 'max_iter': 1000, 'fit_inter...",2.247,1.145,0.184
4,XGBoost Regression,"{'n_estimators': 50, 'min_child_weight': 2, 'm...",2.27,1.147,0.176
6,Voting Regression,No hyperparameter tuning,2.255,1.148,0.181
3,Random Forest Regression,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.424,1.193,0.12
0,Linear Regression,{'fit_intercept': True},2.716,1.299,0.014


Processing dataframe df_league_2: 100%|██████████| 7/7 [03:25<00:00, 29.41s/it]


Dataframe:  df_league_2





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
5,ARD Regression,"{'tol': 0.0001, 'max_iter': 300, 'lambda_2': 1...",2.04,1.084,0.161
2,Lasso Regression,"{'tol': 0.01, 'max_iter': 500, 'fit_intercept'...",2.041,1.089,0.16
1,Ridge Regression,"{'solver': 'svd', 'max_iter': 100, 'fit_interc...",2.032,1.09,0.164
6,Voting Regression,No hyperparameter tuning,2.028,1.09,0.166
0,Linear Regression,{'fit_intercept': True},2.072,1.1,0.148
4,XGBoost Regression,"{'n_estimators': 100, 'min_child_weight': 1, '...",2.081,1.11,0.144
3,Random Forest Regression,"{'n_estimators': 100, 'min_samples_split': 10,...",2.293,1.169,0.057


Processing dataframe df_league_3: 100%|██████████| 7/7 [01:07<00:00,  9.60s/it]


Dataframe:  df_league_3





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
1,Ridge Regression,"{'solver': 'cholesky', 'max_iter': 10, 'fit_in...",1.962,1.068,0.198
2,Lasso Regression,"{'tol': 0.01, 'max_iter': 500, 'fit_intercept'...",1.954,1.068,0.202
6,Voting Regression,No hyperparameter tuning,1.947,1.075,0.204
5,ARD Regression,"{'tol': 0.01, 'max_iter': 300, 'lambda_2': 1e-...",1.982,1.079,0.19
4,XGBoost Regression,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",2.023,1.083,0.173
0,Linear Regression,{'fit_intercept': True},2.082,1.097,0.149
3,Random Forest Regression,"{'n_estimators': 50, 'min_samples_split': 10, ...",2.055,1.107,0.16


In [17]:
# Summary statistics of the target column in df_league_0.
# NOTE(review): presumably a reference point for judging the error metrics
# reported by the pipeline runs -- confirm.
df_league_0[target].describe()

count   2889.000
mean       2.666
std        1.620
min        0.000
25%        2.000
50%        3.000
75%        4.000
max       10.000
Name: Full_Time_Total_Goals, dtype: float64

In [22]:
def drop_columns_and_teams(df):
    """Return a copy of ``df`` without the goal/reference columns and team dummies.

    Removes the four fixed columns listed below plus every column whose name
    starts with ``HomeTeam_`` or ``AwayTeam_``. The input frame is left
    untouched. Raises ``KeyError`` if one of the fixed columns is missing.
    """
    fixed_columns = [
        'Full_Time_Home_Team_Goals',
        'Full_Time_Away_Team_Goals',
        'has_favorite_None',
        'Time_kick_off_bin_Morning',
    ]

    # Boolean masks over the column index, one per team-dummy prefix.
    is_home_team = df.columns.str.startswith('HomeTeam_')
    is_away_team = df.columns.str.startswith('AwayTeam_')
    team_columns = df.columns[is_home_team | is_away_team]

    # Fixed columns first, then the team dummies.
    return df.drop(columns=fixed_columns).drop(columns=team_columns)

# Pair every loaded frame with its label (same pairs and order as before).
dfs = list(zip(
    (df_all, df_league_0, df_league_1, df_league_2, df_league_3),
    ('df_all', 'df_league_0', 'df_league_1', 'df_league_2', 'df_league_3'),
))

# Cleaned counterpart of `dfs`: each frame is passed through
# drop_columns_and_teams and paired with the same label.
dfs_cleaned = [(drop_columns_and_teams(frame), label) for frame, label in dfs]


In [25]:
# for df_cleaned, name in dfs_cleaned:
#     df_cleaned.to_csv(f'../data/for_train/cleaned/{name}_cleaned.csv', index=False)