# Regression task
## Before start

### TODO steps:

1. read csv  

2. write pipeline: split data, train model, predict, evaluate  

3. models: 
    LinearRegression(), 
    RandomForestRegressor(), 
    GradientBoostingRegressor() (xgboost), 
    SVR(), 
    neural network

4. hyperparameter tuning: 
    GridSearchCV, 
    RandomizedSearchCV

5. dont forget about time series cross validation

## Import libraries

In [1]:
%reset

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import PoissonRegressor, TweedieRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import OrthogonalMatchingPursuit, ARDRegression



from sklearn.model_selection import train_test_split, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



## Read data

In [3]:
df_all = pd.read_csv('../data/for_train/train_df.csv')
df_league_0 = pd.read_csv('../data/for_train/df_league_0.csv')
df_league_1 = pd.read_csv('../data/for_train/df_league_1.csv')
df_league_2 = pd.read_csv('../data/for_train/df_league_2.csv')
df_league_3 = pd.read_csv('../data/for_train/df_league_3.csv')


In [4]:
# Create target
df_all['Full_Time_Total_Goals'] = (df_all['Full_Time_Home_Team_Goals'] + df_all['Full_Time_Away_Team_Goals']).astype(int)
df_league_0['Full_Time_Total_Goals'] = (df_league_0['Full_Time_Home_Team_Goals'] + df_league_0['Full_Time_Away_Team_Goals']).astype(int)
df_league_1['Full_Time_Total_Goals'] = (df_league_1['Full_Time_Home_Team_Goals'] + df_league_1['Full_Time_Away_Team_Goals']).astype(int)
df_league_2['Full_Time_Total_Goals'] = (df_league_2['Full_Time_Home_Team_Goals'] + df_league_2['Full_Time_Away_Team_Goals']).astype(int)
df_league_3['Full_Time_Total_Goals'] = (df_league_3['Full_Time_Home_Team_Goals'] + df_league_3['Full_Time_Away_Team_Goals']).astype(int)

In [5]:
# test = df_league_0.copy()
# teams_to_drop = test.columns[test.columns.str.startswith('HomeTeam_') | test.columns.str.startswith('AwayTeam_')]
# test = test.drop(columns=teams_to_drop)

## Pipeline

In [6]:
def run_pipeline(df, df_name, target, model_name, model, param_dist, test_size=0.3, random_state=42):
    # Split data into train and test
    columns_to_drop = ['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals',
                       'Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H',]

    df = df.sort_values(by='Match_Date', ascending=True)

    teams_to_drop = df.columns[df.columns.str.startswith('HomeTeam_') | df.columns.str.startswith('AwayTeam_')]
    df = df.drop(columns=teams_to_drop)
    
    X = df.drop(columns=[target] + columns_to_drop, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    

    # train_split = df[df['Season_2122'] != 1]
    # test_split = df[df['Season_2122'] == 1]
    # 
    # X_train = train_split.drop(columns=[target] + columns_to_drop, axis=1)
    # y_train = train_split[target]
    # 
    # X_test = test_split.drop(columns=[target] + columns_to_drop, axis=1)
    # y_test = test_split[target]

    # Define TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # Perform random search
    random_search = RandomizedSearchCV(model, param_dist, n_iter=5, cv=tscv, scoring='neg_mean_squared_error', random_state=random_state)
    random_search.fit(X_train, y_train)

    # Evaluate model
    best_model = random_search.best_estimator_
    y_pred = best_model.predict(X_test)

    # Save the model to a file
    with open(f'../models/regression/{df_name}_{model_name}.pkl', 'wb') as file:
        pickle.dump(best_model, file)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Save results
    result = {
        'Model': model_name,
        'Best Parameters': random_search.best_params_ if param_dist else "No hyperparameter tuning",
        'MSE': mse,
        'MAE': mae,
        'R2': r2
    }

    return pd.DataFrame([result])


In [7]:
# Define models and hyperparameter distributions
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('Elastic Net Regression', ElasticNet()),
    ('Decision Tree Regression', DecisionTreeRegressor()),
    ('Random Forest Regression', RandomForestRegressor()),
    ('Gradient Boosting Regression', GradientBoostingRegressor()),
    ('K-Nearest Neighbors Regression', KNeighborsRegressor()),
    ('Neural Network Regression', MLPRegressor()),
    ('Huber Regression', HuberRegressor()),
    ('AdaBoost Regression', AdaBoostRegressor()),
    ('Bagging Regression', BaggingRegressor()),
    ('Extra Trees Regression', ExtraTreesRegressor()),
    ('Voting Regression', VotingRegressor(estimators=[('lr', LinearRegression()), ('rf', RandomForestRegressor()), ('xgb', XGBRegressor())])),
    ('XGBoost Regression', XGBRegressor()),
    ('Orthogonal Matching Pursuit', OrthogonalMatchingPursuit()),
    ('ARD Regression', ARDRegression()),
]

param_dist = {
    'Linear Regression': {'fit_intercept': [True, False]},

    'Ridge Regression': {'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
                         'fit_intercept': [True, False],
                         'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']},

    'Lasso Regression': {'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
                         'fit_intercept': [True, False],
                         'max_iter': [100, 500, 1000, 2000, 5000],
                         'tol': [1e-4, 1e-3, 1e-2, 1e-1, 1]},

    'Elastic Net Regression': {'alpha': [0.001, 0.01, 0.1, 1, 10, 100],
                               'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
                               'fit_intercept': [True, False],
                               'max_iter': [100, 500, 1000, 2000, 5000],
                               'tol': [1e-4, 1e-3, 1e-2, 1e-1, 1]},

    'Decision Tree Regression': {'max_depth': [None, 10, 20, 30, 40, 50],
                                 'min_samples_split': [2, 5, 10],
                                 'min_samples_leaf': [1, 2, 4]},

    'Random Forest Regression': {'n_estimators': [10, 50, 100],
                                 'max_depth': [None, 10, 20],
                                 'min_samples_split': [2, 5, 10]},

    'Gradient Boosting Regression': {'n_estimators': [10, 50, 100],
                                     'learning_rate': [0.01, 0.1, 0.2],
                                     'max_depth': [3, 5, 7],
                                     'min_samples_split': [2, 5, 10],
                                     'subsample': [0.8, 0.9, 1.0]},

    'Support Vector Regression': {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                                  'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
                                  'degree': [2, 3, 4, 5],
                                  'gamma': ['scale', 'auto']},

    'K-Nearest Neighbors Regression': {'n_neighbors': [5, 10, 15],
                                       'weights': ['uniform', 'distance'],
                                       'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                                       'p': [1, 2]},

    'Neural Network Regression': {'hidden_layer_sizes': [(50, 50), (100, 50, 25)],
                                  'activation': ['relu', 'tanh'],
                                  'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
                                  'max_iter': [200]},

    'Gaussian Process Regression': {'kernel': [DotProduct(), WhiteKernel()],
                                    'n_restarts_optimizer': [0, 1, 2],
                                    'normalize_y': [True, False]},

    'Huber Regression': {'epsilon': [1.1, 1.2, 1.35, 1.5],
                         'max_iter': [100, 200, 300, 500],
                         'alpha': [0.0001, 0.001, 0.01, 0.1, 1]},

    'Isotonic Regression': {'out_of_bounds': ['nan', 'clip']},

    'AdaBoost Regression': {'n_estimators': [50, 100, 200],
                            'learning_rate': [0.01, 0.1, 0.2, 0.5, 1],
                            'loss': ['linear', 'square', 'exponential']},

    'Bagging Regression': {'n_estimators': [10, 50, 100],
                           'max_samples': [0.5, 1.0, 2],
                           'max_features': [0.5, 1.0, 2],
                           'bootstrap_features': [True, False]},

    'Extra Trees Regression': {'n_estimators': [10, 50, 100],
                               'max_depth': [None, 10, 20],
                               'min_samples_split': [2, 5, 10]},

    'Voting Regression': {},

    'XGBoost Regression': {'n_estimators': [50, 100, 200],
                           'learning_rate': [0.01, 0.1, 0.2],
                           'max_depth': [3, 5, 7],
                           'min_child_weight': [1, 3, 5]},

    'LightGBM Regression': {'n_estimators': [50, 100, 200],
                            'learning_rate': [0.01, 0.1, 0.2],
                            'max_depth': [3, 5, 7],
                            'min_child_samples': [10, 20, 30]},

    'Orthogonal Matching Pursuit': {'n_nonzero_coefs': [None, 5, 10, 20],
                                    'tol': [1e-4, 1e-3, 1e-2, 1e-1, 1]},

    'ARD Regression': {'n_iter': [100, 200, 300],
                       'tol': [1e-4, 1e-3, 1e-2, 1e-1, 1],
                       'alpha_1': [1e-10, 1e-9, 1e-8],
                       'alpha_2': [1e-10, 1e-9, 1e-8],
                       'lambda_1': [1e-10, 1e-9, 1e-8],
                       'lambda_2': [1e-10, 1e-9, 1e-8]}
}


In [8]:
# teams_to_drop = df_league_0.columns[df_league_0.columns.str.startswith('HomeTeam_') | df_league_0.columns.str.startswith('AwayTeam_')]
# df_league_0 = df_league_0.drop(columns=teams_to_drop)

dfs = [
    # (df_all, 'df_all'),
    (df_league_0, 'df_league_0'),
    (df_league_1, 'df_league_1'),
    (df_league_2, 'df_league_2'),
    (df_league_3, 'df_league_3')
]

target = 'Full_Time_Total_Goals'

# Run the pipeline for each model
for df, df_name in dfs:
    result_df = pd.DataFrame()
    for model_name, model in tqdm(models, desc=f'Processing dataframe {df_name}'):
        result = run_pipeline(df, df_name, target, model_name, model, param_dist[model_name], test_size=0.3, random_state=42)
        result_df = pd.concat([result_df, result], ignore_index=True)
        # display(result_df)
    print('\nDataframe: ', df_name)
    display(result_df.sort_values(by='MAE', ascending=True))



  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n


Dataframe:  df_league_0





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
16,ARD Regression,"{'tol': 0.0001, 'n_iter': 200, 'lambda_2': 1e-...",2.15253,1.148498,0.21161
1,Ridge Regression,"{'solver': 'cholesky', 'fit_intercept': True, ...",2.182203,1.15542,0.200742
15,Orthogonal Matching Pursuit,"{'tol': 0.0001, 'n_nonzero_coefs': None}",2.194159,1.156389,0.196363
0,Linear Regression,{'fit_intercept': False},2.194159,1.156389,0.196363
2,Lasso Regression,"{'tol': 1, 'max_iter': 500, 'fit_intercept': T...",2.179401,1.15669,0.201768
14,XGBoost Regression,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",2.211982,1.16326,0.189835
12,Extra Trees Regression,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.281363,1.17346,0.164423
13,Voting Regression,No hyperparameter tuning,2.288233,1.182543,0.161907
6,Gradient Boosting Regression,"{'subsample': 0.8, 'n_estimators': 100, 'min_s...",2.310823,1.197664,0.153634
10,AdaBoost Regression,"{'n_estimators': 100, 'loss': 'exponential', '...",2.304362,1.197805,0.156


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n


Dataframe:  df_league_1





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Linear Regression,{'fit_intercept': True},2.082594,1.139504,0.24041
15,Orthogonal Matching Pursuit,"{'tol': 0.0001, 'n_nonzero_coefs': None}",2.082594,1.139504,0.24041
1,Ridge Regression,"{'solver': 'cholesky', 'fit_intercept': True, ...",2.082253,1.139538,0.240535
2,Lasso Regression,"{'tol': 1, 'max_iter': 500, 'fit_intercept': T...",2.082345,1.139771,0.240501
16,ARD Regression,"{'tol': 1, 'n_iter': 200, 'lambda_2': 1e-10, '...",2.084025,1.140186,0.239888
14,XGBoost Regression,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",2.100657,1.144224,0.233822
12,Extra Trees Regression,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.129296,1.150249,0.223377
13,Voting Regression,No hyperparameter tuning,2.145839,1.155565,0.217343
5,Random Forest Regression,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.152664,1.158835,0.214854
6,Gradient Boosting Regression,"{'subsample': 0.8, 'n_estimators': 100, 'min_s...",2.241285,1.190718,0.182531


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n


Dataframe:  df_league_2





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
16,ARD Regression,"{'tol': 1, 'n_iter': 200, 'lambda_2': 1e-10, '...",1.949812,1.114663,0.233506
1,Ridge Regression,"{'solver': 'cholesky', 'fit_intercept': True, ...",1.955704,1.115651,0.23119
0,Linear Regression,{'fit_intercept': True},1.95619,1.115736,0.230999
15,Orthogonal Matching Pursuit,"{'tol': 0.0001, 'n_nonzero_coefs': None}",1.95619,1.115736,0.230999
2,Lasso Regression,"{'tol': 1, 'max_iter': 500, 'fit_intercept': T...",1.954772,1.116342,0.231556
14,XGBoost Regression,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",1.977214,1.123927,0.222734
12,Extra Trees Regression,"{'n_estimators': 50, 'min_samples_split': 5, '...",1.992355,1.127241,0.216782
13,Voting Regression,No hyperparameter tuning,2.013184,1.129461,0.208594
5,Random Forest Regression,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.017197,1.133513,0.207016
6,Gradient Boosting Regression,"{'subsample': 0.8, 'n_estimators': 100, 'min_s...",2.102081,1.159461,0.173647


  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n_iter_ = orthogonal_mp_gram(
  coef_, self.n


Dataframe:  df_league_3





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
16,ARD Regression,"{'tol': 1, 'n_iter': 200, 'lambda_2': 1e-10, '...",1.884291,1.085836,0.268867
1,Ridge Regression,"{'solver': 'cholesky', 'fit_intercept': True, ...",1.89417,1.089758,0.265034
2,Lasso Regression,"{'tol': 1, 'max_iter': 500, 'fit_intercept': T...",1.889928,1.089839,0.26668
15,Orthogonal Matching Pursuit,"{'tol': 0.0001, 'n_nonzero_coefs': None}",1.902895,1.091099,0.261649
0,Linear Regression,{'fit_intercept': True},1.902895,1.091099,0.261649
14,XGBoost Regression,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",1.900589,1.096732,0.262544
5,Random Forest Regression,"{'n_estimators': 50, 'min_samples_split': 5, '...",1.964231,1.107567,0.23785
13,Voting Regression,No hyperparameter tuning,1.970114,1.107785,0.235567
12,Extra Trees Regression,"{'n_estimators': 50, 'min_samples_split': 5, '...",1.969823,1.116141,0.23568
6,Gradient Boosting Regression,"{'subsample': 0.8, 'n_estimators': 100, 'min_s...",2.054325,1.14747,0.202892


In [9]:
df_league_0[target].describe()

count    2889.000000
mean        2.665974
std         1.620149
min         0.000000
25%         2.000000
50%         3.000000
75%         4.000000
max        10.000000
Name: Full_Time_Total_Goals, dtype: float64