# Regression task
## Before start

### TODO steps:

1. read csv  

2. write pipeline: split data, train model, predict, evaluate  

3. models: 
    LinearRegression(), 
    RandomForestRegressor(), 
    GradientBoostingRegressor() (xgboost), 
    SVR(), 
    neural network

4. hyperparameter tuning: 
    GridSearchCV, 
    RandomizedSearchCV

5. dont forget about time series cross validation

## Import libraries

In [1]:
%reset

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


## Read data

In [3]:
df_all = pd.read_csv('../data/for_train/train_df.csv')
df_league_0 = pd.read_csv('../data/for_train/df_league_0.csv')
df_league_1 = pd.read_csv('../data/for_train/df_league_1.csv')
df_league_2 = pd.read_csv('../data/for_train/df_league_2.csv')
df_league_3 = pd.read_csv('../data/for_train/df_league_3.csv')


In [4]:
# Create target
df_all['Full_Time_Total_Goals'] = df_all['Full_Time_Home_Team_Goals'] + df_all['Full_Time_Away_Team_Goals']
df_league_0['Full_Time_Total_Goals'] = df_league_0['Full_Time_Home_Team_Goals'] + df_league_0['Full_Time_Away_Team_Goals']
df_league_1['Full_Time_Total_Goals'] = df_league_1['Full_Time_Home_Team_Goals'] + df_league_1['Full_Time_Away_Team_Goals']
df_league_2['Full_Time_Total_Goals'] = df_league_2['Full_Time_Home_Team_Goals'] + df_league_2['Full_Time_Away_Team_Goals']
df_league_3['Full_Time_Total_Goals'] = df_league_3['Full_Time_Home_Team_Goals'] + df_league_3['Full_Time_Away_Team_Goals']

In [5]:
def run_pipeline_2(df, target, model_name, model, param_dist, test_size=0.2, random_state=42):
    # Split data into train and test
    columns_to_drop = ['Full_Time_Home_Team_Goals', 'Full_Time_Away_Team_Goals',
                       'Full_Time_Result_A', 'Full_Time_Result_D', 'Full_Time_Result_H',]

    X = df.drop(columns=[target] + columns_to_drop, axis=1)
    y = df[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Define TimeSeriesSplit
    tscv = TimeSeriesSplit(n_splits=5)

    # Perform random search
    random_search = RandomizedSearchCV(model, param_dist, n_iter=5, cv=tscv, scoring='neg_mean_squared_error', random_state=random_state)
    random_search.fit(X_train, y_train)

    # Evaluate model
    model = random_search.best_estimator_
    y_pred = model.predict(X_test)

    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Save results
    result = {
        'Model': model_name,
        'Best Parameters': random_search.best_params_ if param_dist else "No hyperparameter tuning",
        'MSE': mse,
        'MAE': mae,
        'R2': r2
    }

    return pd.DataFrame([result])


In [6]:


# Define models and hyperparameter distributions
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor()),
    ('Gradient Boosting', GradientBoostingRegressor()),
    ('XGBoost', XGBRegressor())
    # ('Support Vector Regression', SVR())
]

param_dist = {
    'Linear Regression': {'fit_intercept': [True, False]},

    'Random Forest': {'n_estimators': [10, 50, 100],
                      'max_depth': [None, 10, 20],
                      'min_samples_split': [2, 5, 10]},

    'Gradient Boosting': {'n_estimators': [50, 100, 200],
                          'learning_rate': [0.01, 0.1, 0.2],
                          'max_depth': [3, 5, 7],
                          'min_samples_split': [2, 5, 10]},

    'XGBoost': {'n_estimators': [50, 100, 200],
                'learning_rate': [0.01, 0.1, 0.2],
                'max_depth': [3, 5, 7],
                'min_child_weight': [1, 3, 5]}

    # 'Support Vector Regression': {'C': [0.1, 1, 10],
    #                               'kernel': ['linear', 'rbf']}
    # # 'gamma': ['scale', 'auto']}
}


dfs = [
    (df_all, 'df_all'),
    (df_league_0, 'df_league_0'),
    (df_league_1, 'df_league_1'),
    (df_league_2, 'df_league_2'),
    (df_league_3, 'df_league_3')
]

target = 'Full_Time_Total_Goals'

# Run the pipeline for each model

for df, df_name in dfs:
    result_df = pd.DataFrame()
    for model_name, model in tqdm(models, desc=f'Processing dataframe {df_name}'):
        result = run_pipeline_2(df, target, model_name, model, param_dist[model_name], test_size=0.2, random_state=42)
        # display(result)
        result_df = pd.concat([result_df, result], ignore_index=True)
    print('\nDataframe: ', df_name, '\n', sep='')
    display(result_df.sort_values(by='MSE', ascending=True))

# Sort results by MSE
# result_df.sort_values(by='MSE', ascending=True)




Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Linear Regression,{'fit_intercept': False},2.142047,1.153477,0.211434


Processing dataframe df_all:  25%|██▌       | 1/4 [00:10<00:32, 10.97s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Random Forest,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.088445,1.137311,0.231167


Processing dataframe df_all:  50%|█████     | 2/4 [05:42<06:39, 199.59s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Gradient Boosting,"{'n_estimators': 100, 'min_samples_split': 5, ...",2.078279,1.137565,0.234909


Processing dataframe df_all:  75%|███████▌  | 3/4 [09:57<03:44, 224.63s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,XGBoost,"{'n_estimators': 100, 'min_child_weight': 3, '...",2.07988,1.137563,0.23432


Processing dataframe df_all: 100%|██████████| 4/4 [11:32<00:00, 173.07s/it]


Dataframe: df_all





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
2,Gradient Boosting,"{'n_estimators': 100, 'min_samples_split': 5, ...",2.078279,1.137565,0.234909
3,XGBoost,"{'n_estimators': 100, 'min_child_weight': 3, '...",2.07988,1.137563,0.23432
1,Random Forest,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.088445,1.137311,0.231167
0,Linear Regression,{'fit_intercept': False},2.142047,1.153477,0.211434




Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Linear Regression,{'fit_intercept': False},2.074526,1.143488,0.199864


Processing dataframe df_league_0:  25%|██▌       | 1/4 [00:03<00:09,  3.27s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Random Forest,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.110251,1.159142,0.186085


Processing dataframe df_league_0:  50%|█████     | 2/4 [00:11<00:12,  6.44s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Gradient Boosting,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.091349,1.150484,0.193375


Processing dataframe df_league_0:  75%|███████▌  | 3/4 [00:21<00:08,  8.06s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,XGBoost,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",2.097303,1.152187,0.191079


Processing dataframe df_league_0: 100%|██████████| 4/4 [00:32<00:00,  8.13s/it]


Dataframe: df_league_0





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Linear Regression,{'fit_intercept': False},2.074526,1.143488,0.199864
2,Gradient Boosting,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.091349,1.150484,0.193375
3,XGBoost,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",2.097303,1.152187,0.191079
1,Random Forest,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.110251,1.159142,0.186085




Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Linear Regression,{'fit_intercept': False},2.217429,1.174136,0.200144


Processing dataframe df_league_1:  25%|██▌       | 1/4 [00:05<00:17,  5.95s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Random Forest,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.192114,1.168349,0.209275


Processing dataframe df_league_1:  50%|█████     | 2/4 [01:38<01:54, 57.02s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Gradient Boosting,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.167994,1.161574,0.217976


Processing dataframe df_league_1:  75%|███████▌  | 3/4 [03:00<01:08, 68.56s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,XGBoost,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",2.163154,1.16006,0.219722


Processing dataframe df_league_1: 100%|██████████| 4/4 [03:50<00:00, 57.60s/it]


Dataframe: df_league_1





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
3,XGBoost,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",2.163154,1.16006,0.219722
2,Gradient Boosting,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.167994,1.161574,0.217976
1,Random Forest,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.192114,1.168349,0.209275
0,Linear Regression,{'fit_intercept': False},2.217429,1.174136,0.200144




Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Linear Regression,{'fit_intercept': False},2.077191,1.136021,0.182639


Processing dataframe df_league_2:  25%|██▌       | 1/4 [00:05<00:15,  5.11s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Random Forest,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.02309,1.116434,0.203927


Processing dataframe df_league_2:  50%|█████     | 2/4 [00:51<00:58, 29.39s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Gradient Boosting,"{'n_estimators': 50, 'min_samples_split': 5, '...",1.997701,1.111882,0.213918


Processing dataframe df_league_2:  75%|███████▌  | 3/4 [01:35<00:36, 36.06s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,XGBoost,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",1.994353,1.110276,0.215235


Processing dataframe df_league_2: 100%|██████████| 4/4 [02:06<00:00, 31.71s/it]


Dataframe: df_league_2





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
3,XGBoost,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",1.994353,1.110276,0.215235
2,Gradient Boosting,"{'n_estimators': 50, 'min_samples_split': 5, '...",1.997701,1.111882,0.213918
1,Random Forest,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.02309,1.116434,0.203927
0,Linear Regression,{'fit_intercept': False},2.077191,1.136021,0.182639




Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Linear Regression,{'fit_intercept': False},2.307245,1.204637,0.10943


Processing dataframe df_league_3:  25%|██▌       | 1/4 [00:03<00:09,  3.08s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Random Forest,"{'n_estimators': 100, 'min_samples_split': 10,...",2.23134,1.174775,0.138729


Processing dataframe df_league_3:  50%|█████     | 2/4 [00:14<00:15,  7.79s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,Gradient Boosting,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.157566,1.165352,0.167205


Processing dataframe df_league_3:  75%|███████▌  | 3/4 [00:24<00:09,  9.16s/it]

Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
0,XGBoost,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",2.167566,1.167314,0.163345


Processing dataframe df_league_3: 100%|██████████| 4/4 [00:36<00:00,  9.23s/it]


Dataframe: df_league_3





Unnamed: 0,Model,Best Parameters,MSE,MAE,R2
2,Gradient Boosting,"{'n_estimators': 50, 'min_samples_split': 5, '...",2.157566,1.165352,0.167205
3,XGBoost,"{'n_estimators': 50, 'min_child_weight': 3, 'm...",2.167566,1.167314,0.163345
1,Random Forest,"{'n_estimators': 100, 'min_samples_split': 10,...",2.23134,1.174775,0.138729
0,Linear Regression,{'fit_intercept': False},2.307245,1.204637,0.10943


In [7]:
df_all[target].describe()

count    32862.000000
mean         2.645944
std          1.637276
min          0.000000
25%          1.000000
50%          2.000000
75%          4.000000
max         13.000000
Name: Full_Time_Total_Goals, dtype: float64