# Regression

Perform a regression on the dataset stored in `FTML/Project/data/regression/`.

In [76]:
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, cross_val_score

In [2]:
!ls '../../data/regression/'

X_test.npy  X_train.npy  y_test.npy  y_train.npy


In [3]:
dataset_folder = Path('../../data/regression/')
dataset_folder.resolve()

PosixPath('/home/gilles/Documents/Cours/EPITA/ING2/Fondamentaux_Theoriques_en_ML/Project/FTML/data/regression')

In [39]:
x_train_file = 'X_train.npy'
y_train_file = 'y_train.npy'

X_train = np.load(dataset_folder.joinpath(x_train_file))
y_train = np.load(dataset_folder.joinpath(y_train_file)).ravel()

print('Inputs_train', X_train)
print('Labels_train', y_train)

Inputs_train [[0.82921188 0.33620837 0.11307146 ... 0.26911057 0.39826737 0.69398368]
 [0.49605241 0.23338204 0.25629519 ... 0.64696802 0.03035201 0.76803069]
 [0.02905734 0.4937862  0.46724899 ... 0.44549826 0.41709255 0.47755807]
 ...
 [0.63059727 0.43887553 0.81276115 ... 0.95515152 0.86576297 0.43993964]
 [0.74742343 0.11276802 0.194641   ... 0.85089992 0.46785343 0.91046313]
 [0.01405356 0.65306567 0.59671311 ... 0.07302376 0.07829235 0.45123912]]
Labels_train [2.03744078 3.86866045 1.82416592 2.50305137 0.63476318 3.64501907
 2.10959603 2.53736946 2.36396361 1.8493183  0.26528054 3.29868999
 0.68686673 1.97761143 2.33217634 2.35332258 2.13428688 2.8661636
 2.60676028 1.84903027 2.39740233 1.84141611 1.66002837 2.32656456
 2.63244469 2.92972212 1.90925142 2.19895187 2.88423105 3.04904381
 2.40316574 3.23375526 0.66999314 2.37406771 1.90287596 3.87498277
 1.81649276 2.48875646 2.67020008 2.25751607 2.82685912 2.89131867
 2.33134054 1.3473857  1.44341091 2.62260362 1.80267415 1.4456

In [63]:
x_test_file = 'X_test.npy'
y_test_file = 'y_test.npy'

X_test = np.load(dataset_folder.joinpath(x_test_file))
y_test = np.load(dataset_folder.joinpath(y_test_file)).ravel()

In [6]:
#X_train, X_val, y_train, y_val = train_test_split(inputs_train, labels_train, test_size=0.2, random_state=42)

In [59]:
def compute(model, param_grid):
    # Perform grid search with cross-validation
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train, y_train)

    # Print the best hyperparameters and score
    print("Best Hyperparameters:", grid_search.best_params_)
    print("Best R2 Score:", grid_search.best_score_)

    # Perform cross-validation with the best model
    cv_scores = cross_val_score(grid_search.best_estimator_, X_test, y_test, cv=5, scoring='r2')
    print("Cross-Validation Scores:", cv_scores)

## Linear Regression model

In [64]:
# Define the hyperparameters to tune
param_grid = {
    'fit_intercept': [True, False]
}

compute(LinearRegression(), param_grid)

Best Hyperparameters: {'fit_intercept': False}
Best R2 Score: 0.15111261266625625
Cross-Validation Scores: [-0.01065281  0.25674478  0.58253859  0.18507886  0.53897275]


## Random Forest Regression model

In [46]:
# Define the hyperparameters to tune
param_grid = {
    'n_estimators': [100, 1000],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

compute(RandomForestRegressor(), param_grid)

Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best R2 Score: 0.4047318422764591
Cross-Validation Scores: [0.40189335 0.45787189 0.43625472 0.45534033 0.06047542]


## SVR

In [65]:
# Define the hyperparameters to tune
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1.0, 10.0],
    'epsilon': [0.01, 0.1, 1.0]
}


compute(SVR(), param_grid)

Best Hyperparameters: {'C': 1.0, 'epsilon': 0.1, 'kernel': 'linear'}
Best R2 Score: 0.5125351012747161
Cross-Validation Scores: [0.24873256 0.5554407  0.76699458 0.59395155 0.49191999]


## SGDRegressor

In [66]:
param_grid = {
    'loss': ['squared_error', 'huber'],
    'penalty': ['l1', 'l2'],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [1000, 2000],
    'epsilon': [0.01, 0.1, 0.2]
}

compute(SGDRegressor(), param_grid)

Best Hyperparameters: {'alpha': 0.01, 'epsilon': 0.2, 'loss': 'squared_error', 'max_iter': 1000, 'penalty': 'l1'}
Best R2 Score: 0.4874629486401371
Cross-Validation Scores: [0.47538059 0.43292451 0.65360788 0.4147293  0.37759269]


## Ridge

In [67]:
param_grid = {
    'alpha': [0.1, 1.0, 10.0],
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

compute(Ridge(), param_grid)

Best Hyperparameters: {'alpha': 1.0, 'solver': 'saga'}
Best R2 Score: 0.5398352936850455
Cross-Validation Scores: [0.28803107 0.55326009 0.76001826 0.60368298 0.47015371]


## Lasso

In [69]:
param_grid = {
    'alpha': [0.1, 1.0, 10.0],
    'max_iter': [1000]
}

compute(Lasso(), param_grid)

Best Hyperparameters: {'alpha': 0.1, 'max_iter': 1000}
Best R2 Score: -0.06389917253307624
Cross-Validation Scores: [ 0.04332234  0.01645156  0.00731001 -0.04941587 -0.11656802]


## ElasticNet

In [71]:
param_grid = {
    'alpha': [0.1, 1.0, 10.0],
    'l1_ratio': [0.1, 0.5, 0.9],
    'max_iter': [1000]
}

compute(ElasticNet(), param_grid)

Best Hyperparameters: {'alpha': 0.1, 'l1_ratio': 0.1, 'max_iter': 1000}
Best R2 Score: 0.5049860226461409
Cross-Validation Scores: [0.48577972 0.49674582 0.58494139 0.34655154 0.42175439]


## DecisionTreeRegressor

In [77]:
param_grid = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

compute(DecisionTreeRegressor(), param_grid)

Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10}
Best R2 Score: -0.08313260077329465
Cross-Validation Scores: [-0.2884576  -0.11766592 -0.37464356 -0.86650934 -0.21570891]


## GradientBoostingRegressor

In [79]:
param_grid = {
    'n_estimators': [100, 200, 500],
    'learning_rate': [0.01, 0.1, 1.0],
    'max_depth': [3, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

compute(GradientBoostingRegressor(), param_grid)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best R2 Score: 0.5903055532257081
Cross-Validation Scores: [0.49831757 0.52183783 0.60774635 0.18598053 0.35309548]
