# Walmart Sales

## Install & import data

In [1]:
# Load in our libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
from datetime import date

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
#pio.renderers.default = "svg" # to be replaced by "iframe" if working on JULIE

import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

# import score & preprocessing
from sklearn.metrics import accuracy_score, f1_score, ConfusionMatrixDisplay, RocCurveDisplay, classification_report, confusion_matrix, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer
# import methods
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVC

In [2]:
data = pd.read_csv('Walmart_Store_sales.csv')

In [3]:
data.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


## EDA

In [4]:
# Basic stats
print("Set with labels (our train+test) : {}".format(data.shape))
print()

print("Display of dataset: ")
display(data.head())
print()

print("Basics statistics: ")
data_desc = data.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*data.isnull().sum()/data.shape[0])


Set with labels (our train+test) : (150, 8)

Display of dataset: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092



Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,19-10-2012,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15



Percentage of missing values: 


Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dtype: float64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         150 non-null    float64
 1   Date          132 non-null    object 
 2   Weekly_Sales  136 non-null    float64
 3   Holiday_Flag  138 non-null    float64
 4   Temperature   132 non-null    float64
 5   Fuel_Price    136 non-null    float64
 6   CPI           138 non-null    float64
 7   Unemployment  135 non-null    float64
dtypes: float64(7), object(1)
memory usage: 9.5+ KB


In [6]:
# Drop lines containing outliers (using masks)

print('Dropping outliers in Target...')
to_keep = -(data['Weekly_Sales'].isnull())
data = data.loc[to_keep,:]
print('Done. Number of lines remaining : ', data.shape[0])
print()

Dropping outliers in Target...
Done. Number of lines remaining :  136



In [8]:
data = data.reset_index().drop(['index'], axis=1)
data.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
3,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092
4,4.0,28-05-2010,1857533.7,0.0,,2.756,126.160226,7.896


In [9]:
data['Date'] = [datetime.datetime.strptime(data['Date'][i], "%d-%m-%Y") if type(data['Date'][i]) != float else data['Date'][i] for i in range(len(data['Date']))]
data['Year'] = [data['Date'][i].year for i in range(len(data['Date']))]
data['Month'] = [data['Date'][i].month for i in range(len(data['Date']))]
data['Day'] = [data['Date'][i].day for i in range(len(data['Date']))]
data['Dayofweek'] = [data['Date'][i].dayofweek for i in range(len(data['Date']))]
data.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Dayofweek
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
2,11.0,NaT,1244390.03,0.0,84.57,,214.556497,7.346,,,,
3,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
4,4.0,2010-05-28,1857533.7,0.0,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


In [10]:
pd.isnull(data['Day'][2])

True

In [11]:
data['Dayofweek'].unique()

array([ 4., nan])

In [12]:
list_featurestocheck = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
list_mean = [data[el].mean() for el in list_featurestocheck]
list_std = [data[el].std() for el in list_featurestocheck]
list_min = [(list_mean[i] - 3 * list_std[i]) for i in range(len(list_featurestocheck))]
list_max = [(list_mean[i] + 3 * list_std[i]) for i in range(len(list_featurestocheck))]

list_std

[18.51443186991069,
 0.47954023536384177,
 40.243104700979906,
 1.6194283750203011]

In [13]:
for el in list_featurestocheck:
    to_keep = ((data[el] > list_min[list_featurestocheck.index(el)]) & (data[el] < list_max[list_featurestocheck.index(el)]))
    data = data.loc[to_keep,:]
data.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Dayofweek
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
3,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
5,15.0,2011-06-03,695396.19,0.0,69.8,4.069,134.855161,7.658,2011.0,6.0,3.0,4.0
6,20.0,2012-02-03,2203523.2,0.0,39.93,3.617,213.023622,6.961,2012.0,2.0,3.0,4.0


In [14]:
data = data.reset_index().drop(['index'], axis=1)
data = data.drop(['Date'], axis=1)
data.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Dayofweek
0,6.0,1572117.54,,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
2,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
3,15.0,695396.19,0.0,69.8,4.069,134.855161,7.658,2011.0,6.0,3.0,4.0
4,20.0,2203523.2,0.0,39.93,3.617,213.023622,6.961,2012.0,2.0,3.0,4.0


In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         90 non-null     float64
 1   Weekly_Sales  90 non-null     float64
 2   Holiday_Flag  80 non-null     float64
 3   Temperature   90 non-null     float64
 4   Fuel_Price    90 non-null     float64
 5   CPI           90 non-null     float64
 6   Unemployment  90 non-null     float64
 7   Year          80 non-null     float64
 8   Month         80 non-null     float64
 9   Day           80 non-null     float64
 10  Dayofweek     80 non-null     float64
dtypes: float64(11)
memory usage: 7.9 KB


In [16]:
data['Store'].value_counts()

3.0     9
18.0    7
7.0     7
13.0    7
1.0     6
19.0    6
5.0     5
4.0     5
6.0     4
14.0    4
20.0    4
8.0     4
10.0    4
2.0     4
9.0     3
17.0    3
16.0    3
15.0    3
11.0    2
Name: Store, dtype: int64

In [17]:
store_CAinMilions = round((data.groupby('Store')['Weekly_Sales'].sum() / data.groupby('Store')['Weekly_Sales'].count()) / 1000000 , 3)
store_CAinMilions = pd.DataFrame(store_CAinMilions)
store_CAinMilions = store_CAinMilions.reset_index()
store_CAinMilions

Unnamed: 0,Store,Weekly_Sales
0,1.0,1.571
1,2.0,1.834
2,3.0,0.409
3,4.0,2.237
4,5.0,0.292
5,6.0,1.589
6,7.0,0.537
7,8.0,0.904
8,9.0,0.506
9,10.0,1.836


#### Visualisations

In [18]:
# Univariate analysis
# Distribution of each numeric variable
num_features = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'Dayofweek']
for f in num_features:
    fig = px.histogram(data, f)
    fig.show()

In [19]:
cat_features = ['Store', 'Holiday_Flag']
for f in cat_features:
    fig = px.histogram(data, f)
    fig.show()

In [20]:
fig = px.histogram(store_CAinMilions, x= "Store", y="Weekly_Sales", nbins = 20)
fig.show()

In [100]:
fig = px.scatter(data, x="Unemployment", y="Weekly_Sales")
fig.show()

In [101]:
fig = px.scatter(data, x="CPI", y="Weekly_Sales")
fig.show()

#### Correlation

In [22]:
# Correlation matrix
corr_matrix = data.corr().round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())


fig.show()

In [23]:
# Visualize pairwise dependencies
fig = px.scatter_matrix(data)
fig.update_layout(
        title = go.layout.Title(text = "Bivariate analysis", x = 0.5), showlegend = False, 
            autosize=False, height=1200, width = 1200)
fig.show()


iteritems is deprecated and will be removed in a future version. Use .items instead.



#### Tableau des scores des différents modèles

In [24]:
scores_df = pd.DataFrame(columns = ['model', 'r2_score_train', 'r2_score_test', 'std_dev', 'overfitting'])
scores_df

Unnamed: 0,model,r2_score_train,r2_score_test,std_dev,overfitting


## Train a first Linear Regression Model with the most correlated feature

#### Preprocessing

In [25]:
# Separate target variable Y from features X
print("Separating labels from features...")
feature_variable = 'CPI'
target_variable = 'Weekly_Sales'

X = data.loc[:,feature_variable]
Y = data.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    1572117.54
1    1807545.43
2    1644470.66
3     695396.19
4    2203523.20
Name: Weekly_Sales, dtype: float64

X :
0    214.777523
1    128.616064
2    212.412888
3    134.855161
4    213.023622
Name: CPI, dtype: float64


In [26]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [27]:
print("Preprocessing X_train...")
print(X_train.head())
print()
scaler = StandardScaler()
X_train= scaler.fit_transform(X_train.values.reshape(-1, 1))
print("...Done!")
print(X_train[0:5,:]) # X_train is now a numpy array
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = scaler.transform(X_test.values.reshape(-1, 1)) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()


Preprocessing X_train...
77    189.523128
42    224.019287
22    133.958742
6     219.070197
61    210.337426
Name: CPI, dtype: float64

...Done!
[[ 0.20507788]
 [ 1.09260385]
 [-1.22449677]
 [ 0.96527239]
 [ 0.74059345]]

Performing preprocessings on test set...
2     212.412888
13    189.704822
53    223.917015
41    130.645793
66    225.306861
Name: CPI, dtype: float64
...Done.
[[ 0.79399146]
 [ 0.20975255]
 [ 1.08997257]
 [-1.30973316]
 [ 1.12573089]]



#### Train Model

In [28]:
# Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


#### Performance assessment

In [29]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[1212977.26674094 1034153.82841455 1501015.54653146 1059809.24593045
 1105078.75118178 1098703.22176547 1517254.25485388 1491087.94306829
 1081538.14534531 1479103.51567746 1526137.81077931 1496368.62928953
 1541550.99687886 1168541.58567243 1187801.77508464 1515791.68019155
 1482220.69091698 1064213.91389888 1075484.01193645 1082689.61708273
 1101307.68630896 1091153.87141413 1475340.07989891 1479894.10538859
 1211321.76935508 1524034.02569933 1033569.43782387 1525939.40248973
 1082871.11042049 1082061.88730627 1516564.63335684 1100388.6818916
 1097008.26946395 1098218.05398149 1056329.3452822  1078641.76854551
 1541056.45614839 1508464.11373269 1230515.04336304 1088826.89087522
 1170471.58996914 1528711.26544178 1078085.37082636 1046470.70277645
 1072954.67611307 1171166.92201197 1195828.05916863 1053558.81510187
 1134989.03177731 1018863.72334244 1249682.70026483 1134596.71271951
 1539358.74032839 1070464.69463836 1080822.29400926 1062402.3951

In [30]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[1094319.83667358 1212035.39007398 1034683.99266598 1518189.42619133
 1027479.2174565  1096838.6114086  1507985.50405645 1522065.55038456
 1525676.02896595 1480465.02155872 1507212.27199922 1480179.56373285
 1172075.54314754 1082018.84671113 1195855.26409267 1511878.92110527
 1526377.30534689 1027850.96227211]



In [31]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.09711365927096016
R2 score on test set :  0.2390451729318681


In [32]:
# Perform 3-fold cross-validation to evaluate the generalized R2 score
print("3-fold cross-validation...")
scores = cross_val_score(regressor, X_train, Y_train, cv=5)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

3-fold cross-validation...
The cross-validated R2-score is :  0.008905345529459385
The standard deviation is :  0.022575733150278193


In [33]:
if (r2_score(Y_train, Y_train_pred) > r2_score(Y_test, Y_test_pred) and 
    (r2_score(Y_train, Y_train_pred) - r2_score(Y_test, Y_test_pred)) > scores.std()):
    overfit = 'Yes'
else:
    overfit = 'No'

scores_df = scores_df.append({'model' : 'lr_1_feature', 
                            'r2_score_train' : r2_score(Y_train, Y_train_pred), 
                            'r2_score_test' : r2_score(Y_test, Y_test_pred), 
                            'std_dev' : scores.std(),
                            'overfitting' : overfit}, 
                            ignore_index = True)
scores_df


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,model,r2_score_train,r2_score_test,std_dev,overfitting
0,lr_1_feature,0.097114,0.239045,0.022576,No


In [34]:
# Visualize the model
# Visualize predictions on training Set
fig = px.scatter(x = X_train.flatten().tolist(), y = Y_train, title = "training set")
fig.add_trace(go.Scatter(x = X_train.flatten().tolist(), y = Y_train_pred, name = "linear regression"))
fig.show()

# Visualize predictions on test Set
fig = px.scatter(x = X_test.flatten().tolist(), y = Y_test, title = "test set")
fig.add_trace(go.Scatter(x = X_test.flatten().tolist(), y = Y_test_pred, name = "linear regression"))
fig.show()

## Train a second Linear Regression Model with all features

#### Preprocessing

In [35]:
# Separate target variable Y from features X
print("Separating labels from features...")
target_variable = 'Weekly_Sales'

X = data.drop(target_variable, axis = 1)
Y = data.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    1572117.54
1    1807545.43
2    1644470.66
3     695396.19
4    2203523.20
Name: Weekly_Sales, dtype: float64

X :
   Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
0    6.0           NaN        59.61       3.045  214.777523         6.858   
1   13.0           0.0        42.38       3.435  128.616064         7.470   
2    6.0           0.0        78.89       2.759  212.412888         7.092   
3   15.0           0.0        69.80       4.069  134.855161         7.658   
4   20.0           0.0        39.93       3.617  213.023622         6.961   

     Year  Month   Day  Dayofweek  
0  2011.0    2.0  18.0        4.0  
1  2011.0    3.0  25.0        4.0  
2  2010.0    5.0  28.0        4.0  
3  2011.0    6.0   3.0        4.0  
4  2012.0    2.0   3.0        4.0  


In [36]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [37]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ])

In [38]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()


Performing preprocessings on train set...
    Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
77   16.0           0.0        61.79       2.711  189.523128         6.868   
42    5.0           0.0        69.17       3.594  224.019287         5.422   
22   19.0           0.0        33.26       3.789  133.958742         7.771   
6     8.0           0.0        82.92       3.554  219.070197         6.425   
61    1.0           0.0        74.78       2.854  210.337426         7.808   

      Year  Month   Day  Dayofweek  
77  2010.0    7.0   9.0        4.0  
42  2012.0   10.0  19.0        4.0  
22  2011.0    3.0  25.0        4.0  
6   2011.0    8.0  19.0        4.0  
61  2010.0    5.0  14.0        4.0  
...Done.
[[ 0.04260362 -1.26840641  0.20507788 -0.55534542 -1.1763434   0.147002
  -0.86859506  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          1.          0. 

#### Train model

In [39]:
# Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


#### Performance assessment

In [40]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[ 611364.67099396  370577.26212486 1275740.37137492  879179.76718068
 1536772.70829879 1514868.79536837 1965323.8723865   602145.54012832
  948687.87405245 1089144.04045663 2125262.41163193  650336.60787243
 2145312.0623884   610712.16639662  517258.85415893  778674.43751482
  621000.71199925 1637887.71082181  166083.77933535  532890.97130511
 1846150.02967255 2113342.41663076 1117874.96097089 1449549.93545643
 2064847.33029364 1946434.88789985  420203.37186409 2018205.31305823
  911972.28740893 1619671.09250448 2039633.27775499 1566247.21290487
 1544871.47814238 1918280.17346583  329688.5413809   513754.34016273
  930146.16563808 1520404.73250487 2020147.89985994 2062163.07944381
  523043.15752524 1942173.83959015 1592843.57093178  425386.35441876
  245875.50172863  503128.68941671  438285.84900604 1792986.20265494
 1965095.38313836  420314.62383058 2068359.18918786 1881633.53078707
  798003.7668471  1545014.05935314  471641.00853216  408800.486

In [41]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[1569238.89010088  687168.56801076 1896822.31339263 1788054.17144371
  393226.322252    222617.46706409 1220860.04363255 2170262.17112501
 2002295.45281567 1330643.0821048  1180897.01628345  972430.5482631
  438299.48822375  352427.85268942  532613.94359759 1356749.96165428
 2016167.58086591  419838.00337229]



In [42]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9868321417045137
R2 score on test set :  0.9352216314000095


In [43]:
# Perform 3-fold cross-validation to evaluate the generalized R2 score
print("3-fold cross-validation...")
scores = cross_val_score(regressor, X_train, Y_train, cv=5)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

3-fold cross-validation...
The cross-validated R2-score is :  0.9541101472556666
The standard deviation is :  0.03078376582468145


In [44]:
if (r2_score(Y_train, Y_train_pred) > r2_score(Y_test, Y_test_pred) and 
    (r2_score(Y_train, Y_train_pred) - r2_score(Y_test, Y_test_pred)) > scores.std()):
    overfit = 'Yes'
else:
    overfit = 'No'

scores_df = scores_df.append({'model' : 'lr_all_features', 
                            'r2_score_train' : r2_score(Y_train, Y_train_pred), 
                            'r2_score_test' : r2_score(Y_test, Y_test_pred), 
                            'std_dev' : scores.std(),
                            'overfitting' : overfit}, 
                            ignore_index = True)
scores_df


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,model,r2_score_train,r2_score_test,std_dev,overfitting
0,lr_1_feature,0.097114,0.239045,0.022576,No
1,lr_all_features,0.986832,0.935222,0.030784,Yes


### Interpreting the model's coefficients

In [45]:
regressor.coef_

array([-1.14627040e+04, -5.79848294e+04,  7.17469905e+05,  3.24784968e+04,
       -6.89502249e+03,  1.72431945e+04, -4.95926514e+04,  1.21071935e-08,
        2.71338356e+05, -1.25098749e+06,  2.20417284e+06, -1.22743289e+06,
        1.00272537e+05, -6.19830015e+05, -6.56196703e+05, -1.10251725e+06,
        1.79845728e+06,  2.19379595e+05,  2.06681366e+06,  1.01734744e+06,
        5.89257732e+05, -5.77586044e+05,  8.56056660e+05,  9.87464428e+05,
        1.32821080e+06,  5.92403256e+05, -5.35303229e+04])

In [46]:
column_names = []
for name, pipeline, features_list in preprocessor.transformers_: # loop over pipelines
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = pipeline.named_steps['encoder'].get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names
        
print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'Dayofweek', 'x0_2.0', 'x0_3.0', 'x0_4.0', 'x0_5.0', 'x0_6.0', 'x0_7.0', 'x0_8.0', 'x0_9.0', 'x0_10.0', 'x0_11.0', 'x0_13.0', 'x0_14.0', 'x0_15.0', 'x0_16.0', 'x0_17.0', 'x0_18.0', 'x0_19.0', 'x0_20.0', 'x1_1.0']


In [47]:
# Create a pandas DataFrame
coefs = pd.DataFrame(index = column_names, data = regressor.coef_.transpose(), columns=["coefficients"])
coefs

Unnamed: 0,coefficients
Temperature,-11462.7
Fuel_Price,-57984.83
CPI,717469.9
Unemployment,32478.5
Year,-6895.022
Month,17243.19
Day,-49592.65
Dayofweek,1.210719e-08
x0_2.0,271338.4
x0_3.0,-1250987.0


In [48]:
coefs.index

Index(['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month',
       'Day', 'Dayofweek', 'x0_2.0', 'x0_3.0', 'x0_4.0', 'x0_5.0', 'x0_6.0',
       'x0_7.0', 'x0_8.0', 'x0_9.0', 'x0_10.0', 'x0_11.0', 'x0_13.0',
       'x0_14.0', 'x0_15.0', 'x0_16.0', 'x0_17.0', 'x0_18.0', 'x0_19.0',
       'x0_20.0', 'x1_1.0'],
      dtype='object')

In [49]:
column_names_bis = [sub.replace('x0', 'Store') for sub in column_names]
column_names_bis = [sub.replace('x1', 'Holiday_Flag') for sub in column_names_bis]
column_names_bis

['Temperature',
 'Fuel_Price',
 'CPI',
 'Unemployment',
 'Year',
 'Month',
 'Day',
 'Dayofweek',
 'Store_2.0',
 'Store_3.0',
 'Store_4.0',
 'Store_5.0',
 'Store_6.0',
 'Store_7.0',
 'Store_8.0',
 'Store_9.0',
 'Store_10.0',
 'Store_11.0',
 'Store_13.0',
 'Store_14.0',
 'Store_15.0',
 'Store_16.0',
 'Store_17.0',
 'Store_18.0',
 'Store_19.0',
 'Store_20.0',
 'Holiday_Flag_1.0']

In [50]:
coefs_bis = coefs.T
coefs_bis.columns = column_names_bis
coefs_bis = coefs_bis.T

In [51]:
# Compute abs() and sort values
feature_importance = abs(coefs_bis).sort_values(by = 'coefficients')
feature_importance

Unnamed: 0,coefficients
Dayofweek,1.210719e-08
Year,6895.022
Temperature,11462.7
Month,17243.19
Unemployment,32478.5
Day,49592.65
Holiday_Flag_1.0,53530.32
Fuel_Price,57984.83
Store_6.0,100272.5
Store_11.0,219379.6


In [52]:
# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h', height= 700)
fig.update_layout(showlegend = False, 
                margin = {'l': 120} # to avoid cropping of column names
                )
fig.show()

In [53]:
df_X_train = pd.DataFrame(X_train, columns=coefs_bis.index)
df_X_train.head()

Unnamed: 0,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Dayofweek,Store_2.0,Store_3.0,...,Store_11.0,Store_13.0,Store_14.0,Store_15.0,Store_16.0,Store_17.0,Store_18.0,Store_19.0,Store_20.0,Holiday_Flag_1.0
0,0.042604,-1.268406,0.205078,-0.555345,-1.176343,0.147002,-0.868595,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.459277,0.580588,1.092604,-2.127344,1.470429,1.171274,0.42353,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.568194,0.988917,-1.224497,0.426339,0.147043,-1.218694,1.198805,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.235599,0.496829,0.965272,-1.036947,0.147043,0.488426,0.42353,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.776016,-0.968966,0.740593,0.466563,-1.176343,-0.535846,-0.222533,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
# Correlation matrix
corr_matrix = df_X_train.corr().round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                x = corr_matrix.columns.tolist(),
                                y = corr_matrix.index.tolist()
                                )


fig.show()

## Train Linear Regression model with feature Selector

#### Feature selection by the selector

In [55]:
selector_df = pd.DataFrame(X_train, columns=coefs.index)
selector_df

Unnamed: 0,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Dayofweek,x0_2.0,x0_3.0,...,x0_11.0,x0_13.0,x0_14.0,x0_15.0,x0_16.0,x0_17.0,x0_18.0,x0_19.0,x0_20.0,x1_1.0
0,0.042604,-1.268406,0.205078,-0.555345,-1.176343,0.147002,-0.868595,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.459277,0.580588,1.092604,-2.127344,1.470429,1.171274,0.423530,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.568194,0.988917,-1.224497,0.426339,0.147043,-1.218694,1.198805,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.235599,0.496829,0.965272,-1.036947,0.147043,0.488426,0.423530,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.776016,-0.968966,0.740593,0.466563,-1.176343,-0.535846,-0.222533,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0.987741,-1.280970,0.847582,-0.038956,-1.176343,-0.194422,-1.514658,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
68,0.665920,-0.805634,-1.259496,0.782919,-1.176343,0.147002,1.844867,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
69,0.492024,1.043360,-1.186212,1.642844,0.147043,0.147002,0.035892,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
70,-1.369456,-1.046443,-1.408849,0.452430,-1.176343,1.854122,-0.739383,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# sklearn
from sklearn.feature_selection import SequentialFeatureSelector
feature_selector =  SequentialFeatureSelector(regressor, n_features_to_select = 10)

feature_selector.fit(selector_df, Y_train)
features_list = selector_df.columns
best_features = features_list[feature_selector.support_]
print("According to the forward selection algorithm, the following features should be kept: ")
print(best_features.to_list())


According to the forward selection algorithm, the following features should be kept: 
['x0_3.0', 'x0_5.0', 'x0_7.0', 'x0_8.0', 'x0_9.0', 'x0_13.0', 'x0_15.0', 'x0_16.0', 'x0_17.0', 'x0_18.0']


In [57]:
X_bis = preprocessor.transform(X)
X_bis = pd.DataFrame(X_bis, columns=coefs.index)
X_bis

Unnamed: 0,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Dayofweek,x0_2.0,x0_3.0,...,x0_11.0,x0_13.0,x0_14.0,x0_15.0,x0_16.0,x0_17.0,x0_18.0,x0_19.0,x0_20.0,x1_1.0
0,-0.080479,-0.569013,0.854829,-0.566217,0.147043,-1.560118,0.294317,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-1.053281,0.247644,-1.361955,0.099111,0.147043,-1.218694,1.198805,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.008066,-1.167895,0.793991,-0.311827,-1.176343,-0.535846,1.586442,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.494847,1.575234,-1.201433,0.303492,0.147043,-0.194422,-1.643870,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-1.191607,0.628750,0.809705,-0.454242,1.470429,-1.560118,-1.643870,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,-0.589181,1.020327,0.412589,0.773135,1.470429,-0.535846,1.198805,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,0.987741,-1.280970,0.847582,-0.038956,-1.176343,-0.194422,-1.514658,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,0.700360,0.580588,1.168491,-1.462017,1.470429,1.171274,0.423530,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,0.654063,-1.123921,0.022904,1.652628,-1.176343,-0.194422,0.294317,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
X_best = X_bis.loc[:, best_features]
X_best

Unnamed: 0,x0_3.0,x0_5.0,x0_7.0,x0_8.0,x0_9.0,x0_13.0,x0_15.0,x0_16.0,x0_17.0,x0_18.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
85,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
87,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Preprocessing

In [59]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X_best, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

# Preprocessing
print("Preprocessing X_train...")
print(X_train.head())
print()
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
print("...Done!")
print(X_train[0:5,:]) # X_train is now a numpy array

print("Preprocessing X_test...")
print(X_test.head())
print()
X_test = scaler.transform(X_test) # don't fit again !
print("...Done!")
print(X_test[0:5,:]) # X_train is now a numpy array

Dividing into train and test sets...
...Done.

Preprocessing X_train...
    x0_3.0  x0_5.0  x0_7.0  x0_8.0  x0_9.0  x0_13.0  x0_15.0  x0_16.0  \
77     0.0     0.0     0.0     0.0     0.0      0.0      0.0      1.0   
42     0.0     1.0     0.0     0.0     0.0      0.0      0.0      0.0   
22     0.0     0.0     0.0     0.0     0.0      0.0      0.0      0.0   
6      0.0     0.0     0.0     1.0     0.0      0.0      0.0      0.0   
61     0.0     0.0     0.0     0.0     0.0      0.0      0.0      0.0   

    x0_17.0  x0_18.0  
77      0.0      0.0  
42      0.0      0.0  
22      0.0      0.0  
6       0.0      0.0  
61      0.0      0.0  

...Done!
[[-0.30151134 -0.24253563 -0.27317918 -0.24253563 -0.20851441 -0.30151134
  -0.20851441  5.91607978 -0.20851441 -0.24253563]
 [-0.30151134  4.12310563 -0.27317918 -0.24253563 -0.20851441 -0.30151134
  -0.20851441 -0.16903085 -0.20851441 -0.24253563]
 [-0.30151134 -0.24253563 -0.27317918 -0.24253563 -0.20851441 -0.30151134
  -0.20851441 -0.

#### Train model

In [60]:
# Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")


Train model...
...Done.


#### Performance assessment

In [61]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on training set...
...Done.
[ 524825.26        297834.2575     1771767.7453125   903848.1325
 1771767.7453125  1771767.7453125  2032628.91        624494.28666667
  903848.1325     1103643.         1771767.7453125   624494.28666667
 2032628.91        556282.324       556282.324       835608.12333333
  624494.28666667 1771767.7453125   297834.2575      506095.44
 1771767.7453125  1771767.7453125  1103643.         1771767.7453125
 1771767.7453125  2032628.91        411019.58       1771767.7453125
  903848.1325     1771767.7453125  2032628.91       1771767.7453125
 1771767.7453125  1771767.7453125   411019.58        506095.44
  835608.12333333 1771767.7453125  1771767.7453125  1771767.7453125
  556282.324      2032628.91       1771767.7453125   411019.58
  297834.2575      556282.324       524825.26       1771767.7453125
 1771767.7453125   411019.58       1771767.7453125  1771767.7453125
  835608.12333333 1771767.7453125   506095.44        297834.2575
 1771767.7453125   411019.

In [62]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9172448273076149
R2 score on test set :  0.8232008201840104


In [63]:
# Perform 3-fold cross-validation to evaluate the generalized R2 score
print("3-fold cross-validation...")
scores = cross_val_score(regressor, X_train, Y_train, cv=5)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

3-fold cross-validation...
The cross-validated R2-score is :  0.894039967228526
The standard deviation is :  0.039904841547007866


In [64]:
if (r2_score(Y_train, Y_train_pred) > r2_score(Y_test, Y_test_pred) and 
    (r2_score(Y_train, Y_train_pred) - r2_score(Y_test, Y_test_pred)) > scores.std()):
    overfit = 'Yes'
else:
    overfit = 'No'

scores_df = scores_df.append({'model' : 'lr_feature_selector', 
                            'r2_score_train' : r2_score(Y_train, Y_train_pred), 
                            'r2_score_test' : r2_score(Y_test, Y_test_pred), 
                            'std_dev' : scores.std(),
                            'overfitting' : overfit}, 
                            ignore_index = True)
scores_df


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,model,r2_score_train,r2_score_test,std_dev,overfitting
0,lr_1_feature,0.097114,0.239045,0.022576,No
1,lr_all_features,0.986832,0.935222,0.030784,Yes
2,lr_feature_selector,0.917245,0.823201,0.039905,Yes


## Train Ridge model with GridSearch and all features

#### Preprocessing

In [65]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
# WARNING : don't forget stratify=Y for classification problems
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [66]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features)
    ])

In [67]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()


Performing preprocessings on train set...
    Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
77   16.0           0.0        61.79       2.711  189.523128         6.868   
42    5.0           0.0        69.17       3.594  224.019287         5.422   
22   19.0           0.0        33.26       3.789  133.958742         7.771   
6     8.0           0.0        82.92       3.554  219.070197         6.425   
61    1.0           0.0        74.78       2.854  210.337426         7.808   

      Year  Month   Day  Dayofweek  
77  2010.0    7.0   9.0        4.0  
42  2012.0   10.0  19.0        4.0  
22  2011.0    3.0  25.0        4.0  
6   2011.0    8.0  19.0        4.0  
61  2010.0    5.0  14.0        4.0  
...Done.
[[ 0.04260362 -1.26840641  0.20507788 -0.55534542 -1.1763434   0.147002
  -0.86859506  0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          1.          0. 

#### Train model and hyperparameter search with GridSearch

In [68]:
# Perform grid search
print("Grid search...")
regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]
}
best_ridge = GridSearchCV(regressor, param_grid = params, cv = 5, scoring='r2') # cv : the number of folds to be used for CV
best_ridge.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", best_ridge.best_params_)
print("Best R2 score : ", best_ridge.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 0.0005}
Best R2 score :  0.9542831154931168


In [69]:
# Perform grid search
print("Grid search...")
regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009, 0.001]
}
best_ridge = GridSearchCV(regressor, param_grid = params, cv = 5, scoring='r2') # cv : the number of folds to be used for CV
best_ridge.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", best_ridge.best_params_)
print("Best R2 score : ", best_ridge.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 0.0004}
Best R2 score :  0.9542849898036847


#### Performance assessment

In [70]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = best_ridge.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = best_ridge.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on training set...
...Done.
[ 609780.87983325  365164.06134093 1279234.29431008  879981.76680569
 1541883.88262668 1517266.51361882 1968418.31429959  600396.78309823
  950027.28324619 1081428.43298632 2125952.34673721  651660.28951422
 2140365.73278653  610701.25914379  518091.65353468  786013.70956477
  621278.7191007  1635030.18750702  170108.61814578  533275.18442635
 1852054.69493968 2112153.60382459 1119828.93135876 1451686.81523954
 2062472.75657241 1946845.36703668  412623.8926297  2019311.90935649
  909374.90857749 1620848.60116643 2045098.06773461 1564850.12693458
 1545051.89025319 1918380.81368706  333028.77237622  514999.30856334
  928017.83129842 1517648.92676215 2021307.89496121 2054590.03328718
  523858.85830787 1940273.91789114 1596185.85627084  425152.84541357
  248615.25406221  504154.4344815   440126.92159348 1786355.0483081
 1967118.4240385   416356.00577916 2066822.81768108 1888142.96658018
  792546.25520217 1543616.52722684  470451.17206342  407940.0424

In [71]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9868053871595848
R2 score on test set :  0.9362616305225571


In [72]:
if (r2_score(Y_train, Y_train_pred) > r2_score(Y_test, Y_test_pred) and 
    (r2_score(Y_train, Y_train_pred) - r2_score(Y_test, Y_test_pred)) > (best_ridge.cv_results_['std_test_score'][best_ridge.best_index_])):
    overfit = 'Yes'
else:
    overfit = 'No'

scores_df = scores_df.append({'model' : 'ridge_all_features', 
                            'r2_score_train' : r2_score(Y_train, Y_train_pred), 
                            'r2_score_test' : r2_score(Y_test, Y_test_pred), 
                            'std_dev' : best_ridge.cv_results_['std_test_score'][best_ridge.best_index_],
                            'overfitting' : overfit}, 
                            ignore_index = True)
scores_df


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,model,r2_score_train,r2_score_test,std_dev,overfitting
0,lr_1_feature,0.097114,0.239045,0.022576,No
1,lr_all_features,0.986832,0.935222,0.030784,Yes
2,lr_feature_selector,0.917245,0.823201,0.039905,Yes
3,ridge_all_features,0.986805,0.936262,0.031577,Yes


#### Coefficients interpretation

In [73]:
data_dict = {
    'Feature': column_names_bis,
    'Best_Ridge': best_ridge.best_estimator_.coef_
            }

coefficients = pd.DataFrame(data = data_dict)
coefficients.head()

Unnamed: 0,Feature,Best_Ridge
0,Temperature,-11811.147836
1,Fuel_Price,-56312.363235
2,CPI,612296.892157
3,Unemployment,30804.852524
4,Year,-611.250661


In [74]:
coef_plt = coefficients
coef_plt['Best_Ridge'] = abs(coefficients['Best_Ridge'])
coef_plt = coef_plt.sort_values(by= 'Best_Ridge')
coef_plt.head()

Unnamed: 0,Feature,Best_Ridge
7,Dayofweek,0.0
4,Year,611.250661
0,Temperature,11811.147836
5,Month,18871.070508
3,Unemployment,30804.852524


In [75]:
# Plot coefficients
fig = px.bar(coef_plt, x='Best_Ridge', y='Feature', orientation = 'h', height= 700)
fig.update_layout(showlegend = False, 
                margin = {'l': 120} # to avoid cropping of column names
                )
fig.show()

## Train Lasso model with GridSearch and all features

#### Preprocessing

In [76]:
# Same set than GridSearch for Ridge Model

#### Train model and hyperparameter search with GridSearch

In [77]:
# Perform grid search
print("Grid search...")
regressor = Lasso()
# Grid of values to be tested
params = {
    'alpha': [1, 2, 3, 5, 10, 20, 30, 40, 50, 60, 80, 100]
}
best_lasso = GridSearchCV(regressor, param_grid = params, cv = 5, scoring='r2') # cv : the number of folds to be used for CV
best_lasso.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", best_lasso.best_params_)
print("Best R2 score : ", best_lasso.best_score_)

Grid search...



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.594e+11, tolerance: 2.488e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.404e+11, tolerance: 2.426e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.670e+11, tolerance: 2.430e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.372e+11, tolerance: 2.225e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.306e+10, tolerance: 2.429e+09


Obje

...Done.
Best hyperparameters :  {'alpha': 40}
Best R2 score :  0.953225497718098



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 2.924e+10, tolerance: 2.429e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.001e+11, tolerance: 2.426e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.044e+11, tolerance: 2.430e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.226e+09, tolerance: 2.429e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.393e+11, tolerance: 3.010e+09



In [78]:
# Perform grid search
print("Grid search...")
regressor = Lasso()
# Grid of values to be tested
params = {
    'alpha': [30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50]
}
best_lasso = GridSearchCV(regressor, param_grid = params, cv = 5, scoring='r2') # cv : the number of folds to be used for CV
best_lasso.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", best_lasso.best_params_)
print("Best R2 score : ", best_lasso.best_score_)

Grid search...



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.455e+11, tolerance: 2.488e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.265e+11, tolerance: 2.426e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.441e+11, tolerance: 2.430e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.011e+11, tolerance: 2.225e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.716e+10, tolerance: 2.429e+09



...Done.
Best hyperparameters :  {'alpha': 42}
Best R2 score :  0.9532267180939652



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.436e+11, tolerance: 2.488e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.244e+11, tolerance: 2.426e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.411e+11, tolerance: 2.430e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 9.962e+10, tolerance: 2.225e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 7.568e+10, tolerance: 2.429e+09


Obje

#### Performance assessment

In [79]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = best_lasso.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = best_lasso.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on training set...
...Done.
[ 605407.41632791  346518.41562656 1290267.93618546  883653.54907979
 1558238.50921416 1525272.63403679 1978259.39069671  595931.59232268
  954026.52002718 1055357.4298555  2127986.6995597   656352.56241818
 2124720.71009825  611812.24934628  520723.40119433  811430.92828432
  624109.12129334 1625472.1461621   184942.24856806  534812.4481845
 1870889.61539882 2107073.15974433 1125437.00779506 1459325.60814222
 2054296.28379476 1948417.13111498  387128.04323422 2023273.10419154
  901744.53494272 1623559.68990046 2061984.97225239 1561684.53028208
 1544552.28749322 1917284.05852127  345899.75629649  519037.3777263
  922389.69229843 1507260.89057817 2025027.9830843  2027566.10153323
  525899.31998451 1935051.60913843 1607781.35984765  424732.36605622
  258170.57867633  507378.01545111  447195.102633   1763688.98063914
 1974079.1627762   402817.51488666 2061168.21831198 1910493.02571902
  775950.11144322 1538163.60592403  467214.70911059  404386.78250

In [80]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9863289458263613
R2 score on test set :  0.9393415336713338


In [81]:
if (r2_score(Y_train, Y_train_pred) > r2_score(Y_test, Y_test_pred) and 
    (r2_score(Y_train, Y_train_pred) - r2_score(Y_test, Y_test_pred)) > (best_lasso.cv_results_['std_test_score'][best_lasso.best_index_])):
    overfit = 'Yes'
else:
    overfit = 'No'

scores_df = scores_df.append({'model' : 'lasso_all_features', 
                            'r2_score_train' : r2_score(Y_train, Y_train_pred), 
                            'r2_score_test' : r2_score(Y_test, Y_test_pred), 
                            'std_dev' : best_lasso.cv_results_['std_test_score'][best_lasso.best_index_],
                            'overfitting' : overfit}, 
                            ignore_index = True)
scores_df


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,model,r2_score_train,r2_score_test,std_dev,overfitting
0,lr_1_feature,0.097114,0.239045,0.022576,No
1,lr_all_features,0.986832,0.935222,0.030784,Yes
2,lr_feature_selector,0.917245,0.823201,0.039905,Yes
3,ridge_all_features,0.986805,0.936262,0.031577,Yes
4,lasso_all_features,0.986329,0.939342,0.033395,Yes


#### Coefficients interpretation

In [82]:
data_dict = {
    'Feature': column_names_bis,
    'Best_Ridge': best_ridge.best_estimator_.coef_,
    'Best_Lasso': best_lasso.best_estimator_.coef_
            }

coefficients = pd.DataFrame(data = data_dict)
coefficients.head()

Unnamed: 0,Feature,Best_Ridge,Best_Lasso
0,Temperature,-11811.147836,-12484.183015
1,Fuel_Price,-56312.363235,-50793.032913
2,CPI,612296.892157,262601.201847
3,Unemployment,30804.852524,26873.889023
4,Year,-611.250661,21039.277816


In [83]:
coef_plt = coefficients
coef_plt['Best_Lasso'] = abs(coefficients['Best_Lasso'])
coef_plt['Best_Ridge'] = abs(coefficients['Best_Ridge'])
coef_plt = coef_plt.sort_values(by= 'Best_Lasso')
coef_plt.head()

Unnamed: 0,Feature,Best_Ridge,Best_Lasso
7,Dayofweek,0.0,0.0
0,Temperature,11811.147836,12484.183015
4,Year,611.250661,21039.277816
5,Month,18871.070508,24131.56856
3,Unemployment,30804.852524,26873.889023


In [84]:
# Plot coefficients
fig = px.bar(coef_plt, x='Best_Lasso', y='Feature', orientation = 'h', height= 700)
fig.update_layout(showlegend = False, 
                margin = {'l': 120} # to avoid cropping of column names
                )
fig.show()

In [85]:
fig = px.line(coef_plt, x = 'Feature', y = ['Best_Ridge', 'Best_Lasso'])
fig.show()

## Train Decision Tree model with GridSearch

#### Preprocessing

In [86]:
# Same set than GridSearch for Lasso Model

#### Train model and hyperparameter search with GridSearch

In [87]:
# Perform grid search
print("Grid search...")
des_tree = DecisionTreeRegressor()

# Grid of values to be tested
params = {
    'max_depth': [1, 5, 10, 20, 50, 100],
    'min_samples_leaf': [2, 5, 10, 20],
    'min_samples_split': [2, 5, 10, 20]
}
gs_des_tree = GridSearchCV(des_tree, param_grid = params, cv = 5, scoring='r2', verbose = 1)
gs_des_tree.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gs_des_tree.best_params_)
print("Best validation accuracy : ", gs_des_tree.best_score_)


Grid search...
Fitting 5 folds for each of 96 candidates, totalling 480 fits
...Done.
Best hyperparameters :  {'max_depth': 50, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best validation accuracy :  0.26950698622633096


In [88]:
# Perform grid search
print("Grid search...")
des_tree = DecisionTreeRegressor()

# Grid of values to be tested
params = {
    'max_depth': [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [3, 4, 5, 6, 7, 8, 9]
}
gs_des_tree = GridSearchCV(des_tree, param_grid = params, cv = 5, scoring='r2', verbose = 1)
gs_des_tree.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gs_des_tree.best_params_)
print("Best validation accuracy : ", gs_des_tree.best_score_)


Grid search...
Fitting 5 folds for each of 273 candidates, totalling 1365 fits


...Done.
Best hyperparameters :  {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 8}
Best validation accuracy :  0.2985733743903642


#### Performance assessment

In [89]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gs_des_tree.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gs_des_tree.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on training set...
...Done.
[1235117.89666667  339129.26333333 1396746.755       903848.1325
 1678065.805      1235117.89666667 2040946.815       624494.28666667
  903848.1325     1103643.         2040946.815       624494.28666667
 1837546.13666667  556282.324       556282.324       574712.37
  624494.28666667 1678065.805       339129.26333333  574712.37
 1678065.805      2016366.18       1103643.         1396746.755
 2015065.688      1837546.13666667  339129.26333333 2040946.815
  903848.1325     1678065.805      2040946.815      1235117.89666667
 1678065.805      1678065.805       405669.7325      574712.37
  793473.015      1396746.755      2015065.688      2016366.18
  556282.324      1837546.13666667 1678065.805       405669.7325
  339129.26333333  556282.324       574712.37       1678065.805
 2015065.688       339129.26333333 2015065.688      2016366.18
  793473.015      1678065.805       574712.37        339129.26333333
 1837546.13666667  405669.7325      903848.1325

In [90]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.956237498322573
R2 score on test set :  0.7908858877656422


In [91]:
if (r2_score(Y_train, Y_train_pred) > r2_score(Y_test, Y_test_pred) and 
    (r2_score(Y_train, Y_train_pred) - r2_score(Y_test, Y_test_pred)) > (gs_des_tree.cv_results_['std_test_score'][gs_des_tree.best_index_])):
    overfit = 'Yes'
else:
    overfit = 'No'

scores_df = scores_df.append({'model' : 'decision_tree', 
                            'r2_score_train' : r2_score(Y_train, Y_train_pred), 
                            'r2_score_test' : r2_score(Y_test, Y_test_pred), 
                            'std_dev' : gs_des_tree.cv_results_['std_test_score'][gs_des_tree.best_index_],
                            'overfitting' : overfit}, 
                            ignore_index = True)
scores_df


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,model,r2_score_train,r2_score_test,std_dev,overfitting
0,lr_1_feature,0.097114,0.239045,0.022576,No
1,lr_all_features,0.986832,0.935222,0.030784,Yes
2,lr_feature_selector,0.917245,0.823201,0.039905,Yes
3,ridge_all_features,0.986805,0.936262,0.031577,Yes
4,lasso_all_features,0.986329,0.939342,0.033395,Yes
5,decision_tree,0.956237,0.790886,0.620396,No


## Train Random Forest model with GridSearch

#### Preprocessing

In [92]:
# Same set than GridSearch for Lasso Model

#### Train model and hyperparameter search with GridSearch

In [93]:
# Perform grid search
print("Grid search...")
random_frst = RandomForestRegressor()

# Grid of values to be tested
params = {
    'max_depth': [1, 5, 10, 20, 50, 100],
    'min_samples_leaf': [2, 5, 10, 20],
    'min_samples_split': [2, 5, 10, 20],
    'n_estimators': [10, 20, 40, 60, 80, 100]
}
gs_random_frst = GridSearchCV(random_frst, param_grid = params, cv = 5, scoring='r2', verbose= 2)
gs_random_frst.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gs_random_frst.best_params_)
print("Best validation accuracy : ", gs_random_frst.best_score_)


Grid search...
Fitting 5 folds for each of 576 candidates, totalling 2880 fits
[CV] END max_depth=1, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=1, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=1, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=1, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=1, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.0s
[CV] END max_depth=1, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.0s
[CV] END max_depth=1, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.0s
[CV] END max_depth=1, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.0s
[CV] END max_depth=1, min_samples_leaf=2, min_samples_split=2, n_estimators=20; total time=   0.0s
[CV] END max_depth=1, min_samp

In [94]:
# Perform grid search
print("Grid search...")
random_frst = RandomForestRegressor()

# Grid of values to be tested
params = {
    'max_depth': [6, 8, 10, 12, 14, 16, 18],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [2, 3, 4],
    'n_estimators': [65, 70, 75, 80, 85, 90, 95]
}
gs_random_frst = GridSearchCV(random_frst, param_grid = params, cv = 5, scoring='r2', verbose= 2)
gs_random_frst.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gs_random_frst.best_params_)
print("Best validation accuracy : ", gs_random_frst.best_score_)


Grid search...
Fitting 5 folds for each of 441 candidates, totalling 2205 fits
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=65; total time=   0.0s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=65; total time=   0.0s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=65; total time=   0.0s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=65; total time=   0.0s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=65; total time=   0.0s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=70; total time=   0.0s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=70; total time=   0.0s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=70; total time=   0.0s
[CV] END max_depth=6, min_samples_leaf=2, min_samples_split=2, n_estimators=70; total time=   0.0s
[CV] END max_depth=6, min_samp

#### Performance assessment

In [95]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = gs_random_frst.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gs_random_frst.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on training set...
...Done.
[1021459.27007213  475294.10277339 1319516.70019706  808325.3065458
 1512778.43319412 1373627.88374902 1770510.88882362  757499.40571186
  837918.5782909  1094410.42919678 1869138.85047857  787931.8365288
 1868166.45171606  808002.69561765  886371.64583725 1133738.96703978
  762106.33239743 1426346.78026489  511692.89968375  561382.73914435
 1709860.39339575 1876808.46973408 1029386.80078501 1179267.03865654
 1807096.70847563 1838014.67488964  514900.59309776 1885151.29542787
  952748.42597502 1423979.03546429 1786131.04457278 1516590.05660588
 1537826.56125602 1769675.86791116  687159.575362    604562.35936807
 1282134.35277339 1452197.1660479  1847316.04464902 1829371.95899542
  584932.7293253  1819132.29391018 1541376.32371849  585972.46514286
  418691.44183669  637751.55010168  607318.0569606  1613940.0697578
 1854757.18458824  464409.64771176 2002152.9878035  1765208.67683543
 1240040.62960476 1259655.19304412  560615.85014239  395692.032795

In [96]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9147119673293248
R2 score on test set :  0.6150005378240277


In [97]:
if (r2_score(Y_train, Y_train_pred) > r2_score(Y_test, Y_test_pred) and 
    (r2_score(Y_train, Y_train_pred) - r2_score(Y_test, Y_test_pred)) > (gs_random_frst.cv_results_['std_test_score'][gs_random_frst.best_index_])):
    overfit = 'Yes'
else:
    overfit = 'No'

scores_df = scores_df.append({'model' : 'random_forest', 
                            'r2_score_train' : r2_score(Y_train, Y_train_pred), 
                            'r2_score_test' : r2_score(Y_test, Y_test_pred), 
                            'std_dev' : gs_random_frst.cv_results_['std_test_score'][gs_random_frst.best_index_],
                            'overfitting' : overfit}, 
                            ignore_index = True)
scores_df


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



Unnamed: 0,model,r2_score_train,r2_score_test,std_dev,overfitting
0,lr_1_feature,0.097114,0.239045,0.022576,No
1,lr_all_features,0.986832,0.935222,0.030784,Yes
2,lr_feature_selector,0.917245,0.823201,0.039905,Yes
3,ridge_all_features,0.986805,0.936262,0.031577,Yes
4,lasso_all_features,0.986329,0.939342,0.033395,Yes
5,decision_tree,0.956237,0.790886,0.620396,No
6,random_forest,0.914712,0.615001,0.142182,Yes


## Conclusion

In [98]:
scores_df['ecart'] = scores_df['r2_score_train'] - scores_df['r2_score_test'] - scores_df['std_dev']
scores_df['%_std_dev'] = scores_df['ecart'] / scores_df['std_dev']
scores_df

Unnamed: 0,model,r2_score_train,r2_score_test,std_dev,overfitting,ecart,%_std_dev
0,lr_1_feature,0.097114,0.239045,0.022576,No,-0.164507,-7.286906
1,lr_all_features,0.986832,0.935222,0.030784,Yes,0.020827,0.67655
2,lr_feature_selector,0.917245,0.823201,0.039905,Yes,0.054139,1.356707
3,ridge_all_features,0.986805,0.936262,0.031577,Yes,0.018967,0.600669
4,lasso_all_features,0.986329,0.939342,0.033395,Yes,0.013592,0.407003
5,decision_tree,0.956237,0.790886,0.620396,No,-0.455044,-0.733474
6,random_forest,0.914712,0.615001,0.142182,Yes,0.15753,1.107943


Overfitting semble inévitable étant donné le nombre de ligne faible (150 lignes). 
Même avec les modèles de régularisation Ridge et Lasso, et les rechecrches d'hyperparamètres via GridSearch avec CrossValidation, les scores R2 de test sont quelques peu améliorés, mais pas suffisamment pour limiter l'écart entre les R2 scores de train et de test en dessous des écarts types des scores après CrossValidation.
Le modèle le plus performant et le plus proche d'éviter l'overfitting semble être le model lasso dont le score de test se rapproche le plus de celui de train en comblant plus de 60 % de l'écart type des scores.