In [154]:
import pandas as pd
import numpy as np
import datetime

import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge,Lasso,LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score


In [155]:
#Lecture du fichier
dataset = pd.read_csv("Walmart_Store_sales.csv")

In [156]:
#Affichage du dataset
dataset.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


In [157]:
# Basic stats
print("Number of rows : {}".format(dataset.shape[0]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Number of rows : 150

Display of dataset: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092



Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,19-10-2012,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15



Percentage of missing values: 


Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dtype: float64

In [158]:
#Suppression des lignes de target "Weekly Sales" qui sont nulles
print("Number of rows Avant: {}".format(dataset.shape[0]))
dataset = dataset[~dataset['Weekly_Sales'].isnull()]
print("Number of rows Apres: {}".format(dataset.shape[0]))

Number of rows Avant: 150
Number of rows Apres: 136


In [159]:
#Pour les colonnes Temperature, Fuel_price, CPI and Unemployment on supprimes les outliners à +/- 3std
#[dataset.drop(dataset[(-3*data_desc.loc['std',c]>=dataset[c]) | (dataset[c]>=3*data_desc.loc['std',c])].index,inplace=True) for c in ['Temperature','Fuel_Price','CPI','Unemployment']]

for c in ['Temperature','Fuel_Price','CPI','Unemployment']:
    fig = px.histogram(dataset,x=c,nbins=30,width=500)
    fig.show()
    print("Number of rows Avant: {}".format(dataset.shape[0]))
    mask = (data_desc.loc['mean',c]-3*data_desc.loc['std',c]>=dataset[c]) | (dataset[c]>=data_desc.loc['mean',c]+3*data_desc.loc['std',c])
    dataset.drop(dataset[mask].index,inplace=True)
    print("Number of rows Apres: {}".format(dataset.shape[0]))
    fig = px.histogram(dataset,x=c,nbins=30,width=500)
    fig.show()

Number of rows Avant: 136
Number of rows Apres: 136


Number of rows Avant: 136
Number of rows Apres: 136


Number of rows Avant: 136
Number of rows Apres: 136


Number of rows Avant: 136
Number of rows Apres: 131


In [160]:
dataset['Date'] = pd.to_datetime(dataset['Date'],format='%d-%m-%Y')

In [161]:
type(dataset['Date'].dtype)

numpy.dtype[datetime64]

In [162]:
dataset.describe(include='all',datetime_is_numeric=True)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,131.0,113,131.0,120.0,117.0,119.0,120.0,117.0
mean,9.938931,2011-04-24 21:52:33.982300928,1257990.0,0.066667,60.405897,3.302908,180.175755,7.399427
min,1.0,2010-02-05 00:00:00,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,2010-07-30 00:00:00,584243.9,0.0,44.82,2.824,132.579257,6.664
50%,9.0,2011-04-22 00:00:00,1366396.0,0.0,61.79,3.435,197.655672,7.368
75%,16.0,2012-01-13 00:00:00,1809576.0,0.0,75.54,3.7085,214.904838,8.099
max,20.0,2012-10-19 00:00:00,2771397.0,1.0,91.65,4.17,226.968844,9.524
std,6.228663,,657746.3,0.25049,18.46674,0.475435,39.723167,0.994117


In [163]:
#Suppression "DATE" qui sont nulles
print("Number of rows Avant: {}".format(dataset.shape[0]))
dataset = dataset[~dataset['Date'].isnull()]
print("Number of rows Apres: {}".format(dataset.shape[0]))

Number of rows Avant: 131
Number of rows Apres: 113


In [164]:
#Ajout de valeur interessante pour le modele 
#Year
#Month
#Day
#Week_day

#Ajout des colonnes
dataset['Year'] = None
dataset['Month'] = None
dataset['Day'] = None
dataset['Week_day'] = None

#Alimentation des colonnes
dataset['Year'] = dataset['Date'].dt.year
dataset['Month'] = dataset['Date'].dt.month
dataset['Day'] = dataset['Date'].dt.day
dataset['Week_day'] = dataset['Date'].dt.weekday

In [165]:
#Affichage dataset propre
dataset.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Week_day
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011,3,25,4
4,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
5,4.0,2010-05-28,1857533.7,0.0,,2.756,126.160226,7.896,2010,5,28,4
6,15.0,2011-06-03,695396.19,0.0,69.8,4.069,134.855161,7.658,2011,6,3,4


In [166]:
#Suppression de la Date
#TO DO : Remplir les YEAR MONTH etc avec les données de la date la plus représentée
dataset.drop(columns=['Date'],axis=1,inplace=True)

In [167]:
dataset.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Week_day
0,6.0,1572117.54,,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011,3,25,4
4,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
5,4.0,1857533.7,0.0,,2.756,126.160226,7.896,2010,5,28,4
6,15.0,695396.19,0.0,69.8,4.069,134.855161,7.658,2011,6,3,4


In [168]:
dataset.dtypes

Store           float64
Weekly_Sales    float64
Holiday_Flag    float64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
Year              int64
Month             int64
Day               int64
Week_day          int64
dtype: object

Les données semblent assez peu corrélées entre elles

In [169]:
# Correlation matrix
corr_matrix = dataset.corr().round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())


fig.show()

In [170]:
#On note que le prix de l'essence et l'année sont fortement liés

In [171]:
# Visualize pairwise dependencies
fig = px.scatter_matrix(dataset)
fig.update_layout(
        title = go.layout.Title(text = "Bivariate analysis", x = 0.5), showlegend = False, 
            autosize=False, height=800, width = 800)
fig.show()

In [172]:
#On voit que le weekday est tjrs à 4 donc cette colonne n'a pas beaucoup d'intéret
#dataset.drop(columns=['Week_day'],axis=1,inplace=True)

In [173]:
dataset.describe(include='all')

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Week_day
count,113.0,113.0,104.0,103.0,102.0,104.0,102.0,113.0,113.0,113.0,113.0
mean,9.858407,1267415.0,0.067308,60.197087,3.273863,180.105389,7.376775,2010.831858,6.274336,16.530973,4.0
std,6.184467,674682.4,0.251767,17.878511,0.481421,39.201866,0.973078,0.822699,3.179869,8.238705,0.0
min,1.0,268929.0,0.0,18.79,2.514,126.111903,5.143,2010.0,1.0,1.0,4.0
25%,4.0,563460.8,0.0,45.02,2.81475,132.579257,6.64225,2010.0,4.0,10.0,4.0
50%,9.0,1420405.0,0.0,61.11,3.3025,197.500965,7.4045,2011.0,6.0,17.0,4.0
75%,15.0,1847431.0,0.0,75.255,3.6835,214.809008,8.09675,2012.0,9.0,24.0,4.0
max,20.0,2771397.0,1.0,91.65,4.17,226.968844,9.524,2012.0,12.0,31.0,4.0


In [174]:
#clean de la date 
#dataset['Year'] = str(dataset['Year'])

In [175]:
# Separate target variable Y from features X
print("Separating labels from features...")
features_list = dataset.drop(columns="Weekly_Sales",axis=1).columns.to_list()
target_variable = ["Weekly_Sales"]

X = dataset.loc[:,features_list]
Y = dataset.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
   Weekly_Sales
0    1572117.54
1    1807545.43
4    1644470.66
5    1857533.70
6     695396.19

X :
   Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
0    6.0           NaN        59.61       3.045  214.777523         6.858   
1   13.0           0.0        42.38       3.435  128.616064         7.470   
4    6.0           0.0        78.89       2.759  212.412888         7.092   
5    4.0           0.0          NaN       2.756  126.160226         7.896   
6   15.0           0.0        69.80       4.069  134.855161         7.658   

   Year  Month  Day  Week_day  
0  2011      2   18         4  
1  2011      3   25         4  
4  2010      5   28         4  
5  2010      5   28         4  
6  2011      6    3         4  


In [187]:
# Automatically detect names of numeric/categorical columns
numeric_features = []
categorical_features = []
for i,t in X.dtypes.items():
    if ('float' in str(t)) or ('int' in str(t)) :
        numeric_features.append(i)
    else :
        categorical_features.append(i)

print('Found numeric features ', numeric_features)
print('Found categorical features ', categorical_features)

#La detection automatique ne nous trouve aucune categorical feature ce qui est normal car le holiday est codé en 0,1 
#On le force

numeric_features = [ 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment','Year', 'Month', 'Day','Week_day']
categorical_features = ['Holiday_Flag','Store']
print('Forced numeric features ', numeric_features)
print('Forced categorical features ', categorical_features)


Found numeric features  ['Store', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'Week_day']
Found categorical features  []
Forced numeric features  ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'Week_day']
Forced categorical features  ['Holiday_Flag', 'Store']


In [188]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [189]:
# Create pipeline for numeric features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])

# Create pipeline for categorical features
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by columns' mean
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [190]:
# Preprocessings on train set
print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print('...Done.')
print(X_train[0:5]) # MUST use this syntax because X_train is a numpy array and not a pandas DataFrame anymore
print()


Performing preprocessings on train set...
     Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
39     5.0           0.0        60.71       3.297  218.569962         6.300   
90     9.0           NaN        78.51       2.642  214.656430         6.442   
147   17.0           0.0        57.14       2.841  126.111903           NaN   
143    3.0           0.0        78.53       2.705  214.495838         7.343   
27    16.0           0.0        43.95       3.828  192.831317         6.339   

     Year  Month  Day  Week_day  
39   2011     11   11         4  
90   2010      7    9         4  
147  2010      6   11         4  
143  2010      6    4         4  
27   2011      5   20         4  
...Done.
[[ 0.07085874  0.05195161  1.04658292 -1.24080326  0.23572492  1.39501821
  -0.65585347  0.          0.          0.          0.          0.
   1.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0

In [191]:

# Preprocessings on test set
print("Performing preprocessings on test set...")
print(X_test.head()) 
X_test = preprocessor.transform(X_test) # Don't fit again !! The test set is used for validating decisions
# we made based on the training set, therefore we can only apply transformations that were parametered using the training set.
# Otherwise this creates what is called a leak from the test set which will introduce a bias in all your results.
print('...Done.')
print(X_test[0:5,:]) # MUST use this syntax because X_test is a numpy array and not a pandas DataFrame anymore
print()

Performing preprocessings on test set...
     Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
107    8.0           1.0        33.34       2.548  214.621419         6.299   
6     15.0           0.0        69.80       4.069  134.855161         7.658   
49    16.0           0.0        48.29       3.750  197.413326         6.162   
95     1.0           0.0        74.78       2.854  210.337426         7.808   
13     1.0           0.0        64.74       3.734  221.211813         7.348   

     Year  Month  Day  Week_day  
107  2010      2   12         4  
6    2011      6    3         4  
49   2012      3   30         4  
95   2010      5   14         4  
13   2012      3   16         4  
...Done.
[[-1.53228135 -1.58349895  0.93894098 -1.24189996 -1.01223053 -1.39501821
  -0.5331366   0.          1.          0.          0.          0.
   0.          0.          0.          1.          0.          0.
   0.          0.          0.          0.          0.          0.

In [192]:
# Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


In [193]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[[ 439664.93431987]
 [ 547164.76068008]
 [ 811200.8333214 ]
 [ 469735.24896041]
 [ 450636.7957676 ]
 [1997707.78621654]
 [ 433872.46629569]
 [1365604.54436315]
 [ 645623.40961892]
 [1963896.36444988]
 [1486045.42996678]
 [ 535273.61514081]
 [ 536436.46456174]
 [1623280.15295287]
 [1936159.66608764]
 [ 348713.16699275]
 [2114973.09867026]
 [1941814.9672147 ]
 [ 454088.46673456]
 [ 614823.84110905]
 [1548024.15538601]
 [1522611.61717938]
 [ 230366.74794664]
 [ 902008.63668222]
 [ 504018.8707628 ]
 [1314566.19077943]
 [ 895066.49999999]
 [ 392602.1106887 ]
 [2051985.53265523]
 [1916934.50878321]
 [1322237.51157226]
 [1388311.94401109]
 [2134611.34308229]
 [1435359.66824388]
 [ 506130.48153519]
 [1207957.01305293]
 [1974894.0339923 ]
 [2011944.53192687]
 [2032108.93896211]
 [2010421.5483339 ]
 [1587894.45426032]
 [1967210.43005864]
 [1150863.6181198 ]
 [1527509.83478215]
 [ 571400.70396531]
 [2370741.68857081]
 [2052351.80080461]
 [ 589102.41967066]


In [194]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[[ 939743.99192721]
 [ 695378.75146698]
 [ 341422.75865179]
 [1548337.8451615 ]
 [1441052.64808995]
 [2020762.32504814]
 [ 937193.66938141]
 [ 343818.03068365]
 [ 950090.06037736]
 [ 958310.59101486]
 [1991063.85702919]
 [1829178.97500525]
 [1320543.77299513]
 [ 885419.15956845]
 [1359399.43984426]
 [1542081.66395655]
 [1874324.50753794]
 [2011817.83389317]
 [1071799.51309808]
 [2268885.15490408]
 [ 441448.62563879]
 [1025598.21758198]
 [1528520.56221378]]



In [195]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9744917065004568
R2 score on test set :  0.930918323133179


In [139]:
#La régression linéaire nous donne déjà un très bon R2

In [140]:
#Cherchons les features les plus interessante pour le modèle 

regressor.coef_[0][0:7]

array([-32156.28289626, -29494.36529511,  37151.56191437, -32703.00605001,
       -18590.65911912,  64239.00034695, -44022.32338203])

In [141]:
len(dataset['Store'].unique())

19

In [196]:
#On peut matcher les colonnes avec nos données d'entrée: 
#Les 7 premieres sont nos numerique features
for i in range(0,len(numeric_features)):
    print(numeric_features[i],':',regressor.coef_[0][i])



Temperature : -32156.282896258046
Fuel_Price : -29494.365295109514
CPI : 37151.561914368984
Unemployment : -32703.006050005108
Year : -18590.659119118285
Month : 64239.00034694914
Day : -44022.32338203008
Week_day : 1.0477378964424133e-09


In [None]:
# On voit clairement que la features weekday n'a pas d'impacte.

In [197]:
for i in categorical_features:
    nb_feat =  dataset[i].nunique(dropna=True)-1 #(OneHotEncoder) 
    for j in range (len(numeric_features),len(numeric_features) + nb_feat):
        print(i+'_'+str(j),':',regressor.coef_[0][j])

Holiday_Flag_8 : -55037.01078085424
Store_8 : -55037.01078085424
Store_9 : 411200.0442518152
Store_10 : -1175961.537368167
Store_11 : 547824.42621763
Store_12 : -1281559.3971901457
Store_13 : 43455.37785493534
Store_14 : -962398.955137266
Store_15 : -660836.612228464
Store_16 : -1128007.3400384972
Store_17 : 568597.737649105
Store_18 : 178892.19201501252
Store_19 : 494795.44216754095
Store_20 : 611528.3644780599
Store_21 : -769162.1368608929
Store_22 : -1072384.3108449562
Store_23 : -735524.0146542635
Store_24 : -342683.0792961309
Store_25 : -45604.40983157167


In [None]:
#Le modèle utilise le store avec des coeff qui ont une difference forte d'ordre de grandeur et donc d'importance

RIDGE

In [203]:
# Perform grid search
print("Grid search...")
rig_regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [1,100,1000,5000,10000,50000,100000], # 0 corresponds to no regularization
    'max_iter': [10000],
    'solver':['svd', 'lsqr', 'sag','saga']
}
gridsearch = GridSearchCV(rig_regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 1, 'max_iter': 10000, 'solver': 'svd'}
Best R2 score :  0.8493570925636774


In [204]:
# Predictions on train set
print("Predictions on test set...")
Y_train_pred_reg = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred_reg)
print()

Predictions on test set...
...Done.
[[ 532487.58578007]
 [ 662300.19678057]
 [ 973994.01581256]
 [ 523545.01450056]
 [ 636432.72075462]
 [1953199.71115588]
 [ 430857.85991337]
 [1352820.80000528]
 [ 737517.14397981]
 [1812117.95257666]
 [1415871.59865716]
 [ 639708.44038288]
 [ 722567.60670065]
 [1396698.71013223]
 [1861189.44261979]
 [ 477253.4325218 ]
 [2015661.82264307]
 [1765253.05761272]
 [ 508233.58081537]
 [ 749348.65087761]
 [1327746.12206248]
 [1378760.24998532]
 [ 349224.9769254 ]
 [1066551.42062286]
 [ 619629.54119063]
 [1288866.68040744]
 [1038385.3115749 ]
 [ 621515.15705342]
 [2007524.76164548]
 [1812034.96550927]
 [1312713.07346435]
 [1437251.01477947]
 [1964095.04236271]
 [1400277.95604571]
 [ 702758.50257295]
 [1290098.1708813 ]
 [1803709.35118428]
 [1980806.38075267]
 [1913822.59849005]
 [1976900.76205341]
 [1527631.74929344]
 [1976153.17290044]
 [1197739.7665271 ]
 [1491115.27989776]
 [ 729318.53600786]
 [2167558.59200686]
 [2043984.03043322]
 [ 780740.969637  ]
 [ 5

In [205]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred_reg = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred_reg)
print()

Predictions on test set...
...Done.
[[1121038.8458681 ]
 [ 921813.83400877]
 [ 550614.06360115]
 [1283158.04225589]
 [1195950.87917905]
 [1690222.48712165]
 [1023251.21012559]
 [ 400860.72505518]
 [1050797.98527612]
 [1058292.20608058]
 [1921681.96838975]
 [1695171.4682108 ]
 [1383814.09624365]
 [1126026.54729835]
 [1367808.7790655 ]
 [1470308.46592869]
 [1720172.99365529]
 [2094149.76930888]
 [ 971153.73510768]
 [2025590.13737022]
 [ 538580.55814469]
 [1118279.75131291]
 [1453239.14193844]]



In [206]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred_reg))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred_reg))

R2 score on training set :  0.9380167925632867
R2 score on test set :  0.8318887091976797


Lasso

In [201]:
# Perform grid search
print("Grid search...")
las_regressor = Lasso()
# Grid of values to be tested
params = {
    'alpha': [1,100,1000,5000,10000,50000,100000], # 0 corresponds to no regularization
    'max_iter': [10000]
}
gridsearch = GridSearchCV(las_regressor, param_grid = params, cv = 10) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 1000, 'max_iter': 10000}
Best R2 score :  0.9075979411969639


In [149]:
# Predictions on train set
print("Predictions on test set...")
Y_train_pred_reg = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred_reg)
print()

Predictions on test set...
...Done.
[ 451662.45553088  573328.51350833  865506.43934747  474222.87290976
  495437.95922991 1992304.50989168  431997.91418525 1384848.74575483
  644492.4074099  1928244.79239262 1447026.49632733  537441.8769299
  594044.35179894 1573391.6204028  1918985.83575594  384223.13010352
 2087658.45213606 1884964.10404373  453284.44051961  640344.00312336
 1499025.31895331 1516011.20056087  250072.64158397  956295.67141866
  522903.32605081 1322368.4764996  1049048.08274218  463677.16806333
 2050557.9980014  1888110.70214913 1329426.35774196 1417009.29615987
 2084488.75030804 1444123.38416173  574096.86473222 1237153.23082297
 1926533.05500985 2016257.36876268 2023632.3088092  2001990.4556475
 1551544.01724931 1981192.40823408 1171028.23996841 1527504.95608159
  628106.73151271 2298133.13897365 2048205.5093205   640678.60397203
  504165.42892821 1212642.47746166 2063248.22485336 2226791.93613767
 1952139.10806621  340185.75171544  677871.47451932 2097651.937527
  

In [150]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred_reg = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred_reg)
print()

Predictions on test set...
...Done.
[1118105.96926396  759498.49470057  409775.52617942 1484238.13604298
 1391809.12120226 1866210.51204209 1056856.77643456  359398.60327638
 1092444.68030976  987785.75705033 1968335.97716863 1805538.45917486
 1366861.80835813 1068875.75357706 1375976.90777743 1506095.0182704
 1773252.67382666 2074085.10521114 1051112.53634943 2191445.21968674
  458271.20290963 1162114.32352363 1491547.52865176]



In [151]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred_reg))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred_reg))

R2 score on training set :  0.9702703393286651
R2 score on test set :  0.900592861948124


Conclusion

In [None]:
#RIDGE et LASSO n'ameliore pas notre score sur le test set.
