In [1]:
!pip install plotly



In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV


import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# setting Jedha color palette as default
pio.templates["jedha"] = go.layout.Template(
    layout_colorway=["#4B9AC7", "#4BE8E0", "#9DD4F3", "#97FBF6", "#2A7FAF", "#23B1AB", "#0E3449", "#015955"]
)
pio.templates.default = "jedha"
pio.renderers.default = "iframe" # to be replaced by "iframe" if working on JULIE

In [3]:
df = pd.read_csv("Walmart_Store_sales.csv")

In [4]:
print(df.shape)
df.head()

(150, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


In [5]:
df = df.dropna(subset = ['Weekly_Sales', 'Date'])
df["Holiday_Flag"] = df["Holiday_Flag"].fillna(0)

In [6]:
print(df.shape)
df.head()

(118, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,0.0,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092
5,4.0,28-05-2010,1857533.7,0.0,,2.756,126.160226,7.896
6,15.0,03-06-2011,695396.19,0.0,69.8,4.069,134.855161,7.658


In [7]:
df["Date"] = pd.to_datetime(df['Date'])

In [8]:
df['year']= df['Date'].dt.year
df['month']= df['Date'].dt.month
df['day']= df['Date'].dt.day
df['day_of_month']= df['Date'].dt.weekday
df

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,day,day_of_month
0,6.0,2011-02-18,1572117.54,0.0,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.470,2011,3,25,4
4,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
5,4.0,2010-05-28,1857533.70,0.0,,2.756,126.160226,7.896,2010,5,28,4
6,15.0,2011-03-06,695396.19,0.0,69.80,4.069,134.855161,7.658,2011,3,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...
144,3.0,2012-10-19,424513.08,0.0,73.44,3.594,226.968844,6.034,2012,10,19,4
145,14.0,2010-06-18,2248645.59,0.0,72.62,2.780,182.442420,8.899,2010,6,18,4
147,17.0,2010-11-06,845252.21,0.0,57.14,2.841,126.111903,,2010,11,6,5
148,8.0,2011-12-08,856796.10,0.0,86.05,3.638,219.007525,,2011,12,8,3


In [9]:
#Suppression de la colone Date
df = df.drop("Date", axis=1)

In [10]:
df

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,day,day_of_month
0,6.0,1572117.54,0.0,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.470,2011,3,25,4
4,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
5,4.0,1857533.70,0.0,,2.756,126.160226,7.896,2010,5,28,4
6,15.0,695396.19,0.0,69.80,4.069,134.855161,7.658,2011,3,6,6
...,...,...,...,...,...,...,...,...,...,...,...
144,3.0,424513.08,0.0,73.44,3.594,226.968844,6.034,2012,10,19,4
145,14.0,2248645.59,0.0,72.62,2.780,182.442420,8.899,2010,6,18,4
147,17.0,845252.21,0.0,57.14,2.841,126.111903,,2010,11,6,5
148,8.0,856796.10,0.0,86.05,3.638,219.007525,,2011,12,8,3


In [11]:
# deleting outliers

desc = df.describe()
desc

col_out = ["Temperature", "Fuel_Price", "CPI", "Unemployment" ]
mask = True
for col in col_out:
    q1 = desc.loc["25%", col]
    q3 = desc.loc["75%", col]
    ecart = q3 - q1
    cond1 = q1 - 3*ecart < df[col]
    cond2 = q3 + 3*ecart > df[col]
    mask = mask & cond1 & cond2


In [12]:
mask.value_counts()

True     80
False    38
dtype: int64

In [13]:
df_clean = df.loc[mask, :]
df_clean.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,day,day_of_month
0,6.0,1572117.54,0.0,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011,3,25,4
4,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
6,15.0,695396.19,0.0,69.8,4.069,134.855161,7.658,2011,3,6,6
7,20.0,2203523.2,0.0,39.93,3.617,213.023623,6.961,2012,3,2,4


In [14]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 149
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         80 non-null     float64
 1   Weekly_Sales  80 non-null     float64
 2   Holiday_Flag  80 non-null     float64
 3   Temperature   80 non-null     float64
 4   Fuel_Price    80 non-null     float64
 5   CPI           80 non-null     float64
 6   Unemployment  80 non-null     float64
 7   year          80 non-null     int64  
 8   month         80 non-null     int64  
 9   day           80 non-null     int64  
 10  day_of_month  80 non-null     int64  
dtypes: float64(7), int64(4)
memory usage: 7.5 KB


In [15]:
# Separate target variable Y from features X
print("Separating labels from features...")
features_list = ["Store", "Holiday_Flag", "Temperature","Fuel_Price", "CPI",
                 "Unemployment", "month", "day_of_month"]
numeric_indices = [2,3,4,5]
categorical_indices = [0,1,6,7]
target_variable = "Weekly_Sales"

X = df_clean.loc[:,features_list]
Y = df_clean.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

Y : 
0    1572117.54
1    1807545.43
4    1644470.66
6     695396.19
7    2203523.20
Name: Weekly_Sales, dtype: float64

X :
   Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
0    6.0           0.0        59.61       3.045  214.777523         6.858   
1   13.0           0.0        42.38       3.435  128.616064         7.470   
4    6.0           0.0        78.89       2.759  212.412888         7.092   
6   15.0           0.0        69.80       4.069  134.855161         7.658   
7   20.0           0.0        39.93       3.617  213.023623         6.961   

   month  day_of_month  
0      2             4  
1      3             4  
4      5             4  
6      3             6  
7      3             4  


In [16]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, 
                                                    random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [17]:
# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.tolist()
Y_test = Y_test.tolist()
print("...Done")

print(X_train[0:5,:])
print(X_test[0:2,:])
print()
print(Y_train[0:5])
print(Y_test[0:2])

Convert pandas DataFrames to numpy arrays...
...Done
[[ 15.          0.         69.8         4.069     134.8551613   7.658
    3.          6.       ]
 [  4.          0.         81.85        3.57      129.0663      5.946
    6.          4.       ]
 [  7.          0.         36.61        3.767     192.826069    8.595
    5.          4.       ]
 [  1.          1.         38.51        2.548     211.2421698   8.106
   12.          3.       ]
 [ 18.          0.         73.67        2.792     132.6141935   9.342
    6.          1.       ]]
[[  2.          0.         44.69        2.976     211.0648881   8.028
    7.          4.       ]
 [  2.          0.         54.63        3.555     220.275944    7.057
    2.          4.       ]]

[695396.19, 2008344.92, 414094.05, 1641957.44, 1166117.85]
[1758050.79, 1861802.7]


In [18]:
# Missing values
print("Imputing missing values...")
print(X_train[0:5,:])
print()
imputer = SimpleImputer(strategy="mean")
X_train[:,numeric_indices] = imputer.fit_transform(X_train[:,numeric_indices])
print("...Done!")
print(X_train[0:5,:]) 
print() 

Imputing missing values...
[[ 15.          0.         69.8         4.069     134.8551613   7.658
    3.          6.       ]
 [  4.          0.         81.85        3.57      129.0663      5.946
    6.          4.       ]
 [  7.          0.         36.61        3.767     192.826069    8.595
    5.          4.       ]
 [  1.          1.         38.51        2.548     211.2421698   8.106
   12.          3.       ]
 [ 18.          0.         73.67        2.792     132.6141935   9.342
    6.          1.       ]]

...Done!
[[ 15.          0.         69.8         4.069     134.8551613   7.658
    3.          6.       ]
 [  4.          0.         81.85        3.57      129.0663      5.946
    6.          4.       ]
 [  7.          0.         36.61        3.767     192.826069    8.595
    5.          4.       ]
 [  1.          1.         38.51        2.548     211.2421698   8.106
   12.          3.       ]
 [ 18.          0.         73.67        2.792     132.6141935   9.342
    6.          1. 

In [19]:
# Encoding categorical features and standardizing numerical features
print("Encoding categorical features and standardizing numerical features...")
print()
print(X_train[0:5,:])

# Normalization
numeric_transformer = StandardScaler()

# OHE / dummyfication
categorical_transformer = OneHotEncoder(drop='first')

featureencoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_indices),    
        ('num', numeric_transformer, numeric_indices)
        ]
    )

X_train = featureencoder.fit_transform(X_train)
print("...Done")
print(X_train[0:5,:])

Encoding categorical features and standardizing numerical features...

[[ 15.          0.         69.8         4.069     134.8551613   7.658
    3.          6.       ]
 [  4.          0.         81.85        3.57      129.0663      5.946
    6.          4.       ]
 [  7.          0.         36.61        3.767     192.826069    8.595
    5.          4.       ]
 [  1.          1.         38.51        2.548     211.2421698   8.106
   12.          3.       ]
 [ 18.          0.         73.67        2.792     132.6141935   9.342
    6.          1.       ]]
...Done
  (0, 12)	1.0
  (0, 20)	1.0
  (0, 34)	1.0
  (0, 35)	0.5858922363885097
  (0, 36)	1.5619981885198218
  (0, 37)	-1.0553617637599562
  (0, 38)	0.30814093975278123
  (1, 2)	1.0
  (1, 23)	1.0
  (1, 32)	1.0
  (1, 35)	1.2843080223114072
  (1, 36)	0.5704464950143056
  (1, 37)	-1.2039825405701505
  (1, 38)	-1.5030487221126148
  (2, 5)	1.0
  (2, 22)	1.0
  (2, 32)	1.0
  (2, 35)	-1.3377940652530642
  (2, 36)	0.9619007708070645
  (2, 37)	0.4329

In [20]:
# Regression Linéaire
print("3-fold cross-validation...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
scores = cross_val_score(regressor, X_train, Y_train, cv=3)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

3-fold cross-validation...
The cross-validated R2-score is :  0.25257761719353644
The standard deviation is :  0.27053338002025706


In [21]:
print(scores)

[0.00465456 0.12418407 0.62889422]


In [22]:
# Perform grid search
print("Grid search...")
regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.0, 0.1, 0.5, 1.0] # 0 corresponds to no regularization
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 3, verbose = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)

Grid search...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] alpha=0.0 .......................................................
[CV] ........................... alpha=0.0, score=0.507, total=   0.0s
[CV] alpha=0.0 .......................................................
[CV] ........................... alpha=0.0, score=0.406, total=   0.0s
[CV] alpha=0.0 .......................................................
[CV] ........................... alpha=0.0, score=0.629, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.392, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.612, total=   0.0s
[CV] alpha=0.1 .......................................................
[CV] ........................... alpha=0.1, score=0.854, total=   0.0s
[CV] alpha=0.5 .......................................................
[C

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed:    0.1s finished


In [23]:
# Predictions on training set
# The model has already be re-trained on all the training set at the end of the grid search, so we can directly use it !
print("Predictions on training set...")
Y_train_pred = gridsearch.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[ 683298.67352528 2122319.02447614  530857.77123125 1537832.99855294
 1082943.90669445 2093427.41147623 2405404.55026704 1326937.49988936
 2013708.35001695  460513.43918497  994504.64261884 1714450.02530471
 1064445.84365919  430021.18504779 2141130.5428909  1979635.22138289
 1906817.31838729 1998196.73955575  381941.05845417  642335.62677028
 2397514.67190448  353653.2744997  1577086.40421506  553314.18663737
  466585.95304139 1988952.57906928 1964614.61943681 1974748.67335048
  517945.48908998 1418323.19255684 1416559.8772907   685659.14233739
 1460318.51214885  555110.05620848  924744.69889396 1840110.50047487
  740915.64620775 1606901.5185763  2075055.88633565 1307646.99430116
  866927.62874238 1363675.04504727  975292.84869388  404306.57741915
 1390689.43263883 1523516.54937574 1495884.27565281 1191766.00220405
  560142.77519404  924178.61912224 1961476.63733963 1894750.65044464
  895789.62757151  418326.72942506  247399.96007746 2069469.438

In [24]:
# Missing values
print("Imputing missing values...")
print(X_test[0:5,:])
print()

X_test[:,numeric_indices] = imputer.transform(X_test[:,numeric_indices])
print("...Done!")
print(X_test[0:5,:]) 
print() 

Imputing missing values...
[[  2.          0.         44.69        2.976     211.0648881   8.028
    7.          4.       ]
 [  2.          0.         54.63        3.555     220.275944    7.057
    2.          4.       ]
 [  1.          0.         91.65        3.684     215.544618    7.962
    5.          6.       ]
 [  3.          0.         73.44        3.594     226.9688442   6.034
   10.          4.       ]
 [  1.          0.         85.22        2.619     211.5673056   7.787
    8.          4.       ]]

...Done!
[[  2.          0.         44.69        2.976     211.0648881   8.028
    7.          4.       ]
 [  2.          0.         54.63        3.555     220.275944    7.057
    2.          4.       ]
 [  1.          0.         91.65        3.684     215.544618    7.962
    5.          6.       ]
 [  3.          0.         73.44        3.594     226.9688442   6.034
   10.          4.       ]
 [  1.          0.         85.22        2.619     211.5673056   7.787
    8.          4. 

In [25]:
# Encoding categorical features and standardizing numerical features
print("Encoding categorical features and standardizing numerical features...")
print()
print(X_test[0:5,:])

X_test = featureencoder.transform(X_test)
print("...Done")
print(X_test[0:5,:])

Encoding categorical features and standardizing numerical features...

[[  2.          0.         44.69        2.976     211.0648881   8.028
    7.          4.       ]
 [  2.          0.         54.63        3.555     220.275944    7.057
    2.          4.       ]
 [  1.          0.         91.65        3.684     215.544618    7.962
    5.          6.       ]
 [  3.          0.         73.44        3.594     226.9688442   6.034
   10.          4.       ]
 [  1.          0.         85.22        2.619     211.5673056   7.787
    8.          4.       ]]
...Done
  (0, 0)	1.0
  (0, 24)	1.0
  (0, 32)	1.0
  (0, 35)	-0.8694787498790384
  (0, 36)	-0.6098775649902554
  (0, 37)	0.9012145317385521
  (0, 38)	0.6995779577961211
  (1, 0)	1.0
  (1, 19)	1.0
  (1, 32)	1.0
  (1, 35)	-0.29335817626297167
  (1, 36)	0.5406403318828779
  (1, 37)	1.1376952854059645
  (1, 38)	-0.32767973009599305
  (2, 22)	1.0
  (2, 34)	1.0
  (2, 35)	1.8523142216511916
  (2, 36)	0.7969733348131607
  (2, 37)	1.0162252226386526


In [26]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = gridsearch.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[1603202.99667555 1591270.30662487 1427098.21989512  492748.34251667
 1460869.36116005 1955251.9498799   321022.65770276  375162.48659718
 1468164.01353219  614325.7956426   470939.46652112  604355.53555661
 1488163.4661406  2097263.21631434  606196.9878278  2068196.83542153]



In [27]:
# Print R^2 scores on train/test sets for the Ridge model with optimal value of the regularization strength
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))


R2 score on training set :  0.9772395163852005
R2 score on test set :  0.9625115503828109
