In [389]:
!pip install plotly -q

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline


import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [390]:
#Afficher les 5 premières lignes du dataframe
df = pd.read_csv('Walmart_Store_sales.csv')
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


In [391]:
#Compter les valeurs Holiday Flag (on voit un désequilibre)
df["Holiday_Flag"].value_counts()

0.0    127
1.0     11
Name: Holiday_Flag, dtype: int64

In [392]:
#Suppression de la colonne Holiday_Flag car trop desequilibrée
df = df.drop("Holiday_Flag", axis=1)

In [393]:
#Afficher les détails du dataframe
df.describe(include='all')

Unnamed: 0,Store,Date,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,132.0,136.0,138.0,135.0
unique,,85,,,,,
top,,19-10-2012,,,,,
freq,,4,,,,,
mean,9.866667,,1249536.0,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,76.345,3.70625,214.934616,8.15


In [394]:
#Drop les lignes où on a un Nan en Weekly_Sales
df = df.dropna(subset=['Weekly_Sales'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 0 to 149
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         136 non-null    float64
 1   Date          118 non-null    object 
 2   Weekly_Sales  136 non-null    float64
 3   Temperature   121 non-null    float64
 4   Fuel_Price    124 non-null    float64
 5   CPI           125 non-null    float64
 6   Unemployment  122 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.5+ KB


In [395]:
#Transformer la colonne Date en un format datetime
df['Date'] = pd.to_datetime(df['Date'])

In [396]:
#Créer des colonnes supp pour le jour le mois et l'année
#df['Day'] = pd.DatetimeIndex(df['Date']).day
#df['Month'] = pd.DatetimeIndex(df['Date']).month
df['Year'] = pd.DatetimeIndex(df['Date']).year
df = df.drop('Date', axis=1)

df.head()

Unnamed: 0,Store,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Year
0,6.0,1572117.54,59.61,3.045,214.777523,6.858,2011.0
1,13.0,1807545.43,42.38,3.435,128.616064,7.47,2011.0
3,11.0,1244390.03,84.57,,214.556497,7.346,
4,6.0,1644470.66,78.89,2.759,212.412888,7.092,2010.0
5,4.0,1857533.7,,2.756,126.160226,7.896,2010.0


In [397]:
#Affichage écart type Weekly sales (en $)
Ec_type = df['Weekly_Sales'].std()
Ec_type

647463.0423486971

In [398]:
# 3 Ecart-type à la moyenne sur la colonne Weekly_sales (target)
for i in df:
    for a in df.loc[:,i]:
        if a > (df[i].std()*3) or a < (df[i].std()*3):
            df.dropna(subset=['Weekly_Sales'])

In [399]:
#Fonction permettant de drop les valeurs supérieures à 3 écarts-types sur les colonnes souhaitées
def ecart_type(dataframe, colonnes):
    for i in dataframe[colonnes]:
        for a in dataframe.loc[:,i].index:
            Moyenne = dataframe[i].mean()
            Ecart_type = dataframe[i].std()
            Ecart_type_min = Moyenne - (Ecart_type*3)
            Ecart_type_max = Moyenne + (Ecart_type*3)

            if dataframe.loc[a,i] < Ecart_type_min or dataframe.loc[a,i] > Ecart_type_max:
                dataframe = dataframe.drop(a)
    return dataframe

In [400]:
#Suppression des valeurs abérrantes
ecart_type(df, ['Temperature','Fuel_Price','CPI','Unemployment','Weekly_Sales'])

Unnamed: 0,Store,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Year
0,6.0,1572117.54,59.61,3.045,214.777523,6.858,2011.0
1,13.0,1807545.43,42.38,3.435,128.616064,7.470,2011.0
3,11.0,1244390.03,84.57,,214.556497,7.346,
4,6.0,1644470.66,78.89,2.759,212.412888,7.092,2010.0
5,4.0,1857533.70,,2.756,126.160226,7.896,2010.0
...,...,...,...,...,...,...,...
145,14.0,2248645.59,72.62,2.780,182.442420,8.899,2010.0
146,7.0,716388.81,20.74,2.778,,,
147,17.0,845252.21,57.14,2.841,126.111903,,2010.0
148,8.0,856796.10,86.05,3.638,219.007525,,2011.0


In [401]:
df.describe()

Unnamed: 0,Store,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Year
count,136.0,136.0,121.0,124.0,125.0,122.0,118.0
mean,10.014706,1249536.0,60.853967,3.316992,178.091144,7.665582,2010.822034
std,6.124614,647463.0,18.514432,0.47954,40.243105,1.619428,0.812628
min,1.0,268929.0,18.79,2.514,126.111903,5.143,2010.0
25%,4.0,605075.7,45.22,2.8385,131.637,6.69,2010.0
50%,10.0,1261424.0,62.25,3.451,196.919506,7.477,2011.0
75%,15.25,1806386.0,75.95,3.724,214.878556,8.15,2011.75
max,20.0,2771397.0,91.65,4.193,226.968844,14.313,2012.0


In [402]:
#Selection de X et Y
features_list = ["Store","Temperature","Fuel_Price","CPI","Unemployment","Year"]
target_variable = "Weekly_Sales"

X = df.loc[:, features_list]
Y = df.loc[:,target_variable]

In [403]:
X.head()

Unnamed: 0,Store,Temperature,Fuel_Price,CPI,Unemployment,Year
0,6.0,59.61,3.045,214.777523,6.858,2011.0
1,13.0,42.38,3.435,128.616064,7.47,2011.0
3,11.0,84.57,,214.556497,7.346,
4,6.0,78.89,2.759,212.412888,7.092,2010.0
5,4.0,,2.756,126.160226,7.896,2010.0


In [404]:
#Features numériques et catégorielles
categorical_features = ['Store', "Year"]
numeric_features = df.select_dtypes(exclude='object').drop(categorical_features, axis = 1).columns.drop('Weekly_Sales').tolist()

#Definition des indices numériques et categoriels pour les intégrer au preprocessor
numeric_indices = [1,2,3,4] 
categorical_indices = [0,5]

In [405]:
df.head()

Unnamed: 0,Store,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment,Year
0,6.0,1572117.54,59.61,3.045,214.777523,6.858,2011.0
1,13.0,1807545.43,42.38,3.435,128.616064,7.47,2011.0
3,11.0,1244390.03,84.57,,214.556497,7.346,
4,6.0,1644470.66,78.89,2.759,212.412888,7.092,2010.0
5,4.0,1857533.7,,2.756,126.160226,7.896,2010.0


In [406]:
numeric_features, categorical_features

(['Temperature', 'Fuel_Price', 'CPI', 'Unemployment'], ['Store', 'Year'])

In [407]:
# Division du test set avec le train test split -> 30% de données pour le test et random test à 0

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=0)

In [408]:
#Conversion du dataframe en array numpy

X_train = X_train.values
X_test = X_test.values
Y_train = Y_train.tolist()
Y_test = Y_test.tolist()

In [409]:
# Creation du pipeline pour la feature categorielle
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # missing values will be replaced by most frequent value
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

In [410]:
# Creation du pipeline pour la feature numérique
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler())
    ])

In [411]:
# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_indices),
        ('cat', categorical_transformer, categorical_indices)
    ])

In [412]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [413]:
X_train

<95x25 sparse matrix of type '<class 'numpy.float64'>'
	with 518 stored elements in Compressed Sparse Row format>

In [414]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train, Y_train)

LinearRegression()

In [415]:
#r2 score
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

print("R2 score sur le train set est à",r2_score(Y_train, Y_train_pred))
print("R2 score sur le test set est à",r2_score(Y_test, Y_test_pred))

R2 score sur le train set est à 0.96214656286074
R2 score sur le test set est à 0.9495582387375263


In [416]:
from sklearn.model_selection import cross_val_score

#Cross validation score

model = LinearRegression() 
model.fit(X_train, Y_train)

score_train = cross_val_score(model,X_train, Y_train, cv=7)
score_test = cross_val_score(model,X_test, Y_test, cv=7)
#F1 score
print("cross validated r2-score on train set : {} ".format(score_train.mean()))
print("cross validated r2-score on test set : {} ".format(score_test.mean()))

cross validated r2-score on train set : 0.8360519364643997 
cross validated r2-score on test set : 0.5337824103795062 


In [417]:
# Perform Ridge Regressor

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

#Ridge Regressor
regressor = Ridge(alpha=0.5)
regressor.fit(X_train, Y_train)

r2_scores_train = regressor.score(X_train, Y_train)
r2_scores_test = regressor.score(X_test, Y_test)

print('Ridge -> The R2 score on train set is : ', r2_scores_train)
print('Ridge -> The R2 score on test set is : ', r2_scores_test)

#----------------------------------------------------------------------------------------------------

#Lasso Regressor
regressor = Lasso(alpha=0.5)
regressor.fit(X_train, Y_train)

r2_scores_train = regressor.score(X_train, Y_train)
r2_scores_test = regressor.score(X_test, Y_test)

print('Lasso -> The R2 score on train set is : ', r2_scores_train)
print('Lasso -> The R2 score on test set is : ', r2_scores_test)

Ridge -> The R2 score on train set is :  0.9463881223176103
Ridge -> The R2 score on test set is :  0.9271634262962751
Lasso -> The R2 score on train set is :  0.9621465613343273
Lasso -> The R2 score on test set is :  0.9495539264387325


In [418]:
# Cross validation with Ridge regularization
print("3-fold cross-validation...")
regressor = Ridge(alpha=0.5)
scores_train = cross_val_score(regressor, X_train, Y_train, cv=7)
scores_test = cross_val_score(regressor, X_test, Y_test, cv=7)

print('The cross-validated train R2-score is : ', scores_train.mean())
print('The cross-validated test R2-score is : ', scores_test.mean())

3-fold cross-validation...
The cross-validated train R2-score is :  0.838408508095316
The cross-validated test R2-score is :  0.4840493886190142


In [419]:
# Cross validation with Lasso regularization x Grid search
print("Grid search...")
regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.1, 0.2, 0.5, 1 ,100] # Au delà de 1 : le score ne s'améliore pas
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print()

grid_fit = gridsearch.fit(X_train, Y_train)
print('Train score for the best model : ', grid_fit.best_estimator_.score(X_train,Y_train))
print('Test score for the best model : ', grid_fit.best_estimator_.score(X_test,Y_test))

Grid search...
...Done.
Best hyperparameters :  {'alpha': 0.1}

Train score for the best model :  0.9608987902147126
Test score for the best model :  0.9464666791582315


In [420]:
# Cross validation with Lasso regularization x Grid search
print("Grid search...")
regressor = Lasso()
# Grid of values to be tested
params = {
    'alpha': [10**(-a) for a in range(10)] # Au delà de 1 : le score ne s'améliore pas
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print()

grid_fit = gridsearch.fit(X_train, Y_train)
print('Train score for the best model : ', grid_fit.best_estimator_.score(X_train,Y_train))
print('Test score for the best model : ', grid_fit.best_estimator_.score(X_test,Y_test))

Grid search...


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


...Done.
Best hyperparameters :  {'alpha': 1}



  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


Train score for the best model :  0.962146556755579
Test score for the best model :  0.949549607593257


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
