In [39]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

import plotly.express as px
import plotly.graph_objects as go

In [40]:
df = pd.read_csv('Walmart_Store_sales.csv')

In [41]:
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


Store - Numéro du magasin (store number)

Date - Semaine de vente (the week of sales)

Weekly_Sales - Montant de la semaine de vente pour le magasin (sales for the given store)

Holiday_Flag - 1 si la semaine est une semaine de vacances, 0 sinon (whether the week is a special holiday week 1 – Holiday week 0 – Non-holiday week)

Temperature - Temperature le jour de la vente (Temperature on the day of sale)

Fuel_Price - Coût du carburant dans la région (Cost of fuel in the region)

CPI – Indice des prix à la consommation en vigueur (Prevailing consumer price index)

Unemployment - Taux de chômage actuel (Prevailing unemployment rate)

# Part 1 : EDA and data preprocessing

In [42]:
# Basic stats
print("Number of rows : {}".format(df.shape[0]))
print()

print("Display of dataset: ")
display(df.head())
print()

print("Basics statistics: ")
data_desc = df.describe(include="all")
display(data_desc)
print()

print("Missing values: ")
display(df.isna().sum())

print("Percentage of missing values: ")
display(100 * df.isnull().sum() / df.shape[0])

Number of rows : 150

Display of dataset: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092



Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,19-10-2012,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15



Missing values: 


Store            0
Date            18
Weekly_Sales    14
Holiday_Flag    12
Temperature     18
Fuel_Price      14
CPI             12
Unemployment    15
dtype: int64

Percentage of missing values: 


Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dtype: float64

On observe environ 10% de valeurs manquantes pour chaque colonne, sauf `Store`.

------

Variable cible (Target variable) => `Weekly_Sales`

In [43]:
df["Weekly_Sales"].head()

0    1572117.54
1    1807545.43
2           NaN
3    1244390.03
4    1644470.66
Name: Weekly_Sales, dtype: float64

In [44]:
df["Weekly_Sales"].isna().sum()

14

`Weekly_Sales` contient 14 valeurs vides : étant donné que c'est notre valeur cible, je ne peux pas remplacer les valeurs manquantes donc je les supprime

In [45]:
mask = (~df["Weekly_Sales"].isna())
df = df[mask]
df.shape[0]

136

----

Gestion de la colonne `Date` : je l'explose en 4 données plus exploitables : l'année, le mois, le jour et le jour de la semaine

In [46]:
df["Date"] = pd.to_datetime(df["Date"], format='%d-%m-%Y')

df["Year"] = df["Date"].dt.year.astype('Int64')
df["Month"] = df["Date"].dt.month.astype('Int64')
df["Day"] = df["Date"].dt.day.astype('Int64')
df["Day_Of_Week"] = df["Date"].dt.dayofweek.astype('Int64')

df = df.drop(columns=["Date"])

df.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_Of_Week
0,6.0,1572117.54,,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
3,11.0,1244390.03,0.0,84.57,,214.556497,7.346,,,,
4,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
5,4.0,1857533.7,0.0,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


---

Type des variables

`Store`

In [47]:
df.dtypes

Store           float64
Weekly_Sales    float64
Holiday_Flag    float64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
Year              Int64
Month             Int64
Day               Int64
Day_Of_Week       Int64
dtype: object

In [48]:
# L'identifiant du magasin sera traité comme une variable catégorielle
df["Store"].value_counts()

Store
3.0     12
18.0    10
13.0     9
14.0     9
1.0      9
7.0      8
19.0     8
5.0      8
2.0      8
17.0     7
6.0      6
8.0      6
4.0      6
20.0     5
12.0     5
10.0     5
15.0     4
16.0     4
9.0      4
11.0     3
Name: count, dtype: int64

In [49]:
df["Store"] = df["Store"].astype(int).astype(str)

`Holiday_Flag`

In [50]:
# Le flag Vacances est 0 ou 1
df["Holiday_Flag"].value_counts()

Holiday_Flag
0.0    116
1.0      9
Name: count, dtype: int64

In [51]:
# J'ai également des valeurs Null, que je vais remplacer par la valeur la plus fréquante dans la colonne
mode_holiday = df["Holiday_Flag"].mode()[0]
df["Holiday_Flag"]=df["Holiday_Flag"].fillna(mode_holiday).astype(int)

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136 entries, 0 to 149
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         136 non-null    object 
 1   Weekly_Sales  136 non-null    float64
 2   Holiday_Flag  136 non-null    int32  
 3   Temperature   121 non-null    float64
 4   Fuel_Price    124 non-null    float64
 5   CPI           125 non-null    float64
 6   Unemployment  122 non-null    float64
 7   Year          118 non-null    Int64  
 8   Month         118 non-null    Int64  
 9   Day           118 non-null    Int64  
 10  Day_Of_Week   118 non-null    Int64  
dtypes: Int64(4), float64(5), int32(1), object(1)
memory usage: 12.8+ KB


In [53]:
df.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_Of_Week
0,6,1572117.54,0,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13,1807545.43,0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
3,11,1244390.03,0,84.57,,214.556497,7.346,,,,
4,6,1644470.66,0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
5,4,1857533.7,0,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


----

Distribution des variables

In [54]:
# Distribution des variables numériques
num_features = df.select_dtypes(include=["float", "int"]).columns
for i in range(len(num_features)):
    fig = px.histogram(df[num_features[i]])
    fig.show()

In [55]:
df["Day_Of_Week"].value_counts()

Day_Of_Week
4    118
Name: count, dtype: Int64

Notre datasat contient des données pour 1 semaine, la colonne Day_Of_Week a donc toujours le même jour de la semaine, elle n'est donc pas pertinante.

In [56]:
df = df.drop("Day_Of_Week", axis=1)

In [57]:
# Distribution des variables qualitatives
cat_features = df.select_dtypes(include="object").columns
for i in range(len(cat_features)):
    fig = px.histogram(df[cat_features[i]])
    fig.show()

Gaphique bivarié de chaque paire de variables afin d'analyser la relation de chaque variable avec la cible `Weekly_Sales`

In [58]:
fig = px.scatter_matrix(df)

fig.update_layout(
        title = go.layout.Title(text = "Bivariate analysis", x = 0.5), showlegend = False, 
            autosize=False, height=1400, width = 1400)

fig.show()

Visuellement, je ne vois pas de corrélation évidente entre une variable du datset et la variable cible.

Matrice de corrélation

In [59]:
# Correlation matrix
corr_matrix = df.corr().round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())

fig.show()

Concernant la variable cible `Weekly_Sales`, l'indice de correlation le plus élevé est avec `CPI` puis `Temperature`, puis `Store`.

# Part 2 : Baseline model

Pour notre baseline, on va garder l'ensemble des données du dataset

In [60]:
target = "Weekly_Sales"

x = df.drop(target, axis=1)
y = df[target]

print(x.head)
print(y)

<bound method NDFrame.head of     Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
0       6             0        59.61       3.045  214.777523         6.858   
1      13             0        42.38       3.435  128.616064         7.470   
3      11             0        84.57         NaN  214.556497         7.346   
4       6             0        78.89       2.759  212.412888         7.092   
5       4             0          NaN       2.756  126.160226         7.896   
..    ...           ...          ...         ...         ...           ...   
145    14             0        72.62       2.780  182.442420         8.899   
146     7             0        20.74       2.778         NaN           NaN   
147    17             0        57.14       2.841  126.111903           NaN   
148     8             0        86.05       3.638  219.007525           NaN   
149    19             0        55.20       4.170  137.923067         8.150   

     Year  Month   Day  
0    201

In [61]:
# Je stratifie sur la variable catégorielle
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=x["Store"])

---

Gestion des valeurs extrêmes pour les colonnes `Temperature`, `Fuel_price`, `CPI` and `Unemployment`. On applique la règle des 3 sigmas : on va détecter les valeurs aberrantes (ou outliers) en considérant que si une valeur se trouve à plus de 3 écarts-types de la moyenne, alors cela peut être un signe qu'elle pourrait être une anomalie.

Cette règle sera appliquée sur x_train puis les mêmes formules seront appliquées sur x_test.

In [62]:
col_outliers = ["Temperature", "Fuel_Price", "CPI", "Unemployment"]

for col in col_outliers:
    # Calcul des outliers pour les colonnes concernées
    valeur_palier_haut = x_train[col].mean() + 3 * x_train[col].std()
    valeur_palier_bas = x_train[col].mean() - 3 * x_train[col].std()

    # Application su x_train, y_train
    outlier_condition_train = (x_train[col] > valeur_palier_haut) | (x_train[col] < valeur_palier_bas)
    x_train = x_train[~outlier_condition_train]
    y_train = y_train[~outlier_condition_train]
    
    # Application su x_test, y_test
    outlier_condition_test = (x_test[col] > valeur_palier_haut) | (x_test[col] < valeur_palier_bas)
    x_test = x_test[~outlier_condition_test]
    y_test = y_test[~outlier_condition_test]

In [63]:
x_train.describe(include="all")

Unnamed: 0,Store,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day
count,104.0,104.0,92.0,92.0,93.0,91.0,91.0,91.0,91.0
unique,19.0,,,,,,,,
top,3.0,,,,,,,,
freq,10.0,,,,,,,,
mean,,0.067308,60.850435,3.32325,180.347693,7.353813,2010.835165,6.571429,15.758242
std,,0.251767,17.414542,0.472322,40.146197,1.00099,0.820078,3.211586,8.146357
min,,0.0,20.74,2.548,126.111903,5.143,2010.0,1.0,1.0
25%,,0.0,47.2525,2.8385,132.598387,6.6495,2010.0,4.0,10.0
50%,,0.0,62.31,3.4745,198.095048,7.343,2011.0,7.0,16.0
75%,,0.0,75.2225,3.731,214.929625,8.09,2012.0,9.0,22.0


In [64]:
x_test.describe(include="all")

Unnamed: 0,Store,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day
count,27.0,27.0,25.0,27.0,27.0,26.0,22.0,22.0,22.0
unique,18.0,,,,,,,,
top,18.0,,,,,,,,
freq,2.0,,,,,,,,
mean,,0.037037,58.77,3.233593,179.583524,7.559077,2010.818182,5.045455,19.727273
std,,0.19245,22.244904,0.48847,38.969155,0.971777,0.852803,2.785631,8.018917
min,,0.0,18.79,2.514,126.128355,5.943,2010.0,2.0,1.0
25%,,0.0,39.69,2.8125,133.868484,6.7125,2010.0,3.0,14.75
50%,,0.0,59.61,3.112,196.919506,7.8005,2011.0,5.0,22.0
75%,,0.0,79.97,3.6885,214.601202,8.292,2011.75,6.0,25.75


In [65]:
print("shapes : ", x_train.shape, x_test.shape)

shapes :  (104, 9) (27, 9)


---

In [66]:
numerical_columns = ["Temperature", "Fuel_Price", "CPI", "Unemployment", "Year", "Month", "Day"]
categorical_columns = ["Store", "Holiday_Flag"]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test) 

In [67]:
lr = LinearRegression()
lr.fit(x_train, y_train)

# Dans un modèle de régression linéaire, on utilise le score R2. Ce score analyse dans quelle mesure les prédictions sont proches des valeurs réelles.
print("R2 score training :", lr.score(x_train, y_train))
print("R2 score test :",  lr.score(x_test, y_test))

R2 score training : 0.9689585431721391
R2 score test : 0.968908358549527


In [68]:
scores = cross_val_score(lr, x_train, y_train, cv = 10)

print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

The cross-validated R2-score is :  0.9231793812477008
The standard deviation is :  0.04765697746707891


La validation croisée montre que les scores peuvent varier de +/- 0.04, l'incertitude est donc limitée.

On observe de bons résultats sur le set d'entrainement et sur le set de test. Je continue mon analyse.

Nous pouvons utiliser les coefficients de régression pour estimer l'importance de chaque colonne pour la prédiction.

Les coefficients de régression quantifient l'impact de chaque prédicteur sur la variable cible.

In [69]:
column_names = []
for name, pipeline, features_list in preprocessor.transformers_: # loop over pipelines
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = pipeline.named_steps['encoder'].get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names
        
# Create a pandas DataFrame
coefs = pd.DataFrame(index = column_names, data = lr.coef_.transpose(), columns=["coefficients"])
coefs

Unnamed: 0,coefficients
Temperature,-34151.76
Fuel_Price,-6421.11
CPI,68461.32
Unemployment,-77671.14
Year,-38542.44
Month,52791.47
Day,-33646.66
Store_10,465049.9
Store_11,-129395.6
Store_13,450419.4


In [70]:
feature_importance = abs(coefs).sort_values(by = 'coefficients')
fig = px.bar(feature_importance, orientation = 'h', height=1200)
fig.show()

Conclusion : La variable qui a le plus d'impact sur la valeur cible est le magasin (`Store`). Certains magasins vendent donc beaucoup plus que d'autres.

# Part 3 : Fight overfitting

In [71]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV

In [72]:
# Je teste plusieurs alpha pour Ridge
regressor = Ridge()
params = {
    'alpha': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100]
    }

best_ridge = GridSearchCV(regressor, param_grid=params, cv=5)
best_ridge.fit(x_train, y_train)
print("Best hyperparameters : ", best_ridge.best_params_)
print("Best R2 score : ", best_ridge.best_score_)

Best hyperparameters :  {'alpha': 0.05}
Best R2 score :  0.9224237491839432


In [73]:
# Idem pour Lasso
regressor = Lasso(max_iter=2000)
params = {
    'alpha': [1, 2, 3, 5, 10, 20, 30, 40, 50, 100]
    }

best_lasso = GridSearchCV(regressor, param_grid=params, cv=5)
best_lasso.fit(x_train, y_train)
print("Best hyperparameters : ", best_lasso.best_params_)
print("Best R2 score : ", best_lasso.best_score_)

Best hyperparameters :  {'alpha': 100}
Best R2 score :  0.9217024633331512


In [74]:
scores_ridge = cross_val_score(best_ridge.best_estimator_, x_train, y_train, cv = 10)
print('RIDGE // The cross-validated R2-score is : ', scores_ridge.mean())
print('RIDGE // The standard deviation is : ', scores_ridge.std())

scores_lasso = cross_val_score(best_lasso.best_estimator_, x_train, y_train, cv = 10)
print('LASSO // The cross-validated R2-score is : ', scores_lasso.mean())
print('LASSO // The standard deviation is : ', scores_lasso.std())

RIDGE // The cross-validated R2-score is :  0.9236425381735108
RIDGE // The standard deviation is :  0.05034248667565541
LASSO // The cross-validated R2-score is :  0.923660370689032
LASSO // The standard deviation is :  0.0483827214478881


In [75]:
# Print R^2 scores
print("RIDGE // R2 score on training set : ", best_ridge.score(x_train, y_train))
print("RIDGE // R2 score on test set : ", best_ridge.score(x_test, y_test))
print("LASSO // R2 score on training set : ", best_lasso.score(x_train, y_train))
print("LASSO // R2 score on test set : ", best_lasso.score(x_test, y_test))

RIDGE // R2 score on training set :  0.9685350804140205
RIDGE // R2 score on test set :  0.970198827577907
LASSO // R2 score on training set :  0.9689307225034322
LASSO // R2 score on test set :  0.9693100291758621


On voit que les 2 techniques pour éviter l'overfitting donnent sensiblement les mêmes résultats. 

In [76]:
# Dans le cas de Lasso, j'analyse les coefficients et je les associe aux colonnes d'origine
coeffs = pd.DataFrame()
coeffs["Feature"] = column_names
coeffs["Best_Lasso"] = best_lasso.best_estimator_.coef_

# Je regarde les coefficients 0 : colonnes supprimées par Lasso
mask = (coeffs["Best_Lasso"] == 0)
lasso_droped_columns = coeffs["Feature"][mask].tolist()
lasso_droped_columns

[]

La seule colonne a utilisé tutes les colonnes

Les résultats restent les mêmes :

Sans overfitting : R2 score training : 0.9689585431721391 R2 score test : 0.968908358549527

Avec overfitting (Ridge) : R2 score training : 0.9685350804140205 R2 score test : 0.970198827577907 (léger underfitting)

Avec overfitting (Lasso) : R2 score training : 0.9689307225034322 R2 score test : 0.9693100291758621

# Conclusion

D’après l'analyse, le facteur dominant dans la prédiction des ventes hebdomadaires est le magasin (`Store`).

Les données montrent que les ventes varient fortement selon les magasins. Actuellement, nous ne disposons que d’un identifiant de magasin sans détails supplémentaires.

Il nous faudrait plus d'informations sur ces magasins pour affiner notre démarche : emplacement, surface, ...

Avec ces informations, nous pourrions estimer le montant des ventes hebdomadaires avec plus de précisions.