# 1. Lecture CSV

In [503]:
import pandas as pd

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [504]:
# Load the raw Walmart store sales data; the path is relative to the notebook's working directory
df = pd.read_csv("../datas/Walmart_Store_sales.csv")

In [505]:
# Preview the first rows of the raw dataset
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


Store - Numéro du magasin (store number)

Date - Semaine de vente (the week of sales)

Weekly_Sales - Montant de la semaine de vente pour le magasin (sales for the given store)

Holiday_Flag - 1 si la semaine est une semaine de vacances, 0 sinon (whether the week is a special holiday week 1 – Holiday week 0 – Non-holiday week)

Temperature - Température le jour de la vente (Temperature on the day of sale)

Fuel_Price - Coût du carburant dans la région (Cost of fuel in the region)

CPI – Indice des prix à la consommation en vigueur (Prevailing consumer price index)

Unemployment - Taux de chômage actuel (Prevailing unemployment rate)

# 2. EDA et préprocessing

In [506]:
# Overview of the raw dataset: size, preview, summary statistics and missing values
n_rows = len(df)
print(f"Nombre de lignes : {n_rows}")
print()

print("Affichage du dataset : ")
display(df.head())
print()

print("Statistiques basiques : ")
data_desc = df.describe(include="all")
display(data_desc)
print()

# Missing values are counted once and reused for the percentage view
missing_counts = df.isna().sum()

print("Valeurs manquantes : ")
display(missing_counts)

print("Pourcentage de valeurs manquantes : ")
display(100 * missing_counts / n_rows)

Nombre de lignes : 150

Affichage du dataset : 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092



Statistiques basiques : 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,19-10-2012,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15



Valeurs manquantes : 


Store            0
Date            18
Weekly_Sales    14
Holiday_Flag    12
Temperature     18
Fuel_Price      14
CPI             12
Unemployment    15
dtype: int64

Pourcentage de valeurs manquantes : 


Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dtype: float64

On observe environ 10% de valeurs manquantes pour chaque colonne, sauf `Store`.

------

### Weekly_Sales

Variable cible (Target variable) => `Weekly_Sales`

In [507]:
# Peek at the first values of the target column
df["Weekly_Sales"].head()

0    1572117.54
1    1807545.43
2           NaN
3    1244390.03
4    1644470.66
Name: Weekly_Sales, dtype: float64

In [508]:
# Count the missing values in the target column
df["Weekly_Sales"].isna().sum()

np.int64(14)

`Weekly_Sales` contient 14 valeurs manquantes : comme il s'agit de la variable cible, je ne peux pas les imputer ; je supprime donc les lignes concernées.

In [509]:
# Rows without a target value cannot be used, so they are removed.
# .copy() makes df an independent frame: the following cells add and overwrite
# columns on it, which would otherwise be done on a filtered slice of the
# original frame (SettingWithCopyWarning).
# The original row labels are kept (no index reset).
df = df.dropna(subset=["Weekly_Sales"]).copy()
df.shape[0]

136

----

### Date

Gestion de la colonne `Date` : je l'explose en 4 données plus exploitables : l'année, le mois, le jour et le jour de la semaine

In [510]:
# Parse the day-month-year strings into real timestamps (missing dates become NaT)
df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")

# New column name -> attribute of the .dt accessor it is derived from.
# The nullable "Int64" dtype keeps rows with a missing date as <NA>.
date_parts = {
    "Year": "year",
    "Month": "month",
    "Day": "day",
    "Day_Of_Week": "dayofweek",
}
for part_name, dt_attribute in date_parts.items():
    df[part_name] = getattr(df["Date"].dt, dt_attribute).astype("Int64")

df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_Of_Week
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
3,11.0,NaT,1244390.03,0.0,84.57,,214.556497,7.346,,,,
4,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
5,4.0,2010-05-28,1857533.7,0.0,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


---

### Type des variables

In [511]:
# Inspect the dtype of each column
df.dtypes

Store                  float64
Date            datetime64[ns]
Weekly_Sales           float64
Holiday_Flag           float64
Temperature            float64
Fuel_Price             float64
CPI                    float64
Unemployment           float64
Year                     Int64
Month                    Int64
Day                      Int64
Day_Of_Week              Int64
dtype: object

`Store`

L'identifiant du magasin sera traité comme une variable qualitative

In [512]:
# Number of rows available per store
df["Store"].value_counts()

Store
3.0     12
18.0    10
13.0     9
1.0      9
14.0     9
7.0      8
5.0      8
19.0     8
2.0      8
17.0     7
6.0      6
4.0      6
8.0      6
10.0     5
12.0     5
20.0     5
16.0     4
15.0     4
9.0      4
11.0     3
Name: count, dtype: int64

In [513]:
# Store ids are read as floats; go through int before str so the labels read "6" rather than "6.0".
# The store id is then handled as a categorical label instead of a number.
df["Store"] = df["Store"].astype(int).astype(str)

`Holiday_Flag`

In [514]:
# The holiday flag only takes the values 0 or 1
df["Holiday_Flag"].value_counts()

Holiday_Flag
0.0    116
1.0      9
Name: count, dtype: int64

Le flag Vacances est une variable qualitative nominale, il n'y a pas d'ordre entre les valeurs

In [515]:
# Some flags are missing: fill them with the most frequent value of the column,
# then store the flag as a boolean
mode_holiday = df["Holiday_Flag"].mode().iloc[0]
holiday_filled = df["Holiday_Flag"].fillna(mode_holiday)
df["Holiday_Flag"] = holiday_filled.astype(bool)

In [516]:
# Check dtypes and non-null counts after the conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 136 entries, 0 to 149
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Store         136 non-null    object        
 1   Date          118 non-null    datetime64[ns]
 2   Weekly_Sales  136 non-null    float64       
 3   Holiday_Flag  136 non-null    bool          
 4   Temperature   121 non-null    float64       
 5   Fuel_Price    124 non-null    float64       
 6   CPI           125 non-null    float64       
 7   Unemployment  122 non-null    float64       
 8   Year          118 non-null    Int64         
 9   Month         118 non-null    Int64         
 10  Day           118 non-null    Int64         
 11  Day_Of_Week   118 non-null    Int64         
dtypes: Int64(4), bool(1), datetime64[ns](1), float64(5), object(1)
memory usage: 13.4+ KB


In [517]:
# Preview the dataframe after the type conversions
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_Of_Week
0,6,2011-02-18,1572117.54,False,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13,2011-03-25,1807545.43,False,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
3,11,NaT,1244390.03,False,84.57,,214.556497,7.346,,,,
4,6,2010-05-28,1644470.66,False,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
5,4,2010-05-28,1857533.7,False,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


---

### Gestion des valeurs extrêmes

Pour les colonnes `Temperature`, `Fuel_Price`, `CPI` et `Unemployment`, on applique la règle des 3 sigmas : une valeur située à plus de 3 écarts-types de la moyenne est considérée comme une valeur aberrante (outlier) potentielle, et la ligne correspondante est supprimée.

In [518]:
col_outliers = ["Temperature", "Fuel_Price", "CPI", "Unemployment"]

# 3-sigma rule, applied one column after the other: the bounds of each column
# are computed on the rows that survived the previous columns' filtering.
for col in col_outliers:
    values = df[col]
    center, spread = values.mean(), values.std()
    lower_bound = center - 3 * spread
    upper_bound = center + 3 * spread

    # Comparisons with NaN are False, so rows with a missing value are kept
    is_outlier = values.gt(upper_bound) | values.lt(lower_bound)
    df = df.loc[~is_outlier]

In [519]:
# Same overview as on the raw data, recomputed once the outlier rows are removed
remaining_rows = len(df)
print(f"Nombre de lignes : {remaining_rows}")
print()

print("Statistiques basiques : ")
data_desc = df.describe(include="all")
display(data_desc)
print()

remaining_missing = df.isna().sum()
print("Valeurs manquantes : ")
display(remaining_missing)

Nombre de lignes : 131

Statistiques basiques : 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_Of_Week
count,131.0,113,131.0,131,117.0,119.0,120.0,117.0,113.0,113.0,113.0,113.0
unique,19.0,,,2,,,,,,,,
top,3.0,,,False,,,,,,,,
freq,12.0,,,123,,,,,,,,
mean,,2011-04-24 21:52:33.982300928,1257990.0,,60.405897,3.302908,180.175755,7.399427,2010.831858,6.274336,16.530973,4.0
min,,2010-02-05 00:00:00,268929.0,,18.79,2.514,126.111903,5.143,2010.0,1.0,1.0,4.0
25%,,2010-07-30 00:00:00,584243.9,,44.82,2.824,132.579257,6.664,2010.0,4.0,10.0,4.0
50%,,2011-04-22 00:00:00,1366396.0,,61.79,3.435,197.655672,7.368,2011.0,6.0,17.0,4.0
75%,,2012-01-13 00:00:00,1809576.0,,75.54,3.7085,214.904838,8.099,2012.0,9.0,24.0,4.0
max,,2012-10-19 00:00:00,2771397.0,,91.65,4.17,226.968844,9.524,2012.0,12.0,31.0,4.0



Valeurs manquantes : 


Store            0
Date            18
Weekly_Sales     0
Holiday_Flag     0
Temperature     14
Fuel_Price      12
CPI             11
Unemployment    14
Year            18
Month           18
Day             18
Day_Of_Week     18
dtype: int64

----

### Distribution des variables

##### Distribution des variables numériques

In [520]:
# One histogram per numeric column
num_features = df.select_dtypes(include=["float", "int"]).columns

for feature in num_features:
    fig = px.histogram(
        df,
        x=feature,
        nbins=30,
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        title=f"Distribution de {feature}",
        title_x=0.5,
        xaxis_title=feature,
        yaxis_title="Nombre d'occurences",
    )
    fig.show()

In [521]:
# Check which weekdays occur in the data
df["Day_Of_Week"].value_counts()

Day_Of_Week
4    113
Name: count, dtype: Int64

Notre dataset contient des données de ventes hebdomadaires, transmises une fois par semaine : la colonne `Day_Of_Week` contient donc toujours le même jour de la semaine et n'est pas pertinente.

In [522]:
# Day_Of_Week takes a single value in this dataset, so it is dropped
df = df.drop("Day_Of_Week", axis=1)

##### Répartition des variables qualitatives

In [523]:
# One bar chart of category frequencies per qualitative column
cat_features = df.select_dtypes(include=["object", "bool"]).columns

for feature in cat_features:
    # Two-column frame: the category value and how many rows carry it
    df_grouped = (
        df[feature]
        .value_counts()
        .rename_axis(feature)
        .reset_index(name="count")
    )

    fig = px.bar(
        df_grouped,
        x=feature,
        y="count",
        color_discrete_sequence=px.colors.qualitative.Pastel,
    )
    fig.update_layout(
        title=f"Répartition de {feature}",
        title_x=0.5,
        xaxis_title=feature,
        xaxis=dict(tickformat="d"),
        yaxis_title="Nombre d'occurences",
    )
    fig.show()

---

### Analyse des données

##### Saisonnalité : Nombre de données mensuelles, par année

In [524]:
# List the distinct years present (rows without a date show up as <NA>)
df["Year"].unique().tolist()

[2011, <NA>, 2010, 2012]

In [525]:
# One histogram of the number of observations per month, for each year in the data.
# Rows without a date are ignored.
years = df["Year"].sort_values().dropna().unique().tolist()

# The grid is sized from the data rather than hard-coded to 3 rows, so the cell
# keeps working if the dataset covers more (or fewer) years.
# make_subplots needs at least one row, hence the max().
fig = make_subplots(rows = max(len(years), 1), cols = 1)

for row, year in enumerate(years, start = 1):
    df_year = df[df["Year"] == year]

    fig.add_trace(go.Histogram(x = df_year["Month"], name = str(year), nbinsx=12), row = row, col = 1)

fig.update_layout(title = go.layout.Title(text = "Données mensuelles par année", x = 0.5), autosize = False, height = 700)

fig.show()

##### Exploration des ventes en fonction du temps

In [526]:
# Average sales over all stores for each date, plus the overall mean of these
# averages repeated on every row (used as a reference line in the next chart)
sales_by_date = df.groupby("Date", as_index=False)["Weekly_Sales"].mean()
df_sales = sales_by_date.assign(Mean_Sales=sales_by_date["Weekly_Sales"].mean())
df_sales.head()

Unnamed: 0,Date,Weekly_Sales,Mean_Sales
0,2010-02-05,461622.22,1255114.0
1,2010-02-12,1318379.42,1255114.0
2,2010-02-19,1392645.145,1255114.0
3,2010-02-26,2095591.63,1255114.0
4,2010-03-12,860336.16,1255114.0


In [527]:
# Line chart of the average sales per date, with the overall mean as a flat reference line
fig = go.Figure()

fig.add_trace(go.Scatter(x=df_sales["Date"], y=df_sales["Weekly_Sales"]))
fig.add_trace(go.Scatter(x=df_sales["Date"], y=df_sales["Mean_Sales"]))

fig.update_layout(
    title=dict(text="Moyenne des ventes par jour", x=0.5),
    # The range slider lets the reader zoom on a period
    xaxis=dict(title="Temps", rangeslider=dict(visible=True)),
    showlegend=False,
)

fig.show()

### Graphique bivarié afin d'analyser la relation de chaque variable avec la cible `Weekly_Sales`

In [528]:
# Pairwise scatter plots of all columns, to eyeball relations with the target
fig = px.scatter_matrix(df)

fig.update_layout(
    title=dict(text="Bivariate analysis", x=0.5),
    showlegend=False,
    autosize=False,
    height=1400,
    width=1400,
)

fig.show()

Visuellement, je ne vois pas de corrélation évidente entre une variable du dataset et la variable cible. Je vais regarder la matrice de corrélation.

### Matrice de corrélation

In [529]:
# Correlation matrix
# Build the input explicitly instead of relying on implicit casts inside corr():
# - Date is left out: it is a datetime column with missing values (NaT) and its
#   information is already carried by Year / Month / Day.
#   errors="ignore" keeps the cell re-runnable once Date has been dropped.
# - Store was turned into string labels earlier; it is cast back to integers so
#   it stays in the matrix. It is a nominal identifier, so its coefficient is
#   only indicative.
corr_input = df.drop(columns=["Date"], errors="ignore").astype({"Store": "int64"})
corr_matrix = corr_input.corr().round(2)

fig = ff.create_annotated_heatmap(corr_matrix.values, x = corr_matrix.columns.tolist(), y = corr_matrix.index.tolist())
fig.show()

À l'aide de la matrice de corrélation, on observe que les coefficients de corrélation les plus élevés avec la variable cible `Weekly_Sales` sont, dans l'ordre, ceux de `CPI`, `Unemployment`, `Temperature`, puis `Store`.

---

L'EDA étant terminé, la variable `Date`, qui a été explosée en 3 colonnes distinctes `Year`, `Month` et `Day`, peut être supprimée

In [530]:
# Drop the raw Date column (the Year, Month and Day columns derived from it are kept) and preview the result
df = df.drop(columns=["Date"])
df.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day
0,6,1572117.54,False,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0
1,13,1807545.43,False,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0
3,11,1244390.03,False,84.57,,214.556497,7.346,,,
4,6,1644470.66,False,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0
5,4,1857533.7,False,,2.756,126.160226,7.896,2010.0,5.0,28.0


---

Sauvegarde du fichier

In [531]:
# Write the cleaned dataset to CSV; index=False leaves the row labels out of the file
df.to_csv("../datas/Walmart_Store_sales_clean.csv", index=False)