# 1. Lecture CSV

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

import plotly.express as px

In [2]:
df = pd.read_csv("../datas/Walmart_Store_sales_clean.csv")
df.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day
0,6,1572117.54,0,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0
1,13,1807545.43,0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0
2,11,1244390.03,0,84.57,,214.556497,7.346,,,
3,6,1644470.66,0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0
4,4,1857533.7,0,,2.756,126.160226,7.896,2010.0,5.0,28.0


---

# 2. Baseline model

Pour notre baseline, on va garder l'ensemble des données du dataset

In [3]:
target = "Weekly_Sales"

x = df.drop(target, axis=1)
y = df[target]

print(x.head)
print(y)

<bound method NDFrame.head of      Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
0        6             0        59.61       3.045  214.777523         6.858   
1       13             0        42.38       3.435  128.616064         7.470   
2       11             0        84.57         NaN  214.556497         7.346   
3        6             0        78.89       2.759  212.412888         7.092   
4        4             0          NaN       2.756  126.160226         7.896   
..     ...           ...          ...         ...         ...           ...   
126     14             0        72.62       2.780  182.442420         8.899   
127      7             0        20.74       2.778         NaN           NaN   
128     17             0        57.14       2.841  126.111903           NaN   
129      8             0        86.05       3.638  219.007525           NaN   
130     19             0        55.20       4.170  137.923067         8.150   

       Year  Month   

On sépare les données en 2 : 1 set d'entrainement (80% des données), 1 set de test (20% des données)

In [4]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

Préprocessing :
- Imputer : Imputation des valeurs manquantes par la moyenne
- Standardisation / Mise à l'échelle des données
- Les variables catégorielles sont encodées en variables numériques entre 0 et 1

In [5]:
numerical_columns = ["Temperature", "Fuel_Price", "CPI", "Unemployment", "Year", "Month", "Day"]
categorical_columns = ["Store", "Holiday_Flag"]

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(
    steps=[
    ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore"))
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_columns),
        ("cat", categorical_transformer, categorical_columns)
    ])

x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test) 

Modèle de régression linéaire

In [6]:
lr = LinearRegression()
lr.fit(x_train, y_train)

# Dans un modèle de régression linéaire, on utilise le score R2. Ce score analyse dans quelle mesure les prédictions sont proches des valeurs réelles.
print("R2 score training :", lr.score(x_train, y_train))
print("R2 score test :",  lr.score(x_test, y_test))

R2 score training : 0.9771347825598194
R2 score test : 0.890889978226036


Le score R2 est plus élevé sur les données d'entraînement (0.97) que sur les données de test (0.89), ce qui suggère que le modèle a surappris (overfitting) les données d'entraînement.

Validation croisée

In [7]:
scores = cross_val_score(lr, x_train, y_train, cv = 10)

print("The cross-validated R2-score is : ", scores.mean())
print("The standard deviation is : ", scores.std())

The cross-validated R2-score is :  0.9343470731723833
The standard deviation is :  0.045857269928014845


La faible variance (0.04) des scores de validation croisée indique que les performances du modèle sont assez stables entre les différentes partitions des données.

---

Sauvegarde des résultats dans un Dataframe

In [8]:
scores_df = pd.DataFrame(columns = ["model", "R2"])
new_rows = [{"model": "baseline", "R2": lr.score(x_test, y_test)}]
scores_df = pd.concat([scores_df, pd.DataFrame(new_rows)], ignore_index=True)
scores_df.to_csv("../datas/Walmart_Scores.csv", mode="w", index=False)

  scores_df = pd.concat([scores_df, pd.DataFrame(new_rows)], ignore_index=True)


---

# 3. Analyse des coefficients

Nous pouvons utiliser les coefficients de régression pour estimer l'importance de chaque colonne pour la prédiction.

Les coefficients de régression quantifient l'impact de chaque prédicteur sur la variable cible.

In [9]:
column_names = []
for name, pipeline, features_list in preprocessor.transformers_: # loop over pipelines
    if name == "num": # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = pipeline.named_steps["encoder"].get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names
        
# Create a pandas DataFrame
coefs = pd.DataFrame(index = column_names, data = lr.coef_.transpose(), columns=["coefficients"])
coefs

Unnamed: 0,coefficients
Temperature,-60993.5
Fuel_Price,-15278.68
CPI,5777.049
Unemployment,-65854.96
Year,-39125.25
Month,70364.79
Day,-13407.37
Store_2,357742.9
Store_3,-1202338.0
Store_4,487598.0


In [10]:
feature_importance = abs(coefs).sort_values(by = "coefficients")
fig = px.bar(feature_importance, orientation = "h", height=1200, color_discrete_sequence=px.colors.qualitative.Pastel)
fig.show()

Conclusion : La variable qui a le plus d'influence sur la valeur cible est le magasin (`Store`).