### Import des bibliothéques
---

In [75]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

### Chargement des données
---

In [76]:
file_path = "data/walmart_clean.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day
0,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0
1,11.0,1244390.03,0.0,84.57,,214.556497,7.346,,,
2,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0
3,4.0,1857533.7,0.0,,2.756,126.160226,7.896,2010.0,5.0,28.0
4,15.0,695396.19,0.0,69.8,4.069,134.855161,7.658,2011.0,6.0,3.0


### Modéle Baseline
---

In [77]:
x = df.drop("Weekly_Sales", axis=1)
y = df["Weekly_Sales"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

numerical_columns = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day']
categorial_columns = ['Store', 'Holiday_Flag']

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorial_transformer = Pipeline (steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorial_transformer, categorial_columns)
    ])

x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test) 

lr = LinearRegression()
lr.fit(x_train, y_train)

print("... Résultat Baseline LR ...")
print("R2 score on training set : ", lr.score(x_train, y_train))
print("R2 score on test set : ", lr.score(x_test, y_test))
# le modéle est bon avec 0.07 d'écart, petit overfiting.


... Résultat Baseline LR ...
R2 score on training set :  0.9746932192952995
R2 score on test set :  0.9088872007844017


In [78]:
# Validation Croisée
scores = cross_val_score(lr, x_train, y_train, cv=5, scoring='r2')

print(f"Scores de validation croisée : {scores}")
print(f"Score moyen : {scores.mean():.4f}")
print(f"Écart-type : {scores.std():.4f}")
# moy de 0.92 avec un std de 0.028, le modele est fiable.

Scores de validation croisée : [0.94312053 0.94007541 0.95186728 0.94600645 0.95313232]
Score moyen : 0.9468
Écart-type : 0.0050


### Sauvegarde des scores
---

In [79]:
scores_df = pd.DataFrame(columns = ["model", "Train R2", "Test R2"])

new_rows = [
    {"model": "LR baseline", "Train R2":lr.score(x_train, y_train), "Test R2":lr.score(x_test, y_test)}]
scores_df = pd.concat([scores_df, pd.DataFrame(new_rows)], ignore_index=True)
scores_df.to_csv("data/walmart_scores.csv", mode="a", header=False, index=False)


The behavior of DataFrame concatenation with empty or all-NA entries is deprecated. In a future version, this will no longer exclude empty or all-NA columns when determining the result dtypes. To retain the old behavior, exclude the relevant entries before the concat operation.



### Calcul des coefficients
---

In [80]:
lr.coef_ # Chaque valeur indique :Combien la variable correspondante influence la prédiction y. positif → augmente y, négatif → diminue y.

# Coefficient par importance d'influence sur la prédiction de notre Target
nom_col_num = numerical_columns  # colonnes numériques
nom_col_cat = preprocessor.named_transformers_['cat'].get_feature_names_out()
nom_col_cat_num = np.concatenate([nom_col_num, nom_col_cat])

# création df
df_coef=pd.DataFrame({'Variable' : nom_col_cat_num, 'Coefficient' : lr.coef_})

# trie par valeur absolue pour meilleur comprehension des resultats
df_coef['Importance']=df_coef['Coefficient'].abs()
df_coef=df_coef.sort_values(by='Importance', ascending=False)

print(df_coef[['Variable', 'Coefficient', 'Importance']])

# Visualisation de l'importance des coefficients
fig = px.bar(
    df_coef,
    x="Importance",      
    y="Variable",        
    orientation="h",    
    title="Importance des variables sur la prédiction Y"
)
fig.show()

            Variable   Coefficient    Importance
10         Store_5.0 -1.314973e+06  1.314973e+06
14         Store_9.0 -1.143830e+06  1.143830e+06
8          Store_3.0 -1.139241e+06  1.139241e+06
21        Store_16.0 -1.079697e+06  1.079697e+06
12         Store_7.0 -9.316206e+05  9.316206e+05
13         Store_8.0 -7.486968e+05  7.486968e+05
19        Store_14.0  7.060123e+05  7.060123e+05
9          Store_4.0  6.880060e+05  6.880060e+05
15        Store_10.0  6.638032e+05  6.638032e+05
18        Store_13.0  6.475392e+05  6.475392e+05
20        Store_15.0 -6.460261e+05  6.460261e+05
22        Store_17.0 -6.270290e+05  6.270290e+05
25        Store_20.0  4.678283e+05  4.678283e+05
7          Store_2.0  4.631910e+05  4.631910e+05
23        Store_18.0 -1.595118e+05  1.595118e+05
3       Unemployment -1.117674e+05  1.117674e+05
17        Store_12.0  1.112810e+05  1.112810e+05
24        Store_19.0  1.106182e+05  1.106182e+05
2                CPI  8.212983e+04  8.212983e+04
26  Holiday_Flag_1.0

### Modèles linéaires de régression régularisé Ridge et Lasso
---

In [85]:
# Ridge 
ridge = Ridge()
ridge.fit(x_train, y_train)
print('... Résultat Ridge ...')
print('R2 score on training set :', ridge.score(x_train,y_train))
print('R2 score on test set : ', ridge.score(x_test, y_test))

# Lasso
lasso = Lasso()
lasso.fit(x_train, y_train)
print('\n... Résultat Lasso ...')
print('R2 score on training set :', lasso.score(x_train,y_train))
print('R2 score on test set : ', lasso.score(x_test, y_test))

... Résultat Ridge ...
R2 score on training set : 0.9320324794465517
R2 score on test set :  0.8893821336942374

... Résultat Lasso ...
R2 score on training set : 0.9727736514638491
R2 score on test set :  0.9502052554701538


### Méthode de Validation Croisée GridSearch
---

In [86]:
# pour Ridge
params = {'alpha': [0.0, 0.1, 1.0, 10]}
grid_ridge = GridSearchCV(Ridge(), param_grid=params, cv=5)
grid_ridge.fit(x_train, y_train)

print("... Resultat GridSearch Ridge ...")
print("Best hyperparameters:", grid_ridge.best_params_)
print("Best R2 score (CV):", grid_ridge.best_score_)
print("R2 score on training set:", grid_ridge.score(x_train, y_train))
print("R2 score on test set:", grid_ridge.score(x_test, y_test))

# pour Lasso
params = {'alpha': [0.01, 0.1, 1.0, 10]}  # éviter 0 pour Lasso
grid_lasso = GridSearchCV(Lasso(), param_grid=params, cv=5)
grid_lasso.fit(x_train, y_train)

print("\n... Resultat GridSearch Lasso ...")
print("Best hyperparameters:", grid_lasso.best_params_)
print("Best R2 score (CV):", grid_lasso.best_score_)
print("R2 score on training set:", grid_lasso.score(x_train, y_train))
print("R2 score on test set:", grid_lasso.score(x_test, y_test))

... Resultat GridSearch Ridge ...
Best hyperparameters: {'alpha': 0.1}
Best R2 score (CV): 0.9257798700404318
R2 score on training set: 0.9712686112789481
R2 score on test set: 0.9479054770776458

... Resultat GridSearch Lasso ...
Best hyperparameters: {'alpha': 10}
Best R2 score (CV): 0.9226230094660192
R2 score on training set: 0.9727731312257096
R2 score on test set: 0.9501558183707313


### Sauvegarde des résultats Ridge - Lasso + visualisation
---

In [83]:
# Sauvegarde dans scores_df
new_rows = [
    {"model": "Ridge", "Train R2": ridge.score(x_train, y_train), "Test R2": ridge.score(x_test, y_test)},
    {"model": "Lasso", "Train R2": lasso.score(x_train, y_train), "Test R2": lasso.score(x_test, y_test)},
    {"model": "grid_ridge", "Train R2": grid_ridge.score(x_train, y_train), "Test R2": grid_ridge.score(x_test, y_test)},
    {"model": "grid_lasso", "Train R2": grid_lasso.score(x_train, y_train), "Test R2": grid_lasso.score(x_test, y_test)}
]
scores_df = pd.concat([scores_df, pd.DataFrame(new_rows)], ignore_index=True)

scores_df.to_csv("data/walmart_scores.csv", mode="a", header=False, index=False)
scores_df.head()

fig = px.bar(
    scores_df,
    x='model',
    y=['Train R2', 'Test R2'],
    title='Comparaison des Résultats scores R2',
    barmode='group',  # Barres groupées
    text_auto=True  # Affiche les valeurs sur les barres
)
fig.show()
# les performances des modéles restent trés proche. La baseline de départ avait déjà de trés bon résultat

### Bonus - Features Engineering

In [84]:
# pour améliorer la performance du modéle, on stratify la variable qui influe le plus sur la Target.
# Comme cela la repartition entre le train et le test est respecté sur cette variable

x = df.drop("Weekly_Sales", axis=1)
y = df["Weekly_Sales"]
stratify_column = df['Store']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=stratify_column)
# stratify = stratify_column)

numerical_columns = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day']
categorial_columns = ['Store', 'Holiday_Flag']

numeric_transformer = Pipeline(steps=[
     ('imputer', SimpleImputer(strategy='median')),
     ('scaler', StandardScaler())
 ])

categorial_transformer = Pipeline (steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorial_transformer, categorial_columns)
    ])

x_train = preprocessor.fit_transform(x_train)
x_test = preprocessor.transform(x_test) 

lr = LinearRegression()
lr.fit(x_train, y_train)

print("... Résultat LR Store ...")
print("R2 score on training set : ", lr.score(x_train, y_train))
print("R2 score on test set : ", lr.score(x_test, y_test))

... Résultat LR Store ...
R2 score on training set :  0.9727736567222601
R2 score on test set :  0.9502104106533493
