# Part 1 : EDA and data preprocessing

## 1) Import libraries
---

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error

import plotly.express as px
import plotly.graph_objects as go

## 2) Import files
---

In [2]:
#import
data= pd.read_csv('../data/Walmart/Walmart_Store_sales.csv')
data.head()


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


## 3) EDA
---

In [3]:
# Basic stats
print(f"Number of rows : {data.shape[0]}")
print(f"Number of columns : {data.shape[1]}")

print("Basics statistics: ")
display(data.describe(include='all'))

print("Percentage of missing values: ")
print(data.isna() .sum() / data.shape[0])

Number of rows : 150
Number of columns : 8
Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,19-10-2012,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15


Percentage of missing values: 
Store           0.000000
Date            0.120000
Weekly_Sales    0.093333
Holiday_Flag    0.080000
Temperature     0.120000
Fuel_Price      0.093333
CPI             0.080000
Unemployment    0.100000
dtype: float64


In [4]:
# Create a clean dataset

# Drop lines where target values are missing : Weekly_Sales
df = data.dropna(subset=['Weekly_Sales'])


#Create usable features from the *Date* column :
df['Date'] = pd.to_datetime(df["Date"], format='%d-%m-%Y')
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day
df['DayOfWeek'] = df['Date'].dt.day_of_week
df = df.drop('Date', axis=1)


#Drop lines containing outliers :
def drop_outlier(dataset, col):
    upper_outlier = dataset[col].mean() + 3 * dataset[col].std()
    lower_outlier = dataset[col].mean() - 3 * dataset[col].std()
    outlier_condition = (dataset[col] > upper_outlier) | (dataset[col] < lower_outlier)
    dataset = dataset[~outlier_condition]

    return dataset

Columns_to_clean = ['Temperature','Fuel_Price','CPI','Unemployment']
for col in Columns_to_clean:
    df = drop_outlier(df,col)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Date'] = pd.to_datetime(df["Date"], format='%d-%m-%Y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = df['Date'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Month'] = df['Date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

In [5]:
# Distribution of each numeric variable
num_features = ['Temperature','Fuel_Price','CPI','Unemployment']
for i in range(len(num_features)):
    fig = px.histogram(df[num_features[i]])
    fig.show()

In [6]:
#Barplot of each qualitative variable

# Score 
df_store = pd.DataFrame(df['Store'].value_counts()).reset_index()
df_store.columns = ['Store', 'Count']
fig = px.bar(df_store, x='Store', y='Count', title='Bar Plot of Store')
fig.show()


# Holiday_Flag
df_holiday= pd.DataFrame(df['Holiday_Flag'].value_counts()).reset_index()
df_holiday.columns = ['Holiday_Flag', 'Count']
fig = px.bar(df_holiday, x='Holiday_Flag', y='Count', title='Bar Plot of Holiday Flag')
fig.show()

In [7]:
# Correlation matrix
corr_matrix = df.corr().round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())


fig.show()

## 4) Preprocessing
---

In [8]:
#Separate target from other explanatory variable :
target_variable = "Weekly_Sales"
X = df.drop(target_variable , axis = 1)
y = df[target_variable]

my_features_list = X.columns.tolist() #for later


#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#Numeric/categorical columns
categorical_features = ['Store', 'Holiday_Flag']
numeric_features = [feature for feature in my_features_list if feature not in categorical_features]


#Transformer
numerical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

#Preprocess
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Part 2 : Baseline model (linear regression)
---

In [9]:
scoring_df = pd.DataFrame()

def log_model_score(df, model_name, R2_train, R2_test):
    rows = pd.DataFrame({
        'model_name': [model_name, model_name],
        'series': ['Train', 'Test'],
        'r2_score': [R2_train, R2_test]
    })
    return pd.concat([df, rows], ignore_index=True)


In [22]:
# Train model
regressor = LinearRegression()
regressor.fit(X_train, y_train)


# Predictions
y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

# Print and save R2 scores
R2_train = r2_score(y_train, y_train_pred)
R2_test = r2_score(y_test, y_test_pred)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Linear Regression"
scoring_df = log_model_score(scoring_df,model_name,R2_train, R2_test)


R2 score on training set :  0.9771347825598194
R2 score on test set :  0.890889978226036


**==> We are in overfitting with this model**

In [24]:
# Create a pandas DataFrame
column_names = []
for name, pipeline, features_list in preprocessor.transformers_:
    if name == 'num':
        features = features_list
    else:
        features = pipeline.named_steps['encoder'].get_feature_names_out()
    column_names.extend(features)
        
coefs = pd.DataFrame(index = column_names, data = regressor.coef_.transpose(), columns=["coefficients"])
feature_importance = abs(coefs).sort_values(by = 'coefficients')


# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(showlegend = False, 
                  margin = {'l': 120} # to avoid cropping of column names
                 )
fig.show()

**==> All x0_XX   are the stores references. We can show that stores are very important**

# Part 3 : Fight overfitting
---

In [25]:
# Ridge
regressor = Ridge()
regressor.fit(X_train, y_train)

# Print and save R2 scores
R2_train = regressor.score(X_train, y_train)
R2_test = regressor.score(X_test, y_test)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Ridge"
scoring_df = log_model_score(scoring_df,model_name,R2_train, R2_test)


R2 score on training set :  0.9412462646173458
R2 score on test set :  0.8643355320423586


In [26]:
regressor = Lasso()
regressor.fit(X_train, y_train)

# Print and save R2 scores
R2_train = regressor.score(X_train, y_train)
R2_test = regressor.score(X_test, y_test)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Lasso"
scoring_df = log_model_score(scoring_df,model_name,R2_train, R2_test)

R2 score on training set :  0.9771347799674606
R2 score on test set :  0.8909039991430255


**==> Standard Ridge and Lasso have the sames resultats that the linear Regression in this case**

# Bonus question
---

In [27]:
# Ridge with gridsearch

regressor = Ridge()
params = {
    'alpha': [0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100] # 0 corresponds to no regularization
}
gridsearch_ridge = GridSearchCV(regressor, param_grid=params, cv=3) # cv : the number of folds to be used for CV
gridsearch_ridge.fit(X_train, y_train)

print("Best hyperparameters : ", gridsearch_ridge.best_params_)
print("Best R2 score : ", gridsearch_ridge.best_score_)

# Print and save R2 scores
R2_train = gridsearch_ridge.score(X_train, y_train)
R2_test = gridsearch_ridge.score(X_test, y_test)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Ridge with gridsearch"
scoring_df = log_model_score(scoring_df,model_name,R2_train, R2_test)

Best hyperparameters :  {'alpha': 0.1}
Best R2 score :  0.8793825420490723
R2 score on training set :  0.9757656348662551
R2 score on test set :  0.8953392486092068


In [28]:
# Lasso with gridsearch

regressor = Lasso()
params = {
    'alpha': [100, 500, 1000, 1500, 2000] # 0 corresponds to no regularization
}
gridsearch_lasso = GridSearchCV(regressor, param_grid=params, cv=3) # cv : the number of folds to be used for CV
gridsearch_lasso.fit(X_train, y_train)

print("Best hyperparameters : ", gridsearch_lasso.best_params_)
print("Best R2 score : ", gridsearch_lasso.best_score_)

# Print and save R2 scores
R2_train = gridsearch_lasso.score(X_train, y_train)
R2_test = gridsearch_lasso.score(X_test, y_test)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Lasso with gridsearch"
scoring_df = log_model_score(scoring_df,model_name,R2_train, R2_test)

Best hyperparameters :  {'alpha': 1500}
Best R2 score :  0.8797758093295597
R2 score on training set :  0.9737493802361675
R2 score on test set :  0.9032987627937343


In [29]:
#show the feature coefficients from Ridge and Lasso
data_dict = {
    'Feature': column_names,
    'Best_Ridge': gridsearch_ridge.best_estimator_.coef_,
    'Best_Lasso': gridsearch_lasso.best_estimator_.coef_,
            }

coefficients_ridge_lasso = pd.DataFrame(data=data_dict)

fig = px.line(coefficients_ridge_lasso, x = 'Feature', y = ['Best_Ridge', 'Best_Lasso'])
fig.show()


**==> We will now focus on the store feature, with lasso**

In [32]:
#Separate target from other explanatory variable :
target_variable = "Weekly_Sales"
X = df[['Store']]
y = df[target_variable]


#Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#Numeric/categorical columns
categorical_features = ['Store']


#Transformers
categorical_transformer = Pipeline(steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features)
    ])

#Preprocess
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

#train model
regressor = Lasso()
params = {
    'alpha': [100, 500, 1000, 1500, 2000] # 0 corresponds to no regularization
}
gridsearch_lasso = GridSearchCV(regressor, param_grid=params, cv=3) # cv : the number of folds to be used for CV
gridsearch_lasso.fit(X_train, y_train)

print("Best hyperparameters : ", gridsearch_lasso.best_params_)
print("Best R2 score : ", gridsearch_lasso.best_score_)

# Print and save R2 scores
R2_train = gridsearch_lasso.score(X_train, y_train)
R2_test = gridsearch_lasso.score(X_test, y_test)
print("R2 score on training set : ", R2_train)
print("R2 score on test set : ", R2_test)

model_name = "Lasso with gridsearch_only_store"
scoring_df = log_model_score(scoring_df,model_name,R2_train, R2_test)


Best hyperparameters :  {'alpha': 1500}
Best R2 score :  0.874397774309478
R2 score on training set :  0.9547564479145718
R2 score on test set :  0.9301269254224589


# Conclusion
---

In [33]:
# Création du graphique en ligne
fig = px.line(scoring_df, x='model_name', y='r2_score', color='series',
              markers=True, title="Scores R² pour chaque modèle")
fig.show()


**The best score is for lasso using only the store feature. This is the best R2 score, and the overfitting is limited. \
Using coeff visualization helps for choosing the more importants features**