PROJET WALMART SALES - MACHINE LEARNING SUPERVISE

1- Import des librairies usuelles

In [72]:
# Import Librairies

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, GridSearchCV

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio





2- Exploration du dataset

In [73]:
# File reading and basic exploration
# Import dataset

print("Loading dataset...")
dataset = pd.read_csv("Walmart_Store_sales.csv")
print("...Done.")
print()


Loading dataset...
...Done.



In [74]:
dataset.shape

(150, 8)

In [75]:
# Basic stats

print("Number of rows : {}".format(dataset.shape[0]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
print(data_desc)

print("Percentage of missing values: ")
display(100 * dataset.isnull().sum() / dataset.shape[0])



Number of rows : 150

Display of dataset: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092



Basics statistics: 
             Store        Date  Weekly_Sales  Holiday_Flag  Temperature  \
count   150.000000         132  1.360000e+02    138.000000   132.000000   
unique         NaN          85           NaN           NaN          NaN   
top            NaN  19-10-2012           NaN           NaN          NaN   
freq           NaN           4           NaN           NaN          NaN   
mean      9.866667         NaN  1.249536e+06      0.079710    61.398106   
std       6.231191         NaN  6.474630e+05      0.271831    18.378901   
min       1.000000         NaN  2.689290e+05      0.000000    18.790000   
25%       4.000000         NaN  6.050757e+05      0.000000    45.587500   
50%       9.000000         NaN  1.261424e+06      0.000000    62.985000   
75%      15.750000         NaN  1.806386e+06      0.000000    76.345000   
max      20.000000         NaN  2.771397e+06      1.000000    91.650000   

        Fuel_Price         CPI  Unemployment  
count   136.000000  138.000000 

Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dtype: float64

In [76]:
#!pip install nbformat


In [77]:
# Univariate analysis
# Distribution of each numeric variable

numeric_features = ["Temperature","Fuel_Price","CPI","Unemployment"]
for feature in numeric_features:
    fig = px.histogram(dataset, x=feature, title=f'Distribution of {feature}')
    fig.update_layout(width=600, height=400)  # Modifiez les valeurs de width et height selon vos besoins
    fig.show()


In [78]:
# Univariate analysis
# Barplot of each qualitative variable

cat_features = ['Store', 'Holiday_Flag']
for feature in cat_features:
    fig = px.bar(dataset, x=feature, title=f'Bar Chart for {feature}')
    fig.update_layout(width=600, height=800)
    fig.show()

In [79]:
# Correlation matrix

corr_matrix = dataset.corr(numeric_only=True).round(2)

import plotly.figure_factory as ff

fig = ff.create_annotated_heatmap(corr_matrix.values,
                                  x = corr_matrix.columns.tolist(),
                                  y = corr_matrix.index.tolist())


fig.show()

In [80]:
# Visualize pairwise dependencies

fig = px.scatter_matrix(dataset)
fig.update_layout(
        title = go.layout.Title(text = "Bivariate analysis", x = 0.5), showlegend = False, 
            autosize=False, height=800, width = 800)
fig.show()

3- Preprocessings usuels

In [81]:
# Preprocessing
# Transformations on column Date

dataset['Date']=pd.to_datetime(dataset['Date'])
dataset=dataset.dropna(subset=["Date"])
dataset['year']=dataset['Date'].dt.year.astype(int)
dataset['month']=dataset['Date'].dt.month.astype(int)
dataset['day']=dataset['Date'].dt.day.astype(int)
dataset['day_of_week']=dataset['Date'].dt.dayofweek.astype(int)

dataset.head()






Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,day,day_of_week
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011,3,25,4
2,17.0,2012-07-27,,0.0,,,130.719581,5.936,2012,7,27,4
4,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
5,4.0,2010-05-28,1857533.7,0.0,,2.756,126.160226,7.896,2010,5,28,4


In [82]:
# Preprocessing
# Drop useless rows/columns with too many missing values / outliers

print('Dropping outliers in Temperature ...')
low =  dataset['Temperature'].mean() - 3 *  dataset['Temperature'].std()
high =  dataset['Temperature'].mean() + 3 *  dataset['Temperature'].std()
print('The number of outliers in {} is {}'.format('Temperature', dataset.loc[(dataset['Temperature'] <= low) | (dataset['Temperature'] >= high)].shape[0]))


print('Dropping outliers in Fuel_price ...')
low =  dataset['Fuel_Price'].mean() - 3 *  dataset['Fuel_Price'].std()
high =  dataset['Fuel_Price'].mean() + 3 *  dataset['Fuel_Price'].std()
print('The number of outliers in {} is {}'.format('Fuel_Price', dataset.loc[(dataset['Fuel_Price'] <= low) | (dataset['Fuel_Price'] >= high)].shape[0]))


print('Dropping outliers in CPI ...')
low =  dataset['CPI'].mean() - 3 *  dataset['CPI'].std()
high =  dataset['CPI'].mean() + 3 *  dataset['CPI'].std()
print('The number of outliers in {} is {}'.format('CPI', dataset.loc[(dataset['CPI'] <= low) | (dataset['CPI'] >= high)].shape[0]))



print('Dropping outliers in Unemployment ...')
low =  dataset['Unemployment'].mean() - 3 *  dataset['Unemployment'].std()
high =  dataset['Unemployment'].mean() + 3 *  dataset['Unemployment'].std()
print('The number of outliers in {} is {}'.format('Unemployment', dataset.loc[(dataset['Unemployment'] <= low) | (dataset['Unemployment'] >= high)].shape[0]))


low =  dataset['Unemployment'].mean() - 3 *  dataset['Unemployment'].std()
high =  dataset['Unemployment'].mean() + 3 *  dataset['Unemployment'].std()
dataset=dataset.drop(dataset.loc[(dataset['Unemployment'] <= low) | (dataset['Unemployment'] >= high)].index)

dataset=dataset.dropna(subset=['Weekly_Sales'], axis=0)

dataset.shape

Dropping outliers in Temperature ...
The number of outliers in Temperature is 0
Dropping outliers in Fuel_price ...
The number of outliers in Fuel_Price is 0
Dropping outliers in CPI ...
The number of outliers in CPI is 0
Dropping outliers in Unemployment ...
The number of outliers in Unemployment is 5


(113, 12)

In [83]:
# Preprocessing
# Separate target variable Y from features X

print("Separating labels from features...")

features_list = ["Store","Holiday_Flag","Temperature","Fuel_Price","CPI","Unemployment","year","month","day","day_of_week"]
target_variable = "Weekly_Sales"

X = dataset.loc[:, features_list]
Y = dataset.loc[:, target_variable]

print("...Done.")
print()

print("Y : ")
print(Y.head())
print()
print("X :")
print(X.head())

Separating labels from features...
...Done.

Y : 
0    1572117.54
1    1807545.43
4    1644470.66
5    1857533.70
6     695396.19
Name: Weekly_Sales, dtype: float64

X :
   Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
0    6.0           NaN        59.61       3.045  214.777523         6.858   
1   13.0           0.0        42.38       3.435  128.616064         7.470   
4    6.0           0.0        78.89       2.759  212.412888         7.092   
5    4.0           0.0          NaN       2.756  126.160226         7.896   
6   15.0           0.0        69.80       4.069  134.855161         7.658   

   year  month  day  day_of_week  
0  2011      2   18            4  
1  2011      3   25            4  
4  2010      5   28            4  
5  2010      5   28            4  
6  2011      6    3            4  


In [84]:
# Preprocessings
# Divide dataset into train set & test set 

print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [85]:
# Preprocessings
# Create a pipeline for numeric features

numeric_features = ["Temperature","Fuel_Price","CPI","Unemployment","year"]
categorical_features = ["Store", "Holiday_Flag","day_of_week","month","day"]

numeric_transformer=Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="mean"),
        ),
        ("scaler", StandardScaler()),
    ]
)

In [86]:
# Preprocessings
# Create pipeline for categorical features

categorical_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="most_frequent"),
        ),  
        (
            "encoder",
            OneHotEncoder(drop="first"),
        ), 
    ]
)

In [87]:
#Preprocessings
#Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [88]:
# Preprocessings
# Preprocessings on train set

print("Performing preprocessings on train set...")
print(X_train.head())
X_train = preprocessor.fit_transform(X_train)
print("...Done.")
print(
    X_train[0:5]
)  
print()

# Preprocessings on test set

print("Performing preprocessings on test set...")
print(X_test.head())
X_test = preprocessor.transform(
    X_test
) 
print("...Done.")
print(
    X_test[0:5, :]
) 
print()


Performing preprocessings on train set...
     Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
52     9.0           0.0        85.02       2.653  214.896576           NaN   
11    18.0           0.0        52.02       2.878  132.763355         9.331   
105   19.0           0.0        72.83       2.932  132.598387         8.099   
110   20.0           1.0        28.85       3.179  204.643227         7.484   
75    20.0           0.0        75.17       2.808  204.567546         7.856   

     year  month  day  day_of_week  
52   2010      6   25            4  
11   2010     10   15            4  
105  2010      7   30            4  
110  2010     12   31            4  
75   2010      6   25            4  
...Done.
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 41 stored elements and shape (5, 64)>
  Coords	Values
  (0, 0)	1.528249450013191
  (0, 1)	-1.3674775902898229
  (0, 2)	0.9862814513028467
  (0, 3)	-9.673304082775928e-16
  (0, 4)	-1.05558715

4- Application des modèles de régression linéaire et calcul des scores

In [89]:
# Train model
regressor = LinearRegression()

print("Training model...")
regressor.fit(X_train, Y_train)  
print("...Done.")

Training model...
...Done.


In [90]:
# Predictions on training set - Linear Regression
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[ 499886.60536534 1021985.92081206 1367011.21863225 1799737.08093709
 2004472.77092902 1912292.59407358 1957069.29649522 1514974.19594555
  526525.36990022 2632039.60514146 1425041.11554731 2077305.24865859
 1827403.26606365 1970490.16747835 1286479.13867348 1817815.13746072
  558957.9946203  1252013.68379285 1470651.26851328  837135.14052372
 2066294.26191568  443794.39931915 1993283.5123964   435420.52952271
 1515468.83246095 1779149.42304828 1968178.36796802 1893054.7111785
 2027502.37908677  726732.08578845  574526.44595539 1121077.04777855
  316759.40105877  485096.07757842 1522560.54086154 2172186.91338296
 2686575.65101647  402834.22091556 1550459.32766036 1639454.1446898
  599640.39052211  310250.34027316  458577.60600206 1614959.85729437
  795257.76942217  320683.75608817 2028479.39130327 2436248.66869999
 1757242.56506164  379992.90965034  983580.19977169 1450370.88537942
 2511257.09275413  275140.61792286 1399142.05501875  981156.34249

In [91]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[ 286877.50539641 1541143.30429539 1647840.81703009  869189.16386402
  395831.47821841 1553458.33583019 2082429.19992605 2516701.21943118
 2130721.52650489 1604509.82502457 1004176.74487056 2002819.91993942
 1161439.65577517  678917.36765659  492426.96989577  111559.41667081
  451866.03575473  277861.06864715 1804070.94646167  483030.2031397
 1838120.62198231  648782.63770831 2062830.79247493]



In [92]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9971436155769904
R2 score on test set :  0.9553406434802537


In [93]:
# Model coefficients

regressor.coef_

array([ 3.92787013e+04, -4.67186621e+04, -4.86166734e+04, -5.49635899e+04,
        3.23502061e+04,  3.18909253e+05, -1.21001072e+06,  3.14962133e+05,
       -1.31957952e+06,  2.30214741e+04, -9.53082296e+05, -6.95876864e+05,
       -1.05253628e+06,  2.77967986e+05, -4.10635646e+04,  2.83130896e+05,
        5.64264137e+05, -9.83453142e+05, -1.04674738e+06, -8.46134257e+05,
       -4.24741386e+05, -1.62080925e+05,  5.03866642e+05,  8.69658965e+04,
        1.19009087e+05,  9.57634230e+04,  7.48550723e+04,  1.48534142e+05,
        1.20115724e+05,  6.64863401e+04,  8.17069169e+04,  1.45868280e+05,
        9.52031592e+04,  1.38392626e+05,  7.24211958e+05,  1.07120895e+05,
        1.20492567e+05, -1.36495715e+04,  1.07042127e+05,  3.39033046e+04,
       -3.76632075e+04, -7.60861857e+03, -9.50339519e+04, -5.37317590e+04,
       -2.13338148e+02,  2.08086932e+04, -2.17897331e+04, -1.14268138e+05,
       -6.06554025e+04,  4.60048127e+03, -1.83286645e+04,  8.37173679e+02,
       -3.58694085e+04, -

In [94]:
# Model coefficients

column_names = []
for name, pipeline, features_list in preprocessor.transformers_:
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = pipeline.named_steps['encoder'].get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names
        
print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'year', 'x0_2.0', 'x0_3.0', 'x0_4.0', 'x0_5.0', 'x0_6.0', 'x0_7.0', 'x0_8.0', 'x0_9.0', 'x0_10.0', 'x0_11.0', 'x0_13.0', 'x0_14.0', 'x0_15.0', 'x0_16.0', 'x0_17.0', 'x0_18.0', 'x0_19.0', 'x0_20.0', 'x1_1.0', 'x3_2.0', 'x3_3.0', 'x3_4.0', 'x3_5.0', 'x3_6.0', 'x3_7.0', 'x3_8.0', 'x3_9.0', 'x3_10.0', 'x3_11.0', 'x3_12.0', 'x4_2.0', 'x4_3.0', 'x4_4.0', 'x4_5.0', 'x4_6.0', 'x4_7.0', 'x4_8.0', 'x4_9.0', 'x4_10.0', 'x4_11.0', 'x4_12.0', 'x4_13.0', 'x4_14.0', 'x4_15.0', 'x4_16.0', 'x4_17.0', 'x4_18.0', 'x4_19.0', 'x4_20.0', 'x4_22.0', 'x4_23.0', 'x4_24.0', 'x4_25.0', 'x4_26.0', 'x4_27.0', 'x4_28.0', 'x4_29.0', 'x4_30.0', 'x4_31.0']


In [95]:
# Create a pandas DataFrame for coefficients

coefs = pd.DataFrame(index = column_names, data = regressor.coef_.transpose(), columns=["coefficients"])
coefs

Unnamed: 0,coefficients
Temperature,39278.701336
Fuel_Price,-46718.662137
CPI,-48616.673391
Unemployment,-54963.589922
year,32350.206052
...,...
x4_27.0,-93234.241370
x4_28.0,-86434.077469
x4_29.0,9368.746788
x4_30.0,-66206.479846


In [96]:
# Compute abs() and sort values for coefficients
feature_importance = abs(coefs).sort_values(by = 'coefficients')
feature_importance

Unnamed: 0,coefficients
x4_11.0,2.133381e+02
x4_18.0,8.371737e+02
x4_16.0,4.600481e+03
x4_8.0,7.608619e+03
x4_29.0,9.368747e+03
...,...
x0_15.0,9.834531e+05
x0_16.0,1.046747e+06
x0_9.0,1.052536e+06
x0_3.0,1.210011e+06


In [97]:
# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(showlegend = False, 
                  margin = {'l': 120} # to avoid cropping of column names
                 )
fig.show()

In [98]:
# Perform 3-fold cross-validation to evaluate the generalized R2 score obtained with a Ridge model
print("3-fold cross-validation...")
regressor = Ridge()
scores = cross_val_score(regressor, X_train, Y_train, cv=3)
print('The cross-validated R2-score is : ', scores.mean())
print('The standard deviation is : ', scores.std())

3-fold cross-validation...
The cross-validated R2-score is :  0.6426981028318849
The standard deviation is :  0.10151484967179152


In [99]:
# Perform grid search
print("Grid search...")
regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.03, 1, 5, 100] 
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
gridsearch.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 0.03}
Best R2 score :  0.846639700235912


In [100]:
# Print R^2 scores
print("R2 score on training set : ", gridsearch.score(X_train, Y_train))
print("R2 score on test set : ", gridsearch.score(X_test, Y_test))

R2 score on training set :  0.9967110523618387
R2 score on test set :  0.9666354826449067


In [103]:
# Perform grid search - Lasso
print("Grid search...")
regressor = Lasso(max_iter=10000)
# Grid of values to be tested
params = {
    'alpha': [1, 2, 3, 5, 10, 20, 600]
}
best_lasso = GridSearchCV(regressor, param_grid = params, cv = 5) # cv : the number of folds to be used for CV
best_lasso.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", best_lasso.best_params_)
print("Best R2 score : ", best_lasso.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 600}
Best R2 score :  0.9274836417491301


In [104]:
# Print R^2 scores
print("R2 score on training set : ", gridsearch.score(X_train, Y_train))
print("R2 score on test set : ", gridsearch.score(X_test, Y_test))

R2 score on training set :  0.9967110523618387
R2 score on test set :  0.9666354826449067
