In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, Lasso
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# NOTE(review): google.colab is presumably only available inside a Colab runtime —
# this cell will need to be skipped or adapted when running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import dataset
# The CSV location lives in a single constant so that running the notebook from another
# environment (e.g. a local Drive sync folder) only requires editing this one line.
DATASET_PATH = "/content/drive/MyDrive/Fichiers/2.Scolarité/1. Jedha_Data_Science/CERTIF_PROJECTS/04_SUPERVISED_ML/Walmart/Src/Walmart_Store_sales.csv"

print("Loading dataset...")
dataset = pd.read_csv(DATASET_PATH)
print("...Done.")
dataset.head()

Loading dataset...
...Done.


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


In [None]:
# Basic stats: structure, size, a preview, summary statistics and missing-value rates.
print("general info : ")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" to the output.
dataset.info()
print()

print("Number of rows : {}".format(dataset.shape[0]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
# include='all' also summarises the non-numeric 'Date' column (unique / top / freq).
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

general info : 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         150 non-null    float64
 1   Date          132 non-null    object 
 2   Weekly_Sales  136 non-null    float64
 3   Holiday_Flag  138 non-null    float64
 4   Temperature   132 non-null    float64
 5   Fuel_Price    136 non-null    float64
 6   CPI           138 non-null    float64
 7   Unemployment  135 non-null    float64
dtypes: float64(7), object(1)
memory usage: 9.5+ KB


None


Number of rows : 150

Display of dataset: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092



Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,19-10-2012,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15



Percentage of missing values: 


Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dtype: float64

In [None]:
# Drop rows where weekly sales is missing
mask_1 = ~dataset['Weekly_Sales'].isnull()
# .copy() makes the result an independent frame, so adding columns to it afterwards
# does not trigger pandas' SettingWithCopyWarning.
dataset = dataset.loc[mask_1, :].copy()
print(dataset.shape[0])

136


In [None]:
# Derive Year / Month / Day / Week_day features from the raw 'Date' strings, then drop 'Date'.
# The dates are written day-first (e.g. "18-02-2011"), so the format is given explicitly
# to rule out any month/day ambiguity. The column is parsed once instead of four times;
# missing dates become NaT and therefore NaN in the derived columns.
parsed_dates = pd.to_datetime(dataset['Date'], format="%d-%m-%Y")

# assign() returns a new frame, which avoids writing into a slice of the original data.
dataset = dataset.assign(
    Year=parsed_dates.dt.year,
    Month=parsed_dates.dt.month,
    Day=parsed_dates.dt.day,
    Week_day=parsed_dates.dt.dayofweek,
).drop(columns='Date')











In [None]:
# Preview the first rows of the frame, then report how many rows it holds.
display(dataset.head())
print(len(dataset))

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Week_day
0,6.0,1572117.54,,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
3,11.0,1244390.03,0.0,84.57,,214.556497,7.346,,,,
4,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
5,4.0,1857533.7,0.0,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


136


In [None]:
# Percentage of missing values per column.
display(dataset.isnull().sum().mul(100).div(dataset.shape[0]))

Store            0.000000
Weekly_Sales     0.000000
Holiday_Flag     8.088235
Temperature     11.029412
Fuel_Price       8.823529
CPI              8.088235
Unemployment    10.294118
Year            13.235294
Month           13.235294
Day             13.235294
Week_day        13.235294
dtype: float64

In [None]:
def drop_outliers(dataset, columns, n_std=3):
    """Return the rows of ``dataset`` that are not outliers in any of ``columns``.

    A value is an outlier when it lies more than ``n_std`` standard deviations away
    from its column mean. Mean and standard deviation are computed per column on the
    full input frame, and a row is kept only if it passes the check for every column.

    Missing values are not treated as outliers: a NaN in one of ``columns`` does not
    cause the row to be dropped, so it can still be imputed downstream.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; it is not modified.
    columns : iterable of str
        Names of the numeric columns to screen. If empty, all rows are returned.
    n_std : float, default 3
        Half-width of the accepted interval, in standard deviations.

    Returns
    -------
    pandas.DataFrame
        The retained rows, with their original index labels.
    """
    # Positional boolean mask, so the logic does not depend on index labels being unique.
    keep = np.ones(len(dataset), dtype=bool)

    for col in columns:
        mean = dataset[col].mean()
        std = dataset[col].std()
        lower_bound = mean - n_std * std
        upper_bound = mean + n_std * std

        # between() is False for NaN, hence the explicit isna() to retain missing values.
        in_range_or_missing = dataset[col].between(lower_bound, upper_bound) | dataset[col].isna()

        # Accumulate across columns; filtering the original frame afresh on each
        # iteration would keep only the last column's filter.
        keep &= in_range_or_missing.to_numpy()

    return dataset[keep]

In [None]:
# Continuous columns screened for outliers.
outlier_columns = [
    "Temperature",
    "Fuel_Price",
    "CPI",
    "Unemployment",
]

# A copy is passed so the unfiltered `dataset` remains available afterwards.
dataset_filtered = drop_outliers(dataset=dataset.copy(), columns=outlier_columns)

In [None]:
# Preview the outlier-filtered frame, then report how many rows remain.
display(dataset_filtered.head())
print(len(dataset_filtered))

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Week_day
0,6.0,1572117.54,,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0,4.0
1,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0,4.0
3,11.0,1244390.03,0.0,84.57,,214.556497,7.346,,,,
4,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0,4.0
5,4.0,1857533.7,0.0,,2.756,126.160226,7.896,2010.0,5.0,28.0,4.0


117


### Remove rows with missing date values. We create a new dataset here, and will later test without
### this pre-processing step to compare performance.

In [None]:
# Count rows with / without a parsed date, then keep only those that have one.
year_is_missing = dataset_filtered["Year"].isnull()
print(year_is_missing.value_counts())

dataset_time_notnull = dataset_filtered.loc[~year_is_missing]
print(len(dataset_time_notnull))

Year
False    102
True      15
Name: count, dtype: int64
102


### Model_1 Training

In [None]:
# Separate the target column from the explanatory variables.
# NOTE(review): the saved output of this cell reports 117 rows, whereas
# dataset_time_notnull was reported with 102 rows when it was built — the saved
# outputs look stale; re-run the notebook top to bottom to confirm.
target_variable = "Weekly_Sales"

Y = dataset_time_notnull[target_variable]
X = dataset_time_notnull.drop(columns=[target_variable])

print(X.shape)
print(Y.shape)

(117, 10)
(117,)


In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Columns handled as categories; every other column of X is treated as numeric.
categorical_features = ['Store','Holiday_Flag']

# Built by iterating over X.columns so the numeric features keep the frame's column order.
# A set difference would yield an arbitrary order that can change between runs, and the
# coefficient-to-name mapping built later relies on this order.
numeric_features = [col for col in X.columns if col not in categorical_features]


print(numeric_features)
print(categorical_features)

['CPI', 'Unemployment', 'Year', 'Week_day', 'Fuel_Price', 'Temperature', 'Month', 'Day']
['Store', 'Holiday_Flag']


In [None]:
# Numeric columns: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler()),
    ]
)

# Categorical columns: fill missing values with the most frequent category, then one-hot
# encode (dropping the first level of each feature).
# handle_unknown="ignore" makes a category that was absent from the training split
# (e.g. a store that only appears in the test split) encode as all zeros instead of
# raising an error at transform time.
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)


# Route each group of columns to its transformer; the output places the numeric block
# first and the one-hot block second.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# NOTE: X_train / X_test are overwritten with the transformed arrays below, so the
# train/test split cell has to be re-run before this cell is executed a second time.
print("Performing preprocessings on train set...")
X_train = preprocessor.fit_transform(X_train)  # statistics are learned on the training split only
print('...Done.')
print(X_train[0:5])
print()

print("Performing preprocessings on test set...")
X_test = preprocessor.transform(X_test)  # reuse the training statistics, no refit
print('...Done.')
print(X_test[0:5,:])
print()


# Turn the target Series into (n_samples, 1) column arrays.
print("Reshaping target")
Y_train = Y_train.values.reshape(-1,1)
print("...Done")
print(Y_train[0:5])
print()
Y_test = Y_test.values.reshape(-1,1)
print("...Done")
print(Y_test[0:5])

Performing preprocessings on train set...
...Done.
[[-1.17687782 -1.27967381  1.44500204  0.          0.39879691 -0.46334062
  -1.03735861  0.08336632  0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   1.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 1.07335276 -1.08588404  0.15283675  0.         -0.11433439  0.02216649
   1.67069333 -0.55213106  0.          0.          0.          1.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [ 1.10339719  0.16682846  0.15283675  0.          0.26166699  1.17413706
   0.99368035  0.97306267  0.          1.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          0.          0.        ]
 [-1.27008423  1

In [None]:
# Baseline model: ordinary least squares on the preprocessed training data.
print("Train model...")
model_1 = LinearRegression().fit(X_train, Y_train)  # fit() returns the fitted estimator
print("...Done.")

Train model...
...Done.


In [None]:
# Predict on both splits with the baseline model and print the raw predictions.
print("Predictions on training set...")
Y_train_pred = model_1.predict(X_train)
print("...Done.", Y_train_pred, "", sep="\n")


print("Predictions on test set...")
Y_test_pred = model_1.predict(X_test)
print("...Done.", Y_test_pred, "", sep="\n")

Predictions on training set...
...Done.
[[1977316.35145768]
 [ 422250.21594885]
 [ 317478.96252554]
 [1849782.50363257]
 [1943580.17373052]
 [ 829483.28781078]
 [ 141838.64798291]
 [ 527622.99383183]
 [1592018.83805379]
 [1605834.81632934]
 [ 590152.82071129]
 [2003671.35843041]
 [1569607.94      ]
 [1912843.886687  ]
 [ 923075.12512625]
 [ 575370.6267858 ]
 [2290825.98274591]
 [1597363.68241054]
 [1008375.70231887]
 [ 589454.42861373]
 [ 233242.57755454]
 [1555191.70621771]
 [2116717.8412783 ]
 [ 617984.91130859]
 [ 384919.39543256]
 [1973178.88331851]
 [2064178.90318471]
 [1861174.52156373]
 [ 814706.46523404]
 [1587908.86843596]
 [1525780.65844486]
 [1981670.53144971]
 [1144788.055055  ]
 [ 669001.33946909]
 [2099819.78547879]
 [ 222577.38803094]
 [1591093.05936514]
 [ 692234.08492174]
 [1902078.9901286 ]
 [2199157.16890371]
 [ 477589.45616818]
 [1658093.47561686]
 [ 973467.32896063]
 [2056743.44632983]
 [2395677.15039951]
 [ 565407.8632142 ]
 [ 296176.31172495]
 [ 990968.76225245]


### Model_1 Performance assessment

In [None]:
# R2 on both splits (via the estimator's own score method) and test-set MSE.
train_r2 = model_1.score(X_train, Y_train)
test_r2 = model_1.score(X_test, Y_test)
print("R2 score on training set : ", train_r2)
print("R2 score on test set : ", test_r2)

mse = mean_squared_error(Y_test, Y_test_pred)
print("Mean Squared Error:", mse)

R2 score on training set :  0.978630906684949
R2 score on test set :  0.9237370956190581
Mean Squared Error: 30922755090.512615


### Extracting coefficients and plotting them

In [None]:
# Fitted linear-model coefficients, one per column of the preprocessed design matrix.
# They apply to the transformed features (standardised numerics, one-hot categories),
# not to the raw columns.
coefficients = model_1.coef_
print("Coefficients:", coefficients)

Coefficients: [[  110310.0846669    -79933.61399182   -17975.17944661        0.
    -24207.79163501   -40919.99445791    41156.0555793    -47801.7243941
    200790.09767936 -1257441.75348087   676126.74598893 -1406501.00111791
     35929.77802619  -932966.48136393  -832899.52994894 -1300516.47667715
    524300.6635185   -145994.52511388   538910.28033891   633197.75963187
   -707981.69321626 -1134732.05005485  -650071.53920031  -236959.89010727
     55709.03807892   309706.47830534   -59786.6316401 ]]


In [None]:
# Check: the number of coefficients should match the number of columns in X_train.
print(X_train.shape, coefficients.shape, sep="\n")

(93, 27)
(1, 27)


In [None]:
# Recover the one-hot column names from the fitted encoder inside the 'cat' pipeline.
fitted_encoder = preprocessor.named_transformers_['cat'].named_steps['encoder']
encoded_feature_names = fitted_encoder.get_feature_names_out(categorical_features)

# Numeric names first, then the encoded ones — the same order in which the
# ColumnTransformer lists its 'num' and 'cat' transformers.
final_feature_names = [*numeric_features, *encoded_feature_names]

In [None]:
# Check: the number of feature names should equal the number of model coefficients,
# otherwise the names cannot be paired with the coefficients.
print(len(final_feature_names))
final_feature_names

27


['CPI',
 'Unemployment',
 'Year',
 'Week_day',
 'Fuel_Price',
 'Temperature',
 'Month',
 'Day',
 'Store_2.0',
 'Store_3.0',
 'Store_4.0',
 'Store_5.0',
 'Store_6.0',
 'Store_7.0',
 'Store_8.0',
 'Store_9.0',
 'Store_10.0',
 'Store_11.0',
 'Store_13.0',
 'Store_14.0',
 'Store_15.0',
 'Store_16.0',
 'Store_17.0',
 'Store_18.0',
 'Store_19.0',
 'Store_20.0',
 'Holiday_Flag_1.0']

In [None]:
# Pair every feature name with its coefficient; flatten() collapses the 2-D
# coefficient array into the 1-D column a DataFrame expects.
feature_coef = pd.DataFrame(
    {
        'feature': final_feature_names,
        'coefficient': coefficients.flatten(),
    }
)
feature_coef.head()

Unnamed: 0,feature,coefficient
0,CPI,110310.084667
1,Unemployment,-79933.613992
2,Year,-17975.179447
3,Week_day,0.0
4,Fuel_Price,-24207.791635


In [None]:
# Bar chart of the linear-model coefficients, one bar per (transformed) feature.
fig = px.bar(feature_coef, x="feature", y="coefficient", title='Linear regression coefficients by feature')
# The x axis carries the feature names and the y axis the coefficient values.
fig.update_xaxes(title_text='Feature')
fig.update_yaxes(title_text='Coefficient')
fig.show()

### Model_2 using GridSearchCV, trying both Lasso and Ridge regularizations

In [None]:
#Pipeline
# X_train is already preprocessed at this point, so each pipeline only wraps the estimator;
# the step names ('ridge' / 'lasso') are what the parameter grids below refer to.
ridge_pipeline = Pipeline([
    ('ridge', Ridge())
])

# max_iter is raised well above scikit-learn's default (1000) because the recorded run
# emitted repeated "Objective did not converge" warnings for Lasso.
lasso_pipeline = Pipeline([
    ('lasso', Lasso(max_iter=100_000))
])

#Parameters
# Regularisation strengths to try, on a log scale.
ridge_params = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}
lasso_params = {
    'lasso__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
}

#Training models
# 3-fold cross-validation on the training split; the score is the estimator's default (R2).
ridge_gridsearch = GridSearchCV(estimator = ridge_pipeline,
                                param_grid = ridge_params,
                                cv = 3)
ridge_gridsearch.fit(X_train, Y_train)

lasso_gridsearch = GridSearchCV(estimator = lasso_pipeline,
                                param_grid = lasso_params,
                                cv = 3)
lasso_gridsearch.fit(X_train, Y_train)

print("...Done.")

print("Best parameters for Ridge:", ridge_gridsearch.best_params_)
print("Best score for Ridge:", ridge_gridsearch.best_score_)

print("Best parameters for Lasso:", lasso_gridsearch.best_params_)
print("Best score for Lasso:", lasso_gridsearch.best_score_)

...Done.
Best parameters for Ridge: {'ridge__alpha': 0.1}
Best score for Ridge: 0.9409223600030688
Best parameters for Lasso: {'lasso__alpha': 100}
Best score for Lasso: 0.9315283251889289



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.532e+11, tolerance: 2.585e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.532e+11, tolerance: 2.585e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.532e+11, tolerance: 2.585e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.533e+11, tolerance: 2.585e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.533e+11, tolerance: 2.585e+09


Obje