In [1]:
import numpy as np
import pandas as pd 
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn import metrics
from sklearn import linear_model
from sklearn.pipeline import Pipeline



In [2]:
dataset = pd.read_csv('Walmart_Store_sales.csv')

In [3]:
dataset.head()


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         150 non-null    float64
 1   Date          132 non-null    object 
 2   Weekly_Sales  136 non-null    float64
 3   Holiday_Flag  138 non-null    float64
 4   Temperature   132 non-null    float64
 5   Fuel_Price    136 non-null    float64
 6   CPI           138 non-null    float64
 7   Unemployment  135 non-null    float64
dtypes: float64(7), object(1)
memory usage: 9.5+ KB


In [5]:
dataset=dataset.drop(["Date","Holiday_Flag"], axis=1) #suppression des colonnes inutiles

In [6]:
dataset = dataset.dropna() #suppression des valeurs Nan

In [7]:
dataset.shape


(94, 6)

In [8]:
z = np.abs(stats.zscore(dataset)) # Nettoyage des outliers avec Z score méthode

In [9]:
data_clean = dataset[(z<3).all(axis=1)]

In [10]:
data_clean.shape

(90, 6)

In [11]:
cols = list(data_clean.columns.values) #Make a list of all of the columns in the df
cols.pop(cols.index('Weekly_Sales'))
cols.pop(cols.index('Store')) 
cols.pop(cols.index('Temperature'))
cols.pop(cols.index('Fuel_Price'))
cols.pop(cols.index('CPI'))
cols.pop(cols.index('Unemployment'))
data_clean = data_clean[cols+['Weekly_Sales','Store','Temperature','Fuel_Price','CPI','Unemployment']] #Create new dataframe with columns in the order you want

In [12]:
data_clean

Unnamed: 0,Weekly_Sales,Store,Temperature,Fuel_Price,CPI,Unemployment
0,1572117.54,6.0,59.61,3.045,214.777523,6.858
1,1807545.43,13.0,42.38,3.435,128.616064,7.470
4,1644470.66,6.0,78.89,2.759,212.412888,7.092
6,695396.19,15.0,69.80,4.069,134.855161,7.658
7,2203523.20,20.0,39.93,3.617,213.023623,6.961
...,...,...,...,...,...,...
139,532739.77,7.0,50.60,3.804,197.588605,8.090
143,396968.80,3.0,78.53,2.705,214.495838,7.343
144,424513.08,3.0,73.44,3.594,226.968844,6.034
145,2248645.59,14.0,72.62,2.780,182.442420,8.899


In [13]:
# Separate target variable Y from features X
target_name = 'Weekly_Sales'

print("Separating labels from features...")
Y = data_clean.loc[:,target_name]
X = data_clean.loc[:,[c for c in data_clean.columns if c!=target_name]] #Toutes les colonnes du df sauf la target Y
print("...Done.")
print(Y.head())
print()
print(X.head())
print()


Separating labels from features...
...Done.
0    1572117.54
1    1807545.43
4    1644470.66
6     695396.19
7    2203523.20
Name: Weekly_Sales, dtype: float64

   Store  Temperature  Fuel_Price         CPI  Unemployment
0    6.0        59.61       3.045  214.777523         6.858
1   13.0        42.38       3.435  128.616064         7.470
4    6.0        78.89       2.759  212.412888         7.092
6   15.0        69.80       4.069  134.855161         7.658
7   20.0        39.93       3.617  213.023623         6.961



In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [15]:

sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test) ### NO fit on X_test
X_train

array([[ 0.59601449, -1.28727936,  0.74688342,  0.27062811,  0.98416506],
       [-1.50915367,  0.78594741, -1.01456877,  0.79358246,  0.33085375],
       [ 0.59601449,  0.66788335, -1.16955981,  0.08180614,  1.48793597],
       [-1.18528165, -0.01535774,  0.45365714,  1.14556894, -0.70320231],
       [ 0.7579505 , -0.70953069,  1.42968178, -1.06280323,  0.47403074],
       [-0.86140962,  0.01689124, -0.08671702,  1.00364601, -1.26848629],
       [-0.05172956, -0.15200596,  1.13854997, -1.23985022,  0.05192376],
       [ 0.43407848,  0.84989878,  0.56047529, -1.27670113, -0.65653722],
       [-0.86140962,  0.92806156,  1.17415601,  0.95170508, -1.0680385 ],
       [-1.34721766,  0.87558864,  0.93748051,  0.91649573,  0.4613039 ],
       [ 1.40569455, -1.4835062 ,  0.94376393, -1.15531615,  0.29161265],
       [ 0.59601449, -1.01890226,  0.60027028,  0.14692232,  1.11673635],
       [ 1.24375854,  0.5110112 ,  0.99822024, -1.11734647,  1.47839083],
       [ 1.56763057, -1.72455366, -0.3

In [16]:
regressor = LinearRegression()
regressor.fit(X_train, Y_train)

LinearRegression()

In [17]:
regressor.score(X_train, Y_train)

0.14813442423116552

In [18]:
regressor.score(X_test, Y_test)

0.06518039676435239

Mon modèle est en surapprentissage
score(train)>>score(test) : the model is overfitting


In [19]:
regressor.coef_

array([  14851.08620487,   -9863.90793364, -117946.87651835,
       -227157.60561974,     649.34446255])

In [20]:
coefs = pd.DataFrame(index=data_clean.columns[1: ], data = regressor.coef_.transpose(), columns=["coefficients"])
coefs

Unnamed: 0,coefficients
Store,14851.086205
Temperature,-9863.907934
Fuel_Price,-117946.876518
CPI,-227157.60562
Unemployment,649.344463


In [21]:
coefs.apply(lambda x : abs(x)).sort_values(by="coefficients", ascending=False)

Unnamed: 0,coefficients
CPI,227157.60562
Fuel_Price,117946.876518
Store,14851.086205
Temperature,9863.907934
Unemployment,649.344463


The most influential column is CPI

In [22]:
from sklearn.feature_selection import f_regression
feature_importance = f_regression(X_train, Y_train)
feature_importance

(array([2.64623589, 0.42332524, 1.28415263, 9.22504907, 1.20176969]),
 array([0.10828907, 0.51741243, 0.26099474, 0.00335283, 0.27672546]))

In [23]:
# Create DataFrame with feature importance
feature_ranking = pd.DataFrame(columns=data_clean.columns[1: ], data=feature_importance, index=["f-score", "p-value"])
# Reshape DataFrame and sort by f-score
feature_ranking = feature_ranking.transpose().sort_values(["f-score", "p-value"], ascending=False)
# Create column with feature names
feature_ranking = feature_ranking.reset_index().rename(columns = {'index': 'X'})
feature_ranking

Unnamed: 0,X,f-score,p-value
0,CPI,9.225049,0.003353
1,Store,2.646236,0.108289
2,Fuel_Price,1.284153,0.260995
3,Unemployment,1.20177,0.276725
4,Temperature,0.423325,0.517412


The most influential column is CPI


Rappels définitions : 
    f-score : Le score F ou mesure F est une mesure de l’exactitude d’un test. Il est calculé à partir de la précision et du rappel du test, où la précision est le nombre de vrais résultats positifs divisé par le nombre de tous les résultats positifs, y compris ceux qui n’ont pas été identifiés correctement, et le rappel est le nombre de vrais résultats positifs divisé par le nombre de tous les échantillons qui auraient dû être identifiés comme positifs.
    p-value : La p-valeur est utilisée pour quantifier la significativité statistique d'un résultat dans le cadre d'une hypothèse nulle. L'idée générale est de déterminer si l'hypothèse nulle est ou n'est pas vérifiée car dans le cas où elle le serait, le résultat observé serait fortement improbable. Il s'agit à ce titre d'une extension du principe de preuve par l'absurde.

LASSO (le lasso est une méthode de contraction des coefficients de la régression)


In [24]:
lin = LinearRegression()
lasso1 = Lasso(alpha = 1)
lasso001 = Lasso(alpha = 0.01)
lasso00001 = Lasso(alpha = 0.0001)

In [25]:
lin.fit(X_train, Y_train)
lasso1.fit(X_train, Y_train)
lasso001.fit(X_train, Y_train)
lasso00001.fit(X_train, Y_train)

Lasso(alpha=0.0001)

In [26]:
baseline_pred = [Y_test.values.mean()]*len(Y_test)
print("Train scores \n")
print("Score Baseline : {} \nScore Lin : {} \nScore Lasso1 : {} \nScore Lasso001 : {} \nScore Lasso00001 : {}".format(r2_score(Y_test, baseline_pred),
lin.score(X_train, Y_train),
lasso1.score(X_train, Y_train),
lasso001.score(X_train, Y_train),
lasso00001.score(X_train,Y_train)))

Train scores 

Score Baseline : 0.0 
Score Lin : 0.14813442423116552 
Score Lasso1 : 0.14813442422540712 
Score Lasso001 : 0.1481344242311644 
Score Lasso00001 : 0.1481344242311653


In [27]:
baseline_pred = [Y_test.values.mean()]*len(Y_test)
print("Test scores \n")
print("Score Baseline : {} \nScore Lin : {} \nScore Lasso1 : {} \nScore Lasso001 : {} \nScore Lasso00001 : {}".format(r2_score(Y_test, baseline_pred),
lin.score(X_test, Y_test),
lasso1.score(X_test, Y_test),
lasso001.score(X_test, Y_test),
lasso00001.score(X_test,Y_test)))

Test scores 

Score Baseline : 0.0 
Score Lin : 0.06518039676435239 
Score Lasso1 : 0.06518185947776534 
Score Lasso001 : 0.06518040761152577 
Score Lasso00001 : 0.0651803968703688


In [28]:
print("Train MSE \n")

print("MSE Baseline : {} \nMSE Lin : {} \nMSE Lasso1 : {} \nMSE Lasso001 : {} \nMSE Lasso00001 : {}".format(mean_squared_error(Y_test, baseline_pred),
mean_squared_error(Y_train,lin.predict(X_train)),
mean_squared_error(Y_train,lasso1.predict(X_train)),
mean_squared_error(Y_train,lasso001.predict(X_train)),
mean_squared_error(Y_train,lasso00001.predict(X_train))))

Train MSE 

MSE Baseline : 454495325402.7852 
MSE Lin : 365472987619.06616 
MSE Lasso1 : 365472987621.5367 
MSE Lasso001 : 365472987619.06665 
MSE Lasso00001 : 365472987619.0662


In [29]:
print("Test MSE \n")

print("MSE Baseline : {} \nMSE Lin : {} \nMSE Lasso1 : {} \nMSE Lasso001 : {} \nMSE Lasso00001 : {}".format(mean_squared_error(Y_test, baseline_pred),
mean_squared_error(Y_test,lin.predict(X_test)),
mean_squared_error(Y_test,lasso1.predict(X_test)),
mean_squared_error(Y_test,lasso001.predict(X_test)),
mean_squared_error(Y_test,lasso00001.predict(X_test))))

Test MSE 

MSE Baseline : 454495325402.7852 
MSE Lin : 424871139765.4882 
MSE Lasso1 : 424870474969.07965 
MSE Lasso001 : 424871134835.49866 
MSE Lasso00001 : 424871139717.30426


In [30]:
print("Linear Regression \n Proportion of coefficients equal to zero")
print(np.sum(lin.coef_==0)/len(lin.coef_))


Linear Regression 
 Proportion of coefficients equal to zero
0.0


RIDGE (La régression de crête est une méthode d’estimation des coefficients des modèles de régression multiple dans des scénarios où les variables indépendantes sont fortement corrélées) 

In [31]:
linear_regressor = LinearRegression()
ridge_regressor_small_alpha = Ridge(alpha = 10)
ridge_regressor_large_alpha = Ridge(alpha = 10000)

In [32]:
linear_regressor.fit(X_train, Y_train)
ridge_regressor_small_alpha.fit(X_train, Y_train)
ridge_regressor_large_alpha.fit(X_train, Y_train)

Ridge(alpha=10000)

In [33]:
print("Score on training: ")
print("Linear Regression score : {}".format(linear_regressor.score(X_train, Y_train)))
print("Ridge with small Alpha score : {}".format(ridge_regressor_small_alpha.score(X_train, Y_train)))
print("Ridge with large Alpha score : {}".format(ridge_regressor_large_alpha.score(X_train,Y_train)))

Score on training: 
Linear Regression score : 0.14813442423116552
Ridge with small Alpha score : 0.14540547686372285
Ridge with large Alpha score : 0.0027423402624887627


Plus le coef augmente, plus la valeur est proche de 0

In [34]:
coef = pd.DataFrame()
coef['X'] = X.columns
coef['coef_linear_regressor'] = linear_regressor.coef_
coef['coef_ridge_small_alpha'] = ridge_regressor_small_alpha.coef_
coef['coef_ridge_large_alpha'] = ridge_regressor_large_alpha.coef_
coef

Unnamed: 0,X,coef_linear_regressor,coef_ridge_small_alpha,coef_ridge_large_alpha
0,Store,14851.086205,26913.428417,885.903229
1,Temperature,-9863.907934,-9324.924306,-358.336066
2,Fuel_Price,-117946.876518,-101002.409383,-630.73188
3,CPI,-227157.60562,-189702.197596,-1592.950956
4,Unemployment,649.344463,8546.742908,602.083767
