In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Pour afficher les graphiques en ligne dans un notebook Jupyter
%matplotlib inline

In [2]:
# Data cleaning
sales_data = pd.read_csv("sales.csv")
sales_df = sales_data.rename(columns={"Unnamed: 0": "uid"})

# X and y determination
X = sales_df.drop(columns=["uid", "date", "open", "sales"])
y = sales_df["sales"]

# Preprocessing
from sklearn import preprocessing

converted = preprocessing.LabelEncoder()
converted.fit(['0','a', 'b', 'c', 'd'])
X["state_holiday"] = converted.transform(X["state_holiday"])

In [3]:
X

Unnamed: 0,store_ID,day_of_week,nb_customers_on_day,promotion,state_holiday,school_holiday
0,366,4,517,0,0,0
1,394,6,694,0,0,0
2,807,4,970,1,0,0
3,802,2,473,1,0,0
4,726,4,1068,1,0,0
...,...,...,...,...,...,...
640835,409,6,483,0,0,0
640836,97,1,987,1,0,0
640837,987,1,925,0,0,0
640838,1084,4,725,0,0,0


In [4]:
# Standardization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

In [5]:
# Data split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

kf = KFold(n_splits=5, shuffle=False)
nbSplits = kf.get_n_splits()

acc_snooping = np.zeros((nbSplits,)) #"snooping" refers to a scenario where information from outside the training dataset is inadvertently used to make decisions about the model.
i=0

#We will build the predicted y from the partial predictions on the test of each of the folds
yhat = y.copy()

for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    dt = KNeighborsClassifier(1)
    dt.fit(X_train,y_train)
    yhat[test_index] = dt.predict(X_test)
    acc_snooping[i] = accuracy_score(yhat[test_index], y_test)

    print(f"R-squared: {(r2_score(y_test, yhat[test_index])):.4f}")
    print(f"MSE: {( mean_squared_error(y_test, yhat[test_index]) / 1_000_000):.4f}")

    i=i+1

print ('Mean accuracy: '+ str(np.mean(acc_snooping)))

R-squared: 0.9105
MSE: 1.3243
R-squared: 0.9125
MSE: 1.3077
R-squared: 0.9121
MSE: 1.3064
R-squared: 0.9124
MSE: 1.2929
R-squared: 0.9144
MSE: 1.2678
Mean accuracy: 0.17044191998002622


In [6]:
# Data split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

kf = KFold(n_splits=42, shuffle=False)
nbSplits = kf.get_n_splits()

i=0

#We will build the predicted y from the partial predictions on the test of each of the folds
yhat = y.copy()

ln = LinearRegression()

for i, (train_index, test_index) in enumerate(kf.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # dt = KNeighborsClassifier(1)
    ln.fit(X_train,y_train)
    yhat[test_index] = ln.predict(X_test)

    print(f"R-squared: {(r2_score(y_test, yhat[test_index])):.4f}")
    print(f"MSE: {( mean_squared_error(y_test, yhat[test_index]) / 1_000_000):.4f}")

    i=i+1

R-squared: 0.8456
MSE: 2.3037
R-squared: 0.8372
MSE: 2.4000


 4089.55017322]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  yhat[test_index] = ln.predict(X_test)


R-squared: 0.8415
MSE: 2.3351
R-squared: 0.8418
MSE: 2.3502
R-squared: 0.8486
MSE: 2.1790
R-squared: 0.8399
MSE: 2.3597
R-squared: 0.8446
MSE: 2.3164
R-squared: 0.8495
MSE: 2.2773
R-squared: 0.8456
MSE: 2.3061
R-squared: 0.8407
MSE: 2.3685
R-squared: 0.8441
MSE: 2.3284
R-squared: 0.8472
MSE: 2.3178
R-squared: 0.8364
MSE: 2.3976
R-squared: 0.8481
MSE: 2.2227
R-squared: 0.8508
MSE: 2.3001
R-squared: 0.8393
MSE: 2.3981
R-squared: 0.8450
MSE: 2.3028
R-squared: 0.8377
MSE: 2.4308
R-squared: 0.8439
MSE: 2.3735
R-squared: 0.8442
MSE: 2.2816
R-squared: 0.8401
MSE: 2.3463
R-squared: 0.8418
MSE: 2.3267
R-squared: 0.8423
MSE: 2.3717
R-squared: 0.8427
MSE: 2.3499
R-squared: 0.8382
MSE: 2.3413
R-squared: 0.8405
MSE: 2.3591
R-squared: 0.8562
MSE: 2.1545
R-squared: 0.8426
MSE: 2.3083
R-squared: 0.8370
MSE: 2.4055
R-squared: 0.8385
MSE: 2.4054
R-squared: 0.8443
MSE: 2.3143
R-squared: 0.8415
MSE: 2.2966
R-squared: 0.8469
MSE: 2.2702
R-squared: 0.8447
MSE: 2.2928
R-squared: 0.8458
MSE: 2.2421
R-squared:

In [7]:
#Save the model to disk (it can alternatively be stored in a string)
import pickle
ofname = open('linear_model_v2.pkl', 'wb')
s = pickle.dump(ln,ofname)
ofname.close()
print (s)

None


In [8]:
ofname = open('linear_model_v2.pkl','rb') #Open in binary format. You never know how it was saved.
ln_import = pickle.load(ofname)

In [9]:
y_pred_import = ln_import.predict(X_test)

In [10]:
# Import metrics.
from sklearn.metrics import mean_squared_error, r2_score

# Calculate and print R^2 score.
r2 = r2_score(y_test, y_pred_import)
print(f"R-squared: {r2:.4f}")

# Calculate and print MSE score.
mse = mean_squared_error(y_test, y_pred_import) / 1_000_000
print(f"MSE: {mse:.4f}")

rmse = (mse / 1_000_000) ** 0.5
print(f"Root mean squared error: {rmse:.4f}")

R-squared: 0.8368
MSE: 2.3751
Root mean squared error: 0.0015
