# Import packages

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Pour afficher les graphiques en ligne dans un notebook Jupyter
%matplotlib inline

# Import CSV

Import data with the read_csv method from pandas library to visualize the data.

In [2]:
sales_data = pd.read_csv("sales.csv")

In [None]:
sales_data

In [None]:
sales_data.isna()

In [None]:
print(sales_data.columns)

The Unnamed:0 column can become the index column because each value is unique and there is no duplicate.
=> Has become uid for cleared understanding

In [3]:
sales_df = sales_data.rename(columns={"Unnamed: 0": "uid"})
sales_df = sales_df.set_index(["uid"])

In [None]:
sales_df

In [None]:
sales_df.head()

# DataViz

## Visualize the data

Sales by store ID

Visualize each feature compared to the target variable : sales.

In [None]:
sales_df.hist(bins=50, figsize=(20, 15))
plt.show()

# Feature engineering

Feature selection

In [None]:
sales_df.columns

Convert state_holiday from str to int.

In [None]:
# sales_df = pd.get_dummies("sales_df", columns=["school_holiday"], drop_first=True)

In [None]:
# sales_df

Drop the columns that are not useful for our analysis.

drop uid ? / meaningless ? 
xgBoost ? Other models ? Cross validation to gain confidence on the results


In [4]:
X = sales_df.drop(columns=["date", "open", "sales"])
y = sales_df["sales"]

In [None]:
X.head()

In [5]:
X["state_holiday"].unique()

array(['0', 'a', 'c', 'b'], dtype=object)

Because state_holiday is a categorical variable, we can convert it to a numerical variable using the label encoding method.

In [6]:
from sklearn import preprocessing
converted = preprocessing.LabelEncoder()
converted.fit(['0','a', 'b', 'c', 'd'])
X["state_holiday"] = converted.transform(X["state_holiday"])

We check again the data type of the column to see if the conversion was successful.

In [7]:
X["state_holiday"].unique()

array([0, 1, 3, 2])

In [None]:
y.head()

Just to make sure, we compare the shapes of y and X to see if they are the same.

In [8]:
print(f"Shape of X => {X.shape}")
print(f"Shape of y => {y.shape}")

Shape of X => (640840, 6)
Shape of y => (640840,)


# Train-test split

We split the data into training and testing sets using the train_test_split method from the sklearn library.
We set the random_state to 42 to ensure that the results are reproducible.

In [9]:
from sklearn.model_selection import train_test_split

# KFold / Or Bootstraping with other random state

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Standardize the data to ensure that all variables are on the same scale, which can improve model performance and numerical stability.

minMax scaler

In [10]:
# Import StandardScaler.
# from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Instantiate StandardScaler.
scaler = MinMaxScaler()

# Fit and transform training data.
X_train_scaled = scaler.fit_transform(X_train)

# Also transform test data.
X_test_scaled = scaler.transform(X_test)

We now create a linear regression model using the LinearRegression method from the sklearn library.
We fit the model to the training data using the fit method.

In [11]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X_train_scaled, y_train)

We predict the target variable using the predict method on the test data.

In [12]:
y_pred = lin_reg.predict(X_test_scaled)

We evaluate the model using the mean squared error (MSE) and R-squared (R2) metrics from the sklearn library.

In [14]:
# Import metrics.
from sklearn.metrics import mean_squared_error, r2_score

# Calculate and print R^2 score.
r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2:.4f}")

# Calculate and print MSE score.
mse = mean_squared_error(y_test, y_pred) / 1_000_000
print(f"MSE: {mse:.4f}")

rmse = (mse / 1_000_000) ** 0.5
print(f"Root mean squared error: {rmse:.4f}")

R-squared: 0.8401
MSE: 2.3643
Root mean squared error: 0.0015


In [15]:
#Save the model to disk (it can alternatively be stored in a string)
import pickle
ofname = open('linear_model_v1.pkl', 'wb')
s = pickle.dump(lin_reg,ofname)
ofname.close()
print (s)

#Clear the namespace
%reset -f

None


In [16]:
#Check we don't have the variable in the namespace. This should give a NameError
print(lin_reg)

NameError: name 'lin_reg' is not defined