#### Importing Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor as RFR
from xgboost import XGBRegressor as XGBR
from lightgbm import LGBMRegressor as LGBMR

  from pandas import MultiIndex, Int64Index


#### Loading Data

In [2]:
data = pd.read_csv('sales.csv')
print(data.shape)
data.head()

(640840, 10)


Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,425390,366,4,2013-04-18,517,1,0,0,0,4422
1,291687,394,6,2015-04-11,694,1,0,0,0,8297
2,411278,807,4,2013-08-29,970,1,1,0,0,9729
3,664714,802,2,2013-05-28,473,1,1,0,0,6513
4,540835,726,4,2013-10-10,1068,1,1,0,0,10882


#### Checking Info and Dtypes

In [3]:
data.info

<bound method DataFrame.info of         Unnamed: 0  store_ID  day_of_week        date  nb_customers_on_day  \
0           425390       366            4  2013-04-18                  517   
1           291687       394            6  2015-04-11                  694   
2           411278       807            4  2013-08-29                  970   
3           664714       802            2  2013-05-28                  473   
4           540835       726            4  2013-10-10                 1068   
...            ...       ...          ...         ...                  ...   
640835      359783       409            6  2013-10-26                  483   
640836      152315        97            1  2014-04-14                  987   
640837      117952       987            1  2014-07-07                  925   
640838      435829      1084            4  2014-06-12                  725   
640839      305711       695            7  2015-05-03                    0   

        open  promotion state_h

In [4]:
data.dtypes

Unnamed: 0              int64
store_ID                int64
day_of_week             int64
date                   object
nb_customers_on_day     int64
open                    int64
promotion               int64
state_holiday          object
school_holiday          int64
sales                   int64
dtype: object

#### Checking Nans & Duplicates

In [5]:
data.isna().sum().sum()

0

In [6]:
data.duplicated().sum()

0

#### Cleaning and fixing Dtypes

In [7]:
def clean_dataframe(data):
    data.columns = data.columns.str.lower()
    data['open'] = data['open'].astype('object')
    data['promotion'] = data['promotion'].astype('object')
    data['day_of_week'] = data['day_of_week'].astype('object')
    data['state_holiday'] = data['state_holiday'].astype('object')
    data['school_holiday'] = data['school_holiday'].astype('object')
    data["store_id"] = data["store_id"].astype("object")
    data.date = data.date.astype("datetime64")
    data["year"] = pd.DatetimeIndex(data.date).year.astype("object")
    data["month"] = pd.DatetimeIndex(data.date).month.astype("object")
    data["day"] = pd.DatetimeIndex(data.date).day.astype("object")
    data.date = data.date.astype("object")
    data['state_holiday'] = data['state_holiday'].str.replace('a','1').str.replace('b','2').str.replace('c','3').astype('object')
    try:
        data = data.drop(columns = ["unnamed: 0","date"])
    except:
        pass
    return data

In [8]:
clean_dataframe(data)

Unnamed: 0,store_id,day_of_week,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales,year,month,day
0,366,4,517,1,0,0,0,4422,2013,4,18
1,394,6,694,1,0,0,0,8297,2015,4,11
2,807,4,970,1,1,0,0,9729,2013,8,29
3,802,2,473,1,1,0,0,6513,2013,5,28
4,726,4,1068,1,1,0,0,10882,2013,10,10
...,...,...,...,...,...,...,...,...,...,...,...
640835,409,6,483,1,0,0,0,4553,2013,10,26
640836,97,1,987,1,1,0,0,12307,2014,4,14
640837,987,1,925,1,0,0,0,6800,2014,7,7
640838,1084,4,725,1,0,0,0,5344,2014,6,12


In [9]:
clean_dataframe(data).dtypes

store_id               object
day_of_week            object
nb_customers_on_day     int64
open                   object
promotion              object
state_holiday          object
school_holiday         object
sales                   int64
year                   object
month                  object
day                    object
dtype: object

#### X/y Split

In [17]:
y = clean_dataframe(data)['sales']
X = clean_dataframe(data).drop(['sales'], axis=1)

#### Splitting between num and cat

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
numericals_train = X_train.select_dtypes(np.number)
numericals_test = X_test.select_dtypes(np.number)
categoricals_train = X_train.select_dtypes(object)
categoricals_test = X_test.select_dtypes(object)

#### Scaling Num

In [19]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(numericals_train)
numericals_train_scaled = scaler.transform(numericals_train)
numericals_test_scaled = scaler.transform(numericals_test)
cols = scaler.get_feature_names_out(input_features = numericals_train.columns)
numericals_train_scaled = pd.DataFrame(numericals_train_scaled, columns=cols)
numericals_test_scaled = pd.DataFrame(numericals_test_scaled, columns=cols)

#### Encoding Cat

In [20]:
ordinal_train = categoricals_train[['store_id', 'day_of_week','open','promotion', 'state_holiday', 'school_holiday',"year","month","day"]]
ordinal_test = categoricals_test[['store_id', 'day_of_week', 'open', "promotion", 'state_holiday', 'school_holiday',"year","month","day"]]

In [21]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

#### Defining the models 

In [38]:
def run_pipeline_regr(X_train, X_test, y_train, y_test):
    linreg = LinearRegression()
    dtreg = DecisionTreeRegressor()
    knreg = KNeighborsRegressor()
    # mlp = MLPRegressor(max_iter = 700)
    # rfreg = RandomForestRegressor()

    model_pipeline = [linreg, dtreg, knreg]
    model_name = ["linreg","decision_tree_reg","knn_reg"]
    # model_pipeline = [linreg]
    # model_name = ["linreg"]
    
    preds_train = {}
    preds_test = {}
    scores_train = {}
    scores_test = {}
    mses_train = {}
    mses_test = {}
    rmses_train = {}
    rmses_test = {}
    maes_train = {}
    maes_test = {}
    for i,j in zip(model_pipeline, model_name):
        i.fit(X_train, y_train)
        pred_train = i.predict(X_train)
        preds_train[j] = pred_train
        pred_test = i.predict(X_test)
        preds_test[j] = pred_test

        mean_score_train = np.mean(cross_val_score(i, X_train, y_train, cv = 5))
        scores_train[j] = mean_score_train
        mean_score_test = np.mean(cross_val_score(i, X_test, y_test, cv = 5))
        scores_test[j] = mean_score_test

        mse_train = mse(pred_train, y_train)
        mse_test = mse(pred_test, y_test)
        rmse_train = np.sqrt(mse_train)
        rmse_test = np.sqrt(mse_test)
        mae_train = mae(pred_train, y_train)
        mae_test = mae(pred_test, y_test)

        mses_train[j] = mse_train
        mses_test[j] = mse_test
        rmses_train[j] = rmse_train
        rmses_test[j] = rmse_test
        maes_train[j] = mae_train
        maes_test[j] = mae_test 

    return preds_train, preds_test, scores_train, scores_test, mses_train, mses_test, rmses_train, rmses_test, maes_train, maes_test

#### Concatenation of treated Num and Cat data

In [39]:
numericals_train_scaled = numericals_train_scaled.reset_index(drop = True)
numericals_test_scaled = numericals_test_scaled.reset_index(drop = True)
ordinal_train = ordinal_train.reset_index(drop = True)
ordinal_test = ordinal_test.reset_index(drop = True)

X_train_processed = pd.concat([numericals_train_scaled, ordinal_train], axis = 1)
X_test_processed = pd.concat([numericals_test_scaled, ordinal_test], axis = 1)

#### Testing the models and checking scores

In [43]:
preds_train, preds_test, scores_train, scores_test, mses_train, mses_test, rmses_train, rmses_test, maes_train, maes_test = run_pipeline_regr(X_train, X_test, y_train, y_test)

In [44]:
scores_test

{'linreg': 0.8519590365073251,
 'decision_tree_reg': 0.8301853579530916,
 'knn_reg': 0.8579713440824724}

In [45]:
rmses_test

{'linreg': 1479.083112248861,
 'decision_tree_reg': 1192.2919804709227,
 'knn_reg': 1315.0692316643572}

#### Importing new data for predictions

In [26]:
val = pd.read_csv('validation_for_students.csv')
print(val.shape)
val.head()

(71205, 9)


Unnamed: 0,True_index,Store_ID,Day_of_week,Date,Nb_customers_on_day,Open,Promotion,State_holiday,School_holiday
0,7,764,4,2013-12-26,0,0,0,c,1
1,19,22,3,2013-05-22,449,1,0,0,1
2,31,1087,6,2013-06-29,622,1,0,0,0
3,45,139,6,2013-08-17,314,1,0,0,0
4,56,568,1,2014-04-07,356,1,0,0,0


#### Applying the cleaning function in the new data

In [32]:
val = clean_dataframe(val)
val

Unnamed: 0,true_index,store_id,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,year,month,day
0,7,764,4,2013-12-26 00:00:00,0,0,0,3,1,2013,12,26
1,19,22,3,2013-05-22 00:00:00,449,1,0,0,1,2013,5,22
2,31,1087,6,2013-06-29 00:00:00,622,1,0,0,0,2013,6,29
3,45,139,6,2013-08-17 00:00:00,314,1,0,0,0,2013,8,17
4,56,568,1,2014-04-07 00:00:00,356,1,0,0,0,2014,4,7
...,...,...,...,...,...,...,...,...,...,...,...,...
71200,712004,217,2,2015-01-13 00:00:00,633,1,1,0,0,2015,1,13
71201,712018,604,3,2014-04-30 00:00:00,743,1,1,0,0,2014,4,30
71202,712020,1021,5,2014-07-18 00:00:00,1852,1,1,0,1,2014,7,18
71203,712023,28,3,2014-08-27 00:00:00,0,0,0,0,1,2014,8,27


#### Saving the index as a new variable for late purposes and dropping now

In [33]:
trueindex = val['true_index']
val = val.drop(['true_index'], axis= 1)

In [34]:
val

Unnamed: 0,store_id,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,year,month,day
0,764,4,2013-12-26 00:00:00,0,0,0,3,1,2013,12,26
1,22,3,2013-05-22 00:00:00,449,1,0,0,1,2013,5,22
2,1087,6,2013-06-29 00:00:00,622,1,0,0,0,2013,6,29
3,139,6,2013-08-17 00:00:00,314,1,0,0,0,2013,8,17
4,568,1,2014-04-07 00:00:00,356,1,0,0,0,2014,4,7
...,...,...,...,...,...,...,...,...,...,...,...
71200,217,2,2015-01-13 00:00:00,633,1,1,0,0,2015,1,13
71201,604,3,2014-04-30 00:00:00,743,1,1,0,0,2014,4,30
71202,1021,5,2014-07-18 00:00:00,1852,1,1,0,1,2014,7,18
71203,28,3,2014-08-27 00:00:00,0,0,0,0,1,2014,8,27


#### Splitting between Num and Cat and Scaling + Encoding

In [57]:
numericals_val = val.select_dtypes(np.number)
categoricals_val = val.select_dtypes(object)
numericals_val_scaled = scaler.transform(numericals_val)
cols = scaler.get_feature_names_out(input_features = numericals_val.columns)
numericals_val_scaled = pd.DataFrame(numericals_val_scaled, columns=cols)

In [58]:
ordinal_val = categoricals_val[['store_id', 'day_of_week', 'open', "promotion", 'state_holiday', 'school_holiday',"year","month","day"]]

#### Concatenation of treated Num and Cat data

In [59]:
numericals_val_scaled = numericals_val_scaled.reset_index(drop = True)
ordinal_val = ordinal_val.reset_index(drop = True)
X_val_processed = pd.concat([numericals_val_scaled, ordinal_val], axis = 1)

#### Applying our best model to the new data

In [62]:
dtreg1 = DecisionTreeRegressor().fit(X_train_processed, y_train)
pred_train1 = dtreg.predict(X_val_processed)
pred_train1
pred_train1.tolist()
pred = pred_train1.tolist()

#### Saving the predictions as a new Csv

In [64]:
trueindex = pd.DataFrame(trueindex)
trueindex["prediction"] = pred
result = trueindex
result.to_csv("mosketeers.csv", index = False)
result

Unnamed: 0,true_index,prediction
0,7,0.0
1,19,3739.0
2,31,6987.0
3,45,3885.0
4,56,3523.0
...,...,...
71200,712004,4576.0
71201,712018,7355.0
71202,712020,14913.0
71203,712023,0.0
