# Installing and importing libraries

In [2]:
#!pip install numpy


In [3]:
#Importing libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RepeatedKFold

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

import pickle

# Reading and understanting the dataset

In [4]:
shop_df = pd.read_csv('sales.csv')
shop_df.head(5)

Unnamed: 0.1,Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
0,425390,366,4,2013-04-18,517,1,0,0,0,4422
1,291687,394,6,2015-04-11,694,1,0,0,0,8297
2,411278,807,4,2013-08-29,970,1,1,0,0,9729
3,664714,802,2,2013-05-28,473,1,1,0,0,6513
4,540835,726,4,2013-10-10,1068,1,1,0,0,10882


In [5]:
shop_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640840 entries, 0 to 640839
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Unnamed: 0           640840 non-null  int64 
 1   store_ID             640840 non-null  int64 
 2   day_of_week          640840 non-null  int64 
 3   date                 640840 non-null  object
 4   nb_customers_on_day  640840 non-null  int64 
 5   open                 640840 non-null  int64 
 6   promotion            640840 non-null  int64 
 7   state_holiday        640840 non-null  object
 8   school_holiday       640840 non-null  int64 
 9   sales                640840 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 48.9+ MB


<b>Here we can already see some interesting insight.    
1. There are no nans. This will save us some steps    
2. 'Unnamed : 0' as index
3. Date will probably have to be converted to a date format
4. We will have to change the type of state_holiday and look at its values
</b>

# Cleaning the dataset 

## Removing unused columns

In [6]:
shop_df.set_index("Unnamed: 0", inplace=True)
shop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 640840 entries, 425390 to 305711
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   store_ID             640840 non-null  int64 
 1   day_of_week          640840 non-null  int64 
 2   date                 640840 non-null  object
 3   nb_customers_on_day  640840 non-null  int64 
 4   open                 640840 non-null  int64 
 5   promotion            640840 non-null  int64 
 6   state_holiday        640840 non-null  object
 7   school_holiday       640840 non-null  int64 
 8   sales                640840 non-null  int64 
dtypes: int64(7), object(2)
memory usage: 48.9+ MB


## Cleaning state_holiday

In [7]:
shop_df['state_holiday'].unique()

array(['0', 'a', 'c', 'b'], dtype=object)

<b>I will change a, b, c to 1, 2, 3
</b>

In [8]:
shop_df['state_holiday'] = shop_df['state_holiday'].map({'0' : 0, 'a' : 1, 'b' : 2, 'c' : 3})

In [9]:
shop_df['state_holiday'].unique()

array([0, 1, 3, 2], dtype=int64)

In [10]:
shop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 640840 entries, 425390 to 305711
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   store_ID             640840 non-null  int64 
 1   day_of_week          640840 non-null  int64 
 2   date                 640840 non-null  object
 3   nb_customers_on_day  640840 non-null  int64 
 4   open                 640840 non-null  int64 
 5   promotion            640840 non-null  int64 
 6   state_holiday        640840 non-null  int64 
 7   school_holiday       640840 non-null  int64 
 8   sales                640840 non-null  int64 
dtypes: int64(8), object(1)
memory usage: 48.9+ MB


## Date column to date format to oridnal

In [11]:
shop_df['date'] = pd.to_datetime(shop_df['date'])

In [12]:
shop_df['date'] = shop_df['date'].apply(lambda x : x.toordinal())

In [13]:
shop_df.head(5)

Unnamed: 0_level_0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday,sales
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
425390,366,4,734976,517,1,0,0,0,4422
291687,394,6,735699,694,1,0,0,0,8297
411278,807,4,735109,970,1,1,0,0,9729
664714,802,2,735016,473,1,1,0,0,6513
540835,726,4,735151,1068,1,1,0,0,10882


In [14]:
shop_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 640840 entries, 425390 to 305711
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   store_ID             640840 non-null  int64
 1   day_of_week          640840 non-null  int64
 2   date                 640840 non-null  int64
 3   nb_customers_on_day  640840 non-null  int64
 4   open                 640840 non-null  int64
 5   promotion            640840 non-null  int64
 6   state_holiday        640840 non-null  int64
 7   school_holiday       640840 non-null  int64
 8   sales                640840 non-null  int64
dtypes: int64(9)
memory usage: 48.9 MB


# Train test split

In [15]:
features_shop_df = shop_df.drop('sales', axis = 1)
target_shop_df = shop_df['sales']

In [18]:
display(features_shop_df.head(5))
display(target_shop_df)

Unnamed: 0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday
0,366,4,734976,517,1,0,0,0
1,394,6,735699,694,1,0,0,0
2,807,4,735109,970,1,1,0,0
3,802,2,735016,473,1,1,0,0
4,726,4,735151,1068,1,1,0,0


0          4422
1          8297
2          9729
3          6513
4         10882
          ...  
640835     4553
640836    12307
640837     6800
640838     5344
640839        0
Name: sales, Length: 640840, dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(features_shop_df, target_shop_df, random_state = 8, test_size = 0.25)

# Prediction model 1 : KNeighborsRegressor

## k = 3

In [18]:
model_knn_k3 = KNeighborsRegressor(n_neighbors=3)

In [19]:
model_knn_k3.fit(X_train, y_train)
prediction = model_knn_k3.predict(X_test)

In [20]:
np.sqrt(mean_squared_error(y_test,prediction))

1697.3891435752662

In [21]:
r2_score(y_test, prediction)

0.6985390138567662

## k = 30

In [22]:
model_knn_k30 = KNeighborsRegressor(n_neighbors=30)
model_knn_k30.fit(X_train, y_train)
prediction_k30 = model_knn_k30.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,prediction_k30)))
print(r2_score(y_test, prediction_k30))

1572.0454582318232
0.7414178895608413


## k = 300

In [23]:
model_knn_k300 = KNeighborsRegressor(n_neighbors=300)
model_knn_k300.fit(X_train, y_train)
prediction_k300 = model_knn_k300.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,prediction_k300)))
print(r2_score(y_test, prediction_k300))

1610.2246887316319
0.7287053449226366


# Prediction model 2 : DecisionTreeRegressor

In [37]:
tree = DecisionTreeRegressor()
model_tree_ = tree.fit(X_train, y_train)

prediction = model_tree_.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,prediction)))
print(r2_score(y_test, prediction))

1349.6913417773078
0.8123472805031567


In [25]:
tree = DecisionTreeRegressor(max_depth=5)
model_tree_5 = tree.fit(X_train, y_train)

prediction = model_tree_5.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,prediction)))
print(r2_score(y_test, prediction))

1554.9399305239217
0.7470145712090284


In [26]:
tree = DecisionTreeRegressor(max_depth=8)
model_tree_8 = tree.fit(X_train, y_train)

prediction = model_tree_8.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,prediction)))
print(r2_score(y_test, prediction))

1457.6074630423286
0.7776948920223571


In [55]:
tree_15 = DecisionTreeRegressor(max_depth=15)
model_tree_15 = tree_15.fit(X_train, y_train)

prediction = model_tree_15.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,prediction)))
print(r2_score(y_test, prediction))

1286.2902144093364
0.8295630043015463


# Prediction model 3 : DecisionTreeRegressor with bagging

In [38]:
bagging_tree_15 = BaggingRegressor(
    DecisionTreeRegressor(max_depth=15), bootstrap=False, random_state=0)

bagging_tree_15.fit(X_train, y_train)

prediction = bagging_tree_15.predict(X_test)
print(np.sqrt(mean_squared_error(y_test,prediction)))
print(r2_score(y_test, prediction))

1283.189673902641
0.8303836743265848


# Cross validation on a single tree

In [25]:
cross_val = RepeatedKFold(n_splits=5, random_state=0)
cross_val.get_n_splits(features_shop_df, target_shop_df)

tree_15 = DecisionTreeRegressor(max_depth=15)
tree_15.fit(X_train, y_train)

scores = cross_val_score(tree_15, features_shop_df, target_shop_df, scoring='r2', cv=cross_val, n_jobs=-1)
scores.mean()

0.9066280932408727

# Exporting to Pickle

In [26]:
with open('tree_15_model.pkl', 'wb') as file:
    pickle.dump(tree_15, file)
    
X_train.to_pickle('train_features.pkl')
y_train.to_pickle('train_target.pkl')

X_test.to_pickle('test_features.pkl')
y_test.to_pickle('test_target.pkl')

# PART II : Validation

In [47]:
df_validation = pd.read_csv('validation_for students.csv')
df_validation.head(5)

Unnamed: 0,index,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday
0,272371,415,7,01/03/2015,0,0,0,0,0
1,558468,27,7,29/12/2013,0,0,0,0,0
2,76950,404,3,19/03/2014,657,1,1,0,0
3,77556,683,2,29/01/2013,862,1,0,0,0
4,456344,920,3,19/03/2014,591,1,1,0,0


In [48]:
df_validation = df_validation.rename(columns={'index': 'Unnamed: 0'})
df_validation.set_index("Unnamed: 0", inplace=True)
df_validation['state_holiday'] = df_validation['state_holiday'].map({'0' : 0, 'a' : 1, 'b' : 2, 'c' : 3})

In [49]:
df_validation['date'] = pd.to_datetime(df_validation['date'], format='%d/%m/%Y')
df_validation['date'] = df_validation['date'].dt.strftime("%Y-%m-%d")
df_validation['date'] = pd.to_datetime(df_validation['date'])
df_validation['date'] = df_validation['date'].apply(lambda x : x.toordinal())

In [50]:
df_validation.head()

Unnamed: 0_level_0,store_ID,day_of_week,date,nb_customers_on_day,open,promotion,state_holiday,school_holiday
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
272371,415,7,735658,0,0,0,0,0
558468,27,7,735231,0,0,0,0,0
76950,404,3,735311,657,1,1,0,0
77556,683,2,734897,862,1,0,0,0
456344,920,3,735311,591,1,1,0,0


In [45]:
my_model = pickle.load(open('tree_15_model.pkl', 'rb'))

In [51]:
pred = my_model.predict(df_validation)

In [56]:
pred_df = pd.DataFrame(pred, columns=['sales']) 

In [63]:
pred_df.head()

Unnamed: 0,sales
0,0.0
1,0.0
2,7494.650794
3,6526.591584
4,6299.767089


In [67]:
pred_df['index'] = df_validation.index

In [68]:
pred_df

Unnamed: 0,sales,index
0,0.000000,272371
1,0.000000,558468
2,7494.650794,76950
3,6526.591584,77556
4,6299.767089,456344
...,...,...
71200,0.000000,59062
71201,0.000000,687449
71202,5704.576923,207393
71203,6639.947658,233378


In [69]:
pred_df.to_csv('my_prediction.csv', index = False)