In [1]:
pip install downcast

Note: you may need to restart the kernel to use updated packages.


In [29]:
#Importing necessary libraries
from downcast import reduce
from tqdm import tqdm
import math
import pandas as pd
import numpy as np
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from prettytable import PrettyTable
import warnings 
from sklearn.metrics import mean_squared_error as mse
warnings.filterwarnings("ignore")

In [3]:
# Read the calendar table from calendar.csv in the working directory.
calendar = pd.read_csv(filepath_or_buffer='calendar.csv')

In [4]:
# Keep only the calendar rows dated on or before 22 May 2016 (day 1941);
# anything later is discarded.
calendar = calendar.loc[calendar['date'] <= '2016-05-22']
# Report which calendar columns contain missing values.
print("Columns with Null values in calendar dataset ", list(calendar.columns[calendar.isna().any()]))
# Every missing value is replaced by the placeholder string 'no_event'.
calendar = calendar.fillna('no_event')

Columns with Null values in calendar dataset  ['event_name_1', 'event_type_1', 'event_name_2', 'event_type_2']


In [5]:
# Read the evaluation sales table from sales_train_evaluation.csv.
sales_train_eval = pd.read_csv(filepath_or_buffer='sales_train_evaluation.csv')

In [6]:
# Read the price table from sell_prices.csv.
sell_price = pd.read_csv(filepath_or_buffer='sell_prices.csv')

In [7]:
# Pass each loaded table through downcast.reduce and rebind the name to the
# returned frame, one table at a time.
# NOTE(review): presumably reduce() shrinks column dtypes to lower memory use
# (and may change object columns to other dtypes) - confirm against the
# downcast package documentation.
calendar=reduce(calendar)
sales_train_eval=reduce(sales_train_eval)
sell_price=reduce(sell_price)

In [8]:
# Reshape from wide to long: the identifier columns are kept as-is, and every
# remaining column becomes its own row with the column name stored in 'd' and
# the value stored in 'sales'.
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
sales_final = pd.melt(sales_train_eval, id_vars=id_columns, var_name='d', value_name='sales')

In [9]:
# Attach the calendar columns to every sales row with a left join on the
# day key 'd', so each sale carries its date information.
sales_final = pd.merge(sales_final, calendar, on='d', how='left')

In [10]:
# Left-join the prices on week / item / store; sales rows without a matching
# price row keep NaN in the price columns.
sales_final = pd.merge(
    sales_final,
    sell_price,
    on=['wm_yr_wk', 'item_id', 'store_id'],
    how='left',
)
# Displays True when at least one value in the merged frame is missing.
sales_final.isna().to_numpy().any()

True

In [11]:
# Keep only rows dated on or after 1 Jan 2015; earlier history is discarded.
sales_final = sales_final.loc[sales_final['date'] >= '2015-01-01']

In [12]:
# The three source tables are no longer used; remove the names and run a
# garbage-collection pass so their memory can be reclaimed.
del calendar, sales_train_eval, sell_price
gc.collect()

0

In [13]:
# Replace each day label in 'd' with the first run of digits it contains,
# stored as a 16-bit integer.
day_numbers = sales_final['d'].str.extract(r"(\d+)", expand=False)
sales_final['d'] = day_numbers.astype(np.int16)

In [14]:
# Renumber the rows from 0 and discard the previous index (drop=True), which
# still carries the labels left over from the earlier row filtering.
sales_final = sales_final.reset_index(drop=True)

In [15]:
# Replace each identifier / event column with integer codes. A fresh
# LabelEncoder is fitted for every column on that column's own values.
column = [
    'id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',
    'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
]
for name in column:
    sales_final[name] = LabelEncoder().fit_transform(sales_final[name])

In [16]:
# Rows without a matching price row get a sell_price of 0.
# The filled column is assigned back explicitly: calling
# fillna(inplace=True) on the selected column is a chained in-place update,
# which under pandas copy-on-write only modifies a temporary and would leave
# the NaN values in sales_final.
sales_final['sell_price'] = sales_final['sell_price'].fillna(0)

In [17]:
# List the columns that still contain missing values; an empty list means
# there are none left.
has_missing = sales_final.isna().any()
sales_final.columns[has_missing].tolist()

[]

In [18]:
# Remove the 'wm_yr_wk', 'date' and 'weekday' columns in one pass, then
# renumber the rows from zero.
sales_final = (
    sales_final
    .drop(columns=['wm_yr_wk', 'date', 'weekday'])
    .reset_index(drop=True)
)

In [19]:
# Pass the assembled frame through downcast.reduce and keep the returned frame.
# NOTE(review): presumably this shrinks the column dtypes to save memory before
# model fitting - confirm against the downcast package documentation.
sales_final = reduce(sales_final)

In [20]:
# Preview the first five rows of the prepared frame.
sales_final.head(n=5)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,14370,1437,3,1,0,0,1434,0,6,1,2015,18,1,0,0,1,1,0,8.257812
1,14380,1438,3,1,0,0,1434,0,6,1,2015,18,1,0,0,1,1,0,3.970703
2,14390,1439,3,1,0,0,1434,0,6,1,2015,18,1,0,0,1,1,0,2.970703
3,14400,1440,3,1,0,0,1434,2,6,1,2015,18,1,0,0,1,1,0,4.640625
4,14410,1441,3,1,0,0,1434,3,6,1,2015,18,1,0,0,1,1,0,2.880859


In [21]:
# Chronological split on the day number 'd':
#   train      : d <= 1885
#   validation : 1885 < d <= 1913
#   test       : d > 1913
day = sales_final['d']
X_train = sales_final.loc[day <= 1885]
X_cv = sales_final.loc[(day > 1885) & (day <= 1913)]
X_test = sales_final.loc[day > 1913]

In [22]:
# Pull the target column ('sales') out of each split.
y_train = X_train['sales']
y_cv = X_cv['sales']
y_test = X_test['sales']

# Remove the target from the feature frames so the models cannot see it.
# drop() returns a new frame that is rebound to the same name; an in-place
# drop on frames obtained by filtering sales_final triggers pandas'
# SettingWithCopyWarning.
X_train = X_train.drop(columns=['sales'])
X_cv = X_cv.drop(columns=['sales'])
X_test = X_test.drop(columns=['sales'])

# Sanity check: features and target of each split must have the same row count.
print(X_train.shape,y_train.shape)
print(X_cv.shape,y_cv.shape)
print(X_test.shape, y_test.shape)

(13781480, 18) (13781480,)
(853720, 18) (853720,)
(853720, 18) (853720,)


Linear Regression

In [23]:
# Fit a linear regression on the training split and report its RMSE on the
# validation split.
model1 = LinearRegression().fit(X_train, y_train)
y_pred_cv = model1.predict(X_cv)
rmse = math.sqrt(mse(y_true=y_cv, y_pred=y_pred_cv))
print(f"Validation error of Linear regression is {rmse}")

Validation error of Linear regression is 3.536406676299515


In [24]:
# Score the fitted linear regression on the held-out test split (RMSE).
y_pred = model1.predict(X_test)
rmse = math.sqrt(mse(y_true=y_test, y_pred=y_pred))
print(f"test error of Linear regression is {rmse}")

test error of Linear regression is 3.5795101499811897


Decision Tree Regressor

In [25]:
# Fit a decision tree (seeded with random_state=0) on the training split and
# report its RMSE on the validation split.
model2 = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
y_pred_cv = model2.predict(X_cv)
rmse = math.sqrt(mse(y_true=y_cv, y_pred=y_pred_cv))
print(f"Validation error of Decision tree regression is {rmse}")

Validation error of Decision tree regression is 3.092120025208083


In [26]:
# Score the fitted decision tree on the held-out test split (RMSE).
y_pred = model2.predict(X_test)
rmse = math.sqrt(mse(y_true=y_test, y_pred=y_pred))
print(f"test error of Decision tree regression is {rmse}")

test error of Decision tree regression is 3.2934024994565854


Random Forest Regressor

In [27]:
# Fit a small random forest (5 trees) on the training split and report its
# RMSE on the validation split.
# random_state is fixed, matching the decision tree cell, because the forest
# draws bootstrap samples at random: without a seed the reported RMSE changes
# on every run and cannot be reproduced.
model3 = RandomForestRegressor(n_estimators=5, random_state=0)
model3.fit(X_train, y_train)
y_pred_cv = model3.predict(X_cv)
rmse = math.sqrt(mse(y_cv,y_pred_cv))
print(f"Validation error of Random Forest is {rmse}")

Validation error of Random Forest is 2.5144926783006114


In [28]:
# Score the fitted random forest on the held-out test split (RMSE).
y_pred = model3.predict(X_test)
rmse = math.sqrt(mse(y_true=y_test, y_pred=y_pred))
print(f"test error of Random forest is {rmse}")

test error of Random forest is 2.7114990763010636


In [31]:
# Summary table of RMSE scores per model. The scores are entered by hand as
# strings; they are not read from the variables computed in earlier cells.
print("Performance of models without any featurization")
prettyTable = PrettyTable(["No", "Model", "RMSE SCORE"])
summary_rows = [
    [1, "Simple Moving Average", "2.23"],
    [2, "Exponential Weighted Moving Average", "2.16"],
    [3, "Linear regression", "3.579"],
    [4, "Decision Tree", "3.293"],
    [5, "Random Forest", "2.711"],
]
for row in summary_rows:
    prettyTable.add_row(row)

print(prettyTable)

Performance of models without any featurization
+----+-------------------------------------+------------+
| No |                Model                | RMSE SCORE |
+----+-------------------------------------+------------+
| 1  |        Simple Moving Average        |    2.23    |
| 2  | Exponential Weighted Moving Average |    2.16    |
| 3  |          Linear regression          |   3.579    |
| 4  |            Decision Tree            |   3.293    |
| 5  |            Random Forest            |   2.711    |
+----+-------------------------------------+------------+
