# Base Libraries --> Import

In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
# pip install lightgbm 

In [3]:
# pip install shap 

In [4]:
#light bgm model
import lightgbm as lgb
import shap
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer

# Configuration
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# Import Data 

In [5]:
train = pd.read_csv('train.csv',parse_dates=['date'])
test = pd.read_csv('test.csv',parse_dates=['date'])

df = pd.concat([train, test], sort=False)

In [6]:
print(train.shape, test.shape, df.shape)

(913000, 4) (45000, 4) (958000, 5)


In [7]:
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [8]:
df.head()

Unnamed: 0,date,store,item,sales,id
0,2013-01-01,1,1,13.0,
1,2013-01-02,1,1,11.0,
2,2013-01-03,1,1,14.0,
3,2013-01-04,1,1,13.0,
4,2013-01-05,1,1,10.0,


# Data Analysis 

In [9]:
train.store.nunique() 

10

In [10]:
test.store.nunique(), train.item.nunique(), test.item.nunique()

(10, 50, 50)

In [11]:
## there are 10 store 
## there are 50 items 

In [12]:
# Time Range
train["date"].min(), train["date"].max(), test["date"].min(), test["date"].max()


(Timestamp('2013-01-01 00:00:00'),
 Timestamp('2017-12-31 00:00:00'),
 Timestamp('2018-01-01 00:00:00'),
 Timestamp('2018-03-31 00:00:00'))

In [13]:
# train == 2013-01-01 to 2017-12-31
# test == 2018-01-01 to 2018-03-31

In [14]:
# How many items are in the store?
df.groupby(["store"])["item"].nunique()

store
1     50
2     50
3     50
4     50
5     50
6     50
7     50
8     50
9     50
10    50
Name: item, dtype: int64

In [15]:
# Summary Stats for each store
df.groupby(["store"]).agg({"sales": ["count","sum", "mean", "median", "std", "min", "max"]})

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales
Unnamed: 0_level_1,count,sum,mean,median,std,min,max
store,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,91300,4315603.0,47.268379,44.0,24.006252,1.0,155.0
2,91300,6120128.0,67.033165,62.0,33.59581,3.0,231.0
3,91300,5435144.0,59.530602,55.0,29.974102,3.0,196.0
4,91300,5012639.0,54.902946,51.0,27.733097,4.0,186.0
5,91300,3631016.0,39.770164,37.0,20.365757,2.0,130.0
6,91300,3627670.0,39.733516,37.0,20.310451,0.0,134.0
7,91300,3320009.0,36.363735,34.0,18.684825,1.0,122.0
8,91300,5856169.0,64.142048,60.0,32.231751,4.0,204.0
9,91300,5025976.0,55.049025,51.0,27.832186,4.0,195.0
10,91300,5360158.0,58.709288,54.0,29.554994,3.0,187.0


In [16]:
# Summary Stats for each item
df.groupby(["item"]).agg({"sales": ["count","sum", "mean", "median", "std", "min", "max"]})

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales
Unnamed: 0_level_1,count,sum,mean,median,std,min,max
item,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,18260,401384.0,21.981599,21.0,8.468922,1.0,59.0
2,18260,1069564.0,58.574151,56.0,20.093015,9.0,150.0
3,18260,669087.0,36.642223,35.0,13.179441,7.0,104.0
4,18260,401907.0,22.010241,21.0,8.403898,0.0,66.0
5,18260,335230.0,18.358708,18.0,7.265167,1.0,50.0
6,18260,1068281.0,58.503888,56.0,20.174898,11.0,148.0
7,18260,1068777.0,58.531051,56.0,20.146002,11.0,141.0
8,18260,1405108.0,76.950055,74.0,26.130697,15.0,181.0
9,18260,938379.0,51.389869,49.5,17.790158,6.0,134.0
10,18260,1337133.0,73.227437,70.0,24.823725,14.0,175.0


# Feature Engineering

In [17]:

# Time-related feature == creating time related features

def create_date_features(df):
    df['month'] = df.date.dt.month
    df['day_of_month'] = df.date.dt.day
    df['day_of_year'] = df.date.dt.dayofyear
    df['week_of_year'] = df.date.dt.weekofyear
    df['day_of_week'] = df.date.dt.dayofweek + 1
    df['year'] = df.date.dt.year
    df["is_wknd"] = df.date.dt.weekday // 4   ### df.date.dt.weekday => Starts from '0' means '0' = 'Monday'. 
                                               ##So, '// 4' will give '1' when day number equals
                                            ## so does for 5 and 6 gives '1' for //4
    df['is_month_start'] = df.date.dt.is_month_start.astype(int)
    df['is_month_end'] = df.date.dt.is_month_end.astype(int)
    df['is_year_start'] = df.date.dt.is_year_start.astype(int)
    df['is_year_end'] = df.date.dt.is_year_end.astype(int)
    return df
df = create_date_features(df)

In [18]:
df.head(10)

Unnamed: 0,date,store,item,sales,id,month,day_of_month,day_of_year,week_of_year,day_of_week,year,is_wknd,is_month_start,is_month_end,is_year_start,is_year_end
0,2013-01-01,1,1,13.0,,1,1,1,1,2,2013,0,1,0,1,0
1,2013-01-02,1,1,11.0,,1,2,2,1,3,2013,0,0,0,0,0
2,2013-01-03,1,1,14.0,,1,3,3,1,4,2013,0,0,0,0,0
3,2013-01-04,1,1,13.0,,1,4,4,1,5,2013,1,0,0,0,0
4,2013-01-05,1,1,10.0,,1,5,5,1,6,2013,1,0,0,0,0
5,2013-01-06,1,1,12.0,,1,6,6,1,7,2013,1,0,0,0,0
6,2013-01-07,1,1,10.0,,1,7,7,2,1,2013,0,0,0,0,0
7,2013-01-08,1,1,9.0,,1,8,8,2,2,2013,0,0,0,0,0
8,2013-01-09,1,1,12.0,,1,9,9,2,3,2013,0,0,0,0,0
9,2013-01-10,1,1,9.0,,1,10,10,2,4,2013,0,0,0,0,0


In [19]:

#Lag/Shifted Features
#==> 1. every feature should be sorted 
#==> 2. generate new features for each 'item' in each 'store' according to 'date' order

#to sort 
df.sort_values(by=['store', 'item', 'date'], axis=0, inplace=True)

#lagging ==> how many sales 1 day before in column "lag1" and so on
"""but here new features are being generated using target variable= sales 
and since target variable is being used == data leakage is possible / overfitteing of data ==> machine loses generalization ability
so adding random noise"""



#lagging 
def lag_features(dataframe, lags):
    for lag in lags:
        dataframe['sales_lag_' + str(lag)] = dataframe.groupby(["store", "item"])['sales'].transform(
            lambda x: x.shift(lag)) + random_noise(dataframe) # Adding random noise to each value.
    return dataframe

#random noise
def random_noise(dataframe):
    return np.random.normal(scale=1.6, size=(len(dataframe),))

# passing variables and creating feature
lags_list = [91, 98, 105, 112, 119, 126, 182, 364, 546, 728] 
df = lag_features(df, lags_list) 




In [20]:
df.tail()

Unnamed: 0,date,store,item,sales,id,month,day_of_month,day_of_year,week_of_year,day_of_week,year,is_wknd,is_month_start,is_month_end,is_year_start,is_year_end,sales_lag_91,sales_lag_98,sales_lag_105,sales_lag_112,sales_lag_119,sales_lag_126,sales_lag_182,sales_lag_364,sales_lag_546,sales_lag_728
44995,2018-03-27,10,50,,44995.0,3,27,86,13,2,2018,0,0,0,0,0,41.035496,53.471711,65.9214,66.574881,67.547255,82.170129,76.913219,58.640642,94.440614,67.813116
44996,2018-03-28,10,50,,44996.0,3,28,87,13,3,2018,0,0,0,0,0,64.30583,49.864437,70.128985,61.567266,75.185272,79.96069,80.970349,72.246622,80.037714,66.380798
44997,2018-03-29,10,50,,44997.0,3,29,88,13,4,2018,0,0,0,0,0,56.963765,62.272422,71.641714,64.291749,72.563659,86.758995,82.684302,69.469131,95.438395,75.614672
44998,2018-03-30,10,50,,44998.0,3,30,89,13,5,2018,1,0,0,0,0,74.553882,76.236873,73.681981,69.022497,68.084453,81.1677,88.689951,67.390748,81.700766,84.195485
44999,2018-03-31,10,50,,44999.0,3,31,90,13,6,2018,1,0,1,0,0,60.521602,72.380718,51.296377,72.797493,49.00843,78.788735,104.471499,100.669855,97.555881,81.826737


In [21]:

## Rolling mean feature ==> takes "n" previous target variable and averages them and returns as a new value

"""For instance "roll2" takes previous 2 variable and averages it 
Again it uses target variables 
so adding noise on purpose """

def roll_mean_features(dataframe, windows):
    for window in windows:
        dataframe['sales_roll_mean_' + str(window)] = dataframe.groupby(["store", "item"])['sales']. \
                                                          transform(
            lambda x: x.shift(1).rolling(window=window, min_periods=10, win_type="triang").mean()) + random_noise(dataframe)
    return dataframe

roll_mean_list = [365, 546]
df = roll_mean_features(df, roll_mean_list)

In [22]:
df.tail()

Unnamed: 0,date,store,item,sales,id,month,day_of_month,day_of_year,week_of_year,day_of_week,year,is_wknd,is_month_start,is_month_end,is_year_start,is_year_end,sales_lag_91,sales_lag_98,sales_lag_105,sales_lag_112,sales_lag_119,sales_lag_126,sales_lag_182,sales_lag_364,sales_lag_546,sales_lag_728,sales_roll_mean_365,sales_roll_mean_546
44995,2018-03-27,10,50,,44995.0,3,27,86,13,2,2018,0,0,0,0,0,41.035496,53.471711,65.9214,66.574881,67.547255,82.170129,76.913219,58.640642,94.440614,67.813116,87.080172,85.028879
44996,2018-03-28,10,50,,44996.0,3,28,87,13,3,2018,0,0,0,0,0,64.30583,49.864437,70.128985,61.567266,75.185272,79.96069,80.970349,72.246622,80.037714,66.380798,87.622592,85.147034
44997,2018-03-29,10,50,,44997.0,3,29,88,13,4,2018,0,0,0,0,0,56.963765,62.272422,71.641714,64.291749,72.563659,86.758995,82.684302,69.469131,95.438395,75.614672,83.761693,84.776102
44998,2018-03-30,10,50,,44998.0,3,30,89,13,5,2018,1,0,0,0,0,74.553882,76.236873,73.681981,69.022497,68.084453,81.1677,88.689951,67.390748,81.700766,84.195485,90.297854,85.793919
44999,2018-03-31,10,50,,44999.0,3,31,90,13,6,2018,1,0,1,0,0,60.521602,72.380718,51.296377,72.797493,49.00843,78.788735,104.471499,100.669855,97.555881,81.826737,83.152849,87.323165


In [23]:

#Exponentially Weighted Mean Features ==> parameter alpha [0,1]
                                         #alpha close to 1 == more weight to close days 
    
def ewm_features(dataframe, alphas, lags):
    for alpha in alphas:
        for lag in lags:
            dataframe['sales_ewm_alpha_' + str(alpha).replace(".", "") + "_lag_" + str(lag)] = \
                dataframe.groupby(["store", "item"])['sales'].transform(lambda x: x.shift(lag).ewm(alpha=alpha).mean())
    return dataframe

# In here we have two combinations : alphas and lags. 
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]
lags = [91, 98, 105, 112, 180, 270, 365, 546, 728]

df = ewm_features(df, alphas, lags)



In [24]:
df.tail(100)

Unnamed: 0,date,store,item,sales,id,month,day_of_month,day_of_year,week_of_year,day_of_week,year,is_wknd,is_month_start,is_month_end,is_year_start,is_year_end,sales_lag_91,sales_lag_98,sales_lag_105,sales_lag_112,sales_lag_119,sales_lag_126,sales_lag_182,sales_lag_364,sales_lag_546,sales_lag_728,sales_roll_mean_365,sales_roll_mean_546,sales_ewm_alpha_095_lag_91,sales_ewm_alpha_095_lag_98,sales_ewm_alpha_095_lag_105,sales_ewm_alpha_095_lag_112,sales_ewm_alpha_095_lag_180,sales_ewm_alpha_095_lag_270,sales_ewm_alpha_095_lag_365,sales_ewm_alpha_095_lag_546,sales_ewm_alpha_095_lag_728,sales_ewm_alpha_09_lag_91,sales_ewm_alpha_09_lag_98,sales_ewm_alpha_09_lag_105,sales_ewm_alpha_09_lag_112,sales_ewm_alpha_09_lag_180,sales_ewm_alpha_09_lag_270,sales_ewm_alpha_09_lag_365,sales_ewm_alpha_09_lag_546,sales_ewm_alpha_09_lag_728,sales_ewm_alpha_08_lag_91,sales_ewm_alpha_08_lag_98,sales_ewm_alpha_08_lag_105,sales_ewm_alpha_08_lag_112,sales_ewm_alpha_08_lag_180,sales_ewm_alpha_08_lag_270,sales_ewm_alpha_08_lag_365,sales_ewm_alpha_08_lag_546,sales_ewm_alpha_08_lag_728,sales_ewm_alpha_07_lag_91,sales_ewm_alpha_07_lag_98,sales_ewm_alpha_07_lag_105,sales_ewm_alpha_07_lag_112,sales_ewm_alpha_07_lag_180,sales_ewm_alpha_07_lag_270,sales_ewm_alpha_07_lag_365,sales_ewm_alpha_07_lag_546,sales_ewm_alpha_07_lag_728,sales_ewm_alpha_05_lag_91,sales_ewm_alpha_05_lag_98,sales_ewm_alpha_05_lag_105,sales_ewm_alpha_05_lag_112,sales_ewm_alpha_05_lag_180,sales_ewm_alpha_05_lag_270,sales_ewm_alpha_05_lag_365,sales_ewm_alpha_05_lag_546,sales_ewm_alpha_05_lag_728
912990,2017-12-22,10,50,75.0,,12,22,356,51,5,2017,1,0,0,0,0,95.166292,93.968493,95.798573,96.306139,98.705264,99.166345,95.612762,66.203833,112.726271,54.701699,87.969272,80.373492,96.558870,93.783282,96.052632,95.625124,108.019324,66.744209,56.383382,113.870136,55.111306,96.040150,93.526617,95.110310,95.202057,108.874181,67.473304,56.728121,113.590268,55.230991,94.793227,92.861251,93.235229,94.234683,110.067047,68.859556,57.282272,112.631403,55.419134,93.311089,91.965316,91.368928,93.181016,110.525123,70.090358,57.656675,111.216414,55.443396,90.035940,89.600305,87.839022,91.394820,109.161929,71.734958,58.089336,107.424327,54.850299
912991,2017-12-23,10,50,70.0,,12,23,357,51,6,2017,1,0,0,0,0,96.039381,97.431591,92.300994,93.886870,93.002731,100.635800,129.968538,63.297359,116.470907,66.588060,90.570616,82.691603,96.977944,96.839164,93.152632,92.181256,83.300966,60.337210,63.619169,115.893507,70.205565,96.904015,96.652662,93.211031,92.320206,84.687418,60.747330,63.272812,115.759027,69.423099,96.558645,96.172250,93.047046,92.446937,87.613409,61.771911,62.656454,115.326281,67.883827,95.893327,95.489595,92.510678,92.354305,90.557537,63.027108,62.097002,114.564924,66.333019,93.517970,93.300152,90.419511,91.697410,95.580964,65.867479,61.044668,111.712163,62.925149
912992,2017-12-24,10,50,76.0,,12,24,358,51,7,2017,1,0,0,0,0,100.397702,107.063183,97.055553,101.345241,115.864237,98.035780,104.534412,64.077502,108.107484,57.586533,87.588159,85.607192,98.898897,106.491958,95.857632,101.509063,83.015048,72.366861,64.930958,108.394675,57.660278,98.790401,105.965266,95.721103,101.032021,83.168742,71.774733,64.827281,108.775903,58.242310,98.511729,104.834450,95.409409,100.089387,83.922682,70.754382,64.531291,109.465256,59.176765,98.067998,103.546878,94.953204,99.106291,85.267261,70.008132,64.129101,109.969477,59.799906,96.258985,100.150076,93.209756,96.848705,89.290482,69.433740,63.022334,109.856082,59.962575
912993,2017-12-25,10,50,51.0,,12,25,359,52,1,2017,0,0,0,0,0,64.580080,61.641808,74.707899,60.921627,98.680778,70.959583,82.414811,60.081029,69.630549,38.727466,89.176047,81.418061,64.794945,63.274598,75.092882,62.075453,90.600752,68.218343,64.046548,70.969734,42.783014,66.579040,65.496527,76.172110,64.103202,90.216874,68.377473,64.082728,72.977590,43.624231,70.102346,69.766890,78.281882,68.017877,89.584536,68.550876,64.106258,77.093051,45.435353,73.520399,73.764064,80.285961,71.731887,89.280178,68.602440,64.038730,81.290843,47.339972,79.629492,80.575038,83.604878,78.424353,90.145241,68.716870,63.511167,89.428041,50.981287
912994,2017-12-26,10,50,41.0,,12,26,360,52,2,2017,0,0,0,0,0,78.029804,82.913097,71.538942,80.815550,80.548033,79.306640,83.731092,61.471176,87.426671,45.961804,90.602669,79.204160,78.289747,82.963730,70.254644,80.053773,120.430038,68.960917,61.152327,86.198487,44.889151,77.757904,82.149653,70.617211,79.310320,118.821687,68.937747,61.308273,85.597759,44.862423,77.220469,81.153378,71.656376,78.403575,115.516907,68.910175,61.621252,85.018610,45.087071,77.356120,80.929219,73.085788,78.219566,112.184053,68.880732,61.911619,85.287253,45.701992,79.314746,82.287519,76.802439,79.712176,106.072621,68.858435,62.255584,88.214020,47.990644
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,2018-03-27,10,50,,44995.0,3,27,86,13,2,2018,0,0,0,0,0,41.035496,53.471711,65.921400,66.574881,67.547255,82.170129,76.913219,58.640642,94.440614,67.813116,87.080172,85.028879,41.561777,54.028375,66.387487,67.349358,81.895724,112.421502,66.744209,94.617783,71.979468,42.244368,54.197995,65.849785,66.785772,81.777579,112.682169,67.473304,92.561560,71.115472,43.955442,54.879654,64.996171,65.861563,81.488819,112.703381,68.859556,89.361543,69.838571,46.091666,55.909793,64.428121,65.144155,81.162051,112.055216,70.090358,87.248414,69.116907,51.309755,58.648702,64.033868,64.335072,80.828687,109.036310,71.734958,85.489012,68.933911
44996,2018-03-28,10,50,,44996.0,3,28,87,13,3,2018,0,0,0,0,0,64.305830,49.864437,70.128985,61.567266,75.185272,79.960690,80.970349,72.246622,80.037714,66.380798,87.622592,85.147034,61.928089,51.151419,66.969374,60.367468,89.594786,118.671075,60.337210,80.730889,68.198973,60.924437,51.319799,66.884978,60.678577,89.177758,118.368217,60.747330,81.256156,68.311547,59.191088,51.775931,66.599234,61.172313,88.297764,117.740676,61.771911,81.872309,68.367714,57.927500,52.472938,66.228436,61.543247,87.348615,116.916565,63.027108,82.174524,68.335072,57.154878,54.824351,65.516934,62.167536,85.414343,114.018155,65.867479,82.744506,68.466956
44997,2018-03-29,10,50,,44997.0,3,29,88,13,4,2018,0,0,0,0,0,56.963765,62.272422,71.641714,64.291749,72.563659,86.758995,82.684302,69.469131,95.438395,75.614672,83.761693,84.776102,59.146404,62.407571,71.748469,65.718373,102.329739,119.933554,72.366861,98.086544,74.659949,59.192444,61.831980,71.488498,65.467858,101.617776,119.836822,71.774733,97.225616,74.331155,59.038218,60.755186,70.919847,65.034463,100.059553,119.548135,70.754382,95.574462,73.673543,58.678250,59.841881,70.268531,64.662974,98.304585,119.074969,70.008132,93.952357,73.000522,58.077439,58.912176,68.758467,64.083768,94.207172,117.009078,69.433740,90.872253,71.733478
44998,2018-03-30,10,50,,44998.0,3,30,89,13,5,2018,1,0,0,0,0,74.553882,76.236873,73.681981,69.022497,68.084453,81.167700,88.689951,67.390748,81.700766,84.195485,90.297854,85.793919,73.257320,74.370379,71.987423,66.935919,99.166487,100.046678,68.218343,79.954327,82.582997,72.519244,73.683198,71.948850,66.846786,99.261778,101.083682,68.377473,80.822562,82.133115,71.007644,72.151037,71.783969,66.606893,99.211911,103.109627,68.550876,82.314892,81.134709,69.403475,70.452564,71.480559,66.298892,98.791375,105.022491,68.602440,83.485707,80.000156,66.038719,66.956088,70.379233,65.541884,96.603586,108.004539,68.716870,84.936127,77.366739


In [25]:
##Done few feature engineering now shape of data:
df.shape

(958000, 73)

In [26]:
#one-hot encoding ==> ensures that machine learning does not assume that higher numbers are more important

df = pd.get_dummies(df, columns=['store', 'item'])
df.head()

Unnamed: 0,date,sales,id,month,day_of_month,day_of_year,week_of_year,day_of_week,year,is_wknd,is_month_start,is_month_end,is_year_start,is_year_end,sales_lag_91,sales_lag_98,sales_lag_105,sales_lag_112,sales_lag_119,sales_lag_126,sales_lag_182,sales_lag_364,sales_lag_546,sales_lag_728,sales_roll_mean_365,sales_roll_mean_546,sales_ewm_alpha_095_lag_91,sales_ewm_alpha_095_lag_98,sales_ewm_alpha_095_lag_105,sales_ewm_alpha_095_lag_112,sales_ewm_alpha_095_lag_180,sales_ewm_alpha_095_lag_270,sales_ewm_alpha_095_lag_365,sales_ewm_alpha_095_lag_546,sales_ewm_alpha_095_lag_728,sales_ewm_alpha_09_lag_91,sales_ewm_alpha_09_lag_98,sales_ewm_alpha_09_lag_105,sales_ewm_alpha_09_lag_112,sales_ewm_alpha_09_lag_180,sales_ewm_alpha_09_lag_270,sales_ewm_alpha_09_lag_365,sales_ewm_alpha_09_lag_546,sales_ewm_alpha_09_lag_728,sales_ewm_alpha_08_lag_91,sales_ewm_alpha_08_lag_98,sales_ewm_alpha_08_lag_105,sales_ewm_alpha_08_lag_112,sales_ewm_alpha_08_lag_180,sales_ewm_alpha_08_lag_270,sales_ewm_alpha_08_lag_365,sales_ewm_alpha_08_lag_546,sales_ewm_alpha_08_lag_728,sales_ewm_alpha_07_lag_91,sales_ewm_alpha_07_lag_98,sales_ewm_alpha_07_lag_105,sales_ewm_alpha_07_lag_112,sales_ewm_alpha_07_lag_180,sales_ewm_alpha_07_lag_270,sales_ewm_alpha_07_lag_365,sales_ewm_alpha_07_lag_546,sales_ewm_alpha_07_lag_728,sales_ewm_alpha_05_lag_91,sales_ewm_alpha_05_lag_98,sales_ewm_alpha_05_lag_105,sales_ewm_alpha_05_lag_112,sales_ewm_alpha_05_lag_180,sales_ewm_alpha_05_lag_270,sales_ewm_alpha_05_lag_365,sales_ewm_alpha_05_lag_546,sales_ewm_alpha_05_lag_728,store_1,store_2,store_3,store_4,store_5,store_6,store_7,store_8,store_9,store_10,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,item_10,item_11,item_12,item_13,item_14,item_15,item_16,item_17,item_18,item_19,item_20,item_21,item_22,item_23,item_24,item_25,item_26,item_27,item_28,item_29,item_30,item_31,item_32,item_33,item_34,item_35,item_36,item_37,item_38,item_39,item_40,item_41,item_42,item_43,item_44,item_45,item_46,item_47,item_48,item_49,item_50
0,2013-01-01,13.0,,1,1,1,1,2,2013,0,1,0,1,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2013-01-02,11.0,,1,2,2,1,3,2013,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2013-01-03,14.0,,1,3,3,1,4,2013,0,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2013-01-04,13.0,,1,4,4,1,5,2013,1,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2013-01-05,10.0,,1,5,5,1,6,2013,1,0,0,0,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [27]:
df.shape

(958000, 131)

In [30]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [29]:
pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.13.2-cp39-cp39-win_amd64.whl (9.1 MB)
Collecting patsy>=0.5.2
  Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.2
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\users\epoch\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


In [32]:
train1 = pd.read_csv("train.csv", index_col=0, parse_dates=True)

In [36]:
# decompose_result = seasonal_decompose(train, model="multiplicative")

# trend = decompose_result.trend
# seasonal = decompose_result.seasonal
# residual = decompose_result.resid

# decompose_result.plot();

In [37]:
##SMAPE score

def smape(preds, target):
    n = len(preds)
    masked_arr = ~((preds == 0) & (target == 0))
    preds, target = preds[masked_arr], target[masked_arr]
    num = np.abs(preds - target)
    denom = np.abs(preds) + np.abs(target)
    smape_val = (200 * np.sum(num / denom)) / n
    return smape_val

# Calculating SMAPE for LightGBM output:
def lgbm_smape(preds, train_data):
    labels = train_data.get_label()
    smape_val = smape(np.expm1(preds), np.expm1(labels))
    return 'SMAPE', smape_val, False

# Split data

In [38]:
#  train and validation set.
train = df.loc[(df["date"] < "2017-01-01"), :] # Until beginning of 2017
val = df.loc[(df["date"] >= "2017-01-01") & (df["date"] < "2017-04-01"), :] # First 3 months of 2017

#dropping useless column
cols = [col for col in train.columns if col not in ['date', 'id', "sales", "year"]]

Y_train = train['sales']
X_train = train[cols]

Y_val = val['sales']
X_val = val[cols]

Y_train.shape, X_train.shape, Y_val.shape, X_val.shape


((730500,), (730500, 127), (45000,), (45000, 127))

# Lgbm model

In [39]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score ,mean_absolute_percentage_error

In [40]:
#lgbm in without any parameters 
first_model = lgb.LGBMRegressor().fit(X_train, Y_train)

print("TRAIN SMAPE:", smape(Y_train, first_model.predict(X_train)))
print("VALID SMAPE:", smape(Y_val, first_model.predict(X_val)))
pred = first_model.predict(X_train)
print("MAE:",mean_absolute_error(Y_train, pred))
print("RMSE:",mean_squared_error(Y_train, pred))
print("R2 Score:",r2_score(Y_train, pred))
print("MAPE:",mean_absolute_percentage_error(Y_train, pred))

TRAIN SMAPE: 13.378030408788051
VALID SMAPE: 14.123767326699774
MAE: 5.76007286776308
RMSE: 55.91634867368461
R2 Score: 0.9278090247006089
MAPE: 69096455522.33351


In [None]:
#Feature Importance 
"""calculate a score for all the input features for a given model
- the scores simply represent the “importance” of each feature"""


In [41]:
import joblib
joblib.dump(first_model,r'C:\Users\Epoch\OneDrive\Desktop\salesprediction_deployement\models\lgbm1.sav')

In [42]:
joblib.dump(first_model,r'C:\Users\Epoch\OneDrive\Desktop\salesprediction_deployement\models\lgbm.sav')

['C:\\Users\\Epoch\\OneDrive\\Desktop\\salesprediction_deployement\\models\\lgbm.sav']

# Catboost Regressor

In [43]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [17]:


from catboost import CatBoostRegressor
first_model = CatBoostRegressor()
first_model.fit(X_train, Y_train)

print("TRAIN SMAPE:", smape(Y_train, first_model.predict(X_train)))
print("VALID SMAPE:", smape(Y_val, first_model.predict(X_val)))



Learning rate set to 0.116046
0:	learn: 0.5260685	total: 537ms	remaining: 8m 56s
1:	learn: 0.4818381	total: 824ms	remaining: 6m 51s
2:	learn: 0.4432766	total: 1.08s	remaining: 6m
3:	learn: 0.4088533	total: 1.31s	remaining: 5m 26s
4:	learn: 0.3793073	total: 1.56s	remaining: 5m 10s
5:	learn: 0.3527497	total: 1.82s	remaining: 5m 2s
6:	learn: 0.3299499	total: 2.08s	remaining: 4m 55s
7:	learn: 0.3101735	total: 2.31s	remaining: 4m 46s
8:	learn: 0.2932433	total: 2.54s	remaining: 4m 40s
9:	learn: 0.2782458	total: 2.76s	remaining: 4m 32s
10:	learn: 0.2661738	total: 3.01s	remaining: 4m 30s
11:	learn: 0.2555653	total: 3.26s	remaining: 4m 28s
12:	learn: 0.2457106	total: 3.52s	remaining: 4m 27s
13:	learn: 0.2374096	total: 3.74s	remaining: 4m 23s
14:	learn: 0.2303368	total: 3.94s	remaining: 4m 18s
15:	learn: 0.2242352	total: 4.14s	remaining: 4m 14s
16:	learn: 0.2187807	total: 4.38s	remaining: 4m 13s
17:	learn: 0.2142518	total: 4.58s	remaining: 4m 9s
18:	learn: 0.2101221	total: 4.84s	remaining: 4m 10

In [21]:
from xgboost.sklearn import XGBRegressor
first_model = XGBRegressor()
first_model.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=4,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

In [22]:
print("TRAIN SMAPE:", smape(Y_train, first_model.predict(X_train)))
print("VALID SMAPE:", smape(Y_val, first_model.predict(X_val)))

TRAIN SMAPE: 3.575571957355945
VALID SMAPE: 3.8881205001529247
