In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import metrics
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error as mae
import warnings
warnings.filterwarnings('ignore')

In [5]:
df = pd.read_csv('../data/StoreDemand.csv')
display(df.head())
display(df.tail())

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


Unnamed: 0,date,store,item,sales
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62
912999,2017-12-31,10,50,82


In [6]:
df.shape

(913000, 4)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   date    913000 non-null  object
 1   store   913000 non-null  int64 
 2   item    913000 non-null  int64 
 3   sales   913000 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 27.9+ MB


In [8]:
df.describe()

Unnamed: 0,store,item,sales
count,913000.0,913000.0,913000.0
mean,5.5,25.5,52.250287
std,2.872283,14.430878,28.801144
min,1.0,1.0,0.0
25%,3.0,13.0,30.0
50%,5.5,25.5,47.0
75%,8.0,38.0,70.0
max,10.0,50.0,231.0


## Feature Engineering

In [9]:
parts = df["date"].str.split("-", n = 3, expand = True)
df["year"] = parts[0].astype('int')
df["month"] = parts[1].astype('int')
df["day"] = parts[2].astype('int')
df.head

<bound method NDFrame.head of               date  store  item  sales  year  month  day
0       2013-01-01      1     1     13  2013      1    1
1       2013-01-02      1     1     11  2013      1    2
2       2013-01-03      1     1     14  2013      1    3
3       2013-01-04      1     1     13  2013      1    4
4       2013-01-05      1     1     10  2013      1    5
...            ...    ...   ...    ...   ...    ...  ...
912995  2017-12-27     10    50     63  2017     12   27
912996  2017-12-28     10    50     59  2017     12   28
912997  2017-12-29     10    50     74  2017     12   29
912998  2017-12-30     10    50     62  2017     12   30
912999  2017-12-31     10    50     82  2017     12   31

[913000 rows x 7 columns]>

In [13]:
print(parts)

           0   1   2
0       2013  01  01
1       2013  01  02
2       2013  01  03
3       2013  01  04
4       2013  01  05
...      ...  ..  ..
912995  2017  12  27
912996  2017  12  28
912997  2017  12  29
912998  2017  12  30
912999  2017  12  31

[913000 rows x 3 columns]


In [16]:
from datetime import datetime
import calendar

def weekend_or_weekday(year, month, day):
    d = datetime(year, month, day)
    if d.weekday() >= 5:
        return 1
    else: 
        return 0

df['weekend'] = df.apply(lambda x: weekend_or_weekday(x['year'], x['month'], x['day']), axis=1)
df.head()

Unnamed: 0,date,store,item,sales,year,month,day,weekend
0,2013-01-01,1,1,13,2013,1,1,0
1,2013-01-02,1,1,11,2013,1,2,0
2,2013-01-03,1,1,14,2013,1,3,0
3,2013-01-04,1,1,13,2013,1,4,0
4,2013-01-05,1,1,10,2013,1,5,1


In [18]:
from datetime import date
import holidays

def is_holiday(x):

    india_holidays = holidays.country_holidays('IN')

    if india_holidays.get(x):
        return 1
    else:
        return 0
df['holidays'] = df['date'].apply(is_holiday)
df.head()

KeyboardInterrupt: 