In [82]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from utils import plot_boxplot, get_seasonality_trend_overview, plot_period_mean, plot_periodogram, create_date_features, create_work_related_features, num_leading_zeros, num_trailing_zeros
import warnings
import statsmodels.api as sm
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# Render every column when displaying wide frames instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# NOTE(review): this silences *all* warnings for the whole session, including pandas
# deprecation / FutureWarnings; consider filtering only specific categories.
warnings.filterwarnings('ignore')

In [83]:
# Load the prepared table; `date` is parsed to datetime so that the date
# comparisons and subtractions in later cells work.
# Presumably holds train and test rows together (judging by the file name) - TODO confirm.
df_train = pd.read_csv('train_test_v4.csv', parse_dates=['date'])
df_train.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,work_day,N Batalla de Pichincha,N Carnaval,N Cyber Monday,N Dia de Difuntos,N Dia de la Madre,N Dia del Trabajo,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions,local_holidays,regional_holidays,national_holidays
0,2013-01-01,1,AUTOMOTIVE,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1
1,2013-01-01,1,BABY CARE,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1
2,2013-01-01,1,BEAUTY,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1
3,2013-01-01,1,BEVERAGES,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1
4,2013-01-01,1,BOOKS,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1


In [84]:
# Add calendar columns using the project helper imported from utils (its code is
# not part of this notebook). The preview shows the new columns from `month`
# through `season`.
df_train = create_date_features(df_train)
# NOTE: sample() is unseeded, so the rows shown differ between runs.
df_train.sample(5)

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,work_day,N Batalla de Pichincha,N Carnaval,N Cyber Monday,N Dia de Difuntos,N Dia de la Madre,N Dia del Trabajo,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions,local_holidays,regional_holidays,national_holidays,month,day_of_month,day_of_year,week_of_month,week_of_year,day_of_week,year,is_wknd,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,date_index,season
379642,2013-08-02,3,EGGS,550.0,0.0,Quito,Pichincha,D,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,106.94,3213.0,0,0,0,8,2,214,1,31,5,2013,1,3,0,0,0,0,0,0,213,2
2895573,2017-06-13,49,LINGERIE,16.0,0.0,Quito,Pichincha,A,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,46.41,2609.0,0,0,0,6,13,164,2,24,2,2017,0,2,0,0,0,0,0,0,1624,2
2411146,2016-09-15,4,BABY CARE,0.0,0.0,Quito,Pichincha,D,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,43.85,1197.0,0,0,0,9,15,259,3,37,4,2016,0,3,0,0,0,0,0,0,1353,3
422023,2013-08-25,45,LADIESWEAR,0.0,0.0,Quito,Pichincha,A,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0,106.08,4138.0,0,0,0,8,25,237,4,34,7,2013,1,3,0,0,0,0,0,0,236,2
2548993,2016-12-01,23,CLEANING,491.0,6.0,Ambato,Tungurahua,D,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,51.08,921.0,0,0,0,12,1,336,1,48,4,2016,0,4,1,0,0,0,0,0,1430,0


In [85]:
# Total sales per (date, store) reshaped to wide form: one row per date, one column per store.
# Passing values=['sales'] as a list produces two-level columns ('sales', store_nbr);
# the leading-zeros table built from this frame depends on that layout (its
# reset_index() output has a `level_0` column next to `store_nbr`).
train_wide = pd.pivot(df_train.groupby(['date','store_nbr'])['sales'].agg('sum').reset_index(), index=['date'], columns=['store_nbr'], values=['sales'])
train_wide.head()

Unnamed: 0_level_0,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales
store_nbr,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2,Unnamed: 26_level_2,Unnamed: 27_level_2,Unnamed: 28_level_2,Unnamed: 29_level_2,Unnamed: 30_level_2,Unnamed: 31_level_2,Unnamed: 32_level_2,Unnamed: 33_level_2,Unnamed: 34_level_2,Unnamed: 35_level_2,Unnamed: 36_level_2,Unnamed: 37_level_2,Unnamed: 38_level_2,Unnamed: 39_level_2,Unnamed: 40_level_2,Unnamed: 41_level_2,Unnamed: 42_level_2,Unnamed: 43_level_2,Unnamed: 44_level_2,Unnamed: 45_level_2,Unnamed: 46_level_2,Unnamed: 47_level_2,Unnamed: 48_level_2,Unnamed: 49_level_2,Unnamed: 50_level_2,Unnamed: 51_level_2,Unnamed: 52_level_2,Unnamed: 53_level_2,Unnamed: 54_level_2
2013-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2511.618999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01-02,7417.148,10266.718981,24060.348,10200.08398,10598.61902,13520.485002,11997.502,14659.328,15867.484,5430.088,18329.638,5854.517,5516.831,7698.253,6577.805,6242.572,7673.513,8204.567,5615.389008,0.0,0.0,0.0,6560.337,11018.852996,5316.224,5653.413,7207.547,5176.474,0.0,2623.009,6637.56,2441.508,5720.770998,7749.693,2978.224,0.0,11111.838999,9198.507,9956.37101,4965.014,5205.833,0.0,6054.335,30095.181,27612.05095,27149.92,27263.11101,19700.0,16281.845,16576.132,15134.49099,0.0,0.0,4973.285
2013-01-03,5873.244001,7612.54,18570.745025,7400.245,7770.968,9929.393,9277.958,10781.243,10431.719,4549.52998,12356.20504,4460.164,4852.438995,6158.575996,5691.447,5316.415,5523.241,5107.041,3783.087,0.0,0.0,0.0,4911.20401,10086.240998,4442.91302,3477.559,4387.566,4142.304,0.0,2538.315003,5437.826,2589.699001,5167.402,6246.722,2699.277,0.0,9335.661,7860.617,6986.445,4484.828,3699.117,0.0,5206.779,20447.057,17377.98097,15751.355,18340.205,14350.78403,9909.082,11066.46504,11170.057015,0.0,0.0,3901.57
2013-01-04,5919.879001,7526.377,17392.097995,6902.050001,7347.641991,8869.21198,8096.573018,11124.651,8085.159,3381.016,12171.998022,3684.053004,4249.484,5777.689,4825.64699,4616.99,5239.568995,4033.748,3233.503,0.0,0.0,0.0,5620.528,10059.480998,4844.354,3538.449,4586.466001,2615.644994,0.0,1987.33,6107.636,2705.500996,4282.112,6100.120001,2406.925001,0.0,8954.49001,7433.773,5605.498,3838.027,3581.132004,0.0,5499.704,22795.799,19936.19504,17073.06408,18982.00897,15276.803001,10284.566,12147.44,12452.326,0.0,0.0,3266.966
2013-01-05,6318.78501,9539.566,22700.872005,9948.383,8192.598,12431.68904,8610.605999,14277.69902,13057.11,5409.78501,14763.049,4818.668,6465.232,7974.621,5694.056,6014.655,7557.546,6510.638,6363.821023,0.0,0.0,0.0,6041.002,12031.90902,5817.526005,5503.186,6731.607015,5939.632,0.0,2811.187998,5875.18901,2623.645,5981.775,7189.406004,2981.362,0.0,11268.065,9842.551011,9256.675,5314.727,4694.56,0.0,4328.885,31382.508,27104.223034,27231.19803,26996.263995,22876.474,13506.025,17930.364,15046.247,0.0,0.0,4394.549


Trailing zeros occur only in the test set, so we don't use them as a new feature.

In [86]:
# Apply the utils helper to every store column of train_wide. Judging by its name it
# counts the zero-sales days at the start of each store's series - TODO confirm in utils.
leading_zeros = train_wide.apply(num_leading_zeros).to_frame('num_leading_zeros').reset_index()
# Drop stores whose count equals the number of dates in train_wide (the whole series).
leading_zeros = leading_zeros[leading_zeros['num_leading_zeros'] != train_wide.shape[0]]
leading_zeros

Unnamed: 0,level_0,store_nbr,num_leading_zeros
0,sales,1,1
1,sales,2,1
2,sales,3,1
3,sales,4,1
4,sales,5,1
5,sales,6,1
6,sales,7,1
7,sales,8,1
8,sales,9,1
9,sales,10,1


In [87]:
from datetime import datetime, timedelta

def check_for_leading(df: pd.Series, leading=None, start_date: datetime = datetime(2013, 1, 1)) -> int:
    """Flag whether a row falls after its store's initial run of zero-sales days.

    Args:
        df: A single row (despite the name) exposing ``'store_nbr'`` and ``'date'``.
        leading: Table with ``store_nbr`` and ``num_leading_zeros`` columns.
            Defaults to the notebook-level ``leading_zeros`` frame, so the
            function can still be passed directly to ``DataFrame.apply``.
        start_date: First date of the series the leading-zero counts refer to.
            Defaults to 2013-01-01, the value that was previously hard-coded.

    Returns:
        0 if the row's date lies inside the store's leading-zero period
        (i.e. it should not count as a work day), otherwise 1.
    """
    table = leading_zeros if leading is None else leading
    matches = table.loc[table['store_nbr'] == df['store_nbr'], 'num_leading_zeros']
    if matches.empty:
        # The store has no entry in the table. The notebook removes stores whose
        # zero run spans the entire series, so no date of such a store is past
        # its leading zeros. Previously this case raised an opaque TypeError
        # from int() on an empty Series.
        return 0
    # Take the scalar explicitly; int() on a Series is deprecated in recent pandas.
    first_active_day = start_date + timedelta(days=int(matches.iloc[0]))
    return 0 if df['date'] < first_active_day else 1

# Row-wise apply: check_for_leading runs once per row of df_train (the index values in
# the previews exceed 2.8M), so this cell is slow.
# NOTE(review): merging leading_zeros on store_nbr and comparing the dates column-wise
# would avoid the per-row Python call.
df_train['workday_because_of_leading_zeroes'] = df_train.apply(check_for_leading, axis=1)

In [88]:
# Spot-check the new workday_because_of_leading_zeroes column (last column in the preview).
# NOTE: sample() is unseeded, so the rows shown differ between runs.
df_train.sample(7)

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,work_day,N Batalla de Pichincha,N Carnaval,N Cyber Monday,N Dia de Difuntos,N Dia de la Madre,N Dia del Trabajo,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions,local_holidays,regional_holidays,national_holidays,month,day_of_month,day_of_year,week_of_month,week_of_year,day_of_week,year,is_wknd,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,date_index,season,workday_because_of_leading_zeroes
2777386,2017-04-08,32,CLEANING,655.0,15.0,Guayaquil,Guayas,C,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,52.52,705.0,0,0,0,4,8,98,2,14,6,2017,1,2,0,0,0,0,0,0,1558,1,1
777357,2014-03-13,13,DELI,103.0,0.0,Latacunga,Cotopaxi,C,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,98.57,791.0,0,0,0,3,13,72,2,11,4,2014,0,1,0,0,0,0,0,0,436,1,1
1338566,2015-01-22,9,LAWN AND GARDEN,7.0,0.0,Quito,Pichincha,B,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,45.93,1923.0,0,0,0,1,22,22,4,4,4,2015,0,1,0,0,0,0,0,0,751,0,1
1263325,2014-12-10,51,LADIESWEAR,7.0,0.0,Guayaquil,Guayas,A,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,60.99,2161.0,0,0,0,12,10,344,2,50,3,2014,0,4,0,0,0,0,0,0,708,0,1
228918,2013-05-09,25,PRODUCE,0.0,0.0,Salinas,Santa Elena,D,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,96.09,658.0,0,0,0,5,9,129,2,19,4,2013,0,2,0,0,0,0,0,0,128,1,1
1419711,2015-03-08,38,HOME CARE,0.0,0.0,Loja,Loja,D,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,49.836667,2639.0,0,0,0,3,8,67,2,10,7,2015,1,1,0,0,0,0,0,0,796,1,1
313522,2013-06-25,51,"LIQUOR,WINE,BEER",55.0,0.0,Guayaquil,Guayas,A,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,95.25,1504.0,0,0,0,6,25,176,4,26,2,2013,0,2,0,0,0,0,0,0,175,2,1


In [89]:
def create_work_related_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``workday`` and ``wageday`` indicator columns to ``df`` (in place).

    NOTE(review): this definition shadows the ``create_work_related_features``
    imported from ``utils`` at the top of the notebook.

    Args:
        df: Frame containing ``local_holidays``, ``national_holidays``,
            ``regional_holidays``, ``day_of_week``, ``is_month_end``,
            ``day_of_month`` and ``workday_because_of_leading_zeroes``.

    Returns:
        The same frame object, with ``workday`` and ``wageday`` added and the
        helper column ``workday_because_of_leading_zeroes`` removed.
    """
    # A day is not a work day if it is any kind of holiday, has day_of_week 6 or 7
    # (in the previewed data 2013-01-01, a Tuesday, is encoded as 2, so these are
    # Saturday and Sunday), or lies inside the store's leading-zero period.
    day_off = (
        (df['local_holidays'] == 1)
        | (df['national_holidays'] == 1)
        | (df['regional_holidays'] == 1)
        | df['day_of_week'].isin([6, 7])
        | (df['workday_because_of_leading_zeroes'] == 0)
    )
    df['workday'] = np.where(day_off, 0, 1)

    # Wage days: the 15th and the last day of each month.
    wage_day = (df['is_month_end'] == 1) | (df['day_of_month'] == 15)
    # Assign the array directly. Wrapping it in pd.Series() would attach a fresh
    # 0..n-1 index, and the assignment would then align on index labels, giving
    # NaN or misplaced values for any frame without a default RangeIndex.
    df['wageday'] = np.where(wage_day, 1, 0).astype('int8')

    # The helper column is consumed here; dropping it means this function cannot
    # be applied a second time to the same frame.
    df.drop(columns='workday_because_of_leading_zeroes', inplace=True)
    return df

# NOTE: not re-runnable as-is. The function drops workday_because_of_leading_zeroes,
# which it also reads, so the cell that creates that column must be re-run first.
df_train = create_work_related_features(df_train)

In [91]:
# Preview: `workday` and `wageday` are now the last two columns.
df_train.head()

Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,work_day,N Batalla de Pichincha,N Carnaval,N Cyber Monday,N Dia de Difuntos,N Dia de la Madre,N Dia del Trabajo,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions,local_holidays,regional_holidays,national_holidays,month,day_of_month,day_of_year,week_of_month,week_of_year,day_of_week,year,is_wknd,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,date_index,season,workday,wageday
0,2013-01-01,1,AUTOMOTIVE,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0
1,2013-01-01,1,BABY CARE,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0
2,2013-01-01,1,BEAUTY,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0
3,2013-01-01,1,BEVERAGES,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0
4,2013-01-01,1,BOOKS,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0


In [179]:
# NOTE(review): installing from inside an analysis cell makes the notebook depend on
# network access and on whatever version is current; prefer declaring the dependency in
# the environment, or pin it (the log below shows 0.16.0 was installed).
%pip install cacheout
from cacheout import Cache

# Notebook-level cache instance created with default settings.
cache = Cache()

def days_to_holiday(date, holidays, from_holiday=True):
    """Return the whole-day distance between a date and the adjacent holiday.

    Deliberately not memoized: when called with full DataFrame rows every
    argument is unique (date, store and family differ), so a cache keyed on the
    row can never hit. Callers should deduplicate dates instead.

    Args:
        date: Mapping-like object (DataFrame row, dict, ...) that exposes the
            timestamp under the ``'date'`` key.
        holidays: Iterable of holiday timestamps; it does not need to be sorted.
        from_holiday: If True (default), count the days elapsed since the most
            recent holiday on or before the date. If False, count the days
            remaining until the first holiday on or after the date.

    Returns:
        A non-negative number of days (0 when the date itself is a holiday), or
        ``np.nan`` when there is no holiday on the requested side of the date.
    """
    current = date['date']
    if from_holiday:
        # The latest holiday not after `current` is the closest one looking back.
        previous = max((h for h in holidays if h <= current), default=None)
        return np.nan if previous is None else (current - previous).days
    # The earliest holiday not before `current` is the closest one looking ahead.
    upcoming = min((h for h in holidays if h >= current), default=None)
    return np.nan if upcoming is None else (upcoming - current).days


def create_holiday_features(df, df_holidays):
    """Add distance-to-holiday columns to ``df`` (in place) and return it.

    Columns added:
        day_to_nearest_holiday: days until the next holiday on or after the date.
        day_from_nearest_holiday: days since the last holiday on or before the date.

    Earlier revisions filled these two columns the other way round (the
    "since last holiday" value was stored under ``day_to_nearest_holiday``);
    the names now match what is computed.

    Args:
        df: Frame with a datetime ``date`` column.
        df_holidays: Iterable of holiday timestamps.

    Returns:
        The same frame object with the two columns added. Values are NaN where
        no holiday exists on the corresponding side of the date.
    """
    holidays = list(df_holidays)
    since_last = {}
    until_next = {}
    # The distances depend only on the date, so compute them once per distinct
    # date rather than once per row, then broadcast back with map().
    for day in df['date'].drop_duplicates():
        key = {'date': day}
        since_last[day] = days_to_holiday(key, holidays, from_holiday=True)
        until_next[day] = days_to_holiday(key, holidays, from_holiday=False)
    df['day_to_nearest_holiday'] = df['date'].map(until_next)
    df['day_from_nearest_holiday'] = df['date'].map(since_last)
    return df

# Holiday calendar read from the originalni_datasetovi/ folder.
df_holidays = pd.read_csv('originalni_datasetovi/holidays_events.csv', parse_dates=['date'])
# Keep the unique dates of rows whose locale is 'National'.
# NOTE(review): from here on `df_holidays` is a plain list, not a DataFrame, and the
# filter looks at `locale` only - confirm that no other column should be considered.
df_holidays = list(set(df_holidays.loc[df_holidays['locale'] == 'National']['date'].to_list()))
df_holidays.sort()
# create_holiday_features adds its columns to the frame it receives and returns that
# same object, so df_traini and df_train refer to the same DataFrame afterwards.
df_traini = create_holiday_features(df_train, df_holidays)
df_traini.head()

Collecting cacheout
  Downloading cacheout-0.16.0-py3-none-any.whl.metadata (16 kB)
Downloading cacheout-0.16.0-py3-none-any.whl (21 kB)
Installing collected packages: cacheout
Successfully installed cacheout-0.16.0
Note: you may need to restart the kernel to use updated packages.


Unnamed: 0,date,store_nbr,family,sales,onpromotion,city,state,type,cluster,work_day,N Batalla de Pichincha,N Carnaval,N Cyber Monday,N Dia de Difuntos,N Dia de la Madre,N Dia del Trabajo,N Futbol,N Independencia de Cuenca,N Independencia de Guayaquil,N Navidad,N Primer dia del ano,N Terremoto Manabi,N Viernes Santo,oil_price,transactions,local_holidays,regional_holidays,national_holidays,month,day_of_month,day_of_year,week_of_month,week_of_year,day_of_week,year,is_wknd,quarter,is_month_start,is_month_end,is_quarter_start,is_quarter_end,is_year_start,is_year_end,date_index,season,workday,wageday,day_to_nearest_holiday,day_from_nearest_holiday
0,2013-01-01,1,AUTOMOTIVE,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0
1,2013-01-01,1,BABY CARE,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0
2,2013-01-01,1,BEAUTY,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0
3,2013-01-01,1,BEVERAGES,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0
4,2013-01-01,1,BOOKS,0.0,0.0,Quito,Pichincha,D,13,0,0,0,0,0,0,0,0,0,0,0,1,0,0,93.14,0.0,0,0,1,1,1,1,1,1,2,2013,0,1,1,0,1,0,1,0,0,0,0,0,0,0


In [180]:
# Persist the enriched frame. NOTE(review): to_csv writes the index by default, so
# reading this file back yields an extra unnamed first column unless index_col=0 is used.
df_traini.to_csv('posle_holidaya.csv')