In [86]:
import numpy as np
import pandas as pd

In [87]:
train=pd.read_csv(r'originalni_datasetovi\train.csv',parse_dates=['date'])
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [88]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB


In [89]:
train.isna().sum()

id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64

In [90]:
test=pd.read_csv(r'originalni_datasetovi\test.csv', parse_dates=['date'])
test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [91]:
#reading other datasets
holidays=pd.read_csv(r'originalni_datasetovi\holidays_events.csv',parse_dates=['date'])
oil=pd.read_csv(r'originalni_datasetovi\oil.csv', parse_dates=['date'])
stores=pd.read_csv(r'originalni_datasetovi\stores.csv')
transactions=pd.read_csv(r'originalni_datasetovi\transactions.csv', parse_dates=['date'])

In [92]:
#check for missing dates in train
date_min_train=train.date.min()
date_max_train=train.date.max()
ndays_train=(date_max_train-date_min_train).days+1 
print("Number of days that should be in train: ",ndays_train)
actdays_train=train.date.nunique()
print("Number of days that are in train: ",actdays_train)
print("Missing days: ", ndays_train-actdays_train)

Number of days that should be in train:  1688
Number of days that are in train:  1684
Missing days:  4


In [93]:
#inspecting missing dates in train
missing_dates = pd.date_range(date_min_train, date_max_train).difference(train.date.unique()).tolist()
missing_dates

[Timestamp('2013-12-25 00:00:00'),
 Timestamp('2014-12-25 00:00:00'),
 Timestamp('2015-12-25 00:00:00'),
 Timestamp('2016-12-25 00:00:00')]

In [94]:
#check for missing dates in test
date_min_test=test.date.min()
date_max_test=test.date.max()
ndays_test=(date_max_test-date_min_test).days+1 
print("Number of days that should be in train: ",ndays_test)
actdays_test=test.date.nunique()
print("Number of days that are in train: ",actdays_test)
print("Missing days: ", ndays_test-actdays_test)

Number of days that should be in train:  16
Number of days that are in train:  16
Missing days:  0


In [95]:
#solving missing train dates

#makes multiindex with all combination of dates, stores and families (has missing dates)
multi_index=pd.MultiIndex.from_product(
    [pd.date_range(date_min_train,date_max_train), train.store_nbr.unique(), train.family.unique()],
    names=['date','store_nbr','family'])

#first, making MultiIndex with existing dates, stores and families
train=train.set_index(['date','store_nbr','family'])

#then, reindexing with multi_index, on places where old index (missing 4 dates) didn't exist are NaN values
train=train.reindex(multi_index)

#reseting so that the index levels become regular columns again
train=train.reset_index()

#filling missing values with zeros
train[['sales','onpromotion']]=train[['sales','onpromotion']].fillna(0)

In [96]:
#making new id column
train['id']=np.arange(0,train.shape[0])
#place it as a first column
train=train[['id']+[col for col in train.columns if col!='id']]

train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0.0
1,1,2013-01-01,1,BABY CARE,0.0,0.0
2,2,2013-01-01,1,BEAUTY,0.0,0.0
3,3,2013-01-01,1,BEVERAGES,0.0,0.0
4,4,2013-01-01,1,BOOKS,0.0,0.0


In [97]:
#dealing with holidays

#TYPE: transferred - not acc celebrated on that day

transferred_holidays = holidays[(holidays.type == "Holiday") & (holidays.transferred == True)].drop("transferred", axis = 1).reset_index(drop = True) #holidays that were transferred
transfer = holidays[(holidays.type == "Transfer")].drop("transferred", axis = 1).reset_index(drop = True) #days that they were transferred to (in ds they are always below the actual holiday so indexes are going to match)
tr = pd.concat([transferred_holidays,transfer], axis = 1) 
tr = tr.iloc[:, [5,1,2,3,4]] #getting rid of dupplicate columns, keeping date from transferred, everything else from transferred_holidays
tr

Unnamed: 0,date,type,locale,locale_name,description
0,2012-10-12,Holiday,National,Ecuador,Independencia de Guayaquil
1,2013-10-11,Holiday,National,Ecuador,Independencia de Guayaquil
2,2014-10-10,Holiday,National,Ecuador,Independencia de Guayaquil
3,2016-05-27,Holiday,National,Ecuador,Batalla de Pichincha
4,2016-07-24,Holiday,Local,Guayaquil,Fundacion de Guayaquil
5,2016-08-12,Holiday,National,Ecuador,Primer Grito de Independencia
6,2017-01-02,Holiday,National,Ecuador,Primer dia del ano
7,2017-04-13,Holiday,Local,Cuenca,Fundacion de Cuenca
8,2017-05-26,Holiday,National,Ecuador,Batalla de Pichincha
9,2017-08-11,Holiday,National,Ecuador,Primer Grito de Independencia


In [98]:
#deleting transferred holidays and transfer days from holidays and we don't need transferred column anymore
holidays=holidays[(holidays.transferred==False) & (holidays.type !='Transfer')].drop('transferred',axis=1)

#adding concated transferred holidays and transfer days
holidays=pd.concat([holidays,tr]).reset_index(drop=True)

holidays

Unnamed: 0,date,type,locale,locale_name,description
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba
...,...,...,...,...,...
333,2017-04-13,Holiday,Local,Cuenca,Fundacion de Cuenca
334,2017-05-26,Holiday,National,Ecuador,Batalla de Pichincha
335,2017-08-11,Holiday,National,Ecuador,Primer Grito de Independencia
336,2017-09-29,Holiday,Local,Ibarra,Fundacion de Ibarra


In [99]:
#TYPE: Additional 

#have the same name as regular holiday but with + or - number 
#removing +-number
holidays["description"] = holidays["description"].str.replace("-", "").str.replace("+", "").str.replace('\d+', '')

#TYPE: Bridge

#have the same name as regular holiday but with Puente at the beginning
#removing puente
holidays["description"] = holidays["description"].str.replace("Puente ", "")

In [100]:
#TYPE: Work Day

#work days are not actually holidays, they are just days that you wouldn't regularly work on but now you do
#removing them
holidays=holidays[holidays['type']!='Work Day']

In [101]:
#TYPE: Event

#"There are many football events referred to by the match name that include the word 'futbal'
holidays.loc[holidays["description"].str.contains("futbol"), "description"] = "Futbol"

In [103]:
pd.set_option('display.max_rows', None)
holidays

Unnamed: 0,date,type,locale,locale_name,description
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba
5,2012-05-12,Holiday,Local,Puyo,Cantonizacion del Puyo
6,2012-06-23,Holiday,Local,Guaranda,Cantonizacion de Guaranda
7,2012-06-25,Holiday,Regional,Imbabura,Provincializacion de Imbabura
8,2012-06-25,Holiday,Local,Latacunga,Cantonizacion de Latacunga
9,2012-06-25,Holiday,Local,Machala,Fundacion de Machala
