In [48]:
import numpy as np
import pandas as pd

In [49]:
train=pd.read_csv(r'originalni_datasetovi\train.csv',parse_dates=['date'])
train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [50]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB


In [51]:
train.isna().sum()

id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64

In [52]:
test=pd.read_csv(r'originalni_datasetovi\test.csv', parse_dates=['date'])
test.head()

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0


In [53]:
#reading other datasets
holidays=pd.read_csv(r'originalni_datasetovi\holidays_events.csv',parse_dates=['date'])
oil=pd.read_csv(r'originalni_datasetovi\oil.csv', parse_dates=['date'])
stores=pd.read_csv(r'originalni_datasetovi\stores.csv')
transactions=pd.read_csv(r'originalni_datasetovi\transactions.csv', parse_dates=['date'])

In [54]:
#check for missing dates in train
date_min_train=train.date.min()
date_max_train=train.date.max()
ndays_train=(date_max_train-date_min_train).days+1 
print("Number of days that should be in train: ",ndays_train)
actdays_train=train.date.nunique()
print("Number of days that are in train: ",actdays_train)
print("Missing days: ", ndays_train-actdays_train)

Number of days that should be in train:  1688
Number of days that are in train:  1684
Missing days:  4


In [55]:
#inspecting missing dates in train
missing_dates = pd.date_range(date_min_train, date_max_train).difference(train.date.unique()).tolist()
missing_dates

[Timestamp('2013-12-25 00:00:00'),
 Timestamp('2014-12-25 00:00:00'),
 Timestamp('2015-12-25 00:00:00'),
 Timestamp('2016-12-25 00:00:00')]

In [56]:
#check for missing dates in test
date_min_test=test.date.min()
date_max_test=test.date.max()
ndays_test=(date_max_test-date_min_test).days+1 
print("Number of days that should be in train: ",ndays_test)
actdays_test=test.date.nunique()
print("Number of days that are in train: ",actdays_test)
print("Missing days: ", ndays_test-actdays_test)

Number of days that should be in train:  16
Number of days that are in train:  16
Missing days:  0


In [57]:
#solving missing train dates

#makes multiindex with all combination of dates, stores and families (has missing dates)
multi_index=pd.MultiIndex.from_product(
    [pd.date_range(date_min_train,date_max_train), train.store_nbr.unique(), train.family.unique()],
    names=['date','store_nbr','family'])

#first, making MultiIndex with existing dates, stores and families
train=train.set_index(['date','store_nbr','family'])

#then, reindexing with multi_index, on places where old index (missing 4 dates) didn't exist are NaN values
train=train.reindex(multi_index)

#reseting so that the index levels become regular columns again
train=train.reset_index()

#filling missing values with zeros
train[['sales','onpromotion']]=train[['sales','onpromotion']].fillna(0)

In [79]:
#making new id column
train['id']=np.arange(0,train.shape[0])
#place it as a firs column
train=train[['id']+[col for col in train.columns if col!='id']]

train.head()

Unnamed: 0,date,store_nbr,family,id,sales,onpromotion
0,2013-01-01,1,AUTOMOTIVE,0,0.000,0.0
1,2013-01-01,1,BABY CARE,1,0.000,0.0
2,2013-01-01,1,BEAUTY,2,0.000,0.0
3,2013-01-01,1,BEVERAGES,3,0.000,0.0
4,2013-01-01,1,BOOKS,4,0.000,0.0
...,...,...,...,...,...,...
3008011,2017-08-15,9,POULTRY,3008011,438.133,0.0
3008012,2017-08-15,9,PREPARED FOODS,3008012,154.553,1.0
3008013,2017-08-15,9,PRODUCE,3008013,2419.729,148.0
3008014,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,3008014,121.000,8.0
