In [1]:
import numpy as np, pandas as pd
import os,glob, re
%matplotlib inline

def read_raw_frames(raw_dir):
    """Read every ``*.csv`` file directly inside ``raw_dir`` into a DataFrame.

    Parameters
    ----------
    raw_dir : str
        Directory to scan (not recursive).

    Returns
    -------
    dict
        Maps each file's base name (without the extension) to the DataFrame
        returned by ``pd.read_csv``. Keys are inserted in sorted path order.
    """
    frames = {}
    # glob returns paths in arbitrary order; sort so the key order is stable.
    for csv_path in sorted(glob.glob(os.path.join(raw_dir, '*.csv'))):
        # Derive the key from the file name alone. A regex over the whole path
        # silently truncates names containing digits or hyphens and crashes on
        # a non-match.
        table_name = os.path.splitext(os.path.basename(csv_path))[0]
        frames[table_name] = pd.read_csv(csv_path)
    return frames


dfs = read_raw_frames(os.path.join(os.getcwd(), 'Raw'))
print('data frames read:{}'.format(list(dfs.keys())))

print('local variables with the same names are created.')
# Bind each frame to a notebook-level variable named after its file.
# globals() is used because writing through locals() is not a documented way
# to create variables.
globals().update(dfs)

data frames read:['air_reserve', 'air_store_info', 'air_visit_data', 'date_info', 'hpg_reserve', 'hpg_store_info', 'sample_submission', 'store_id_relation']
local variables with the same names are created.


In [2]:
print("Raw shape of each dataset")
# One line per loaded table: its name followed by the frame's .shape tuple.
for table_name, frame in dfs.items():
    print("%s : " % table_name, frame.shape)

Raw shape of each dataset
air_reserve :  (92378, 4)
air_store_info :  (829, 5)
air_visit_data :  (252108, 3)
date_info :  (517, 3)
hpg_reserve :  (2000320, 4)
hpg_store_info :  (4690, 5)
sample_submission :  (32019, 2)
store_id_relation :  (150, 2)


In [3]:
print("Unique store Ids in each dataset")
# Report the number of distinct store ids for whichever id columns a table has.
# Column presence is tested explicitly: a bare ``except: pass`` would also hide
# unrelated failures (typos, KeyboardInterrupt, ...).
for table_name, frame in dfs.items():
    if 'air_store_id' in frame.columns:
        print(table_name, " - Unqiue air_stores: ", frame['air_store_id'].nunique())
    if 'hpg_store_id' in frame.columns:
        print(table_name, " - Unqiue hpg_stores: ", frame['hpg_store_id'].nunique())

Unique store Ids in each dataset
air_reserve  - Unqiue air_stores:  314
air_store_info  - Unqiue air_stores:  829
air_visit_data  - Unqiue air_stores:  829
hpg_reserve  - Unqiue hpg_stores:  13325
hpg_store_info  - Unqiue hpg_stores:  4690
store_id_relation  - Unqiue air_stores:  150
store_id_relation  - Unqiue hpg_stores:  150


In [4]:
print("Convert hpg_reserve from hpg_store_id reference to air_store_id reference")
# Merge from the untouched raw frame in ``dfs``, not from ``hpg_reserve``
# itself. The cell rebinds ``hpg_reserve`` to a frame without ``hpg_store_id``,
# so merging from that name would raise KeyError on a second run.
# The inner join keeps only reservations whose hpg store has an air_store_id
# in store_id_relation.
hpg_reserve = (
    store_id_relation
    .merge(dfs['hpg_reserve'], on='hpg_store_id', how='inner')
    .drop('hpg_store_id', axis=1)
)
print(hpg_reserve.shape)
hpg_reserve.head()

Convert hpg_reserve from hpg_store_id reference to air_store_id reference
(28183, 4)


Unnamed: 0,air_store_id,visit_datetime,reserve_datetime,reserve_visitors
0,air_63b13c56b7201bd9,2016-01-04 12:00:00,2016-01-03 14:00:00,7
1,air_63b13c56b7201bd9,2016-01-04 14:00:00,2016-01-02 13:00:00,4
2,air_63b13c56b7201bd9,2016-01-05 12:00:00,2016-01-01 08:00:00,3
3,air_63b13c56b7201bd9,2016-01-09 12:00:00,2016-01-07 20:00:00,6
4,air_63b13c56b7201bd9,2016-01-11 12:00:00,2016-01-10 15:00:00,3


In [5]:
print("Convert hpg_store_info from hpg_store_id reference to air_store_id reference")
# Merge from the untouched raw frame in ``dfs``, not from ``hpg_store_info``
# itself. The cell rebinds ``hpg_store_info`` to a frame without
# ``hpg_store_id``, so merging from that name would raise KeyError on a second run.
# The inner join keeps only hpg stores that have an air_store_id in
# store_id_relation.
hpg_store_info = (
    store_id_relation
    .merge(dfs['hpg_store_info'], on='hpg_store_id', how='inner')
    .drop('hpg_store_id', axis=1)
)
print(hpg_store_info.shape)
hpg_store_info.head()

Convert hpg_store_info from hpg_store_id reference to air_store_id reference
(63, 5)


Unnamed: 0,air_store_id,hpg_genre_name,hpg_area_name,latitude,longitude
0,air_a24bf50c3e90d583,Italian,Shizuoka-ken Hamamatsu-shi Sukenobuchō,34.721644,137.738944
1,air_a38f25e3399d1b25,Japanese style,Tōkyō-to Chiyoda-ku None,35.69578,139.768453
2,air_3c938075889fc059,Japanese style,Fukuoka-ken Fukuoka-shi Ōmiya,33.579734,130.401762
3,air_ef789667e2e6fe96,International cuisine,Tōkyō-to Machida-shi Nakamachi,35.54967,139.448494
4,air_8f3b563416efc6ad,Creative Japanese food,Tōkyō-to Shinjuku-ku None,35.691384,139.701256


In [6]:
# Preview the first five calendar rows before any holiday features are added.
date_info.head(5)

Unnamed: 0,calendar_date,day_of_week,holiday_flg
0,2016-01-01,Friday,1
1,2016-01-02,Saturday,1
2,2016-01-03,Sunday,1
3,2016-01-04,Monday,0
4,2016-01-05,Tuesday,0


In [7]:
print("Add features to date_info")
print()
print("1. Remove Saturday and Sunday as flagged holidays")
# Vectorised mask of rows that are both a weekend day and flagged as a holiday
# (despite its name, this selects *weekend* holidays). Replaces a row-wise
# ``apply`` with the equivalent column operations.
weekdayholidays = date_info['day_of_week'].isin(['Saturday', 'Sunday']) & (date_info['holiday_flg'] == 1)
date_info.loc[weekdayholidays, 'holiday_flg'] = 0
print("2. Add day before the holiday feature")
# shift(-1) pairs each row with the *next* row's flag; the last row gets NaN,
# which compares unequal to 1 and therefore is never marked.
# NOTE(review): assumes rows are consecutive calendar dates in ascending order — confirm.
next_row_is_holiday = date_info['holiday_flg'].shift(-1) == 1
# A "day before holiday" is a non-holiday row immediately followed by a holiday row.
# This uses the flags *after* weekend holidays were cleared above.
day_b4_holiday = (date_info['holiday_flg'] == 0) & next_row_is_holiday
date_info['dayb4holiday'] = day_b4_holiday.astype('int64')
date_info.head()

Add features to date_info

1. Remove Saturday and Sunday as flagged holidays
2. Add day before the holiday feature


Unnamed: 0,calendar_date,day_of_week,holiday_flg,dayb4holiday
0,2016-01-01,Friday,1,0
1,2016-01-02,Saturday,0,0
2,2016-01-03,Sunday,0,0
3,2016-01-04,Monday,0,0
4,2016-01-05,Tuesday,0,0
