In [1]:
import numpy as np, pandas as pd
import os,glob, re

dfs = {re.search('/([^/\.]*)\.csv', fn).group(1):pd.read_csv(fn) for fn in glob.glob(os.getcwd()+'/Raw/*.csv')}
print('data frames read:{}'.format(list(dfs.keys())))

print('local variables with the same names are created.')
for k, v in dfs.items(): locals()[k] = v

data frames read:['air_reserve', 'air_store_info', 'air_visit_data', 'date_info', 'hpg_reserve', 'hpg_store_info', 'sample_submission', 'store_id_relation']
local variables with the same names are created.


In [2]:
# (rows, columns) of the calendar table before the date filter applied in a later cell.
date_info.shape

(517, 3)

In [3]:
# Holidays that fall on a Saturday or Sunday get their holiday flag cleared,
# so those days are handled as ordinary weekend days further down.
# (Despite its name, the mask below selects *weekend* holidays.)
falls_on_weekend = date_info['day_of_week'].isin(['Saturday', 'Sunday'])
flagged_as_holiday = date_info['holiday_flg'] == 1
weekdayholidays = falls_on_weekend & flagged_as_holiday
date_info.loc[weekdayholidays, 'holiday_flg'] = 0
date_info.head()

Unnamed: 0,calendar_date,day_of_week,holiday_flg
0,2016-01-01,Friday,1
1,2016-01-02,Saturday,0
2,2016-01-03,Sunday,0
3,2016-01-04,Monday,0
4,2016-01-05,Tuesday,0


In [4]:
# Keep only calendar rows strictly after 2016-06-30.
cutoff = pd.to_datetime('2016-06-30')
calendar_dates = pd.to_datetime(date_info.calendar_date)
date_info = date_info.loc[calendar_dates > cutoff].copy()
date_info.shape

(335, 3)

In [5]:
# Renumber rows from 0 so the index can serve as a day counter afterwards.
date_info = date_info.reset_index(drop=True)

In [6]:
# Show the last rows to confirm the index was renumbered after filtering.
date_info.tail()

Unnamed: 0,calendar_date,day_of_week,holiday_flg
330,2017-05-27,Saturday,0
331,2017-05-28,Sunday,0
332,2017-05-29,Monday,0
333,2017-05-30,Tuesday,0
334,2017-05-31,Wednesday,0


In [7]:
# Recency weight per calendar day: (1-based row position / number of rows) ** 7,
# so the last row gets weight 1.0 and earlier rows fall off steeply.
n_days = len(date_info)
day_rank = date_info.index + 1
date_info['weights'] = (day_rank / n_days) ** 7
date_info.tail()

Unnamed: 0,calendar_date,day_of_week,holiday_flg,weights
330,2017-05-27,Saturday,0,0.919353
331,2017-05-28,Sunday,0,0.938973
332,2017-05-29,Monday,0,0.95895
333,2017-05-30,Tuesday,0,0.979291
334,2017-05-31,Wednesday,0,1.0


In [8]:
# Peek at the visit history table (store id, visit date, visitor count).
air_visit_data.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6


In [9]:
# Attach weekday, holiday flag and recency weight to every visit record.
# Left join: visits whose date is absent from date_info keep NaN in those columns.
trainset = (
    air_visit_data
    .merge(date_info, how='left', left_on='visit_date', right_on='calendar_date')
    .drop('calendar_date', axis=1)
)

In [10]:
# Move the target to log1p space; the predictions are mapped back with expm1
# before the submission is written.
# numpy is called through the `np` import: the `pd.np` alias was deprecated in
# pandas 1.0 and removed in pandas 2.0, where it raises AttributeError.
trainset['visitors'] = np.log1p(trainset['visitors'])
trainset.tail()

Unnamed: 0,air_store_id,visit_date,visitors,day_of_week,holiday_flg,weights
252103,air_24e8414b9b07decb,2017-04-18,1.94591,Tuesday,0.0,0.382267
252104,air_24e8414b9b07decb,2017-04-19,1.94591,Wednesday,0.0,0.391525
252105,air_24e8414b9b07decb,2017-04-20,2.079442,Thursday,0.0,0.400975
252106,air_24e8414b9b07decb,2017-04-21,2.197225,Friday,0.0,0.41062
252107,air_24e8414b9b07decb,2017-04-22,1.791759,Saturday,0.0,0.420464


In [11]:
def weighted_mean(x):
    """Return the average of x.visitors weighted by x.weights."""
    weighted_total = (x.visitors * x.weights).sum()
    weight_total = x.weights.sum()
    return weighted_total / weight_total
# One weighted average per (store, weekday, holiday flag) combination.
group_keys = ['air_store_id', 'day_of_week', 'holiday_flg']
store_day_groups = trainset.groupby(group_keys)
visitors_per_weekday = store_day_groups.apply(weighted_mean).reset_index()

In [12]:
# The aggregated values come back in a column labelled 0; give it a real name.
visitors_per_weekday = visitors_per_weekday.rename(columns={0: 'visitors'})
visitors_per_weekday.head()

Unnamed: 0,air_store_id,day_of_week,holiday_flg,visitors
0,air_00a91d42b08b08d9,Friday,0.0,3.595823
1,air_00a91d42b08b08d9,Monday,0.0,3.244766
2,air_00a91d42b08b08d9,Monday,1.0,3.091042
3,air_00a91d42b08b08d9,Saturday,0.0,2.450829
4,air_00a91d42b08b08d9,Sunday,0.0,1.098612


In [13]:
# Each id has the form "<store id>_<date>". Split on the LAST underscore
# instead of slicing at fixed character offsets (20 / 21), so the result does
# not depend on every store id having exactly 20 characters.
id_parts = sample_submission['id'].str.rsplit('_', n=1, expand=True)
sample_submission['airstore'] = id_parts[0]
sample_submission['date'] = id_parts[1]
sample_submission.head()

Unnamed: 0,id,visitors,airstore,date
0,air_00a91d42b08b08d9_2017-04-23,0,air_00a91d42b08b08d9,2017-04-23
1,air_00a91d42b08b08d9_2017-04-24,0,air_00a91d42b08b08d9,2017-04-24
2,air_00a91d42b08b08d9_2017-04-25,0,air_00a91d42b08b08d9,2017-04-25
3,air_00a91d42b08b08d9_2017-04-26,0,air_00a91d42b08b08d9,2017-04-26
4,air_00a91d42b08b08d9_2017-04-27,0,air_00a91d42b08b08d9,2017-04-27


In [14]:
# Look up weekday, holiday flag and weight for every date to be predicted.
sample_submission = pd.merge(
    sample_submission,
    date_info,
    how='left',
    left_on='date',
    right_on='calendar_date',
)
sample_submission.tail()

Unnamed: 0,id,visitors,airstore,date,calendar_date,day_of_week,holiday_flg,weights
32014,air_fff68b929994bfbd_2017-05-27,0,air_fff68b929994bfbd,2017-05-27,2017-05-27,Saturday,0,0.919353
32015,air_fff68b929994bfbd_2017-05-28,0,air_fff68b929994bfbd,2017-05-28,2017-05-28,Sunday,0,0.938973
32016,air_fff68b929994bfbd_2017-05-29,0,air_fff68b929994bfbd,2017-05-29,2017-05-29,Monday,0,0.95895
32017,air_fff68b929994bfbd_2017-05-30,0,air_fff68b929994bfbd,2017-05-30,2017-05-30,Tuesday,0,0.979291
32018,air_fff68b929994bfbd_2017-05-31,0,air_fff68b929994bfbd,2017-05-31,2017-05-31,Wednesday,0,1.0


In [15]:
# Replace the placeholder 'visitors' column with the weighted average for the
# matching (store, weekday, holiday flag); combinations never seen in the
# training data stay NaN after this left join.
sample_submission = (
    sample_submission
    .drop('visitors', axis=1)
    .merge(
        visitors_per_weekday,
        how="left",
        left_on=['airstore', 'day_of_week', 'holiday_flg'],
        right_on=['air_store_id', 'day_of_week', 'holiday_flg'],
    )
)
sample_submission.head()

Unnamed: 0,id,airstore,date,calendar_date,day_of_week,holiday_flg,weights,air_store_id,visitors
0,air_00a91d42b08b08d9_2017-04-23,air_00a91d42b08b08d9,2017-04-23,2017-04-23,Sunday,0,0.430508,air_00a91d42b08b08d9,1.098612
1,air_00a91d42b08b08d9_2017-04-24,air_00a91d42b08b08d9,2017-04-24,2017-04-24,Monday,0,0.440758,air_00a91d42b08b08d9,3.244766
2,air_00a91d42b08b08d9_2017-04-25,air_00a91d42b08b08d9,2017-04-25,2017-04-25,Tuesday,0,0.451216,air_00a91d42b08b08d9,3.423676
3,air_00a91d42b08b08d9_2017-04-26,air_00a91d42b08b08d9,2017-04-26,2017-04-26,Wednesday,0,0.461886,air_00a91d42b08b08d9,3.367432
4,air_00a91d42b08b08d9_2017-04-27,air_00a91d42b08b08d9,2017-04-27,2017-04-27,Thursday,0,0.472772,air_00a91d42b08b08d9,3.528229


In [16]:
# Per column: True when no value is missing.
sample_submission.notnull().all(axis=0)

id                True
airstore          True
date              True
calendar_date     True
day_of_week       True
holiday_flg       True
weights           True
air_store_id     False
visitors         False
dtype: bool

In [17]:
# Rows that did not get a prediction from the exact (store, weekday, holiday) match.
sample_submission[sample_submission['visitors'].isnull()].shape

(740, 9)

In [18]:
# First fallback: rows still lacking a prediction borrow the non-holiday
# average of the same store and weekday.
still_missing = sample_submission.visitors.isnull()
non_holiday_means = visitors_per_weekday[visitors_per_weekday.holiday_flg == 0]
first_fallback = sample_submission[still_missing].merge(
    non_holiday_means,
    how="left",
    left_on=['airstore', 'day_of_week'],
    right_on=['air_store_id', 'day_of_week'],
)
# Both sides carry a 'visitors' column, so the looked-up value arrives as
# 'visitors_y'. .values assigns by position, which relies on the left join
# keeping the row order and row count of the masked subset.
sample_submission.loc[still_missing, 'visitors'] = first_fallback['visitors_y'].values
sample_submission.notnull().all(axis=0)

id                True
airstore          True
date              True
calendar_date     True
day_of_week       True
holiday_flg       True
weights           True
air_store_id     False
visitors         False
dtype: bool

In [19]:
# Rows still without a prediction after the non-holiday fallback.
sample_submission[sample_submission['visitors'].isnull()].shape

(484, 9)

In [20]:
# Last fallback: the plain (unweighted) mean of the store's per-weekday averages.
no_prediction = sample_submission.visitors.isnull()
store_means = (
    visitors_per_weekday[['air_store_id', 'visitors']]
    .groupby('air_store_id')
    .mean()
    .reset_index()
)
store_fallback = sample_submission[no_prediction].merge(
    store_means,
    how="left",
    left_on='airstore',
    right_on='air_store_id',
)
# The right-hand 'visitors' column is suffixed to 'visitors_y' by the merge;
# values are written back by position into the masked rows.
sample_submission.loc[no_prediction, 'visitors'] = store_fallback['visitors_y'].values

In [21]:
# Count of rows that remain without a prediction after both fallbacks.
sample_submission[sample_submission['visitors'].isnull()].shape

(0, 9)

In [22]:
# Keep only the two columns that go into the output file. The explicit
# .copy() makes this an independent frame, so assigning to 'visitors' in the
# next cell does not trigger pandas' SettingWithCopyWarning.
sample_submission = sample_submission[['id', 'visitors']].copy()

In [23]:
# Undo the log1p transform applied to the training target so the predictions
# are visitor counts again.
# numpy is called through the `np` import: the `pd.np` alias was deprecated in
# pandas 1.0 and removed in pandas 2.0, where it raises AttributeError.
sample_submission['visitors'] = np.expm1(sample_submission['visitors'])
sample_submission.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,2.0
1,air_00a91d42b08b08d9_2017-04-24,24.655711
2,air_00a91d42b08b08d9_2017-04-25,29.681993
3,air_00a91d42b08b08d9_2017-04-26,28.003939
4,air_00a91d42b08b08d9_2017-04-27,33.06359


In [25]:
# Write the predictions without the row index, rounded to four decimals.
sample_submission.to_csv(
    'sample_submission4.csv',
    index=False,
    float_format='%.4f',
)

In [None]:
# (rows, columns) of the frame that was written to the CSV file.
sample_submission.shape

In [None]:
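# Disabled sanity plot: total predicted visitors per day.
# NOTE: cell 22 reduces sample_submission to ['id', 'visitors'], so the 'date'
# column used below no longer exists at this point; re-derive it from 'id'
# (or run the plot before cell 22) when re-enabling.
# import seaborn as sns
# %matplotlib inline
# sample_submission.groupby('date')['visitors'].sum().plot()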

In [None]:
###RESULT : 0.503 RANK: 143