In [1]:
import glob, re
import numpy as np
import pandas as pd
from sklearn import *
from datetime import datetime

data = {
    'tra': pd.read_csv('Raw/air_visit_data.csv'),
    'as': pd.read_csv('Raw/air_store_info.csv'),
    'hs': pd.read_csv('Raw/hpg_store_info.csv'),
    'ar': pd.read_csv('Raw/air_reserve.csv'),
    'hr': pd.read_csv('Raw/hpg_reserve.csv'),
    'id': pd.read_csv('Raw/store_id_relation.csv'),
    'tes': pd.read_csv('Raw/sample_submission.csv'),
    'hol': pd.read_csv('Raw/date_info.csv').rename(columns={'calendar_date':'visit_date'})
    }

data['hr'] = pd.merge(data['hr'], data['id'], how='inner', on=['hpg_store_id'])

for df in ['ar','hr']:
    data[df]['visit_datetime'] = pd.to_datetime(data[df]['visit_datetime'])
    data[df]['visit_datetime'] = data[df]['visit_datetime'].dt.date
    data[df]['reserve_datetime'] = pd.to_datetime(data[df]['reserve_datetime'])
    data[df]['reserve_datetime'] = data[df]['reserve_datetime'].dt.date
    data[df]['reserve_datetime_diff'] = data[df].apply(lambda r: (r['visit_datetime'] - r['reserve_datetime']).days, axis=1)
    tmp1 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].sum().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs1', 'reserve_visitors':'rv1'})
    tmp2 = data[df].groupby(['air_store_id','visit_datetime'], as_index=False)[['reserve_datetime_diff', 'reserve_visitors']].mean().rename(columns={'visit_datetime':'visit_date', 'reserve_datetime_diff': 'rs2', 'reserve_visitors':'rv2'})
    data[df] = pd.merge(tmp1, tmp2, how='inner', on=['air_store_id','visit_date'])




In [2]:
data['tra']['visit_date'] = pd.to_datetime(data['tra']['visit_date'])
data['tra']['dow'] = data['tra']['visit_date'].dt.dayofweek
data['tra']['year'] = data['tra']['visit_date'].dt.year
data['tra']['month'] = data['tra']['visit_date'].dt.month
data['tra']['visit_date'] = data['tra']['visit_date'].dt.date

data['tes']['visit_date'] = data['tes']['id'].map(lambda x: str(x).split('_')[2])
data['tes']['air_store_id'] = data['tes']['id'].map(lambda x: '_'.join(x.split('_')[:2]))
data['tes']['visit_date'] = pd.to_datetime(data['tes']['visit_date'])
data['tes']['dow'] = data['tes']['visit_date'].dt.dayofweek
data['tes']['year'] = data['tes']['visit_date'].dt.year
data['tes']['month'] = data['tes']['visit_date'].dt.month
data['tes']['visit_date'] = data['tes']['visit_date'].dt.date

In [3]:
unique_stores = data['tes']['air_store_id'].unique()
stores = pd.concat([pd.DataFrame({'air_store_id': unique_stores, 'dow': [i]*len(unique_stores)}) 
                    for i in range(7)], axis=0, ignore_index=True).reset_index(drop=True)


In [4]:
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].min().rename(columns={'visitors':'min_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow']) 
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].mean().rename(columns={'visitors':'mean_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].median().rename(columns={'visitors':'median_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].max().rename(columns={'visitors':'max_visitors'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow'])
tmp = data['tra'].groupby(['air_store_id','dow'], as_index=False)['visitors'].count().rename(columns={'visitors':'count_observations'})
stores = pd.merge(stores, tmp, how='left', on=['air_store_id','dow']) 

stores = pd.merge(stores, data['as'], how='left', on=['air_store_id']) 

stores.head()


Unnamed: 0,air_store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations,air_genre_name,air_area_name,latitude,longitude
0,air_00a91d42b08b08d9,0,1.0,22.457143,19.0,47.0,35.0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595
1,air_0164b9927d20bcc3,0,2.0,7.5,6.0,19.0,20.0,Italian/French,Tōkyō-to Minato-ku Shibakōen,35.658068,139.751599
2,air_0241aa3964b7f861,0,2.0,8.920635,8.0,23.0,63.0,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996
3,air_0328696196e46f18,0,2.0,6.416667,4.0,27.0,12.0,Dining bar,Ōsaka-fu Ōsaka-shi Nakanochō,34.701279,135.52809
4,air_034a3d5b40d5b1b1,0,1.0,11.864865,10.0,66.0,37.0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229


In [5]:
# NEW FEATURES FROM Georgii Vyshnia
stores['air_genre_name'] = stores['air_genre_name'].map(lambda x: str(str(x).replace('/',' ')))
stores['air_area_name'] = stores['air_area_name'].map(lambda x: str(str(x).replace('-',' ')))

lbl = preprocessing.LabelEncoder()
for i in range(10):
    stores['air_genre_name'+str(i)] = lbl.fit_transform(stores['air_genre_name'].map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))
    stores['air_area_name'+str(i)] = lbl.fit_transform(stores['air_area_name'].map(lambda x: str(str(x).split(' ')[i]) if len(str(x).split(' '))>i else ''))
stores['air_genre_name'] = lbl.fit_transform(stores['air_genre_name'])
stores['air_area_name'] = lbl.fit_transform(stores['air_area_name'])

In [6]:
stores.columns

Index(['air_store_id', 'dow', 'min_visitors', 'mean_visitors',
       'median_visitors', 'max_visitors', 'count_observations',
       'air_genre_name', 'air_area_name', 'latitude', 'longitude',
       'air_genre_name0', 'air_area_name0', 'air_genre_name1',
       'air_area_name1', 'air_genre_name2', 'air_area_name2',
       'air_genre_name3', 'air_area_name3', 'air_genre_name4',
       'air_area_name4', 'air_genre_name5', 'air_area_name5',
       'air_genre_name6', 'air_area_name6', 'air_genre_name7',
       'air_area_name7', 'air_genre_name8', 'air_area_name8',
       'air_genre_name9', 'air_area_name9'],
      dtype='object')

In [7]:
data['hol']['visit_date'] = pd.to_datetime(data['hol']['visit_date'])
data['hol']['day_of_week'] = lbl.fit_transform(data['hol']['day_of_week'])
data['hol']['visit_date'] = data['hol']['visit_date'].dt.date
train = pd.merge(data['tra'], data['hol'], how='left', on=['visit_date']) 
test = pd.merge(data['tes'], data['hol'], how='left', on=['visit_date']) 

In [12]:
train.head()

Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month,day_of_week,holiday_flg
0,air_ba937bf13d40fb24,2016-01-13,25,2,2016,1,6,0
1,air_ba937bf13d40fb24,2016-01-14,32,3,2016,1,4,0
2,air_ba937bf13d40fb24,2016-01-15,29,4,2016,1,0,0
3,air_ba937bf13d40fb24,2016-01-16,22,5,2016,1,2,0
4,air_ba937bf13d40fb24,2016-01-18,6,0,2016,1,1,0


In [13]:
train = pd.merge(train, stores, how='left', on=['air_store_id','dow']) 
test = pd.merge(test, stores, how='left', on=['air_store_id','dow'])

In [14]:
for df in ['ar','hr']:
    train = pd.merge(train, data[df], how='left', on=['air_store_id','visit_date']) 
    test = pd.merge(test, data[df], how='left', on=['air_store_id','visit_date'])

In [17]:
train['id'] = train.apply(lambda r: '_'.join([str(r['air_store_id']), str(r['visit_date'])]), axis=1)

In [18]:
train['total_reserv_sum'] = train['rv1_x'] + train['rv1_y']
train['total_reserv_mean'] = (train['rv2_x'] + train['rv2_y']) / 2
train['total_reserv_dt_diff_mean'] = (train['rs2_x'] + train['rs2_y']) / 2

test['total_reserv_sum'] = test['rv1_x'] + test['rv1_y']
test['total_reserv_mean'] = (test['rv2_x'] + test['rv2_y']) / 2
test['total_reserv_dt_diff_mean'] = (test['rs2_x'] + test['rs2_y']) / 2

In [19]:
train['date_int'] = train['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
test['date_int'] = test['visit_date'].apply(lambda x: x.strftime('%Y%m%d')).astype(int)
train['var_max_lat'] = train['latitude'].max() - train['latitude']
train['var_max_long'] = train['longitude'].max() - train['longitude']
test['var_max_lat'] = test['latitude'].max() - test['latitude']
test['var_max_long'] = test['longitude'].max() - test['longitude']

In [20]:
train.head()

Unnamed: 0,air_store_id,visit_date,visitors,dow,year,month,day_of_week,holiday_flg,min_visitors,mean_visitors,...,rv1_y,rs2_y,rv2_y,id,total_reserv_sum,total_reserv_mean,total_reserv_dt_diff_mean,date_int,var_max_lat,var_max_long
0,air_ba937bf13d40fb24,2016-01-13,25,2,2016,1,6,0,7.0,23.84375,...,,,,air_ba937bf13d40fb24_2016-01-13,,,,20160113,8.362564,4.521799
1,air_ba937bf13d40fb24,2016-01-14,32,3,2016,1,4,0,2.0,20.292308,...,,,,air_ba937bf13d40fb24_2016-01-14,,,,20160114,8.362564,4.521799
2,air_ba937bf13d40fb24,2016-01-15,29,4,2016,1,0,0,4.0,34.738462,...,,,,air_ba937bf13d40fb24_2016-01-15,,,,20160115,8.362564,4.521799
3,air_ba937bf13d40fb24,2016-01-16,22,5,2016,1,2,0,6.0,27.651515,...,,,,air_ba937bf13d40fb24_2016-01-16,,,,20160116,8.362564,4.521799
4,air_ba937bf13d40fb24,2016-01-18,6,0,2016,1,1,0,2.0,13.754386,...,,,,air_ba937bf13d40fb24_2016-01-18,,,,20160118,8.362564,4.521799


In [21]:
# NEW FEATURES FROM Georgii Vyshnia
train['lon_plus_lat'] = train['longitude'] + train['latitude'] 
test['lon_plus_lat'] = test['longitude'] + test['latitude']

lbl = preprocessing.LabelEncoder()
train['air_store_id2'] = lbl.fit_transform(train['air_store_id'])
test['air_store_id2'] = lbl.transform(test['air_store_id'])

In [22]:
col = [c for c in train if c not in ['id', 'air_store_id', 'visit_date','visitors']]
train = train.fillna(-1)
test = test.fillna(-1)

In [25]:
def RMSLE(y, pred):
    return metrics.mean_squared_error(y, pred)**0.5

In [26]:
model1 = ensemble.GradientBoostingRegressor(learning_rate=0.2, random_state=3)
model2 = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=4)
model1.fit(train[col], np.log1p(train['visitors'].values))
model2.fit(train[col], np.log1p(train['visitors'].values))
print('RMSE GradientBoostingRegressor: ', RMSLE(np.log1p(train['visitors'].values), model1.predict(train[col])))
print('RMSE KNeighborsRegressor: ', RMSLE(np.log1p(train['visitors'].values), model2.predict(train[col])))
test['visitors'] = (model1.predict(test[col]) + model2.predict(test[col])) / 2
test['visitors'] = np.expm1(test['visitors']).clip(lower=0.)
sub1 = test[['id','visitors']].copy()
del train; del data;

RMSE GradientBoostingRegressor:  0.502272686555
RMSE KNeighborsRegressor:  0.419451716114


In [28]:
dfs = { re.search('/([^/\.]*)\.csv', fn).group(1):
    pd.read_csv(fn)for fn in glob.glob('Raw/*.csv')}

for k, v in dfs.items(): locals()[k] = v

wkend_holidays = date_info.apply(
    (lambda x:(x.day_of_week=='Sunday' or x.day_of_week=='Saturday') and x.holiday_flg==1), axis=1)
date_info.loc[wkend_holidays, 'holiday_flg'] = 0
date_info['weight'] = ((date_info.index + 1) / len(date_info)) ** 5  

visit_data = air_visit_data.merge(date_info, left_on='visit_date', right_on='calendar_date', how='left')
visit_data.drop('calendar_date', axis=1, inplace=True)
visit_data['visitors'] = visit_data.visitors.map(pd.np.log1p)

wmean = lambda x:( (x.weight * x.visitors).sum() / x.weight.sum() )
visitors = visit_data.groupby(['air_store_id', 'day_of_week', 'holiday_flg']).apply(wmean).reset_index()
visitors.rename(columns={0:'visitors'}, inplace=True) # cumbersome, should be better ways.

sample_submission['air_store_id'] = sample_submission.id.map(lambda x: '_'.join(x.split('_')[:-1]))
sample_submission['calendar_date'] = sample_submission.id.map(lambda x: x.split('_')[2])
sample_submission.drop('visitors', axis=1, inplace=True)
sample_submission = sample_submission.merge(date_info, on='calendar_date', how='left')
sample_submission = sample_submission.merge(visitors, on=[
    'air_store_id', 'day_of_week', 'holiday_flg'], how='left')

missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[visitors.holiday_flg==0], on=('air_store_id', 'day_of_week'), 
    how='left')['visitors_y'].values

missings = sample_submission.visitors.isnull()
sample_submission.loc[missings, 'visitors'] = sample_submission[missings].merge(
    visitors[['air_store_id', 'visitors']].groupby('air_store_id').mean().reset_index(), 
    on='air_store_id', how='left')['visitors_y'].values

sample_submission['visitors'] = sample_submission.visitors.map(pd.np.expm1)
sub2 = sample_submission[['id', 'visitors']].copy()
sub_merge = pd.merge(sub1, sub2, on='id', how='inner')

In [29]:
sub_merge['visitors'] = (sub_merge['visitors_x'] + sub_merge['visitors_y']* 1.1)/2
sub_merge[['id', 'visitors']].to_csv('submission50.csv', index=False)

In [48]:
import numpy as np, pandas as pd
import os,glob, re
get_ipython().magic('matplotlib inline')

dfs = {re.search('([a-zA-Z_]*)\.csv', fn).group(1):pd.read_csv(fn) for fn in glob.glob(os.getcwd()+'//Raw//*.csv')}
print('data frames read:{}'.format(list(dfs.keys())))

print('local variables with the same names are created.')
for k, v in dfs.items(): locals()[k] = v


print("Raw shape of each dataset")
for k, v in dfs.items(): print("%s : "%k,v.shape)

print("Split id column in sample_submission")
sample_submission["air_store_id"],sample_submission["visit_date"] = sample_submission.id.str[:20],sample_submission.id.str[21:]
sample_submission.head()

print("Unique store Ids in each dataset")
for k, v in dfs.items(): 
    try:       
        print(k," - Unqiue air_stores: ",v.air_store_id.nunique())
    except:
        pass
    try:
        print(k," - Unqiue hpg_stores: ",v.hpg_store_id.nunique())
    except:
        pass

air_reserve['visit_date'] = pd.to_datetime(air_reserve['visit_datetime']).dt.date.astype(str)

reserve_summary = air_reserve.groupby(['air_store_id','visit_date'])['reserve_visitors'].sum().reset_index()

new_train = air_visit_data.merge(reserve_summary, on =['air_store_id','visit_date'],how = 'left').fillna(0)

new_train['walkins'] = new_train['visitors'] - new_train['reserve_visitors']
new_train.loc[new_train['walkins'] <0,'walkins'] = 0
new_train['noshows'] = new_train['reserve_visitors'] - new_train['visitors']
new_train.loc[new_train['noshows'] <0,'noshows'] = 0
new_train.head()

weekdayholidays = date_info.apply(lambda x: x.day_of_week in ['Saturday','Sunday'] and x.holiday_flg == 1,axis=1)
date_info.loc[weekdayholidays,'holiday_flg'] = 0

date_info['weights'] = ((date_info.index + 1)/ len(date_info))**7

new_train = new_train.merge(date_info,left_on = 'visit_date',right_on = 'calendar_date', how ='left').drop('calendar_date',axis = 1)

new_train['visitors'] = new_train['visitors'].apply(pd.np.log1p)
new_train['reserve_visitors'] = new_train['reserve_visitors'].apply(pd.np.log1p)
new_train['walkins'] = new_train['walkins'].apply(pd.np.log1p)
new_train['noshows'] = new_train['noshows'].apply(pd.np.log1p)

weighted_mean_visitors = lambda x : ((x.visitors * x.weights).sum() / (x.weights).sum())
visitors_per_weekday = new_train.groupby(['air_store_id','day_of_week','holiday_flg']).apply(weighted_mean_visitors).reset_index()

weighted_mean_reservations = lambda x : ((x.reserve_visitors * x.weights).sum() / (x.weights).sum())
reserves_per_weekday = new_train.groupby(['air_store_id','day_of_week','holiday_flg']).apply(weighted_mean_reservations).reset_index()
reserves_per_weekday.head()

weighted_mean_walkins = lambda x : ((x.walkins * x.weights).sum() / (x.weights).sum())
walkin_visitors_per_weekday = new_train.groupby(['air_store_id','day_of_week','holiday_flg']).apply(weighted_mean_walkins).reset_index()

weighted_mean_noshows = lambda x : ((x.noshows * x.weights).sum() / (x.weights).sum())
noshows_per_weekday = new_train.groupby(['air_store_id','day_of_week','holiday_flg']).apply(weighted_mean_noshows).reset_index()

summarized_train = visitors_per_weekday.merge(
    reserves_per_weekday, on= ['air_store_id','day_of_week','holiday_flg'],how = 'outer')

summarized_train.rename(columns={'0_x':'wt_visitors','0_y':'wt_reserves'},inplace = True)

summarized_train = summarized_train.merge(
    walkin_visitors_per_weekday, on = ['air_store_id','day_of_week','holiday_flg'],how ='outer')

summarized_train = summarized_train.merge(
    noshows_per_weekday, on= ['air_store_id','day_of_week','holiday_flg'],how = 'outer')

summarized_train.rename(columns={'0_x':'walkins','0_y':'noshows'},inplace = True)


test = sample_submission.merge(date_info,left_on='visit_date',right_on='calendar_date',how = 'left').drop(['calendar_date','weights'],axis = 1)

newtest = test.merge(reserve_summary, on=['air_store_id','visit_date'], how='left').fillna(0)

newtest = newtest.merge(summarized_train,on = ['air_store_id','day_of_week','holiday_flg'], how = 'left')

temp = newtest[newtest.wt_visitors.isnull()].merge(summarized_train[summarized_train.holiday_flg == 0]
                                            ,on = ['air_store_id','day_of_week'], how = 'left')


newtest.loc[newtest.wt_visitors.isnull(),'wt_visitors'] = temp['wt_visitors_y'].values
newtest.loc[newtest.wt_reserves.isnull(),'wt_reserves'] = temp['wt_reserves_y'].values
newtest.loc[newtest.walkins.isnull(),'walkins'] = temp['walkins_y'].values
newtest.loc[newtest.noshows.isnull(),'noshows'] = temp['noshows_y'].values


temp2 = newtest[newtest.wt_visitors.isnull()].merge(summarized_train[[
    'air_store_id','wt_visitors','wt_reserves','walkins','noshows']].groupby('air_store_id').mean().reset_index(),
                                                    on = 'air_store_id',how = "left")

newtest.loc[newtest.wt_visitors.isnull(),'wt_visitors'] = temp2['wt_visitors_y'].values
newtest.loc[newtest.wt_reserves.isnull(),'wt_reserves'] = temp2['wt_reserves_y'].values
newtest.loc[newtest.walkins.isnull(),'walkins'] = temp2['walkins_y'].values
newtest.loc[newtest.noshows.isnull(),'noshows'] = temp2['noshows_y'].values


max_visitors = air_visit_data.groupby('air_store_id')['visitors'].max().reset_index()
max_visitors.rename(columns = {'visitors' : 'max_cap'},inplace = True)

newtest = newtest.merge(max_visitors,on= 'air_store_id', how = 'left')

newtest.drop(['visitors','air_store_id','visit_date','day_of_week','holiday_flg'],inplace = True,axis = 1)

newtest['wt_visitors'] = newtest['wt_visitors'].apply(pd.np.expm1)
newtest['wt_reserves'] = newtest['wt_reserves'].apply(pd.np.expm1)
newtest['walkins'] = newtest['walkins'].apply(pd.np.expm1)
newtest['noshows'] = newtest['noshows'].apply(pd.np.expm1)

newtest['calculated_visits'] = ((newtest['reserve_visitors']+newtest['wt_reserves'])/2) +newtest['walkins'] - newtest['noshows']

newtest['visitors'] = newtest['calculated_visits']

# k = .6

# newtest['visitors'] = ((newtest['wt_visitors'] * k) + ((1-k)*newtest['calculated_visits']))


# newtest.loc[newtest['visitors'] > newtest['max_cap'],'visitors'] = newtest['max_cap']


# newtest.loc[newtest['visitors'] < 0,'visitors'] = newtest['wt_reserves']


result = newtest[['id','visitors']]


#result.to_csv('result_dump4.csv', float_format='%.4f', index=None)



data frames read:['air_reserve', 'air_store_info', 'air_visit_data', 'date_info', 'hpg_reserve', 'hpg_store_info', 'sample_submission', 'store_id_relation']
local variables with the same names are created.
Raw shape of each dataset
air_reserve :  (92378, 4)
air_store_info :  (829, 5)
air_visit_data :  (252108, 3)
date_info :  (517, 3)
hpg_reserve :  (2000320, 4)
hpg_store_info :  (4690, 5)
sample_submission :  (32019, 2)
store_id_relation :  (150, 2)
Split id column in sample_submission
Unique store Ids in each dataset
air_reserve  - Unqiue air_stores:  314
air_store_info  - Unqiue air_stores:  829
air_visit_data  - Unqiue air_stores:  829
hpg_reserve  - Unqiue hpg_stores:  13325
hpg_store_info  - Unqiue hpg_stores:  4690
sample_submission  - Unqiue air_stores:  821
store_id_relation  - Unqiue air_stores:  150
store_id_relation  - Unqiue hpg_stores:  150


In [49]:
result.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,2.0
1,air_00a91d42b08b08d9_2017-04-24,24.137141
2,air_00a91d42b08b08d9_2017-04-25,27.399424
3,air_00a91d42b08b08d9_2017-04-26,27.299089
4,air_00a91d42b08b08d9_2017-04-27,31.50742


In [50]:
sub_merge.head()

Unnamed: 0,id,visitors_x,visitors_y,visitors
0,air_00a91d42b08b08d9_2017-04-23,2.555948,2.0,2.377974
1,air_00a91d42b08b08d9_2017-04-24,20.944645,23.621632,23.46422
2,air_00a91d42b08b08d9_2017-04-25,25.206066,26.82313,27.355755
3,air_00a91d42b08b08d9_2017-04-26,27.57089,27.60092,28.965951
4,air_00a91d42b08b08d9_2017-04-27,31.519827,31.299646,32.974719


In [51]:
newsub = sub_merge[['id','visitors']].merge(result,how = 'inner',on = 'id')

In [56]:
newsub.head()

Unnamed: 0,id,visitors_x,visitors_y,visitors
0,air_00a91d42b08b08d9_2017-04-23,2.377974,2.0,2.777974
1,air_00a91d42b08b08d9_2017-04-24,23.46422,24.137141,28.188547
2,air_00a91d42b08b08d9_2017-04-25,27.355755,27.399424,32.720381
3,air_00a91d42b08b08d9_2017-04-26,28.965951,27.299089,34.486135
4,air_00a91d42b08b08d9_2017-04-27,32.974719,31.50742,39.234648


In [60]:
newsub['visitors'] = (newsub['visitors_x'] + newsub['visitors_y'])/2

In [61]:
newsub.describe()

Unnamed: 0,visitors_x,visitors_y,visitors
count,32019.0,32019.0,32019.0
mean,19.24714,17.466975,20.103755
std,13.801313,13.695208,14.943723
min,1.149288,-7.5,0.402735
25%,9.014777,7.416091,9.014223
50%,15.71217,13.781842,16.196971
75%,26.018804,23.869239,27.287538
max,143.001379,280.390355,182.310238


In [62]:
newsub[['id', 'visitors']].to_csv('submission52.csv', index=False)

# rank - 0.485