In [5]:
import pandas as pd
import numpy as np
import datetime
import os
import gc

from jpholiday import is_holiday
from utils import one_hot_encoder, PARK_POINT, PARKS, loadpkl, save2pkl

num_rows=10000

%matplotlib inline

In [2]:
# load datasets
train_df = pd.read_csv('../input/train.tsv', sep='\t')
test_df = pd.read_csv('../input/test.tsv', sep='\t')

colopl = pd.read_csv('../input/colopl.tsv', sep='\t')
hotlink = pd.read_csv('../input/hotlink.tsv', sep='\t')
nied_oyama = pd.read_csv('../input/nied_oyama.tsv', sep='\t')
nightley = pd.read_csv('../input/nightley.tsv', sep='\t')
weather = pd.read_csv('../input/weather.tsv', sep='\t')
jorudan = pd.read_csv('../input/jorudan.tsv', sep='\t', nrows=10000)
#jorudan = pd.read_csv('../input/jorudan.tsv', sep='\t')
sub = pd.read_csv('../input/sample_submit.tsv', sep='\t', index_col=0, header=None)

In [3]:
test_df['visitors'] = np.nan
df = train_df.append(test_df[['datetime', 'park', 'visitors']]).reset_index()
df['datetime'] = pd.to_datetime(df['datetime'])

In [4]:
# 日本の休日
def getJapaneseHolidays(dates):
    japanese_holiday = dates.dt.date.apply(is_holiday).astype(int)

    # 祝日データに土日を追加
    japanese_holiday += (dates.dt.weekday==5).astype(int)
    japanese_holiday += (dates.dt.weekday==6).astype(int)

    # 年末年始の6日間を休日に変更
    japanese_holiday += ((dates.dt.month==12)&(dates.dt.day==29)&(japanese_holiday==0)).astype(int)
    japanese_holiday += ((dates.dt.month==12)&(dates.dt.day==30)&(japanese_holiday==0)).astype(int)
    japanese_holiday += ((dates.dt.month==12)&(dates.dt.day==31)&(japanese_holiday==0)).astype(int)

    japanese_holiday += ((dates.dt.month==1)&(dates.dt.day==1)&(japanese_holiday==0)).astype(int)
    japanese_holiday += ((dates.dt.month==1)&(dates.dt.day==2)&(japanese_holiday==0)).astype(int)
    japanese_holiday += ((dates.dt.month==1)&(dates.dt.day==3)&(japanese_holiday==0)).astype(int)

    return japanese_holiday


In [5]:
train_df

Unnamed: 0,datetime,park,visitors
0,2015-01-01,阿寒摩周国立公園,11028
1,2015-01-01,十和田八幡平国立公園,34757
2,2015-01-01,日光国立公園,29714
3,2015-01-01,伊勢志摩国立公園,42652
4,2015-01-01,大山隠岐国立公園,3637
5,2015-01-01,阿蘇くじゅう国立公園,1369
6,2015-01-01,霧島錦江湾国立公園,35352
7,2015-01-01,慶良間諸島国立公園,151
8,2015-01-02,阿寒摩周国立公園,11153
9,2015-01-02,十和田八幡平国立公園,33795


In [6]:
test_df

Unnamed: 0,index,datetime,park,visitors
0,0,2017-01-01,阿寒摩周国立公園,
1,1,2017-01-01,十和田八幡平国立公園,
2,2,2017-01-01,日光国立公園,
3,3,2017-01-01,伊勢志摩国立公園,
4,4,2017-01-01,大山隠岐国立公園,
5,5,2017-01-01,阿蘇くじゅう国立公園,
6,6,2017-01-01,霧島錦江湾国立公園,
7,7,2017-01-01,慶良間諸島国立公園,
8,8,2017-01-02,阿寒摩周国立公園,
9,9,2017-01-02,十和田八幡平国立公園,


In [7]:
colopl

Unnamed: 0,year,month,park,country_jp,count
0,2016,1,阿寒摩周国立公園,アメリカ合衆国,1-9
1,2016,1,阿寒摩周国立公園,シンガポール,1-9
2,2016,1,阿寒摩周国立公園,マレーシア,1-9
3,2016,1,阿寒摩周国立公園,中国,1-9
4,2016,1,阿寒摩周国立公園,台湾,19
5,2016,1,阿寒摩周国立公園,朝鮮、南部,1-9
6,2016,1,阿寒摩周国立公園,香港(PRC),12
7,2016,1,十和田八幡平国立公園,アメリカ合衆国,1-9
8,2016,1,十和田八幡平国立公園,オーストラリア,1-9
9,2016,1,十和田八幡平国立公園,ノルウェー,1-9


In [8]:
hotlink

Unnamed: 0,datetime,domain,keyword,count
0,2015-01-01,twitter_sampling,国立公園,148
1,2015-01-01,blog,アオウミガメ,11
2,2015-01-01,twitter_sampling,アオウミガメ,1
3,2015-01-01,bbs,阿波連岬園地,0
4,2015-01-01,blog,阿波連岬園地,0
5,2015-01-01,bbs,鬼怒川,2
6,2015-01-01,twitter_sampling,阿波連岬園地,0
7,2015-01-01,bbs,稲崎園地,0
8,2015-01-01,blog,稲崎園地,0
9,2015-01-01,twitter_sampling,稲崎園地,0


In [9]:
nied_oyama

Unnamed: 0,日時,積雪深(cm),積雪深重量(kg m^-2),気温(℃),降水量(mm h-1)
0,2015-01-01 00:00:00,99.3,305.1,-4.8,1.0
1,2015-01-01 01:00:00,101.9,308.3,-4.8,1.0
2,2015-01-01 02:00:00,102.5,310.3,-5.3,0.0
3,2015-01-01 03:00:00,101.8,310.6,-5.5,0.0
4,2015-01-01 04:00:00,101.9,311.1,-6.1,0.5
5,2015-01-01 05:00:00,102.7,311.5,-7.0,0.0
6,2015-01-01 06:00:00,105.2,313.7,-7.8,0.0
7,2015-01-01 07:00:00,105.5,314.5,-8.2,0.5
8,2015-01-01 08:00:00,105.8,315.4,-8.5,0.0
9,2015-01-01 09:00:00,105.4,315.0,-8.4,0.5


In [10]:
nightley

Unnamed: 0,datetime,Japan_count,Foreign_count
0,2016-01-01,28,0
1,2016-01-02,66,1
2,2016-01-03,75,6
3,2016-01-04,50,1
4,2016-01-05,25,0
5,2016-01-06,29,2
6,2016-01-07,7,1
7,2016-01-08,9,1
8,2016-01-09,43,2
9,2016-01-10,68,2


In [14]:
weather

Unnamed: 0,年月日,地点,平均気温(℃),最高気温(℃),最低気温(℃),降水量の合計(mm),10分間降水量の最大(mm),日照時間(時間),合計全天日射量(MJ/㎡),最深積雪(cm),...,最多風向(16方位),平均蒸気圧(hPa),平均現地気圧(hPa),平均湿度(%),平均海面気圧(hPa),最小相対湿度(%),最低海面気圧(hPa),平均雲量(10分比),天気概況(昼:06時~18時),天気概況(夜:18時~翌日06時)
0,2015/1/1,十和田,-1.9,0.3,-2.9,5.0,1.0,2.1,,18.0,...,西南西,,,,,,,,,
1,2015/1/2,十和田,-3.9,-1.1,-8.0,0.0,0.0,3.3,,17.0,...,西北西,,,,,,,,,
2,2015/1/3,十和田,-4.0,-1.6,-8.9,2.0,0.5,1.4,,21.0,...,西南西,,,,,,,,,
3,2015/1/4,十和田,0.6,3.4,-1.6,0.0,0.0,2.6,,19.0,...,南西,,,,,,,,,
4,2015/1/5,十和田,0.8,5.4,-5.1,0.0,0.0,4.0,,14.0,...,西南西,,,,,,,,,
5,2015/1/6,十和田,2.0,7.9,-3.1,0.0,0.0,1.2,,10.0,...,西,,,,,,,,,
6,2015/1/7,十和田,-3.7,-1.2,-5.5,0.0,0.0,4.2,,11.0,...,西南西,,,,,,,,,
7,2015/1/8,十和田,0.2,2.2,-1.4,0.0,0.0,1.8,,9.0,...,西,,,,,,,,,
8,2015/1/9,十和田,-0.2,2.4,-3.9,0.5,0.5,2.3,,13.0,...,西,,,,,,,,,
9,2015/1/10,十和田,-0.5,1.8,-2.1,0.0,0.0,4.7,,10.0,...,西南西,,,,,,,,,


In [62]:
# Preprocess agoop.tsv
def agoop(num_rows=None):

    agoop =pd.DataFrame()

    for filename in os.listdir('../input/agoop/'):
        if 'month_time_mesh100m_' in filename:
            # load tsv
            tmp_agoop = pd.read_csv('../input/agoop/'+filename, sep='\t')

            # pivot tableで集約
            tmp_agoop = tmp_agoop.pivot_table(index=['park', 'year', 'month'],
                                              columns=['dayflag', 'hour'],
                                              values='population',
                                              aggfunc=[np.sum, 'mean'])

            # カラム名を変更
            tmp_agoop.columns = ['AGOOP_dayflag'+str(tup[1])+'_'+'hour'+str(tup[2])+tup[0].upper() for tup in tmp_agoop.columns.values]

            # merge
            agoop = agoop.append(tmp_agoop)

            del tmp_agoop
            gc.collect()

            print(filename+' done.')

    agoop = agoop.reset_index()

    # １ヶ月先にシフト
    for i, (y, m) in enumerate(zip(agoop['year'], agoop['month'])):
        if m==12:
            agoop.loc[i,'month']-=11
            agoop.loc[i,'year']+=1
        else:
            agoop.loc[i,'month']+=1

    # 2018/1/1以降のデータを削除
    agoop = agoop[agoop['year']<2018]

    return agoop

In [63]:
agoop()

month_time_mesh100m_201603.tsv done.
month_time_mesh100m_201709.tsv done.
month_time_mesh100m_201607.tsv done.
month_time_mesh100m_201511.tsv done.
month_time_mesh100m_201510.tsv done.
month_time_mesh100m_201509.tsv done.
month_time_mesh100m_201508.tsv done.
month_time_mesh100m_201710.tsv done.
month_time_mesh100m_201705.tsv done.
month_time_mesh100m_201610.tsv done.
month_time_mesh100m_201507.tsv done.
month_time_mesh100m_201707.tsv done.
month_time_mesh100m_201605.tsv done.
month_time_mesh100m_201604.tsv done.
month_time_mesh100m_201512.tsv done.
month_time_mesh100m_201503.tsv done.
month_time_mesh100m_201606.tsv done.
month_time_mesh100m_201708.tsv done.
month_time_mesh100m_201701.tsv done.
month_time_mesh100m_201504.tsv done.
month_time_mesh100m_201506.tsv done.
month_time_mesh100m_201612.tsv done.
month_time_mesh100m_201609.tsv done.
month_time_mesh100m_201505.tsv done.
month_time_mesh100m_201704.tsv done.
month_time_mesh100m_201702.tsv done.
month_time_mesh100m_201703.tsv done.
m

Unnamed: 0,park,year,month,AGOOP_dayflag0_hour0SUM,AGOOP_dayflag0_hour1SUM,AGOOP_dayflag0_hour2SUM,AGOOP_dayflag0_hour3SUM,AGOOP_dayflag0_hour4SUM,AGOOP_dayflag0_hour5SUM,AGOOP_dayflag0_hour6SUM,...,AGOOP_dayflag2_hour14MEAN,AGOOP_dayflag2_hour15MEAN,AGOOP_dayflag2_hour16MEAN,AGOOP_dayflag2_hour17MEAN,AGOOP_dayflag2_hour18MEAN,AGOOP_dayflag2_hour19MEAN,AGOOP_dayflag2_hour20MEAN,AGOOP_dayflag2_hour21MEAN,AGOOP_dayflag2_hour22MEAN,AGOOP_dayflag2_hour23MEAN
0,伊勢志摩国立公園,2016,4,99616,101729,101745,101708,101707,102132,102582,...,30.768497,29.291123,30.356198,32.088963,39.878184,46.792250,57.239819,62.921404,73.814906,80.858423
1,十和田八幡平国立公園,2016,4,3967,3980,3967,3870,3870,3674,3919,...,20.743719,22.160714,23.661972,31.436170,29.653061,34.621951,46.066667,48.125000,47.857143,53.019608
2,大山隠岐国立公園,2016,4,11510,11730,11771,11827,11827,11927,12634,...,50.928302,53.869565,54.019512,65.662651,77.700000,105.860000,106.990000,134.265823,145.200000,167.609375
3,慶良間諸島国立公園,2016,4,565,565,565,565,565,565,565,...,23.750000,23.666667,36.363636,26.866667,22.666667,24.000000,32.250000,36.300000,32.666667,51.857143
4,日光国立公園,2016,4,47980,48175,47822,47337,47253,47945,49550,...,23.188468,22.314884,23.582418,25.673591,31.313218,35.764640,38.780247,41.517881,44.670886,47.444772
5,阿寒摩周国立公園,2016,4,5502,5555,5555,5555,5555,5675,5516,...,11.911972,14.389105,18.197183,25.404412,37.010989,48.000000,39.719101,35.959184,30.238938,45.884058
6,阿蘇くじゅう国立公園,2016,4,53907,55157,55058,55046,55046,54823,55435,...,25.320563,24.522727,25.624283,28.790954,34.195088,42.399277,51.465998,53.550177,62.615811,73.952224
7,霧島錦江湾国立公園,2016,4,21452,21185,21590,21592,21592,21737,22672,...,30.931034,28.172196,29.656352,30.563604,36.536866,38.187179,42.929878,50.449438,55.753247,69.961326
8,伊勢志摩国立公園,2017,10,99382,100115,101146,102105,103298,104911,106242,...,33.046869,33.418126,34.101985,34.984597,38.299311,47.587733,60.547397,65.249825,84.762757,103.702938
9,十和田八幡平国立公園,2017,10,8174,8130,8192,8240,8083,8235,8993,...,14.825424,14.022530,15.588110,19.777108,27.142222,36.531646,46.576000,59.552083,65.779070,64.420455


In [58]:
tmp_agoop

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,...,mean,mean,mean,mean,mean,mean,mean,mean,mean,mean
Unnamed: 0_level_1,Unnamed: 1_level_1,dayflag,0,0,0,0,0,0,0,0,0,0,...,2,2,2,2,2,2,2,2,2,2
Unnamed: 0_level_2,Unnamed: 1_level_2,hour,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
park,year,month,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3,Unnamed: 23_level_3
伊勢志摩国立公園,2016,3,99616,101729,101745,101708,101707,102132,102582,102381,102331,104139,...,30.768497,29.291123,30.356198,32.088963,39.878184,46.79225,57.239819,62.921404,73.814906,80.858423
十和田八幡平国立公園,2016,3,3967,3980,3967,3870,3870,3674,3919,5223,6764,7090,...,20.743719,22.160714,23.661972,31.43617,29.653061,34.621951,46.066667,48.125,47.857143,53.019608
大山隠岐国立公園,2016,3,11510,11730,11771,11827,11827,11927,12634,13792,15299,17327,...,50.928302,53.869565,54.019512,65.662651,77.7,105.86,106.99,134.265823,145.2,167.609375
慶良間諸島国立公園,2016,3,565,565,565,565,565,565,565,551,545,395,...,23.75,23.666667,36.363636,26.866667,22.666667,24.0,32.25,36.3,32.666667,51.857143
日光国立公園,2016,3,47980,48175,47822,47337,47253,47945,49550,54924,60229,63377,...,23.188468,22.314884,23.582418,25.673591,31.313218,35.76464,38.780247,41.517881,44.670886,47.444772
阿寒摩周国立公園,2016,3,5502,5555,5555,5555,5555,5675,5516,4864,5870,5021,...,11.911972,14.389105,18.197183,25.404412,37.010989,48.0,39.719101,35.959184,30.238938,45.884058
阿蘇くじゅう国立公園,2016,3,53907,55157,55058,55046,55046,54823,55435,57911,62244,66924,...,25.320563,24.522727,25.624283,28.790954,34.195088,42.399277,51.465998,53.550177,62.615811,73.952224
霧島錦江湾国立公園,2016,3,21452,21185,21590,21592,21592,21737,22672,22735,23035,25371,...,30.931034,28.172196,29.656352,30.563604,36.536866,38.187179,42.929878,50.449438,55.753247,69.961326


In [36]:
weather['datetime'] = weather['datetime']+datetime.timedelta(1)

In [45]:
weather

Unnamed: 0_level_0,Unnamed: 1_level_0,10分間降水量の最大(mm),10分間降水量の最大(mm),合計全天日射量(MJ/㎡),合計全天日射量(MJ/㎡),最大風速(m/s),最大風速(m/s),最低気温(℃),最低気温(℃),平均現地気圧(hPa),平均現地気圧(hPa),...,平均蒸気圧(hPa),平均蒸気圧(hPa),降雪量合計(cm),降雪量合計(cm),平均海面気圧(hPa),平均海面気圧(hPa),最大瞬間風速(m/s),最大瞬間風速(m/s),平均風速(m/s),平均風速(m/s)
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean,...,sum,mean,sum,mean,sum,mean,sum,mean,sum,mean
park,datetime,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
伊勢志摩国立公園,2015-01-02,0.0,0.0,0.00,0.00,7.5,7.5,-0.2,-0.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,13.8,13.8,3.4,3.4
伊勢志摩国立公園,2015-01-03,0.0,0.0,0.00,0.00,5.6,5.6,-1.9,-1.9,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.8,11.8,2.3,2.3
伊勢志摩国立公園,2015-01-04,0.0,0.0,0.00,0.00,6.9,6.9,-1.4,-1.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.8,11.8,3.1,3.1
伊勢志摩国立公園,2015-01-05,0.0,0.0,0.00,0.00,6.6,6.6,-1.7,-1.7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.2,11.2,2.0,2.0
伊勢志摩国立公園,2015-01-06,0.0,0.0,0.00,0.00,6.6,6.6,0.6,0.6,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,11.0,11.0,2.1,2.1
伊勢志摩国立公園,2015-01-07,1.0,1.0,0.00,0.00,7.5,7.5,3.1,3.1,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,13.7,13.7,2.7,2.7
伊勢志摩国立公園,2015-01-08,0.0,0.0,0.00,0.00,10.0,10.0,3.2,3.2,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,16.3,16.3,6.1,6.1
伊勢志摩国立公園,2015-01-09,0.0,0.0,0.00,0.00,8.6,8.6,2.8,2.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,16.2,16.2,5.5,5.5
伊勢志摩国立公園,2015-01-10,0.0,0.0,0.00,0.00,7.7,7.7,3.8,3.8,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,13.1,13.1,4.3,4.3
伊勢志摩国立公園,2015-01-11,0.0,0.0,0.00,0.00,6.6,6.6,4.7,4.7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,12.0,12.0,3.9,3.9


In [78]:
oof_lgbm = pd.read_csv("../output/oof_lgbm.csv")

In [80]:
train_df = loadpkl('../output/train_df.pkl')

In [81]:
train_df

Unnamed: 0,index,datetime,visitors,japanese_holiday,num_holidays,new_years_day,golden_week,park_伊勢志摩国立公園,park_十和田八幡平国立公園,park_大山隠岐国立公園,...,JORUDAN_arrival_prefecture_鹿児島_SUM,JORUDAN_arrival_prefecture_鹿児島_MEAN,JORUDAN_departure_and_arrival_place_mean_sum,JORUDAN_departure_and_arrival_place_sum_sum,JORUDAN_departure_and_arrival_type__mean_sum,JORUDAN_departure_and_arrival_type_sum_sum,JORUDAN_departure_and_arrival_place_mean_ratio,JORUDAN_departure_and_arrival_place_sum_ratio,JORUDAN_departure_and_arrival_type_mean_ratio,JORUDAN_departure_and_arrival_type_sum_ratio
0,0,2015-01-01,11028.0,1,4,1,0,0,0,0,...,,,,,,,,,,
1,8,2015-01-02,11153.0,1,4,1,0,0,0,0,...,0.0,0.000000,1.0,90.0,1.0,90.0,0.666667,0.666667,0.607143,0.607143
2,16,2015-01-03,12343.0,1,4,1,0,0,0,0,...,0.0,0.000000,1.0,152.0,1.0,152.0,2.040000,2.040000,0.206349,0.206349
3,24,2015-01-04,6732.0,1,4,0,0,0,0,0,...,0.0,0.000000,1.0,155.0,1.0,155.0,0.781609,0.781609,0.210938,0.210938
4,32,2015-01-05,4877.0,0,0,0,0,0,0,0,...,0.0,0.000000,1.0,170.0,1.0,170.0,1.698413,1.698413,0.338583,0.338583
5,40,2015-01-06,4162.0,0,0,0,0,0,0,0,...,0.0,0.000000,1.0,209.0,1.0,209.0,1.271739,1.271739,0.471831,0.471831
6,48,2015-01-07,4569.0,0,0,0,0,0,0,0,...,0.0,0.000000,1.0,259.0,1.0,259.0,0.738255,0.738255,0.407609,0.407609
7,56,2015-01-08,2715.0,0,0,0,0,0,0,0,...,0.0,0.000000,1.0,269.0,1.0,269.0,0.842466,0.842466,0.280952,0.280952
8,64,2015-01-09,2258.0,0,0,0,0,0,0,0,...,0.0,0.000000,1.0,254.0,1.0,254.0,1.208696,1.208696,0.343915,0.343915
9,72,2015-01-10,4273.0,1,3,0,0,0,0,0,...,0.0,0.000000,1.0,297.0,1.0,297.0,0.800000,0.800000,0.237500,0.237500
