In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import gc

from jpholiday import is_holiday
from utils import one_hot_encoder, PARK_POINT, PARKS, loadpkl, save2pkl

# Row cap for quick / debug runs. Presumably meant to be handed to the loader
# functions' `num_rows` argument (and to read_csv's `nrows`) — TODO confirm.
num_rows=10000

%matplotlib inline

In [2]:
# Load datasets. Every input is a tab-separated file under ../input.
train_df = pd.read_csv('../input/train.tsv', sep='\t')
test_df = pd.read_csv('../input/test.tsv', sep='\t')

# Auxiliary data sources.
colopl = pd.read_csv('../input/colopl.tsv', sep='\t')
hotlink = pd.read_csv('../input/hotlink.tsv', sep='\t')
nied_oyama = pd.read_csv('../input/nied_oyama.tsv', sep='\t')
nightley = pd.read_csv('../input/nightley.tsv', sep='\t')
weather = pd.read_csv('../input/weather.tsv', sep='\t')

# jorudan is only partially loaded. The cap comes from the notebook-wide
# `num_rows` setting instead of a repeated literal; set num_rows = None in the
# config cell to read the whole file.
jorudan = pd.read_csv('../input/jorudan.tsv', sep='\t', nrows=num_rows)

# The sample submission has no header row; its first column becomes the index.
sub = pd.read_csv('../input/sample_submit.tsv', sep='\t', index_col=0, header=None)

In [3]:
# Stack train and test rows into a single frame. Test rows have no target,
# so `visitors` is filled with NaN for them.
test_df['visitors'] = np.nan

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
# reset_index() (without drop=True) keeps the previous row labels in an
# 'index' column, exactly as before.
df = pd.concat([train_df, test_df[['datetime', 'park', 'visitors']]]).reset_index()
df['datetime'] = pd.to_datetime(df['datetime'])

In [4]:
# Japanese holidays
def getJapaneseHolidays(dates, holiday_checker=None):
    """Return a 0/1 day-off flag for every timestamp in ``dates``.

    A day is flagged with 1 when at least one of the following holds:

    * ``holiday_checker`` reports it as a public holiday,
    * it is a Saturday or a Sunday,
    * it lies in the year-end / New-Year period (Dec 29 - Jan 3).

    The conditions are OR-ed together, so a public holiday that falls on a
    weekend is flagged 1 (summing the individual indicators would give 2).

    Parameters
    ----------
    dates : pandas.Series
        Series of datetime64 values (accessed through ``.dt``).
    holiday_checker : callable, optional
        Function taking a ``datetime.date`` and returning a truthy value for
        public holidays. Defaults to ``is_holiday`` imported at the top of
        the notebook.

    Returns
    -------
    pandas.Series
        Integer Series (0 or 1) aligned with ``dates``.
    """
    if holiday_checker is None:
        holiday_checker = is_holiday

    # Public holidays according to the calendar lookup.
    public_holiday = dates.dt.date.apply(holiday_checker).astype(bool)

    # Saturdays (weekday 5) and Sundays (weekday 6).
    weekend = dates.dt.weekday >= 5

    # Treat the six days Dec 29 - Jan 3 as days off.
    month = dates.dt.month
    day = dates.dt.day
    year_end = (month == 12) & (day >= 29)
    new_year = (month == 1) & (day <= 3)

    return (public_holiday | weekend | year_end | new_year).astype(int)


In [5]:
# Preview only the first rows instead of rendering the whole frame.
train_df.head(10)

Unnamed: 0,datetime,park,visitors
0,2015-01-01,阿寒摩周国立公園,11028
1,2015-01-01,十和田八幡平国立公園,34757
2,2015-01-01,日光国立公園,29714
3,2015-01-01,伊勢志摩国立公園,42652
4,2015-01-01,大山隠岐国立公園,3637
5,2015-01-01,阿蘇くじゅう国立公園,1369
6,2015-01-01,霧島錦江湾国立公園,35352
7,2015-01-01,慶良間諸島国立公園,151
8,2015-01-02,阿寒摩周国立公園,11153
9,2015-01-02,十和田八幡平国立公園,33795


In [6]:
# Preview only the first rows instead of rendering the whole frame.
test_df.head(10)

Unnamed: 0,index,datetime,park,visitors
0,0,2017-01-01,阿寒摩周国立公園,
1,1,2017-01-01,十和田八幡平国立公園,
2,2,2017-01-01,日光国立公園,
3,3,2017-01-01,伊勢志摩国立公園,
4,4,2017-01-01,大山隠岐国立公園,
5,5,2017-01-01,阿蘇くじゅう国立公園,
6,6,2017-01-01,霧島錦江湾国立公園,
7,7,2017-01-01,慶良間諸島国立公園,
8,8,2017-01-02,阿寒摩周国立公園,
9,9,2017-01-02,十和田八幡平国立公園,


In [7]:
# Preview only the first rows instead of rendering the whole frame.
colopl.head(10)

Unnamed: 0,year,month,park,country_jp,count
0,2016,1,阿寒摩周国立公園,アメリカ合衆国,1-9
1,2016,1,阿寒摩周国立公園,シンガポール,1-9
2,2016,1,阿寒摩周国立公園,マレーシア,1-9
3,2016,1,阿寒摩周国立公園,中国,1-9
4,2016,1,阿寒摩周国立公園,台湾,19
5,2016,1,阿寒摩周国立公園,朝鮮、南部,1-9
6,2016,1,阿寒摩周国立公園,香港(PRC),12
7,2016,1,十和田八幡平国立公園,アメリカ合衆国,1-9
8,2016,1,十和田八幡平国立公園,オーストラリア,1-9
9,2016,1,十和田八幡平国立公園,ノルウェー,1-9


In [8]:
# Preview only the first rows instead of rendering the whole frame.
hotlink.head(10)

Unnamed: 0,datetime,domain,keyword,count
0,2015-01-01,twitter_sampling,国立公園,148
1,2015-01-01,blog,アオウミガメ,11
2,2015-01-01,twitter_sampling,アオウミガメ,1
3,2015-01-01,bbs,阿波連岬園地,0
4,2015-01-01,blog,阿波連岬園地,0
5,2015-01-01,bbs,鬼怒川,2
6,2015-01-01,twitter_sampling,阿波連岬園地,0
7,2015-01-01,bbs,稲崎園地,0
8,2015-01-01,blog,稲崎園地,0
9,2015-01-01,twitter_sampling,稲崎園地,0


In [9]:
# Preview only the first rows instead of rendering the whole frame.
nied_oyama.head(10)

Unnamed: 0,日時,積雪深(cm),積雪深重量(kg m^-2),気温(℃),降水量(mm h-1)
0,2015-01-01 00:00:00,99.3,305.1,-4.8,1.0
1,2015-01-01 01:00:00,101.9,308.3,-4.8,1.0
2,2015-01-01 02:00:00,102.5,310.3,-5.3,0.0
3,2015-01-01 03:00:00,101.8,310.6,-5.5,0.0
4,2015-01-01 04:00:00,101.9,311.1,-6.1,0.5
5,2015-01-01 05:00:00,102.7,311.5,-7.0,0.0
6,2015-01-01 06:00:00,105.2,313.7,-7.8,0.0
7,2015-01-01 07:00:00,105.5,314.5,-8.2,0.5
8,2015-01-01 08:00:00,105.8,315.4,-8.5,0.0
9,2015-01-01 09:00:00,105.4,315.0,-8.4,0.5


In [10]:
# Preview only the first rows instead of rendering the whole frame.
nightley.head(10)

Unnamed: 0,datetime,Japan_count,Foreign_count
0,2016-01-01,28,0
1,2016-01-02,66,1
2,2016-01-03,75,6
3,2016-01-04,50,1
4,2016-01-05,25,0
5,2016-01-06,29,2
6,2016-01-07,7,1
7,2016-01-08,9,1
8,2016-01-09,43,2
9,2016-01-10,68,2


In [14]:
# Preview only the first rows instead of rendering the whole frame.
weather.head(10)

Unnamed: 0,年月日,地点,平均気温(℃),最高気温(℃),最低気温(℃),降水量の合計(mm),10分間降水量の最大(mm),日照時間(時間),合計全天日射量(MJ/㎡),最深積雪(cm),...,最多風向(16方位),平均蒸気圧(hPa),平均現地気圧(hPa),平均湿度(%),平均海面気圧(hPa),最小相対湿度(%),最低海面気圧(hPa),平均雲量(10分比),天気概況(昼:06時~18時),天気概況(夜:18時~翌日06時)
0,2015/1/1,十和田,-1.9,0.3,-2.9,5.0,1.0,2.1,,18.0,...,西南西,,,,,,,,,
1,2015/1/2,十和田,-3.9,-1.1,-8.0,0.0,0.0,3.3,,17.0,...,西北西,,,,,,,,,
2,2015/1/3,十和田,-4.0,-1.6,-8.9,2.0,0.5,1.4,,21.0,...,西南西,,,,,,,,,
3,2015/1/4,十和田,0.6,3.4,-1.6,0.0,0.0,2.6,,19.0,...,南西,,,,,,,,,
4,2015/1/5,十和田,0.8,5.4,-5.1,0.0,0.0,4.0,,14.0,...,西南西,,,,,,,,,
5,2015/1/6,十和田,2.0,7.9,-3.1,0.0,0.0,1.2,,10.0,...,西,,,,,,,,,
6,2015/1/7,十和田,-3.7,-1.2,-5.5,0.0,0.0,4.2,,11.0,...,西南西,,,,,,,,,
7,2015/1/8,十和田,0.2,2.2,-1.4,0.0,0.0,1.8,,9.0,...,西,,,,,,,,,
8,2015/1/9,十和田,-0.2,2.4,-3.9,0.5,0.5,2.3,,13.0,...,西,,,,,,,,,
9,2015/1/10,十和田,-0.5,1.8,-2.1,0.0,0.0,4.7,,10.0,...,西南西,,,,,,,,,


In [62]:
# Preprocess the agoop monthly mesh files
def agoop(num_rows=None, input_dir='../input/agoop/'):
    """Aggregate the agoop ``month_time_mesh100m_*`` files per park and month.

    Every matching TSV file in ``input_dir`` is pivoted to one row per
    (park, year, month), with one column per (dayflag, hour) pair holding the
    sum and the mean of ``population``. The (year, month) key of every row is
    then moved one month forward, and rows whose shifted year is 2018 or
    later are dropped.

    Parameters
    ----------
    num_rows : int, optional
        Maximum number of rows read from each input file (passed to
        ``pd.read_csv`` as ``nrows``). ``None`` reads each file completely.
    input_dir : str, optional
        Directory containing the agoop TSV files.

    Returns
    -------
    pandas.DataFrame
        Columns ``park``, ``year``, ``month`` plus the aggregated
        ``AGOOP_dayflag<d>_hour<h>SUM`` / ``...MEAN`` feature columns.

    Raises
    ------
    FileNotFoundError
        If ``input_dir`` contains no ``month_time_mesh100m_`` file.
    """
    monthly_frames = []

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(input_dir)):
        if 'month_time_mesh100m_' not in filename:
            continue

        # load tsv
        raw = pd.read_csv(os.path.join(input_dir, filename), sep='\t', nrows=num_rows)

        # aggregate with a pivot table
        pivoted = raw.pivot_table(index=['park', 'year', 'month'],
                                  columns=['dayflag', 'hour'],
                                  values='population',
                                  aggfunc=['sum', 'mean'])

        # flatten the (aggfunc, dayflag, hour) column tuples into plain names
        pivoted.columns = ['AGOOP_dayflag'+str(dayflag)+'_'+'hour'+str(hour)+func.upper()
                           for func, dayflag, hour in pivoted.columns]

        # collect; all frames are concatenated once after the loop
        monthly_frames.append(pivoted)

        print(filename+' done.')

    if not monthly_frames:
        raise FileNotFoundError(
            "no 'month_time_mesh100m_' files found in " + str(input_dir))

    result = pd.concat(monthly_frames).reset_index()

    # shift one month ahead: December rolls over to January of the next year
    december = result['month'] == 12
    result.loc[december, 'year'] += 1
    result['month'] = result['month'] % 12 + 1

    # drop data from 2018/1/1 onward
    result = result[result['year'] < 2018]

    return result

In [36]:
# Key each weather observation to the day after it was recorded.
# The column is derived from the raw '年月日' date column rather than from
# weather['datetime'] itself, so the cell runs on a fresh kernel and
# re-running it does not shift the dates a second time.
weather['datetime'] = pd.to_datetime(weather['年月日']) + datetime.timedelta(1)

In [78]:
# Read a CSV from the output directory. Judging by the file name these are
# presumably out-of-fold LightGBM predictions — TODO confirm.
oof_lgbm = pd.read_csv("../output/oof_lgbm.csv")

In [2]:
# NOTE(review): this rebinds `train_df` (earlier bound to the raw train.tsv
# frame) to whatever loadpkl returns for this path. loadpkl comes from the
# project-local utils module; if it unpickles, only use it on trusted files
# — confirm.
train_df = loadpkl('../output/train_df.pkl')

In [5]:
# Write the first 1000 rows of train_df to a CSV file (to_csv is called with
# its defaults, so the row index is written as well).
train_df[:1000].to_csv('../output/train_df.csv')