In [25]:
# Mount Google Drive so files stored under it can be read and written.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Model version tag (becomes part of the saved model's file name).
VER = '012'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ライブラリのインポート

In [26]:
import gc
import os
import re
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

!pip install japanize-matplotlib
import japanize_matplotlib
import folium


from tqdm import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# データのロード

In [27]:
# NOTE: adjust the project directory below to your own environment.

# Make the competition folder the working directory and remember its absolute path.
os.chdir('/content/drive/MyDrive/signate-908-hiroshima/')
path = os.getcwd()

# Water-level observations and the station master table.
waterlevel_dir = os.path.join(path, 'train', 'waterlevel')
water_data = pd.read_csv(os.path.join(waterlevel_dir, 'data.csv'))
water_stations = pd.read_csv(os.path.join(waterlevel_dir, 'stations.csv'))

# Rainfall / tide-level inputs are currently not loaded:
# rain_data = pd.read_csv(os.path.join(path, 'train', 'rainfall', 'data.csv'))
# rain_stations = pd.read_csv(os.path.join(path, 'train', 'rainfall', 'stations.csv'))
# tide_data = pd.read_csv(os.path.join(path, 'train', 'tidelevel', 'data.csv'))
# tide_stations = pd.read_csv(os.path.join(path, 'train', 'tidelevel', 'stations.csv'))

# データ前処理

In [28]:
def _rename_to_suffixed(station, suffix, keep_as_is=()):
    """Append `suffix` to bare station names whose suffixed form also occurs in `station`.

    Suffixed names listed in `keep_as_is` are not used to derive a bare name,
    so their bare counterparts are left unchanged.
    """
    # na=False: rows with a missing station name are simply treated as non-matching.
    has_suffix = station.str.contains(suffix, regex=False, na=False)
    suffixed_names = station[has_suffix].unique()
    bare_names = {name.replace(suffix, '') for name in suffixed_names if name not in keep_as_is}
    return station.where(~station.isin(list(bare_names)), station + suffix)


def preprocess_water_data_station(water_data, water_stations):
    """Clean station/river names in the water-level data and keep evaluated stations only.

    Parameters
    ----------
    water_data : pandas.DataFrame
        Water-level records; the 'station' and 'river' columns are read.
    water_stations : pandas.DataFrame
        Station master; the '観測所名称' and '評価対象' columns are read.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `water_data` with normalised names, restricted to stations whose
        '評価対象' flag is 1, and `water_stations` restricted to the same flag.
        Neither input frame is modified.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    water_data = water_data.copy()

    # A river given as a lone full-width space is treated as missing and filled in.
    water_data['river'] = water_data['river'].replace('\u3000', '沼田川')

    # Old bare names -> current "(国)" names.
    # 中野 / 伊尾 / 和木 also exist as separate stations without "(国)",
    # so those three are disambiguated by river instead.
    river_of_ambiguous = {'中野(国)': '太田川', '伊尾(国)': '芦田川', '和木(国)': '小瀬川'}
    water_data['station'] = _rename_to_suffixed(
        water_data['station'], '(国)', keep_as_is=river_of_ambiguous)
    for new_name, river in river_of_ambiguous.items():
        old_name = new_name.replace('(国)', '')
        is_target = (water_data['station'] == old_name) & (water_data['river'] == river)
        water_data.loc[is_target, 'station'] = new_name

    # Old bare names -> current "(電)" names.
    water_data['station'] = _rename_to_suffixed(water_data['station'], '(電)')

    # Names that look like data-entry mistakes.
    water_data['station'] = water_data['station'].replace({'藤波': '藤浪',
                                                           '中州橋': '中洲橋',
                                                           '段原': '段原(猿猴川)'})
    water_data['river'] = water_data['river'].replace({'手越川': '手城川',
                                                       '横川': '横川川'})

    # Keep only the stations that are evaluation targets.
    is_evaluated = water_stations['評価対象'] == 1
    in_stations = water_stations.loc[is_evaluated, '観測所名称'].unique()
    water_data = water_data[water_data['station'].isin(in_stations)]
    water_stations = water_stations[is_evaluated]

    return water_data, water_stations


# def preprocess_rain_data_station(rain_data):
#     """rain_dataの前処理
#     """
#     # 重複行を削除
#     rain_data = rain_data.drop_duplicates()

#     # 2191日分のデータがある観測所のみ使用する
#     stations = rain_data.groupby(['station', 'city']).count()['date'].reset_index()
#     stations = stations.loc[stations['date']==2191, 'station'].unique()
#     rain_data = rain_data[rain_data['station'].isin(stations)]

#     return rain_data

In [29]:
# Clean the station master. Only the second return value (the filtered
# station table) is kept; the cleaned water-level frame is not stored.
water_stations = preprocess_water_data_station(water_data, water_stations)[1]

# Reclaim the memory of the discarded frame right away.
gc.collect()
# rain_data = preprocess_rain_data_station(rain_data)

368

In [30]:
# 水位データを入力形式に整形
# ほぼrun.pyからコピペ
# stations = set(water_stations[water_stations['評価対象']==1]['観測所名称'])

# in_all_data = {}

# print('processing water data')
# for data in tqdm(water_data.groupby('date')):
#     day = data[0]
#     data_dict = data[1].to_dict('records')
#     in_data = []
#     for d in data_dict:
#         for k in d.keys():
#             if k not in ('date', 'station', 'river'):
#                 in_data.append({'station':d['station'], 'river':d['river'], 'hour':int(k.split(':')[0]), 'value':d[k]})
#     in_all_data[day] = {}
#     in_all_data[day]['date'] = day
#     in_all_data[day]['stations'] = stations
#     in_all_data[day]['waterlevel'] = in_data

# print('done')
# print('processing rain data')

# for data in tqdm(rain_data.groupby('date')):
#     day = data[0]
#     data_dict = data[1].to_dict('records')
#     in_data = []
#     for d in data_dict:
#         for k in d.keys():
#             if k not in ('date', 'station', 'city'):
#                 in_data.append({'station':d['station'], 'city':d['city'], 'hour':int(k.split(':')[0]), 'value':d[k]})
#     in_all_data[day]['rainfall'] = in_data

# print('done')


In [31]:
# 入力データをdfに変換
# water_df = []
# rain_df = []
# for d in tqdm(range(len(in_all_data))):
#     tmp = pd.DataFrame(in_all_data[d]['waterlevel'])
#     tmp['date'] = d
#     water_df.append(tmp)

    # tmp = pd.DataFrame(in_all_data[d]['rainfall'])
    # tmp['date'] = d
    # rain_df.append(tmp)

# water_df = pd.concat(water_df)
# rain_df = pd.concat(rain_df)

In [32]:
# 水位データを数値に変換
# water_df['value'] = pd.to_numeric(water_df['value'], errors='coerce')
# rain_df['value'] = pd.to_numeric(rain_df['value'], errors='coerce')
# rain_df.isna().sum()

# 水系名を追加
# water_df = water_df.merge(water_stations[['観測所名称', '河川名', '水系名']], left_on=['station', 'river'], right_on=['観測所名称', '河川名'], how='left').drop(['観測所名称', '河川名'], axis=1)
# water_df = water_df.sort_values('水系名')

In [33]:
# # 欠損値を線形補完
# dfs = []
# for group in tqdm(water_df.groupby('station')):
#     df = group[1]
#     dfs.append(df.interpolate())
# water_df_filled = pd.concat(dfs)


# # 観測所ごとに1日の平均をプロット
# fig, axes = plt.subplots(90, 2, figsize=(20, 270), tight_layout=True)
# for i, station in enumerate(tqdm(water_df['station'].unique())):
#     ax = axes[i//2, i%2]
#     df = water_df[water_df['station']==station]
#     df_filled = water_df_filled[water_df_filled['station']==station]
#     df.groupby('date').mean()['value'].plot(ax=ax, label='orginal')
#     df_filled.groupby('date').mean()['value'].plot(ax=ax, label='filled', alpha=0.5, style='-')
#     ax.legend()
#     ax.set_xlim(0, 2200)
#     ax.set_title(f'{station} / NaNの比率: {round(df["value"].isna().sum()/len(df)*100, 2)}%')
# fig.show()

In [34]:
# Load the water-level data that was already preprocessed and gap-filled.
filled_csv = os.path.join(path, 'train', 'waterlevel', 'data_filled_002.csv')
water_df = pd.read_csv(filled_csv)

# Attach the river-system name ('水系名') of each station.
station_to_system = water_stations[['観測所名称', '水系名']]
water_df = (
    water_df
    .merge(station_to_system, left_on=['station'], right_on=['観測所名称'], how='left')
    .drop(columns='観測所名称')
)
water_df.head()

Unnamed: 0,station,river,hour,value,date,水系名
0,七宝,沼田川,0,1.64,0,沼田川
1,七宝,沼田川,1,1.64,0,沼田川
2,七宝,沼田川,2,1.64,0,沼田川
3,七宝,沼田川,3,1.64,0,沼田川
4,七宝,沼田川,4,1.64,0,沼田川


In [35]:
# ワイドフォーマットに変更
# rain_df = rain_df[['station', 'date', 'hour', 'value']].pivot_table(index=['date', 'hour'], columns='station', values='value').reset_index()

In [36]:
# 水位データと降水量データをマージ
# water_df = water_df.merge(rain_df, on=['date', 'hour'], how='left')

# 特徴量エンジニアリング

In [37]:
# Build the target and the lag features station by station.
# Shifts are counted in rows; this assumes each station's rows are hourly and
# already in chronological order, so 24 rows == 24 hours — TODO confirm.
HORIZON = 24

dfs = []
for station_name, station_df in tqdm(water_df.groupby('station')):
    # Copy first: a groupby group is a slice of water_df, and adding columns to
    # a slice raises SettingWithCopyWarning.
    df = station_df.copy()

    # Target: the value HORIZON rows ahead.
    df['y'] = df['value'].shift(-HORIZON)

    # Lag features: the value HORIZON rows back and the change since then.
    df[f'shift_{HORIZON}h'] = df['value'].shift(HORIZON)
    df[f'diff_{HORIZON}h'] = df['value'].diff(periods=HORIZON)

    dfs.append(df)
water_df = pd.concat(dfs, axis=0).reset_index(drop=True)

100%|██████████| 166/166 [00:01<00:00, 95.06it/s] 


In [38]:
# Within-day changes: attach the 0h / 12h / 23h value of the same (date, station)
# to every row as value_0h / value_12h / value_23h.
for ref_hour in (0, 12, 23):
    at_ref_hour = water_df.loc[water_df['hour'] == ref_hour, ['date', 'station', 'value']]
    water_df = water_df.merge(
        at_ref_hour,
        on=['date', 'station'],
        how='left',
        suffixes=('', f'_{ref_hour}h'),
    )

# Differences between those reference hours.
water_df['diff_0to12'] = water_df['value_12h'] - water_df['value_0h']
water_df['diff_12to23'] = water_df['value_23h'] - water_df['value_12h']
water_df['diff_0to23'] = water_df['value_23h'] - water_df['value_0h']

# Rough calendar position derived from the integer date:
# position within a 365-step cycle, then 30-step buckets of it.
water_df['day'] = water_df['date'] % 365
water_df['month'] = water_df['day'] // 30

# モデリング

In [39]:
# ライブラリインポート
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, KFold

In [40]:
def get_rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [41]:
# Feature columns fed to the model.
features = ['date', 'hour', 'station', 'river', 'value_23h', 'diff_0to23', 'diff_12to23']

# Keep only the features and the target, and drop rows with any missing value.
# The index is reset so row labels run 0..n-1.
water_df = water_df[features + ['y']].dropna().reset_index(drop=True)


# Label encoding.
# Keep independent copies of the original names before they are replaced by codes.
station_org = water_df['station'].copy()
river_org = water_df['river'].copy()

label_enc_col = ['station', 'river']
# Keep every fitted encoder: the same name -> code mapping is needed to
# encode new data for this model.
label_encoders = {}
for col in label_enc_col:
    le = LabelEncoder()
    water_df[col] = le.fit_transform(water_df[col].values)
    label_encoders[col] = le

water_df.head()

Unnamed: 0,date,hour,station,river,value_23h,diff_0to23,diff_12to23,y
0,0,0,0,58,1.64,0.0,0.0,1.64
1,0,1,0,58,1.64,0.0,0.0,1.64
2,0,2,0,58,1.64,0.0,0.0,1.64
3,0,3,0,58,1.64,0.0,0.0,1.64
4,0,4,0,58,1.64,0.0,0.0,1.64


In [42]:
# Training parameters
params = {
    'objective': 'regression',
    'random_state': 42,
    'boosting_type': 'gbdt',
    'verbose': 10,
    'n_estimators': 10000
    }
verbose_eval = 1

# CV with TimeSeriesSplit
# ts = TimeSeriesSplit(n_splits=6)

# CV with KFold, splitting on dates so that one date never appears in both
# the training and the validation part of a fold.
kf = KFold(n_splits=3)

models = []
scores = []
oof_train = np.zeros((len(water_df),))

date = water_df['date'].unique()
for tr_idx, val_idx in kf.split(date):
    # KFold.split yields positions within `date`, not date values, so map them
    # back to the actual dates before filtering.
    date_tr, date_val = date[tr_idx], date[val_idx]
    train = water_df[water_df['date'].isin(date_tr)]
    valid = water_df[water_df['date'].isin(date_val)]

    X_train, y_train = train[features], train['y']
    X_valid, y_valid = valid[features], valid['y']

    model = lgb.LGBMRegressor(**params)
    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() arguments of
    # older LightGBM releases; LightGBM >= 4.0 expects
    # callbacks=[lgb.early_stopping(...), lgb.log_evaluation(...)] instead.
    model.fit(X_train, y_train,
            eval_metric='rmse',  # metric used for early stopping
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=10,
            verbose=verbose_eval
            )

    # Out-of-fold predictions. Index labels are used as positions in oof_train,
    # which requires water_df to have a 0..n-1 index.
    oof_train[X_valid.index] = model.predict(X_valid, num_iteration=model.best_iteration_)

    models.append(model)
    scores.append(get_rmse(y_valid, oof_train[X_valid.index]))

[1]	valid_0's rmse: 25.3846	valid_0's l2: 644.376
Training until validation scores don't improve for 10 rounds.
[2]	valid_0's rmse: 22.9705	valid_0's l2: 527.646
[3]	valid_0's rmse: 20.8045	valid_0's l2: 432.828
[4]	valid_0's rmse: 18.8503	valid_0's l2: 355.334
[5]	valid_0's rmse: 17.0976	valid_0's l2: 292.327
[6]	valid_0's rmse: 15.5234	valid_0's l2: 240.976
[7]	valid_0's rmse: 14.1007	valid_0's l2: 198.829
[8]	valid_0's rmse: 12.826	valid_0's l2: 164.507
[9]	valid_0's rmse: 11.6757	valid_0's l2: 136.322
[10]	valid_0's rmse: 10.6459	valid_0's l2: 113.336
[11]	valid_0's rmse: 9.72245	valid_0's l2: 94.526
[12]	valid_0's rmse: 8.88729	valid_0's l2: 78.9839
[13]	valid_0's rmse: 8.14272	valid_0's l2: 66.3039
[14]	valid_0's rmse: 7.46921	valid_0's l2: 55.7891
[15]	valid_0's rmse: 6.86825	valid_0's l2: 47.1728
[16]	valid_0's rmse: 6.41911	valid_0's l2: 41.205
[17]	valid_0's rmse: 6.01784	valid_0's l2: 36.2144
[18]	valid_0's rmse: 5.65727	valid_0's l2: 32.0047
[19]	valid_0's rmse: 5.33326	val

In [43]:
# Per-fold RMSE, followed by the mean of the fold RMSEs.
fold_rmse = [round(score, 5) for score in scores]
mean_rmse = round(np.mean(scores), 5)
print(f'[各foldのRMSE]{fold_rmse}')
print(f'[全体のRMSE] {mean_rmse}')

[各foldのRMSE][2.38368, 0.98401, 0.95928]
[全体のRMSE] 1.44233


009のRMSE  
[各foldのRMSE][2.4824, 1.01357, 0.9703]  
[全体のRMSE] 1.48876

In [44]:
# Print each fold's best iteration and collect the feature importances of all
# folds side by side (one 'importance_<fold>' column per fold).
print('[best_iteration]')
importance_columns = {'feature': features}
for fold, fitted in enumerate(models):
    print(f'fold {fold}: {fitted.best_iteration_}')
    importance_columns['importance_' + str(fold)] = fitted.feature_importances_
results = pd.DataFrame(importance_columns)

print('[feature_importances]')
display(results)

[best_iteration]
fold 0: 92
fold 1: 54
fold 2: 187
[feature_importances]


Unnamed: 0,feature,importance_0,importance_1,importance_2
0,date,309,134,659
1,hour,594,287,1304
2,station,213,183,581
3,river,322,206,581
4,value_23h,764,526,1189
5,diff_0to23,300,168,683
6,diff_12to23,258,116,613


In [21]:
# Per station: plot the daily mean of the target and of the out-of-fold prediction.
water_df['y_pred'] = oof_train
water_df['station_org'] = station_org
water_df['river_org'] = river_org

# Size the grid from the actual number of stations instead of hard-coding it.
station_names = water_df['station_org'].unique()
n_cols = 2
n_rows = max(1, (len(station_names) + n_cols - 1) // n_cols)
row_height = 3.25  # inches per subplot row (the original used 270in for 83 rows)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, row_height * n_rows),
                         tight_layout=True, squeeze=False)
for i, station in enumerate(tqdm(station_names)):
    ax = axes[i // n_cols, i % n_cols]
    df = water_df[water_df['station_org'] == station]
    river = df.iloc[0]['river_org']
    # Select the numeric columns before averaging: a mean over every column
    # would also hit the string columns (station_org / river_org).
    daily_mean = df.groupby('date')[['y', 'y_pred']].mean()
    daily_mean['y'].plot(ax=ax, label='true')
    daily_mean['y_pred'].plot(ax=ax, label='pred')
    ax.set_xlim(0, 2200)
    ax.set_title(f'観測所: {station} / 河川: {river}')
    ax.legend()

# Hide unused panels (odd number of stations).
for unused_ax in axes.flat[len(station_names):]:
    unused_ax.set_visible(False)
plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [22]:
# Retrain on the full data set.
# Number of boosting rounds = 1.5 x the smallest best_iteration over the CV folds.
best_iterations = [fitted.best_iteration_ for fitted in models]
n_estimators = int(min(best_iterations) * 1.5)
# n_estimators = 60

params = dict(
    objective='regression',
    random_state=42,
    boosting_type='gbdt',
    verbose=10,
    n_estimators=n_estimators,
)

model = lgb.LGBMRegressor(**params)
model.fit(water_df[features], water_df['y'])

LGBMRegressor(n_estimators=36, objective='regression', random_state=42,
              verbose=10)

# モデル保存

In [23]:
import pickle

In [24]:
# Save the retrained model as <project>/models/baseline_lightgbm_<VER>.pkl.
model_file = 'baseline_lightgbm_' + VER + '.pkl'
model_dir = os.path.join(path, 'models')
# Create the output directory if needed; open() does not create missing folders.
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, model_file)
with open(model_path, 'wb') as f:
    pickle.dump(model, f)