In [67]:
# Mount Google Drive (this import is only available inside Google Colab)
from google.colab import drive
drive.mount('/content/drive')

# Model version tag; it is embedded in the file name of the saved model
VER = '007'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ライブラリのインポート

In [68]:
import gc
import os
import re
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

!pip install japanize-matplotlib
import japanize_matplotlib
import folium


from tqdm import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# データのロード

In [69]:
# Adjust this directory to your own environment.

# Make the competition folder the working directory and remember it.
os.chdir('/content/drive/MyDrive/signate-908-hiroshima/')
path = os.getcwd()

# Load the raw water-level observations and the station master table.
waterlevel_dir = os.path.join(path, 'train', 'waterlevel')
water_data = pd.read_csv(os.path.join(waterlevel_dir, 'data.csv'))
water_stations = pd.read_csv(os.path.join(waterlevel_dir, 'stations.csv'))
# Rainfall / tide-level inputs are currently not used:
# rain_data = pd.read_csv(os.path.join(path, 'train', 'rainfall', 'data.csv'))
# rain_stations = pd.read_csv(os.path.join(path, 'train', 'rainfall', 'stations.csv'))
# tide_data = pd.read_csv(os.path.join(path, 'train', 'tidelevel', 'data.csv'))
# tide_stations = pd.read_csv(os.path.join(path, 'train', 'tidelevel', 'stations.csv'))

# データ前処理

In [70]:
def _unify_suffixed_stations(water_data, suffix, exclude=()):
    """Rename bare station names to their ``suffix``-ed spelling, in place.

    Every station name that contains ``suffix`` (and is not listed in
    ``exclude``) is stripped of the suffix to obtain the old, bare spelling;
    rows still using that bare spelling then get the suffix appended.
    """
    station = water_data['station']
    # na=False keeps the mask strictly boolean even when a station name is
    # missing; a mask containing NaN cannot be used for boolean indexing.
    has_suffix = station.str.contains(suffix, regex=False, na=False)
    bare_names = [name.replace(suffix, '')
                  for name in station[has_suffix].unique()
                  if name not in exclude]
    needs_suffix = station.isin(bare_names)
    water_data.loc[needs_suffix, 'station'] = station[needs_suffix] + suffix


def preprocess_water_data_station(water_data, water_stations):
    """Normalize station/river names and keep only the evaluated stations.

    Station names that also occur with a ``(国)`` or ``(電)`` suffix are unified
    to the suffixed spelling, a handful of spellings that look like typos are
    corrected, and both tables are restricted to stations whose ``評価対象``
    flag equals 1.

    Note:
        The name corrections are written into the ``water_data`` object that is
        passed in, so the caller's frame is modified. Only the final row
        filters create new objects.

    Args:
        water_data: DataFrame with at least ``station`` and ``river`` columns.
        water_stations: DataFrame with at least ``観測所名称`` and ``評価対象``
            columns.

    Returns:
        Tuple ``(water_data, water_stations)``, both filtered to the evaluated
        stations.
    """
    # Fill in the river name where it was recorded as a lone full-width space.
    water_data['river'] = water_data['river'].replace('\u3000', '沼田川')

    # Unify names that were later renamed with "(国)".
    # 中野 / 伊尾 / 和木 also exist as separate stations without "(国)", so
    # they are excluded here and disambiguated by river name below.
    _unify_suffixed_stations(water_data, '(国)',
                             exclude=['中野(国)', '伊尾(国)', '和木(国)'])
    for bare_name, river_name in (('中野', '太田川'), ('伊尾', '芦田川'), ('和木', '小瀬川')):
        on_river = (water_data['station'] == bare_name) & (water_data['river'] == river_name)
        water_data.loc[on_river, 'station'] = bare_name + '(国)'

    # Unify names that were later renamed with "(電)".
    _unify_suffixed_stations(water_data, '(電)')

    # Spellings that appear to be data-entry mistakes.
    water_data['station'] = water_data['station'].replace({'藤波': '藤浪',
                                                           '中州橋': '中洲橋',
                                                           '段原': '段原(猿猴川)'})
    water_data['river'] = water_data['river'].replace({'手越川': '手城川',
                                                       '横川': '横川川'})

    # Keep only the stations that are part of the evaluation.
    is_target = water_stations['評価対象'] == 1
    in_stations = water_stations.loc[is_target, '観測所名称'].unique()
    water_data = water_data[water_data['station'].isin(in_stations)]
    water_stations = water_stations[is_target]

    return water_data, water_stations


# def preprocess_rain_data_station(rain_data):
#     """rain_dataの前処理
#     """
#     # 重複行を削除
#     rain_data = rain_data.drop_duplicates()

#     # 2191日分のデータがある観測所のみ使用する
#     stations = rain_data.groupby(['station', 'city']).count()['date'].reset_index()
#     stations = stations.loc[stations['date']==2191, 'station'].unique()
#     rain_data = rain_data[rain_data['station'].isin(stations)]

#     return rain_data

In [71]:
# Clean the station tables.
# Only the filtered station master is kept; the returned water-level frame is
# discarded here (the function still corrects station/river names inside the
# `water_data` object it receives).
_, water_stations = preprocess_water_data_station(water_data, water_stations)
# rain_data = preprocess_rain_data_station(rain_data)

In [72]:
# 水位データを入力形式に整形
# ほぼrun.pyからコピペ
# stations = set(water_stations[water_stations['評価対象']==1]['観測所名称'])

# in_all_data = {}

# print('processing water data')
# for data in tqdm(water_data.groupby('date')):
#     day = data[0]
#     data_dict = data[1].to_dict('records')
#     in_data = []
#     for d in data_dict:
#         for k in d.keys():
#             if k not in ('date', 'station', 'river'):
#                 in_data.append({'station':d['station'], 'river':d['river'], 'hour':int(k.split(':')[0]), 'value':d[k]})
#     in_all_data[day] = {}
#     in_all_data[day]['date'] = day
#     in_all_data[day]['stations'] = stations
#     in_all_data[day]['waterlevel'] = in_data

# print('done')
# print('processing rain data')

# for data in tqdm(rain_data.groupby('date')):
#     day = data[0]
#     data_dict = data[1].to_dict('records')
#     in_data = []
#     for d in data_dict:
#         for k in d.keys():
#             if k not in ('date', 'station', 'city'):
#                 in_data.append({'station':d['station'], 'city':d['city'], 'hour':int(k.split(':')[0]), 'value':d[k]})
#     in_all_data[day]['rainfall'] = in_data

# print('done')


In [73]:
# 入力データをdfに変換
# water_df = []
# rain_df = []
# for d in tqdm(range(len(in_all_data))):
#     tmp = pd.DataFrame(in_all_data[d]['waterlevel'])
#     tmp['date'] = d
#     water_df.append(tmp)

    # tmp = pd.DataFrame(in_all_data[d]['rainfall'])
    # tmp['date'] = d
    # rain_df.append(tmp)

# water_df = pd.concat(water_df)
# rain_df = pd.concat(rain_df)

In [74]:
# 水位データを数値に変換
# water_df['value'] = pd.to_numeric(water_df['value'], errors='coerce')
# rain_df['value'] = pd.to_numeric(rain_df['value'], errors='coerce')
# rain_df.isna().sum()

# 水系名を追加
# water_df = water_df.merge(water_stations[['観測所名称', '河川名', '水系名']], left_on=['station', 'river'], right_on=['観測所名称', '河川名'], how='left').drop(['観測所名称', '河川名'], axis=1)
# water_df = water_df.sort_values('水系名')

In [75]:
# # 欠損値を線形補完
# dfs = []
# for group in tqdm(water_df.groupby('station')):
#     df = group[1]
#     dfs.append(df.interpolate())
# water_df_filled = pd.concat(dfs)


# # 観測所ごとに1日の平均をプロット
# fig, axes = plt.subplots(90, 2, figsize=(20, 270), tight_layout=True)
# for i, station in enumerate(tqdm(water_df['station'].unique())):
#     ax = axes[i//2, i%2]
#     df = water_df[water_df['station']==station]
#     df_filled = water_df_filled[water_df_filled['station']==station]
#     df.groupby('date').mean()['value'].plot(ax=ax, label='orginal')
#     df_filled.groupby('date').mean()['value'].plot(ax=ax, label='filled', alpha=0.5, style='-')
#     ax.legend()
#     ax.set_xlim(0, 2200)
#     ax.set_title(f'{station} / NaNの比率: {round(df["value"].isna().sum()/len(df)*100, 2)}%')
# fig.show()

In [76]:
# Load the already pre-processed (gap-filled) long-format water-level data.
filled_csv = os.path.join(path, 'train', 'waterlevel', 'data_filled_002.csv')
water_df = pd.read_csv(filled_csv)

# Attach the river-system name of each station.
station_to_system = water_stations[['観測所名称', '水系名']]
water_df = water_df.merge(
    station_to_system,
    how='left',
    left_on=['station'],
    right_on=['観測所名称'],
).drop('観測所名称', axis=1)
water_df.head()

Unnamed: 0,station,river,hour,value,date,水系名
0,七宝,沼田川,0,1.64,0,沼田川
1,七宝,沼田川,1,1.64,0,沼田川
2,七宝,沼田川,2,1.64,0,沼田川
3,七宝,沼田川,3,1.64,0,沼田川
4,七宝,沼田川,4,1.64,0,沼田川


In [77]:
# ワイドフォーマットに変更
# rain_df = rain_df[['station', 'date', 'hour', 'value']].pivot_table(index=['date', 'hour'], columns='station', values='value').reset_index()

In [78]:
# 水位データと降水量データをマージ
# water_df = water_df.merge(rain_df, on=['date', 'hour'], how='left')

# 特徴量エンジニアリング

In [79]:
# Build the target and the lag features station by station.
# NOTE(review): shift()/diff() move by rows, so 24 rows equal 24 hours only if
# every station's rows are hourly, gap-free and in time order — confirm this
# holds for data_filled_002.csv.
SHIFT_HOURS = 24
station_frames = []
for _, station_df in tqdm(water_df.groupby('station')):
    # Work on an explicit copy: adding columns to the frame handed out by
    # groupby can trigger pandas' chained-assignment warning.
    station_df = station_df.copy()
    # Target: the value SHIFT_HOURS rows ahead.
    station_df['y'] = station_df['value'].shift(-SHIFT_HOURS)
    # Lag features: the value SHIFT_HOURS rows back and the change since then.
    station_df[f'shift_{SHIFT_HOURS}h'] = station_df['value'].shift(SHIFT_HOURS)
    station_df[f'diff_{SHIFT_HOURS}h'] = station_df['value'].diff(periods=SHIFT_HOURS)
    station_frames.append(station_df)
water_df = pd.concat(station_frames, axis=0).reset_index(drop=True)

100%|██████████| 166/166 [00:01<00:00, 97.89it/s] 


In [80]:
# Per (date, station): the readings at hour 0 and hour 23 of that day.
key_cols = ['date', 'station']
value_0h = water_df.loc[water_df['hour'] == 0, key_cols + ['value']]
value_23h = water_df.loc[water_df['hour'] == 23, key_cols + ['value']]

# Attach both as `value_0h` / `value_23h` (the suffix only hits the clashing `value` column).
water_df = (
    water_df
    .merge(value_0h, on=key_cols, how='left', suffixes=('', '_0h'))
    .merge(value_23h, on=key_cols, how='left', suffixes=('', '_23h'))
)

# Differences between the current hour and the two reference readings.
water_df = water_df.assign(
    diff_0toT=lambda d: d['value'] - d['value_0h'],
    diff_Tto23=lambda d: d['value_23h'] - d['value'],
    diff_0to23=lambda d: d['value_23h'] - d['value_0h'],
)

# モデリング

In [81]:
# ライブラリインポート
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, KFold

In [82]:
def get_rmse(y_true, y_pred):
    """Return the root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [83]:
# Feature columns fed to the model
features = ['date', 'hour', 'station', 'river', 'value_23h']

# Keep only the model columns and drop rows with any missing value
water_df = water_df[features + ['y']].dropna().reset_index(drop=True)

# Un-encoded copy (original station / river names) for the plots further down
water_df_cp = water_df.copy()

# Label-encode the categorical columns.
# The fitted encoders are kept per column so the exact same string -> code
# mapping can be re-applied to new data at inference time; previously each
# encoder was overwritten by the next loop iteration and lost.
label_enc_col = ['station', 'river']
label_encoders = {}
for col in label_enc_col:
    le = LabelEncoder()
    water_df[col] = le.fit_transform(water_df[col].values)
    label_encoders[col] = le

water_df.head()

Unnamed: 0,date,hour,station,river,value_23h,y
0,0,0,0,58,1.64,1.64
1,0,1,0,58,1.64,1.64
2,0,2,0,58,1.64,1.64
3,0,3,0,58,1.64,1.64
4,0,4,0,58,1.64,1.64


In [84]:
# Training parameters
params = {
    'objective': 'regression',
    'random_state': 42,
    'boosting_type': 'gbdt',
    'n_estimators': 10000
    }
verbose_eval = 1


# Alternative: time-ordered CV
# ts = TimeSeriesSplit(n_splits=6)

# K-fold CV over the distinct dates, so all rows of one date share a fold
kf = KFold(n_splits=6)

models = []
scores = []
oof_train = np.zeros((len(water_df),))

date = water_df['date'].unique()
for tr_pos, val_pos in kf.split(date):
    # kf.split() yields *positions* into `date`, not date values. Map them back
    # to the actual dates before filtering; otherwise the folds are only
    # correct when the dates happen to be exactly 0..N-1 in order.
    date_tr, date_val = date[tr_pos], date[val_pos]
    train = water_df[water_df['date'].isin(date_tr)]
    valid = water_df[water_df['date'].isin(date_val)]

    X_train, y_train = train[features], train['y']
    X_valid, y_valid = valid[features], valid['y']

    model = lgb.LGBMRegressor(**params)
    # NOTE(review): `early_stopping_rounds` / `verbose` are fit() keywords of
    # the LightGBM version this notebook was run with; newer releases expect
    # callbacks instead — confirm before upgrading LightGBM.
    model.fit(X_train, y_train,
            eval_metric='rmse',  # metric used for early stopping
            eval_set=[(X_valid, y_valid)],
            early_stopping_rounds=50,
            verbose=verbose_eval
            )

    # Out-of-fold predictions; relies on water_df having a 0..n-1 index
    # (it was reset when the feature frame was built).
    oof_train[X_valid.index] = model.predict(X_valid, num_iteration=model.best_iteration_)

    models.append(model)
    scores.append(get_rmse(y_valid, oof_train[X_valid.index]))

[1]	valid_0's l2: 699.347	valid_0's rmse: 26.4452
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 566.531	valid_0's rmse: 23.8019
[3]	valid_0's l2: 458.945	valid_0's rmse: 21.423
[4]	valid_0's l2: 371.936	valid_0's rmse: 19.2856
[5]	valid_0's l2: 301.314	valid_0's rmse: 17.3584
[6]	valid_0's l2: 244.106	valid_0's rmse: 15.6239
[7]	valid_0's l2: 197.84	valid_0's rmse: 14.0656
[8]	valid_0's l2: 160.286	valid_0's rmse: 12.6604
[9]	valid_0's l2: 129.917	valid_0's rmse: 11.3981
[10]	valid_0's l2: 105.263	valid_0's rmse: 10.2598
[11]	valid_0's l2: 85.3275	valid_0's rmse: 9.23729
[12]	valid_0's l2: 69.1223	valid_0's rmse: 8.31398
[13]	valid_0's l2: 56.0099	valid_0's rmse: 7.48398
[14]	valid_0's l2: 45.3747	valid_0's rmse: 6.73608
[15]	valid_0's l2: 36.7862	valid_0's rmse: 6.06517
[16]	valid_0's l2: 29.8121	valid_0's rmse: 5.46004
[17]	valid_0's l2: 24.1735	valid_0's rmse: 4.91666
[18]	valid_0's l2: 19.5876	valid_0's rmse: 4.42578
[19]	valid_0's l2: 15.8731	vali

In [85]:
# Per-fold RMSE, then the unweighted mean of the fold scores
# (the mean of fold RMSEs, not an RMSE pooled over all out-of-fold rows).
fold_rmse_mean = np.mean(scores)
print('[各foldのRMSE]{}'.format(scores))
print('[全体のRMSE] {}'.format(fold_rmse_mean))

[各foldのRMSE][0.15620594891343312, 0.7861960565585849, 0.17765767111635175, 0.3228028410311311, 0.18263955655241354, 0.15926971116289354]
[全体のRMSE] 0.297461964222468


006のRMSE  
各fold: [1.3179053821930575, 1.0794245916438507, 0.16843268983610354, 4.634657172531895, 0.33931059350684156, 0.18290500468706114]
全体: 1.2871059057331349  

In [86]:
# Report each fold's best iteration and collect the per-fold feature
# importances into one table (one column per fold).
print('[best_iteration]')
importance_columns = {'feature': features}
for fold_no, fold_model in enumerate(models):
    print(f'fold {fold_no}: {fold_model.best_iteration_}')
    importance_columns['importance_' + str(fold_no)] = fold_model.feature_importances_
results = pd.DataFrame(importance_columns)
print('[feature_importances]')
display(results)

[best_iteration]
fold 0: 347
fold 1: 59
fold 2: 676
fold 3: 718
fold 4: 264
fold 5: 409
[feature_importances]


Unnamed: 0,feature,importance_0,importance_1,importance_2,importance_3,importance_4,importance_5
0,date,2342,139,4941,5394,1633,2613
1,hour,2097,355,3777,3800,1609,2290
2,station,1700,370,3344,3448,1398,2066
3,river,1260,295,2341,2512,881,1542
4,value_23h,3011,611,5877,6386,2399,3759


In [87]:
# Per station: plot the daily mean of the target and of the out-of-fold prediction
water_df_cp['y_pred'] = oof_train

station_names = water_df_cp['station'].unique()
n_cols = 2
# Size the grid from the actual number of stations instead of assuming 166
# (= 83 x 2); the height keeps the original 270 inches at 83 rows.
n_rows = max(1, -(-len(station_names) // n_cols))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 270 * n_rows / 83),
                         tight_layout=True, squeeze=False)
flat_axes = axes.ravel()
for i, station in enumerate(tqdm(station_names)):
    ax = flat_axes[i]
    df = water_df_cp[water_df_cp['station']==station]
    river = df.iloc[0]['river']
    # Select the numeric columns before averaging: the frame also holds the
    # string columns station/river, which mean() rejects in pandas >= 2.0.
    daily_mean = df.groupby('date')[['y', 'y_pred']].mean()
    daily_mean['y'].plot(ax=ax, label='true')
    daily_mean['y_pred'].plot(ax=ax, label='pred')
    ax.set_xlim(0, 2200)
    ax.set_title(f'観測所: {station} / 河川: {river}')
    ax.legend()
# Hide any panel left over when the station count is odd.
for ax in flat_axes[len(station_names):]:
    ax.set_visible(False)
plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [88]:
# Retrain on the full data set.
# The number of trees is 1.2x the mean best iteration found in cross-validation.
best_iterations = [fold_model.best_iteration_ for fold_model in models]
n_estimators = int(np.mean(best_iterations) * 1.2)

params = dict(
    objective='regression',
    random_state=42,
    boosting_type='gbdt',
    n_estimators=n_estimators,
)

model = lgb.LGBMRegressor(**params)
model.fit(water_df[features], water_df['y'])

LGBMRegressor(n_estimators=494, objective='regression', random_state=42)

# モデル保存

In [89]:
import pickle

In [90]:
# Persist the retrained model as <project>/models/baseline_lightgbm_<VER>.pkl
model_file = 'baseline_lightgbm_' + VER + '.pkl'
model_dir = os.path.join(path, 'models')
# Create the target directory on first use; open() does not create parent folders.
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, model_file)
# NOTE(review): only the LightGBM model is pickled here; the station/river
# label encoding used for training is not stored alongside it.
with open(model_path, 'wb') as f:
    pickle.dump(model, f)