In [66]:
# Mount Google Drive so the files under MyDrive are reachable from this runtime
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ライブラリのインポート

In [67]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

!pip install japanize-matplotlib
import japanize_matplotlib

from tqdm import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# データのロード

In [68]:
# Adjust this path to your own environment.

# Move into the project directory and remember it as the base path
os.chdir('/content/drive/MyDrive/signate-908-hiroshima/')
path = os.getcwd()

# Load the water-level readings and the station master table
waterlevel_dir = os.path.join(path, 'train', 'waterlevel')
water_data = pd.read_csv(os.path.join(waterlevel_dir, 'data.csv'))
water_stations = pd.read_csv(os.path.join(waterlevel_dir, 'stations.csv'))

# データ前処理

In [69]:
def _add_missing_suffix(stations, suffix, skip=()):
    """Append `suffix` to bare station names whose suffixed form also occurs in `stations`.

    Suffixed names listed in `skip` are ignored when collecting the bare names,
    so their bare counterparts are left untouched.
    """
    suffixed = stations[stations.str.contains(suffix, regex=False, na=False)].unique()
    bare_names = {name.replace(suffix, '') for name in suffixed if name not in skip}
    return stations.where(~stations.isin(bare_names), stations + suffix)


def preprocess_water_data_station(water_data, water_stations):
    """Clean the station and river names of the water-level table.

    Steps, in order: fill the blank river name, unify station names that appear
    both with and without the '(国)' / '(電)' suffix, fix apparent typos, and
    keep only stations flagged as model input in `water_stations`.

    Parameters
    ----------
    water_data : pandas.DataFrame
        Water-level table; the 'station' and 'river' columns are read and rewritten.
    water_stations : pandas.DataFrame
        Station master; the '入力時使用' and '観測所名称' columns are read.

    Returns
    -------
    pandas.DataFrame
        Cleaned and filtered rows (original index labels are kept). The frame
        passed in as `water_data` is not modified.
    """
    # Work on a copy so the caller's DataFrame is not mutated as a side effect
    water_data = water_data.copy()

    # Fill the missing river name (stored as a full-width space)
    water_data['river'] = water_data['river'].replace('\u3000', '沼田川')

    # Stations that also exist as separate, unsuffixed stations; these are
    # disambiguated by river name instead of being renamed wholesale.
    river_specific = (('中野', '太田川'), ('伊尾', '芦田川'), ('和木', '小瀬川'))

    # Convert names recorded before the change to the '(国)' suffix
    water_data['station'] = _add_missing_suffix(
        water_data['station'], '(国)',
        skip=[station + '(国)' for station, _ in river_specific])
    for station, river in river_specific:
        is_national = (water_data['station'] == station) & (water_data['river'] == river)
        water_data.loc[is_national, 'station'] = station + '(国)'

    # Convert names recorded before the change to the '(電)' suffix
    water_data['station'] = _add_missing_suffix(water_data['station'], '(電)')

    # Apparent typos
    water_data['station'] = water_data['station'].replace({'藤波': '藤浪',
                                                           '中州橋': '中洲橋',
                                                           '段原': '段原(猿猴川)'})
    water_data['river'] = water_data['river'].replace({'手越川': '手城川',
                                                       '横川': '横川川'})

    # Drop stations that are not used as model input
    in_stations = water_stations.loc[water_stations['入力時使用'] == 1, '観測所名称'].unique()
    water_data = water_data[water_data['station'].isin(in_stations)]

    return water_data

In [70]:
# Clean up the station names in the water-level data
water_data = water_data.pipe(preprocess_water_data_station, water_stations)

In [71]:
# Reshape the water-level table into the per-day input format
# (mostly copied from run.py)
stations = set(water_stations.loc[water_stations['評価対象'] == 1, '観測所名称'])

in_all_data = {}

start_date = 0
end_date = water_data['date'].max()  # 2190

# Columns that identify a row; every other column holds one reading per hour
id_columns = ('date', 'station', 'river')

for day, day_rows in tqdm(water_data.groupby('date')):
    if day > end_date:
        break
    if day < start_date:
        continue
    # One record per (station, hour); the hour is the leading part of the column name
    in_data = [
        {'station': row['station'], 'river': row['river'],
         'hour': int(column.split(':')[0]), 'value': reading}
        for row in day_rows.to_dict('records')
        for column, reading in row.items()
        if column not in id_columns
    ]
    in_all_data[day] = {'date': day, 'stations': stations, 'waterlevel': in_data}

100%|██████████| 2191/2191 [00:46<00:00, 47.15it/s]


In [72]:
# Convert the per-day input records into one long-format DataFrame.
# Iterate over the dict itself rather than range(len(...)): the keys are the
# date values, which are only 0..n-1 when no date is missing and start_date is 0.
daily_frames = []
for day, day_input in tqdm(in_all_data.items()):
    day_frame = pd.DataFrame(day_input['waterlevel'])
    day_frame['date'] = day
    daily_frames.append(day_frame)

water_df = pd.concat(daily_frames)

100%|██████████| 2191/2191 [00:12<00:00, 181.40it/s]


In [73]:
# Convert the readings to numbers; entries that cannot be parsed become NaN
numeric_values = pd.to_numeric(water_df['value'], errors='coerce')
water_df['value'] = numeric_values
water_df.isnull().sum()

station         0
river           0
hour            0
value      548966
date            0
dtype: int64

In [74]:
# 観測所ごとに1日の平均をプロット
# fig, axes = plt.subplots(90, 2, figsize=(20, 270), tight_layout=True)
# for i, station in enumerate(tqdm(water_df['station'].unique())):
#     ax = axes[i//2, i%2]
#     df = water_df[water_df['station']==station]
#     df.groupby('date').mean()['value'].plot(ax=ax)
#     ax.set_xlim(0, 2200)
#     ax.set_title(f'{station} / NaNの比率: {round(df["value"].isna().sum()/len(df)*100, 2)}%')
# fig.show()

In [75]:
# Linearly interpolate missing readings, station by station
dfs = []
for _, station_df in tqdm(water_df.groupby('station')):
    # Interpolate only the reading column: calling DataFrame.interpolate() on a
    # frame that still holds the object-dtype station/river columns is
    # deprecated in recent pandas, and no other column has missing values.
    dfs.append(station_df.assign(value=station_df['value'].interpolate()))
water_df = pd.concat(dfs)

# 観測所ごとに1日の平均をプロット
# fig, axes = plt.subplots(90, 2, figsize=(20, 270), tight_layout=True)
# for i, station in enumerate(tqdm(water_df['station'].unique())):
#     ax = axes[i//2, i%2]
#     df = water_df[water_df['station']==station]
#     df.groupby('date').mean()['value'].plot(ax=ax)
#     ax.set_xlim(0, 2200)
#     ax.set_title(f'{station} / NaNの比率: {round(df["value"].isna().sum()/len(df)*100, 2)}%')
# fig.show()

100%|██████████| 178/178 [00:02<00:00, 74.16it/s] 


# 特徴量エンジニアリング

In [76]:
# Build the target and the lag features, station by station.
# Shifts are counted in rows, so this relies on each station's rows being
# consecutive hourly readings in chronological order.
target_horizon = 24   # y is the reading 24 rows ahead
lag_hours = [24, 48]  # look-back distances for the shift/diff features

dfs = []
for _, station_df in tqdm(water_df.groupby('station')):
    # Copy first: adding columns to a frame yielded by groupby triggers
    # SettingWithCopyWarning.
    station_df = station_df.copy()
    station_df['y'] = station_df['value'].shift(-target_horizon)
    for lag in lag_hours:
        station_df[f'shift_{lag}h'] = station_df['value'].shift(lag)
        station_df[f'diff_{lag}h'] = station_df['value'].diff(periods=lag)
    dfs.append(station_df)
water_df = pd.concat(dfs, axis=0).reset_index(drop=True)

100%|██████████| 178/178 [00:01<00:00, 100.41it/s]


# モデリング

In [77]:
# ライブラリインポート
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [78]:
# Label-encode the categorical columns.
# Each fitted encoder is kept in `label_encoders` so the same name -> code
# mapping can be reapplied (or saved) when the model is used on new data.
label_enc_col = ['station', 'river']
label_encoders = {}
for col in label_enc_col:
    le = LabelEncoder()
    water_df[col] = le.fit_transform(water_df[col].values)
    label_encoders[col] = le

# Columns used as model inputs
features = ['date', 'hour', 'station', 'river', 'value']

# Chronological split on the date index: train <= 1314 < valid <= 1752 < test
train_end_date = 1314
valid_end_date = 1752

model_columns = features + ['y']
is_train = water_df['date'] <= train_end_date
is_valid = (water_df['date'] > train_end_date) & (water_df['date'] <= valid_end_date)
is_test = water_df['date'] > valid_end_date

# Rows with a missing feature or target are dropped from every split
train = water_df.loc[is_train, model_columns].dropna()
valid = water_df.loc[is_valid, model_columns].dropna()
test = water_df.loc[is_test, model_columns].dropna()

X_train, y_train = train[features], train['y']
X_valid, y_valid = valid[features], valid['y']
X_test, y_test = test[features], test['y']

In [79]:
# n_estimators is only an upper bound; early stopping on the validation set
# decides how many boosting rounds are actually kept.
params = {'objective': 'regression',
          'random_state': 42,
          'boosting_type': 'gbdt',
          'n_estimators': 10000
          }

verbose_eval = 1

# Early stopping and per-round logging are passed as callbacks: the
# `early_stopping_rounds=` / `verbose=` keyword arguments of fit() were removed
# in LightGBM 4.0. `log_evaluation` is the current name of the logging
# callback; older releases only provide `print_evaluation`.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
fit_callbacks = [
    lgb.early_stopping(stopping_rounds=50),
    log_evaluation(period=verbose_eval),
]

model = lgb.LGBMRegressor(**params)
model.fit(X_train, y_train,
          eval_metric='rmse',  # metric watched by early stopping (also filled into the training 'metric' parameter automatically)
          eval_set=[(X_valid, y_valid)],
          callbacks=fit_callbacks
          )

[1]	valid_0's l2: 426.782	valid_0's rmse: 20.6587
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 345.806	valid_0's rmse: 18.5959
[3]	valid_0's l2: 280.208	valid_0's rmse: 16.7394
[4]	valid_0's l2: 227.065	valid_0's rmse: 15.0687
[5]	valid_0's l2: 184.014	valid_0's rmse: 13.5652
[6]	valid_0's l2: 149.136	valid_0's rmse: 12.2121
[7]	valid_0's l2: 120.879	valid_0's rmse: 10.9945
[8]	valid_0's l2: 97.9827	valid_0's rmse: 9.89862
[9]	valid_0's l2: 79.4347	valid_0's rmse: 8.91262
[10]	valid_0's l2: 64.4069	valid_0's rmse: 8.02539
[11]	valid_0's l2: 52.231	valid_0's rmse: 7.2271
[12]	valid_0's l2: 42.4606	valid_0's rmse: 6.51618
[13]	valid_0's l2: 34.4483	valid_0's rmse: 5.86927
[14]	valid_0's l2: 27.9534	valid_0's rmse: 5.2871
[15]	valid_0's l2: 22.692	valid_0's rmse: 4.76361
[16]	valid_0's l2: 18.428	valid_0's rmse: 4.29279
[17]	valid_0's l2: 14.9724	valid_0's rmse: 3.86941
[18]	valid_0's l2: 12.205	valid_0's rmse: 3.49357
[19]	valid_0's l2: 9.92842	valid_0'

LGBMRegressor(n_estimators=10000, objective='regression', random_state=42)

In [80]:
# Predict the test set with the number of rounds chosen by early stopping
best_iteration = model.best_iteration_
y_pred = model.predict(X_test, num_iteration=best_iteration)

In [81]:
# Root mean squared error on the held-out test split
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE test: {rmse_test}')

RMSE test: 0.1569316753094176


# モデル保存

In [82]:
import pickle

In [83]:
# Pickle the trained model under <project dir>/models
model_file = 'baseline_lightgbm.pkl'
model_dir = os.path.join(path, 'models')
# Create the output directory if needed; open() fails when it does not exist
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, model_file)
with open(model_path, 'wb') as f:
    pickle.dump(model, f)