In [None]:
# Mount Google Drive (Colab only) so the project folder under MyDrive can be reached from this runtime
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# ライブラリのインポート

In [None]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

!pip install japanize-matplotlib
import japanize_matplotlib

from tqdm import tqdm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# データのロード

In [None]:
# Adjust the project directory below to match your own Drive layout

# Move into the project directory and keep it as the base path
os.chdir('/content/drive/MyDrive/signate-908-hiroshima/')
path = os.getcwd()

# Load the training tables: readings plus station metadata for each data source
train_dir = os.path.join(path, 'train')
waterlevel_dir = os.path.join(train_dir, 'waterlevel')
rainfall_dir = os.path.join(train_dir, 'rainfall')

water_data = pd.read_csv(os.path.join(waterlevel_dir, 'data.csv'))
water_stations = pd.read_csv(os.path.join(waterlevel_dir, 'stations.csv'))
rain_data = pd.read_csv(os.path.join(rainfall_dir, 'data.csv'))
rain_stations = pd.read_csv(os.path.join(rainfall_dir, 'stations.csv'))

# Tide-level tables are currently not loaded
# tide_data = pd.read_csv(os.path.join(path, 'train', 'tidelevel', 'data.csv'))
# tide_stations = pd.read_csv(os.path.join(path, 'train', 'tidelevel', 'stations.csv'))

# データ前処理

In [None]:
def preprocess_water_data_station(water_data, water_stations):
    """Normalize station and river names in the water-level data.

    All work is done on a copy, so the caller's frame is left untouched.

    Steps:
      1. Replace a river name consisting of a full-width space with '沼田川'.
      2. Rename stations recorded without the '(国)' suffix to the suffixed
         name that also occurs in the data. 中野, 伊尾 and 和木 are excluded from
         this blanket rename because separate stations without the suffix
         exist; for those three, only rows on one specific river are renamed.
      3. Apply the same blanket rename for the '(電)' suffix.
      4. Fix station/river names that look like typos.
      5. Keep only stations flagged with 入力時使用 == 1 in ``water_stations``.

    Args:
        water_data: DataFrame with at least 'station' and 'river' columns.
        water_stations: DataFrame with '観測所名称' and '入力時使用' columns.

    Returns:
        A new DataFrame with cleaned names, restricted to the stations that
        are used as input.
    """
    # Never modify the caller's frame; re-running the cell stays reproducible
    water_data = water_data.copy()

    # Fill the missing river name (stored as a full-width space)
    water_data['river'] = water_data['river'].replace('\u3000', '沼田川')

    # --- '(国)' suffix: map pre-rename station names to the suffixed ones ---
    # These three are handled per river below, so skip them here
    ambiguous = ['中野(国)', '伊尾(国)', '和木(国)']
    has_national = water_data['station'].str.contains('(国)', regex=False, na=False)
    national_bases = [
        name.replace('(国)', '')
        for name in water_data.loc[has_national, 'station'].unique()
        if name not in ambiguous
    ]
    is_old_national = water_data['station'].isin(national_bases)
    water_data.loc[is_old_national, 'station'] += '(国)'

    # 中野 / 伊尾 / 和木: only the rows on the listed river become the '(国)' station
    for base, river in (('中野', '太田川'), ('伊尾', '芦田川'), ('和木', '小瀬川')):
        on_river = (water_data['station'] == base) & (water_data['river'] == river)
        water_data.loc[on_river, 'station'] = base + '(国)'

    # --- '(電)' suffix: same blanket rename, no exceptions ---
    has_electric = water_data['station'].str.contains('(電)', regex=False, na=False)
    electric_bases = [
        name.replace('(電)', '')
        for name in water_data.loc[has_electric, 'station'].unique()
    ]
    is_old_electric = water_data['station'].isin(electric_bases)
    water_data.loc[is_old_electric, 'station'] += '(電)'

    # Names that look like input mistakes
    water_data['station'] = water_data['station'].replace({'藤波': '藤浪',
                                                           '中州橋': '中洲橋',
                                                           '段原': '段原(猿猴川)'})
    water_data['river'] = water_data['river'].replace({'手越川': '手城川',
                                                       '横川': '横川川'})

    # Drop stations that are not used as input
    in_stations = water_stations.loc[water_stations['入力時使用']==1, '観測所名称'].unique()
    # .copy() so later column assignments on the result do not raise SettingWithCopyWarning
    return water_data[water_data['station'].isin(in_stations)].copy()

In [None]:
# Clean the station/river names of the water-level data and drop stations not used as input
water_data = preprocess_water_data_station(water_data, water_stations)

In [None]:
# Reshape the water-level data into the per-day input format
# (mostly copied from run.py)
stations = set(water_stations.loc[water_stations['評価対象']==1, '観測所名称'])

in_all_data = {}

start_date = 0
end_date = water_data['date'].max() # 2190

# Columns identifying a row; every other column name is parsed as 'HH:...' to get the hour
id_columns = ('date', 'station', 'river')

for day, day_rows in tqdm(water_data.groupby('date')):
    if day < start_date:
        continue
    if day > end_date:
        break
    # One flat record per (station row, hour column)
    in_data = [
        {'station': row['station'], 'river': row['river'], 'hour': int(col.split(':')[0]), 'value': reading}
        for row in day_rows.to_dict('records')
        for col, reading in row.items()
        if col not in id_columns
    ]
    in_all_data[day] = {'date': day, 'stations': stations, 'waterlevel': in_data}

100%|██████████| 2191/2191 [00:33<00:00, 64.94it/s]


In [None]:
# Convert the per-day input records back into one long DataFrame.
# Iterate over the dict itself instead of range(len(...)): its keys are the
# date values, which only coincide with 0..n-1 when start_date is 0 and no
# date is missing from the data.
daily_frames = []
for day, day_input in tqdm(in_all_data.items()):
    day_df = pd.DataFrame(day_input['waterlevel'])
    day_df['date'] = day
    daily_frames.append(day_df)

water_df = pd.concat(daily_frames)

100%|██████████| 2191/2191 [00:11<00:00, 182.83it/s]


In [None]:
# Turn the readings into numbers; anything that cannot be parsed becomes NaN
water_df = water_df.assign(value=pd.to_numeric(water_df['value'], errors='coerce'))

# Missing-value count per column
water_df.isna().sum()

station         0
river           0
hour            0
value      548966
date            0
dtype: int64

In [None]:
# 観測所ごとに1日の平均をプロット
# fig, axes = plt.subplots(90, 2, figsize=(20, 270), tight_layout=True)
# for i, station in enumerate(tqdm(water_df['station'].unique())):
#     ax = axes[i//2, i%2]
#     df = water_df[water_df['station']==station]
#     df.groupby('date').mean()['value'].plot(ax=ax)
#     ax.set_xlim(0, 2200)
#     ax.set_title(f'{station} / NaNの比率: {round(df["value"].isna().sum()/len(df)*100, 2)}%')
# fig.show()

In [None]:
# Fill missing readings by linear interpolation, one station at a time
# (rows are interpolated in their current order within each station).
# Only the numeric 'value' column is interpolated: DataFrame.interpolate() on the
# whole frame also hits the string columns ('station', 'river'), which newer
# pandas versions warn about or reject.
dfs = []
for _, station_df in tqdm(water_df.groupby('station')):
    station_df = station_df.copy()  # do not write into the groupby slice
    station_df['value'] = station_df['value'].interpolate()
    dfs.append(station_df)
water_df = pd.concat(dfs)

# 観測所ごとに1日の平均をプロット
# fig, axes = plt.subplots(90, 2, figsize=(20, 270), tight_layout=True)
# for i, station in enumerate(tqdm(water_df['station'].unique())):
#     ax = axes[i//2, i%2]
#     df = water_df[water_df['station']==station]
#     df.groupby('date').mean()['value'].plot(ax=ax)
#     ax.set_xlim(0, 2200)
#     ax.set_title(f'{station} / NaNの比率: {round(df["value"].isna().sum()/len(df)*100, 2)}%')
# fig.show()

100%|██████████| 178/178 [00:02<00:00, 87.47it/s] 


# 特徴量エンジニアリング

In [None]:
# Build the target and the lag features per station.
# Shifts are counted in rows within a station; presumably 24 rows == 24 hours,
# which assumes one row per hour with no gaps -- TODO confirm.
# Group-wise shift/diff replaces the former loop that assigned columns onto
# groupby slices (SettingWithCopyWarning) and then re-concatenated them.

# Stable sort by station reproduces the row order of concatenating the groups
water_df = water_df.sort_values('station', kind='mergesort').reset_index(drop=True)
value_by_station = water_df.groupby('station')['value']

# Target: the value 24 rows ahead within the same station
water_df['y'] = value_by_station.shift(-24)

# Lag features: the value 24 / 48 rows back and the change since then
for h in [24, 48]:
    water_df[f'shift_{h}h'] = value_by_station.shift(h)
    water_df[f'diff_{h}h'] = value_by_station.diff(periods=h)

100%|██████████| 178/178 [00:01<00:00, 129.36it/s]


# モデリング

In [None]:
# ライブラリインポート
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [None]:
# Label-encode the categorical columns (in place on water_df).
# The fitted encoders are kept in `label_encoders` so the same
# station/river -> integer mapping can be reapplied or inverted later;
# without them the inputs of the trained model cannot be reproduced.
# NOTE: re-running this cell on the already-encoded frame refits the encoders
# on integers; re-run from the feature-engineering cell instead.
label_enc_col = ['station', 'river']
label_encoders = {}
for col in label_enc_col:
    le = LabelEncoder()
    water_df[col] = le.fit_transform(water_df[col].values)
    label_encoders[col] = le


# Split into train / valid / test by the 'date' column
TRAIN_END_DATE = 1314   # last date (inclusive) of the training split
VALID_END_DATE = 1752   # last date (inclusive) of the validation split
train = water_df.loc[water_df['date'] <= TRAIN_END_DATE]
valid = water_df.loc[(water_df['date'] > TRAIN_END_DATE) & (water_df['date'] <= VALID_END_DATE)]
test = water_df.loc[water_df['date'] > VALID_END_DATE]

# Feature columns
features = ['hour', 'station', 'river', 'value']

# Drop rows with NaN in any feature or in the target
train = train[features + ['y']].dropna()
valid = valid[features + ['y']].dropna()
test = test[features + ['y']].dropna()

X_train, y_train = train[features], train['y']
X_valid, y_valid = valid[features], valid['y']
X_test, y_test = test[features], test['y']

In [None]:
# LightGBM hyper-parameters. n_estimators is only an upper bound because
# training is cut short by early stopping on the validation set.
params = {'objective': 'regression',
          'random_state': 42,
          'boosting_type': 'gbdt',
          'n_estimators': 10000
          }

# Log the validation metrics every `verbose_eval` iterations
verbose_eval = 1

# Early stopping and logging are passed as callbacks: the `early_stopping_rounds`
# and `verbose` arguments of fit() were deprecated in LightGBM 3.3 and removed
# in 4.0. Requires lightgbm >= 3.3 (for lgb.log_evaluation).
fit_callbacks = [lgb.early_stopping(stopping_rounds=50),
                 lgb.log_evaluation(period=verbose_eval)]

model = lgb.LGBMRegressor(**params)
model.fit(X_train, y_train,
          eval_metric='rmse',  # extra metric evaluated on eval_set and monitored by early stopping
          eval_set=[(X_valid, y_valid)],
          callbacks=fit_callbacks
          )

[1]	valid_0's l2: 425.981	valid_0's rmse: 20.6393
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's l2: 344.436	valid_0's rmse: 18.559
[3]	valid_0's l2: 278.45	valid_0's rmse: 16.6868
[4]	valid_0's l2: 225.059	valid_0's rmse: 15.002
[5]	valid_0's l2: 181.864	valid_0's rmse: 13.4857
[6]	valid_0's l2: 146.923	valid_0's rmse: 12.1212
[7]	valid_0's l2: 118.663	valid_0's rmse: 10.8933
[8]	valid_0's l2: 95.8509	valid_0's rmse: 9.79035
[9]	valid_0's l2: 77.4029	valid_0's rmse: 8.79789
[10]	valid_0's l2: 62.461	valid_0's rmse: 7.90323
[11]	valid_0's l2: 50.4087	valid_0's rmse: 7.09991
[12]	valid_0's l2: 40.6682	valid_0's rmse: 6.37716
[13]	valid_0's l2: 32.7788	valid_0's rmse: 5.72528
[14]	valid_0's l2: 26.4218	valid_0's rmse: 5.14021
[15]	valid_0's l2: 21.2795	valid_0's rmse: 4.61297
[16]	valid_0's l2: 17.13	valid_0's rmse: 4.13884
[17]	valid_0's l2: 13.7917	valid_0's rmse: 3.71372
[18]	valid_0's l2: 11.0922	valid_0's rmse: 3.33049
[19]	valid_0's l2: 8.91674	valid_0'

LGBMRegressor(n_estimators=10000, objective='regression', random_state=42)

In [None]:
# Predict on the test split, using the model's recorded best iteration
y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

In [None]:
# Root mean squared error of the predictions on the test split
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE test: {rmse_test}')

RMSE test: 0.14915451696825965


# モデル保存

In [None]:
import pickle

In [None]:
# Serialize the trained model to <project>/models/<model_file>.
# NOTE: only unpickle this file from a trusted location; pickle.load executes arbitrary code.
model_file = 'baseline_lightgbm_002.pkl'
model_dir = os.path.join(path, 'models')
# Create the directory on first use; open() would otherwise raise FileNotFoundError
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, model_file)
with open(model_path, 'wb') as f:
    pickle.dump(model, f)

# EDA

In [None]:
# Row count before and after removing fully duplicated rows
n_rows_before = len(rain_data)
rain_data = rain_data.drop_duplicates()
print(n_rows_before)
print(len(rain_data))

896003
894697


In [None]:
# Size of the rainfall station table and a look at its first rows
print(rain_stations.shape)
rain_stations.head(3)

(313, 11)


Unnamed: 0,観測所名称,フリガナ,市町,水系名,河川名,データ所管,住所,緯度,経度,事務所,入力時使用
0,西部建設,セイブケンセツ,広島市南区,太田川,京橋川,河川課,広島市南区比治山本町１６－１２　西部建設事務所内,34.381667,132.466667,,1
1,上瀬野,カミセノ,広島市安芸区,瀬野川,瀬野川,河川課,広島市安芸区上瀬野瀬野川公園内,34.428333,132.618056,,1
2,熊野町,クマノチョウ,熊野町,二河川,二河川,河川課,安芸郡熊野町３８１５－１　熊野町役場内,34.338889,132.568056,,1


In [None]:
# Number of distinct values in each column of the rainfall station table
rain_stations.nunique()

観測所名称    313
フリガナ     290
市町        30
水系名       19
河川名      137
データ所管      7
住所       313
緯度       304
経度       276
事務所        9
入力時使用      2
dtype: int64

In [None]:
# Sum of the 入力時使用 column; presumably a 0/1 flag, so this would be the number of stations used as input -- confirm
rain_stations['入力時使用'].sum()

297

In [None]:
# Size of the de-duplicated rainfall data and a look at its first rows
print(rain_data.shape)
rain_data.head(3)

(894697, 27)


Unnamed: 0,date,station,city,00:00:00,01:00:00,02:00:00,03:00:00,04:00:00,05:00:00,06:00:00,...,14:00:00,15:00:00,16:00:00,17:00:00,18:00:00,19:00:00,20:00:00,21:00:00,22:00:00,23:00:00
0,0,栗谷,大竹市,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,大竹市,大竹市,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,弥栄ダム(国),大竹市,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Number of distinct dates, stations and cities in the rainfall data
rain_data[['date', 'station', 'city']].nunique()

date       2191
station     514
city         31
dtype: int64

In [None]:
# Number of non-null 'date' and 'city' entries per station
rain_station_cnt = (
    rain_data
    .groupby('station')[['date', 'city']]
    .count()
    .reset_index()
)

In [None]:
# Stations with more rows than there are distinct dates, i.e. at least one
# date occurs more than once for that station. The threshold is derived from
# the data instead of hard-coding the number of dates (2191).
n_dates = rain_data['date'].nunique()
rain_station_cnt[rain_station_cnt['date'] > n_dates]

Unnamed: 0,station,date,city
66,仁賀,4382,4382
119,吉田,2211,2211
131,呉,2194,2194
195,小奴可,2245,2245
200,小河内,2204,2204
242,御調,2195,2195
258,新市,2245,2245
267,時安,2197,2197
291,松原,2232,2232
310,横川,2212,2212


In [None]:
# Inspect one station that has extra rows: row count per (date, city) and its station metadata
station = '呉'
station_rows = rain_data.loc[rain_data['station']==station, ['date', 'city']]
display(station_rows.value_counts().head(30))
display(rain_stations.loc[rain_stations['観測所名称']==station])

date  city
1469  呉市      2
1468  呉市      2
1460  呉市      2
0     呉市      1
1462  呉市      1
1457  呉市      1
1458  呉市      1
1459  呉市      1
1461  呉市      1
1464  呉市      1
1463  呉市      1
1506  呉市      1
1465  呉市      1
1466  呉市      1
1467  呉市      1
1470  呉市      1
1456  呉市      1
1455  呉市      1
1454  呉市      1
1453  呉市      1
1452  呉市      1
1451  呉市      1
1450  呉市      1
1449  呉市      1
1448  呉市      1
1447  呉市      1
1446  呉市      1
1445  呉市      1
1444  呉市      1
1443  呉市      1
dtype: int64

Unnamed: 0,観測所名称,フリガナ,市町,水系名,河川名,データ所管,住所,緯度,経度,事務所,入力時使用
45,呉,クレ,呉市,堺川,堺川,砂防課,呉市東畑２丁目７－３８,34.253833,132.586944,,1


In [None]:
# Show, transposed, the rows this station has on a date that appears twice
is_station = rain_data['station'] == station
is_date = rain_data['date'] == 1469
rain_data.loc[is_station & is_date].T

Unnamed: 0,603223,603225
date,1469,1469
station,呉,呉
city,呉市,呉市
00:00:00,,
01:00:00,0,0.5
02:00:00,0,0
03:00:00,0,0
04:00:00,0,0
05:00:00,0,0
06:00:00,0,0


In [None]:
# Transposed view of the rows whose index labels fall in 602772..602775
# (.loc label slices include both ends); the labels are hard-coded from the loaded CSV
rain_data.loc[602772:602775].T

Unnamed: 0,602772,602774
date,1468,1468
station,呉,呉
city,呉市,呉市
00:00:00,,
01:00:00,0,0.5
02:00:00,1,0.5
03:00:00,1,1.5
04:00:00,1,1
05:00:00,5,5
06:00:00,3,3.5
