# CVをつかって汎化性能を評価する

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
import lightgbm as lgb

from rdkit import Chem
from rdkit.Chem import Draw

import optuna
import joblib
import os

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

  import pkg_resources


## データ読み込み

In [None]:
# 入力：読み込みたい記述子のタイプを選択
descriptor_type = 'mordred_3d'  # 'rdkit' or 'mordred_2d' or 'mordred_3d'

# ベースのデータ
dataset = pd.read_csv('data/material_data.csv', index_col=0)

# 条件に応じて記述子ファイルを読み込む
if descriptor_type == "rdkit":
    des = pd.read_csv("outputs/descriptors/descriptor_rdkit.csv", index_col=0)
elif descriptor_type == "mordred_2d":
    des = pd.read_csv("outputs/descriptors/descriptor_mordred_2d.csv", index_col=0)
elif descriptor_type == "mordred_3d":
    des = pd.read_csv("outputs/descriptors/descriptor_mordred_3d.csv", index_col=0)
else:
    raise ValueError(f"未知のdescriptor_type: {descriptor_type}")

# 結合
dataset_full = pd.concat([dataset.reset_index(), des.reset_index(drop=True)], axis=1)
dataset_full = dataset_full.set_index('Material')

# 確認
print(dataset.shape, des.shape, dataset_full.shape)

(1258, 3) (1258, 1427) (1258, 1430)


In [3]:
# PLに欠損値が入っている行を消して、学習データのみにする
dataset_train = dataset_full.dropna(subset='PL')
# SMILESとTypeも消しておく
dataset_train = dataset_train.drop(['SMILES', 'Type'], axis=1)

dataset_train.shape

(251, 1428)

In [4]:
# infをNaNに置き換え
dataset_train = dataset_train.replace(np.inf, np.nan).fillna(np.nan)
dataset_train = dataset_train.drop(dataset_train.columns[dataset_train.isnull().any()], axis=1)

# 標準偏差が0の記述を削除
dataset_train = dataset_train.drop(dataset_train.columns[dataset_train.std() == 0], axis=1)

# 学習データのstdが0の列を特定
zero_std_cols = dataset_train.columns[dataset_train.std() == 0]

# 学習・未知データから同じ列を削除
dataset_train = dataset_train.drop(columns=zero_std_cols)

dataset_train.shape

(251, 1220)

In [5]:
# 目的変数と説明変数に分ける
y = dataset_train['PL']
X = dataset_train.drop('PL', axis=1)

## 予測

### PLS

In [99]:
# PLSモデルの構築
from sklearn.cross_decomposition import PLSRegression

model_pls = PLSRegression(n_components=6)

# 5分割交差検証
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

# スコア保存用
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # 訓練と検証に分類
    X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # 標準化、DataFrameに戻す
    X_scaler = StandardScaler()
    autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

    y_scaler = StandardScaler()
    autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
    autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

    # 学習
    model_pls.fit(autoscaled_X_train, autoscaled_y_train.values.ravel())
    y_pred_scaled = model_pls.predict(autoscaled_X_val)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

    # 性能チェック
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2 : {r2:.4f}')
    print()

print(f'平均RMSE : {np.mean(rmse_scores):.4f}')
print(f'平均MAE : {np.mean(mae_scores):.4f}')
print(f'平均R2 : {np.mean(r2_scores):.4f}')

Fold RMSE : 55.0048
Fold MAE : 40.1641
Fold R2 : 0.7112

Fold RMSE : 39.1255
Fold MAE : 29.4016
Fold R2 : 0.8493

Fold RMSE : 81.0496
Fold MAE : 46.2467
Fold R2 : 0.5356

Fold RMSE : 45.6420
Fold MAE : 36.4036
Fold R2 : 0.8222

Fold RMSE : 62.8705
Fold MAE : 45.5084
Fold R2 : 0.6139

平均RMSE : 56.7385
平均MAE : 39.5449
平均R2 : 0.7064


### PLS+optuna

In [100]:
def objective_pls(trial):
    # ハイパーパラメータ: 潜在変数の数
    n_components = trial.suggest_int('n_components', 1, min(30, X.shape[1]))

    # KFoldで分割
    kf = KFold(n_splits=5, shuffle=True, random_state=1234)

    rmse_scores = []

    for train_index, val_index in kf.split(X):
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # 標準化（Xとy両方）
        X_scaler = StandardScaler()
        X_train_scaled = pd.DataFrame(X_scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
        X_val_scaled = pd.DataFrame(X_scaler.transform(X_val), index=X_val.index, columns=X_val.columns)

        y_scaler = StandardScaler()
        y_train_scaled = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])

        # モデル定義・学習
        model = PLSRegression(n_components=n_components)
        model.fit(X_train_scaled, y_train_scaled.values.ravel())

        # 予測（スケールを戻す）
        y_pred_scaled = model.predict(X_val_scaled)
        y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

        # RMSE
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_scores.append(rmse)

    return np.mean(rmse_scores)


In [101]:
study = optuna.create_study(direction="minimize")
study.optimize(objective_pls, n_trials=30)

print("Best parameters:", study.best_params)
print("Best RMSE:", study.best_value)

[I 2025-08-06 21:16:08,621] A new study created in memory with name: no-name-08af5606-3f04-4c76-bab8-0f0607df6c7f
[I 2025-08-06 21:16:08,953] Trial 0 finished with value: 58.06818154787377 and parameters: {'n_components': 7}. Best is trial 0 with value: 58.06818154787377.
[I 2025-08-06 21:16:09,307] Trial 1 finished with value: 59.17286318559559 and parameters: {'n_components': 9}. Best is trial 0 with value: 58.06818154787377.
[I 2025-08-06 21:16:09,871] Trial 2 finished with value: 66.53241318819937 and parameters: {'n_components': 19}. Best is trial 0 with value: 58.06818154787377.
[I 2025-08-06 21:16:10,298] Trial 3 finished with value: 63.624355778202 and parameters: {'n_components': 16}. Best is trial 0 with value: 58.06818154787377.
[I 2025-08-06 21:16:10,841] Trial 4 finished with value: 71.59664444198032 and parameters: {'n_components': 23}. Best is trial 0 with value: 58.06818154787377.
[I 2025-08-06 21:16:11,193] Trial 5 finished with value: 56.839843254604226 and parameters

Best parameters: {'n_components': 6}
Best RMSE: 56.73847233522677


In [102]:
# PLSモデルの構築
from sklearn.cross_decomposition import PLSRegression

model_pls = PLSRegression(**study.best_params)

# 5分割交差検証
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

# スコア保存用
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # 訓練と検証に分類
    X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # 標準化、DataFrameに戻す
    X_scaler = StandardScaler()
    autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

    y_scaler = StandardScaler()
    autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
    autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

    # 学習
    model_pls.fit(autoscaled_X_train, autoscaled_y_train.values.ravel())
    y_pred_scaled = model_pls.predict(autoscaled_X_val)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

    # 性能チェック
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2 : {r2:.4f}')
    print()

print(f'平均RMSE : {np.mean(rmse_scores):.4f}')
print(f'平均MAE : {np.mean(mae_scores):.4f}')
print(f'平均R2 : {np.mean(r2_scores):.4f}')

Fold RMSE : 55.0048
Fold MAE : 40.1641
Fold R2 : 0.7112

Fold RMSE : 39.1255
Fold MAE : 29.4016
Fold R2 : 0.8493

Fold RMSE : 81.0496
Fold MAE : 46.2467
Fold R2 : 0.5356

Fold RMSE : 45.6420
Fold MAE : 36.4036
Fold R2 : 0.8222

Fold RMSE : 62.8705
Fold MAE : 45.5084
Fold R2 : 0.6139

平均RMSE : 56.7385
平均MAE : 39.5449
平均R2 : 0.7064


### RandomForest

In [103]:
# デフォルト
model_rf = RandomForestRegressor(random_state=1234)

# 5分割交差検証
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

# スコア保存用
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # 訓練と検証に分類
    X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # 標準化、DataFrameに戻す
    X_scaler = StandardScaler()
    autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

    y_scaler = StandardScaler()
    autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
    autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

    # 学習
    model_rf.fit(autoscaled_X_train, autoscaled_y_train.values.ravel())
    y_pred_scaled = model_rf.predict(autoscaled_X_val)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

    # 性能チェック
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2 : {r2:.4f}')
    print()

print(f'平均RMSE : {np.mean(rmse_scores):.4f}')
print(f'平均MAE : {np.mean(mae_scores):.4f}')
print(f'平均R2 : {np.mean(r2_scores):.4f}')

Fold RMSE : 51.2594
Fold MAE : 36.0020
Fold R2 : 0.7491

Fold RMSE : 44.5128
Fold MAE : 31.6281
Fold R2 : 0.8050

Fold RMSE : 48.6313
Fold MAE : 31.3504
Fold R2 : 0.8328

Fold RMSE : 47.4534
Fold MAE : 32.8682
Fold R2 : 0.8079

Fold RMSE : 53.2576
Fold MAE : 32.8291
Fold R2 : 0.7229

平均RMSE : 49.0229
平均MAE : 32.9355
平均R2 : 0.7835


### RandomForest+Optuna

In [104]:
def objective_rf(trial):
    # 検証するパラメータ
    params = {
    "n_estimators" : trial.suggest_int('n_estimators', 5, 1000),
    "max_depth" : trial.suggest_int('max_depth', 3, 50),
    "min_samples_split" : trial.suggest_int('min_samples_split', 2, 10),
    "min_samples_leaf" : trial.suggest_int('min_samples_leaf', 1, 10),
    "max_features" : trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
    'random_state' : 0,
    'n_jobs' : -1,

    }

    # モデル定義
    model_rf = RandomForestRegressor(**params)

    # 分割、CV
    kf = KFold(n_splits=5, shuffle=True, random_state=1234)

    # 評価指標
    rmse_scores = []
    # mae_scores = []
    # r2_scores = []

    for train_index, val_index in kf.split(X):
        # 訓練と検証に分類
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # 標準化、DataFrameに戻す
        X_scaler = StandardScaler()
        autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
        autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

        y_scaler = StandardScaler()
        autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
        autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

        # 学習&予測
        model_rf.fit(autoscaled_X_train, autoscaled_y_train)
        y_pred_scaled = model_rf.predict(autoscaled_X_val)
        y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

        # 性能チェック
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        # mae = mean_absolute_error(y_val, y_pred)
        # r2 = r2_score(y_val, y_pred)

        # 結果格納
        rmse_scores.append(rmse)
        # mae_scores.append(mae)
        # r2_scores.append(r2)

    return np.mean(rmse_scores)  # 最適化したい評価指標を選ぶ



In [105]:
# 最適化
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective_rf, n_trials=50)

[I 2025-08-06 21:17:04,042] A new study created in memory with name: regression
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[I 2025-08-06 21:17:11,073] Trial 0 finished with value: 58.658152620834265 and parameters: {'n_estimators': 657, 'max_depth': 6, 'min_samples_split': 2, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 0 with value: 58.658152620834265.
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
[I 2025-08-06 21:17:14,742] Trial 1 finished with value: 48.54104783302864 and parameters: {'n_estimators': 303, 'max_depth': 27, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_featur

In [106]:
# ハイパーパラメータ・スコアの確認
print("Best trial:")
trial = study.best_trial

print(f"  RMSE: {trial.value:.4f}")
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Best trial:
  RMSE: 47.2222
  Params:
    n_estimators: 867
    max_depth: 24
    min_samples_split: 5
    min_samples_leaf: 1
    max_features: sqrt


In [107]:
# optunaで最適化されたパラメータをセットし、予測
model_rf_op = RandomForestRegressor(**study.best_params)

# 5分割交差検証
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

# スコア保存用
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # 訓練と検証に分類
    X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # 標準化、DataFrameに戻す
    X_scaler = StandardScaler()
    autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

    y_scaler = StandardScaler()
    autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
    autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

    # 学習&予測
    model_rf_op.fit(autoscaled_X_train, autoscaled_y_train.values.ravel())
    y_pred_scaled = model_rf_op.predict(autoscaled_X_val)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

    # 性能チェック
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2 : {r2:.4f}')
    print()

print(f'平均RMSE : {np.mean(rmse_scores):.4f}')
print(f'平均MAE : {np.mean(mae_scores):.4f}')
print(f'平均R2 : {np.mean(r2_scores):.4f}')

Fold RMSE : 47.4713
Fold MAE : 36.6037
Fold R2 : 0.7849

Fold RMSE : 44.9971
Fold MAE : 32.7044
Fold R2 : 0.8007

Fold RMSE : 50.6210
Fold MAE : 34.2110
Fold R2 : 0.8188

Fold RMSE : 52.3166
Fold MAE : 39.0137
Fold R2 : 0.7665

Fold RMSE : 42.8248
Fold MAE : 30.2876
Fold R2 : 0.8209

平均RMSE : 47.6462
平均MAE : 34.5641
平均R2 : 0.7983


In [108]:
# 最終モデルの構築
model_rf_final = RandomForestRegressor(**study.best_params)

# すべての学習データを標準化して、学習させる
X_scaler_final = StandardScaler()
y_scaler_final = StandardScaler()
autoscaled_X = X_scaler_final.fit_transform(X)
autoscaled_y = y_scaler_final.fit_transform(y.values.reshape(-1, 1))

# 学習
model_rf_final = model_rf_final.fit(autoscaled_X, autoscaled_y.ravel())

# ディレクトリ作成（なければ）
os.makedirs('models/rf', exist_ok=True)

# モデルとスケーラーの保存（joblib使用）
joblib.dump(model_rf_final, 'models/rf/model_rf.pkl')
joblib.dump(X_scaler_final, 'models/rf/X_scaler.pkl')
joblib.dump(y_scaler_final, 'models/rf/y_scaler.pkl')

['models/rf/y_scaler.pkl']

### XGBoost

In [109]:
# 初手
# デフォルトXGB
model_xgb = xgb.XGBRegressor(random_state=1234)

# 5分割交差検証
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

# スコア保存用
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # 訓練と検証に分類
    X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # 標準化、DataFrameに戻す
    X_scaler = StandardScaler()
    autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

    y_scaler = StandardScaler()
    autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
    autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

    # 学習
    model_xgb.fit(autoscaled_X_train, autoscaled_y_train, eval_set=[(autoscaled_X_val, autoscaled_y_val)], verbose=0)
    y_pred_scaled = model_xgb.predict(autoscaled_X_val)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

    # 性能チェック
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2 : {r2:.4f}\n')

print(f'平均RMSE : {np.mean(rmse_scores):.4f}')
print(f'平均MAE : {np.mean(mae_scores):.4f}')
print(f'平均R2 : {np.mean(r2_scores):.4f}')

Fold RMSE : 56.0942
Fold MAE : 34.3792
Fold R2 : 0.6996

Fold RMSE : 43.7661
Fold MAE : 29.1311
Fold R2 : 0.8115

Fold RMSE : 57.9358
Fold MAE : 40.7227
Fold R2 : 0.7627

Fold RMSE : 48.3889
Fold MAE : 30.4412
Fold R2 : 0.8002

Fold RMSE : 55.5638
Fold MAE : 34.8252
Fold R2 : 0.6984

平均RMSE : 52.3498
平均MAE : 33.8999
平均R2 : 0.7545


- RFより若干良い？

### XGBoost+optuna

In [110]:
def objective(trial):
    # ハイパーパラメータのサンプリング
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'random_state': 1234,
        'tree_method': 'hist',  # gpu_histだとエラー出た
    }

    # モデル定義
    model_xgb = xgb.XGBRegressor(**params)

    # 分割、CV
    kf = KFold(n_splits=5, shuffle=True, random_state=1234)

    # 評価指標
    rmse_scores = []
    # mae_scores = []
    # r2_scores = []

    for train_index, val_index in kf.split(X):
        # 訓練と検証に分類
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # 標準化、DataFrameに戻す
        X_scaler = StandardScaler()
        autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
        autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

        y_scaler = StandardScaler()
        autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
        autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

        # 学習
        model_xgb.fit(autoscaled_X_train, autoscaled_y_train, eval_set=[(autoscaled_X_val, autoscaled_y_val)], verbose=0)
        y_pred_scaled = model_xgb.predict(autoscaled_X_val)
        y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

        # 性能チェック
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        # mae = mean_absolute_error(y_val, y_pred)
        # r2 = r2_score(y_val, y_pred)

        # 結果格納
        rmse_scores.append(rmse)
        # mae_scores.append(mae)
        # r2_scores.append(r2)

    return np.mean(rmse_scores)  # 最適化したい評価指標を選ぶ

In [111]:
# 最適化
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=50)

[I 2025-08-06 21:23:50,505] A new study created in memory with name: regression


[I 2025-08-06 21:24:08,910] Trial 0 finished with value: 65.23266144683647 and parameters: {'n_estimators': 182, 'max_depth': 3, 'learning_rate': 0.006087660706064375, 'subsample': 0.8139662514785513, 'colsample_bytree': 0.792103872776475, 'reg_alpha': 2.202878084801497, 'reg_lambda': 0.004431468206482492}. Best is trial 0 with value: 65.23266144683647.
[I 2025-08-06 21:24:28,572] Trial 1 finished with value: 88.55277319458571 and parameters: {'n_estimators': 104, 'max_depth': 5, 'learning_rate': 0.0038650220964551096, 'subsample': 0.9758788724737484, 'colsample_bytree': 0.5527691452618815, 'reg_alpha': 0.0016042001284223455, 'reg_lambda': 4.345603173242743e-08}. Best is trial 0 with value: 65.23266144683647.
[I 2025-08-06 21:24:34,219] Trial 2 finished with value: 56.14249083753706 and parameters: {'n_estimators': 60, 'max_depth': 3, 'learning_rate': 0.023262200685475663, 'subsample': 0.6846700763806285, 'colsample_bytree': 0.5071005496152201, 'reg_alpha': 9.669644438936276e-05, 'reg_

In [112]:
# ハイパーパラメータ・スコアの確認
print("Best trial:")
trial = study.best_trial

print(f"  RMSE: {trial.value:.4f}")
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Best trial:
  RMSE: 44.8686
  Params:
    n_estimators: 274
    max_depth: 5
    learning_rate: 0.02861790659974872
    subsample: 0.5585990035465763
    colsample_bytree: 0.9454726083312922
    reg_alpha: 1.362661959560019e-08
    reg_lambda: 8.861825605411882


In [113]:
# optunaで最適化されたパラメータをセットし、予測
model_xgb_op = xgb.XGBRegressor(**study.best_params)

# 5分割交差検証
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

# スコア保存用
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # 訓練と検証に分類
    X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # 標準化、DataFrameに戻す
    X_scaler = StandardScaler()
    autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

    y_scaler = StandardScaler()
    autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
    autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

    # 学習
    model_xgb_op.fit(autoscaled_X_train, autoscaled_y_train, eval_set=[(autoscaled_X_val, autoscaled_y_val)], verbose=0)
    y_pred_scaled = model_xgb_op.predict(autoscaled_X_val)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

    # 性能チェック
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2 : {r2:.4f}\n')

print(f'平均RMSE : {np.mean(rmse_scores):.4f}')
print(f'平均MAE : {np.mean(mae_scores):.4f}')
print(f'平均R2 : {np.mean(r2_scores):.4f}')

Fold RMSE : 46.1361
Fold MAE : 32.2237
Fold R2 : 0.7968

Fold RMSE : 38.8037
Fold MAE : 28.3488
Fold R2 : 0.8518

Fold RMSE : 47.6693
Fold MAE : 29.3928
Fold R2 : 0.8394

Fold RMSE : 45.4361
Fold MAE : 32.5245
Fold R2 : 0.8238

Fold RMSE : 47.7451
Fold MAE : 32.5032
Fold R2 : 0.7773

平均RMSE : 45.1581
平均MAE : 30.9986
平均R2 : 0.8178


In [114]:
# 最終モデルの構築
model_xgb_final = xgb.XGBRegressor(**study.best_params)

# すべての学習データを標準化して、学習させる
X_scaler_final = StandardScaler()
y_scaler_final = StandardScaler()
autoscaled_X = X_scaler_final.fit_transform(X)
autoscaled_y = y_scaler_final.fit_transform(y.values.reshape(-1, 1))

# 学習
model_xgb_final = model_xgb_final.fit(autoscaled_X, autoscaled_y.ravel())

# ディレクトリ作成（なければ）
os.makedirs('models/xgb', exist_ok=True)

# モデルとスケーラーの保存（joblib使用）
joblib.dump(model_xgb_final, 'models/xgb/model_xgb.pkl')
joblib.dump(X_scaler_final, 'models/xgb/X_scaler.pkl')
joblib.dump(y_scaler_final, 'models/xgb/y_scaler.pkl')

['models/xgb/y_scaler.pkl']

### LightGBM

In [115]:
# デフォルト
model_lgb = lgb.LGBMRegressor(random_state=1234, verbose=-1)

# 5分割交差検証
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

# スコア保存用
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # 訓練と検証に分類
    X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # 標準化、DataFrameに戻す
    X_scaler = StandardScaler()
    autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

    y_scaler = StandardScaler()
    autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
    autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

    # 学習
    model_lgb.fit(autoscaled_X_train, autoscaled_y_train, eval_set=[(autoscaled_X_val, autoscaled_y_val)])
    y_pred_scaled = model_lgb.predict(autoscaled_X_val)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

    # 性能チェック
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2 : {r2:.4f}')
    print()

print(f'平均RMSE : {np.mean(rmse_scores):.4f}')
print(f'平均MAE : {np.mean(mae_scores):.4f}')
print(f'平均R2 : {np.mean(r2_scores):.4f}')

Fold RMSE : 45.1902
Fold MAE : 31.6159
Fold R2 : 0.8050

Fold RMSE : 44.6490
Fold MAE : 32.9797
Fold R2 : 0.8038

Fold RMSE : 52.8957
Fold MAE : 34.8776
Fold R2 : 0.8022

Fold RMSE : 42.6535
Fold MAE : 30.5197
Fold R2 : 0.8448

Fold RMSE : 50.3521
Fold MAE : 34.6076
Fold R2 : 0.7523

平均RMSE : 47.1481
平均MAE : 32.9201
平均R2 : 0.8016


### LightGBM+optuna

In [116]:
def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',  # LightGBMが内部で使う評価指標
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'random_state': 42,
        'verbosity': -1,
        'device' : 'cpu'
    }

    # モデル定義
    model_lgb = lgb.LGBMRegressor(**params)

    # 分割、CV
    kf = KFold(n_splits=5, shuffle=True, random_state=1234)

    # 評価指標
    rmse_scores = []
    # mae_scores = []
    # r2_scores = []

    for train_index, val_index in kf.split(X):
        # 訓練と検証に分類
        X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # 標準化、DataFrameに戻す
        X_scaler = StandardScaler()
        autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
        autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

        y_scaler = StandardScaler()
        autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
        autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

        # 学習
        model_lgb.fit(autoscaled_X_train, autoscaled_y_train, eval_set=[(autoscaled_X_val, autoscaled_y_val)])
        y_pred_scaled = model_lgb.predict(autoscaled_X_val)
        y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

        # 性能チェック
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        # mae = mean_absolute_error(y_val, y_pred)
        # r2 = r2_score(y_val, y_pred)

        # 結果格納
        rmse_scores.append(rmse)
        # mae_scores.append(mae)
        # r2_scores.append(r2)

    return np.mean(rmse_scores)  # 最適化したい評価指標を選ぶ

In [117]:
# 最適化
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=100)

[I 2025-08-06 21:56:40,392] A new study created in memory with name: regression


[I 2025-08-06 21:56:43,114] Trial 0 finished with value: 46.612237582447825 and parameters: {'n_estimators': 228, 'learning_rate': 0.03058956945426088, 'max_depth': 10, 'num_leaves': 57, 'subsample': 0.842476559726396, 'colsample_bytree': 0.9894701869599233, 'reg_alpha': 0.00037161006983939244, 'reg_lambda': 6.925070853712586e-07}. Best is trial 0 with value: 46.612237582447825.
[I 2025-08-06 21:56:43,904] Trial 1 finished with value: 70.86689269892109 and parameters: {'n_estimators': 59, 'learning_rate': 0.011879451489631501, 'max_depth': 3, 'num_leaves': 12, 'subsample': 0.9998838520703163, 'colsample_bytree': 0.7427498758995823, 'reg_alpha': 0.0063167522121264875, 'reg_lambda': 9.207357955112185e-05}. Best is trial 0 with value: 46.612237582447825.
[I 2025-08-06 21:56:46,125] Trial 2 finished with value: 47.18256426968496 and parameters: {'n_estimators': 186, 'learning_rate': 0.04738067384599583, 'max_depth': 8, 'num_leaves': 62, 'subsample': 0.6504497098052655, 'colsample_bytree': 

In [118]:
# ハイパーパラメータ・スコアの確認
print("Best trial:")
trial = study.best_trial

print(f"  RMSE: {trial.value:.4f}")
print("  Params:")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

Best trial:
  RMSE: 46.3403
  Params:
    n_estimators: 255
    learning_rate: 0.01988402943452076
    max_depth: 6
    num_leaves: 25
    subsample: 0.6936459178623545
    colsample_bytree: 0.9451153576709722
    reg_alpha: 2.890962152474971e-07
    reg_lambda: 7.921445115940868e-08


In [119]:
# optunaで最適化されたパラメータをセットし、予測
model_lgb_op = lgb.LGBMRegressor(**study.best_params)

# 5分割交差検証
kf = KFold(n_splits=5, shuffle=True, random_state=1234)

# スコア保存用
rmse_scores = []
mae_scores = []
r2_scores = []

for train_index, val_index in kf.split(X):
    # 訓練と検証に分類
    X_train, X_val = X.iloc[train_index, :], X.iloc[val_index, :]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # 標準化、DataFrameに戻す
    X_scaler = StandardScaler()
    autoscaled_X_train = pd.DataFrame(X_scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    autoscaled_X_val = pd.DataFrame(X_scaler.transform(X_val), columns=X_val.columns, index=X_val.index)

    y_scaler = StandardScaler()
    autoscaled_y_train = pd.DataFrame(y_scaler.fit_transform(y_train.values.reshape(-1,1)), index=y_train.index, columns=['y'])
    autoscaled_y_val = pd.DataFrame(y_scaler.transform(y_val.values.reshape(-1,1)), index=y_val.index, columns=['y'])

    # 学習
    model_lgb_op.fit(autoscaled_X_train, autoscaled_y_train, eval_set=[(autoscaled_X_val, autoscaled_y_val)])
    y_pred_scaled = model_lgb_op.predict(autoscaled_X_val)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1,1))

    # 性能チェック
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2 : {r2:.4f}')
    print()

print(f'平均RMSE : {np.mean(rmse_scores):.4f}')
print(f'平均MAE : {np.mean(mae_scores):.4f}')
print(f'平均R2 : {np.mean(r2_scores):.4f}')

Fold RMSE : 46.5304
Fold MAE : 31.9990
Fold R2 : 0.7933

Fold RMSE : 44.0963
Fold MAE : 32.5853
Fold R2 : 0.8086

Fold RMSE : 50.2665
Fold MAE : 32.5033
Fold R2 : 0.8214

Fold RMSE : 44.5412
Fold MAE : 32.0285
Fold R2 : 0.8307

Fold RMSE : 48.8479
Fold MAE : 33.3066
Fold R2 : 0.7669

平均RMSE : 46.8565
平均MAE : 32.4846
平均R2 : 0.8042


In [120]:
# 最終モデルの構築
model_lgb_final = lgb.LGBMRegressor(**study.best_params)

# すべての学習データを標準化して、学習させる
X_scaler_final = StandardScaler()
y_scaler_final = StandardScaler()
autoscaled_X = X_scaler_final.fit_transform(X)
autoscaled_y = y_scaler_final.fit_transform(y.values.reshape(-1, 1))

# 学習
model_lgb_final = model_lgb_final.fit(autoscaled_X, autoscaled_y.ravel())

# ディレクトリ作成（なければ）
os.makedirs('models/lgb', exist_ok=True)

# モデルとスケーラーの保存
joblib.dump(model_lgb_final, 'models/lgb/model_lgb.pkl')
joblib.dump(X_scaler_final, 'models/lgb/X_scaler.pkl')
joblib.dump(y_scaler_final, 'models/lgb/y_scaler.pkl')

['models/lgb/y_scaler.pkl']

### NN

In [6]:
# GPU set
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


In [7]:
# NN定義
class MLPRegressor(nn.Module):
    def __init__(self, input_dim):
        super(MLPRegressor, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.Linear(16, 1)
        )

    def forward(self, x):
        return self.model(x)

In [8]:
# 学習loop
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in dataloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        pred = model(X_batch)
        loss = criterion(pred, y_batch)
        loss.backward()

        # 勾配クリッピング追加！
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

# 評価
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            pred = model(X_batch)
            loss = criterion(pred, y_batch)
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [None]:
# 評価指標格納用
rmse_scores = []
mae_scores = []
r2_scores = []

kf = KFold(n_splits=5, shuffle=True, random_state=1234)

for train_index, val_index in kf.split(X):
    # モデル構築
    model_nn = MLPRegressor(input_dim=X.shape[1]).to(device)
    # 損失関数
    criterion = nn.MSELoss()
    # 最適化関数
    optimizer = torch.optim.Adam(model_nn.parameters(), lr=1e-4)

    # 分割
    X_train = X.iloc[train_index]
    X_val = X.iloc[val_index]
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    # 標準化（情報漏洩防止）
    X_scaler = StandardScaler()
    autoscaled_X_train = X_scaler.fit_transform(X_train)
    autoscaled_X_val = X_scaler.transform(X_val)

    y_scaler = StandardScaler()
    autoscaled_y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
    autoscaled_y_val = y_scaler.transform(y_val.values.reshape(-1, 1))

    # tensor化
    X_train_tensor = torch.tensor(autoscaled_X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(autoscaled_y_train, dtype=torch.float32).view(-1, 1)

    X_val_tensor = torch.tensor(autoscaled_X_val, dtype=torch.float32)
    y_val_tensor = torch.tensor(autoscaled_y_val, dtype=torch.float32).view(-1, 1)

    # DataLoader
    train_ds = TensorDataset(X_train_tensor, y_train_tensor)
    val_ds = TensorDataset(X_val_tensor, y_val_tensor)
    train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=16, shuffle=False)

    # 学習ループ
    epochs = 100

    for epoch in range(epochs):
        train(model_nn, train_dl, optimizer, criterion, device)

    # 推論と逆変換
    model_nn.eval()
    preds = []
    with torch.no_grad():
        for X_batch, _ in val_dl:
            X_batch = X_batch.to(device)
            y_pred_std = model_nn(X_batch).cpu().numpy()
            preds.append(y_pred_std)

    y_pred = y_scaler.inverse_transform(np.vstack(preds))
    y_true = y_scaler.inverse_transform(autoscaled_y_val)

    # 評価指標
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2  : {r2:.4f}\n')

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

# 結果出力
print(f'平均RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}')
print(f'平均MAE: {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}')
print(f'平均R2: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}')

Fold RMSE : 49.5397
Fold MAE : 34.3333
Fold R2  : 0.7657

Fold RMSE : 45.5003
Fold MAE : 34.3626
Fold R2  : 0.7963

Fold RMSE : 62.7420
Fold MAE : 38.1763
Fold R2  : 0.7217

Fold RMSE : 48.0101
Fold MAE : 35.1398
Fold R2  : 0.8033

Fold RMSE : 50.9370
Fold MAE : 36.0248
Fold R2  : 0.7466

平均RMSE: 51.3458 ± 5.9766
平均MAE: 35.6074 ± 1.4258
平均R2: 0.7667 ± 0.0305


### NN+Optuna

In [10]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [12]:
# データセット定義
class RegressionDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).view(-1, 1)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
# 学習loop
def train(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for X_batch, y_batch in dataloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        pred = model(X_batch)
        loss = criterion(pred, y_batch)
        loss.backward()

        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(dataloader)

# 評価
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for X_batch, y_batch in dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            pred = model(X_batch)
            loss = criterion(pred, y_batch)
            total_loss += loss.item()
    return total_loss / len(dataloader)

In [13]:
# Optuna設定
def define_model(trial, input_dim):
    layers = []
    n_layers = trial.suggest_int("n_layers", 2, 5)
    hidden_dim = trial.suggest_int("hidden_dim", 32, 1024)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5)
    activation_name = trial.suggest_categorical("activation", ["relu", "leaky_relu"])
    use_batchnorm = trial.suggest_categorical("use_batchnorm", [True, False])

    in_dim = input_dim
    for i in range(n_layers):
        layers.append(nn.Linear(in_dim, hidden_dim))

        # 活性化関数の前に入れる
        if use_batchnorm:
            layers.append(nn.BatchNorm1d(hidden_dim))

        if activation_name == "relu":
            layers.append(nn.ReLU())
        else:
            layers.append(nn.LeakyReLU())
        layers.append(nn.Dropout(dropout_rate))
        in_dim = hidden_dim

    layers.append(nn.Linear(in_dim, 1))
    return nn.Sequential(*layers)

def objective(trial):
    # ハイパーパラメータ提案
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 128])
    lr = trial.suggest_float("lr", 1e-5, 1e-3, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "SGD"])
    epochs = trial.suggest_int("epochs", 30, 100)

    # モデル定義
    model = define_model(trial, X.shape[1]).to(device)
    criterion = nn.MSELoss()

    if optimizer_name == 'Adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    else:
        optimizer = torch.optim.SGD(model.parameters(), lr=lr)

    kf = KFold(n_splits=5, shuffle=True, random_state=1234)
    val_losses = []

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # 標準化
        X_scaler = StandardScaler()
        autoscaled_X_train = X_scaler.fit_transform(X_train)
        autoscaled_X_val = X_scaler.transform(X_val)

        y_scaler = StandardScaler()
        autoscaled_y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
        autoscaled_y_val = y_scaler.transform(y_val.values.reshape(-1, 1))

        # Dataset, DataLoader
        train_ds = RegressionDataset(autoscaled_X_train, autoscaled_y_train)
        val_ds = RegressionDataset(autoscaled_X_val, autoscaled_y_val)
        train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        val_dl = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

        # 学習、評価
        for epoch in range(epochs):
            train(model, train_dl, optimizer, criterion, device)

        val_loss = evaluate(model, val_dl, criterion, device)
        val_losses.append(val_loss)

    return np.mean(val_losses)


In [14]:
# 実行
print('Optuna最適化開始...')
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50) # 試行回数を増やす

print('Best trial :', study.best_trial.params)

[I 2025-08-17 19:38:10,997] A new study created in memory with name: no-name-d847193f-a611-4640-82b9-cdcc5e82b15b


Optuna最適化開始...


[I 2025-08-17 19:38:14,930] Trial 0 finished with value: 0.3031741701066494 and parameters: {'batch_size': 64, 'lr': 0.00030933837402161407, 'optimizer': 'SGD', 'epochs': 38, 'n_layers': 2, 'hidden_dim': 434, 'dropout_rate': 0.05071778690889278, 'activation': 'leaky_relu', 'use_batchnorm': True}. Best is trial 0 with value: 0.3031741701066494.
[I 2025-08-17 19:38:20,461] Trial 1 finished with value: 0.11292753592133523 and parameters: {'batch_size': 128, 'lr': 0.00023927648110724282, 'optimizer': 'Adam', 'epochs': 75, 'n_layers': 3, 'hidden_dim': 410, 'dropout_rate': 0.4911731660684614, 'activation': 'relu', 'use_batchnorm': True}. Best is trial 1 with value: 0.11292753592133523.
[I 2025-08-17 19:38:27,047] Trial 2 finished with value: 0.1687363401055336 and parameters: {'batch_size': 128, 'lr': 0.0005597200372000181, 'optimizer': 'SGD', 'epochs': 58, 'n_layers': 5, 'hidden_dim': 705, 'dropout_rate': 0.23515154090165558, 'activation': 'relu', 'use_batchnorm': True}. Best is trial 1 wit

Best trial : {'batch_size': 128, 'lr': 0.00017354962311365348, 'optimizer': 'Adam', 'epochs': 87, 'n_layers': 3, 'hidden_dim': 524, 'dropout_rate': 0.011821131613319363, 'activation': 'relu', 'use_batchnorm': False}


In [15]:
# optunaで最適化されたパラメータをセットし、予測

# 評価指標格納用
rmse_scores = []
mae_scores = []
r2_scores = []

kf = KFold(n_splits=5, shuffle=True, random_state=1234)

for train_index, val_index in kf.split(X):
    # 分割
    X_train = X.iloc[train_index]
    X_val = X.iloc[val_index]
    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    # 標準化（情報漏洩防止）
    X_scaler = StandardScaler()
    autoscaled_X_train = X_scaler.fit_transform(X_train)
    autoscaled_X_val = X_scaler.transform(X_val)

    y_scaler = StandardScaler()
    autoscaled_y_train = y_scaler.fit_transform(y_train.values.reshape(-1, 1))
    autoscaled_y_val = y_scaler.transform(y_val.values.reshape(-1, 1))

    # DataLoader
    train_ds = RegressionDataset(autoscaled_X_train, autoscaled_y_train)
    val_ds = RegressionDataset(autoscaled_X_val, autoscaled_y_val)
    train_dl = DataLoader(train_ds, batch_size=study.best_trial.params['batch_size'], shuffle=True)
    val_dl = DataLoader(val_ds, batch_size=study.best_trial.params['batch_size'], shuffle=False)

    # モデル構築
    model_nn_op = define_model(study.best_trial, input_dim=autoscaled_X_train.shape[1]).to(device)
    optimizer_name = study.best_trial.params['optimizer']
    lr = study.best_trial.params['lr']
    optimizer = torch.optim.Adam(model_nn_op.parameters(), lr=lr) if optimizer_name == "Adam" else torch.optim.SGD(model_nn_op.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # 学習ループ
    epochs = study.best_trial.params['epochs']  # Optuna で最適化されたエポック数を使う

    for epoch in range(epochs):
        train(model_nn_op, train_dl, optimizer, criterion, device)

    # 推論と逆変換
    model_nn_op.eval()
    preds = []
    with torch.no_grad():
        for X_batch, _ in val_dl:
            X_batch = X_batch.to(device)
            y_pred_std = model_nn_op(X_batch).cpu().numpy()
            preds.append(y_pred_std)

    y_pred = y_scaler.inverse_transform(np.vstack(preds))
    y_true = y_scaler.inverse_transform(autoscaled_y_val)

    # 評価指標
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f'Fold RMSE : {rmse:.4f}')
    print(f'Fold MAE : {mae:.4f}')
    print(f'Fold R2  : {r2:.4f}\n')

    rmse_scores.append(rmse)
    mae_scores.append(mae)
    r2_scores.append(r2)

# 結果出力
print(f'平均RMSE: {np.mean(rmse_scores):.4f} ± {np.std(rmse_scores):.4f}')
print(f'平均MAE: {np.mean(mae_scores):.4f} ± {np.std(mae_scores):.4f}')
print(f'平均R2: {np.mean(r2_scores):.4f} ± {np.std(r2_scores):.4f}')

Fold RMSE : 49.7124
Fold MAE : 34.7088
Fold R2  : 0.7641

Fold RMSE : 43.7754
Fold MAE : 28.2927
Fold R2  : 0.8114

Fold RMSE : 62.1350
Fold MAE : 36.7677
Fold R2  : 0.7271

Fold RMSE : 42.8854
Fold MAE : 32.9179
Fold R2  : 0.8431

Fold RMSE : 61.7221
Fold MAE : 38.7026
Fold R2  : 0.6279

平均RMSE: 52.0461 ± 8.4045
平均MAE: 34.2779 ± 3.5675
平均R2: 0.7547 ± 0.0748


In [16]:
# すべてのデータを学習させて、保存
# 全データで標準化
X_scaler_final = StandardScaler()
y_scaler_final = StandardScaler()
autoscaled_X = X_scaler_final.fit_transform(X)
autoscaled_y = y_scaler_final.fit_transform(y.values.reshape(-1, 1))

# Dataset, DataLoader
final_ds = RegressionDataset(autoscaled_X, autoscaled_y)
final_dl = DataLoader(final_ds, batch_size=study.best_trial.params['batch_size'], shuffle=True)

# モデル定義
final_model = define_model(study.best_trial, input_dim=autoscaled_X.shape[1]).to(device)
optimizer_name = study.best_trial.params['optimizer']
lr = study.best_trial.params['lr']
optimizer = torch.optim.Adam(final_model.parameters(), lr=lr) if optimizer_name == "Adam" else torch.optim.SGD(final_model.parameters(), lr=lr)
criterion = nn.MSELoss()
epochs = study.best_trial.params['epochs']

# 学習ループ
for epoch in range(epochs):
    train(final_model, final_dl, optimizer, criterion, device)

In [18]:
# 保存先ディレクトリを作成（なければ）
os.makedirs('models/nn', exist_ok=True)

# PyTorchモデルの保存（state_dict）、モデルの重み
torch.save(final_model.state_dict(), 'models/nn/final_model.pth')

# Optunaのbest_trial(ハイパラ)
joblib.dump(study.best_trial.params, 'models/nn/hparams.pkl')

# 特徴量名（列順の再現用)
joblib.dump(list(X.columns), 'models/nn/feature_names.pkl')

# スケーラーの保存（joblib）
joblib.dump(X_scaler_final, 'models/nn/X_scaler.pkl')
joblib.dump(y_scaler_final, 'models/nn/y_scaler.pkl')

['models/nn/y_scaler.pkl']