<a href="https://colab.research.google.com/github/Hidenori24/LearnDirectory/blob/master/SMBC2025_ver2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. ライブラリセットアップ


In [8]:
# ============================================
# 0. ライブラリ & CFG 定義
# ============================================
!pip -q install lightgbm==4.3.0 polars==0.20.19 holidays==0.42
!pip install -U scikit-learn -q

import os, random, math, gc, pickle, warnings
import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import holidays
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

# ---------- CFG ----------
class CFG:
    seed         = 42
    n_folds      = 5
    early_stop   = 300
    num_boost_round = 20_000
    test_size_hr = 4380          # ≒6ヶ月
    lags         = [1, 24]
    rolls        = [24, 168]
    data_path    = '/content/drive/MyDrive/ML/Signate_1634/'
    use_polars   = False         # True にすると FE 後が高速
    lgb_params = {
        'objective'      : 'regression',
        'metric'         : 'rmse',
        'learning_rate'  : 0.05,
        'num_leaves'     : 256,
        'subsample'      : 0.8,
        'colsample_bytree': 0.8,
        'seed'           : seed,
        'verbose'        : -1,
    }

# set seed
random.seed(CFG.seed)
np.random.seed(CFG.seed)


# 1. Google Drive マウント


In [9]:
# ============================================
# 1. Google Drive マウント
# ============================================
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


# 2. データ読み込み

In [10]:
# =========================================================
# 2. データ読み込み
#    - index を DatetimeIndex（UTC）に
# =========================================================
def read_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)                 # まずは普通に読み込み
    df['time'] = pd.to_datetime(df['time'], utc=True)   # ①文字列→datetime(UTC)
    df['time'] = df['time'].dt.tz_convert(None)         # ②タイムゾーン情報を外す（naive へ）
    df = df.set_index('time').sort_index()              # ③DatetimeIndex として設定
    return df

train_df = read_data(os.path.join(CFG.data_path, 'train.csv'))
test_df  = read_data(os.path.join(CFG.data_path, 'test.csv'))

print('train', train_df.shape, 'test', test_df.shape)

train (26280, 91) test (8760, 90)


# 3. 特徴量生成

In [11]:
# ============================================
# 3. 特徴量エンジニアリング (改善版)
# ============================================
es_holidays = holidays.country_holidays('ES', years=range(2015, 2019))
holiday_set = set(es_holidays.keys())  # set で高速判定

def add_calendar(df):
    idx = df.index
    df['hour']       = idx.hour
    df['dow']        = idx.dayofweek
    df['month']      = idx.month
    df['is_weekend'] = (df['dow'] >= 5).astype(np.int8)
    df['is_holiday'] = np.isin(idx.date, list(holiday_set)).astype(np.int8)

    df['sin_hour'] = np.sin(2*np.pi*df['hour']/24)
    df['cos_hour'] = np.cos(2*np.pi*df['hour']/24)
    df['sin_dow']  = np.sin(2*np.pi*df['dow']/7)
    df['cos_dow']  = np.cos(2*np.pi*df['dow']/7)
    df['sin_month'] = np.sin(2*np.pi*(df['month']-1)/12) # 月の周期性
    df['cos_month'] = np.cos(2*np.pi*(df['month']-1)/12)

    return df

def add_supply_gap(df):
    gen_cols = [c for c in df.columns if c.startswith('generation_')]
    df['supply_total'] = df[gen_cols].sum(axis=1)
    df['gap_supply_demand'] = df['supply_total'] - df['total_load_actual']

    ren_cols = [c for c in gen_cols if any(k in c for k in ['solar','wind','hydro'])]
    df['renewable_ratio'] = df[ren_cols].sum(axis=1) / df['supply_total']
    df['renewable_ratio'] = df['renewable_ratio'].replace([np.inf, -np.inf], np.nan).fillna(0) # 無限大やNaNの処理
    return df

# 改善1: より多くのラグとロールを追加
def add_lag_roll_enhanced(df, lags=None, rolls=None):
    if lags is None:
        lags = [1, 24, 24*7, 24*30] # 1時間、1日、1週間、1ヶ月のラグを追加
    if rolls is None:
        rolls = [24, 168, 24*30] # 1日、1週間、1ヶ月のロールを追加

    for l in lags:
        df[f'price_lag_{l}']  = df['price_actual'].shift(l)
        df[f'demand_lag_{l}'] = df['total_load_actual'].shift(l)
    for r in rolls:
        df[f'price_rollmean_{r}'] = df['price_actual'].shift(1).rolling(r).mean()
        df[f'price_rollstd_{r}'] = df['price_actual'].shift(1).rolling(r).std() # priceにもrollstdを追加
        df[f'gap_rollstd_{r}']    = df['gap_supply_demand'].shift(1).rolling(r).std()
        df[f'demand_rollmean_{r}'] = df['total_load_actual'].shift(1).rolling(r).mean() # demandにもrollmeanを追加
    return df

# 改善2: 時間に関連する交互作用特徴量
def add_interaction_features(df):
    df['hour_x_dow'] = df['hour'] * df['dow']
    df['is_weekend_x_hour'] = df['is_weekend'] * df['hour']
    df['is_holiday_x_hour'] = df['is_holiday'] * df['hour']
    return df

def make_features_enhanced(full):
    full = add_calendar(full)
    full = add_supply_gap(full)
    full = add_lag_roll_enhanced(full) # 改善1の関数を呼び出し
    full = add_interaction_features(full) # 改善2の関数を呼び出し
    return full

# train+test を縦結合して一括 FE
full_df = pd.concat(
    [train_df, test_df.assign(price_actual=np.nan)],
    axis=0
)
full_df = make_features_enhanced(full_df) # 改善版の特徴量生成関数を使用

# 欠損補完（時系列なので forward → 数値列平均）
full_df = full_df.fillna(method='ffill')
num_cols = full_df.select_dtypes(include=[np.number, 'bool']).columns
full_df[num_cols] = full_df[num_cols].fillna(full_df[num_cols].mean())

print('FE 完了:', full_df.shape)

FE 完了: (35040, 128)


# 4. 数値列 & カテゴリー列を分離

In [12]:
# ============================================
# 4. 数値列 & カテゴリー列を分離
#    ─ LightGBM にカテゴリーを渡す設定 ─
# ============================================
TARGET = 'price_actual'

# 1) すべての object 列を category 型へ変換
str_cols = full_df.select_dtypes('object').columns
full_df[str_cols] = full_df[str_cols].astype('category')

# 2) カテゴリー列リストを自動取得
cat_cols = list(full_df.select_dtypes('category').columns)

# LightGBM 用のカテゴリー列名リスト
lgb_cat_features = cat_cols

# XGBoost は、通常は数値特徴量として扱うか、One-Hot Encoding などの前処理が必要です。
# ここではシンプルに数値変換された特徴量をそのまま渡すため、特別な設定は不要です。

# 3) 特徴量リスト（目的変数を除く全列）
features = [c for c in full_df.columns if c != TARGET]

# 4) 学習・テスト DataFrame
train_fe = full_df.loc[train_df.index]
test_fe  = full_df.loc[test_df.index]

# 5) 学習 / 推論データ
# XGBoost は pandas DataFrame をそのまま扱えるため、LightGBM の to_numeric_np はここでは不要ですが、
# LightGBM のために残しておきます。
X_train, y_train = train_fe[features], train_fe[TARGET]
X_test           = test_fe[features]

# カテゴリー列を数値に変換 (XGBoost も内部で処理できますが、明示的に変換しておくことも可能です)
# ここでは LightGBM の to_numeric_np 関数を再利用します。
def to_numeric_np(df: pd.DataFrame) -> np.ndarray:
    """
    - category は codes(int32)
    - object が残っていれば factorize(int32)
    - 数値 / bool はそのまま
    """
    df_num = df.copy()

    # category → int32
    for col in cat_cols:
        df_num[col] = df_num[col].cat.codes.astype('int32')

    # 念のため残っている object 列を factorize
    obj_cols = df_num.select_dtypes('object').columns
    for col in obj_cols:
        df_num[col] = pd.factorize(df_num[col], sort=True)[0].astype('int32')

    return df_num.to_numpy(dtype=np.float32)

# 5. 時系列 CV & LightGBM 学習

In [14]:
# =========================================================
# 5. LightGBM / XGBoost 時系列 CV 学習
# =========================================================
!pip install xgboost==2.0.3 -q # XGBoost をインストール

import xgboost as xgb

tscv = TimeSeriesSplit(n_splits=CFG.n_folds, test_size=CFG.test_size_hr)
oof_lgb = np.zeros(len(X_train))
pred_lgb = np.zeros(len(X_test))
oof_xgb = np.zeros(len(X_train))
pred_xgb = np.zeros(len(X_test))

# XGBoost のパラメータ例
# LightGBM とは異なるパラメータ名と値になることが多いです。
xgb_params = {
    'objective': 'reg:squarederror', # 回帰問題の目的関数
    'eval_metric': 'rmse',           # 評価指標
    'eta': 0.05,                      # 学習率
    'max_depth': 8,                   # 木の最大深度
    'subsample': 0.8,                 # 各木を構築する際のデータのサンプリング率
    'colsample_bytree': 0.8,          # 各木を構築する際の列のサンプリング率
    'seed': CFG.seed,
    'nthread': -1,                    # 使用するスレッド数 (-1で全スレッド)
    'tree_method': 'hist',            # 大規模データ向きの高速なツリー構築アルゴリズム
    'disable_default_eval_metric': 1  # デフォルトの評価指標を使わない
}

for fold, (tr_idx, val_idx) in enumerate(tscv.split(X_train)):
    print(f'\n---- Fold {fold} ----')
    X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    # LightGBM の学習
    print('  Training LightGBM...')
    lgb_train = lgb.Dataset(X_tr, label=y_tr, categorical_feature=lgb_cat_features, free_raw_data=False)
    lgb_val   = lgb.Dataset(X_val, label=y_val, categorical_feature=lgb_cat_features, free_raw_data=False)

    model_lgb = lgb.train(
        CFG.lgb_params,
        lgb_train,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train','valid'],
        callbacks=[
            lgb.early_stopping(CFG.early_stop, verbose=True),
            lgb.log_evaluation(500)
        ]
    )
    # LightGBM OOF
    oof_lgb[val_idx] = model_lgb.predict(
        to_numeric_np(X_val),
        num_iteration=model_lgb.best_iteration
    )

    # LightGBM TEST
    pred_lgb += model_lgb.predict(
        to_numeric_np(X_test),
        num_iteration=model_lgb.best_iteration
    ) / CFG.n_folds

    # XGBoost の学習
    print('  Training XGBoost...')
    # XGBoost は DMatrix 形式に変換するのが一般的ですが、DataFrame も扱えます。
    # ここでは簡単のため DataFrame のまま渡します。
    # カテゴリー特徴量は、通常は数値に変換済みである必要があります。
    model_xgb = xgb.XGBRegressor(**xgb_params)

    model_xgb.fit(
        # 修正点: XGBoost に渡すデータを数値に変換
        to_numeric_np(X_tr), y_tr,
        eval_set=[(to_numeric_np(X_val), y_val)], # 修正点: eval_set のデータも数値に変換
        early_stopping_rounds=CFG.early_stop,
        verbose=True # ここでは log_evaluation の代わりに verbose を使用
    )

    # XGBoost OOF
    oof_xgb[val_idx] = model_xgb.predict(to_numeric_np(X_val))

    # XGBoost TEST
    pred_xgb += model_xgb.predict(to_numeric_np(X_test)) / CFG.n_folds


# モデルごとの OOF RMSE を計算
rmse_lgb = np.sqrt(mean_squared_error(y_train, oof_lgb))
print(f'\nLightGBM OOF RMSE = {rmse_lgb:.4f}')

rmse_xgb = np.sqrt(mean_squared_error(y_train, oof_xgb))
print(f'XGBoost OOF RMSE = {rmse_xgb:.4f}')

# 複数モデルの予測を組み合わせる例 (平均)
# oof_combined = (oof_lgb + oof_xgb) / 2
# pred_combined = (pred_lgb + pred_xgb) / 2
# rmse_combined = np.sqrt(mean_squared_error(y_train, oof_combined))
# print(f'Combined OOF RMSE (Average) = {rmse_combined:.4f}')

# 提出には、性能の良い方のモデル、または組み合わせたモデルの予測を使用します。
# ここでは、例として LightGBM の予測を使用します。
final_pred = pred_lgb
final_oof = oof_lgb


---- Fold 0 ----
  Training LightGBM...
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[106]	train's rmse: 0.946248	valid's rmse: 2.76023
  Training XGBoost...
[0]	validation_0-rmse:10.89218
[1]	validation_0-rmse:10.40095
[2]	validation_0-rmse:9.94333
[3]	validation_0-rmse:9.62205
[4]	validation_0-rmse:9.21176
[5]	validation_0-rmse:8.82370
[6]	validation_0-rmse:8.44663
[7]	validation_0-rmse:8.09366
[8]	validation_0-rmse:7.77449
[9]	validation_0-rmse:7.45800
[10]	validation_0-rmse:7.17376
[11]	validation_0-rmse:6.89717
[12]	validation_0-rmse:6.62802
[13]	validation_0-rmse:6.37152
[14]	validation_0-rmse:6.19019
[15]	validation_0-rmse:6.05146
[16]	validation_0-rmse:5.83775
[17]	validation_0-rmse:5.63792
[18]	validation_0-rmse:5.45871
[19]	validation_0-rmse:5.31029
[20]	validation_0-rmse:5.15027
[21]	validation_0-rmse:4.99448
[22]	validation_0-rmse:4.85149
[23]	validation_0-rmse:4.74758
[24]	validation_0-rmse:4.61993
[25]	validation_0-rmse

# 6. 提出ファイル生成

In [15]:
# =========================================================
# 6. 提出ファイル生成
# =========================================================
sub = pd.DataFrame({
    'time': test_df.index.strftime('%Y-%m-%d %H:%M:%S'),
    'price_actual': final_pred # 最終的に使用する予測
})
save_path = os.path.join(CFG.data_path, 'submission_enhanced.csv') # ファイル名を変更
sub.to_csv(save_path, index=False)
print('saved to', save_path)


saved to /content/drive/MyDrive/ML/Signate_1634/submission_enhanced.csv
