<a href="https://colab.research.google.com/github/Hidenori24/Signate_colab/blob/main/SMBC_2025_2ndVer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0. ライブラリセットアップ


In [32]:
# ============================================
# 0. ライブラリ & CFG 定義
# ============================================
!pip -q install lightgbm==4.3.0 polars==0.20.19 holidays==0.42

import os, random, math, gc, pickle, warnings
import numpy as np
import pandas as pd
import polars as pl
import lightgbm as lgb
import holidays
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from tqdm.notebook import tqdm

warnings.filterwarnings('ignore')

# ---------- CFG ----------
class CFG:
    seed         = 42
    n_folds      = 5
    early_stop   = 300
    num_boost_round = 20_000
    test_size_hr = 4380          # ≒6ヶ月
    lags         = [1, 24]
    rolls        = [24, 168]
    data_path    = '/content/drive/MyDrive/ML/Signate_1634/'
    use_polars   = False         # True にすると FE 後が高速
    lgb_params = {
        'objective'      : 'regression',
        'metric'         : 'rmse',
        'learning_rate'  : 0.05,
        'num_leaves'     : 256,
        'subsample'      : 0.8,
        'colsample_bytree': 0.8,
        'seed'           : seed,
        'verbose'        : -1,
    }

# set seed
random.seed(CFG.seed)
np.random.seed(CFG.seed)


# 1. Google Drive マウント


In [33]:
# ============================================
# 1. Google Drive マウント
# ============================================
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


# 2. データ読み込み

In [34]:
# =========================================================
# 2. データ読み込み
#    - index を DatetimeIndex（UTC）に
# =========================================================
def read_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path)                 # まずは普通に読み込み
    df['time'] = pd.to_datetime(df['time'], utc=True)   # ①文字列→datetime(UTC)
    df['time'] = df['time'].dt.tz_convert(None)         # ②タイムゾーン情報を外す（naive へ）
    df = df.set_index('time').sort_index()              # ③DatetimeIndex として設定
    return df

train_df = read_data(os.path.join(CFG.data_path, 'train.csv'))
test_df  = read_data(os.path.join(CFG.data_path, 'test.csv'))

print('train', train_df.shape, 'test', test_df.shape)

train (26280, 91) test (8760, 90)


# 3. 特徴量生成

In [35]:
# ============================================
# 3. 特徴量エンジニアリング
# ============================================
es_holidays = holidays.country_holidays('ES', years=range(2015, 2019))
holiday_set = set(es_holidays.keys())  # set で高速判定

def add_calendar(df):
    idx = df.index
    df['hour']       = idx.hour
    df['dow']        = idx.dayofweek
    df['month']      = idx.month
    df['is_weekend'] = (df['dow'] >= 5).astype(np.int8)
    df['is_holiday'] = np.isin(idx.date, list(holiday_set)).astype(np.int8)

    df['sin_hour'] = np.sin(2*np.pi*df['hour']/24)
    df['cos_hour'] = np.cos(2*np.pi*df['hour']/24)
    df['sin_dow']  = np.sin(2*np.pi*df['dow']/7)
    df['cos_dow']  = np.cos(2*np.pi*df['dow']/7)
    return df

def add_supply_gap(df):
    gen_cols = [c for c in df.columns if c.startswith('generation_')]
    df['supply_total'] = df[gen_cols].sum(axis=1)
    df['gap_supply_demand'] = df['supply_total'] - df['total_load_actual']

    ren_cols = [c for c in gen_cols if any(k in c for k in ['solar','wind','hydro'])]
    df['renewable_ratio'] = df[ren_cols].sum(axis=1) / df['supply_total']
    return df

def add_lag_roll(df, lags=CFG.lags, rolls=CFG.rolls):
    for l in lags:
        df[f'price_lag_{l}']  = df['price_actual'].shift(l)
        df[f'demand_lag_{l}'] = df['total_load_actual'].shift(l)
    for r in rolls:
        df[f'price_rollmean_{r}'] = df['price_actual'].shift(1).rolling(r).mean()
        df[f'gap_rollstd_{r}']    = df['gap_supply_demand'].shift(1).rolling(r).std()
    return df

def make_features(full):
    full = add_calendar(full)
    full = add_supply_gap(full)
    full = add_lag_roll(full)
    return full

# train+test を縦結合して一括 FE
full_df = pd.concat(
    [train_df, test_df.assign(price_actual=np.nan)],
    axis=0
)
full_df = make_features(full_df)

# 欠損補完（時系列なので forward → 数値列平均）
full_df = full_df.fillna(method='ffill')
num_cols = full_df.select_dtypes(include=[np.number, 'bool']).columns
full_df[num_cols] = full_df[num_cols].fillna(full_df[num_cols].mean())

print('FE 完了:', full_df.shape)


FE 完了: (35040, 111)


# 4. 数値列 & カテゴリー列を分離

In [36]:
# ============================================
# 4. 数値列 & カテゴリー列を分離
#    ─ LightGBM にカテゴリーを渡すオプション付き ─
# ============================================
TARGET = 'price_actual'

cat_cols = [c for c in full_df.columns if c.endswith(('weather_main',
                                                      'weather_description',
                                                      'weather_icon'))]
full_df[cat_cols] = full_df[cat_cols].astype('category')  # dtype=category に変換

features = [c for c in full_df.columns if c != TARGET]    # すべて渡す

train_fe = full_df.loc[train_df.index]
test_fe  = full_df.loc[test_df.index]

def to_lgb_matrix(df: pd.DataFrame, cat_cols: list) -> pd.DataFrame:
    df_num = df.copy()
    for col in cat_cols:
        df_num[col] = df_num[col].cat.codes.astype('int32')
    return df_num



X_train, y_train = train_fe[features], train_fe[TARGET]

# 5. 時系列 CV & LightGBM 学習

In [37]:
# ============================================
# 5. LightGBM 時系列 CV 学習  ★修正版
# ============================================
tscv = TimeSeriesSplit(n_splits=CFG.n_folds, test_size=CFG.test_size_hr)
oof = np.zeros(len(X_train))
pred= np.zeros(len(X_test))

for fold, (tr_idx, val_idx) in enumerate(tscv.split(X_train)):
    print(f'\n---- Fold {fold} ----')
    X_tr, y_tr = X_train.iloc[tr_idx], y_train.iloc[tr_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]

    lgb_train = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols, free_raw_data=False)
    lgb_val   = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_cols, free_raw_data=False)

    model = lgb.train(
        CFG.lgb_params,
        lgb_train,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train','valid'],
        callbacks=[
            lgb.early_stopping(CFG.early_stop, verbose=True),
            lgb.log_evaluation(500)
        ]
    )

    oof[val_idx] = model.predict(
        to_lgb_matrix(X_val, cat_cols),
        num_iteration=model.best_iteration
    )

    pred += model.predict(
        to_lgb_matrix(X_test, cat_cols),
        num_iteration=model.best_iteration
    ) / CFG.n_folds

rmse = mean_squared_error(y_train, oof, squared=False)
print(f'\nOOF RMSE = {rmse:.4f}')



---- Fold 0 ----
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[96]	train's rmse: 1.12473	valid's rmse: 2.77683


ValueError: train and valid dataset categorical_feature do not match.

# 6. 提出ファイル生成

In [None]:
# =========================================================
# 6. 提出ファイル生成
# =========================================================
sub = pd.DataFrame({
    'time': test_df.index.strftime('%Y-%m-%d %H:%M:%S'),
    'price_actual': pred
})
save_path = os.path.join(CFG.data_path, 'submission_lgbm_cfg.csv')
sub.to_csv(save_path, index=False)
print('saved to', save_path)
