In [None]:
!pip uninstall -y pandas seaborn numpy matplotlib scikit-learn xgboost --quiet

In [None]:
!pip install pandas seaborn numpy matplotlib scikit-learn xgboost scikit-optimize lightgbm optuna --quiet

In [None]:
import numpy as np

# Sanity check: show which NumPy release the kernel imports after the reinstall.
numpy_version = np.__version__
print(f"numpy version: {numpy_version}")

In [None]:
!pip3 freeze > requirements.txt

In [None]:
# --- standard library ---
import random as rn
import warnings
from datetime import datetime

# --- third-party ---
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import xgboost
from sklearn.model_selection import TimeSeriesSplit, KFold
from xgboost import XGBRegressor

# --- reproducibility: seed both NumPy's legacy global RNG and Python's `random` ---
RANDOM_SEED = 2025
np.random.seed(RANDOM_SEED)
rn.seed(RANDOM_SEED)

# --- notebook-wide display behaviour ---
# All warnings are silenced for the whole session; show every column when displaying frames.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# ==========================================================
# Column-name constants
# ==========================================================
BUILDING_COL = "building_number"
TIME_COL = "date_time"
TARGET_COL = "power_consumption"

# ============================================================
# 1) Feature sets (names must match the feature-engineering cell)
# ============================================================
# Type 1: weather, calendar and per-slot statistics only.
TYPE1_BASE = [
    "temperature",
    "humidity",
    "windspeed",
    "day_of_week",
    "month",
    "week",
    "is_holiday",
    "sin_hour",
    "cos_hour",
    "day_hour_mean",
    "day_hour_std",
    "holiday_hour_mean",
    "holiday_std",
    "THI",
    "WCT",
    "CDH",
    "is_peak_season",
]

TYPE1_DETAIL = [
    "summer_sin",
    "summer_cos",
    "day_max_temperature",
    "day_min_temperature",
    "day_mean_temperature",
    "day_diff_temperature",
]

# Type 2: everything in Type 1 plus building metadata columns.
TYPE2_BASE = [
    *TYPE1_BASE,
    "total_area",
    "cooling_area",
    "building_number",
    "building_type",
]

TYPE2_DETAIL = [
    *TYPE1_DETAIL,
    "pv_temp",
    "ess_pcs_std",
]

Data Load and Preprocessing

In [None]:
# Read the three raw CSV files (paths are relative to the notebook's working directory).
train, test, building_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/train.csv',
        '../data/raw/test.csv',
        '../data/raw/building_info.csv',
    )
)

In [None]:
# Korean -> English names for the hourly measurement columns; the same mapping is applied
# to train and test (keys that are absent from a frame are simply ignored by rename).
measurement_column_names = {
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
}
train = train.rename(columns=measurement_column_names)
test = test.rename(columns=measurement_column_names)
# The `num_date_time` column is deliberately kept on both frames (its drop was disabled).

# Korean -> English names for the building metadata table.
building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})

# Building-type labels translated to English.
translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '학교': 'University',
    '백화점': 'Department Store',
    '병원': 'Hospital',
    '상용': 'Commercial',
    'IDC(전화국)': 'IDC',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '호텔': 'Hotel'
}
building_info['building_type'] = building_info['building_type'].replace(translation_dict)

# The raw capacity columns use '-' for "not installed": first derive a 0/1 availability flag,
# then turn the capacity itself into a float with '-' mapped to 0.
# NOTE(review): `pcs_capacity` is left untouched here (still raw) — confirm that is intended.
for capacity_col, flag_col in (
    ('solar_power_capacity', 'solar_power_utility_binary'),
    ('ess_capacity', 'ess_utility_binary'),
):
    building_info[flag_col] = np.where(building_info[capacity_col] != '-', 1, 0)
    building_info[capacity_col] = building_info[capacity_col].replace('-', '0').astype(float)

# Attach the building metadata to every hourly row.
train = train.merge(building_info, on='building_number', how='left')
test = test.merge(building_info, on='building_number', how='left')

In [None]:
# ========================
# 이상치 마킹
# ========================

In [None]:
# Initialise the outlier flag on both frames (0: normal, 1: outlier).
train['outlier_detect'] = 0
test['outlier_detect'] = 0

# Rows whose power_consumption is exactly 0 are treated as outliers, together with the
# row immediately before and after them.
is_zero_reading = train['power_consumption'] == 0
zero_outlier_indices = train.index[is_zero_reading].tolist()

# The neighbour lookup is done per building (shift within each building's rows), so a zero
# on the first/last row of one building never flags a row that belongs to the adjacent
# building. This mirrors the same-building check in detect_outliers_iqr_expanded and does
# not rely on the index being a 0..n-1 range.
# Assumes each building's rows are stored in chronological order — TODO confirm.
zero_by_building = is_zero_reading.astype(int).groupby(train['building_number'])
has_zero_neighbour = (
    zero_by_building.shift(1).fillna(0) + zero_by_building.shift(-1).fillna(0)
) > 0
expanded_outlier_mask = is_zero_reading | has_zero_neighbour
expanded_outlier_indices = set(train.index[expanded_outlier_mask])

# Flag the expanded set of rows.
train.loc[expanded_outlier_mask, 'outlier_detect'] = 1

# Report how many rows were flagged.
outlier_count = (train['outlier_detect'] == 1).sum()
total_count = len(train)
print(f"총 데이터 개수: {total_count}")
print(f"power_consumption이 0인 원본 이상치 개수: {len(zero_outlier_indices)}")
print(f"앞뒤 1포인트 포함 확장된 이상치 개수: {outlier_count}")
print(f"확장된 이상치 비율: {outlier_count/total_count*100:.2f}%")

In [None]:
# IQR 방식을 이용한 이상치 탐지 (앞뒤 1포인트 포함)
def detect_outliers_iqr_expanded(df, column, building_col='building_number'):
    """
    Flag per-building IQR outliers of `column`, widened by one row on each side.

    For every distinct value of `building_col`, Q1/Q3 are computed from that building's
    strictly positive values only; a positive value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    is an outlier. Each outlier label plus label-1 and label+1 is collected, provided the
    neighbour is within 0..len(df)-1 and is an index label of the same building.
    A short summary is printed for every building that has positive data.

    Parameters:
        df: frame holding `column` and `building_col`. Neighbours are found by integer
            arithmetic on the index labels — assumes an integer 0..n-1 index; TODO confirm.
        column: name of the numeric column to screen.
        building_col: name of the grouping column.

    Returns:
        list of index labels to mark as outliers (order is not meaningful).
    """
    flagged_labels = []
    row_count = len(df)

    for building in df[building_col].unique():
        values = df.loc[df[building_col] == building, column]
        own_labels = set(values.index.tolist())  # set for O(1) same-building membership tests

        # Zeros (and negatives) are left out of the quartiles; zeros are flagged separately.
        positive_values = values[values > 0]
        if positive_values.empty:
            continue

        Q1 = positive_values.quantile(0.25)
        Q3 = positive_values.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        # Only positive values can be IQR outliers here.
        outside_fence = (values.lt(lower_bound) | values.gt(upper_bound)) & values.gt(0)
        original_outlier_indices = values.index[outside_fence].tolist()

        # Widen each hit by one row either side, staying inside this building.
        widened = set()
        for label in original_outlier_indices:
            for step in (-1, 0, 1):
                candidate = label + step
                if candidate < 0 or candidate >= row_count:
                    continue
                if candidate in own_labels:
                    widened.add(candidate)

        flagged_labels.extend(widened)

        print(f"건물 {building}: Q1={Q1:.2f}, Q3={Q3:.2f}, IQR={IQR:.2f}")
        print(f"  Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
        print(f"  원본 IQR 이상치 개수: {len(original_outlier_indices)}")
        print(f"  확장된 IQR 이상치 개수: {len(widened)}")

    return flagged_labels

# Run the widened IQR screen on power_consumption and merge its hits into the flag column.
print("=== IQR 방식 이상치 탐지 (앞뒤 1포인트 포함) ===")
iqr_outlier_indices = detect_outliers_iqr_expanded(train, 'power_consumption')
train.loc[iqr_outlier_indices, 'outlier_detect'] = 1  # adds to the zero-based flags set earlier

# Summary counts. `zero_outliers` is the number of raw zero rows (before widening);
# `iqr_outliers` is the number of labels returned by the IQR screen (after widening).
zero_outliers = len(zero_outlier_indices)
iqr_outliers = len(iqr_outlier_indices)
total_outliers = train['outlier_detect'].eq(1).sum()

print("\n=== 최종 이상치 현황 ===")
print(f"power_consumption=0 원본 이상치: {zero_outliers}개")
print(f"IQR 방식 확장 이상치: {iqr_outliers}개")
print(f"총 이상치 개수: {total_outliers}개")
print(f"전체 데이터 대비 이상치 비율: {total_outliers/len(train)*100:.2f}%")

In [None]:
# ==========================================================
# Undo outlier flags for selected buildings
# ==========================================================

# Buildings whose automatic (zero / IQR) outlier flags are discarded again.
exclude_outlier_removal_buildings = [
    1,                   # hotel
    13, 15, 23, 94,      # research institute
    8, 22, 46, 55, 87,   # university
    71, 25, 91, 93,      # apartment
    2, 51, 99,           # commercial
    30, 37, 43, 52, 64,  # IDC
    96,                  # other buildings
]

# Reset outlier_detect to 0 for every row of each listed building that exists in train.
present_buildings = set(train['building_number'].unique())
for building_id in exclude_outlier_removal_buildings:
    if building_id not in present_buildings:
        print(f"Warning: Building {building_id} not found in dataset")
        continue
    mask = train['building_number'] == building_id
    train.loc[mask, 'outlier_detect'] = 0
    print(f"Building {building_id}: {mask.sum()} rows restored (outlier_detect set to 0)")

# Report the flag state that remains after the reset.
print("\n=== 현재 이상치 마킹 상태 ===")
rows_by_building = train.groupby('building_number')
outlier_count_by_building = rows_by_building['outlier_detect'].sum().sort_index()
rows_per_building = rows_by_building.size()
total_outliers = train['outlier_detect'].sum()
total_rows = len(train)

print(f"전체 데이터: {total_rows:,} rows")
print(f"전체 이상치: {total_outliers:,} rows ({total_outliers/total_rows*100:.2f}%)")
print("\n건물별 이상치 개수:")
for building_id, count in outlier_count_by_building.items():
    building_total = rows_per_building.loc[building_id]
    print(f"  Building {building_id:2d}: {count:4d} outliers ({count/building_total*100:5.2f}%)")



In [None]:
# ==========================================================
# 특정 기간 이상치 마킹
# ==========================================================

def mark_outlier_periods(df):
    """
    Flag hand-picked anomalous periods per building by setting outlier_detect = 1.

    The periods are the ones identified in reference/xgb_type.py; there the rows were
    dropped, here they are only marked. `df` is modified in place and also returned.

    Period format (a [start, end] pair, both ends inclusive):
    - date only:   ['2024-07-07', '2024-07-08']              -> every row of those days
    - with time:   ['2024-07-07 14:00', '2024-07-08 09:00']  -> exact timestamp range
    Whether a period is "date only" is decided from its start string alone.

    Parameters:
        df: frame with 'building_number', 'date_time' and 'outlier_detect' columns.
            If 'date_time' is not yet a datetime column it is parsed with '%Y%m%d %H'.

    Returns:
        The same `df` object with the additional rows flagged.
    """

    print("=== 특정 기간 이상치 마킹 시작 ===")

    # Make sure date_time is a datetime column before comparing against timestamps.
    if not pd.api.types.is_datetime64_any_dtype(df['date_time']):
        print("date_time을 datetime 타입으로 변환 중...")
        df['date_time'] = pd.to_datetime(df['date_time'], format='%Y%m%d %H')

    initial_outliers = (df['outlier_detect'] == 1).sum()

    # 1. Other Buildings
    other_buildings_config = [
        {'building_id': 7, 'periods': [
            ['2024-07-07 10:00', '2024-07-08 11:00'],
            ['2024-07-12 14:00', '2024-08-06 03:00']
        ]},
        {'building_id': 26, 'periods': [
            ['2024-06-17 14:00', '2024-06-18 11:00']
        ]},
        {'building_id': 69, 'periods': [
            ['2024-06-02', '2024-06-02']
        ]},
        {'building_id': 82, 'periods': [
            ['2024-07-17 14:00', '2024-07-17 14:00']
        ]}
    ]

    # 2. IDC
    idc_config = [
        {'building_id': 30, 'periods': [
            ['2024-07-13 20:00', '2024-07-13 20:00'],
            ['2024-07-25 00:00', '2024-07-25 00:00']
        ]},
        {'building_id': 36, 'periods': [
            ['2024-07-21 00:00', '2024-07-21 23:00']
        ]},
        {'building_id': 43, 'periods': [
            ['2024-06-10 17:00', '2024-06-10 18:00'],
            ['2024-08-12 16:00', '2024-08-12 17:00']
        ]},
        {'building_id': 52, 'periods': [
            ['2024-08-10 00:00', '2024-08-10 02:00']
        ]},
        {'building_id': 57, 'periods': [
            ['2024-06-01', '2024-06-07']
        ]},
        {'building_id': 67, 'periods': [
            ['2024-07-26', '2024-07-27'],
            ['2024-08-01 15:00', '2024-08-01 17:00']
        ]}
    ]

    # 3. Hospital
    hospital_config = [
        {'building_id': 17, 'periods': [
            ['2024-06-25 20:00', '2024-06-26 08:00']
        ]},
        {'building_id': 44, 'periods': [
            ['2024-06-06 12:00', '2024-06-06 14:00']
        ]}
    ]

    # 4. Commercial
    commercial_config = [
        {'building_id': 20, 'periods': [
            ['2024-06-01 10:00', '2024-06-01 11:00'],
            ['2024-06-09 10:00', '2024-06-09 10:00']
        ]},
        {'building_id': 41, 'periods': [
            ['2024-07-17 09:00', '2024-07-17 15:00']
        ]},
        {'building_id': 51, 'periods': [
            ['2024-06-30', '2024-06-30']
        ]},
    ]

    # 5. Apartment
    apartment_config = [
        {'building_id': 25, 'periods': [
            ['2024-07-04 12:00', '2024-07-04 14:00']
        ]},
        {'building_id': 65, 'periods': [
            ['2024-06-01', '2024-06-09']
        ]},
        {'building_id': 70, 'periods': [
            ['2024-06-04 09:00', '2024-06-05 08:00']
        ]}
    ]

    # 6. Research Institute
    research_config = [
        {'building_id': 49, 'periods': [
            ['2024-06-15 09:00', '2024-06-15 11:00'],
            ['2024-07-06', '2024-07-07'],
            ['2024-08-17', '2024-08-18'],
            ['2024-08-22', '2024-08-22'],
        ]},
        {'building_id': 53, 'periods': [
            ['2024-06-14 16:00', '2024-06-17 10:00'],
            ['2024-08-18 15:00', '2024-08-19 09:00']
        ]},
        {'building_id': 83, 'periods': [
            ['2024-07-17 14:00', '2024-07-17 14:00'],
        ]},
        {'building_id': 94, 'periods': [
            ['2024-07-26 18:00', '2024-08-05 05:00']
        ]}
    ]

    # 7. University
    university_config = [
        {'building_id': 8, 'periods': [
            ['2024-07-21 08:00', '2024-07-21 11:00'],
            ['2024-08-24 09:00', '2024-08-24 23:00']
        ]},
        {'building_id': 12, 'periods': [
            ['2024-07-21 08:00', '2024-07-21 11:00'],
            ['2024-08-24 08:00', '2024-08-24 10:00']
        ]},
        {'building_id': 55, 'periods': [
            ['2024-07-17 14:00', '2024-07-17 14:00'],
        ]},
        {'building_id': 87, 'periods': [
            ['2024-06-01', '2024-06-30']
        ]}
    ]

    # 8. Hotel
    hotel_config = [
        {'building_id': 89, 'periods': [
            ['2024-07-12 00:00', '2024-07-12 23:00']
        ]}
    ]

    # 9. Public
    public_config = [
        {'building_id': 38, 'periods': [
            ['2024-07-17 14:00', '2024-07-17 15:00']
        ]},
        {'building_id': 72, 'periods': [
            ['2024-06-11 00:00', '2024-06-11 02:00']
        ]},
        {'building_id': 92, 'periods': [
            ['2024-07-17 14:00', '2024-07-18 04:00']
        ]}
    ]

    # 10. Department Store
    department_store_config = [
        {'building_id': 19, 'periods': [
            ['2024-07-31 13:00', '2024-07-31 16:00']
        ]},
        {'building_id': 32, 'periods': [
            ['2024-07-08 09:00', '2024-07-08 10:00']
        ]},
        {'building_id': 40, 'periods': [
            ['2024-07-14 00:00', '2024-07-14 01:00']
        ]},
        {'building_id': 45, 'periods': [
            ['2024-07-04 00:00', '2024-07-04 03:00']
        ]},
        {'building_id': 73, 'periods': [
            ['2024-07-08 22:00', '2024-07-08 22:00']
        ]},
        {'building_id': 79, 'periods': [
            ['2024-08-19 03:00', '2024-08-19 05:00']
        ]},
        {'building_id': 95, 'periods': [
            ['2024-08-05 10:00', '2024-08-05 11:00']
        ]},
        # Disabled experiment ("Monday removal") — whole days 00:00-23:00 that were once
        # marked for these department stores and are currently NOT applied:
        #   19: 2024-06-10, 2024-07-08, 2024-08-19
        #   45: 2024-06-10, 2024-07-08, 2024-08-19
        #   54: 2024-06-17, 2024-07-01, 2024-08-19
        #   74: 2024-06-17, 2024-07-01
        #   79: 2024-06-17, 2024-07-01, 2024-08-19
        #   95: 2024-07-08, 2024-08-05
    ]

    # All ten building types, in reporting order.
    all_configs = [
        ('건물기타 (Other Buildings)', other_buildings_config),
        ('IDC(전화국)', idc_config),
        ('병원 (Hospital)', hospital_config),
        ('상용 (Commercial)', commercial_config),
        ('아파트 (Apartment)', apartment_config),
        ('연구소 (Research Institute)', research_config),
        ('학교 (University)', university_config),
        ('호텔 (Hotel)', hotel_config),
        ('공공 (Public)', public_config),
        ('백화점 (Department Store)', department_store_config)
    ]

    # Midnight-normalised timestamp of every row, computed once; used for date-only periods
    # so that whole days are matched without building Python date objects per period.
    row_day = df['date_time'].dt.normalize()

    for building_type_name, configs in all_configs:
        # A real newline here (the previous "\\n" printed a literal backslash-n).
        print(f"\n--- {building_type_name} 처리 ---")
        type_marked = 0

        if not configs:  # nothing configured for this building type
            print(f"  {building_type_name}: 설정된 이상치 기간 없음")
            continue

        for config in configs:
            building_id = config['building_id']
            in_building = df['building_number'] == building_id

            building_marked = 0
            for start_text, end_text in config['periods']:
                start_stamp = pd.to_datetime(start_text)
                end_stamp = pd.to_datetime(end_text)

                if len(start_text.split()) > 1:
                    # Start string carries a time: compare exact timestamps (inclusive).
                    in_period = (df['date_time'] >= start_stamp) & (df['date_time'] <= end_stamp)
                    print(f"  건물 {building_id}: {start_text} ~ {end_text} (시간 포함)", end=" | ")
                else:
                    # Date only: match every row whose calendar day is within the range.
                    in_period = (row_day >= start_stamp.normalize()) & (row_day <= end_stamp.normalize())
                    print(f"  건물 {building_id}: {start_text} ~ {end_text} (전체 날짜)", end=" | ")

                condition = in_building & in_period
                marked_count = condition.sum()
                df.loc[condition, 'outlier_detect'] = 1
                building_marked += marked_count
                print(f"{marked_count}개 마킹")

            type_marked += building_marked

        print(f"  {building_type_name} 총 마킹: {type_marked}개")

    final_outliers = (df['outlier_detect'] == 1).sum()
    newly_marked = final_outliers - initial_outliers

    print("\n=== 특정 기간 이상치 마킹 완료 ===")
    print(f"기존 이상치: {initial_outliers}개")
    print(f"새로 마킹된 이상치: {newly_marked}개")
    print(f"최종 이상치: {final_outliers}개")
    print(f"전체 데이터 대비 이상치 비율: {final_outliers/len(df)*100:.2f}%")

    return df

# Convert date_time to a datetime column on both frames before marking periods.
print("=== 날짜 데이터 타입 확인 및 변환 ===")
if not pd.api.types.is_datetime64_any_dtype(train['date_time']):
    print("train 데이터의 date_time을 datetime 타입으로 변환 중...")
    train['date_time'] = pd.to_datetime(train['date_time'], format='%Y%m%d %H')

if not pd.api.types.is_datetime64_any_dtype(test['date_time']):
    print("test 데이터의 date_time을 datetime 타입으로 변환 중...")
    test['date_time'] = pd.to_datetime(test['date_time'], format='%Y%m%d %H')

# Flag the hand-picked periods on the training data.
train = mark_outlier_periods(train)

# Apply the same periods to test when it carries the flag column.
# (The messages now start with a real newline; "\\n" used to print a literal backslash-n.)
if 'outlier_detect' in test.columns:
    print("\n=== Test 데이터에도 같은 기간 이상치 마킹 ===")
    test = mark_outlier_periods(test)
else:
    print("\nTest 데이터에는 outlier_detect 컬럼이 없어 건너뜁니다.")

Feature Engineering

In [None]:
# ==========================================================
# Feature Engineering - Type 1 & Type 2 Features
# Based on FEATURE.md specifications
# ==========================================================

print("=== Feature Engineering 시작 ===")

# Guard: parse date_time on either frame if it is not a datetime column yet.
for frame in (train, test):
    if not pd.api.types.is_datetime64_any_dtype(frame['date_time']):
        frame['date_time'] = pd.to_datetime(frame['date_time'], format='%Y%m%d %H')

# ==========================================================
# Basic calendar features
# ==========================================================
print("1. 기본 시간 피처 생성...")

for frame in (train, test):
    stamps = frame['date_time']
    frame['hour'] = stamps.dt.hour
    frame['day'] = stamps.dt.day
    frame['month'] = stamps.dt.month
    frame['day_of_week'] = stamps.dt.dayofweek  # Monday=0 ... Sunday=6
    frame['week'] = stamps.dt.isocalendar().week  # ISO week number

# ==========================================================
# Per-building holiday flag (is_holiday) - based on HOLIDAY.md
# ==========================================================
print("2. 건물별 휴일 피처 생성...")

def apply_building_specific_holidays(df):
    """
    Set the 0/1 column `is_holiday` from the per-building holiday calendar (HOLIDAY.md).

    `is_holiday` is (re)initialised to 0 on `df`, then set to 1 for every row whose building
    matches a rule group and whose date matches one of that group's holidays. A holiday is
    either 'M/D' (that month/day, any year), 'bimonthly_sunday' (2nd and 4th Sunday of the
    month) or 'bimonthly_monday' (2nd and 4th Monday of the month).

    Parameters:
        df: frame with 'building_number', 'building_type' and a datetime 'date_time' column.

    Returns:
        The same `df` object (modified in place).
    """
    df['is_holiday'] = 0

    # Rule groups. 'buildings' is a list of building numbers, or the sentinel
    # 'all_hospital' meaning every row whose building_type is 'Hospital'.
    holiday_config = {
        # Commercial
        'Commercial': {
            'buildings': [2, 6, 16, 20, 51, 86],
            'holidays': ['6/6', '8/15']
        },
        'Commercial_56': {
            'buildings': [56],
            'holidays': ['8/15']
        },

        # Department Store
        'Department_Store_bimonthly_sunday': {
            'buildings': [27, 40, 59, 63],
            'holidays': ['bimonthly_sunday']  # 2nd and 4th Sunday
        },
        'Department_Store_bimonthly_monday': {
            'buildings': [32],
            'holidays': ['bimonthly_monday']  # 2nd and 4th Monday
        },
        'Department_Store_45': {
            'buildings': [45],
            'holidays': ['6/10', '7/8', '8/19']
        },
        # NOTE(review): the disabled "Monday removal" list in mark_outlier_periods uses
        # 2024-07-01 (not 7/8) for building 54 — confirm which date is right.
        'Department_Store_54': {
            'buildings': [54],
            'holidays': ['6/17', '7/8', '8/19']
        },
        'Department_Store_74': {
            'buildings': [74],
            'holidays': ['6/17', '7/1']
        },
        'Department_Store_79': {
            'buildings': [79],
            'holidays': ['6/17', '7/1', '8/19']
        },
        'Department_Store_95': {
            'buildings': [95],
            'holidays': ['7/8', '8/5']
        },

        # Hospital
        'Hospital': {
            'buildings': 'all_hospital',  # every hospital
            'holidays': ['6/6', '8/15']
        },

        # IDC
        'IDC_43_52': {
            'buildings': [43, 52],
            'holidays': ['6/6', '8/15']
        },
        'IDC_64_67': {
            'buildings': [64, 67],
            'holidays': ['8/15']
        },

        # Other Buildings
        # NOTE(review): building 67 is also listed under IDC_64_67 (8/15 only); because of
        # this entry it additionally gets 6/6 — verify the building number against HOLIDAY.md.
        'Other_Buildings': {
            'buildings': [47, 67],
            'holidays': ['6/6', '8/15']
        },

        # Public
        'Public': {
            'buildings': [38, 50, 66, 68, 72, 80],
            'holidays': ['6/6', '8/15']
        },

        # Research Institute
        'Research_Institute_basic': {
            'buildings': [13, 15, 37, 49, 53, 62, 83],
            'holidays': ['6/6', '8/15']
        },
        'Research_Institute_extended': {
            'buildings': [23, 94],
            'holidays': ['6/6', '6/7', '8/15', '8/16']
        },

        # University
        'University': {
            'buildings': [5, 8, 12, 14, 22, 24, 46, 55, 60, 87],
            'holidays': ['6/6', '8/15']
        }
    }

    def _date_mask(rule):
        """Boolean row mask for one holiday rule, independent of the building."""
        stamps = df['date_time'].dt
        if rule in ('bimonthly_sunday', 'bimonthly_monday'):
            weekday = 6 if rule == 'bimonthly_sunday' else 0
            # 0-based count of how many times this weekday has already occurred in the
            # month: 1 -> second occurrence, 3 -> fourth occurrence.
            nth_in_month = (stamps.day - 1) // 7
            return (stamps.dayofweek == weekday) & ((nth_in_month == 1) | (nth_in_month == 3))
        # Fixed date written as 'M/D'.
        month, day = map(int, rule.split('/'))
        return (stamps.month == month) & (stamps.day == day)

    for group in holiday_config.values():
        buildings = group['buildings']
        if buildings == 'all_hospital':
            building_mask = df['building_type'] == 'Hospital'
        else:
            building_mask = df['building_number'].isin(buildings)

        for holiday in group['holidays']:
            df.loc[building_mask & _date_mask(holiday), 'is_holiday'] = 1

    return df

# Apply the per-building holiday calendar to both frames and report the flagged row counts.
train = apply_building_specific_holidays(train)
test = apply_building_specific_holidays(test)

print(f"Train 휴일 데이터: {(train['is_holiday'] == 1).sum()}개")
print(f"Test 휴일 데이터: {(test['is_holiday'] == 1).sum()}개")

# ==========================================================
# Cyclic hour-of-day features (sin_hour, cos_hour)
# ==========================================================
print("3. 주기적 시간 피처 생성...")

for frame in (train, test):
    # Map the hour onto a 24-hour circle.
    hour_angle = 2 * np.pi * frame['hour'] / 24.0
    frame['sin_hour'] = np.sin(hour_angle)
    frame['cos_hour'] = np.cos(hour_angle)

# ==========================================================
# Daily temperature statistics
# ==========================================================
print("4. 일별 온도 통계 피처 생성...")

def calculate_daily_temperature_stats(dataframe):
    """
    Append per-building daily temperature statistics to every row.

    Temperature is aggregated per (building_number, month, day) into
    day_max_temperature, day_min_temperature and day_mean_temperature, left-merged back
    onto the rows, and day_diff_temperature (max - min) is added.

    Parameters:
        dataframe: frame with 'building_number', 'month', 'day' and 'temperature'.

    Returns:
        A new frame (the result of the merge) with the four extra columns.
    """
    day_keys = ['building_number', 'month', 'day']

    per_day = (
        dataframe.groupby(day_keys)['temperature']
        .agg(
            day_max_temperature='max',
            day_min_temperature='min',
            day_mean_temperature='mean',
        )
        .reset_index()
    )

    enriched = dataframe.merge(per_day, on=day_keys, how='left')
    # Daily temperature range.
    enriched['day_diff_temperature'] = (
        enriched['day_max_temperature'] - enriched['day_min_temperature']
    )
    return enriched

train = calculate_daily_temperature_stats(train)
test = calculate_daily_temperature_stats(test)

# ==========================================================
# Summer-season features (summer_sin, summer_cos)
# ==========================================================
print("5. 여름 계절 피처 생성...")

# June..August mapped to phases 0, 1, 2 of a three-step cycle; other months give NaN.
summer_months = {6: 0, 7: 1, 8: 2}
for frame in (train, test):
    # The phase is kept in a local Series instead of a temporary 'summer_phase' column:
    # the previous `df = df.drop(...)` only rebound the loop variable, so the helper
    # column was never actually removed from train/test.
    summer_phase = frame['month'].map(summer_months)
    frame['summer_sin'] = np.sin(2 * np.pi * summer_phase / 3.0)
    frame['summer_cos'] = np.cos(2 * np.pi * summer_phase / 3.0)

# ==========================================================
# Weather indices (THI, WCT, CDH)
# ==========================================================
print("6. 날씨 지수 피처 생성...")

# THI (Temperature-Humidity Index)
for frame in (train, test):
    frame['THI'] = 9/5 * frame['temperature'] - 0.55 * (1 - frame['humidity']/100) * (9/5 * frame['temperature'] - 26) + 32

# WCT (Wind Chill Temperature)
# NOTE(review): the commonly published wind-chill formula uses 0.6215 (not 0.6125) for the
# temperature term and wind speed in km/h, while `windspeed` here comes from a m/s column.
# Left unchanged so existing feature values stay the same — confirm whether this is intended.
for frame in (train, test):
    frame['WCT'] = 13.12 + 0.6125 * frame['temperature'] - 11.37 * (frame['windspeed'] ** 0.16) + \
                   0.3965 * (frame['windspeed'] ** 0.16) * frame['temperature']

# CDH (Cooling Degrees Hours) 계산
def calculate_cdh(temperatures):
    """CDH 계산 - 26도를 기준으로 누적 냉방도 계산"""
    # 수정: 26도 이하일 때는 0으로 처리
    temp_diff = np.maximum(temperatures - 26, 0)
    cumsum = np.cumsum(temp_diff)
    # 11시간 이후부터는 sliding window 적용
    return np.concatenate((cumsum[:11], cumsum[11:] - cumsum[:-11]))

def calculate_and_add_cdh(dataframe):
    """Compute CDH per building and return it aligned with the rows of ``dataframe``.

    The result is a list with one value per row, in the same order as the rows,
    so it can be assigned directly as a column. Within a building, readings are
    processed in the order they appear in ``dataframe``.

    Building ids are taken from the data rather than assumed to be 1..100, and
    the frame does not have to be sorted by building. Rows whose
    ``building_number`` is missing are skipped by the grouping and keep 0.0.
    """
    temperatures = dataframe['temperature'].to_numpy()
    cdh_values = np.zeros(len(dataframe), dtype=float)
    # `indices` maps every building to the integer row positions it occupies,
    # which lets each building's result be written back to exactly its own rows.
    positions_by_building = dataframe.groupby('building_number', sort=False).indices
    for positions in positions_by_building.values():
        cdh_values[positions] = calculate_cdh(temperatures[positions])
    return cdh_values.tolist()

# Store the CDH values as a new column on each split; the list returned by
# calculate_and_add_cdh is assigned positionally, one value per row.
train['CDH'] = calculate_and_add_cdh(train)
test['CDH'] = calculate_and_add_cdh(test)

# ==========================================================
# Peak-season feature (is_peak_season)
# ==========================================================
print("7. 성수기 피처 생성...")

# Peak-season windows per building number (taken from a reference, per the
# original note). 'start' and 'end' are both calendar days that belong to the window.
peak_season_config = {
    9: [{'start': '2024-07-13', 'end': '2024-08-31'}],
    10: [{'start': '2024-07-04', 'end': '2024-08-22'}],
    28: [{'start': '2024-07-18', 'end': '2024-08-31'}],
    77: [{'start': '2024-07-18', 'end': '2024-08-31'}],
    89: [{'start': '2024-07-17', 'end': '2024-08-31'}],
    98: [{'start': '2024-07-15', 'end': '2024-08-31'}],
    100: [{'start': '2024-07-15', 'end': '2024-08-31'}],
}

for df in [train, test]:
    df['is_peak_season'] = 0
    for building_id, seasons in peak_season_config.items():
        is_building = df['building_number'] == building_id
        for season in seasons:
            start_date = pd.to_datetime(season['start'])
            # The 'end' string parses to midnight at the START of the last day.
            # Comparing timestamps with `<= end` would therefore drop every
            # reading after 00:00 on that day, so use an exclusive bound at the
            # following midnight to keep the whole end day inside the window.
            end_exclusive = pd.to_datetime(season['end']) + pd.Timedelta(days=1)
            condition = (
                is_building &
                (df['date_time'] >= start_date) &
                (df['date_time'] < end_exclusive)
            )
            df.loc[condition, 'is_peak_season'] = 1

# ==========================================================
# Clean data used to compute the statistical features
# ==========================================================
print("8. 통계 피처 계산용 데이터 준비...")

# Statistics are computed only from rows not flagged as outliers.
train_for_stats = train.loc[train['outlier_detect'] == 0].copy()

# ==========================================================
# Statistical features (day_hour_mean, day_hour_std, holiday_hour_mean, holiday_std)
# ==========================================================
print("9. 통계 피처 생성...")

# Mean / std of power_consumption per (building, hour, day of week).
day_hour_stats = (
    train_for_stats
    .groupby(['building_number', 'hour', 'day_of_week'])['power_consumption']
    .agg(day_hour_mean='mean', day_hour_std='std')
    .reset_index()
)

# std is NaN for single-row groups; treat that as zero spread.
day_hour_stats['day_hour_std'] = day_hour_stats['day_hour_std'].fillna(0)

# Mean / std of power_consumption per (building, hour) on holidays only.
holiday_data = train_for_stats.loc[train_for_stats['is_holiday'] == 1]
print(f"휴일 데이터 개수: {len(holiday_data)}")

if len(holiday_data) == 0:
    # No holiday rows: fall back to an empty table with the expected columns.
    holiday_hour_stats = pd.DataFrame(columns=['building_number', 'hour', 'holiday_hour_mean', 'holiday_std'])
    print("휴일 데이터가 없어서 빈 통계 생성")
else:
    holiday_hour_stats = (
        holiday_data
        .groupby(['building_number', 'hour'])['power_consumption']
        .agg(holiday_hour_mean='mean', holiday_std='std')
        .reset_index()
    )

    # std is NaN for single-row groups; treat that as zero spread.
    holiday_hour_stats['holiday_std'] = holiday_hour_stats['holiday_std'].fillna(0)
    print(f"휴일 통계 생성: {len(holiday_hour_stats)}개")

# Merge the statistical features into train and test
print("10. 통계 피처 병합...")

# Day-of-week / hour statistics: plain left join on the grouping keys.
day_hour_keys = ['building_number', 'hour', 'day_of_week']
train = train.merge(day_hour_stats, on=day_hour_keys, how='left')
test = test.merge(day_hour_stats, on=day_hour_keys, how='left')

# Holiday statistics default to zero on every row.
for df in [train, test]:
    df['holiday_hour_mean'] = 0.0
    df['holiday_std'] = 0.0

# Fill in the holiday statistics on holiday rows only, when a table exists.
if len(holiday_hour_stats) > 0:
    for df in [train, test]:
        holiday_mask = df['is_holiday'] == 1
        if not holiday_mask.any():
            continue

        # Look up the (building, hour) statistics for the holiday rows; the left
        # join keeps the row order, so the values can be written back positionally.
        holiday_df = df.loc[holiday_mask, ['building_number', 'hour']].merge(
            holiday_hour_stats, on=['building_number', 'hour'], how='left'
        )
        if len(holiday_df) == 0:
            continue

        for stat_col in ('holiday_hour_mean', 'holiday_std'):
            df.loc[holiday_mask, stat_col] = holiday_df[stat_col].fillna(0).values

# ==========================================================
# Type 2-only features (building features)
# ==========================================================
print("11. Type 2 전용 건물 피처 확인...")

# total_area and cooling_area are expected to exist already (merged from
# building_info, per the original note); building_number and building_type too.
# Nothing is verified here -- this section only prints a message.
print("건물 관련 피처들이 이미 존재합니다.")

# ==========================================================
# Features used only by Type 2 detail_2
# ==========================================================
print("12. Type 2 detail_2 전용 피처 생성...")

# pv_temp: interaction between solar power capacity and temperature
for df in [train, test]:
    df['pv_temp'] = df['solar_power_capacity'] * df['temperature']

# ess_pcs_std: standard deviation of ess_capacity per (building, hour),
# computed from the outlier-free training rows.
# NOTE(review): if ess_capacity is a static per-building attribute this std is
# always 0 and the feature carries no information -- confirm the intent.
if len(train_for_stats) == 0:
    # No rows to compute statistics from: initialise to zero.
    train['ess_pcs_std'] = 0.0
    test['ess_pcs_std'] = 0.0
else:
    ess_stats = (
        train_for_stats
        .groupby(['building_number', 'hour'])['ess_capacity']
        .agg(ess_pcs_std='std')
        .reset_index()
    )

    # std is NaN when a group has a single value; use 0 instead.
    ess_stats['ess_pcs_std'] = ess_stats['ess_pcs_std'].fillna(0)

    # Join onto both splits.
    train = train.merge(ess_stats, on=['building_number', 'hour'], how='left')
    test = test.merge(ess_stats, on=['building_number', 'hour'], how='left')

    # Rows without a matching group get 0.
    train['ess_pcs_std'] = train['ess_pcs_std'].fillna(0)
    test['ess_pcs_std'] = test['ess_pcs_std'].fillna(0)

# ==========================================================
# Feature clean-up and verification
# ==========================================================
print("13. 피처 생성 완료 및 확인...")

# NaN handling: a missing statistic (no matching group) is replaced by 0.
stats_columns = ['day_hour_mean', 'day_hour_std', 'holiday_hour_mean', 'holiday_std']
for df in [train, test]:
    available_stats = [col for col in stats_columns if col in df.columns]
    for col in available_stats:
        df[col] = df[col].fillna(0)

# Type 1 base_1 feature list
type1_base1_features = [
    'temperature', 'humidity', 'windspeed', 'day_of_week', 'month', 'week',
    'is_holiday', 'sin_hour', 'cos_hour', 'day_hour_mean', 'day_hour_std',
    'holiday_hour_mean', 'holiday_std', 'THI', 'WCT', 'CDH', 'is_peak_season'
]

# Additional features for Type 1 detail_1
type1_detail1_features = [
    'summer_sin', 'summer_cos', 'day_max_temperature', 'day_min_temperature',
    'day_mean_temperature', 'day_diff_temperature'
]

# Type 2 base_2 features (Type 1 base_1 + building features)
type2_base2_features = type1_base1_features + [
    'total_area', 'cooling_area', 'building_number', 'building_type'
]

# Additional features for Type 2 detail_2
type2_detail2_features = [
    'summer_sin', 'summer_cos', 'day_max_temperature', 'day_min_temperature',
    'day_mean_temperature', 'day_diff_temperature', 'pv_temp', 'ess_pcs_std'
]

# Summary of the resulting frame shapes and the size of each feature list.
print("=== 피처 생성 완료 ===")
print(f"Train 데이터 shape: {train.shape}")
print(f"Test 데이터 shape: {test.shape}")
print(f"Type 1 base_1 피처 수: {len(type1_base1_features)}")
print(f"Type 1 detail_1 추가 피처 수: {len(type1_detail1_features)}")
print(f"Type 2 base_2 피처 수: {len(type2_base2_features)}")
print(f"Type 2 detail_2 추가 피처 수: {len(type2_detail2_features)}")

# Check that every required feature exists in train
print("\n=== 피처 존재 여부 확인 ===")
# De-duplicate while keeping the declaration order. Going through `set` would
# make the order of the report (and of `missing_features`) vary between runs
# because string hashing is randomised per process.
all_required_features = list(dict.fromkeys(
    type1_base1_features + type1_detail1_features + type2_base2_features + type2_detail2_features
))
missing_features = []

for feature in all_required_features:
    if feature in train.columns:
        print(f"✓ {feature}")
    else:
        print(f"✗ {feature} - MISSING!")
        missing_features.append(feature)

if missing_features:
    print(f"\n누락된 피처: {missing_features}")
else:
    print("\n모든 필수 피처가 생성되었습니다!")

# Holiday row counts per building type
print(f"\n=== 건물별 휴일 데이터 확인 ===")
holiday_by_building_type = train[train['is_holiday'] == 1].groupby('building_type').size()
for building_type, count in holiday_by_building_type.items():
    print(f"{building_type}: {count}개")

# Total holiday row counts in each split
holiday_count_train = (train['is_holiday'] == 1).sum()
holiday_count_test = (test['is_holiday'] == 1).sum()
print(f"\nTrain 총 휴일 데이터: {holiday_count_train}개")
print(f"Test 총 휴일 데이터: {holiday_count_test}개")

In [None]:
print("=== 데이터 타입 수정 ===")

# Convert column dtypes to ones XGBoost accepts
def fix_dtypes_for_xgb(df):
    """Return a copy of ``df`` with XGBoost-friendly dtypes.

    Columns of dtype int64 / int32 / uint32 / uint64 are cast to float32 (one
    log line is printed per converted column), and ``building_type``, when
    present, is cast to ``category``. The input frame is left untouched.
    """
    df = df.copy()

    # Cast the selected integer columns to float32.
    numeric_cols = df.select_dtypes(include=['int64', 'int32', 'uint32', 'uint64']).columns
    for col in numeric_cols:
        # Capture the dtype BEFORE casting; reading it afterwards would always
        # log "float32 -> float32".
        original_dtype = df[col].dtype
        df[col] = df[col].astype('float32')
        print(f"  {col}: {original_dtype} -> float32")

    # Categorical column handling.
    if 'building_type' in df.columns:
        df['building_type'] = df['building_type'].astype('category')

    return df
# Apply the dtype conversion to train first, then test
train, test = fix_dtypes_for_xgb(train), fix_dtypes_for_xgb(test)

# Show any object / int64 columns that remain in each split.
for header, frame in ((f"Train dtypes 확인:", train), (f"\nTest dtypes 확인:", test)):
    print(header)
    print(frame.select_dtypes(include=['object', 'int64']).dtypes)

In [None]:
# Display the column index of train after feature engineering.
train.columns

In [None]:
import xgboost

# Report the xgboost version visible to this kernel.
print (f"xgboost version: {xgboost.__version__}")

In [None]:
"""
Energy Load Forecasting — A→D Pipeline (Hybrid HPO + Early Stopping + Run Folder Saving)

What's new vs prior version:
- HPO backend by stage: chosen via HPO_BACKEND_FOR_STAGE (currently every stage 1–6 maps to random; the optuna mapping for stages 4–6 is disabled)
- Optional Bayes backend (skopt) via FORCE_BACKEND='bayes'
- Early stopping:
  * CV fold: XGB/LGBM use eval_set on each fold; MLP uses early_stopping=True
  * Final refit: currently refits on the full training data with no hold-out (the time-based last-10% early-stopping split is commented out)
- Per-building vs Global: different HPO budgets
- Saving outputs under runs/{YYYYMMDD_HHMM}_{tag}/
"""

from __future__ import annotations

import os
import json
import math
import pathlib
from dataclasses import dataclass
from typing import List, Dict, Tuple, Optional

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import Ridge

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neural_network import MLPRegressor
from datetime import datetime

# ============================================================
# 0) Global Config
# ============================================================
TARGET_COL = "power_consumption"
TIME_COL = "num_date_time"         # time/ID column, same as the submission ID (assumed to be pre-combined)
BUILDING_COL = "building_number"

# CV defaults
DEFAULT_CV_TYPE = "kfold"      # 'kfold' | 'timesplit'
DEFAULT_N_SPLITS = 5
DEFAULT_GAP = 0                # forwarded to TimeSeriesSplit only
DEFAULT_RANDOM_STATE = 42

# Seeds (stage 5/6)
DEFAULT_SEEDS = [13, 21, 42, 77, 123]

# Search metric & alpha for weighted MSE
DEFAULT_SEARCH_METRIC = "weighted_mse"  # 'weighted_mse'|'smape'|'mae'|'mse'
DEFAULT_ALPHA = 3.0                     # exponent applied to |y_true| when weighting squared errors

# Early stopping config
ES_ROUNDS_XGB = 100
ES_ROUNDS_LGBM = 100
MLP_EARLY_STOP = True
MLP_N_NO_CHANGE = 20
MLP_VAL_FRAC = 0.1

# HPO budgets (n_trials) by scope & backend
BUDGETS = {
    "per_building": {"random": 20, "optuna": 40, "bayes": 40},
    "global":       {"random": 60, "optuna":120, "bayes": 80},
}

# Backend mapping by stage
# NOTE: every stage currently maps to "random"; the trailing marker on the
# second row suggests stages 4-6 were meant to use optuna.
HPO_BACKEND_FOR_STAGE = {
    1: "random", 2: "random", 3: "random",
    4: "random", 5: "random", 6: "random", # optuna (disabled)
}
# To force a backend globally (e.g., 'bayes'), set to 'random'|'optuna'|'bayes' or None to auto
FORCE_BACKEND: Optional[str] = None

# ============================================================
# 1) Feature Sets (kept in sync with the feature-engineering cell)
# ============================================================
# "is_peak_season" is produced by the feature-engineering step and is part of
# the base feature lists declared there; it is listed here as well so that this
# later redefinition does not silently drop it from every model.
TYPE1_BASE = [
    "temperature", "humidity", "windspeed",
    "day_of_week", "month", "week", "is_holiday",
    "sin_hour", "cos_hour",
    "day_hour_mean", "day_hour_std",
    "holiday_hour_mean", "holiday_std",
    "THI", "WCT", "CDH", "is_peak_season"
]
TYPE1_DETAIL = [
    "summer_sin", "summer_cos",
    "day_max_temperature", "day_min_temperature",
    "day_mean_temperature", "day_diff_temperature"
]

# Type 2 base = Type 1 base + static building descriptors.
TYPE2_BASE = [
    "temperature", "humidity", "windspeed",
    "day_of_week", "month", "week", "is_holiday",
    "sin_hour", "cos_hour",
    "day_hour_mean", "day_hour_std",
    "holiday_hour_mean", "holiday_std",
    "THI", "WCT", "CDH", "is_peak_season",
    "total_area", "cooling_area",
    "building_number", "building_type"
]
TYPE2_DETAIL = [
    "summer_sin", "summer_cos",
    "day_max_temperature", "day_min_temperature",
    "day_mean_temperature", "day_diff_temperature",
    "pv_temp", "ess_pcs_std"
]

@dataclass
class ModelSpec:
    """Describes one model variant: its label, its training scope and its input features."""
    name: str            # 'A'|'B'|'C'|'D'
    scope: str           # 'per_building' | 'global'
    features: List[str]  # feature column names requested for this variant

def build_model_specs() -> List[ModelSpec]:
    """Return the four model variants A-D in order.

    A and B are per-building models (base / base + detail Type 1 features);
    C and D are global models (base / base + detail Type 2 features).
    """
    spec_a = ModelSpec(name="A", scope="per_building", features=TYPE1_BASE)
    spec_b = ModelSpec(name="B", scope="per_building", features=TYPE1_BASE + TYPE1_DETAIL)
    spec_c = ModelSpec(name="C", scope="global", features=TYPE2_BASE)
    spec_d = ModelSpec(name="D", scope="global", features=TYPE2_BASE + TYPE2_DETAIL)
    return [spec_a, spec_b, spec_c, spec_d]

# ============================================================
# 2) Utils & Metrics
# ============================================================
def sanitize_features(df: pd.DataFrame, features: List[str]) -> List[str]:
    """Return the requested features that exist as columns of ``df``, in request order.

    Features that are not columns are reported with a single warning line on
    stdout and left out of the result.
    """
    kept: List[str] = []
    absent: List[str] = []
    for col in features:
        bucket = kept if col in df.columns else absent
        bucket.append(col)
    if absent:
        print(f"[warn] Missing features ignored: {absent}")
    return kept

def load_data(train_path: str, test_path: str, binfo_path: Optional[str]=None, submission_path: Optional[str]=None):
    """Read the CSV inputs and return ``(train, test, binfo, sub)``.

    ``binfo`` and ``sub`` are ``None`` when their path is not given (or empty).
    """
    def _read_if_given(path: Optional[str]):
        # Optional inputs: only touch the file system when a path was supplied.
        return pd.read_csv(path) if path else None

    return (
        pd.read_csv(train_path),
        pd.read_csv(test_path),
        _read_if_given(binfo_path),
        _read_if_given(submission_path),
    )

def merge_static(df: pd.DataFrame, binfo: Optional[pd.DataFrame]) -> pd.DataFrame:
    """Left-join the static building table onto ``df`` by building column.

    Returns ``df`` itself when ``binfo`` is ``None``. Raises ``KeyError`` when
    ``df`` lacks the building column.
    """
    if binfo is not None:
        if BUILDING_COL not in df.columns:
            raise KeyError(f"'{BUILDING_COL}' not in df")
        return df.merge(binfo, on=BUILDING_COL, how="left")
    return df

def smape(y_true: np.ndarray, y_pred: np.ndarray, eps: float=1e-6) -> float:
    """Symmetric mean absolute percentage error, in percent.

    Per element: ``2 * |pred - true| / (|true| + |pred| + eps)``; ``eps`` keeps
    the denominator non-zero when both values are 0.
    """
    actual = np.asarray(y_true)
    forecast = np.asarray(y_pred)
    scale = (np.abs(actual) + np.abs(forecast)) + eps
    ratios = 2.0 * np.abs(forecast - actual) / scale
    return float(np.mean(ratios) * 100.0)

def weighted_mse(y_true: np.ndarray, y_pred: np.ndarray, alpha: float=DEFAULT_ALPHA, eps: float=1e-6) -> float:
    """Mean squared error weighted by ``(|y_true| + eps) ** alpha``.

    Larger targets therefore dominate the average; ``eps`` keeps every weight
    strictly positive.
    """
    actual = np.asarray(y_true)
    forecast = np.asarray(y_pred)
    weights = (np.abs(actual) + eps) ** float(alpha)
    squared_error = (actual - forecast) ** 2
    return float(np.average(squared_error, weights=weights))

def make_search_scorer(metric: str = DEFAULT_SEARCH_METRIC):
    """Build a ``scorer(estimator, X, y)`` callable for the given metric name.

    The scorer returns the NEGATED metric so that higher is better (sklearn
    convention). An unknown metric name raises ``ValueError`` when the scorer
    is called, after the prediction has been made.
    """
    metric = metric.lower()

    def _score(estimator, X, y):
        predicted = estimator.predict(X)
        if metric == "weighted_mse":
            # alpha travels with the estimator as the `meta__alpha` parameter.
            alpha = estimator.get_params().get("meta__alpha", DEFAULT_ALPHA)
            return -weighted_mse(y, predicted, alpha=alpha)
        if metric == "smape":
            return -smape(y, predicted)
        if metric == "mae":
            return -mean_absolute_error(y, predicted)
        if metric == "mse":
            return -float(np.mean((y - predicted) ** 2))
        raise ValueError("Unknown search metric")

    return _score

def _metric_value(y_true: np.ndarray, y_pred: np.ndarray, metric: str, alpha: float) -> float:
    """Evaluate the named metric (lower is better); ``alpha`` is used by 'weighted_mse' only.

    Raises ``ValueError`` for an unknown metric name.
    """
    # Lazily evaluated so that only the requested metric is computed.
    calculators = {
        "weighted_mse": lambda: weighted_mse(y_true, y_pred, alpha=alpha),
        "smape": lambda: smape(y_true, y_pred),
        "mae": lambda: mean_absolute_error(y_true, y_pred),
        "mse": lambda: float(np.mean((y_true - y_pred) ** 2)),
    }
    key = metric.lower()
    if key not in calculators:
        raise ValueError("Unknown metric")
    return calculators[key]()

# ============================================================
# 3) Preprocess & Models (+alpha carrier)
# ============================================================
class AlphaCarrier(BaseEstimator, TransformerMixin):
    """Identity transformer that only stores ``alpha`` as an estimator parameter.

    Placed as the ``meta`` pipeline step so that ``alpha`` is reachable (and
    searchable) as ``meta__alpha``; the data passes through unchanged.
    """
    def __init__(self, alpha: float = DEFAULT_ALPHA):
        # Stored verbatim under the same name as the constructor argument.
        self.alpha = alpha
    def fit(self, X, y=None):
        # Nothing to learn.
        return self
    def transform(self, X):
        # Pass-through: returns the input object itself.
        return X

def make_preprocessor(feature_cols: List[str], categorical_cols: List[str]) -> ColumnTransformer:
    """One-hot encode the categorical columns that are among the features; pass the rest through.

    Unknown categories at transform time are ignored, and
    ``sparse_threshold=0.0`` is passed so the combined output is dense.
    """
    encoded_cols = list(filter(lambda col: col in feature_cols, categorical_cols))
    encoder = OneHotEncoder(handle_unknown="ignore")
    return ColumnTransformer(
        transformers=[("cat", encoder, encoded_cols)],
        remainder="passthrough",
        sparse_threshold=0.0,
    )

def make_pipeline(algo: str, feature_cols: List[str], categorical_cols: List[str], random_state: int, alpha: float=DEFAULT_ALPHA) -> Pipeline:
    """Build the ``meta -> prep -> [scaler] -> est`` pipeline for ``algo``.

    ``algo`` is one of 'xgb', 'lgbm', 'mlp'; anything else raises ``ValueError``.
    The ``meta`` step only carries ``alpha``. The ``prep`` step one-hot encodes
    categorical features for LGBM / MLP when at least one categorical column is
    among the features; XGBoost always receives the data unchanged (it is
    created with ``enable_categorical=True``). MLP additionally gets a
    ``StandardScaler`` before the estimator.
    """
    wants_encoder = algo in ["lgbm", "mlp"] and any(c in feature_cols for c in categorical_cols)
    prep_step = (
        "prep",
        make_preprocessor(feature_cols, categorical_cols) if wants_encoder else "passthrough",
    )
    meta_step = ("meta", AlphaCarrier(alpha=alpha))

    if algo == "xgb":
        booster = XGBRegressor(
            n_estimators=1000, learning_rate=0.05, max_depth=8,
            subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
            tree_method="hist", random_state=random_state, n_jobs=-1,
            enable_categorical=True
        )
        return Pipeline([meta_step, prep_step, ("est", booster)])

    if algo == "lgbm":
        # NOTE(review): LightGBM documents that row subsampling needs a non-zero
        # bagging frequency (`subsample_freq`); confirm `subsample` has an effect here.
        booster = LGBMRegressor(
            n_estimators=1000, learning_rate=0.05, num_leaves=64,
            subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
            random_state=random_state, n_jobs=-1
        )
        return Pipeline([meta_step, prep_step, ("est", booster)])

    if algo == "mlp":
        network = MLPRegressor(
            hidden_layer_sizes=(256, 128), activation="relu",
            solver="adam", learning_rate_init=1e-3, max_iter=500,
            early_stopping=MLP_EARLY_STOP, n_iter_no_change=MLP_N_NO_CHANGE,
            validation_fraction=MLP_VAL_FRAC, random_state=random_state
        )
        return Pipeline([meta_step, prep_step, ("scaler", StandardScaler()), ("est", network)])

    raise ValueError(f"Unknown algo: {algo}")

def default_search_spaces() -> Dict[str, Dict[str, List]]:
    """Return the candidate-value grids per algorithm ('xgb', 'lgbm', 'mlp').

    Keys are pipeline parameter names (``meta__*`` / ``est__*``). The key order
    inside each grid is significant for seeded random search, because values are
    drawn key by key.
    """
    alpha_grid = [3.0]

    def _boosting_space(shape_key: str, shape_values: List) -> Dict[str, List]:
        # XGB and LGBM share every grid except the tree-shape parameter, which
        # sits in the same (fourth) position for both.
        return {
            "meta__alpha": alpha_grid,
            "est__n_estimators": [600, 800, 1200, 1600],
            "est__learning_rate": [0.03, 0.05, 0.07, 0.1],
            shape_key: shape_values,
            "est__subsample": [0.6, 0.8, 1.0],
            "est__colsample_bytree": [0.6, 0.8, 1.0],
            "est__reg_lambda": [0.0, 0.5, 1.0, 2.0],
        }

    mlp_space = {
        "meta__alpha": alpha_grid,
        "est__hidden_layer_sizes": [(256,), (256,128), (512,256), (512,256,128)],
        "est__learning_rate_init": [5e-4, 1e-3, 2e-3],
        "est__alpha": [1e-5, 1e-4, 1e-3],
        "est__max_iter": [300, 500, 800],
    }

    return {
        "xgb": _boosting_space("est__max_depth", [4, 6, 8, 10]),
        "lgbm": _boosting_space("est__num_leaves", [31, 63, 127, 255]),
        "mlp": mlp_space,
    }

# ============================================================
# 4) CV Splitter
# ============================================================
def make_splitter(cv_type: str=DEFAULT_CV_TYPE, n_splits: int=DEFAULT_N_SPLITS, gap: int=DEFAULT_GAP):
    """Return a CV splitter: shuffled ``KFold`` for 'kfold', ``TimeSeriesSplit`` for 'timesplit'.

    ``gap`` is used by the time-series splitter only. Any other ``cv_type``
    raises ``ValueError``.
    """
    kind = cv_type.lower()
    if kind == "timesplit":
        return TimeSeriesSplit(n_splits=n_splits, gap=gap)
    if kind == "kfold":
        return KFold(n_splits=n_splits, shuffle=True, random_state=DEFAULT_RANDOM_STATE)
    raise ValueError("cv_type must be 'kfold' or 'timesplit'")

# ============================================================
# 5) HPO Backends (random / optuna / bayes) with Early Stopping per fold
# ============================================================
def _fit_with_es(est: Pipeline, algo: str, X_tr, y_tr, X_val=None, y_val=None, verbose: int=0):
    """Fit ``est`` on the training fold, with early stopping on the validation fold.

    For 'xgb' and 'lgbm', when both ``X_val`` and ``y_val`` are given, the
    estimator's ``early_stopping_rounds`` is set and the validation fold is
    passed as ``eval_set``. For any other algorithm (MLP carries its own early
    stopping in its parameters) or without a validation fold, this is a plain
    ``fit``. ``est`` is modified in place and returned.

    ``Pipeline.fit`` hands ``est__eval_set`` to the final estimator untouched,
    while the training data goes through every preceding step. The validation
    features are therefore run through the same preceding steps (fitted on the
    training fold) first, so that the final estimator sees both sets in the same
    feature space.
    """
    fit_params = {}
    has_val = X_val is not None and y_val is not None

    if has_val and algo in ("xgb", "lgbm"):
        X_val_est = X_val
        if len(est.steps) > 1:
            # Separate clone of the upstream steps: the pipeline itself refits
            # them inside est.fit below, on the same training data.
            upstream = clone(est[:-1])
            upstream.fit(X_tr, y_tr)
            X_val_est = upstream.transform(X_val)

        if algo == "xgb":
            est.set_params(est__early_stopping_rounds=ES_ROUNDS_XGB)
            fit_params.update({
                "est__eval_set": [(X_val_est, y_val)],
                "est__verbose": verbose,
            })
        else:
            # LightGBM >= 4 no longer accepts `verbose` in fit(); evaluation
            # logging is controlled through a callback (period 0 = silent).
            import lightgbm as lgb
            est.set_params(est__early_stopping_rounds=ES_ROUNDS_LGBM)
            fit_params.update({
                "est__eval_set": [(X_val_est, y_val)],
                "est__callbacks": [lgb.log_evaluation(period=int(verbose))],
            })

    est.fit(X_tr, y_tr, **fit_params)
    return est

def _sample_from_list(rng, values):
    return values[rng.integers(0, len(values))]

def run_random_search_cv(est, X, y, param_space, cv, search_metric, algo, n_iter=50, random_state=42):
    """Manual random search so each CV fold can pass its own ``eval_set`` for early stopping.

    Draws ``n_iter`` parameter sets (one value per key of ``param_space``, in
    key order), scores each by the mean fold score of the search scorer (higher
    is better), then refits a fresh clone with the best parameters on all of
    ``X, y`` without a validation set. Returns ``(fitted_estimator, best_params)``.
    ``X`` is indexed with ``.iloc`` and ``y`` positionally.
    """
    rng = np.random.default_rng(seed=random_state)
    scorer = make_search_scorer(search_metric)

    def _mean_fold_score(candidate):
        # Inner CV: every fold trains a fresh clone with early stopping on the
        # fold's validation part and is scored on that same part.
        scores = []
        for fit_idx, eval_idx in cv.split(X):
            X_fit, y_fit = X.iloc[fit_idx], y[fit_idx]
            X_eval, y_eval = X.iloc[eval_idx], y[eval_idx]
            fold_model = clone(candidate)
            _fit_with_es(fold_model, algo, X_fit, y_fit, X_eval, y_eval, verbose=0)
            scores.append(scorer(fold_model, X_eval, y_eval))
        return float(np.mean(scores))

    best_score, best_params = -np.inf, {}
    for _ in range(n_iter):
        trial_params = {name: _sample_from_list(rng, choices) for name, choices in param_space.items()}
        trial_score = _mean_fold_score(clone(est).set_params(**trial_params))
        # Strict comparison: the first of several equally good draws wins.
        if trial_score > best_score:
            best_score, best_params = trial_score, trial_params

    winner = clone(est).set_params(**best_params)
    # Fit on the full data without a validation set.
    winner.fit(X, y)
    return winner, best_params

def run_optuna_cv(est, X, y, param_space, cv, search_metric, algo, n_iter=50, random_state=42):
    """Optuna (TPE) search over ``param_space`` with per-fold early stopping.

    Each list in ``param_space`` becomes a search dimension: all-int lists turn
    into an integer range ``[min, max]``, all-float lists into a float range,
    anything else into a categorical choice. A trial's value is the mean fold
    score of the search scorer (higher is better). The best parameters are used
    to refit a fresh clone on all of ``X, y``.

    Returns ``(fitted_estimator, best_params)``.
    """
    import optuna
    scorer = make_search_scorer(search_metric)

    def suggest_from_list(trial, name, values):
        # Numeric lists are widened to the continuous/integer range they span.
        if all(isinstance(v, (int, np.integer)) for v in values):
            return trial.suggest_int(name, int(min(values)), int(max(values)))
        if all(isinstance(v, (float, np.floating)) for v in values):
            return trial.suggest_float(name, float(min(values)), float(max(values)))
        return trial.suggest_categorical(name, values)

    def objective(trial):
        params = {k: suggest_from_list(trial, k, vals) for k, vals in param_space.items()}
        est_ = clone(est).set_params(**params)
        scores = []
        for tr_idx, va_idx in cv.split(X):
            X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
            X_va, y_va = X.iloc[va_idx], y[va_idx]
            est_fold = clone(est_)
            _fit_with_es(est_fold, algo, X_tr, y_tr, X_va, y_va, verbose=0)
            score = scorer(est_fold, X_va, y_va)
            scores.append(score)
            # Report the fold score (step = number of folds done so far) so the
            # pruner can stop unpromising trials early.
            trial.report(score, step=len(scores))
            if trial.should_prune():
                raise optuna.TrialPruned()
        return float(np.mean(scores))

    # NOTE(review): with n_warmup_steps=5 and one step per fold, pruning
    # presumably never triggers unless the CV has more than 5 folds -- confirm.
    pruner = optuna.pruners.MedianPruner(n_warmup_steps=5)
    study = optuna.create_study(direction="maximize", pruner=pruner, sampler=optuna.samplers.TPESampler(seed=random_state))
    study.optimize(objective, n_trials=n_iter, gc_after_trial=True)

    best_params = study.best_trial.params
    best_est = clone(est).set_params(**best_params).fit(X, y)
    return best_est, best_params

def run_bayes_cv(est, X, y, param_space, cv, search_metric, algo, n_iter=50, random_state=42):
    """Bayesian optimisation via ``skopt.gp_minimize`` over categorical dimensions.

    Every list in ``param_space`` is treated as a categorical dimension. The
    objective is the negated mean fold score (``gp_minimize`` minimises, the
    scorer is higher-is-better). The best point is used to refit a fresh clone
    on all of ``X, y``. Returns ``(fitted_estimator, best_params)``.
    """
    from skopt import gp_minimize
    from skopt.space import Categorical
    from sklearn.base import clone
    scorer = make_search_scorer(search_metric)

    keys = list(param_space)
    dims = [Categorical(choices, name=key) for key, choices in param_space.items()]

    def objective(point):
        candidate = clone(est).set_params(**dict(zip(keys, point)))
        fold_scores = []
        for fit_idx, eval_idx in cv.split(X):
            X_fit, y_fit = X.iloc[fit_idx], y[fit_idx]
            X_eval, y_eval = X.iloc[eval_idx], y[eval_idx]
            fold_model = clone(candidate)
            _fit_with_es(fold_model, algo, X_fit, y_fit, X_eval, y_eval, verbose=0)
            fold_scores.append(scorer(fold_model, X_eval, y_eval))
        # Negate: maximising the scorer == minimising its negative.
        return -float(np.mean(fold_scores))

    result = gp_minimize(
        objective, dimensions=dims, n_calls=n_iter, random_state=random_state, verbose=False
    )
    best_params = dict(zip(keys, result.x))
    best_est = clone(est).set_params(**best_params).fit(X, y)
    return best_est, best_params

def run_hpo_backend(est, X, y, param_space, cv, search_metric, backend, algo, n_iter, random_state=42):
    """Dispatch to the random / optuna / bayes search routine named by ``backend`` (case-insensitive).

    Returns whatever the selected routine returns, i.e.
    ``(fitted_estimator, best_params)``. Unknown names raise ``ValueError``.
    """
    choice = backend.lower()
    if choice == "random":
        runner = run_random_search_cv
    elif choice == "optuna":
        runner = run_optuna_cv
    elif choice == "bayes":
        runner = run_bayes_cv
    else:
        raise ValueError("backend must be 'random'|'optuna'|'bayes'")
    return runner(est, X, y, param_space, cv, search_metric, algo, n_iter=n_iter, random_state=random_state)

# ============================================================
# 6) OOF Training + Final Refit with ES + Predictions
# ============================================================
def oof_fit_predict(
    pipeline: Pipeline,
    X: pd.DataFrame,
    y: np.ndarray,
    X_test: pd.DataFrame,
    do_search: bool,
    algo: str,
    param_spaces: Dict[str, Dict[str, List]],
    outer_splitter,
    inner_cv_type: str,
    inner_splits: int,
    random_state: int,
    search_metric: str,
    search_backend: str,
    search_n_iter: int,
) -> Tuple[np.ndarray, np.ndarray, Pipeline, List[Dict], List[float]]:
    """Run the outer CV loop and collect out-of-fold and test predictions.

    For every outer fold a clone of ``pipeline`` is either tuned with the chosen
    HPO backend on the training part (``do_search=True``) or fitted directly
    with early stopping on the validation part (``do_search=False``). The fold
    model then predicts its validation rows (written into the OOF vector) and
    ``X_test``.

    Returns:
        oof: array of length ``len(X)``; rows that never fall into a validation
            fold keep their initial value 0.
        test_pred: mean of the per-fold test predictions.
        last_est: the estimator fitted in the last fold.
        fold_best_params: per-fold chosen parameters (empty dict without search).
        fold_metric_vals: per-fold value of ``search_metric`` on the validation
            rows (raw metric, not negated).
    """
    oof = np.zeros(len(X))
    test_fold_preds = []
    last_est = None
    fold_smapes = []
    fold_best_params: List[Dict] = []
    fold_metric_vals: List[float] = []

    for fold, (tr_idx, val_idx) in enumerate(outer_splitter.split(X)):
        X_tr, y_tr = X.iloc[tr_idx], y[tr_idx]
        X_val, y_val = X.iloc[val_idx], y[val_idx]

        est = clone(pipeline)
        chosen_params = {}
        if do_search:
            # Inner CV operates on the outer training part only; gap is fixed to 0.
            inner_splitter = make_splitter(cv_type=inner_cv_type, n_splits=inner_splits, gap=0)
            est, chosen_params = run_hpo_backend(
                est, X_tr, y_tr,
                param_space=param_spaces.get(algo, {}),
                cv=inner_splitter,
                search_metric=search_metric,
                backend=search_backend,
                algo=algo,
                n_iter=search_n_iter,
                random_state=random_state,
            )
        else:
            # no search: just fit with ES on val
            # (the validation fold therefore serves both early stopping and scoring)
            est = _fit_with_es(est, algo, X_tr, y_tr, X_val, y_val, verbose=0)

        # Validate
        y_hat_val = est.predict(X_val)
        oof[val_idx] = y_hat_val
        sm = smape(y_val, y_hat_val)
        # alpha used for the weighted metric is read back from the fitted pipeline
        alpha_here = est.get_params().get("meta__alpha", DEFAULT_ALPHA)
        mv = _metric_value(y_val, y_hat_val, search_metric, alpha_here)
        fold_smapes.append(sm)
        fold_metric_vals.append(mv)
        fold_best_params.append(chosen_params)
        print(f"[val] fold {fold+1}/{getattr(outer_splitter, 'n_splits', '?')}  SMAPE={sm:.3f}% | {search_metric}={mv:.6f}")

        # Test via current fold model (for completeness; final_refit will also run)
        test_fold_preds.append(est.predict(X_test))
        last_est = est

    print(f"[val] mean SMAPE: {np.mean(fold_smapes):.3f}% (±{np.std(fold_smapes):.3f})")
    # Average the per-fold test predictions element-wise.
    test_pred = np.mean(test_fold_preds, axis=0)
    return oof, test_pred, last_est, fold_best_params, fold_metric_vals

def _final_refit_with_holdout(est: Pipeline, algo: str, X: pd.DataFrame, y: np.ndarray, holdout_frac: float=0.1):
    """Refit ``est`` on all of ``X, y`` and return the result of ``est.fit``.

    Despite the name, no hold-out split and no early stopping are applied here:
    ``algo`` and ``holdout_frac`` are accepted but currently unused. An earlier
    variant fitted on the leading rows and used the last ``holdout_frac`` of
    rows for early stopping; it is no longer active.
    """
    return est.fit(X, y)

def train_predict_for_spec(
    spec: ModelSpec,
    algo: str,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    do_search: bool,
    seeds: List[int],
    cv_type: str,
    n_splits: int,
    param_spaces: Dict[str, Dict[str, List]],
    gap: int,
    inner_cv_type: Optional[str],
    inner_splits: Optional[int],
    search_metric: str,
    search_backend: str,
    search_n_iter: int,
    do_final_refit: bool=True,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Train one model spec with one algorithm and return ``(pred_df, oof_df)``.

    For ``spec.scope == 'per_building'`` a separate seed ensemble is trained for
    every building present in both frames; otherwise a single seed ensemble is
    trained on all rows. For each seed the outer CV produces OOF predictions;
    test predictions come from a final refit on all training rows using the
    parameters of the fold with the lowest metric value (``do_final_refit``), or
    from the last fold's model otherwise. Results are averaged across seeds.

    Returns:
        pred_df: TIME_COL, BUILDING_COL and ``pred_{spec.name}_{algo}`` for test rows.
        oof_df: TIME_COL, BUILDING_COL and ``oof`` for training rows.

    Raises:
        ValueError: in per-building mode when no training building has any test rows.
    """
    inner_cv_type = inner_cv_type or cv_type
    inner_splits = inner_splits or 3
    feat_cols = sanitize_features(train_df, spec.features)
    # Only the global models see the categorical building type.
    categorical = ["building_type"] if spec.scope == "global" else []
    pred_col = f"pred_{spec.name}_{algo}"

    def _pipe(rs: int) -> Pipeline:
        return make_pipeline(algo=algo, feature_cols=feat_cols, categorical_cols=categorical, random_state=rs, alpha=DEFAULT_ALPHA)

    def _fit_seed_ensemble(X_tr, y_tr, X_te, label: str):
        """Train one model per seed on (X_tr, y_tr); return seed-averaged (test_pred, oof)."""
        outer_splitter = make_splitter(cv_type=cv_type, n_splits=n_splits, gap=gap)

        seed_oofs = []
        seed_final_preds = []
        for s in seeds:
            oof, _, last_est, fold_params, fold_scores = oof_fit_predict(
                _pipe(s), X_tr, y_tr, X_te, do_search, algo, param_spaces,
                outer_splitter, inner_cv_type, inner_splits, s,
                search_metric=search_metric,
                search_backend=search_backend,
                search_n_iter=search_n_iter,
            )
            seed_oofs.append(oof)

            # fold_scores hold raw metric values, so the smallest one is best.
            best_idx = int(np.argmin(fold_scores)) if fold_scores else 0
            best_params = fold_params[best_idx] if fold_params else {}

            if do_final_refit:
                final_model = clone(_pipe(s))
                if best_params:
                    final_model.set_params(**best_params)
                final_model = _final_refit_with_holdout(final_model, algo, X_tr, y_tr, holdout_frac=0.1)
                # In-sample diagnostics only; not used for model selection.
                train_pred_full = final_model.predict(X_tr)
                f_smape = smape(y_tr, train_pred_full)
                f_r2 = r2_score(y_tr, train_pred_full)
                print(f"[final-fit][Spec {spec.name} | Algo {algo} | {label}] Final Train SMAPE={f_smape:.4f}% | R2={f_r2:.4f} | Params={best_params}")
                seed_final_preds.append(final_model.predict(X_te))
            else:
                seed_final_preds.append(last_est.predict(X_te))

        return np.mean(seed_final_preds, axis=0), np.mean(seed_oofs, axis=0)

    if spec.scope == "per_building":
        all_pred_parts = []
        all_oof_parts = []
        for b_id, tr_b in train_df.groupby(BUILDING_COL):
            te_b = test_df.loc[test_df[BUILDING_COL] == b_id]
            if te_b.empty:
                # Nothing to predict for this building.
                continue
            te_avg, oof_avg = _fit_seed_ensemble(
                tr_b[feat_cols], tr_b[TARGET_COL].values, te_b[feat_cols], f"B{b_id}"
            )

            pred_b = te_b[[TIME_COL, BUILDING_COL]].copy()
            pred_b[pred_col] = te_avg
            all_pred_parts.append(pred_b)

            oof_b = tr_b[[TIME_COL, BUILDING_COL]].copy()
            oof_b["oof"] = oof_avg
            all_oof_parts.append(oof_b)

        if not all_pred_parts:
            raise ValueError(
                f"No building in train_df has matching rows in test_df (column '{BUILDING_COL}')"
            )
        pred_df = pd.concat(all_pred_parts, ignore_index=True)
        oof_df = pd.concat(all_oof_parts, ignore_index=True)

    else:  # global
        te_avg, oof_avg = _fit_seed_ensemble(
            train_df[feat_cols], train_df[TARGET_COL].values, test_df[feat_cols], "GLOBAL"
        )

        pred_df = test_df[[TIME_COL, BUILDING_COL]].copy()
        pred_df[pred_col] = te_avg

        oof_df = train_df[[TIME_COL, BUILDING_COL]].copy()
        oof_df["oof"] = oof_avg

    return pred_df, oof_df

# ============================================================
# 7) Ensembling
# ============================================================
from functools import reduce
def simple_mean_ensemble(pred_dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """Average every ``pred_*`` column across the given prediction frames.

    The frames are left-joined one after another on ``[TIME_COL, BUILDING_COL]``
    (the first frame defines the rows), then all columns whose name starts with
    ``"pred_"`` are averaged row-wise into an ``"answer"`` column.

    Args:
        pred_dfs: Prediction frames; each must contain ``TIME_COL`` and
            ``BUILDING_COL``. The input frames are never modified.

    Returns:
        A new DataFrame with columns ``[TIME_COL, BUILDING_COL, "answer"]``.

    Raises:
        ValueError: If ``pred_dfs`` is empty.
    """
    if not pred_dfs:
        raise ValueError("simple_mean_ensemble requires at least one prediction frame")

    keys = [TIME_COL, BUILDING_COL]
    merged = reduce(lambda l, r: l.merge(r, on=keys, how="left"), pred_dfs)
    pred_cols = [c for c in merged.columns if c.startswith("pred_")]

    # With a single input frame `reduce` hands back that very object, so the
    # result is assembled on a copy instead of assigning into `merged`;
    # otherwise the caller's frame would silently gain an "answer" column.
    out = merged[keys].copy()
    out["answer"] = merged[pred_cols].mean(axis=1)
    return out

def stacking_ensemble(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    oof_pack: List[Tuple[pd.DataFrame, str]],
    pred_pack: List[Tuple[pd.DataFrame, str]],
    meta_model = None,
) -> pd.DataFrame:
    """Fit a meta-model on out-of-fold predictions and apply it to test predictions.

    Args:
        train_df: Training frame; ``TIME_COL``, ``BUILDING_COL`` and
            ``TARGET_COL`` are read from it to attach the target to the OOF rows.
        test_df: Not read by this function; kept in the signature for callers.
        oof_pack: ``(frame, name)`` pairs. Each frame carries ``TIME_COL``,
            ``BUILDING_COL`` and an ``"oof"`` column, which is renamed to ``name``.
        pred_pack: ``(frame, name)`` pairs whose frames already expose the test
            prediction under a column called ``name`` (same names as ``oof_pack``).
        meta_model: Object with ``fit(X, y)`` / ``predict(X)``. Defaults to
            ``Ridge(alpha=1.0, random_state=DEFAULT_RANDOM_STATE)`` when ``None``.

    Returns:
        DataFrame with ``[TIME_COL, BUILDING_COL, "answer"]`` where ``"answer"``
        is the meta-model prediction for each row of the merged test predictions.

    Raises:
        ValueError: If either pack is empty, or if no training row has a
            complete set of OOF predictions plus a target.
    """
    if not oof_pack or not pred_pack:
        raise ValueError("stacking_ensemble requires non-empty oof_pack and pred_pack")

    # Explicit `is None` test: a truthiness check (`meta_model or ...`) calls
    # bool() on the supplied object, which for objects defining `__len__` can
    # be False or even raise, silently swapping in / breaking on a valid model.
    if meta_model is None:
        meta_model = Ridge(alpha=1.0, random_state=DEFAULT_RANDOM_STATE)

    keys = [TIME_COL, BUILDING_COL]
    names = [name for _, name in oof_pack]

    # Build the OOF feature matrix: one column per base model, then the target.
    oof_merged = None
    for df, name in oof_pack:
        df2 = df.rename(columns={"oof": name})
        oof_merged = df2 if oof_merged is None else oof_merged.merge(df2, on=keys, how="left")
    oof_merged = oof_merged.merge(train_df[keys + [TARGET_COL]], on=keys, how="left")

    # Test-side matrix: frames already carry their model name as column name.
    pred_merged = None
    for df, _ in pred_pack:
        pred_merged = df if pred_merged is None else pred_merged.merge(df, on=keys, how="left")

    # Only rows with every base-model OOF value and a target can train the
    # meta-model; left joins above can leave NaNs for unmatched rows.
    usable = oof_merged[names + [TARGET_COL]].notna().all(axis=1)
    if not usable.any():
        raise ValueError("no training rows with complete OOF predictions and target")

    X_meta = oof_merged.loc[usable, names].values
    y_meta = oof_merged.loc[usable, TARGET_COL].values
    X_meta_test = pred_merged[names].values

    meta_model.fit(X_meta, y_meta)
    meta_pred = meta_model.predict(X_meta_test)

    out = pred_merged[keys].copy()
    out["answer"] = meta_pred
    return out

# ============================================================
# 8) Stage Runner + Backend auto-select + Save helpers
# ============================================================
def _backend_for_stage(stage: int) -> str:
    """Pick the search backend for a stage.

    A truthy module-level ``FORCE_BACKEND`` wins; otherwise the stage is looked
    up in ``HPO_BACKEND_FOR_STAGE``, falling back to ``"random"`` for stages
    that have no entry.
    """
    return FORCE_BACKEND or HPO_BACKEND_FOR_STAGE.get(stage, "random")

def _budget_for(scope: str, backend: str) -> int:
    """Return the entry stored in ``BUDGETS`` for this scope and backend.

    Both lookups are plain indexing, so an unknown ``scope`` or ``backend``
    raises ``KeyError``.
    """
    per_backend = BUDGETS[scope]
    return per_backend[backend]

def run_stage(
    stage: int,
    train: pd.DataFrame,
    test: pd.DataFrame,
    submission_template: pd.DataFrame,
    algorithms: List[str],
    do_search: bool,
    seeds: List[int],
    cv_type: str,
    n_splits: int,
    param_spaces: Dict[str, Dict[str, List]],
    stacking: bool = False,
    gap: int = DEFAULT_GAP,
    inner_cv_type: Optional[str] = None,
    inner_splits: Optional[int] = None,
    search_metric: str = DEFAULT_SEARCH_METRIC,
    force_backend: Optional[str] = None,
) -> Tuple[pd.DataFrame, List[pd.DataFrame], List[Tuple[pd.DataFrame, str]]]:
    """Train every (spec, algorithm) pair, ensemble the results, build a submission.

    Args:
        stage: Stage number; used to pick the search backend when
            ``force_backend`` is not given.
        train, test: Frames forwarded unchanged to ``train_predict_for_spec``
            (and to ``stacking_ensemble`` when ``stacking`` is true).
        submission_template: Frame that must contain ``TIME_COL``; its rows
            define the rows of the returned submission.
        algorithms: Algorithm keys, each trained for every spec returned by
            ``build_model_specs()``.
        do_search, seeds, cv_type, n_splits, param_spaces, gap, inner_cv_type,
        inner_splits, search_metric: Forwarded to ``train_predict_for_spec``.
        stacking: If true, combine models with ``stacking_ensemble`` (Ridge
            meta-model); otherwise with ``simple_mean_ensemble``.
        force_backend: Overrides the backend chosen by ``_backend_for_stage``.

    Returns:
        ``(submission, preds, oofs)`` where ``submission`` has columns
        ``[TIME_COL, "answer"]``, ``preds`` is the list of per-model test
        prediction frames and ``oofs`` is a list of ``(oof_frame, name)`` pairs
        with ``name == f"{spec.name}_{algo}"``.

    Raises:
        KeyError: If ``submission_template`` has no ``TIME_COL`` column.
    """
    # Fail fast: validate the template BEFORE training so a malformed template
    # does not surface only after every model has been fitted.
    if TIME_COL not in submission_template.columns:
        raise KeyError("submission_template must include TIME_COL")

    specs = build_model_specs()
    preds = []
    oofs = []

    # backend choice
    backend = force_backend or _backend_for_stage(stage)

    for spec in specs:
        # per-building vs global budget
        search_n_iter = _budget_for(spec.scope, backend)
        for algo in algorithms:
            print(f"\n=== Training Spec={spec.name}({spec.scope}) Algo={algo} | backend={backend} trials={search_n_iter} ===")
            pred_df, oof_df = train_predict_for_spec(
                spec=spec, algo=algo, train_df=train, test_df=test,
                do_search=do_search, seeds=seeds,
                cv_type=cv_type, n_splits=n_splits,
                param_spaces=param_spaces, gap=gap,
                inner_cv_type=inner_cv_type, inner_splits=inner_splits,
                search_metric=search_metric, search_backend=backend,
                search_n_iter=search_n_iter,
                do_final_refit=True,
            )
            preds.append(pred_df)
            oofs.append((oof_df, f"{spec.name}_{algo}"))

    if stacking:
        # Build pred_pack with unified names: the test prediction column of
        # each model is renamed to the same name its OOF column will get.
        pred_pack = []
        for (oof_df, name), pred_df in zip(oofs, preds):
            col = [c for c in pred_df.columns if c.startswith("pred_")][0]
            pred_pack.append((pred_df.rename(columns={col: name}), name))
        final = stacking_ensemble(train, test, oof_pack=oofs, pred_pack=pred_pack, meta_model=Ridge(alpha=1.0))
    else:
        final = simple_mean_ensemble(preds)

    # `final` holds one row per (time, building). Joining on TIME_COL alone
    # pairs every template row with every building sharing that timestamp, so
    # the building key is included whenever the template provides it.
    if BUILDING_COL in submission_template.columns:
        join_keys = [TIME_COL, BUILDING_COL]
    else:
        join_keys = [TIME_COL]
    out = submission_template[join_keys].merge(
        final[join_keys + ["answer"]], on=join_keys, how="left"
    )
    # Keep the original return shape: time column plus the answer.
    out = out[[TIME_COL, "answer"]]

    return out, preds, oofs

# ---- Saving helpers ----
def make_run_dir(tag: str = "v1") -> str:
    """Create ``runs/<YYYYmmdd_HHMM>_<tag>`` under the working directory.

    The timestamp is taken from ``datetime.now()`` at call time. Missing parent
    directories are created and an already existing directory is reused.

    Returns:
        The directory path as a string.
    """
    stamp = datetime.now().strftime("%Y%m%d_%H%M")
    target = pathlib.Path("runs", f"{stamp}_{tag}")
    target.mkdir(parents=True, exist_ok=True)
    return str(target)

def save_artifacts(run_dir: str, final_sub: pd.DataFrame, stage_name: str, preds: List[pd.DataFrame], oofs: List[Tuple[pd.DataFrame, str]], config: dict):
    """Write the submission, per-model predictions, OOF frames and config to ``run_dir``.

    Files written (all inside ``run_dir``):
        * ``submission_<stage_name>.csv`` from ``final_sub``
        * ``preds_<stage_name>_<i>.csv`` for the i-th frame in ``preds``
        * ``oof_<stage_name>_<name>.csv`` for every ``(frame, name)`` in ``oofs``
        * ``config_<stage_name>.json`` with ``config`` (UTF-8, indented)

    Args:
        run_dir: Target directory; created if it does not exist yet.
        final_sub: Submission frame.
        stage_name: Label embedded in every file name.
        preds: Test prediction frames.
        oofs: ``(frame, name)`` pairs of out-of-fold predictions.
        config: Run settings. Values that ``json`` cannot encode natively are
            stored via ``str(value)``.
    """
    # Callers may pass a directory that was never created; do not make saving
    # the results of a long run depend on that.
    os.makedirs(run_dir, exist_ok=True)

    # submission
    f_sub = os.path.join(run_dir, f"submission_{stage_name}.csv")
    final_sub.to_csv(f_sub, index=False)
    print(f"[save] {f_sub}")

    # preds & oofs
    for i, df in enumerate(preds):
        df.to_csv(os.path.join(run_dir, f"preds_{stage_name}_{i}.csv"), index=False)
    for (df, name) in oofs:
        df.to_csv(os.path.join(run_dir, f"oof_{stage_name}_{name}.csv"), index=False)

    # config: encode fully in memory first so an encoding problem cannot leave
    # a half-written JSON file behind; `default=str` covers non-JSON values.
    payload = json.dumps(config, ensure_ascii=False, indent=2, default=str)
    with open(os.path.join(run_dir, f"config_{stage_name}.json"), "w", encoding="utf-8") as f:
        f.write(payload)
    print(f"[save] artifacts saved to {run_dir}")

# ============================================================
# 9) Stage Shortcuts
# ============================================================
def stage_1_xgb_mean(train, test, sub, **kwargs):
    """Stage 1 shortcut: XGBoost only, search disabled, single default seed.

    Everything else (CV settings, parameter spaces, ...) is passed through to
    ``run_stage`` via ``kwargs``; its return value is returned unchanged.
    """
    return run_stage(
        1, train, test, sub,
        algorithms=["xgb"],
        do_search=False,
        seeds=[DEFAULT_RANDOM_STATE],
        **kwargs,
    )

def stage_2_xgb_search_mean(train, test, sub, **kwargs):
    """Stage 2 shortcut: XGBoost only, search enabled, single default seed.

    Remaining options are forwarded to ``run_stage`` through ``kwargs`` and its
    result is returned as-is.
    """
    return run_stage(
        2, train, test, sub,
        algorithms=["xgb"],
        do_search=True,
        seeds=[DEFAULT_RANDOM_STATE],
        **kwargs,
    )

def stage_3_triple_mean(train, test, sub, **kwargs):
    """Stage 3 shortcut: xgb + lgbm + mlp, search disabled, single default seed.

    Remaining options are forwarded to ``run_stage`` through ``kwargs`` and its
    result is returned as-is.
    """
    return run_stage(
        3, train, test, sub,
        algorithms=["xgb", "lgbm", "mlp"],
        do_search=False,
        seeds=[DEFAULT_RANDOM_STATE],
        **kwargs,
    )

def stage_4_triple_search_mean(train, test, sub, **kwargs):
    """Stage 4 shortcut: xgb + lgbm + mlp, search enabled, single default seed.

    Remaining options are forwarded to ``run_stage`` through ``kwargs`` and its
    result is returned as-is.
    """
    return run_stage(
        4, train, test, sub,
        algorithms=["xgb", "lgbm", "mlp"],
        do_search=True,
        seeds=[DEFAULT_RANDOM_STATE],
        **kwargs,
    )

def stage_5_triple_search_seedavg_mean(train, test, sub, seeds: List[int]=DEFAULT_SEEDS, **kwargs):
    """Stage 5 shortcut: xgb + lgbm + mlp, search enabled, caller-chosen seeds.

    ``seeds`` defaults to ``DEFAULT_SEEDS``. Remaining options are forwarded to
    ``run_stage`` through ``kwargs`` and its result is returned as-is.
    """
    return run_stage(
        5, train, test, sub,
        algorithms=["xgb", "lgbm", "mlp"],
        do_search=True,
        seeds=seeds,
        **kwargs,
    )

def stage_6_triple_search_seedavg_stacking(train, test, sub, seeds: List[int]=DEFAULT_SEEDS, **kwargs):
    """Stage 6 shortcut: xgb + lgbm + mlp, search enabled, seeds given, stacking on.

    ``seeds`` defaults to ``DEFAULT_SEEDS``. Remaining options are forwarded to
    ``run_stage`` through ``kwargs`` and its result is returned as-is.
    """
    return run_stage(
        6, train, test, sub,
        algorithms=["xgb", "lgbm", "mlp"],
        do_search=True,
        seeds=seeds,
        stacking=True,
        **kwargs,
    )

# ============================================================
# 10) Example main
# ============================================================
if __name__ == "__main__":
    # Paths
    TRAIN_PATH = "../data/raw/train.csv"
    TEST_PATH = "../data/raw/test.csv"
    BINFO_PATH = "../data/raw/building_info.csv"     # or None
    SUB_PATH = "../data/raw/sample_submission.csv"

    # Load: only the building info and the submission template are taken from
    # load_data here; its first two return values are intentionally discarded.
    _, _, binfo, sub = load_data(TRAIN_PATH, TEST_PATH, BINFO_PATH, SUB_PATH)

    # `train` / `test` are not created in this block: they must already exist
    # in the session. NOTE(review): presumably the feature-engineered frames
    # built by earlier cells — confirm. Checked explicitly so a fresh kernel
    # gets an actionable error instead of a bare NameError from `train = train`.
    _missing = [_name for _name in ("train", "test") if _name not in globals()]
    if _missing:
        raise NameError(
            f"{', '.join(_missing)} not defined: run the cells that build the "
            "train/test frames before this block"
        )

    # CV / search config
    PARAM_SPACES = default_search_spaces()
    cv_type = "kfold"   # or 'timesplit'
    n_splits = 5
    gap = 0
    inner_cv_type = cv_type
    inner_splits = 3
    search_metric = DEFAULT_SEARCH_METRIC  # 'weighted_mse'|'smape'|'mae'|'mse'
    force_backend = None

    # Choose the stage here (1-6). Run directory, artifact names and the saved
    # config are all derived from this single value, so they cannot drift from
    # the stage that actually ran. Stages 5 and 6 use DEFAULT_SEEDS by default.
    STAGE = 2
    STAGE_RUNNERS = {
        1: stage_1_xgb_mean,
        2: stage_2_xgb_search_mean,
        3: stage_3_triple_mean,
        4: stage_4_triple_search_mean,
        5: stage_5_triple_search_seedavg_mean,
        6: stage_6_triple_search_seedavg_stacking,
    }

    final_sub, preds, oofs = STAGE_RUNNERS[STAGE](
        train, test, sub,
        cv_type=cv_type, n_splits=n_splits, param_spaces=PARAM_SPACES,
        gap=gap, inner_cv_type=inner_cv_type, inner_splits=inner_splits,
        search_metric=search_metric, force_backend=force_backend)

    # Save under runs/{date_tag}/
    stage_tag = f"stage{STAGE}"
    run_dir = make_run_dir(tag=stage_tag)
    cfg = dict(
        stage=str(STAGE),
        cv_type=cv_type, n_splits=n_splits, gap=gap,
        inner_cv_type=inner_cv_type, inner_splits=inner_splits,
        search_metric=search_metric,
        # Same resolution rule run_stage applies, including its "random"
        # fallback, so recording the backend cannot raise after training.
        backend=force_backend or _backend_for_stage(STAGE),
        budgets=BUDGETS,
    )
    save_artifacts(run_dir, final_sub, stage_tag, preds, oofs, cfg)