In [None]:
!nvidia-smi

Wed Aug 20 08:49:19 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   67C    P8             11W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

### 전처리

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import sklearn
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
import random as rn
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
rn.seed(RANDOM_SEED)
from datetime import datetime
import warnings
import lightgbm as lgb
import pickle
import os
import json

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

os.makedirs('models', exist_ok=True)

In [None]:
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/electric/train.csv", parse_dates=['일시'])
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/electric/test.csv', parse_dates=['일시'])
building_info = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/electric/building_info.csv')

In [None]:
train = train.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temp',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'wind',
    '습도(%)': 'hum',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})

test = test.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temp',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'wind',
    '습도(%)': 'hum',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})


building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})

translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '학교': 'University',
    '백화점': 'Department Store',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    'IDC(전화국)': 'IDC',
    '호텔': 'Hotel'
}

building_info['building'] = building_info['building'].replace(translation_dict)

building_info['solar_power_utility'] = np.where(building_info.solar_power_capacity !='-',1,0)
building_info['ess_utility'] = np.where(building_info.ess_capacity !='-',1,0)

train = pd.merge(train, building_info, on='building_number', how='left')
test = pd.merge(test, building_info, on='building_number', how='left')

In [None]:
# 로그 변환: 전력 사용량에 log1p 적용 (모델 입력용 별도 컬럼 유지)
train['power_consumption_log'] = np.log1p(train['power_consumption'])

# 2. 날짜/시간 관련 기본 특성 생성
train['date'] = train['date_time'].dt.date         # 날짜 (연-월-일)
test['date'] = test['date_time'].dt.date

train['dow'] = train['date_time'].dt.weekday       # 요일 (0=월, ..., 6=일)
test['dow'] = test['date_time'].dt.weekday

train['day'] = train['date_time'].dt.day           # 일 추가
test['day'] = test['date_time'].dt.day

train['month'] = train['date_time'].dt.month       # 월
test['month'] = test['date_time'].dt.month

# 주차 (연 기준 주 번호)
train['week'] = train['date_time'].dt.isocalendar().week.astype(int)
test['week'] = test['date_time'].dt.isocalendar().week.astype(int)

# 월 내 몇째주(n_week) 특성 (휴일 패턴 인식용)
train['n_week'] = train['date_time'].dt.day.map(lambda x: (x-1)//7 + 1)
test['n_week'] = test['date_time'].dt.day.map(lambda x: (x-1)//7 + 1)

# 시간 관련 주기 특성: 시간 (hour), 일간 주기 사이클
train['hour'] = train['date_time'].dt.hour
test['hour'] = test['date_time'].dt.hour

# 주기성 인코딩 - 첫 번째 베이스라인 방식 적용
# 시간 (24시간 주기, 수정된 버전)
train['sin_hour'] = np.sin(2 * np.pi * train['hour'] / 24.0)
train['cos_hour'] = np.cos(2 * np.pi * train['hour'] / 24.0)
test['sin_hour'] = np.sin(2 * np.pi * test['hour'] / 24.0)
test['cos_hour'] = np.cos(2 * np.pi * test['hour'] / 24.0)

# 월 주기성 (12개월 주기)
train['sin_month'] = np.sin(2 * np.pi * train['month'] / 12.0)
train['cos_month'] = np.cos(2 * np.pi * train['month'] / 12.0)
test['sin_month'] = np.sin(2 * np.pi * test['month'] / 12.0)
test['cos_month'] = np.cos(2 * np.pi * test['month'] / 12.0)

# 요일 주기성 (7일 주기)
train['sin_dayofweek'] = np.sin(2 * np.pi * train['dow'] / 7.0)
train['cos_dayofweek'] = np.cos(2 * np.pi * train['dow'] / 7.0)
test['sin_dayofweek'] = np.sin(2 * np.pi * test['dow'] / 7.0)
test['cos_dayofweek'] = np.cos(2 * np.pi * test['dow'] / 7.0)

# 날짜 복합 주기성
train['sin_date'] = np.sin(2 * np.pi * (train['month'] + train['day'] / 31) / 12)
train['cos_date'] = np.cos(2 * np.pi * (train['month'] + train['day'] / 31) / 12)
test['sin_date'] = np.sin(2 * np.pi * (test['month'] + test['day'] / 31) / 12)
test['cos_date'] = np.cos(2 * np.pi * (test['month'] + test['day'] / 31) / 12)

# 연간 주기 특성
train['day_of_year'] = train['date_time'].dt.dayofyear.astype(float)
test['day_of_year'] = test['date_time'].dt.dayofyear.astype(float)
train['summer_sin'] = np.sin(2 * np.pi * train['day_of_year'] / 365.0)
train['summer_cos'] = np.cos(2 * np.pi * train['day_of_year'] / 365.0)
test['summer_sin'] = np.sin(2 * np.pi * test['day_of_year'] / 365.0)
test['summer_cos'] = np.cos(2 * np.pi * test['day_of_year'] / 365.0)

In [None]:
train['date'] = pd.to_datetime(train['date'])
test['date'] = pd.to_datetime(test['date'])

# 여름 기간 진행률 사이클릭 특성
year = train['date'].dt.year.iloc[0]
summer_start = pd.to_datetime(f"{year}-06-01")
summer_end = pd.to_datetime(f"{year}-08-31")
train['day_of_summer'] = (train['date'] - summer_start).dt.days.clip(lower=0)
test['day_of_summer'] = (test['date'] - summer_start).dt.days.clip(lower=0)
total_summer_days = (summer_end - summer_start).days + 1
train['summer_progress_sin'] = np.sin(2 * np.pi * train['day_of_summer'] / total_summer_days)
train['summer_progress_cos'] = np.cos(2 * np.pi * train['day_of_summer'] / total_summer_days)
test['summer_progress_sin'] = np.sin(2 * np.pi * test['day_of_summer'] / total_summer_days)
test['summer_progress_cos'] = np.cos(2 * np.pi * test['day_of_summer'] / total_summer_days)

# 전일(Previous-day) 일별 온도 통계 ===
def attach_prevday_temp_features(train: pd.DataFrame, test: pd.DataFrame):
    # 1) 일별 요약
    tmp = pd.concat(
        [train[['building_number','date','temp']],
         test[['building_number','date','temp']]],
        axis=0, ignore_index=True
    )
    daily = (tmp.groupby(['building_number','date'])['temp']
               .agg(['max','mean','min'])
               .rename(columns={'max':'max_temp','mean':'mean_temp','min':'min_temp'})
             )

    # 2) 건물별 날짜순 전일 시프트
    daily = daily.sort_index()
    prev = daily.groupby(level=0)[['max_temp','mean_temp','min_temp']].shift(1)
    prev = prev.rename(columns={
        'max_temp':'prevday_max_temperature',
        'mean_temp':'prevday_mean_temperature',
        'min_temp':'prevday_min_temperature'
    })

    # 3) 전일 일교차
    prev['prevday_temperature_range'] = (
        prev['prevday_max_temperature'] - prev['prevday_min_temperature']
    )

    # 4) merge
    prev = prev.reset_index()
    train_out = train.merge(prev, on=['building_number','date'], how='left')
    test_out  = test.merge(prev,  on=['building_number','date'], how='left')
    return train_out, test_out

train, test = attach_prevday_temp_features(train, test)

In [None]:
# 냉방면적 비율
train['cooling_ratio'] = train['cooling_area'] / train['total_area']
train['cooling_ratio'] = train['cooling_ratio'].fillna(0)
test['cooling_ratio'] = test['cooling_area'] / test['total_area']
test['cooling_ratio'] = test['cooling_ratio'].fillna(0)

In [None]:
# 기상 지표 특성 생성 (THI: 온습도지수, WC: 체감온도, CDH: 불쾌지수)
def calc_THI(temp, hum):
    # 온습도지수 (Temperature-Humidity Index)
    return (1.8 * temp + 32) - (0.55 - 0.0055 * hum) * (1.8 * temp - 26.8)

def calc_discomfort_index(temp, hum):
    # 불쾌지수 (Discomfort Index)
    return 0.81 * temp + 0.01 * hum * (0.99 * temp - 14.3) + 46.3

def calc_heat_index(temp, hum):
    """열지수 (Heat Index) - 여름철 체감온도"""
    # 화씨 변환
    temp_f = temp * 9/5 + 32
    hi = -42.379 + 2.04901523 * temp_f + 10.14333127 * hum
    hi += -0.22475541 * temp_f * hum - 6.83783e-3 * temp_f**2
    hi += -5.481717e-2 * hum**2 + 1.22874e-3 * temp_f**2 * hum
    hi += 8.5282e-4 * temp_f * hum**2 - 1.99e-6 * temp_f**2 * hum**2
    # 섭씨 변환
    return (hi - 32) * 5/9

train['THI'] = calc_THI(train['temp'], train['hum'])
train['CDH'] = calc_discomfort_index(train['temp'], train['hum'])
train['HI'] = calc_heat_index(train['temp'], train['hum'])
test['THI'] = calc_THI(test['temp'], test['hum'])
test['CDH'] = calc_discomfort_index(test['temp'], test['hum'])
test['HI'] = calc_heat_index(test['temp'], test['hum'])


In [None]:
# 휴일 처리
official_holidays = ['2024-06-06', '2024-08-15']  # 공식 공휴일
train['holiday'] = (
    (train['dow'] >= 5) |
    (train['date'].dt.strftime('%Y-%m-%d').isin(official_holidays))
).astype(int)
test['holiday'] = (
    (test['dow'] >= 5) |
    (test['date'].dt.strftime('%Y-%m-%d').isin(official_holidays))
).astype(int)

In [None]:
# 각 폴드별 통계 피처 생성 (모든 데이터에 대해서 통계 피처를 생성해버리면 누수 발생)

def _build_time_stats(df_sub):
    """df_sub: '해당 폴드의 train 구간'만 들어온 부분 데이터프레임"""
    # building_number × hour × dow
    power_mean = (pd.pivot_table(df_sub, values='power_consumption',
                                 index=['building_number','hour','dow'], aggfunc=np.mean)
                    .reset_index()
                    .rename(columns={'power_consumption':'day_hour_mean'}))
    power_std  = (pd.pivot_table(df_sub, values='power_consumption',
                                 index=['building_number','hour','dow'], aggfunc=np.std)
                    .reset_index()
                    .rename(columns={'power_consumption':'day_hour_std'}))
    # building_number × hour
    power_hour_mean = (pd.pivot_table(df_sub, values='power_consumption',
                                      index=['building_number','hour'], aggfunc=np.mean)
                         .reset_index()
                         .rename(columns={'power_consumption':'hour_mean'}))
    power_hour_std  = (pd.pivot_table(df_sub, values='power_consumption',
                                      index=['building_number','hour'], aggfunc=np.std)
                         .reset_index()
                         .rename(columns={'power_consumption':'hour_std'}))
    return power_mean, power_std, power_hour_mean, power_hour_std


def attach_time_stats(df_target, stats):
    """df_target: 머지 대상(폴드 train 또는 valid 또는 test)
       stats: _build_time_stats() 결과 튜플
    """
    power_mean, power_std, power_hour_mean, power_hour_std = stats
    out = (df_target
           .merge(power_mean, on=['building_number','hour','dow'], how='left')
           .merge(power_std,  on=['building_number','hour','dow'], how='left')
           .merge(power_hour_mean, on=['building_number','hour'], how='left')
           .merge(power_hour_std,  on=['building_number','hour'], how='left'))

    for c in ('day_hour_std','hour_std'):
        if c in out.columns:
            out[c] = out[c].fillna(0.0)
    return out


In [None]:
train

Unnamed: 0,num_date_time,building_number,date_time,temp,rainfall,wind,hum,sunshine,solar_radiation,power_consumption,building,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,solar_power_utility,ess_utility,power_consumption_log,date,dow,day,month,week,n_week,hour,sin_hour,cos_hour,sin_month,cos_month,sin_dayofweek,cos_dayofweek,sin_date,cos_date,day_of_year,summer_sin,summer_cos,day_of_summer,summer_progress_sin,summer_progress_cos,prevday_max_temperature,prevday_mean_temperature,prevday_min_temperature,prevday_temperature_range,cooling_ratio,THI,CDH,HI,holiday
0,1_20240601 00,1,2024-06-01 00:00:00,18.3,0.0,2.6,82.0,0.0,0.00,5794.80,Hotel,82912.71,77586.0,-,-,-,0,0,8.664889,2024-06-01,5,1,6,22,1,0,0.000000,1.000000,1.224647e-16,-1.0,-0.974928,-0.222521,-0.016889,-0.999857,153.0,0.486273,-0.873807,0,0.000000,1.000000,,,,,0.935755,64.33214,64.25294,20.578330,1
1,1_20240601 01,1,2024-06-01 01:00:00,18.3,0.0,2.7,82.0,0.0,0.00,5591.85,Hotel,82912.71,77586.0,-,-,-,0,0,8.629244,2024-06-01,5,1,6,22,1,1,0.258819,0.965926,1.224647e-16,-1.0,-0.974928,-0.222521,-0.016889,-0.999857,153.0,0.486273,-0.873807,0,0.000000,1.000000,,,,,0.935755,64.33214,64.25294,20.578330,1
2,1_20240601 02,1,2024-06-01 02:00:00,18.1,0.0,2.6,80.0,0.0,0.00,5338.17,Hotel,82912.71,77586.0,-,-,-,0,0,8.582825,2024-06-01,5,1,6,22,1,2,0.500000,0.866025,1.224647e-16,-1.0,-0.974928,-0.222521,-0.016889,-0.999857,153.0,0.486273,-0.873807,0,0.000000,1.000000,,,,,0.935755,63.94420,63.85620,21.231987,1
3,1_20240601 03,1,2024-06-01 03:00:00,18.0,0.0,2.6,81.0,0.0,0.00,4554.42,Hotel,82912.71,77586.0,-,-,-,0,0,8.424073,2024-06-01,5,1,6,22,1,3,0.707107,0.707107,1.224647e-16,-1.0,-0.974928,-0.222521,-0.016889,-0.999857,153.0,0.486273,-0.873807,0,0.000000,1.000000,,,,,0.935755,63.81480,63.73120,20.960120,1
4,1_20240601 04,1,2024-06-01 04:00:00,17.8,0.0,1.3,81.0,0.0,0.00,3602.25,Hotel,82912.71,77586.0,-,-,-,0,0,8.189591,2024-06-01,5,1,6,22,1,4,0.866025,0.500000,1.224647e-16,-1.0,-0.974928,-0.222521,-0.016889,-0.999857,153.0,0.486273,-0.873807,0,0.000000,1.000000,,,,,0.935755,63.49242,63.40882,21.026653,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100_20240824 19,100,2024-08-24 19:00:00,29.1,0.0,4.4,76.0,0.4,0.18,3276.00,Hotel,162070.24,152943.0,-,-,-,0,0,8.094684,2024-08-24,5,24,8,34,4,19,-0.965926,0.258819,-8.660254e-01,-0.5,-0.974928,-0.222521,-0.993019,-0.117957,237.0,-0.806480,-0.591261,84,-0.519584,0.854419,32.1,29.266667,26.9,5.2,0.943683,81.00344,80.89784,34.151872,1
203996,100_20240824 20,100,2024-08-24 20:00:00,28.6,0.0,3.7,74.0,0.0,0.00,3197.52,Hotel,162070.24,152943.0,-,-,-,0,0,8.070443,2024-08-24,5,24,8,34,4,20,-0.866025,0.500000,-8.660254e-01,-0.5,-0.974928,-0.222521,-0.993019,-0.117957,237.0,-0.806480,-0.591261,84,-0.519584,0.854419,32.1,29.266667,26.9,5.2,0.943683,79.95076,79.83636,32.550503,1
203997,100_20240824 21,100,2024-08-24 21:00:00,28.3,0.0,2.9,74.0,0.0,0.00,3006.60,Hotel,162070.24,152943.0,-,-,-,0,0,8.008898,2024-08-24,5,24,8,34,4,21,-0.707107,0.707107,-8.660254e-01,-0.5,-0.974928,-0.222521,-0.993019,-0.117957,237.0,-0.806480,-0.591261,84,-0.519584,0.854419,32.1,29.266667,26.9,5.2,0.943683,79.48798,79.37358,31.870187,1
203998,100_20240824 22,100,2024-08-24 22:00:00,28.0,0.0,1.7,76.0,0.0,0.00,2649.72,Hotel,162070.24,152943.0,-,-,-,0,0,7.882587,2024-08-24,5,24,8,34,4,22,-0.500000,0.866025,-8.660254e-01,-0.5,-0.974928,-0.222521,-0.993019,-0.117957,237.0,-0.806480,-0.591261,84,-0.519584,0.854419,32.1,29.266667,26.9,5.2,0.943683,79.28480,79.17920,31.500110,1


### 건물별 전력 시계열 확인

In [None]:
def plot_buildings_by_type_with_fold_mark(train, tscv):
    # 건물 유형 리스트 만들기
    building_types = train['building'].unique()
    # 각 건물번호에 해당하는 유형 딕셔너리 생성
    building_type_map = train[['building_number', 'building']].drop_duplicates().set_index('building_number')['building'].to_dict()
    # 건물 번호를 건물 유형별로 정렬
    sorted_building_nums = []
    for btype in building_types:
        nums = train[train['building'] == btype]['building_number'].unique()
        sorted_building_nums.extend(nums)

    n_buildings = len(sorted_building_nums)
    plt.figure(figsize=(15, 4 * n_buildings))

    # 각 건물마다 시각화
    for i, bld_num in enumerate(sorted_building_nums, 1):
        plt.subplot(n_buildings, 1, i)
        bld_data = train[train['building_number'] == bld_num].sort_values('date_time')
        dates = bld_data['date_time']
        y = bld_data['power_consumption']
        btype = building_type_map[bld_num]

        plt.plot(dates, y, label=f'Building {bld_num}', color='blue')
        plt.title(f'Building {bld_num} ({btype})', fontsize=11, loc='left')
        plt.xlabel('Date Time')
        plt.ylabel('Power Consumption')
        plt.xticks(rotation=45)
        plt.grid(alpha=0.3)

        # TSCV fold마다 val 영역 마킹
        X = bld_data.reset_index(drop=True)
        n_samples = len(X)
        split_indices = []
        for fold, (train_idx, val_idx) in enumerate(tscv.split(X)):
            # val 시작~끝 horizontal line 마킹
            val_start = X['date_time'].iloc[val_idx[0]]
            val_end = X['date_time'].iloc[val_idx[-1]]
            plt.axvspan(val_start, val_end, color='red', alpha=0.10)
            # fold 번호 텍스트로 표시
            plt.text(val_start, max(y)*0.95, f'val {fold+1}', color='red', fontsize=8)

    plt.suptitle('Power Consumption Time Series: 각 건물 유형별, fold 검증 마킹 포함', fontsize=17, y=1.01)
    plt.subplots_adjust(top=0.97, hspace=0.7)
    plt.show()

tscv = TimeSeriesSplit(n_splits=5, test_size=24*7, gap=24)
plot_buildings_by_type_with_fold_mark(train, tscv)

Output hidden; open in https://colab.research.google.com to view.

### 건물개별 학습

In [None]:
feature_cols = [
    # 기본 식별 정보
    'building_number',

    # 기상 정보 (공통)
    'temp', 'rainfall', 'wind', 'hum',

    # 시간 특성
    'dow', 'month', 'day', 'hour', 'week', 'n_week',

    # 주기성, 사이클릭 인코딩
    'sin_hour', 'cos_hour',
    'sin_month', 'cos_month',
    'sin_dayofweek', 'cos_dayofweek',
    'sin_date', 'cos_date',
    'summer_sin', 'summer_cos',
    'summer_progress_sin', 'summer_progress_cos',

    # 일별/건물별 온도 통계
    'prevday_max_temperature', 'prevday_min_temperature', 'prevday_mean_temperature', 'prevday_temperature_range',

    # 기상지수
    'THI', 'CDH', 'HI',

    # 휴일 특성 (간단한 주말/공휴일 플래그)
    'holiday',

    # 건물 속성 (유형: 숫자코드로 변환, 면적 등)
    'total_area', 'cooling_area', 'cooling_ratio',
    'solar_power_utility', 'ess_utility',

    # 통계 피처 (이름을 전처리에서 생성된 대로 일치)
    'day_hour_mean', 'day_hour_std',
    'hour_mean', 'hour_std'
]

In [None]:
def smape(y_true, y_pred):
    """개선된 sMAPE 계산 함수"""
    y_true = np.maximum(y_true, 0)
    y_pred = np.maximum(y_pred, 0)

    denominator = np.abs(y_true) + np.abs(y_pred)
    mask = denominator > 1e-8

    if np.sum(mask) == 0:
        return 0.0

    numerator = 2 * np.abs(y_pred - y_true)[mask]
    return 100 * np.mean(numerator / denominator[mask])


def smape_eval(y_pred, y_true):
    """XGBoost용 sMAPE 평가 함수 (log1p 변환 대응)"""
    y_true_val = y_true.get_label()

    # log1p 변환의 역함수는 expm1 (exp(x) - 1)
    y_true_orig = np.expm1(y_true_val)
    y_pred_orig = np.expm1(y_pred)

    # 전력 소비량 음수 방지
    y_true_orig = np.maximum(y_true_orig, 0)
    y_pred_orig = np.maximum(y_pred_orig, 0)

    smape_score = smape(y_true_orig, y_pred_orig)
    return 'smape', smape_score


def weighted_mse(alpha=3.0):
    """가중 MSE 목적함수 (과소예측 페널티)"""
    def objective(y_pred, y_true):
        y_true_val = y_true.get_label()
        residual = y_true_val - y_pred

        # 과소예측(residual > 0)에 더 큰 페널티
        weights = np.where(residual > 0, alpha, 1.0)

        # 수치적 안정성을 위한 가중치 클리핑
        weights = np.clip(weights, 0.1, 10.0)

        grad = -2 * weights * residual
        hess = 2 * weights

        return grad, hess
    return objective


In [None]:
PROJECT_PATH = '/content/drive/MyDrive/power_prediction_project'
MODEL_PATH_INDIVIDUAL = f'{PROJECT_PATH}/models/individual_buildings'

os.makedirs(PROJECT_PATH, exist_ok=True)
os.makedirs(MODEL_PATH_INDIVIDUAL, exist_ok=True)


print(f"✅ 프로젝트 경로: {PROJECT_PATH}")
print(f"✅ 건물유형별 모델 경로: {MODEL_PATH_INDIVIDUAL}")

# 작업 디렉토리 변경
%cd {PROJECT_PATH}

✅ 프로젝트 경로: /content/drive/MyDrive/power_prediction_project
✅ 건물유형별 모델 경로: /content/drive/MyDrive/power_prediction_project/models/individual_buildings
/content/drive/MyDrive/power_prediction_project


In [None]:
#========================================
# Utility for model check & load
#========================================
def check_building_completion(building_num, n_folds=5):
    """해당 건물의 모든 fold 모델이 존재하는지 확인"""
    safe_building_name = str(building_num).replace(' ', '_').replace('/', '_')
    for fold in range(n_folds):
        json_path = f"{MODEL_PATH_INDIVIDUAL}/xgb_building_{safe_building_name}_fold{fold}.json"
        pkl_path = f"{MODEL_PATH_INDIVIDUAL}/xgb_building_{safe_building_name}_fold{fold}.pkl"
        if not (os.path.exists(json_path) or os.path.exists(pkl_path)):
            return False, fold  # 해당 fold부터 재시작
    return True, -1


def load_existing_models(building_num, n_folds=5):
    """저장된 모델들 로드"""
    safe_building_name = str(building_num).replace(' ', '_').replace('/', '_')
    models = []
    for fold in range(n_folds):
        json_path = f"{MODEL_PATH_INDIVIDUAL}/xgb_building_{safe_building_name}_fold{fold}.json"
        pkl_path = f"{MODEL_PATH_INDIVIDUAL}/xgb_building_{safe_building_name}_fold{fold}.pkl"
        if os.path.exists(json_path):
            model = xgb.Booster()
            model.load_model(json_path)
            models.append(model)
        elif os.path.exists(pkl_path):
            with open(pkl_path, 'rb') as f:
                model = pickle.load(f)
            models.append(model)
        else:
            raise FileNotFoundError(f"모델 파일이 없습니다: {json_path}, {pkl_path}")
    return models


def regenerate_oof_and_cv_results(building_data, models, feature_cols, tscv):
    """저장된 모델로 OOF 및 CV 결과 재계산"""
    n_samples = len(building_data)
    oof_preds = np.zeros(n_samples)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(tscv.split(building_data)):
        tr_df = building_data.iloc[train_idx].copy()
        va_df = building_data.iloc[val_idx].copy()
        stats = _build_time_stats(tr_df)
        va_df = attach_time_stats(va_df, stats)
        X_val_cv = va_df[feature_cols].copy().drop(columns=['building_number'], errors='ignore')
        dvalid = xgb.DMatrix(X_val_cv, feature_names=X_val_cv.columns.tolist())

        # OOF 예측
        val_pred_log = models[fold].predict(dvalid)
        val_pred = np.expm1(val_pred_log)
        oof_preds[val_idx] = val_pred

        # SMAPE 계산 및 CV 결과 저장
        y_val = building_data.iloc[val_idx]['power_consumption'].values
        smape_score = smape(y_val, val_pred)
        rmse_score = np.sqrt(np.mean((val_pred - y_val) ** 2))

        fold_results.append({
            'fold': fold + 1,
            'smape': smape_score,
            'rmse': rmse_score,
            'val_size': len(val_idx),
            'building_type': building_data['building'].iloc[0]
        })

    return oof_preds, fold_results


#========================================
# 최근성 가중 smape 함수 그대로 유지
def recent_weighted_score(fold_smape_list, decay=0.5):
    K = len(fold_smape_list)
    weights = [decay ** (K - 1 - i) for i in range(K)]
    weights = np.array(weights) / sum(weights)
    weighted_mean = np.sum(weights * np.array(fold_smape_list))
    weighted_std = np.sqrt(np.sum(weights * (np.array(fold_smape_list) - weighted_mean) ** 2))
    lambda_penalty = 1.0
    score = weighted_mean + lambda_penalty * weighted_std
    return score, weighted_mean, weighted_std, weights


In [None]:
#========================================
# 하이퍼파라미터 및 설정
individual_building_params = {
    "max_depth": 10,
    "learning_rate": 0.005,
    "subsample": 0.8,
    "colsample_bytree": 0.6,
    "min_child_weight": 4,
    "gamma": 0.05,
    "lambda": 1.2,
    "alpha": 0.4,
    "tree_method": "hist",
    "device": "cuda",
    "max_bin": 256,
    "verbosity": 0,
    "random_state": RANDOM_SEED,
    "eval_metric": "mae",
}

N_BOOST_ROUND = 8000
ES_ROUNDS = 600

print(f"\n건물별 개별 모델 학습/복원 시작... (총 {len(train['building_number'].unique())}개 건물)")

tscv = TimeSeriesSplit(n_splits=5, test_size=24*7, gap=24)
individual_models = {}
individual_cv_results = {}
oof_preds_per_building = {}

for bld_idx, building_num in enumerate(train['building_number'].unique()):
    building_data = train[train['building_number'] == building_num].copy()
    building_type = building_data['building'].iloc[0]
    building_data = building_data.sort_values('date_time').reset_index(drop=True)
    n_samples = len(building_data)

    if n_samples < 500:
        print(f"  데이터가 부족합니다 ({n_samples}행). 건물 개별 학습 스킵.")
        continue

    is_complete, resume_fold = check_building_completion(building_num)
    fold_results = []
    safe_building_name = str(building_num).replace(' ', '_').replace('/', '_')

    if is_complete:
        print(f"  Building {building_num} ({building_type}): 모델 모두 존재, OOF 및 CV 결과 재계산")
        models = load_existing_models(building_num)
        individual_models[building_num] = models
        # ✅ 수정: OOF와 CV 결과를 함께 계산
        oof_preds, fold_results = regenerate_oof_and_cv_results(building_data, models, feature_cols, tscv)
        oof_preds_per_building[building_num] = oof_preds
        individual_cv_results[building_num] = fold_results  # ✅ 추가: CV 결과 저장

        # CV 결과 요약 출력
        smape_values = [r['smape'] for r in fold_results]
        print(f"    → 복원된 CV SMAPE: {' → '.join([f'{s:.3f}' for s in smape_values])} (평균: {np.mean(smape_values):.3f})")
        continue

    print(f"  Building {building_num} ({building_type}): fold {resume_fold}부터 학습 재개")
    models = []
    oof_preds = np.zeros(n_samples)  # 전체 fold oof

    for fold, (train_idx, val_idx) in enumerate(tscv.split(building_data)):
        if fold < resume_fold:
            # 이미 저장된 모델 불러오기
            model = None
            json_path = f"{MODEL_PATH_INDIVIDUAL}/xgb_building_{safe_building_name}_fold{fold}.json"
            pkl_path = f"{MODEL_PATH_INDIVIDUAL}/xgb_building_{safe_building_name}_fold{fold}.pkl"
            if os.path.exists(json_path):
                model = xgb.Booster()
                model.load_model(json_path)
            elif os.path.exists(pkl_path):
                with open(pkl_path, 'rb') as f:
                    model = pickle.load(f)
            if model is not None:
                models.append(model)
                print(f"    Fold {fold+1}: 이미 저장된 모델 사용")
            else:
                raise FileNotFoundError(f"Fold {fold} 모델 파일이 없습니다.")
            continue

        print(f"    Fold {fold+1}/{tscv.n_splits} 학습 중... ", end="")
        tr_df = building_data.iloc[train_idx].copy()
        va_df = building_data.iloc[val_idx].copy()
        stats = _build_time_stats(tr_df)
        tr_df = attach_time_stats(tr_df, stats)
        va_df = attach_time_stats(va_df, stats)
        X_train_cv = tr_df[feature_cols].copy().drop(columns=['building_number'], errors='ignore')
        X_val_cv = va_df[feature_cols].copy().drop(columns=['building_number'], errors='ignore')
        y_train_cv = tr_df['power_consumption_log']
        y_val_cv = va_df['power_consumption_log']

        dtrain = xgb.DMatrix(X_train_cv, label=y_train_cv, feature_names=X_train_cv.columns.tolist())
        dvalid = xgb.DMatrix(X_val_cv, label=y_val_cv, feature_names=X_train_cv.columns.tolist())
        evals_result = {}
        model = xgb.train(
            params=individual_building_params,
            dtrain=dtrain,
            num_boost_round=N_BOOST_ROUND,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            obj=weighted_mse(alpha=3.0),
            custom_metric=smape_eval,
            early_stopping_rounds=ES_ROUNDS,
            verbose_eval=False,
            evals_result=evals_result
        )
        models.append(model)
        val_pred_log = model.predict(dvalid, iteration_range=(0, model.best_iteration+1))
        val_pred = np.expm1(val_pred_log)
        oof_preds[val_idx] = val_pred
        y_val = np.expm1(y_val_cv)
        smape_score = smape(y_val, val_pred)
        rmse_score = np.sqrt(np.mean((val_pred - y_val) ** 2))
        fold_results.append({
            'fold': fold + 1,
            'smape': smape_score,
            'rmse': rmse_score,
            'best_iteration': model.best_iteration,
            'train_size': len(train_idx),
            'val_size': len(val_idx),
            'train_mae_final': evals_result['train']['mae'][model.best_iteration],
            'valid_mae_final': evals_result['valid']['mae'][model.best_iteration],
            'train_smape_final': evals_result['train']['smape'][model.best_iteration],
            'valid_smape_final': evals_result['valid']['smape'][model.best_iteration],
            'train_val_gap': evals_result['valid']['mae'][model.best_iteration] - evals_result['train']['mae'][model.best_iteration],
            'building_type': building_type
        })
        print(f"SMAPE: {smape_score:.4f}, best_iter: {model.best_iteration}, "
              f"train_mae: {evals_result['train']['mae'][model.best_iteration]:.4f}, "
              f"valid_mae: {evals_result['valid']['mae'][model.best_iteration]:.4f}")
        # 모델 저장
        model_file = f"{MODEL_PATH_INDIVIDUAL}/xgb_building_{safe_building_name}_fold{fold}.json"
        try:
            model.save_model(model_file)
        except Exception as e:
            print(f"\n    JSON 저장 실패 (Fold {fold}), Pickle로 저장: {e}")
            pickle_file = f"{MODEL_PATH_INDIVIDUAL}/xgb_building_{safe_building_name}_fold{fold}.pkl"
            with open(pickle_file, 'wb') as f:
                pickle.dump(model, f)

    # fold loop 종료 후 결과 저장
    individual_models[building_num] = models
    oof_preds_per_building[building_num] = oof_preds
    individual_cv_results[building_num] = fold_results

print(f"\n=== 건물별 개별 모델 학습/복원 완료 ===")

#========================================
# 디버깅용 완료 상태 확인 및 요약
#========================================
print(f"\n=== 개별 모델 학습/복원 완료 상태 확인 ===")
print(f"총 학습 대상 건물 수: {len(train['building_number'].unique())}")
print(f"실제 학습 완료 건물 수: {len(individual_models)}")
print(f"OOF 예측값 저장된 건물 수: {len(oof_preds_per_building)}")
print(f"CV 결과 저장된 건물 수: {len(individual_cv_results)}")

# 각 딕셔너리의 키가 일치하는지 확인
models_keys = set(individual_models.keys())
oof_keys = set(oof_preds_per_building.keys())
cv_keys = set(individual_cv_results.keys())

print(f"\n=== 데이터 일관성 확인 ===")
if models_keys == oof_keys == cv_keys:
    print("✅ 모든 딕셔너리의 건물 키가 일치합니다")
else:
    print("❌ 딕셔너리 간 건물 키 불일치 발견:")
    print(f"  models만 있는 키: {models_keys - oof_keys - cv_keys}")
    print(f"  oof만 있는 키: {oof_keys - models_keys - cv_keys}")
    print(f"  cv만 있는 키: {cv_keys - models_keys - oof_keys}")

# CV 결과 샘플 확인
print(f"\n=== CV 결과 샘플 확인 ===")
sample_buildings = list(individual_cv_results.keys())[:3]
for building_num in sample_buildings:
    results = individual_cv_results[building_num]
    if results:
        smape_values = [r['smape'] for r in results]
        print(f"건물 {building_num}: {len(results)}개 fold, SMAPE = {smape_values}")
    else:
        print(f"건물 {building_num}: CV 결과 없음")

# 전체 성능 요약
print(f"\n=== 전체 개별 모델 성능 요약 ===")
all_smapes = []
building_type_counts = {}

for building_num, results in individual_cv_results.items():
    if results:
        building_type = results[0]['building_type']
        smape_values = [r['smape'] for r in results]
        avg_smape = np.mean(smape_values)
        all_smapes.append(avg_smape)

        if building_type not in building_type_counts:
            building_type_counts[building_type] = []
        building_type_counts[building_type].append(avg_smape)

if all_smapes:
    print(f"전체 개별 모델 SMAPE: 평균 {np.mean(all_smapes):.3f} ± {np.std(all_smapes):.3f}")
    print(f"                    범위: {np.min(all_smapes):.3f} ~ {np.max(all_smapes):.3f}")

    print(f"\n=== 건물 유형별 개별 모델 성능 ===")
    for building_type, smapes in building_type_counts.items():
        print(f"{building_type:>15}: {len(smapes):>2}개 건물, 평균 SMAPE {np.mean(smapes):.3f} ± {np.std(smapes):.3f}")

print(f"\n=== OOF 데이터 크기 확인 ===")
oof_sizes = {building_num: len(oof_pred) for building_num, oof_pred in oof_preds_per_building.items()}
unique_sizes = set(oof_sizes.values())
print(f"OOF 예측값 크기 분포: {dict(zip(*np.unique(list(oof_sizes.values()), return_counts=True)[::-1]))}")
if len(unique_sizes) == 1:
    print(f"✅ 모든 건물의 OOF 크기가 동일합니다: {list(unique_sizes)[0]}개")
else:
    print(f"❌ 건물별 OOF 크기가 다릅니다: {unique_sizes}")

print(f"\n=== 개별 모델 준비 완료 ===")



건물별 개별 모델 학습/복원 시작... (총 100개 건물)
  Building 1 (Hotel): 모델 모두 존재, OOF 및 CV 결과 재계산
    → 복원된 CV SMAPE: 12.017 → 8.981 → 9.040 → 8.852 → 9.991 (평균: 9.776)
  Building 2 (Commercial): 모델 모두 존재, OOF 및 CV 결과 재계산
    → 복원된 CV SMAPE: 5.383 → 7.231 → 5.916 → 7.032 → 5.506 (평균: 6.214)
  Building 3 (Hospital): 모델 모두 존재, OOF 및 CV 결과 재계산
    → 복원된 CV SMAPE: 4.686 → 3.107 → 3.095 → 4.570 → 4.614 (평균: 4.014)
  Building 4 (Hotel): 모델 모두 존재, OOF 및 CV 결과 재계산
    → 복원된 CV SMAPE: 5.951 → 5.382 → 4.936 → 5.035 → 5.860 (평균: 5.433)
  Building 5 (University): 모델 모두 존재, OOF 및 CV 결과 재계산
    → 복원된 CV SMAPE: 3.218 → 3.118 → 3.870 → 2.388 → 4.531 (평균: 3.425)
  Building 6 (Commercial): 모델 모두 존재, OOF 및 CV 결과 재계산
    → 복원된 CV SMAPE: 10.424 → 8.772 → 7.635 → 16.614 → 12.678 (평균: 11.224)
  Building 7 (Other Buildings): 모델 모두 존재, OOF 및 CV 결과 재계산
    → 복원된 CV SMAPE: 14.161 → 13.348 → 85.953 → 7.901 → 3.586 (평균: 24.990)
  Building 8 (University): 모델 모두 존재, OOF 및 CV 결과 재계산
    → 복원된 CV SMAPE: 8.152 → 16.346 → 9.497 → 12.8

### 건물 유형별 학습

In [None]:
# 유형별 모델 저장 경로 설정
MODEL_PATH_BUILDING_TYPE = f'{PROJECT_PATH}/models/building_types'
os.makedirs(MODEL_PATH_BUILDING_TYPE, exist_ok=True)

In [None]:
#========================================
# 건물 유형별 모델용 유틸리티 함수들
#========================================
def check_building_type_completion(building_type, n_folds=5):
    """해당 건물 유형의 모든 fold 모델이 존재하는지 확인"""
    safe_type_name = str(building_type).replace(' ', '_').replace('/', '_').replace('(', '').replace(')', '')
    for fold in range(n_folds):
        json_path = f"{MODEL_PATH_BUILDING_TYPE}/xgb_type_{safe_type_name}_fold{fold}.json"
        pkl_path = f"{MODEL_PATH_BUILDING_TYPE}/xgb_type_{safe_type_name}_fold{fold}.pkl"
        if not (os.path.exists(json_path) or os.path.exists(pkl_path)):
            return False, fold  # 해당 fold부터 재시작
    return True, -1


def load_existing_type_models(building_type, n_folds=5):
    """저장된 건물 유형별 모델들 로드"""
    safe_type_name = str(building_type).replace(' ', '_').replace('/', '_').replace('(', '').replace(')', '')
    models = []
    for fold in range(n_folds):
        json_path = f"{MODEL_PATH_BUILDING_TYPE}/xgb_type_{safe_type_name}_fold{fold}.json"
        pkl_path = f"{MODEL_PATH_BUILDING_TYPE}/xgb_type_{safe_type_name}_fold{fold}.pkl"
        if os.path.exists(json_path):
            model = xgb.Booster()
            model.load_model(json_path)
            models.append(model)
        elif os.path.exists(pkl_path):
            with open(pkl_path, 'rb') as f:
                model = pickle.load(f)
            models.append(model)
        else:
            raise FileNotFoundError(f"모델 파일이 없습니다: {json_path}, {pkl_path}")
    return models


def regenerate_oof_from_type_models(type_data, models, feature_cols, tscv):
    """저장된 건물 유형별 모델로 OOF 재계산"""
    n_samples = len(type_data)
    oof_preds = np.zeros(n_samples)
    for fold, (train_idx, val_idx) in enumerate(tscv.split(type_data)):
        tr_df = type_data.iloc[train_idx].copy()
        va_df = type_data.iloc[val_idx].copy()
        stats = _build_time_stats(tr_df)
        va_df = attach_time_stats(va_df, stats)
        X_val_cv = va_df[feature_cols].copy().drop(columns=['building_number'], errors='ignore')
        dvalid = xgb.DMatrix(X_val_cv, feature_names=X_val_cv.columns.tolist())
        val_pred_log = models[fold].predict(dvalid)
        val_pred = np.expm1(val_pred_log)
        oof_preds[val_idx] = val_pred
    return oof_preds


#========================================
# 건물 유형별 하이퍼파라미터 (개별 모델과 동일하게 시작)
building_type_params = {
    "max_depth": 10,
    "learning_rate": 0.005,
    "subsample": 0.8,
    "colsample_bytree": 0.6,
    "min_child_weight": 4,
    "gamma": 0.05,
    "lambda": 1.2,
    "alpha": 0.4,
    "tree_method": "hist",
    "device": "cuda",
    "max_bin": 256,
    "verbosity": 0,
    "random_state": RANDOM_SEED,
    "eval_metric": "mae",
}

N_BOOST_ROUND = 8000
ES_ROUNDS = 600

print(f"\n건물 유형별 모델 학습/복원 시작...")

# 건물 유형별 데이터 및 결과 저장용 딕셔너리
building_type_models = {}
building_type_cv_results = {}
oof_preds_per_type = {}

# 건물 유형 목록 확인
building_types = train['building'].unique()
print(f"건물 유형 목록: {building_types}")

tscv = TimeSeriesSplit(n_splits=5, test_size=24*7, gap=24)

for type_idx, building_type in enumerate(building_types):
    print(f"\n=== [{type_idx+1}/{len(building_types)}] {building_type} 유형 모델 학습 ===")

    # 해당 유형 데이터만 필터링
    type_data = train[train['building'] == building_type].copy()
    type_data = type_data.sort_values('date_time').reset_index(drop=True)
    n_samples = len(type_data)
    n_buildings = type_data['building_number'].nunique()

    print(f"  데이터 크기: {n_samples}행, {n_buildings}개 건물")

    if n_samples < 1000:  # 유형별 최소 데이터 기준
        print(f"  데이터가 부족합니다 ({n_samples}행). 유형별 학습 스킵.")
        continue

    # 모델 완성 여부 확인
    is_complete, resume_fold = check_building_type_completion(building_type)
    fold_results = []
    safe_type_name = str(building_type).replace(' ', '_').replace('/', '_').replace('(', '').replace(')', '')

    if is_complete:
        print(f"  {building_type} 유형: 모델 모두 존재, OOF 재계산")
        models = load_existing_type_models(building_type)
        building_type_models[building_type] = models
        oof_preds = regenerate_oof_from_type_models(type_data, models, feature_cols, tscv)
        oof_preds_per_type[building_type] = oof_preds
        continue

    print(f"  {building_type} 유형: fold {resume_fold}부터 학습 재개")
    models = []
    oof_preds = np.zeros(n_samples)

    for fold, (train_idx, val_idx) in enumerate(tscv.split(type_data)):
        if fold < resume_fold:
            # 이미 저장된 모델 불러오기
            model = None
            json_path = f"{MODEL_PATH_BUILDING_TYPE}/xgb_type_{safe_type_name}_fold{fold}.json"
            pkl_path = f"{MODEL_PATH_BUILDING_TYPE}/xgb_type_{safe_type_name}_fold{fold}.pkl"
            if os.path.exists(json_path):
                model = xgb.Booster()
                model.load_model(json_path)
            elif os.path.exists(pkl_path):
                with open(pkl_path, 'rb') as f:
                    model = pickle.load(f)
            if model is not None:
                models.append(model)
                print(f"    Fold {fold+1}: 이미 저장된 모델 사용")
            else:
                raise FileNotFoundError(f"Fold {fold} 모델 파일이 없습니다.")
            continue

        print(f"    Fold {fold+1}/{tscv.n_splits} 학습 중... ", end="")
        tr_df = type_data.iloc[train_idx].copy()
        va_df = type_data.iloc[val_idx].copy()

        # 통계 피처 생성 (해당 fold 훈련 데이터만 사용)
        stats = _build_time_stats(tr_df)
        tr_df = attach_time_stats(tr_df, stats)
        va_df = attach_time_stats(va_df, stats)

        # 피처 준비
        X_train_cv = tr_df[feature_cols].copy().drop(columns=['building_number'], errors='ignore')
        X_val_cv = va_df[feature_cols].copy().drop(columns=['building_number'], errors='ignore')
        y_train_cv = tr_df['power_consumption_log']
        y_val_cv = va_df['power_consumption_log']

        # XGBoost 데이터 준비
        dtrain = xgb.DMatrix(X_train_cv, label=y_train_cv, feature_names=X_train_cv.columns.tolist())
        dvalid = xgb.DMatrix(X_val_cv, label=y_val_cv, feature_names=X_train_cv.columns.tolist())

        # 모델 학습
        evals_result = {}
        model = xgb.train(
            params=building_type_params,
            dtrain=dtrain,
            num_boost_round=N_BOOST_ROUND,
            evals=[(dtrain, "train"), (dvalid, "valid")],
            obj=weighted_mse(alpha=3.0),
            custom_metric=smape_eval,
            early_stopping_rounds=ES_ROUNDS,
            verbose_eval=False,
            evals_result=evals_result
        )

        models.append(model)

        # OOF 예측 및 성능 평가
        val_pred_log = model.predict(dvalid, iteration_range=(0, model.best_iteration+1))
        val_pred = np.expm1(val_pred_log)
        oof_preds[val_idx] = val_pred

        y_val = np.expm1(y_val_cv)
        smape_score = smape(y_val, val_pred)
        rmse_score = np.sqrt(np.mean((val_pred - y_val) ** 2))

        # 결과 저장
        fold_results.append({
            'fold': fold + 1,
            'smape': smape_score,
            'rmse': rmse_score,
            'best_iteration': model.best_iteration,
            'train_size': len(train_idx),
            'val_size': len(val_idx),
            'train_mae_final': evals_result['train']['mae'][model.best_iteration],
            'valid_mae_final': evals_result['valid']['mae'][model.best_iteration],
            'train_smape_final': evals_result['train']['smape'][model.best_iteration],
            'valid_smape_final': evals_result['valid']['smape'][model.best_iteration],
            'train_val_gap': evals_result['valid']['mae'][model.best_iteration] - evals_result['train']['mae'][model.best_iteration],
            'building_type': building_type,
            'n_buildings_in_fold': len(tr_df['building_number'].unique())
        })

        print(f"SMAPE: {smape_score:.4f}, best_iter: {model.best_iteration}, "
              f"train_mae: {evals_result['train']['mae'][model.best_iteration]:.4f}, "
              f"valid_mae: {evals_result['valid']['mae'][model.best_iteration]:.4f}")

        # 모델 저장
        model_file = f"{MODEL_PATH_BUILDING_TYPE}/xgb_type_{safe_type_name}_fold{fold}.json"
        try:
            model.save_model(model_file)
        except Exception as e:
            print(f"\n    JSON 저장 실패 (Fold {fold}), Pickle로 저장: {e}")
            pickle_file = f"{MODEL_PATH_BUILDING_TYPE}/xgb_type_{safe_type_name}_fold{fold}.pkl"
            with open(pickle_file, 'wb') as f:
                pickle.dump(model, f)

    # fold 루프 종료 후 결과 저장
    building_type_models[building_type] = models
    oof_preds_per_type[building_type] = oof_preds
    building_type_cv_results[building_type] = fold_results

    # 유형별 성능 분석
    if fold_results:
        smape_list = [r['smape'] for r in fold_results]
        score, weighted_mean, weighted_std, weights = recent_weighted_score(smape_list, decay=0.5)

        print(f"\n  === {building_type} 유형 상세 분석 ===")
        print(f"  시간순 SMAPE 추세: {' -> '.join([f'{s:.3f}' for s in smape_list])}")
        print(f"  최근성 가중평균 SMAPE: {weighted_mean:.4f}, 안정성 패널티: {weighted_std:.4f}, 최종 Score: {score:.4f}")
        print(f"  가중치: {weights}")

        avg_best_iter = np.mean([r['best_iteration'] for r in fold_results])
        avg_gap = np.mean([r['train_val_gap'] for r in fold_results])
        gap_std = np.std([r['train_val_gap'] for r in fold_results])
        avg_buildings_per_fold = np.mean([r['n_buildings_in_fold'] for r in fold_results])

        print(f"\n  === 모델 진단 ===")
        print(f"  평균 best_iteration: {avg_best_iter:.0f} (목표: {N_BOOST_ROUND}의 10-50%)")
        print(f"  평균 train-valid gap: {avg_gap:+.4f} ± {gap_std:.4f}")
        print(f"  fold당 평균 건물 수: {avg_buildings_per_fold:.1f}개")

        if avg_best_iter < N_BOOST_ROUND * 0.1:
            print("  >> 과적합 의심: best_iteration이 너무 작음")
        elif avg_best_iter > N_BOOST_ROUND * 0.8:
            print("  >> 과소적합 의심: early stopping이 거의 작동하지 않음")
        else:
            print("  >> 정상 범위: 적절한 학습 종료")

        if avg_gap > 0.02:
            print("  >> 과적합 의심: train-valid 성능 차이가 큼")
        elif avg_gap < -0.01:
            print("  >> 이상: valid가 train보다 좋음")
        else:
            print("  >> 정상: train-valid 성능 차이 적절")

        smape_std = np.std(smape_list)
        if smape_std > np.mean(smape_list) * 0.1:
            print(f"  >> 불안정: fold간 성능 차이 큼 (SMAPE std: {smape_std:.4f})")
        else:
            print(f"  >> 안정: fold간 성능 일관성 좋음 (SMAPE std: {smape_std:.4f})")

print(f"\n=== 건물 유형별 모델 학습 완료 요약 ===")

# 유형별 최종 성능 요약
type_performance_summary = {}
for building_type, results in building_type_cv_results.items():
    if not results:
        continue

    smape_trend = [r['smape'] for r in results]
    best_iter_avg = np.mean([r['best_iteration'] for r in results])
    gap_avg = np.mean([r['train_val_gap'] for r in results])
    buildings_count = len(train[train['building'] == building_type]['building_number'].unique())

    type_performance_summary[building_type] = {
        'final_smape': smape_trend[-1],
        'avg_smape': np.mean(smape_trend),
        'avg_iter': best_iter_avg,
        'avg_gap': gap_avg,
        'n_buildings': buildings_count,
        'data_size': len(train[train['building'] == building_type])
    }

    print(f"{building_type:>20} | "
          f"최종SMAPE: {smape_trend[-1]:.4f} | "
          f"평균SMAPE: {np.mean(smape_trend):.4f} | "
          f"평균iter: {best_iter_avg:.0f} | "
          f"평균gap: {gap_avg:+.4f} | "
          f"건물수: {buildings_count:>2}개 | "
          f"데이터: {len(train[train['building'] == building_type]):>4}행")

print(f"\n=== 건물 유형별 모델 성능 종합 요약 ===")
smapes = [summary['final_smape'] for summary in type_performance_summary.values()]
if smapes:
    print(f"전체 유형별 최종 SMAPE - 평균: {np.mean(smapes):.3f} ± {np.std(smapes):.3f} | "
          f"범위: {np.min(smapes):.3f} ~ {np.max(smapes):.3f}")



건물 유형별 모델 학습/복원 시작...
건물 유형 목록: ['Hotel' 'Commercial' 'Hospital' 'University' 'Other Buildings'
 'Apartment' 'Research Institute' 'Department Store' 'IDC' 'Public']

=== [1/10] Hotel 유형 모델 학습 ===
  데이터 크기: 20400행, 10개 건물
  Hotel 유형: fold 0부터 학습 재개
    Fold 1/5 학습 중... SMAPE: 7.4005, best_iter: 1225, train_mae: 0.0428, valid_mae: 0.0742
    Fold 2/5 학습 중... SMAPE: 8.3624, best_iter: 1490, train_mae: 0.0415, valid_mae: 0.0845
    Fold 3/5 학습 중... SMAPE: 11.7773, best_iter: 1314, train_mae: 0.0417, valid_mae: 0.1195
    Fold 4/5 학습 중... SMAPE: 8.5224, best_iter: 2635, train_mae: 0.0414, valid_mae: 0.0855
    Fold 5/5 학습 중... SMAPE: 7.1946, best_iter: 2122, train_mae: 0.0416, valid_mae: 0.0721

  === Hotel 유형 상세 분석 ===
  시간순 SMAPE 추세: 7.401 -> 8.362 -> 11.777 -> 8.522 -> 7.195
  최근성 가중평균 SMAPE: 8.2105, 안정성 패널티: 1.4907, 최종 Score: 9.7012
  가중치: [0.03225806 0.06451613 0.12903226 0.25806452 0.51612903]

  === 모델 진단 ===
  평균 best_iteration: 1757 (목표: 8000의 10-50%)
  평균 train-valid gap: +0.0453

### 앙상블 - 소프트보팅

In [None]:
def create_test_predictions_with_weighted_voting(test_data):
    """테스트 데이터에 대한 가중 소프트 보팅 예측"""
    print("=== 테스트 데이터 가중 소프트 보팅 예측 ===")

    # 결과 저장용
    final_test_predictions = {}

    # 테스트 데이터의 건물별로 처리
    test_buildings = test_data['building_number'].unique()

    for building_num in test_buildings:
        print(f"건물 {building_num} 테스트 예측 중...")

        # 1. 해당 건물의 테스트 데이터
        building_test = test_data[test_data['building_number'] == building_num].copy()
        building_test = building_test.sort_values('date_time').reset_index(drop=True)

        if building_test.empty:
            continue

        building_type = building_test['building'].iloc[0]
        n_test_samples = len(building_test)

        # 2. 10개 모델의 예측값과 가중치 수집
        all_predictions = []  # shape: (n_models, n_test_samples)
        all_weights = []      # shape: (n_models,)
        model_info = []

        # 2-1. 개별 모델 5폴드 예측
        if building_num in individual_models and building_num in individual_cv_results:
            models = individual_models[building_num]
            cv_results = individual_cv_results[building_num]

            # 훈련 데이터로 통계 피처 생성 (전체 훈련 데이터 사용)
            building_train = train[train['building_number'] == building_num].copy()
            building_train = building_train.sort_values('date_time').reset_index(drop=True)
            train_stats = _build_time_stats(building_train)

            # 테스트 데이터에 통계 피처 적용
            building_test_featured = attach_time_stats(building_test.copy(), train_stats)

            for fold in range(5):
                if fold < len(models) and fold < len(cv_results):
                    try:
                        # 테스트 데이터 예측
                        X_test = building_test_featured[feature_cols].copy()
                        X_test = X_test.drop(columns=['building_number'], errors='ignore')

                        dtest = xgb.DMatrix(X_test, feature_names=X_test.columns.tolist())
                        test_pred_log = models[fold].predict(dtest)
                        test_pred = np.expm1(test_pred_log)

                        # 가중치 계산
                        fold_smape = cv_results[fold]['smape']
                        smape_weight = 1.0 / (fold_smape + 1e-8)
                        temporal_weight = (0.7) ** (4 - fold)  # 지수감쇠
                        final_weight = smape_weight * temporal_weight

                        all_predictions.append(test_pred)
                        all_weights.append(final_weight)
                        model_info.append({
                            'type': 'individual',
                            'fold': fold,
                            'smape': fold_smape,
                            'final_weight': final_weight
                        })

                        print(f"  개별 Fold {fold}: 가중치 {final_weight:.4f}")

                    except Exception as e:
                        print(f"  개별 Fold {fold} 예측 실패: {str(e)}")

        # 2-2. 유형별 모델 5폴드 예측
        if building_type in building_type_models and building_type in building_type_cv_results:
            models = building_type_models[building_type]
            cv_results = building_type_cv_results[building_type]

            # 해당 유형의 전체 훈련 데이터로 통계 피처 생성
            type_train = train[train['building'] == building_type].copy()
            type_train = type_train.sort_values('date_time').reset_index(drop=True)
            type_train_stats = _build_time_stats(type_train)

            # 테스트 데이터에 유형별 통계 피처 적용
            building_test_type_featured = attach_time_stats(building_test.copy(), type_train_stats)

            for fold in range(5):
                if fold < len(models) and fold < len(cv_results):
                    try:
                        # 테스트 데이터 예측
                        X_test = building_test_type_featured[feature_cols].copy()
                        X_test = X_test.drop(columns=['building_number'], errors='ignore')

                        dtest = xgb.DMatrix(X_test, feature_names=X_test.columns.tolist())
                        test_pred_log = models[fold].predict(dtest)
                        test_pred = np.expm1(test_pred_log)

                        # 가중치 계산
                        fold_smape = cv_results[fold]['smape']
                        smape_weight = 1.0 / (fold_smape + 1e-8)
                        temporal_weight = (0.7) ** (4 - fold)
                        final_weight = smape_weight * temporal_weight

                        all_predictions.append(test_pred)
                        all_weights.append(final_weight)
                        model_info.append({
                            'type': 'type',
                            'fold': fold,
                            'smape': fold_smape,
                            'final_weight': final_weight
                        })

                        print(f"  유형 Fold {fold}: 가중치 {final_weight:.4f}")

                    except Exception as e:
                        print(f"  유형 Fold {fold} 예측 실패: {str(e)}")

        # 3. 가중 평균으로 최종 예측
        if len(all_predictions) > 0:
            # 가중치 정규화
            weights_array = np.array(all_weights)
            weights_normalized = weights_array / np.sum(weights_array)

            # 가중 평균 계산
            predictions_array = np.array(all_predictions)  # (n_models, n_samples)
            final_pred = np.average(predictions_array, axis=0, weights=weights_normalized)

            final_test_predictions[building_num] = final_pred

            print(f"  ✅ 완료: {len(all_predictions)}개 모델 결합, 테스트 샘플 {n_test_samples}개")
            print(f"      가중치 분포: 개별={np.sum([w for i, w in enumerate(weights_normalized) if model_info[i]['type']=='individual']):.3f}, "
                  f"유형={np.sum([w for i, w in enumerate(weights_normalized) if model_info[i]['type']=='type']):.3f}")
        else:
            print(f"  ❌ 건물 {building_num}: 사용 가능한 모델 없음")

    print(f"\n=== 테스트 예측 완료 ===")
    print(f"예측 완료 건물 수: {len(final_test_predictions)}")

    return final_test_predictions


def save_test_predictions(final_test_predictions, test_data, output_path='submission.csv'):
    """테스트 예측 결과를 제출 형태로 저장 (num_date_answer 형식 with hour)"""
    print("\n=== 제출 파일 생성 ===")

    # 제출 데이터프레임 준비
    submission_rows = []

    for building_num in sorted(test_data['building_number'].unique()):
        if building_num in final_test_predictions:
            building_test = test_data[test_data['building_number'] == building_num].copy()
            building_test = building_test.sort_values('date_time').reset_index(drop=True)
            predictions = final_test_predictions[building_num]

            for idx, (_, row) in enumerate(building_test.iterrows()):
                if idx < len(predictions):
                    # datetime을 파싱
                    dt = pd.to_datetime(row['date_time'])

                    # YYYYMMDD 형식으로 날짜 변환
                    date_str = dt.strftime('%Y%m%d')

                    # 시간을 2자리 형식으로 변환 (00, 01, 02, ...)
                    hour_str = dt.strftime('%H')

                    # num_date 형식 생성 (building_num_YYYYMMDD HH)
                    num_date_id = f"{building_num}_{date_str} {hour_str}"

                    submission_rows.append({
                        'num_date_time': num_date_id,
                        'answer': predictions[idx]
                    })
        else:
            print(f"경고: 건물 {building_num}의 예측값이 없습니다")

    # DataFrame 생성
    submission_df = pd.DataFrame(submission_rows)

    # 음수 예측값 처리
    negative_count = np.sum(submission_df['answer'] < 0)
    if negative_count > 0:
        print(f"음수 예측값 {negative_count}개를 0으로 조정")
        submission_df['answer'] = np.maximum(submission_df['answer'], 0)

    # CSV 저장
    submission_df.to_csv(output_path, index=False)

    print(f"✅ 제출 파일 저장: {output_path}")
    print(f"   총 예측값 수: {len(submission_df)}")
    print(f"   예측값 범위: {submission_df['answer'].min():.2f} ~ {submission_df['answer'].max():.2f}")
    print(f"   예측값 평균: {submission_df['answer'].mean():.2f}")
    print(f"   샘플 ID 형식: {submission_df['num_date_answer'].iloc[0] if len(submission_df) > 0 else 'N/A'}")

    return submission_df

# 🚀 최종 테스트 예측 실행
print("\n" + "="*60)
print("가중 소프트 보팅 기반 테스트 예측")
print("="*60)

# 1. 테스트 예측 생성
final_test_predictions = create_test_predictions_with_weighted_voting(test)

# 2. 제출 파일 저장
submission_df = save_test_predictions(final_test_predictions, test, 'weighted_soft_voting_submission.csv')

# 3. 최종 요약
print(f"\n🎯 최종 요약:")
print(f"   - 사용된 가중치: SMAPE 역수 × 지수감쇠(0.7^(4-fold))")
print(f"   - 최대 모델 수: 10개 (개별 5폴드 + 유형 5폴드)")
print(f"   - 예측 완료 건물: {len(final_test_predictions)}개")
print(f"   - 제출 파일 형식: num_date_answer (건물번호_날짜 시간), answer (예측값)")
print(f"   - 샘플 형식: 1_20240825 00, 1_20240825 01, ...")
print(f"   - 제출 파일: weighted_soft_voting_submission.csv")
print(f"✅ 완료!")



가중 소프트 보팅 기반 테스트 예측
=== 테스트 데이터 가중 소프트 보팅 예측 ===
건물 1 테스트 예측 중...
  개별 Fold 0: 가중치 0.0200
  개별 Fold 1: 가중치 0.0382
  개별 Fold 2: 가중치 0.0542
  개별 Fold 3: 가중치 0.0791
  개별 Fold 4: 가중치 0.1001
  유형 Fold 0: 가중치 0.0324
  유형 Fold 1: 가중치 0.0410
  유형 Fold 2: 가중치 0.0416
  유형 Fold 3: 가중치 0.0821
  유형 Fold 4: 가중치 0.1390
  ✅ 완료: 10개 모델 결합, 테스트 샘플 168개
      가중치 분포: 개별=0.464, 유형=0.536
건물 2 테스트 예측 중...
  개별 Fold 0: 가중치 0.0446
  개별 Fold 1: 가중치 0.0474
  개별 Fold 2: 가중치 0.0828
  개별 Fold 3: 가중치 0.0996
  개별 Fold 4: 가중치 0.1816
  유형 Fold 0: 가중치 0.0588
  유형 Fold 1: 가중치 0.0995
  유형 Fold 2: 가중치 0.1645
  유형 Fold 3: 가중치 0.2523
  유형 Fold 4: 가중치 0.2762
  ✅ 완료: 10개 모델 결합, 테스트 샘플 168개
      가중치 분포: 개별=0.349, 유형=0.651
건물 3 테스트 예측 중...
  개별 Fold 0: 가중치 0.0512
  개별 Fold 1: 가중치 0.1104
  개별 Fold 2: 가중치 0.1583
  개별 Fold 3: 가중치 0.1532
  개별 Fold 4: 가중치 0.2167
  유형 Fold 0: 가중치 0.1091
  유형 Fold 1: 가중치 0.1194
  유형 Fold 2: 가중치 0.2052
  유형 Fold 3: 가중치 0.2880
  유형 Fold 4: 가중치 0.3288
  ✅ 완료: 10개 모델 결합, 테스트 샘플 168개
      가중치 분포: 개별=0.3

###full train

In [None]:
# 하이퍼파라미터 설정 (주어진 파라미터 사용)
building_type_params = {
    "max_depth": 10,
    "learning_rate": 0.005,
    "subsample": 0.8,
    "colsample_bytree": 0.6,
    "min_child_weight": 4,
    "gamma": 0.05,
    "lambda": 1.2,
    "alpha": 0.4,
    "tree_method": "hist",
    "device": "cuda",
    "max_bin": 256,
    "verbosity": 0,
    "random_state": 42,
    "eval_metric": "mae",
    "objective": "reg:squarederror"
}

# 개별 건물용 파라미터 (유형별과 동일하게 설정)
individual_building_params = building_type_params.copy()

N_BOOST_ROUND = 8000
ES_ROUNDS = 600

def train_full_models_for_final_prediction(train_data):
    """전체 train 데이터로 최종 모델 학습 (평가 없음)"""
    print("=== Full Train 데이터 기반 최종 모델 학습 ===")

    final_individual_models = {}
    final_building_type_models = {}

    # 1. 개별 건물 모델 학습
    print("\n1. 개별 건물 모델 학습")
    buildings = train_data['building_number'].unique()

    for building_num in buildings:
        print(f"건물 {building_num} 학습 중...")

        # 해당 건물 데이터
        building_data = train_data[train_data['building_number'] == building_num].copy()
        building_data = building_data.sort_values('date_time').reset_index(drop=True)

        if len(building_data) < 100:  # 최소 데이터 체크
            print(f"  ⚠️ 건물 {building_num}: 데이터 부족 ({len(building_data)}개)")
            continue

        try:
            # 통계 피처 생성
            train_stats = _build_time_stats(building_data)
            building_featured = attach_time_stats(building_data.copy(), train_stats)

            # 피처 및 타겟 준비
            X = building_featured[feature_cols].copy()
            X = X.drop(columns=['building_number'], errors='ignore')
            y = building_featured['power_consumption_log']  # 이미 log 변환된 것 사용

            # XGBoost 데이터 준비
            dtrain = xgb.DMatrix(X, label=y, feature_names=X.columns.tolist())

            # 모델 학습 (early stopping 없이)
            model = xgb.train(
                params=individual_building_params,
                dtrain=dtrain,
                num_boost_round=N_BOOST_ROUND,
                obj=weighted_mse(alpha=3.0),  # 기존 사용하던 custom objective
                verbose_eval=False
            )

            final_individual_models[building_num] = {
                'model': model,
                'train_stats': train_stats,
                'data_count': len(building_data),
                'building_type': building_featured['building'].iloc[0]
            }

            print(f"  ✅ 건물 {building_num}: {len(building_data)}개 샘플로 학습 완료")

        except Exception as e:
            print(f"  ❌ 건물 {building_num} 학습 실패: {str(e)}")

    # 2. 건물 유형별 모델 학습
    print("\n2. 건물 유형별 모델 학습")
    building_types = train_data['building'].unique()

    for building_type in building_types:
        print(f"유형 {building_type} 학습 중...")

        # 해당 유형 데이터
        type_data = train_data[train_data['building'] == building_type].copy()
        type_data = type_data.sort_values('date_time').reset_index(drop=True)

        if len(type_data) < 200:  # 유형별 최소 데이터 체크
            print(f"  ⚠️ 유형 {building_type}: 데이터 부족 ({len(type_data)}개)")
            continue

        try:
            # 통계 피처 생성
            type_train_stats = _build_time_stats(type_data)
            type_featured = attach_time_stats(type_data.copy(), type_train_stats)

            # 피처 및 타겟 준비
            X = type_featured[feature_cols].copy()
            X = X.drop(columns=['building_number'], errors='ignore')
            y = type_featured['power_consumption_log']  # 이미 log 변환된 것 사용

            # XGBoost 데이터 준비
            dtrain = xgb.DMatrix(X, label=y, feature_names=X.columns.tolist())

            # 모델 학습 (early stopping 없이)
            model = xgb.train(
                params=building_type_params,
                dtrain=dtrain,
                num_boost_round=N_BOOST_ROUND,
                obj=weighted_mse(alpha=3.0),  # 기존 사용하던 custom objective
                verbose_eval=False
            )

            final_building_type_models[building_type] = {
                'model': model,
                'train_stats': type_train_stats,
                'data_count': len(type_data),
                'n_buildings': type_data['building_number'].nunique()
            }

            print(f"  ✅ 유형 {building_type}: {len(type_data)}개 샘플 ({type_data['building_number'].nunique()}개 건물)로 학습 완료")

        except Exception as e:
            print(f"  ❌ 유형 {building_type} 학습 실패: {str(e)}")

    print(f"\n=== Full Train 학습 완료 ===")
    print(f"개별 모델: {len(final_individual_models)}개")
    print(f"유형 모델: {len(final_building_type_models)}개")

    return final_individual_models, final_building_type_models


def create_full_train_predictions(test_data, individual_models, type_models):
    """Full train 모델로 테스트 예측 (단순 평균)"""
    print("\n=== Full Train 모델 기반 테스트 예측 ===")

    final_predictions = {}

    for building_num in test_data['building_number'].unique():
        print(f"건물 {building_num} 예측 중...")

        # 테스트 데이터 준비
        building_test = test_data[test_data['building_number'] == building_num].copy()
        building_test = building_test.sort_values('date_time').reset_index(drop=True)

        if building_test.empty:
            continue

        building_type = building_test['building'].iloc[0]
        predictions = []
        model_names = []

        # 1. 개별 모델 예측
        if building_num in individual_models:
            try:
                model_info = individual_models[building_num]
                model = model_info['model']
                train_stats = model_info['train_stats']

                # 테스트 데이터에 통계 피처 적용
                test_featured = attach_time_stats(building_test.copy(), train_stats)
                X_test = test_featured[feature_cols].copy()
                X_test = X_test.drop(columns=['building_number'], errors='ignore')

                # 예측 (log scale에서 예측 후 역변환)
                dtest = xgb.DMatrix(X_test, feature_names=X_test.columns.tolist())
                pred_log = model.predict(dtest)
                pred = np.expm1(pred_log)  # log 역변환

                predictions.append(pred)
                model_names.append('individual')
                print(f"  ✅ 개별 모델 예측 완료")

            except Exception as e:
                print(f"  ❌ 개별 모델 예측 실패: {str(e)}")

        # 2. 유형별 모델 예측
        if building_type in type_models:
            try:
                model_info = type_models[building_type]
                model = model_info['model']
                train_stats = model_info['train_stats']

                # 테스트 데이터에 통계 피처 적용
                test_featured = attach_time_stats(building_test.copy(), train_stats)
                X_test = test_featured[feature_cols].copy()
                X_test = X_test.drop(columns=['building_number'], errors='ignore')

                # 예측 (log scale에서 예측 후 역변환)
                dtest = xgb.DMatrix(X_test, feature_names=X_test.columns.tolist())
                pred_log = model.predict(dtest)
                pred = np.expm1(pred_log)  # log 역변환

                predictions.append(pred)
                model_names.append('type')
                print(f"  ✅ 유형 모델 예측 완료")

            except Exception as e:
                print(f"  ❌ 유형 모델 예측 실패: {str(e)}")

        # 3. 단순 평균으로 최종 예측
        if len(predictions) > 0:
            if len(predictions) == 1:
                final_pred = predictions[0]
                print(f"  ✅ 단일 모델 ({model_names[0]}) 사용")
            else:
                final_pred = np.mean(predictions, axis=0)
                print(f"  ✅ {len(predictions)}개 모델 평균: {', '.join(model_names)}")

            final_predictions[building_num] = final_pred
        else:
            print(f"  ❌ 건물 {building_num}: 사용 가능한 모델 없음")

    print(f"\n=== 예측 완료 ===")
    print(f"예측 완료 건물: {len(final_predictions)}개")

    return final_predictions


def save_test_predictions(final_test_predictions, test_data, output_path='submission.csv'):
    """테스트 예측 결과를 제출 형태로 저장 (num_date_answer 형식 with hour)"""
    print("\n=== 제출 파일 생성 ===")

    # 제출 데이터프레임 준비
    submission_rows = []

    for building_num in sorted(test_data['building_number'].unique()):
        if building_num in final_test_predictions:
            building_test = test_data[test_data['building_number'] == building_num].copy()
            building_test = building_test.sort_values('date_time').reset_index(drop=True)
            predictions = final_test_predictions[building_num]

            for idx, (_, row) in enumerate(building_test.iterrows()):
                if idx < len(predictions):
                    # datetime을 파싱
                    dt = pd.to_datetime(row['date_time'])

                    # YYYYMMDD 형식으로 날짜 변환
                    date_str = dt.strftime('%Y%m%d')

                    # 시간을 2자리 형식으로 변환 (00, 01, 02, ...)
                    hour_str = dt.strftime('%H')

                    # num_date 형식 생성 (building_num_YYYYMMDD HH)
                    num_date_id = f"{building_num}_{date_str} {hour_str}"

                    submission_rows.append({
                        'num_date_time': num_date_id,
                        'answer': predictions[idx]
                    })
        else:
            print(f"경고: 건물 {building_num}의 예측값이 없습니다")

    # DataFrame 생성
    submission_df = pd.DataFrame(submission_rows)

    # 음수 예측값 처리
    negative_count = np.sum(submission_df['answer'] < 0)
    if negative_count > 0:
        print(f"음수 예측값 {negative_count}개를 0으로 조정")
        submission_df['answer'] = np.maximum(submission_df['answer'], 0)

    # CSV 저장
    submission_df.to_csv(output_path, index=False)

    print(f"✅ 제출 파일 저장: {output_path}")
    print(f"   총 예측값 수: {len(submission_df)}")
    print(f"   예측값 범위: {submission_df['answer'].min():.2f} ~ {submission_df['answer'].max():.2f}")
    print(f"   예측값 평균: {submission_df['answer'].mean():.2f}")
    print(f"   샘플 ID 형식: {submission_df['num_date_answer'].iloc[0] if len(submission_df) > 0 else 'N/A'}")

    return submission_df

# 🚀 Full Train 기반 최종 예측 실행
print("\n" + "="*70)
print("High-Performance Full Train 기반 소프트 보팅 예측")
print("="*70)

print(f"\n📋 사용된 하이퍼파라미터:")
print(f"   - max_depth: {building_type_params['max_depth']}")
print(f"   - learning_rate: {building_type_params['learning_rate']}")
print(f"   - subsample: {building_type_params['subsample']}")
print(f"   - colsample_bytree: {building_type_params['colsample_bytree']}")
print(f"   - min_child_weight: {building_type_params['min_child_weight']}")
print(f"   - gamma: {building_type_params['gamma']}")
print(f"   - lambda: {building_type_params['lambda']}")
print(f"   - alpha: {building_type_params['alpha']}")
print(f"   - n_boost_round: {N_BOOST_ROUND}")
print(f"   - 사용 GPU: {building_type_params['device']}")

# 1. Full train 모델 학습
final_individual_models, final_type_models = train_full_models_for_final_prediction(train)

# 2. 테스트 예측
full_train_predictions = create_full_train_predictions(test, final_individual_models, final_type_models)

# 3. 제출 파일 저장
submission_df = save_test_predictions(full_train_predictions, test, 'high_performance_full_train_submission.csv')



High-Performance Full Train 기반 소프트 보팅 예측

📋 사용된 하이퍼파라미터:
   - max_depth: 10
   - learning_rate: 0.005
   - subsample: 0.8
   - colsample_bytree: 0.6
   - min_child_weight: 4
   - gamma: 0.05
   - lambda: 1.2
   - alpha: 0.4
   - n_boost_round: 8000
   - 사용 GPU: cuda
=== Full Train 데이터 기반 최종 모델 학습 ===

1. 개별 건물 모델 학습
건물 1 학습 중...
  ✅ 건물 1: 2040개 샘플로 학습 완료
건물 2 학습 중...
  ✅ 건물 2: 2040개 샘플로 학습 완료
건물 3 학습 중...
  ✅ 건물 3: 2040개 샘플로 학습 완료
건물 4 학습 중...
  ✅ 건물 4: 2040개 샘플로 학습 완료
건물 5 학습 중...
  ✅ 건물 5: 2040개 샘플로 학습 완료
건물 6 학습 중...
  ✅ 건물 6: 2040개 샘플로 학습 완료
건물 7 학습 중...
  ✅ 건물 7: 2040개 샘플로 학습 완료
건물 8 학습 중...
  ✅ 건물 8: 2040개 샘플로 학습 완료
건물 9 학습 중...
  ✅ 건물 9: 2040개 샘플로 학습 완료
건물 10 학습 중...
  ✅ 건물 10: 2040개 샘플로 학습 완료
건물 11 학습 중...
  ✅ 건물 11: 2040개 샘플로 학습 완료
건물 12 학습 중...
  ✅ 건물 12: 2040개 샘플로 학습 완료
건물 13 학습 중...
  ✅ 건물 13: 2040개 샘플로 학습 완료
건물 14 학습 중...
  ✅ 건물 14: 2040개 샘플로 학습 완료
건물 15 학습 중...
  ✅ 건물 15: 2040개 샘플로 학습 완료
건물 16 학습 중...
  ✅ 건물 16: 2040개 샘플로 학습 완료
건물 17 학습 중...
  ✅ 건물 17: 2040개 샘플로 학습 완료
건물