#import

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import random
import pandas as pd
import numpy as np
import os, gc, sys, time, math, json, random, hashlib, warnings
import matplotlib.pyplot as plt
import seaborn as sns
from dataclasses import dataclass
from typing import List, Tuple, Dict, Any, Optional

from xgboost import XGBRegressor

import warnings
warnings.filterwarnings(action='ignore')


# 시각화 설정
plt.style.use('ggplot')
sns.set(font_scale=1.0)
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Simple installation
!pip install tabpfn

# Local development installation
!git clone https://github.com/PriorLabs/TabPFN.git
!pip install -e "TabPFN[dev]"

# install AutoTabPFN
!git clone https://github.com/priorlabs/tabpfn-extensions.git
!pip install -e tabpfn-extensions
!pip install hyperopt

#설치 후 세션 다시 시작

Collecting tabpfn
  Downloading tabpfn-2.1.3-py3-none-any.whl.metadata (27 kB)
Collecting eval-type-backport>=0.2.2 (from tabpfn)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Downloading tabpfn-2.1.3-py3-none-any.whl (160 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.8/160.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eval_type_backport-0.2.2-py3-none-any.whl (5.8 kB)
Installing collected packages: eval-type-backport, tabpfn
Successfully installed eval-type-backport-0.2.2 tabpfn-2.1.3
Cloning into 'TabPFN'...
remote: Enumerating objects: 4191, done.[K
remote: Counting objects: 100% (520/520), done.[K
remote: Compressing objects: 100% (225/225), done.[K
remote: Total 4191 (delta 465), reused 295 (delta 295), pack-reused 3671 (from 3)[K
Receiving objects: 100% (4191/4191), 267.59 MiB | 51.10 MiB/s, done.
Resolving deltas: 100% (2632/2632), done.
Obtaining file:///content/TabPFN
  Installing build dependencies ..

In [None]:
def seed_everything(seed):
    """Seed the global RNGs used by this notebook.

    Seeds Python's `random` module and NumPy's legacy global generator, and
    exports `PYTHONHASHSEED` into the process environment.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    # NOTE(review): exporting PYTHONHASHSEED here presumably only affects
    # subprocesses started later, not the already-running interpreter - confirm.
    os.environ.update(PYTHONHASHSEED=str(seed))
    np.random.seed(seed)

seed_everything(42) # Seed 고정

#data load, eda

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/KUBIG/25_summer_contest/train.csv')
test_df = pd.read_csv('/content/drive/MyDrive/KUBIG/25_summer_contest/test.csv')
building_df = pd.read_csv('/content/drive/MyDrive/KUBIG/25_summer_contest/building_info.csv')

In [None]:
train_df = train_df.replace("-", np.nan)
test_df = test_df.replace("-", np.nan)
building_df = building_df.replace("-", np.nan)


In [None]:
column_mapping = {
    "건물번호": "building_id",
    "일시": "datetime",
    "기온(°C)": "temperature",
    "강수량(mm)": "precipitation",
    "풍속(m/s)": "wind_speed",
    "습도(%)": "humidity",
    "일조(hr)": "sunshine_hour",
    "일사(MJ/m2)": "solar_radiation",
    "전력소비량(kWh)": "consumption",
    "month": "month",
    "day": "day",
    "time": "hour",
    "log_consumption": "log_consumption",
    "연면적(m2)": "total_area",
    "냉방면적(m2)": "cooling_area",
    "태양광용량(kW)": "solar_capacity",
    "ESS저장용량(kWh)": "ess_capacity",
    "PCS용량(kW)": "pcs_capacity",
    "건물유형" : "building_type"
}

train_df = train_df.rename(columns=column_mapping)
test_df = test_df.rename(columns=column_mapping)
building_df = building_df.rename(columns=column_mapping)



In [None]:
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)
print("Building shape:", building_df.shape)


Train shape: (204000, 10)
Test shape: (16800, 7)
Building shape: (100, 7)


In [None]:
print("\nTrain preview:")
print(train_df.head())



Train preview:
   num_date_time  building_id     datetime  temperature  precipitation  \
0  1_20240601 00            1  20240601 00         18.3            0.0   
1  1_20240601 01            1  20240601 01         18.3            0.0   
2  1_20240601 02            1  20240601 02         18.1            0.0   
3  1_20240601 03            1  20240601 03         18.0            0.0   
4  1_20240601 04            1  20240601 04         17.8            0.0   

   wind_speed  humidity  sunshine_hour  solar_radiation  consumption  
0         2.6      82.0            0.0              0.0      5794.80  
1         2.7      82.0            0.0              0.0      5591.85  
2         2.6      80.0            0.0              0.0      5338.17  
3         2.6      81.0            0.0              0.0      4554.42  
4         1.3      81.0            0.0              0.0      3602.25  


In [None]:
# Split the 'YYYYMMDD HH' timestamp string into month / day / hour-of-day columns
# so the time-series structure is available to the models.
_train_stamp = train_df['datetime']
train_df['month'] = _train_stamp.str[4:6].astype(int)
train_df['day'] = _train_stamp.str[6:8].astype(int)
train_df['time'] = _train_stamp.str[9:11].astype(int)

In [None]:
# Apply the same month / day / hour-of-day split to the test frame so train and
# test expose identical calendar columns.
_test_stamp = test_df['datetime']
test_df['month'] = _test_stamp.str[4:6].astype(int)
test_df['day'] = _test_stamp.str[6:8].astype(int)
test_df['time'] = _test_stamp.str[9:11].astype(int)

In [None]:
print("\nData types:")
print(train_df.dtypes)


Data types:
num_date_time       object
building_id          int64
datetime            object
temperature        float64
precipitation      float64
wind_speed         float64
humidity           float64
sunshine_hour      float64
solar_radiation    float64
consumption        float64
month                int64
day                  int64
time                 int64
dtype: object


In [None]:
print("\nDescriptive statistics:")
print(train_df.describe(include='all'))


Descriptive statistics:
          num_date_time    building_id     datetime    temperature  \
count            204000  204000.000000       204000  204000.000000   
unique           204000            NaN         2040            NaN   
top     100_20240823 08            NaN  20240824 23            NaN   
freq                  1            NaN          100            NaN   
mean                NaN      50.500000          NaN      26.098130   
std                 NaN      28.866141          NaN       4.052888   
min                 NaN       1.000000          NaN       8.400000   
25%                 NaN      25.750000          NaN      23.500000   
50%                 NaN      50.500000          NaN      26.300000   
75%                 NaN      75.250000          NaN      28.800000   
max                 NaN     100.000000          NaN      38.700000   

        precipitation     wind_speed       humidity  sunshine_hour  \
count   204000.000000  204000.000000  204000.000000  204000.0000

In [None]:
print("고유 건물 유형 목록:")
print(building_df['building_type'].unique())


고유 건물 유형 목록:
['호텔' '상용' '병원' '학교' '건물기타' '아파트' '연구소' '백화점' 'IDC(전화국)' '공공']


In [None]:
# 건물유형별 그룹화 후 통계 요약
grouped_summary = building_df.groupby('building_type').describe()
display(grouped_summary)


Unnamed: 0_level_0,building_id,building_id,building_id,building_id,building_id,building_id,building_id,building_id,total_area,total_area,total_area,total_area,total_area,cooling_area,cooling_area,cooling_area,cooling_area,cooling_area,cooling_area,cooling_area,cooling_area
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
building_type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
IDC(전화국),9.0,51.666667,17.117243,30.0,36.0,52.0,64.0,81.0,9.0,42075.58,...,44676.67,83432.35,9.0,19153.718889,9944.763916,337.84,13487.0,19232.0,26701.0,34650.63
건물기타,10.0,62.1,29.152663,7.0,49.75,65.0,81.0,97.0,10.0,434137.0689,...,202836.255,3260213.41,10.0,262467.4625,599117.77543,3975.52,25193.0,60885.5,150446.88875,1956128.0
공공,8.0,62.375,20.479519,33.0,47.0,67.0,74.0,92.0,8.0,133799.3025,...,139881.0,373141.0,8.0,54304.15875,33158.002212,5628.0,32217.0525,54222.0,78534.75,100746.2
백화점,16.0,51.8125,24.884316,18.0,31.25,49.5,73.25,95.0,16.0,133947.159375,...,152151.5,338093.0,16.0,63127.165,43775.877407,12066.86,33138.5,53329.0,85812.4525,167868.56
병원,9.0,42.111111,27.451978,3.0,21.0,42.0,48.0,90.0,9.0,157621.507778,...,129583.77,560431.0,9.0,117535.68,116864.297249,45061.76,68513.41,72627.98,96896.0,418992.0
상용,10.0,45.3,34.263846,2.0,17.0,46.0,71.0,99.0,10.0,81677.582,...,119797.3325,329867.95,10.0,38479.131,51003.120838,1089.28,8311.6475,21118.675,32077.6725,157835.0
아파트,9.0,60.222222,30.388229,11.0,31.0,70.0,85.0,93.0,9.0,304948.496667,...,340630.0,492485.514,9.0,205633.047667,109890.1403,22210.39,131500.0,233263.0,247380.0,415124.587
연구소,9.0,47.666667,28.770645,13.0,23.0,49.0,62.0,94.0,9.0,74157.081111,...,86546.94,124037.0,9.0,48866.634444,19657.424221,16014.23,35742.0,44576.27,62015.99,77267.0
학교,10.0,33.3,27.305067,5.0,12.5,23.0,52.75,87.0,10.0,351290.4813,...,479186.3475,596251.0,10.0,190519.879,86878.170779,82112.0,123331.5725,177814.75,237914.75,339131.0
호텔,10.0,50.0,42.807061,1.0,9.25,52.5,87.75,100.0,10.0,148146.785,...,155741.93,435993.5,10.0,103288.628,107534.343707,5619.7,23631.1775,68037.655,141963.25,341983.0


In [None]:
# 건물유형별 건물번호 목록 출력
grouped = building_df.groupby('building_type')['building_id'].apply(list)

# 보기 좋게 출력
for building_type, building_ids in grouped.items():
    print(f"[{building_type}]")
    print(sorted(building_ids))
    print()


[IDC(전화국)]
[30, 35, 36, 43, 52, 57, 64, 67, 81]

[건물기타]
[7, 26, 47, 58, 61, 69, 78, 82, 96, 97]

[공공]
[33, 38, 50, 66, 68, 72, 80, 92]

[백화점]
[18, 19, 27, 29, 32, 34, 40, 45, 54, 59, 63, 73, 74, 79, 88, 95]

[병원]
[3, 17, 21, 39, 42, 44, 48, 75, 90]

[상용]
[2, 6, 16, 20, 41, 51, 56, 76, 86, 99]

[아파트]
[11, 25, 31, 65, 70, 71, 85, 91, 93]

[연구소]
[13, 15, 23, 37, 49, 53, 62, 83, 94]

[학교]
[5, 8, 12, 14, 22, 24, 46, 55, 60, 87]

[호텔]
[1, 4, 9, 10, 28, 77, 84, 89, 98, 100]



In [None]:
# 건물유형 정보 병합
train_df_merged = pd.merge(train_df, building_df[['building_id', 'building_type']], on='building_id', how='left')
test_df_merged = pd.merge(test_df, building_df[['building_id', 'building_type']], on='building_id', how='left')


In [None]:
# train 데이터
train_counts = train_df_merged['building_type'].value_counts().sort_index()
print("Train 데이터 - 건물유형별 관측값 개수:")
print(train_counts)

# test 데이터
test_counts = test_df_merged['building_type'].value_counts().sort_index()
print("\nTest 데이터 - 건물유형별 관측값 개수:")
print(test_counts)


Train 데이터 - 건물유형별 관측값 개수:
building_type
IDC(전화국)    18360
건물기타        20400
공공          16320
백화점         32640
병원          18360
상용          20400
아파트         18360
연구소         18360
학교          20400
호텔          20400
Name: count, dtype: int64

Test 데이터 - 건물유형별 관측값 개수:
building_type
IDC(전화국)    1512
건물기타        1680
공공          1344
백화점         2688
병원          1512
상용          1680
아파트         1512
연구소         1512
학교          1680
호텔          1680
Name: count, dtype: int64


In [None]:
# 건물유형별 평균 전력소비량 계산
type_mean = (
    train_df_merged
    .groupby('building_type')['consumption']
    .mean()
    .sort_values(ascending=False)
    .round(2)
)

# DataFrame으로 보기 좋게 출력
type_mean_df = type_mean.reset_index()
type_mean_df.columns = ['building_type', 'consumption']
display(type_mean_df)


Unnamed: 0,building_type,consumption
0,IDC(전화국),10316.94
1,병원,4454.06
2,학교,3462.68
3,호텔,3175.02
4,백화점,2729.74
5,상용,2513.7
6,건물기타,2285.96
7,연구소,2111.67
8,공공,1625.91
9,아파트,1106.31


#모델링

In [None]:
# --- Settings ---
RANDOM_SEED = 2025
N_SPLITS = 10          # number of walk-forward folds
# Embargo window in hours, described as a gap right before validation.
# NOTE(review): walk_forward_building below validates on the slice
# [split - EMBARGO, split), so this value is effectively the validation length there.
EMBARGO  = 24
USE_LOG1P = True      # train on log1p(target) and invert predictions with expm1
DO_BLEND_SEASONAL = True
BLEND_ALPHA = 0.2     # pred = (1-α)*xgb + α*seasonal(t-168)
# Feature names to drop when they are absent from the test frame.
# NOTE(review): this name is reassigned to an empty set in a later cell, so the
# value given here is not the one seen by align_train_test_columns.
AUTO_DROP_IF_MISSING = {'sunshine_hour', 'solar_radiation', '일조', '일사'}

def set_seed(seed=RANDOM_SEED):
    """Seed Python's `random` module and NumPy's legacy global RNG with `seed`."""
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


set_seed()

def log(msg):
    """Print `msg` prefixed with the current local time formatted as [HH:MM:SS]."""
    stamp = time.strftime('%H:%M:%S')
    print(f"[{stamp}] {msg}")

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error on a 0-100 scale.

    Computes 100 * mean(|pred - true| / (|true| + |pred|)). The denominator is
    the plain sum (not the average) of the magnitudes, so the value is half of
    the 0-200 SMAPE variant.

    Args:
        y_true: array-like of actual values.
        y_pred: array-like of predictions, same shape as `y_true`.

    Returns:
        The score as a NumPy float. Positions where both values are zero
        contribute 0 because the denominator is replaced by 1e-12.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    scale = np.abs(actual) + np.abs(forecast)
    # Avoid 0/0 where both actual and forecast are exactly zero.
    scale = np.where(scale == 0, 1e-12, scale)
    abs_error = np.abs(forecast - actual)
    return 100.0 * np.mean(abs_error / scale)

# --- GPU parameters ---
# Keyword arguments passed to every XGBRegressor built in this notebook.
# NOTE(review): `tree_method="gpu_hist"` and `predictor="gpu_predictor"` are the
# pre-2.0 XGBoost spelling; newer releases presumably expect
# `tree_method="hist", device="cuda"` and ignore `predictor` - confirm against the
# installed version.
# NOTE(review): n_estimators=10000 is used without early stopping in the fit calls
# visible in this notebook, so every fit grows the full 10000 trees.
XGB_PARAMS = dict(
    n_estimators=10000,
    learning_rate=0.05,
    max_depth=8,
    min_child_weight=3,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_alpha=0.0,
    reg_lambda=0.5,
    random_state=RANDOM_SEED,
    n_jobs=-1,
    tree_method="gpu_hist",      # GPU
    predictor="gpu_predictor",
    max_bin=256,
)

In [None]:
import numpy as np
import pandas as pd
from typing import List, Tuple

# =================================================================
# --- 필수 컬럼 점검/정규화 ---
# =================================================================
# datetime
# Raw timestamps look like '20240601 00' (YYYYMMDD HH). Parse them with an explicit
# format rather than relying on pandas' format inference; fall back to inference
# only when the explicit format does not match.
DATETIME_FORMAT = '%Y%m%d %H'


def _parse_datetime_column(col: pd.Series) -> pd.Series:
    """Return `col` as datetime64, leaving already-parsed columns untouched.

    Re-running this cell after the column has been converted is therefore a no-op.
    """
    if pd.api.types.is_datetime64_any_dtype(col):
        return col
    try:
        return pd.to_datetime(col, format=DATETIME_FORMAT)
    except (ValueError, TypeError):
        # Unexpected layout: let pandas infer the format as the original code did.
        return pd.to_datetime(col)


if 'datetime' not in train_df.columns:
    if 'date_time' in train_df.columns:
        train_df = train_df.rename(columns={'date_time': 'datetime'})
    else:
        raise KeyError("train_df에 datetime 컬럼이 없습니다.")
if 'datetime' not in test_df.columns:
    if 'date_time' in test_df.columns:
        test_df = test_df.rename(columns={'date_time': 'datetime'})
    else:
        raise KeyError("test_df에 datetime 컬럼이 없습니다.")

train_df['datetime'] = _parse_datetime_column(train_df['datetime'])
test_df['datetime']  = _parse_datetime_column(test_df['datetime'])

# building_id
if 'building_id' not in train_df.columns:
    raise KeyError("train_df에 building_id가 없습니다.")
if 'building_id' not in test_df.columns:
    # The id is the leading digits of num_date_time (e.g. '1_20240601 00' -> 1).
    if 'num_date_time' in test_df.columns:
        # expand=False yields a Series (not a one-column DataFrame) for the assignment.
        test_df['building_id'] = (
            test_df['num_date_time'].astype(str).str.extract(r'^(\d+)', expand=False).astype(int)
        )
    else:
        raise KeyError("test_df에 building_id가 없습니다.")

# target
if 'target' not in train_df.columns:
    # Map the first known source column name onto 'target'.
    for cand in ['consumption', 'load', 'target_kwh']:
        if cand in train_df.columns:
            train_df = train_df.rename(columns={cand: 'target'})
            break
if 'target' not in train_df.columns:
    raise KeyError("train_df에 target(전력소비량) 컬럼이 없습니다.")

# Work on copies so later column assignments do not touch the frames loaded above.
train_df = train_df.copy()
test_df  = test_df.copy()

# Coerce the target to numeric; unparseable values become NaN.
train_df['target'] = pd.to_numeric(train_df['target'], errors='coerce')

# Keep a 'consumption' column for downstream code that still refers to it.
if 'consumption' not in train_df.columns:
    train_df['consumption'] = train_df['target']

# log1p of consumption. When 'consumption' was created just above it is a copy of
# the numeric target.
# NOTE(review): 'consumption' and 'log_consumption' are target-derived; they must
# never reach the model as features. In the pipeline below they are only removed
# because the test frame lacks them - confirm this stays true.
train_df['log_consumption'] = np.log1p(train_df['consumption'])

# =================================================================
# --- 피처 엔지니어링 함수들 ---
# =================================================================

def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df` with calendar and cyclical features derived from 'datetime'.

    Only the passed frame is read; no outer train/test frame is referenced.

    Added or overwritten columns, in order: hour, dayofweek, week (ISO week),
    month, is_weekend, sin_hour, cos_hour, sin_dow, cos_dow, summer_cos,
    weekday, weekend. `weekday`/`weekend` carry the same values as
    `dayofweek`/`is_weekend`.

    Args:
        df: frame with a datetime64 'datetime' column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    stamps = df['datetime']
    out = df.copy()

    two_pi = 2*np.pi
    hour = stamps.dt.hour
    dow = stamps.dt.dayofweek
    month = stamps.dt.month

    derived = {
        'hour': hour,
        'dayofweek': dow,
        'week': stamps.dt.isocalendar().week.astype(int),
        'month': month,
        'is_weekend': dow.ge(5).astype(int),
        # Cyclical encodings: 24-hour day and 7-day week.
        'sin_hour': np.sin(two_pi*hour/24.0),
        'cos_hour': np.cos(two_pi*hour/24.0),
        'sin_dow': np.sin(two_pi*dow/7.0),
        'cos_dow': np.cos(two_pi*dow/7.0),
        # Equals 1 in June and follows a 6-month-period cosine of (month - 6).
        'summer_cos': np.cos((month-6) * np.pi/3),
    }
    for name, values in derived.items():
        out[name] = values

    # Weekday / weekend flags computed from the frame's own datetime column.
    out['weekday'] = out['datetime'].dt.weekday
    out['weekend'] = out['weekday'].ge(5).astype(int)

    return out


def add_weather_features(df: pd.DataFrame, group_keys: Tuple[str, ...] = ('building_id',), holidays=None) -> pd.DataFrame:
    """Return a copy of `df` with weather-derived features, per-group diffs and a holiday flag.

    Columns added (each only when its inputs exist):
      - discomfort_index: temperature-humidity index from `temperature` and `humidity`.
      - apparent_temp: wind-chill style formula from `temperature` and `wind_speed`.
      - temp_diff / hum_diff / wind_diff: first difference of each weather column,
        computed within each `group_keys` group so values never cross groups.
      - is_holiday: 1 when the row's calendar day is a holiday, else 0.

    Args:
        df: frame with a datetime64 'datetime' column and optional weather columns.
        group_keys: columns identifying independent series. When all are present
            the frame is sorted by `group_keys + ['datetime']`; otherwise the
            original row order is kept and diffs run over the whole frame.
        holidays: optional iterable of dates (anything `pd.to_datetime` accepts)
            to flag as holidays. When None, the previous behaviour is kept and
            pandas' `USFederalHolidayCalendar` is used.
            NOTE(review): the source columns are Korean building data, so callers
            probably want to pass Korean public holidays here.

    Returns:
        A new DataFrame; the input is not modified.
    """
    keys = list(group_keys)
    has_keys = set(keys).issubset(df.columns)
    grouped = has_keys and len(keys) > 0
    df = df.sort_values(keys + ['datetime']).copy() if has_keys else df.copy()

    has_T  = 'temperature' in df.columns
    has_RH = 'humidity' in df.columns
    has_W  = 'wind_speed' in df.columns

    # 1) Temperature / humidity based indices
    if has_T and has_RH:
        # Discomfort index (THI)
        df['discomfort_index'] = (
            0.81*df['temperature']
            + 0.01*df['humidity'] * (0.99*df['temperature'] - 14.3)
            + 46.3
        )
    if has_T and has_W:
        # Apparent temperature via a wind-chill style formula; a heat index is the
        # better choice in the high-temperature range.
        # NOTE(review): wind_speed originates from a '풍속(m/s)' column; this family
        # of formulas is usually stated for km/h - confirm the intended units.
        wind_term = df['wind_speed'].clip(lower=0.1)**0.16
        df['apparent_temp'] = (
            13.12 + 0.6215*df['temperature']
            - 11.37*wind_term
            + 0.3965*df['temperature']*wind_term
        )

    # 2) Weather first differences.
    # Group on the frame, not on the Series: Series.groupby(['building_id']) cannot
    # see the key column and raises KeyError.
    def _group_diff(col: str) -> pd.Series:
        if grouped:
            return df.groupby(keys, sort=False)[col].diff()
        return df[col].diff()

    if has_T:
        df['temp_diff'] = _group_diff('temperature')
    if has_RH:
        df['hum_diff'] = _group_diff('humidity')
    if has_W:
        df['wind_diff'] = _group_diff('wind_speed')

    # 3) Holiday flag
    day = df['datetime'].dt.normalize()
    if holidays is not None:
        holiday_days = pd.DatetimeIndex(pd.to_datetime(list(holidays))).normalize()
        df['is_holiday'] = day.isin(holiday_days).astype(int)
    else:
        try:
            from pandas.tseries.holiday import USFederalHolidayCalendar
            cal = USFederalHolidayCalendar()
            us_days = cal.holidays(start=df['datetime'].min(), end=df['datetime'].max())
            df['is_holiday'] = day.isin(us_days).astype(int)
        except Exception:
            # Deliberate best effort: fall back to "no holidays" when the calendar
            # cannot be built (library or environment problem).
            df['is_holiday'] = 0

    return df


def add_summer_features(df: pd.DataFrame, group_keys: Tuple[str, ...] = ('building_id',)) -> pd.DataFrame:
    """Return a copy of `df` enriched with summer (Jun-Aug) oriented features.

    Feature groups:
      0) calendar position: date, doy, summer_day_idx, business/rush-hour flags,
         vacation_peak (Jul 20 - Aug 20);
      1) heat / humidity stress: CDD24, CDD26, temp_above26, dew_point, wet_bulb,
         humidex, heat_index, apparent_temp;
      2) interactions of temperature / humidity with the calendar flags;
      3) per-group weather lags and rolling means;
      4) per-group target lags and rolling statistics (only when 'target' exists);
      5) hot_day / hot_runlen: daily max temperature >= 33 and the length of the
         current run of consecutive hot days.

    All lag, rolling and diff features are computed inside each `group_keys`
    group. Every target-based feature uses values strictly before the current
    row, so none of them contains the value being predicted.

    Args:
        df: frame with a 'datetime' column (parsed with `pd.to_datetime` when it
            is not datetime64 yet). 'hour', 'month' and 'dayofweek' are used when
            present.
        group_keys: columns identifying independent series. When all are present
            the frame is sorted by `group_keys + ['datetime']`.

    Returns:
        A new DataFrame; the input frame is not modified. When a temperature
        column exists the result has a fresh RangeIndex (it comes from a merge).

    Raises:
        AssertionError: if `df` has no 'datetime' column.
    """
    if 'datetime' not in df.columns:
        raise AssertionError("datetime 컬럼이 필요합니다.")

    keys = list(group_keys)
    has_keys = set(keys).issubset(df.columns)
    grouped = has_keys and len(keys) > 0

    # Copy first so the datetime conversion never leaks into the caller's frame.
    df = df.copy()
    if not pd.api.types.is_datetime64_any_dtype(df['datetime']):
        df['datetime'] = pd.to_datetime(df['datetime'])
    if has_keys:
        df = df.sort_values(keys + ['datetime'])

    def _per_group(col, func):
        """Apply `func` to column `col` within each group, aligned to df's index."""
        if grouped:
            # transform keeps the original index, so the result can be assigned
            # straight back to the frame.
            return df.groupby(keys, sort=False)[col].transform(func)
        return func(df[col])

    # 0) Position inside the summer / calendar flags
    df['date'] = df['datetime'].dt.date
    df['doy'] = df['datetime'].dt.dayofyear
    # summer_day_idx: days since June 1 of the earliest year in the frame (1-based)
    start_summer = pd.Timestamp(int(df['datetime'].dt.year.min()), 6, 1)
    df['summer_day_idx'] = (df['datetime'].dt.normalize() - start_summer).dt.days.clip(lower=0) + 1

    # Business and commuting hours (0 when no 'hour' column is available)
    df['business_hours'] = df['hour'].between(9, 18).astype(int) if 'hour' in df.columns else 0
    df['rush_morning'] = df['hour'].between(7, 9).astype(int) if 'hour' in df.columns else 0
    df['rush_evening'] = df['hour'].between(18, 20).astype(int) if 'hour' in df.columns else 0

    # Vacation / peak-season flag (Jul 20 - Aug 20)
    if 'month' in df.columns:
        m = df['month']
    else:
        m = df['datetime'].dt.month
    d = df['datetime'].dt.day
    df['vacation_peak'] = (((m == 7) & (d >= 20)) | ((m == 8) & (d <= 20))).astype(int)

    has_T  = 'temperature' in df.columns
    has_RH = 'humidity' in df.columns
    has_W  = 'wind_speed' in df.columns

    # 1) Heat / humidity stress indicators
    if has_T:
        df['CDD24'] = np.maximum(df['temperature'] - 24.0, 0.0)
        df['CDD26'] = np.maximum(df['temperature'] - 26.0, 0.0)
        df['temp_above26'] = (df['temperature'] - 26.0).clip(lower=0.0)

    if has_T and has_RH:
        # Dew point (Magnus-type approximation with a=17.27, b=237.7)
        a, b = 17.27, 237.7
        RHc = df['humidity'].clip(1, 100) / 100.0
        alpha = ((a * df['temperature']) / (b + df['temperature'])) + np.log(RHc)
        df['dew_point'] = (b * alpha) / (a - alpha)

        # Wet-bulb temperature (Stull 2011 approximation)
        RH = df['humidity'].clip(1, 100)
        T  = df['temperature']
        df['wet_bulb'] = (
            T*np.arctan(0.151977*np.sqrt(RH+8.313659))
            + np.arctan(T+RH)
            - np.arctan(RH-1.676331)
            + 0.00391838*(RH**1.5)*np.arctan(0.023101*RH)
            - 4.686035
        )

        # Humidex
        dewK = (df['dew_point'] + 273.15).clip(lower=200, upper=350)
        expo = 5417.7530 * (1/273.16 - 1/dewK)
        df['humidex'] = T + 0.5555*((6.11*np.exp(expo)) - 10.0)

        # Heat index (simplified polynomial)
        R = df['humidity'].clip(1, 100)
        df['heat_index'] = (
            -8.784695 + 1.61139411*T + 2.338549*R
            - 0.14611605*T*R - 0.012308094*(T**2)
            - 0.016424828*(R**2) + 0.002211732*(T**2)*R
            + 0.00072546*T*(R**2) - 0.000003582*(T**2)*(R**2)
        )

    if has_T and has_W and 'apparent_temp' not in df.columns:
        wind_term = df['wind_speed'].clip(lower=0.1)**0.16
        df['apparent_temp'] = (
            13.12 + 0.6215*df['temperature']
            - 11.37*wind_term
            + 0.3965*df['temperature']*wind_term
        )

    # 2) Interaction terms
    if has_T:
        df['temp_x_business']   = df['temperature'] * df['business_hours']
        df['temp_x_weekend']    = df['temperature'] * (df['dayofweek'] >= 5).astype(int) if 'dayofweek' in df.columns else 0
        df['temp_x_peakvac']    = df['temperature'] * df['vacation_peak']
        df['CDD26_x_business']  = df['CDD26'] * df['business_hours']

    if has_RH:
        df['rh_x_business'] = df['humidity'] * df['business_hours']

    # 3) Weather lags / rolling means (per group)
    for col in ['temperature', 'humidity', 'wind_speed']:
        if col in df.columns:
            df[f'{col}_lag1']     = _per_group(col, lambda s: s.shift(1))
            df[f'{col}_lag24']    = _per_group(col, lambda s: s.shift(24))
            df[f'{col}_ma3']      = _per_group(col, lambda s: s.rolling(3,  min_periods=1).mean())
            df[f'{col}_ma24']     = _per_group(col, lambda s: s.rolling(24, min_periods=1).mean())
            df[f'{col}_dev_ma24'] = df[col] - df[f'{col}_ma24']

    # 4) Target lags / rolling statistics (training time only).
    # Everything here is built from shift(1) or older values so the current
    # target never appears inside its own features.
    if 'target' in df.columns:
        df['y_lag1']    = _per_group('target', lambda s: s.shift(1))
        df['y_lag24']   = _per_group('target', lambda s: s.shift(24))
        df['y_lag168']  = _per_group('target', lambda s: s.shift(168))
        df['y_ma3']     = _per_group('target', lambda s: s.shift(1).rolling(3,  min_periods=1).mean())
        df['y_ma24']    = _per_group('target', lambda s: s.shift(1).rolling(24, min_periods=1).mean())
        df['y_ema24']   = _per_group('target', lambda s: s.shift(1).ewm(span=24, adjust=False, min_periods=1).mean())
        # Most recent observed 1-step and 24-step changes: y[t-1]-y[t-2], y[t-1]-y[t-25]
        df['y_diff1']   = _per_group('target', lambda s: s.shift(1).diff(1))
        df['y_diff24']  = _per_group('target', lambda s: s.shift(1).diff(24))
        # Last observed value relative to the trailing 24-step mean
        df['y_vs_ma24'] = df['y_lag1'] - df['y_ma24']

    # 5) Heat-wave run length (aggregated per day)
    if has_T:
        by = keys if grouped else []
        daily = df.groupby(by + ['date'], as_index=False)['temperature'].max()
        daily['hot_day'] = (daily['temperature'] >= 33).astype(int)

        def _runlen(x):
            """Length of the current run of consecutive 1s at each position."""
            r = []
            cnt = 0
            for v in x:
                cnt = cnt + 1 if v == 1 else 0
                r.append(cnt)
            return pd.Series(r, index=x.index)

        if by:
            # transform returns a Series aligned with `daily`, whatever the pandas version.
            daily['hot_runlen'] = daily.groupby(by)['hot_day'].transform(_runlen)
        else:
            daily['hot_runlen'] = _runlen(daily['hot_day'])

        df = df.merge(daily[by + ['date', 'hot_day', 'hot_runlen']], on=by + ['date'], how='left')

    return df


def add_lag_roll(df: pd.DataFrame, group_keys: List[str], lag_hours=(1, 24, 168), roll_hours=(24, 168)) -> pd.DataFrame:
    """Return a sorted copy of `df` with per-group target lags and trailing rolling means.

    For every `lh` in `lag_hours` a column `lag_{lh}` holds the target `lh` rows
    earlier within the same group. For every `rh` in `roll_hours` a column
    `roll{rh}_mean` holds the mean of the previous `rh` targets of the same
    group (the current row is excluded via shift(1)); it is NaN until at least
    `int(rh * 0.5)` earlier values are available.

    Both the shift and the rolling window are evaluated inside each group, so
    values never flow from one group into the next, and results stay aligned
    with the sorted frame's index.

    Args:
        df: frame with 'datetime', 'target' and the `group_keys` columns.
        group_keys: columns identifying independent series (list or tuple).
        lag_hours: row offsets for the lag columns.
        roll_hours: window lengths for the rolling-mean columns.

    Returns:
        A new DataFrame sorted by `group_keys + ['datetime']`, keeping the
        original index labels; the input is not modified.
    """
    keys = list(group_keys)
    df = df.sort_values(keys + ['datetime']).copy()

    for lh in lag_hours:
        df[f'lag_{lh}'] = df.groupby(keys, sort=False)['target'].shift(lh)

    for rh in roll_hours:
        min_obs = int(rh * 0.5)
        # transform keeps the frame's index and restarts the window in every group.
        df[f'roll{rh}_mean'] = df.groupby(keys, sort=False)['target'].transform(
            lambda s: s.shift(1).rolling(rh, min_periods=min_obs).mean()
        )
    return df


# Column names that may be left out when aligning train/test features (for example
# lags that only exist at training time); fill in as needed.
AUTO_DROP_IF_MISSING = set()

def align_train_test_columns(X_tr: pd.DataFrame, X_te: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Restrict both frames to the columns they share, in `X_tr`'s column order.

    Columns named in the module-level `AUTO_DROP_IF_MISSING` set are excluded
    even when both frames contain them.

    Returns:
        `(train, test)` copies holding the same columns in the same order.
    """
    shared = []
    for col in X_tr.columns:
        if col not in X_te.columns:
            continue
        if col in AUTO_DROP_IF_MISSING:
            continue
        shared.append(col)
    train_aligned = X_tr[shared].copy()
    test_aligned = X_te[shared].copy()
    return train_aligned, test_aligned


def seasonal_naive(series: pd.Series, horizon: int, period: int = 168) -> np.ndarray:
    """Seasonal-naive forecast: repeat the last observed season.

    Args:
        series: historical values in time order.
        horizon: number of future steps to forecast.
        period: season length in steps (168 = one week of hourly data).

    Returns:
        An array of exactly `horizon` values. With fewer than `period`
        observations the last value is repeated. Otherwise the final `period`
        values are used and cycled as often as needed, so horizons longer than
        one season are filled completely.

    Raises:
        IndexError: if `series` is empty and shorter than `period`.
    """
    if len(series) < period:
        return np.repeat(series.iloc[-1], horizon)
    base = series.iloc[-period:].values
    if horizon <= len(base):
        return base[:horizon]
    # Horizon spans more than one season: tile the season and trim to length.
    reps = -(-horizon // len(base))  # ceiling division
    return np.tile(base, reps)[:horizon]

##k fold

In [None]:
# --- Single-building training / prediction ---
@dataclass
class FoldResult:
    """Per-fold result record.

    NOTE(review): not referenced by the code visible in this part of the
    notebook; the field meanings below are inferred from their names - confirm.
    """
    smape: float   # presumably the SMAPE score of the fold
    n_train: int   # presumably the number of training rows
    n_valid: int   # presumably the number of validation rows

def walk_forward_building(df_b: pd.DataFrame, df_b_te: pd.DataFrame) -> np.ndarray:
    """Train XGBoost for one building, log walk-forward fold scores, and predict its test rows.

    Both frames are sorted by 'datetime' first. Features are every numeric
    column of `df_b` except target/datetime/num_date_time/building_id; test
    columns missing from that list are filled with 0. When USE_LOG1P is set the
    model is fit on log1p(target) and predictions are mapped back with expm1.

    Validation folds only produce log output: `oof_pred` is filled but never
    read or returned. The final model is fit on all rows of `df_b`.

    NOTE(review): each fold trains on rows [0, split - EMBARGO) and validates on
    [split - EMBARGO, split), i.e. the EMBARGO-sized slice directly after the
    training rows. No gap is left between train and validation - confirm this
    is the intended use of EMBARGO.

    Args:
        df_b: training rows of a single building (needs 'datetime', 'target',
            'building_id').
        df_b_te: test rows of the same building.

    Returns:
        Predictions for `df_b_te` in datetime-sorted order, clipped at 0 and,
        when DO_BLEND_SEASONAL is set, blended with a seasonal-naive forecast.
    """
    df_b = df_b.sort_values('datetime').reset_index(drop=True)
    df_b_te = df_b_te.sort_values('datetime').reset_index(drop=True)

    # Feature selection: drop the target, keys and identifiers; keep numeric columns only
    exclude = {'target', 'datetime', 'num_date_time', 'building_id'}
    feat_cols = [c for c in df_b.columns if c not in exclude and pd.api.types.is_numeric_dtype(df_b[c])]

    if len(feat_cols) == 0:
        # No usable feature at all: fall back to the seasonal-naive forecast
        log(f"[B{int(df_b['building_id'].iloc[0])}] 사용 가능한 피처가 없어 seasonal naive 사용")
        return seasonal_naive(df_b['target'], horizon=len(df_b_te), period=168)

    # Train / test matrices (test columns absent from feat_cols are created as 0)
    X = df_b[feat_cols].to_numpy()
    X_te = df_b_te.reindex(columns=feat_cols, fill_value=0).to_numpy()

    # Target (optionally log1p-transformed for training)
    y = df_b['target'].astype(float).values
    y_trn = np.log1p(y) if USE_LOG1P else y

    # Guard against NaN / inf in the feature matrices
    X = np.nan_to_num(X, copy=False, posinf=None, neginf=None)
    X_te = np.nan_to_num(X_te, copy=False, posinf=None, neginf=None)

    n = len(df_b)
    fold_sizes = np.linspace(0.6, 0.95, N_SPLITS)  # fraction of leading rows ending each fold
    oof_pred = np.zeros(n, dtype=float)

    for i, frac in enumerate(fold_sizes, start=1):
        split = int(n * float(frac))
        tr_end = max(0, split - EMBARGO)

        # Safety checks: require a minimum number of training / validation rows
        if tr_end < 16 or tr_end >= n-1:
            continue
        va_start, va_end = tr_end, split
        if va_end - va_start < 8:
            continue

        # Train on the (possibly log-transformed) target, score on the raw target
        X_tr, y_tr = X[:tr_end], y_trn[:tr_end]
        X_va, y_va = X[va_start:va_end], y[va_start:va_end]

        model = XGBRegressor(**XGB_PARAMS)
        model.fit(X_tr, y_tr)

        va_pred = model.predict(X_va)
        va_pred = np.expm1(va_pred) if USE_LOG1P else va_pred
        va_pred = np.clip(va_pred, 0, None)
        oof_pred[va_start:va_end] = va_pred

        s = smape(y_va, va_pred)
        log(f"[B{int(df_b['building_id'].iloc[0])}] Fold {i}/{N_SPLITS} SMAPE={s:.3f} "
            f"(train {len(y_tr)}, valid {len(y_va)})")
        del model; gc.collect()

    # Final fit on every training row
    final_model = XGBRegressor(**XGB_PARAMS)
    final_model.fit(X, y_trn)
    pred_te = final_model.predict(X_te)
    pred_te = np.expm1(pred_te) if USE_LOG1P else pred_te
    pred_te = np.clip(pred_te, 0, None)

    # Blend with the seasonal-naive forecast: (1 - alpha) * model + alpha * seasonal
    if DO_BLEND_SEASONAL:
        horizon = len(df_b_te)
        s_pred = seasonal_naive(df_b['target'], horizon=horizon, period=168)
        pred_te = (1.0 - BLEND_ALPHA) * pred_te + BLEND_ALPHA * s_pred

    return pred_te


# --- 메인 파이프라인 (중복 building_id 제거/안전 groupby) ---
def build_features_and_predict(train_df: pd.DataFrame, test_df: pd.DataFrame) -> pd.DataFrame:
    """Build features, fit one model per building, and return a submission frame.

    Steps:
      1. copy both frames and coerce `building_id` to nullable integers;
      2. add calendar features to both, and target lag/rolling features to train;
      3. replace inf/NaN with 0 and drop the first 200 rows of every building
         (the lag warm-up period);
      4. keep only the feature columns that train and test share;
      5. per building, call `walk_forward_building`; on any exception fall back
         to `seasonal_naive`;
      6. left-join the predictions onto `test_df['num_date_time']`, fill gaps
         with 0 and clip at 0.

    NOTE(review): columns that exist only in train (the lag_* / roll*_mean
    columns from add_lag_roll, and any target-derived columns) are removed in
    step 4 because the test frame does not have them - confirm that is intended.

    Args:
        train_df: training rows with 'building_id', 'datetime', 'target'.
        test_df: test rows with 'building_id', 'datetime', 'num_date_time'.

    Returns:
        DataFrame with columns ['num_date_time', 'answer'] in `test_df` row order.
    """
    head_cut = 200  # leading rows dropped per building (lag warm-up)

    tr = train_df.copy()
    te = test_df.copy()

    # Normalise the key dtype so train/test ids compare equal
    if 'building_id' in tr.columns:
        tr['building_id'] = pd.to_numeric(tr['building_id'], errors='coerce').astype('Int64')
    if 'building_id' in te.columns:
        te['building_id'] = pd.to_numeric(te['building_id'], errors='coerce').astype('Int64')

    # Calendar features (the helper returns a new frame, so re-assign)
    tr = add_time_features(tr)
    te = add_time_features(te)

    # Lag / rolling features are built per building
    tr = add_lag_roll(tr, group_keys=['building_id'], lag_hours=(1, 24, 168), roll_hours=(24, 168))

    # Replace inf with NaN, then fill every missing value with 0
    tr = tr.replace([np.inf, -np.inf], np.nan).fillna(0)
    te = te.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Drop the first `head_cut` rows of each building. Buildings with at most
    # `head_cut` rows disappear entirely. A cumcount mask keeps the grouping
    # column regardless of how groupby.apply treats it in the installed pandas.
    tr = tr.sort_values(['building_id', 'datetime'])
    tr = tr[tr.groupby('building_id').cumcount() >= head_cut].reset_index(drop=True)

    # Columns that must never be used as model inputs
    drop_cols = ['building_id', 'target', 'datetime', 'num_date_time']

    tr_feat = tr.drop(columns=[c for c in drop_cols if c in tr.columns], errors='ignore')
    te_feat = te.drop(columns=[c for c in drop_cols if c in te.columns], errors='ignore')

    # Keep only the features present in both frames, in the same order
    tr_feat, te_feat = align_train_test_columns(tr_feat, te_feat)

    # Re-attach keys (building_id stays on the frame but is excluded from the model inputs)
    tr = pd.concat(
        [tr[['building_id', 'datetime', 'target']].reset_index(drop=True),
         tr_feat.reset_index(drop=True)],
        axis=1
    )
    te = pd.concat(
        [te[['building_id', 'datetime', 'num_date_time']].reset_index(drop=True),
         te_feat.reset_index(drop=True)],
        axis=1
    )

    # Final guard against duplicated column names
    tr = tr.loc[:, ~tr.columns.duplicated(keep='first')]
    te = te.loc[:, ~te.columns.duplicated(keep='first')]

    preds = []
    for b_id, g_tr in tr.groupby('building_id', sort=False):
        # walk_forward_building returns predictions in datetime order, so the ids
        # they are paired with must be put into that same order first.
        g_te = te[te['building_id'] == b_id].sort_values('datetime')
        if g_te.empty or g_tr.empty:
            continue
        try:
            pred = walk_forward_building(g_tr, g_te)
        except Exception as e:
            # Deliberate best effort: one failing building must not abort the run.
            log(f"[B{b_id}] 오류 발생({e}); 시즌 나이브로 대체")
            pred = seasonal_naive(g_tr['target'].reset_index(drop=True), len(g_te), period=168)

        preds.append(pd.DataFrame({
            'num_date_time': g_te['num_date_time'].values,
            'answer': pred
        }))

    pred_all = pd.concat(preds, axis=0, ignore_index=True) if preds else pd.DataFrame(columns=['num_date_time', 'answer'])
    submission = test_df[['num_date_time']].merge(pred_all, on='num_date_time', how='left')
    if submission['answer'].isna().any():
        log("누락 예측 0으로 채움")
        submission['answer'] = submission['answer'].fillna(0.0)
    submission['answer'] = submission['answer'].astype(float).clip(lower=0)
    return submission[['num_date_time', 'answer']]

In [None]:
submission = build_features_and_predict(train_df, test_df)


[09:24:52] [B1] Fold 1/10 SMAPE=10.690 (train 1080, valid 24)
[09:24:57] [B1] Fold 2/10 SMAPE=5.810 (train 1151, valid 24)
[09:25:03] [B1] Fold 3/10 SMAPE=2.823 (train 1223, valid 24)
[09:25:09] [B1] Fold 4/10 SMAPE=3.986 (train 1294, valid 24)
[09:25:15] [B1] Fold 5/10 SMAPE=5.140 (train 1366, valid 24)
[09:25:21] [B1] Fold 6/10 SMAPE=4.276 (train 1437, valid 24)
[09:25:27] [B1] Fold 7/10 SMAPE=5.826 (train 1509, valid 24)
[09:25:33] [B1] Fold 8/10 SMAPE=2.906 (train 1580, valid 24)
[09:25:39] [B1] Fold 9/10 SMAPE=5.041 (train 1652, valid 24)
[09:25:45] [B1] Fold 10/10 SMAPE=4.303 (train 1724, valid 24)
[09:25:57] [B2] Fold 1/10 SMAPE=4.926 (train 1080, valid 24)
[09:26:03] [B2] Fold 2/10 SMAPE=4.415 (train 1151, valid 24)
[09:26:08] [B2] Fold 3/10 SMAPE=2.514 (train 1223, valid 24)
[09:26:14] [B2] Fold 4/10 SMAPE=3.508 (train 1294, valid 24)
[09:26:20] [B2] Fold 5/10 SMAPE=3.362 (train 1366, valid 24)
[09:26:26] [B2] Fold 6/10 SMAPE=3.636 (train 1437, valid 24)
[09:26:32] [B2] Fold 7

KeyboardInterrupt: 

In [None]:
# Export cell: validates the `submission` frame produced by the pipeline above,
# aligns it to sample_submission.csv when that file exists, cleans the values and
# writes a timestamped CSV into OUTPUT_DIR.

# 1) Paths (edit as needed)
DATA_DIR = "/content/drive/MyDrive/KUBIG/25_summer_contest"
SAMPLE_SUB_CSV = os.path.join(DATA_DIR, "sample_submission.csv")
OUTPUT_DIR = DATA_DIR
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2) Obtain the submission DataFrame (must already exist in the kernel)
if "submission" not in globals():
    raise RuntimeError("submission DataFrame이 없습니다. 위 파이프라인 실행 후 다시 시도하세요.")

sub = submission.copy()

# 2-1) Check that the required columns are present
required_cols = {"num_date_time", "answer"}
missing = required_cols - set(sub.columns)
if missing:
    raise KeyError(f"submission에 누락된 컬럼: {missing}")

# Keep num_date_time as a string (avoids competition format errors)
sub["num_date_time"] = sub["num_date_time"].astype(str)

# If a num_date_time occurs more than once, aggregate its answers by mean
if sub["num_date_time"].duplicated().any():
    sub = (sub.groupby("num_date_time", as_index=False)["answer"]
              .mean())  # switch to .last() if the last value should win

# 3) Format alignment: when sample_submission.csv exists, match its rows and order
if os.path.exists(SAMPLE_SUB_CSV):
    sample_sub = pd.read_csv(SAMPLE_SUB_CSV)
    if "num_date_time" not in sample_sub.columns:
        raise KeyError("sample_submission.csv에 num_date_time 컬럼이 없습니다.")

    # Same dtype on both sides of the join
    sample_sub["num_date_time"] = sample_sub["num_date_time"].astype(str)

    # Left join on the sample's keys (rows without a prediction become NaN)
    sub = sample_sub[["num_date_time"]].merge(
        sub[["num_date_time", "answer"]], on="num_date_time", how="left"
    )
else:
    # No sample file: just guarantee the minimal column layout
    sub = sub[["num_date_time", "answer"]].copy()

# 4) Value clean-up: handle NaN/inf and enforce a lower bound of 0
sub["answer"] = pd.to_numeric(sub["answer"], errors="coerce")
sub.replace([np.inf, -np.inf], np.nan, inplace=True)

missing_cnt = int(sub["answer"].isna().sum())
if missing_cnt > 0:
    print(f"[경고] 예측 누락 {missing_cnt}건 → 0으로 대체합니다.")
    sub["answer"] = sub["answer"].fillna(0.0)

# No negative values; limit decimals (file size / scoring safety)
sub["answer"] = sub["answer"].astype(float).clip(lower=0)
sub["answer"] = sub["answer"].round(6)

# 5) Save the file with a timestamp in its name
ts = time.strftime("%Y%m%d_%H%M%S")
out_path = os.path.join(OUTPUT_DIR, f"submission_gpu_xgb_{ts}.csv")
sub.to_csv(out_path, index=False)
print(f"[완료] 제출 파일 저장: {out_path}")

# 6) Preview the first rows and run a simple row-count check against the sample
print(sub.head())
if os.path.exists(SAMPLE_SUB_CSV):
    sample_rows = len(pd.read_csv(SAMPLE_SUB_CSV))
    if len(sub) != sample_rows:
        print(f"[경고] 제출 행수({len(sub)})가 sample({sample_rows})과 다릅니다. 포맷 확인 필요.")

RuntimeError: submission DataFrame이 없습니다. 위 파이프라인 실행 후 다시 시도하세요.

##time split


In [None]:
from sklearn.model_selection import TimeSeriesSplit

# Global defaults for the walk-forward cell.
# Each name is read from the kernel's globals when it already exists, and only
# falls back to the literal given here otherwise.
# NOTE(review): because of that lookup, a value assigned by any other cell that
# ran earlier in the session (for example an N_SPLITS defined elsewhere in this
# notebook) silently wins over these fallbacks — confirm this is intended.
N_SPLITS = globals().get('N_SPLITS', 7)          # fallback is 7; the original note recommends 5
EMBARGO  = globals().get('EMBARGO', 48)          # rows dropped between train and validation; 24-48 suggested
USE_LOG1P = globals().get('USE_LOG1P', True)     # fit on log1p(target) and invert with expm1
DO_BLEND_SEASONAL = globals().get('DO_BLEND_SEASONAL', True)  # blend model output with seasonal naive
BLEND_ALPHA = globals().get('BLEND_ALPHA', 0.2)  # weight given to the seasonal-naive forecast

def _iter_timeseries_splits(n_samples: int, n_splits: int, test_size: int,
                            embargo: Optional[int] = None):
    """
    Yield expanding-window (train_idx, valid_idx) pairs with a fixed validation size.

    Works independently of the installed scikit-learn version.
    For every fold: train = [0, va_start - embargo), valid = [va_start, va_start + test_size).

    Parameters
    ----------
    n_samples : total number of time-ordered rows.
    n_splits  : number of validation start points to spread evenly over the series.
    test_size : length of every validation window.
    embargo   : number of rows dropped between the end of training and the start
                of validation. When None (default) the module-level EMBARGO is used,
                which is the behaviour existing callers rely on.

    Yields
    ------
    (train_idx, valid_idx) : two integer numpy arrays. Folds whose training part
    would be shorter than 16 rows are skipped; nothing is yielded when the
    validation window is shorter than 8 rows or the series is too short.
    """
    min_train, min_valid = 16, 8
    gap = EMBARGO if embargo is None else int(embargo)

    # Series too short to hold one validation window plus the minimal training part.
    if n_samples <= test_size + min_train:
        return
    # Every validation window has exactly test_size rows, so this can be decided once.
    if test_size < min_valid:
        return

    # Evenly spaced validation start points; the first one leaves room for the
    # minimal training part plus the embargo.
    max_start = n_samples - test_size
    starts = np.linspace(min_train + gap, max_start, num=n_splits, dtype=int)
    # Keep only windows that fit into the series; np.unique also sorts and de-duplicates.
    starts = np.unique(starts[starts + test_size <= n_samples])

    for va_start in starts:
        tr_end = max(0, va_start - gap)
        if tr_end < min_train:
            continue
        yield (np.arange(0, tr_end, dtype=int),
               np.arange(va_start, va_start + test_size, dtype=int))

def walk_forward_building(df_b: pd.DataFrame, df_b_te: pd.DataFrame) -> np.ndarray:
    """
    Walk-forward validate and forecast one building with XGBoost.

    Parameters
    ----------
    df_b    : training rows of a single building; must contain 'datetime',
              'building_id' and 'target' plus the numeric feature columns.
    df_b_te : test rows of the same building; must contain 'datetime'.
              Feature columns missing from it are filled with 0.

    Returns
    -------
    np.ndarray with one non-negative prediction per test row, ordered by
    'datetime'. Falls back to a weekly seasonal-naive forecast when there are no
    usable features or when no validation fold could be built.
    """
    df_b = df_b.sort_values('datetime').reset_index(drop=True)
    df_b_te = df_b_te.sort_values('datetime').reset_index(drop=True)
    tag = f"[B{int(df_b['building_id'].iloc[0])}]"
    horizon = len(df_b_te)

    # Feature selection: numeric columns only, identifiers/keys excluded.
    exclude = {'target', 'datetime', 'num_date_time', 'building_id'}
    feat_cols = [c for c in df_b.columns
                 if c not in exclude and pd.api.types.is_numeric_dtype(df_b[c])]

    if len(feat_cols) == 0:
        log(f"{tag} 사용 가능한 피처가 없어 seasonal naive 사용")
        return seasonal_naive(df_b['target'], horizon=horizon, period=168)

    X = df_b[feat_cols].to_numpy()
    X_te = df_b_te.reindex(columns=feat_cols, fill_value=0).to_numpy()
    y = df_b['target'].astype(float).values
    # Clip before log1p so a stray negative target cannot produce NaN/-inf labels.
    y_trn = np.log1p(np.clip(y, 0.0, None)) if USE_LOG1P else y

    # Replace NaN/inf in the feature matrices with finite values.
    X = np.nan_to_num(X, copy=False, posinf=None, neginf=None)
    X_te = np.nan_to_num(X_te, copy=False, posinf=None, neginf=None)

    n = len(df_b)

    # Validation window: one week (168 h) when at least three weeks are available,
    # otherwise 10% of the series, bounded to [8, 336].
    TEST_SIZE = min(336, max(8, 168 if n >= 168*3 else int(n*0.1)))

    # Prefer sklearn's TimeSeriesSplit(test_size=...). TypeError covers sklearn
    # versions without the test_size argument; ValueError is what split() raises
    # when the series is too short for N_SPLITS windows of TEST_SIZE rows.
    folds = []
    try:
        tscv = TimeSeriesSplit(n_splits=N_SPLITS, test_size=TEST_SIZE)
        for tr_idx, va_idx in tscv.split(X):
            if EMBARGO > 0:
                # Drop the EMBARGO rows right before the validation window.
                tr_keep = tr_idx[tr_idx < max(0, va_idx[0] - EMBARGO)]
            else:
                tr_keep = tr_idx
            if len(tr_keep) >= 16 and len(va_idx) >= 8:
                folds.append((tr_keep, va_idx))
    except (TypeError, ValueError):
        folds = []

    # Too few usable folds: rebuild them with the version-independent splitter.
    if len(folds) < max(2, int(N_SPLITS*0.6)):
        folds = list(_iter_timeseries_splits(n, N_SPLITS, TEST_SIZE))

    # Without a single validation fold the model cannot be checked, so use the
    # seasonal-naive forecast directly instead of training a model that would be
    # thrown away.
    if len(folds) == 0:
        log(f"{tag} 유효한 폴드가 없어 seasonal naive 사용")
        return seasonal_naive(df_b['target'], horizon=horizon, period=168)

    for i, (tr_idx, va_idx) in enumerate(folds, start=1):
        model = XGBRegressor(**XGB_PARAMS)
        model.fit(X[tr_idx], y_trn[tr_idx])

        va_pred = model.predict(X[va_idx])
        va_pred = np.expm1(va_pred) if USE_LOG1P else va_pred
        va_pred = np.clip(va_pred, 0, None)

        # SMAPE is reported on the original (linear) target scale.
        s = smape(y[va_idx], va_pred)
        log(f"{tag} Fold {i}/{len(folds)} "
            f"(train {len(tr_idx)}, valid {len(va_idx)}), SMAPE={s:.3f}")
        del model; gc.collect()

    # Final fit on the whole training range.
    final_model = XGBRegressor(**XGB_PARAMS)
    final_model.fit(X, y_trn)
    pred_te = final_model.predict(X_te)
    pred_te = np.expm1(pred_te) if USE_LOG1P else pred_te
    pred_te = np.clip(pred_te, 0, None)

    # Optional blend with the weekly seasonal-naive forecast.
    if DO_BLEND_SEASONAL:
        s_pred = seasonal_naive(df_b['target'], horizon=horizon, period=168)
        pred_te = (1.0 - BLEND_ALPHA) * pred_te + BLEND_ALPHA * s_pred

    return pred_te




# --- 메인 파이프라인 (중복 building_id 제거/안전 groupby) ---
def build_features_and_predict(train_df: pd.DataFrame, test_df: pd.DataFrame,
                               head_cut: int = 200) -> pd.DataFrame:
    """
    Build features, forecast every building and assemble the submission frame.

    Parameters
    ----------
    train_df : training rows with 'building_id', 'datetime' and 'target'.
    test_df  : test rows with 'building_id', 'datetime' and 'num_date_time'.
    head_cut : number of leading rows removed per building after the lag/rolling
               features are created (default 200, the previously hard-coded value).
               Buildings with at most head_cut rows are dropped from training.

    Returns
    -------
    DataFrame with columns ['num_date_time', 'answer'] in test_df row order.
    Missing predictions are filled with 0 and answers are clipped at 0.

    NOTE(review): this notebook defines add_time_features / add_lag_roll more
    than once with different signatures. This function needs the variants that
    return the frame and accept group_keys / lag_hours / roll_hours — make sure
    the matching definition cell was the last one executed.
    """
    tr = train_df.copy()
    te = test_df.copy()

    # Normalise the building key to a nullable integer on both frames.
    if 'building_id' in tr.columns:
        tr['building_id'] = pd.to_numeric(tr['building_id'], errors='coerce').astype('Int64')
    if 'building_id' in te.columns:
        te['building_id'] = pd.to_numeric(te['building_id'], errors='coerce').astype('Int64')

    # Time features: the helper's return value is what gets used from here on.
    tr = add_time_features(tr)
    te = add_time_features(te)

    # Lag/rolling features are computed per building (train only).
    tr = add_lag_roll(tr, group_keys=['building_id'], lag_hours=(1, 24, 168), roll_hours=(24, 168))

    # Numeric clean-up: inf -> NaN -> 0.
    tr.replace([np.inf, -np.inf], np.nan, inplace=True)
    te.replace([np.inf, -np.inf], np.nan, inplace=True)
    tr = tr.fillna(0)
    te = te.fillna(0)

    # Drop the first head_cut rows of every building, where the lag/rolling
    # features are still incomplete. A cumcount mask gives the same rows as the
    # previous groupby.apply (short buildings vanish entirely) without relying on
    # how apply treats the grouping column, and without a Python call per group.
    tr = tr.sort_values(['building_id', 'datetime'])
    keep = tr['building_id'].notna() & (tr.groupby('building_id').cumcount() >= head_cut)
    tr = tr[keep].reset_index(drop=True)

    # Columns that must never reach the model.
    drop_cols = ['building_id', 'target', 'datetime', 'num_date_time']

    tr_feat = tr.drop(columns=[c for c in drop_cols if c in tr.columns], errors='ignore')
    te_feat = te.drop(columns=[c for c in drop_cols if c in te.columns], errors='ignore')

    # Give train and test the same feature columns.
    tr_feat, te_feat = align_train_test_columns(tr_feat, te_feat)

    # Re-attach keys. building_id stays in the frame for grouping but is excluded
    # from the model input inside walk_forward_building.
    tr = pd.concat(
        [tr[['building_id', 'datetime', 'target']].reset_index(drop=True),
         tr_feat.reset_index(drop=True)],
        axis=1
    )
    te = pd.concat(
        [te[['building_id', 'datetime', 'num_date_time']].reset_index(drop=True),
         te_feat.reset_index(drop=True)],
        axis=1
    )

    # Final guard against duplicated column names.
    tr = tr.loc[:, ~tr.columns.duplicated(keep='first')]
    te = te.loc[:, ~te.columns.duplicated(keep='first')]

    preds = []
    for b_id, g_tr in tr.groupby('building_id', sort=False):
        g_te = te[te['building_id'] == b_id].copy()
        if g_te.empty or g_tr.empty:
            continue
        try:
            pred = walk_forward_building(g_tr, g_te)
        except Exception as e:
            # Deliberate best effort: one failing building must not stop the run.
            log(f"[B{b_id}] 오류 발생({e}); 시즌 나이브로 대체")
            pred = seasonal_naive(g_tr['target'].reset_index(drop=True), len(g_te), period=168)

        preds.append(pd.DataFrame({
            'num_date_time': g_te['num_date_time'].values,
            'answer': pred
        }))

    pred_all = pd.concat(preds, axis=0, ignore_index=True) if preds else pd.DataFrame(columns=['num_date_time', 'answer'])
    submission = test_df[['num_date_time']].merge(pred_all, on='num_date_time', how='left')
    if submission['answer'].isna().any():
        log("누락 예측 0으로 채움")
        submission['answer'] = submission['answer'].fillna(0.0)
    submission['answer'] = submission['answer'].astype(float).clip(lower=0)
    return submission[['num_date_time', 'answer']]

In [None]:
# Run the per-building walk-forward pipeline on the in-memory train_df / test_df.
# The result is a frame with the columns 'num_date_time' and 'answer'.
submission = build_features_and_predict(train_df, test_df)


[23:44:07] [B1] Fold 1/10 (train 136, valid 168), SMAPE=6.292


KeyboardInterrupt: 

In [None]:
# 1) Paths (edit when the data lives elsewhere)
DATA_DIR = "/content/drive/MyDrive/KUBIG/25_summer_contest"
SAMPLE_SUB_CSV = os.path.join(DATA_DIR, "sample_submission.csv")
OUTPUT_DIR = DATA_DIR
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 2) The prediction frame must already exist in the kernel
if "submission" not in globals():
    raise RuntimeError("submission DataFrame이 없습니다. 위 파이프라인 실행 후 다시 시도하세요.")

sub = submission.copy()

# 2-1) Required columns and basic dtypes
required_cols = {"num_date_time", "answer"}
missing = required_cols - set(sub.columns)
if missing:
    raise KeyError(f"submission에 누락된 컬럼: {missing}")

# The key is always handled as a string so it matches the competition format.
sub["num_date_time"] = sub["num_date_time"].astype(str)

# Coerce the answers to numbers and turn +/-inf into NaN BEFORE any aggregation,
# so one bad value cannot break or poison the mean of duplicated keys.
sub["answer"] = pd.to_numeric(sub["answer"], errors="coerce").replace([np.inf, -np.inf], np.nan)

# Duplicated keys are averaged; sort=False keeps the original row order
# (switch .mean() to .last() to keep the latest value instead).
if sub["num_date_time"].duplicated().any():
    sub = sub.groupby("num_date_time", as_index=False, sort=False)["answer"].mean()

# 3) Align rows and order with sample_submission.csv when it is available
sample_sub = pd.read_csv(SAMPLE_SUB_CSV) if os.path.exists(SAMPLE_SUB_CSV) else None
if sample_sub is not None:
    if "num_date_time" not in sample_sub.columns:
        raise KeyError("sample_submission.csv에 num_date_time 컬럼이 없습니다.")

    sample_sub["num_date_time"] = sample_sub["num_date_time"].astype(str)

    # Left merge: keys without a prediction come out as NaN and are handled below.
    sub = sample_sub[["num_date_time"]].merge(
        sub[["num_date_time", "answer"]], on="num_date_time", how="left"
    )
else:
    # No template: just guarantee the two-column layout.
    sub = sub[["num_date_time", "answer"]].copy()

# 4) Missing predictions become 0, with a warning so they do not go unnoticed
missing_cnt = int(sub["answer"].isna().sum())
if missing_cnt > 0:
    print(f"[경고] 예측 누락 {missing_cnt}건 → 0으로 대체합니다.")
    sub["answer"] = sub["answer"].fillna(0.0)

# No negative answers; six decimals keep the file small.
sub["answer"] = sub["answer"].astype(float).clip(lower=0).round(6)

# 5) Save under a timestamped name so earlier submissions are never overwritten
ts = time.strftime("%Y%m%d_%H%M%S")
out_path = os.path.join(OUTPUT_DIR, f"submission_gpu_xgb_{ts}.csv")
sub.to_csv(out_path, index=False)
print(f"[완료] 제출 파일 저장: {out_path}")

# 6) Preview and a row-count check against the template that was already loaded
print(sub.head())
if sample_sub is not None:
    sample_rows = len(sample_sub)
    if len(sub) != sample_rows:
        print(f"[경고] 제출 행수({len(sub)})가 sample({sample_rows})과 다릅니다. 포맷 확인 필요.")

## 개선

In [None]:
# ============================================================
# GPU XGBoost (SMAPE 개선판, 단일 파일 실행용)
# - 도메인 파생변수(THI/불쾌지수/체감온도, wind_diff/hum_diff 등)
# - 누출 방지 lag/rolling (shift(1)), t-24/t-168 계열
# - Gap Walk‑Forward CV (시간 간격 비우기)
# - Seed 앙상블 + Seasonal‑Naive 블렌딩(소량)
# - 건물별 스케일 캘리브레이션(OOF 기반)
# - 제출 파일 생성
# 전제: train_df, test_df 가 이미 메모리에 존재 (Colab/Notebook)
# ============================================================

import os, gc, math, json, random, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Any

from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor

# =========================
# Configuration
# =========================
SEED_LIST         = [2025, 1029, 77]      # one model per seed; their predictions are averaged
N_SPLITS          = 10                     # number of walk-forward (TimeSeriesSplit) folds
GAP_HOURS         = 24                     # rows removed from training right before each validation fold (leak guard)
USE_LOG1P_TARGET  = True                   # fit on log1p(target), invert predictions with expm1
CLIP_MIN          = 0.0                    # lower bound applied to predictions (no negative answers)
BLEND_SEASONAL    = 0.10                   # weight of the seasonal-naive forecast in the blend (original note: 0-0.2)
BUILDING_RANGE    = None                   # e.g. range(1, 101); None takes the buildings found in the training data
SUBMISSION_IN     = '/content/drive/MyDrive/KUBIG/25_summer_contest/sample_submission.csv'
SUBMISSION_OUT    = f'/content/drive/MyDrive/KUBIG/25_summer_contest/submission_gpu_xgb_smape_tuned.csv'

# Base XGBoost hyper-parameters (per the original note: tuned for accuracy with
# stronger regularisation against over-fitting). random_state is added per seed
# by the training code.
XGB_BASE = dict(
    tree_method="hist",        # original note: used together with device='cuda' on xgboost>=2.0
    device="cuda",
    n_estimators=1800,
    learning_rate=0.06,
    max_depth=7,
    min_child_weight=6.0,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=2.0,
    reg_lambda=8.0,
    gamma=0.0,
    objective="reg:squarederror",
    n_jobs=-1
)

# =========================
# 유틸
# =========================
def set_seed(seed: int):
    """Seed Python's `random` module and NumPy's global RNG with the same value."""
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

def smape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """
    Symmetric mean absolute percentage error, in percent.

    Pairs where both the true and the predicted value are 0 have an undefined
    term (0/0) and are left out of the average.

    Parameters
    ----------
    y_true, y_pred : array-likes of equal length.

    Returns
    -------
    float in [0, 200]. Returns 0.0 when every pair is (0, 0) — such a forecast is
    exact — and NaN for empty input.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.size == 0:
        return float('nan')
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)
    mask = denom != 0
    # Averaging an empty selection would give NaN plus a RuntimeWarning.
    if not mask.any():
        return 0.0
    return float(100.0 * np.mean(diff[mask] / denom[mask]))

def seasonal_naive(series: pd.Series, horizon: int, period: int = 168) -> np.ndarray:
    """
    Forecast by repeating the last `period` observations.

    Parameters
    ----------
    series  : observed history (a pandas Series or any 1-D array-like).
    horizon : number of future steps to produce; negative values count as 0.
    period  : season length in steps (default 168, one week of hourly data).

    Returns
    -------
    float ndarray of length `horizon`:
    - the last `period` values tiled and cut to `horizon`,
    - the last value repeated when the history is shorter than `period`,
    - zeros when there is no history at all.

    Raises
    ------
    ValueError if `period` is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be >= 1")
    history = np.asarray(series, dtype=float).ravel()
    horizon = max(int(horizon), 0)

    # No history: return zeros instead of failing on the last-value lookup.
    if history.size == 0:
        return np.zeros(horizon, dtype=float)
    if history.size < period:
        return np.full(horizon, history[-1], dtype=float)

    ref = history[-period:]
    reps = int(np.ceil(horizon / period))
    return np.tile(ref, reps)[:horizon].astype(float)

# =========================
# 파생변수
# =========================
def add_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add calendar and cyclical time features derived from df['datetime'].

    The columns are written into `df` in place, and the same frame is returned so
    that both call styles work: `add_time_features(df)` and
    `df = add_time_features(df)`.

    Added columns: hour, dayofweek, week (ISO week number), month, is_weekend,
    sin_hour, cos_hour, sin_dow, cos_dow, summer_cos.
    """
    dt = pd.to_datetime(df['datetime'])
    df['hour']       = dt.dt.hour
    df['dayofweek']  = dt.dt.dayofweek
    df['week']       = dt.dt.isocalendar().week.astype(int)
    df['month']      = dt.dt.month
    df['is_weekend'] = (df['dayofweek'] >= 5).astype(int)
    # Cyclical encodings so hour 23 / hour 0 and Sunday / Monday stay close.
    df['sin_hour'] = np.sin(2*np.pi*df['hour']/24.0)
    df['cos_hour'] = np.cos(2*np.pi*df['hour']/24.0)
    df['sin_dow']  = np.sin(2*np.pi*df['dayofweek']/7.0)
    df['cos_dow']  = np.cos(2*np.pi*df['dayofweek']/7.0)
    # Summer phase term: 1.0 in June, 0.5 in July, -0.5 in August.
    df['summer_cos'] = np.cos((df['month']-6) * np.pi/3)
    return df

def add_weather_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add weather-derived features in place and return the same frame.

    Expected input columns: temperature, humidity, wind_speed, precipitation
    (presumably °C, %, m/s and mm, as the formulas below suggest — confirm
    against the data). A column that is missing is created first, as NaN, or as
    0.0 for precipitation.

    Added columns: thi, discomfort_idx, apparent_temp, wind_diff, hum_diff, is_rain.

    wind_diff / hum_diff are row-to-row differences. When the frame has a
    'building_id' column they are computed inside each building, so the first row
    of a building is never compared with the last row of the previous one.
    """
    for col, default in [('temperature', np.nan), ('humidity', np.nan),
                         ('wind_speed', np.nan), ('precipitation', 0.0)]:
        if col not in df.columns:
            df[col] = default

    T  = pd.to_numeric(df['temperature'], errors='coerce')
    RH = pd.to_numeric(df['humidity'], errors='coerce')
    W  = pd.to_numeric(df['wind_speed'], errors='coerce')

    # Temperature-humidity index (simple form): THI = T - (0.55 - 0.0055*RH)*(T - 14.5)
    df['thi'] = T - (0.55 - 0.0055*RH) * (T - 14.5)

    # Discomfort index: DI = 0.81*T + 0.01*RH*(0.99*T - 14.3) + 46.3
    df['discomfort_idx'] = 0.81*T + 0.01*RH*(0.99*T - 14.3) + 46.3

    # Apparent temperature (simplified): AT = T + 0.2*(0.348*RH/100*(T-4) + 0.70*wind + 0.7)
    df['apparent_temp'] = T + 0.2*(0.348*(RH/100.0)*(T-4.0) + 0.70*W + 0.7)

    # Difference features, per building whenever the key is available.
    if 'building_id' in df.columns:
        keys = df['building_id'].to_numpy()
        wind_delta = W.groupby(keys, sort=False).diff()
        hum_delta = RH.groupby(keys, sort=False).diff()
    else:
        wind_delta = W.diff()
        hum_delta = RH.diff()
    df['wind_diff'] = wind_delta.fillna(0.0)
    df['hum_diff']  = hum_delta.fillna(0.0)

    # Rain flag.
    P  = pd.to_numeric(df['precipitation'], errors='coerce').fillna(0.0)
    df['is_rain'] = (P > 0).astype(int)
    return df

def _group_lag_roll(
    df: pd.DataFrame,
    group_key: str,
    target_col: str,
    lags: List[int],
    rolls: List[Tuple[int, str]]
):
    """
    Add per-group lag and rolling features of `target_col` to `df` in place.

    Rows are assumed to be in time order inside every group.

    Parameters
    ----------
    df         : frame to extend; modified in place.
    group_key  : column identifying the group (e.g. the building).
    target_col : column to lag / aggregate.
    lags       : lag distances in rows; creates '<target_col>_lag<L>'.
    rolls      : (window, how) pairs with how in {'mean', 'median', 'std'};
                 creates '<target_col>_roll<window>_<how>'. Pairs with any other
                 `how` are ignored.

    Every feature only looks at rows strictly before the current one and never
    reaches into another group: the whole lag is one grouped shift, and the
    rolling statistic is evaluated group by group.
    """
    grouped = df.groupby(group_key, sort=False)[target_col]

    for lag in lags:
        # One grouped shift by the full distance. Chaining an ungrouped
        # .shift(lag - 1) after a grouped .shift(1) would pull values across
        # the group boundary.
        df[f'{target_col}_lag{lag}'] = grouped.shift(lag)

    for win, how in rolls:
        if how not in ('mean', 'median', 'std'):
            continue
        # shift(1) keeps the current row out of its own window.
        df[f'{target_col}_roll{win}_{how}'] = grouped.transform(
            lambda s, w=win, h=how: getattr(s.shift(1).rolling(w), h)()
        )

def add_lag_roll(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add per-building lag and rolling features of the target, in place.

    If the frame has no 'target' column but has 'consumption', 'target' is first
    created from it. Frames without either column (the test set) are left
    untouched. The frame is returned as well, so the call can be used as a
    statement or in an assignment.

    Features created (grouped by 'building_id'): lags 1, 2, 3, 24, 48, 72, 168 and
    rolling mean over 6/12/24/168 rows plus rolling std over 24 rows.
    """
    if 'target' not in df.columns and 'consumption' in df.columns:
        df['target'] = pd.to_numeric(df['consumption'], errors='coerce')

    if 'target' in df.columns:
        _group_lag_roll(
            df=df,
            group_key='building_id',
            target_col='target',
            lags=[1, 2, 3, 24, 48, 72, 168],
            rolls=[(6,'mean'), (12,'mean'), (24,'mean'), (24,'std'), (168,'mean')]
        )
    return df

def build_feature_matrix(tr: pd.DataFrame, te: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, List[str]]:
    """
    Create all model features on the train/test frames and pick the feature list.

    Both frames are modified in place and returned together with the sorted list
    of feature column names (numeric train columns that are not identifiers,
    targets or otherwise excluded).

    NOTE(review): lag/rolling features exist only for train. In test they are
    created as NaN and then filled with 0.0 below, so at prediction time the
    model sees zeros for every lag/rolling feature — confirm this is intended.
    """
    # Time and weather features on both frames.
    for frame in (tr, te):
        add_time_features(frame)
        add_weather_features(frame)

    # Lag/rolling features need the target, so train only.
    add_lag_roll(tr)

    # Mirror train-only columns into test as NaN placeholders.
    train_only = [name for name in tr.columns if name not in te.columns]
    for name in train_only:
        te[name] = np.nan

    # Columns that must never be used as model input.
    hard_exclude = {
        'datetime', 'num_date_time', 'answer',
        'target', 'consumption', 'building_id',
        'sunshine_hour', 'solar_radiation'
    }

    # Numeric, non-excluded train columns, in sorted order.
    feat_cols = sorted({
        name for name in tr.columns
        if name not in hard_exclude and pd.api.types.is_numeric_dtype(tr[name])
    })

    # Fill gaps (including the NaN produced by lag/rolling) with 0.
    tr[feat_cols] = tr[feat_cols].fillna(0.0)
    te[feat_cols] = te[feat_cols].fillna(0.0)

    return tr, te, feat_cols

# =========================
# 모델링 (건물 단위)
# =========================
def train_predict_building(
    df_b: pd.DataFrame,
    df_b_te: pd.DataFrame,
    feat_cols: List[str],
    n_splits: int = N_SPLITS,
    gap_hours: int = GAP_HOURS
) -> Dict[str, Any]:
    """
    Cross-validate and forecast one building with a seed ensemble of XGBoost models.

    Parameters
    ----------
    df_b      : training rows of one building ('datetime', 'target', features).
    df_b_te   : test rows of the same building ('datetime', features).
    feat_cols : candidate feature names; only those present in df_b are used.
    n_splits  : number of TimeSeriesSplit folds (reduced automatically when the
                building has too few rows).
    gap_hours : number of rows removed from the end of each training fold, right
                before the validation window.

    Returns
    -------
    dict with
      oof_idx    : positions (in datetime order) that received an out-of-fold prediction,
      oof_pred   : out-of-fold predictions on the linear scale, length len(df_b),
      test_pred  : test predictions (scaled, clipped, optionally blended),
      fold_smape : SMAPE of every evaluated fold,
      oof_smape  : SMAPE over all out-of-fold rows (NaN when there are none).
    """
    df_b = df_b.sort_values('datetime').reset_index(drop=True)
    df_b_te = df_b_te.sort_values('datetime').reset_index(drop=True)
    horizon = len(df_b_te)

    # Only features that really exist in this frame.
    use_cols = [c for c in feat_cols if c in df_b.columns]

    # Nothing to train on: fall back to seasonal naive (or zeros without history).
    if ('target' not in df_b.columns) or (len(use_cols) == 0) or (len(df_b) == 0):
        log("  [Fallback] target/feature 없음 → seasonal naive 사용")
        y_tr = df_b['consumption'] if 'consumption' in df_b.columns else pd.Series([], dtype=float)
        pred = seasonal_naive(y_tr.reset_index(drop=True), horizon=horizon, period=168) if len(y_tr) > 0 else np.zeros(horizon)
        return dict(oof_idx=np.array([], dtype=int), oof_pred=np.array([], dtype=float),
                    test_pred=pred, fold_smape=[], oof_smape=np.nan)

    # reindex never raises on a missing column; missing ones become 0.
    X   = df_b.reindex(columns=use_cols,   fill_value=0.0).to_numpy(dtype=float)
    X_te= df_b_te.reindex(columns=use_cols,fill_value=0.0).to_numpy(dtype=float)
    y   = df_b['target'].astype(float).values          # linear scale

    if USE_LOG1P_TARGET:
        y_fit = np.log1p(y.clip(min=0.0))              # training scale
    else:
        y_fit = y

    n = len(df_b)
    log(f"  [Info] samples={n}, features={len(use_cols)}, test_horizon={horizon}")

    # TimeSeriesSplit needs at least 2 splits and more samples than splits.
    n_folds = min(int(n_splits), n - 1)
    if n_folds >= 2:
        fold_iter = TimeSeriesSplit(n_splits=n_folds, test_size=None).split(X)
    else:
        log(f"  [CV] skipped: samples={n} is too small for cross-validation")
        fold_iter = []

    oof_pred = np.zeros(n, dtype=float)
    fold_metrics = []
    used_idx = []

    for fold, (tr_idx, va_idx) in enumerate(fold_iter, start=1):
        # Leave a gap between the end of training and the start of validation.
        if gap_hours > 0:
            gap_start = max(0, va_idx.min() - gap_hours)
            tr_idx = tr_idx[tr_idx < gap_start]

        if len(tr_idx) == 0 or len(va_idx) == 0:
            log(f"  [Fold {fold}/{n_folds}] 건너뜀 (train={len(tr_idx)}, val={len(va_idx)})")
            continue

        log(f"  [Fold {fold}/{n_folds}] train={len(tr_idx)}, val={len(va_idx)}")

        X_tr, y_tr = X[tr_idx], y_fit[tr_idx]
        X_va, y_va = X[va_idx], y_fit[va_idx]

        # Seed ensemble: average the predictions of one model per seed.
        va_pred_ens = np.zeros_like(y_va, dtype=float)
        for sd in SEED_LIST:
            set_seed(sd)
            params = dict(XGB_BASE)
            params.update(dict(random_state=sd))
            model = XGBRegressor(**params)
            # No eval_set: without early stopping it only adds per-round
            # evaluation cost and does not change the fitted model.
            model.fit(X_tr, y_tr, verbose=False)
            va_pred_ens += model.predict(X_va) / len(SEED_LIST)

        # Back to the linear scale.
        va_pred = np.expm1(va_pred_ens) if USE_LOG1P_TARGET else va_pred_ens
        va_pred = np.clip(va_pred, CLIP_MIN, None)
        oof_pred[va_idx] = va_pred
        used_idx.extend(list(va_idx))

        # y_va is on the training scale, so it is inverted before scoring.
        fold_sm = smape(np.expm1(y_va) if USE_LOG1P_TARGET else y_va, va_pred)
        fold_metrics.append(float(fold_sm))
        log(f"    → Fold SMAPE: {fold_sm:.3f}%")

    used_idx = np.array(sorted(set(used_idx)), dtype=int)

    oof_sm = np.nan
    scale_factor = 1.0
    if len(used_idx) > 0:
        # `y` is already on the linear scale. Applying expm1 to it overflows to
        # inf, which made the OOF SMAPE NaN and the scale factor inf.
        y_oof_true_lin = y[used_idx]
        oof_sm = smape(y_oof_true_lin, oof_pred[used_idx])

        # Level calibration: ratio of the mean truth to the mean OOF prediction.
        eps = 1e-6
        mean_true = float(np.mean(y_oof_true_lin) + eps)
        mean_pred = float(np.mean(oof_pred[used_idx]) + eps)
        ratio = mean_true / mean_pred
        # A non-finite or non-positive ratio would wreck every test prediction.
        if np.isfinite(ratio) and ratio > 0:
            scale_factor = ratio

    # Refit on all rows and predict the test horizon.
    test_pred_ens = np.zeros(horizon, dtype=float)
    for sd in SEED_LIST:
        set_seed(sd)
        params = dict(XGB_BASE)
        params.update(dict(random_state=sd))
        model = XGBRegressor(**params)
        model.fit(X, y_fit, verbose=False)
        test_pred_ens += model.predict(X_te) / len(SEED_LIST)

    test_pred = np.expm1(test_pred_ens) if USE_LOG1P_TARGET else test_pred_ens
    test_pred = np.clip(test_pred * scale_factor, CLIP_MIN, None)

    # Small blend with the seasonal-naive forecast (only when 'consumption' exists).
    if 'consumption' in df_b.columns:
        sn = seasonal_naive(df_b['consumption'].reset_index(drop=True), horizon=horizon, period=168)
        test_pred = (1.0 - BLEND_SEASONAL) * test_pred + BLEND_SEASONAL * sn

    log(f"  [Done] OOF SMAPE={oof_sm:.3f}%  scale={scale_factor:.4f}")
    return dict(
        oof_idx=used_idx,
        oof_pred=oof_pred,
        test_pred=test_pred,
        fold_smape=fold_metrics,
        oof_smape=float(oof_sm)
    )
# =========================
# 메인 파이프라인
# =========================
def run_pipeline(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """
    End-to-end run: features -> per-building models -> submission file.

    Parameters
    ----------
    train_df : training data with 'datetime', a building key ('building_id' or
               '건물번호') and a target ('target' or 'consumption').
    test_df  : test data with 'datetime' and the same building key; when it has a
               'num_date_time' column that column is used as the submission key.

    Returns
    -------
    dict with
      submission : DataFrame ['num_date_time', 'answer'] in template order,
      results    : {building_id: result dict of train_predict_building},
      oof_smape  : SMAPE over the out-of-fold predictions of all buildings.

    Raises
    ------
    KeyError if the submission template has no 'num_date_time' column.

    Side effects: reads SUBMISSION_IN and writes SUBMISSION_OUT.
    """
    tr = train_df.copy()
    te = test_df.copy()

    # Target: 'consumption' is taken as the target when 'target' is absent.
    if 'target' not in tr.columns:
        tr['target'] = pd.to_numeric(tr.get('consumption', np.nan), errors='coerce')

    # Building key: accept the original Korean column name on either frame.
    if 'building_id' not in tr.columns and '건물번호' in tr.columns:
        tr = tr.rename(columns={'건물번호': 'building_id'})
    if 'building_id' not in te.columns and '건물번호' in te.columns:
        te = te.rename(columns={'건물번호': 'building_id'})
    tr['building_id'] = pd.to_numeric(tr['building_id'], errors='coerce').astype(int)
    te['building_id'] = pd.to_numeric(te['building_id'], errors='coerce').astype(int)

    tr['datetime'] = pd.to_datetime(tr['datetime'])
    te['datetime'] = pd.to_datetime(te['datetime'])

    tr = tr.sort_values(['building_id','datetime']).reset_index(drop=True)
    te = te.sort_values(['building_id','datetime']).reset_index(drop=True)

    # Must be checked before feature building, which adds train-only columns
    # to the test frame as NaN placeholders.
    test_has_key = 'num_date_time' in te.columns

    # Features
    tr_feat, te_feat, feat_cols = build_feature_matrix(tr, te)

    # The test frame must not carry a target column.
    if 'target' in te_feat.columns:
        te_feat = te_feat.drop(columns=['target'])

    # Buildings to process
    if BUILDING_RANGE is None:
        b_list = sorted(tr_feat['building_id'].unique().tolist())
    else:
        b_list = list(BUILDING_RANGE)

    # Submission template
    sub = pd.read_csv(SUBMISSION_IN)
    if 'num_date_time' not in sub.columns:
        raise KeyError("submission template has no 'num_date_time' column")
    sub['key'] = sub['num_date_time'].astype(str)

    # Key of every test row: the competition key itself when the test frame has
    # it, otherwise a "<building>_<YYYYmmdd>_<HH>" string built from the columns.
    if test_has_key:
        te_feat['key'] = te_feat['num_date_time'].astype(str)
    else:
        te_feat['key'] = te_feat['building_id'].astype(str) + '_' + te_feat['datetime'].dt.strftime('%Y%m%d_%H')

    results = {}
    all_oof_pred = []
    all_oof_true = []
    pred_frames = []

    # Building loop
    for b in b_list:
        df_b = tr_feat[tr_feat['building_id'] == b].copy()
        df_b_te = te_feat[te_feat['building_id'] == b].copy()

        if len(df_b_te) == 0:
            continue

        res = train_predict_building(df_b, df_b_te, feat_cols)
        results[b] = res

        # Out-of-fold collection. 'target' is already on the linear scale, so it
        # is compared as is (an expm1 here would overflow to inf).
        oof_idx = res['oof_idx']
        if len(oof_idx) > 0:
            all_oof_pred.append(res['oof_pred'][oof_idx])
            all_oof_true.append(df_b['target'].to_numpy(dtype=float)[oof_idx])

        # train_predict_building predicts in datetime order; sort the keys the
        # same way so every prediction meets its own row.
        keys_b = df_b_te.sort_values('datetime')['key'].values
        pred_frames.append(pd.DataFrame({'key': keys_b, 'answer': res['test_pred']}))

        # Free memory between buildings.
        del df_b, df_b_te, res
        gc.collect()

    # Global OOF SMAPE (per-fold values are in results[b]['fold_smape'])
    if len(all_oof_pred) > 0:
        oof_sm = smape(np.concatenate(all_oof_true), np.concatenate(all_oof_pred))
    else:
        oof_sm = np.nan
    print(f"[Global] OOF SMAPE: {oof_sm:.4f}%")

    # Write the predictions into the template in one step. The template's own
    # placeholder answers are replaced; merging per building and then calling
    # fillna never overwrote them, because placeholders are not NaN.
    if pred_frames:
        pred_all = pd.concat(pred_frames, axis=0, ignore_index=True)
        pred_map = pred_all.drop_duplicates('key', keep='last').set_index('key')['answer']
        sub['answer'] = sub['key'].map(pred_map)
    else:
        sub['answer'] = np.nan

    sub_final = sub[['num_date_time','answer']].copy()
    sub_final['answer'] = (
        pd.to_numeric(sub_final['answer'], errors='coerce')
          .replace([np.inf, -np.inf], np.nan)
    )
    n_missing = int(sub_final['answer'].isna().sum())
    if n_missing > 0:
        print(f"[Warn] {n_missing} rows have no prediction; filled with 0.0")
    sub_final['answer'] = sub_final['answer'].fillna(0.0).clip(CLIP_MIN, None)

    # Save
    os.makedirs(os.path.dirname(SUBMISSION_OUT), exist_ok=True)
    sub_final.to_csv(SUBMISSION_OUT, index=False)
    print(f"Saved submission → {SUBMISSION_OUT}")

    return dict(
        submission=sub_final,
        results=results,
        oof_smape=oof_sm
    )

# =========================
# Run
# =========================
# Executes the whole pipeline on the in-memory train_df / test_df.
# `out` is a dict with the keys 'submission', 'results' and 'oof_smape'.
out = run_pipeline(train_df, test_df)


[23:44:14]   [Info] samples=2040, features=35, test_horizon=168
[23:44:14]   [Fold 1/10] train=166, val=185
[23:44:19]     → Fold SMAPE: 2.678%
[23:44:19]   [Fold 2/10] train=351, val=185
[23:44:24]     → Fold SMAPE: 2.966%
[23:44:24]   [Fold 3/10] train=536, val=185
[23:44:29]     → Fold SMAPE: 1.330%
[23:44:29]   [Fold 4/10] train=721, val=185
[23:44:34]     → Fold SMAPE: 2.483%
[23:44:34]   [Fold 5/10] train=906, val=185
[23:44:40]     → Fold SMAPE: 1.745%
[23:44:40]   [Fold 6/10] train=1091, val=185
[23:44:45]     → Fold SMAPE: 1.509%
[23:44:45]   [Fold 7/10] train=1276, val=185
[23:44:50]     → Fold SMAPE: 1.418%
[23:44:50]   [Fold 8/10] train=1461, val=185
[23:44:56]     → Fold SMAPE: 1.807%
[23:44:56]   [Fold 9/10] train=1646, val=185
[23:45:02]     → Fold SMAPE: 0.962%
[23:45:02]   [Fold 10/10] train=1831, val=185
[23:45:07]     → Fold SMAPE: 1.631%
[23:45:11]   [Done] OOF SMAPE=nan%  scale=inf
[23:45:11]   [Info] samples=2040, features=35, test_horizon=168
[23:45:11]   [Fold 1

In [None]:
from datetime import datetime

# --- Build a timestamped output path ---
SAVE_DIR = '/content/drive/MyDrive/KUBIG/25_summer_contest'
os.makedirs(SAVE_DIR, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
SUBMISSION_OUT = os.path.join(SAVE_DIR, f"submission_gpu_xgb_{timestamp}.csv")

# The final frame is local to run_pipeline(); it is only reachable through the
# dict that run_pipeline() returned.
if "out" not in globals() or "submission" not in out:
    raise RuntimeError("run_pipeline() result `out` not found; run the pipeline cell first.")
sub_final = out["submission"]

# Final save
sub_final.to_csv(SUBMISSION_OUT, index=False)
print(f"Saved submission → {SUBMISSION_OUT}")


NameError: name 'sub_final' is not defined