# 함수 선언 및  import

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib


import sklearn
import xgboost
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit
from optuna_integration.xgboost import XGBoostPruningCallback
import optuna
from sklearn.model_selection import KFold


import random as rn

RANDOM_SEED = 2023
np.random.seed(RANDOM_SEED)
rn.seed(RANDOM_SEED)



from datetime import datetime



import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print("pandas version: {}". format(pd.__version__))
print("NumPy version: {}". format(np.__version__))
print("scikit-learn version: {}". format(sklearn.__version__))
print("matplotlib version: {}". format(matplotlib.__version__))
print("seaborn version: {}". format(sns.__version__))
print("XGBoost version: {}". format(xgboost.__version__))

pandas version: 2.2.3
NumPy version: 2.2.5
scikit-learn version: 1.7.1
matplotlib version: 3.10.5
seaborn version: 0.13.2
XGBoost version: 3.0.3


In [3]:
def smape(gt, preds):
    gt= np.array(gt)
    preds = np.array(preds)
    v = 2 * abs(preds - gt) / (abs(preds) + abs(gt))
    score = np.mean(v) * 100
    return score

In [4]:
def weighted_mse(alpha = 1):
    def weighted_mse_fixed(preds, dtrain):
        labels = dtrain.get_label()
        residual = (labels - preds).astype("float")
        grad = np.where(residual > 0, -2 * alpha * residual, -2 * residual)
        hess = np.where(residual > 0, 2 * alpha, 2.0)
        return grad, hess
    return weighted_mse_fixed

In [5]:
def custom_smape(preds, dtrain):
    labels = dtrain.get_label()
    return 'custom_smape', np.mean(2 * abs(preds - labels) / (abs(preds) + abs(labels))) * 100

# 데이터 전처리

## 데이터 불러오기

In [6]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
building_info = pd.read_csv('building_info.csv')

In [7]:
train = train.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
train.drop('num_date_time', axis = 1, inplace=True)

test = test.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(°C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
test.drop('num_date_time', axis = 1, inplace=True)

In [8]:
building_info = building_info.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})

In [9]:
translation_dict = {
    '건물기타': 'Other Buildings',
    '공공': 'Public',
    '대학교': 'University',
    '데이터센터': 'Data Center',
    '백화점및아울렛': 'Department Store and Outlet',
    '병원': 'Hospital',
    '상용': 'Commercial',
    '아파트': 'Apartment',
    '연구소': 'Research Institute',
    '지식산업센터': 'Knowledge Industry Center',
    '할인마트': 'Discount Mart',
    '호텔및리조트': 'Hotel and Resort'
}

building_info['building_type'] = building_info['building_type'].replace(translation_dict)

In [10]:
building_info['solar_power_utility'] = np.where(building_info.solar_power_capacity !='-',1,0)
building_info['ess_utility'] = np.where(building_info.ess_capacity !='-',1,0)

In [11]:
train = pd.merge(train, building_info, on='building_number', how='left')
test = pd.merge(test, building_info, on='building_number', how='left')

## 결측치 확인 및 보간

In [12]:
train.isna().sum()

building_number         0
date_time               0
temperature             0
rainfall                0
windspeed               0
humidity                0
sunshine                0
solar_radiation         0
power_consumption       0
building_type           0
total_area              0
cooling_area            0
solar_power_capacity    0
ess_capacity            0
pcs_capacity            0
solar_power_utility     0
ess_utility             0
dtype: int64

In [13]:
train.solar_power_capacity.value_counts()

solar_power_capacity
-          95880
278.58      2040
1983.05     2040
389.76      2040
217.92      2040
1349.03     2040
276         2040
849.78      2040
1974.71     2040
97          2040
100         2040
255.88      2040
879.82      2040
858.02      2040
953.4       2040
83.95       2040
97.85       2040
344.96      2040
342         2040
12.24       2040
83.2        2040
74.25       2040
140.6       2040
77.76       2040
198.56      2040
10.08       2040
218.88      2040
1039.86     2040
105         2040
215.89      2040
1397.02     2040
199.4       2040
219.6       2040
820         2040
540.57      2040
50.88       2040
94.38       2040
1297.98     2040
1340.1      2040
103         2040
419.2       2040
36          2040
790.08      2040
605         2040
495         2040
282.95      2040
38.88       2040
77.38       2040
168         2040
322.9       2040
99.64       2040
20.25       2040
81.38       2040
171.6       2040
Name: count, dtype: int64

In [14]:
train.ess_capacity.value_counts()

ess_capacity
-        179520
2000       4080
101.5      2040
1025       2040
500        2040
3100       2040
205        2040
469.2      2040
1670       2040
150        2040
201.1      2040
209        2040
Name: count, dtype: int64

In [15]:
train.pcs_capacity.value_counts()

pcs_capacity
-       179520
101       4080
1000      4080
500       4080
250       2040
1500      2040
150       2040
550       2040
75        2040
100       2040
Name: count, dtype: int64

In [16]:
train['windspeed']= train.windspeed.interpolate()
train['humidity']= train.humidity.interpolate()

## Datetime 분리

In [17]:
train['date_time'] = pd.to_datetime(train['date_time'], format='%Y%m%d %H')

# date time feature 생성
train['hour'] = train['date_time'].dt.hour
train['day'] = train['date_time'].dt.day
train['month'] = train['date_time'].dt.month
train['day_of_week'] = train['date_time'].dt.dayofweek #요일


test['date_time'] = pd.to_datetime(test['date_time'], format='%Y%m%d %H')

# date time feature 생성
test['hour'] = test['date_time'].dt.hour
test['day'] = test['date_time'].dt.day
test['month'] = test['date_time'].dt.month
test['day_of_week'] = test['date_time'].dt.dayofweek #요일

## Feature Engineering

### 평균기온, 최대기온 변수

In [18]:
def calculate_day_values(dataframe, target_column, output_column, aggregation_func):
    result_dict = {}

    grouped_temp = dataframe.groupby(['building_number', 'month', 'day'])[target_column].agg(aggregation_func)

    for (building, month, day), value in grouped_temp.items():
        result_dict.setdefault(building, {}).setdefault(month, {})[day] = value

    dataframe[output_column] = [
        result_dict.get(row['building_number'], {}).get(row['month'], {}).get(row['day'], None)
        for _, row in dataframe.iterrows()
    ]

    
train['day_max_temperature'] = 0.0
train['day_mean_temperature'] = 0.0

calculate_day_values(train, 'temperature', 'day_max_temperature', 'max')
calculate_day_values(train, 'temperature', 'day_mean_temperature', 'mean')
calculate_day_values(train, 'temperature', 'day_min_temperature', 'min')

train['day_temperature_range'] = train['day_max_temperature'] - train['day_min_temperature']

calculate_day_values(test, 'temperature', 'day_max_temperature', 'max')
calculate_day_values(test, 'temperature', 'day_mean_temperature', 'mean')
calculate_day_values(test, 'temperature', 'day_min_temperature', 'min')

test['day_temperature_range'] = test['day_max_temperature'] - test['day_min_temperature']


### Outlier drop

In [19]:
outlier_list = [68973,71013,112384,123132,150739,150740,150741,150742,
                150883,150884,150885,150886,138904,193120,193121,152393]

train.drop(outlier_list, axis=0,inplace=True)

### 임시 휴무 추측 데이터 drop

In [20]:
# temp_hol = {2 : ['2022-06-17'], 
#     5 : ['2022-07-25','2022-08-02','2022-08-09','2022-08-16'],
#     11 : ['2022-06-17'], 12 : ['2022-07-02'], 17 : ['2022-06-18','2022-07-25'],
#     21 : ['2022-07-01','2022-07-03','2022-07-17','2022-07-30'], 
#     37 : ['2022-06-20','2022-07-11','2022-08-08'], 
#     38 : ['2022-06-13','2022-07-25','2022-08-01'],
#     39 : ['2022-07-18','2022-08-08'],
#     40 : ['2022-06-20','2022-07-18','2022-08-08'],
#     41 : ['2022-06-27','2022-07-25','2022-08-08'],
#     42 : ['2022-06-13','2022-07-11','2022-08-22'],
#     54 : ['2022-08-16','2022-08-17'],74 : ['2022-06-03'],
#     75 : ['2022-06-15','2022-06-17','2022-06-20','2022-06-21'],
#     86 : ['2022-06-10','2022-08-10'],
#     89 : ['2022-07-09'], 91 : ['2022-06-13','2022-07-11','2022-08-22','2022-06-08'], 92 : ['2022-07-30']}


# mask = train.apply(lambda x: x['building_number'] in temp_hol and str(x['date_time'])[:10] in temp_hol[x['building_number']], axis=1)

# train.drop(train[mask].index, axis=0, inplace=True)

# train.reset_index(drop=True, inplace=True)

### 공휴일변수

In [21]:
holi_weekday = [
    '2024-06-06',  # 현충일 (목)
    '2024-08-15'   # 광복절 (목)
]

train['holiday'] = np.where((train.day_of_week >= 5) | (train.date_time.dt.strftime('%Y-%m-%d').isin(holi_weekday)), 1, 0)
test['holiday'] = np.where((test.day_of_week >= 5) | (test.date_time.dt.strftime('%Y-%m-%d').isin(holi_weekday)), 1, 0)

### 대형마트 휴무일요일 변수

In [22]:
holi_sun = [
    '2024-06-09', '2024-06-23',  # 6월
    '2024-07-14', '2024-07-28',  # 7월
    '2024-08-11', '2024-08-25'   # 8월
]

# 의무 휴업 일요일이면 1, 아니면 0
train['Sunday_holiday'] = np.where((train.day_of_week == 6) & (train.date_time.dt.strftime('%Y-%m-%d').isin(holi_sun)), 1, 0)
test['Sunday_holiday'] = np.where((test.day_of_week == 6) & (test.date_time.dt.strftime('%Y-%m-%d').isin(holi_sun)), 1, 0)

### 시간변수 푸리에변환

In [23]:
#시간
train['sin_hour'] = np.sin(2 * np.pi * train['hour']/23.0)
train['cos_hour'] = np.cos(2 * np.pi * train['hour']/23.0)
test['sin_hour'] = np.sin(2 * np.pi * test['hour']/23.0)
test['cos_hour'] = np.cos(2 * np.pi * test['hour']/23.0)

#날짜
train['sin_date'] = -np.sin(2 * np.pi * (train['month']+train['day']/31)/12)
train['cos_date'] = -np.cos(2 * np.pi * (train['month']+train['day']/31)/12)
test['sin_date'] = -np.sin(2 * np.pi * (test['month']+test['day']/31)/12)
test['cos_date'] = -np.cos(2 * np.pi * (test['month']+test['day']/31)/12)

#월
train['sin_month'] = -np.sin(2 * np.pi * train['month']/12.0)
train['cos_month'] = -np.cos(2 * np.pi * train['month']/12.0)
test['sin_month'] = -np.sin(2 * np.pi * test['month']/12.0)
test['cos_month'] = -np.cos(2 * np.pi * test['month']/12.0)

#요일
train['sin_dayofweek'] = -np.sin(2 * np.pi * (train['day_of_week']+1)/7.0)
train['cos_dayofweek'] = -np.cos(2 * np.pi * (train['day_of_week']+1)/7.0)
test['sin_dayofweek'] = -np.sin(2 * np.pi * (test['day_of_week']+1)/7.0)
test['cos_dayofweek'] = -np.cos(2 * np.pi * (test['day_of_week']+1)/7.0)

### CDH(냉방도시) 변수

In [24]:
def CDH(xs):
    cumsum = np.cumsum(xs - 26)
    return np.concatenate((cumsum[:11], cumsum[11:] - cumsum[:-11]))

def calculate_and_add_cdh(dataframe):
    cdhs = []
    for i in range(1, 101):
        temp = dataframe[dataframe['building_number'] == i]['temperature'].values
        cdh = CDH(temp)
        cdhs.append(cdh)
    return np.concatenate(cdhs)

train['CDH'] = calculate_and_add_cdh(train)
test['CDH'] = calculate_and_add_cdh(test)

### THI(불쾌지수) 변수

In [25]:
train['THI'] = 9/5*train['temperature'] - 0.55*(1-train['humidity']/100)*(9/5*train['humidity']-26)+32

test['THI'] = 9/5*test['temperature'] - 0.55*(1-test['humidity']/100)*(9/5*test['humidity']-26)+32

### WCT(체감온도) 변수

In [26]:
train['WCT'] = 13.12 + 0.6125*train['temperature'] - 11.37*(train['windspeed']**
                                                            0.16) + 0.3965*(train['windspeed']**0.16)*train['temperature']
test['WCT'] = 13.12 + 0.6125*test['temperature'] - 11.37*(test['windspeed']**
                                                            0.16) + 0.3965*(test['windspeed']**0.16)*test['temperature']

### 전력소비 통계량 변수

In [27]:
# Calculate 'day_hour_mean'
power_mean = pd.pivot_table(train, values='power_consumption', index=['building_number', 'hour', 'day_of_week'], aggfunc=np.mean).reset_index()
power_mean.columns = ['building_number', 'hour', 'day_of_week', 'day_hour_mean']

# Calculate 'day_hour_std'
power_std = pd.pivot_table(train, values='power_consumption', index=['building_number', 'hour', 'day_of_week'], aggfunc=np.std).reset_index()
power_std.columns = ['building_number', 'hour', 'day_of_week', 'day_hour_std']

# Calculate 'hour_mean'
power_hour_mean = pd.pivot_table(train, values='power_consumption', index=['building_number', 'hour'], aggfunc=np.mean).reset_index()
power_hour_mean.columns = ['building_number', 'hour', 'hour_mean']

# Calculate 'hour_std'
power_hour_std = pd.pivot_table(train, values='power_consumption', index=['building_number', 'hour'], aggfunc=np.std).reset_index()
power_hour_std.columns = ['building_number', 'hour', 'hour_std']

# Merge calculated features to 'train' and 'test' dataframes
train = train.merge(power_mean, on=['building_number', 'hour', 'day_of_week'], how='left')
test = test.merge(power_mean, on=['building_number', 'hour', 'day_of_week'], how='left')

train = train.merge(power_std, on=['building_number', 'hour', 'day_of_week'], how='left')
test = test.merge(power_std, on=['building_number', 'hour', 'day_of_week'], how='left')

train = train.merge(power_hour_mean, on=['building_number', 'hour'], how='left')
test = test.merge(power_hour_mean, on=['building_number', 'hour'], how='left')

train = train.merge(power_hour_std, on=['building_number', 'hour'], how='left')
test = test.merge(power_hour_std, on=['building_number', 'hour'], how='left')

train = train.reset_index(drop=True)

# 모델링

## X,Y,test 선언

In [28]:
X = train.drop(['solar_power_capacity', 'ess_capacity', 'pcs_capacity',
                'power_consumption','rainfall', 'sunshine', 'solar_radiation',
                'hour','day','month','day_of_week','date_time'],axis =1 )

Y = train[['building_type','power_consumption']]

test_X = test.drop(['solar_power_capacity', 'ess_capacity', 'pcs_capacity','rainfall',
                   'hour','month','day_of_week','day','date_time'], axis=1)

In [29]:
type_list = []
for value in train.building_type.values:
    if value not in type_list:
        type_list.append(value)

## XGB 건물 유형별 단일모델

In [30]:
# ========= 1) 커스텀 SMAPE (feval) =========
# xgb.train용 커스텀 평가 함수: (name, value) 반환, is_higher_better=False
def feval_smape(preds, dtrain):
    labels = dtrain.get_label()
    # labels, preds 모두 log1p 스케일로 들어오므로 원스케일로 복원
    preds = np.expm1(preds)
    labels = np.expm1(labels)
    v = 2.0 * np.abs(preds - labels) / (np.abs(preds) + np.abs(labels) + 1e-9)
    score = float(np.mean(v) * 100.0)
    return ("smape", score)

# (외부 평가용) 배열-배열 SMAPE
def smape(y_true, y_pred):
    v = 2.0 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-9)
    return float(np.mean(v) * 100.0)

In [44]:
best_params_dict = {}

for building_type in type_list:
    print(f"\n================ HPO for Building Type: {building_type} ================\n")

    # 데이터 준비 (원핫까지)
    x_opt = X[X.building_type == building_type].copy()
    y_opt = Y[Y.building_type == building_type]['power_consumption'].copy()
    x_opt.drop(['building_type'], axis=1, inplace=True)
    x_opt = pd.get_dummies(x_opt, columns=['building_number'], drop_first=False)

    # numpy float32 권장
    X_np = x_opt.values.astype(np.float32)
    y_np = y_opt.values.astype(np.float32)

    splitter = TimeSeriesSplit(n_splits=3)  # 필요 시 KFold로 교체

    def objective(trial):
        # 탐색공간
        mbin = trial.suggest_int("max_bin", 192, 384)
        params = {
            "objective": "reg:squarederror",          # obj는 아래 xgb.train에서 교체됨
            "eta": trial.suggest_float("eta", 0.01, 0.2, log=True),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 12),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),

            # GPU 최적화 (xgboost 3.x)
            "device": "cuda",
            "tree_method": "hist",
            "predictor": "gpu_predictor",
            "single_precision_histogram": True,

            # Booster 쪽도 동일 max_bin 사용
            "max_bin": mbin,
        }
        alpha = trial.suggest_float("alpha", 0.5, 2.0)  # ← 너의 weighted_mse()용 하이퍼

        fold_scores = []
        for tr_idx, va_idx in splitter.split(X_np):
            X_tr, X_va = X_np[tr_idx], X_np[va_idx]
            y_tr, y_va = y_np[tr_idx], y_np[va_idx]

            # QuantileDMatrix (train은 max_bin 지정, valid는 ref 공유 + 명시)
            dtrain = xgboost.QuantileDMatrix(X_tr, label=y_tr, max_bin=mbin)
            dvalid = xgboost.QuantileDMatrix(X_va, label=y_va, ref=dtrain, max_bin=mbin)

            prune_cb = XGBoostPruningCallback(trial, "validation_0-custom_smape")

            booster = xgboost.train(
                params=params,
                dtrain=dtrain,
                num_boost_round=5000,
                evals=[(dvalid, "validation_0")],
                # 커스텀 목적함수/지표
                obj=weighted_mse(alpha),     # ←  weighted MSE 사용
                custom_metric=custom_smape,  # ←  SMAPE 사용(낮을수록 좋음)
                maximize=False,
                early_stopping_rounds=100,
                verbose_eval=False,
                callbacks=[prune_cb],
            )

            preds = booster.predict(dvalid, iteration_range=(0, booster.best_iteration + 1))
            fold_scores.append(smape(y_va, preds))  # 외부에서도 SMAPE로 평균

        return float(np.mean(fold_scores))

    # Study (프루닝 조금 완화하고 싶으면 n_startup_trials 늘리기)
    study = optuna.create_study(
        direction="minimize",
        sampler=optuna.samplers.TPESampler(n_startup_trials=10, multivariate=True, group=True),
        pruner=optuna.pruners.MedianPruner(n_startup_trials=10),
    )

    # enqueue_trial에 max_bin 넣지 마세요(충돌 유발). 필요하면 alpha 등만.
    study.enqueue_trial({
        "eta": 0.1,
        "max_depth": 6,
        "min_child_weight": 2,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "reg_lambda": 1.0,
        "reg_alpha": 1e-8,
        "alpha": 1.0,      # 커스텀 목적함수 기본 베이스라인
        # "max_bin": 256,   # ← 넣지 말기 (충돌 위험)
    })

    study.optimize(objective, n_trials=60, n_jobs=1)

    best_params_dict[building_type] = study.best_params
    print(f"==> Best SMAPE for {building_type}: {study.best_value:.4f}")
    print(f"==> Best Params: {study.best_params}")


[I 2025-08-09 17:52:39,610] A new study created in memory with name: no-name-5a1bfdf0-6b2c-415a-9271-8357425433ce






[I 2025-08-09 17:52:46,389] Trial 0 finished with value: 18.131908734639484 and parameters: {'max_bin': 245, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 18.131908734639484.
[I 2025-08-09 17:52:57,700] Trial 1 finished with value: 20.027572949727375 and parameters: {'max_bin': 365, 'eta': 0.0365357418368711, 'max_depth': 6, 'min_child_weight': 11, 'subsample': 0.8565976093557622, 'colsample_bytree': 0.6813836991937351, 'reg_lambda': 0.00014038948347976202, 'reg_alpha': 2.018613491440372e-08, 'alpha': 1.8724250121695947}. Best is trial 0 with value: 18.131908734639484.
[I 2025-08-09 17:53:05,835] Trial 2 finished with value: 18.37404664357503 and parameters: {'max_bin': 381, 'eta': 0.06214973498245014, 'max_depth': 8, 'min_child_weight': 9, 'subsample': 0.6131932457047152, 'colsample_bytree': 0.9071395232594799, 'reg_lambda': 0.057654585558934955, 'reg_alpha

==> Best SMAPE for 호텔: 17.0960
==> Best Params: {'max_bin': 299, 'eta': 0.17135940832365346, 'max_depth': 4, 'min_child_weight': 4, 'subsample': 0.704672997977235, 'colsample_bytree': 0.8804324778801227, 'reg_lambda': 5.822529927466454e-06, 'reg_alpha': 2.69856367626827e-06, 'alpha': 1.329573661566136}




[I 2025-08-09 17:56:48,000] Trial 0 finished with value: 9.951410293579102 and parameters: {'max_bin': 353, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 9.951410293579102.
[I 2025-08-09 17:56:54,757] Trial 1 finished with value: 14.477431615193685 and parameters: {'max_bin': 287, 'eta': 0.06687110757550044, 'max_depth': 5, 'min_child_weight': 6, 'subsample': 0.7634450360663716, 'colsample_bytree': 0.6421430589327092, 'reg_lambda': 2.8559378473948523e-06, 'reg_alpha': 0.0008419664149890112, 'alpha': 1.755682799535588}. Best is trial 0 with value: 9.951410293579102.
[I 2025-08-09 17:57:02,566] Trial 2 finished with value: 9.680558204650879 and parameters: {'max_bin': 361, 'eta': 0.06750788489746047, 'max_depth': 5, 'min_child_weight': 8, 'subsample': 0.6369779580158647, 'colsample_bytree': 0.7903931951974031, 'reg_lambda': 5.1745219807679384e-08, 'reg_alpha':

==> Best SMAPE for Commercial: 8.2688
==> Best Params: {'max_bin': 383, 'eta': 0.14160503232904864, 'max_depth': 9, 'min_child_weight': 12, 'subsample': 0.8101306225860281, 'colsample_bytree': 0.98313885089423, 'reg_lambda': 0.4324914266548993, 'reg_alpha': 0.00020136763908664874, 'alpha': 0.6613035391048088}




[I 2025-08-09 18:02:33,230] Trial 0 finished with value: 7.961862246195476 and parameters: {'max_bin': 284, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 7.961862246195476.
[I 2025-08-09 18:02:51,518] Trial 1 finished with value: 7.648314317067464 and parameters: {'max_bin': 199, 'eta': 0.040731558362181064, 'max_depth': 8, 'min_child_weight': 1, 'subsample': 0.9028204034222509, 'colsample_bytree': 0.9582684406159724, 'reg_lambda': 2.278412468576809, 'reg_alpha': 6.0248030600693025e-06, 'alpha': 1.7209560824616692}. Best is trial 1 with value: 7.648314317067464.
[I 2025-08-09 18:03:03,631] Trial 2 finished with value: 8.697098731994629 and parameters: {'max_bin': 312, 'eta': 0.03325447193656214, 'max_depth': 3, 'min_child_weight': 12, 'subsample': 0.6145235874858923, 'colsample_bytree': 0.6673192999508671, 'reg_lambda': 1.5546660768586154, 'reg_alpha': 0.002

==> Best SMAPE for Hospital: 7.1554
==> Best Params: {'max_bin': 337, 'eta': 0.14139562079774268, 'max_depth': 5, 'min_child_weight': 2, 'subsample': 0.8982581702031441, 'colsample_bytree': 0.988039015417535, 'reg_lambda': 9.927880413589548e-08, 'reg_alpha': 0.00048676926572586824, 'alpha': 1.0267265590847787}




[I 2025-08-09 18:08:17,264] Trial 0 finished with value: 19.328946113586426 and parameters: {'max_bin': 222, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 19.328946113586426.
[I 2025-08-09 18:08:36,283] Trial 1 finished with value: 19.51204776763916 and parameters: {'max_bin': 215, 'eta': 0.018258540721166356, 'max_depth': 5, 'min_child_weight': 11, 'subsample': 0.7004640839199697, 'colsample_bytree': 0.6411007715899061, 'reg_lambda': 0.01694153927226088, 'reg_alpha': 2.323099792280637e-06, 'alpha': 0.9923488458366755}. Best is trial 0 with value: 19.328946113586426.
[I 2025-08-09 18:08:41,950] Trial 2 finished with value: 16.860386848449707 and parameters: {'max_bin': 261, 'eta': 0.11423655914488764, 'max_depth': 4, 'min_child_weight': 11, 'subsample': 0.8037383353553581, 'colsample_bytree': 0.8364075373393424, 'reg_lambda': 6.029390998607908, 'reg_alpha': 

==> Best SMAPE for 학교: 14.8638
==> Best Params: {'max_bin': 208, 'eta': 0.010675799344323202, 'max_depth': 4, 'min_child_weight': 8, 'subsample': 0.7265980406508409, 'colsample_bytree': 0.9316657405099804, 'reg_lambda': 0.19106710728010282, 'reg_alpha': 8.4214285256004, 'alpha': 1.7881649754690017}




[I 2025-08-09 18:11:27,410] Trial 0 finished with value: 19.01397705078125 and parameters: {'max_bin': 355, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 19.01397705078125.
[I 2025-08-09 18:11:38,780] Trial 1 finished with value: 18.72513739267985 and parameters: {'max_bin': 284, 'eta': 0.014569264296272803, 'max_depth': 5, 'min_child_weight': 4, 'subsample': 0.8446248482963781, 'colsample_bytree': 0.7872178785109736, 'reg_lambda': 1.221252009164842e-07, 'reg_alpha': 8.825887864630708e-06, 'alpha': 1.9389393998317856}. Best is trial 1 with value: 18.72513739267985.
[I 2025-08-09 18:11:43,549] Trial 2 finished with value: 22.089971860249836 and parameters: {'max_bin': 367, 'eta': 0.19173792487433333, 'max_depth': 9, 'min_child_weight': 12, 'subsample': 0.9455010610588859, 'colsample_bytree': 0.9516999701966498, 'reg_lambda': 0.01927769225463201, 'reg_alpha': 

==> Best SMAPE for Other Buildings: 16.0096
==> Best Params: {'max_bin': 259, 'eta': 0.19623577555050906, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.7143053950399415, 'colsample_bytree': 0.7925780791217174, 'reg_lambda': 0.001561623950572243, 'reg_alpha': 0.028512163063564494, 'alpha': 0.7638670031286156}




[I 2025-08-09 18:13:44,453] Trial 0 finished with value: 41.30334075291952 and parameters: {'max_bin': 262, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 41.30334075291952.
[I 2025-08-09 18:13:56,447] Trial 1 finished with value: 50.967657248179115 and parameters: {'max_bin': 286, 'eta': 0.08651149251496842, 'max_depth': 6, 'min_child_weight': 8, 'subsample': 0.8665354049744429, 'colsample_bytree': 0.6894585766671583, 'reg_lambda': 1.213309122931048e-08, 'reg_alpha': 5.974984260756972e-07, 'alpha': 0.7655279446083251}. Best is trial 0 with value: 41.30334075291952.
[I 2025-08-09 18:14:21,040] Trial 2 finished with value: 46.10870901743571 and parameters: {'max_bin': 274, 'eta': 0.01966376321112241, 'max_depth': 6, 'min_child_weight': 6, 'subsample': 0.6425597172769153, 'colsample_bytree': 0.7379992065694541, 'reg_lambda': 0.0013423150187477299, 'reg_alpha': 

==> Best SMAPE for Apartment: 41.3033
==> Best Params: {'max_bin': 262, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}




[I 2025-08-09 18:17:04,026] Trial 0 finished with value: 11.710269610087076 and parameters: {'max_bin': 197, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 11.710269610087076.
[I 2025-08-09 18:17:14,255] Trial 1 finished with value: 13.11950429280599 and parameters: {'max_bin': 341, 'eta': 0.06680777450210224, 'max_depth': 8, 'min_child_weight': 12, 'subsample': 0.9565329029579748, 'colsample_bytree': 0.6784669870512953, 'reg_lambda': 5.309829198802215e-05, 'reg_alpha': 1.6808446368240206, 'alpha': 0.7224129217346342}. Best is trial 0 with value: 11.710269610087076.
[I 2025-08-09 18:17:24,497] Trial 2 finished with value: 13.343258221944174 and parameters: {'max_bin': 321, 'eta': 0.047197938498129734, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.8685241635354101, 'colsample_bytree': 0.695963266465626, 'reg_lambda': 0.0032600506872322516, 'reg_alpha':

==> Best SMAPE for Research Institute: 10.5107
==> Best Params: {'max_bin': 361, 'eta': 0.12492114970935476, 'max_depth': 6, 'min_child_weight': 9, 'subsample': 0.607584076081988, 'colsample_bytree': 0.9463482986409826, 'reg_lambda': 0.015683869774667184, 'reg_alpha': 0.007455931315826427, 'alpha': 1.1757136650412316}




[I 2025-08-09 18:19:19,998] Trial 0 finished with value: 19.560065905253094 and parameters: {'max_bin': 316, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 19.560065905253094.
[I 2025-08-09 18:19:31,104] Trial 1 finished with value: 20.342753092447918 and parameters: {'max_bin': 235, 'eta': 0.06813765647577352, 'max_depth': 3, 'min_child_weight': 8, 'subsample': 0.6962039684747969, 'colsample_bytree': 0.6640158380051011, 'reg_lambda': 0.15762949535188023, 'reg_alpha': 0.06503556411229038, 'alpha': 1.7791102072308056}. Best is trial 0 with value: 19.560065905253094.
[I 2025-08-09 18:19:45,710] Trial 2 finished with value: 20.53476079305013 and parameters: {'max_bin': 313, 'eta': 0.04170503711525434, 'max_depth': 3, 'min_child_weight': 4, 'subsample': 0.6723523906544641, 'colsample_bytree': 0.7231774737532001, 'reg_lambda': 2.740394278632557e-07, 'reg_alpha': 0

==> Best SMAPE for 백화점: 18.8910
==> Best Params: {'max_bin': 301, 'eta': 0.1492054771400693, 'max_depth': 4, 'min_child_weight': 6, 'subsample': 0.7138159179059449, 'colsample_bytree': 0.9917983254594225, 'reg_lambda': 1.8495437087985195e-05, 'reg_alpha': 0.006786032878493139, 'alpha': 0.9525559650466506}




[I 2025-08-09 18:23:03,446] Trial 0 finished with value: 47.52145195007324 and parameters: {'max_bin': 329, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 47.52145195007324.
[I 2025-08-09 18:23:06,827] Trial 1 finished with value: 48.78933334350586 and parameters: {'max_bin': 208, 'eta': 0.14635876904540007, 'max_depth': 4, 'min_child_weight': 3, 'subsample': 0.8583478880760276, 'colsample_bytree': 0.9524743682574625, 'reg_lambda': 2.985512278494095e-07, 'reg_alpha': 1.2052399380655994, 'alpha': 1.7758721513256002}. Best is trial 0 with value: 47.52145195007324.
[I 2025-08-09 18:23:19,384] Trial 2 finished with value: 40.75875345865885 and parameters: {'max_bin': 272, 'eta': 0.02392406763174502, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.7058761898491525, 'colsample_bytree': 0.6501794187329765, 'reg_lambda': 0.00556252449659903, 'reg_alpha': 0.1268

==> Best SMAPE for IDC(전화국): 40.7588
==> Best Params: {'max_bin': 272, 'eta': 0.02392406763174502, 'max_depth': 8, 'min_child_weight': 3, 'subsample': 0.7058761898491525, 'colsample_bytree': 0.6501794187329765, 'reg_lambda': 0.00556252449659903, 'reg_alpha': 0.12683045665853257, 'alpha': 1.5190853104021955}




[I 2025-08-09 18:25:18,258] Trial 0 finished with value: 14.578986803690592 and parameters: {'max_bin': 343, 'eta': 0.1, 'max_depth': 6, 'min_child_weight': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'reg_lambda': 1.0, 'reg_alpha': 1e-08, 'alpha': 1.0}. Best is trial 0 with value: 14.578986803690592.
[I 2025-08-09 18:26:11,601] Trial 1 finished with value: 16.546918233235676 and parameters: {'max_bin': 325, 'eta': 0.013583010897967448, 'max_depth': 7, 'min_child_weight': 8, 'subsample': 0.6606778742176739, 'colsample_bytree': 0.6348515346675335, 'reg_lambda': 0.8055482605627107, 'reg_alpha': 0.3683192717628896, 'alpha': 0.5570897238721967}. Best is trial 0 with value: 14.578986803690592.
[I 2025-08-09 18:26:23,078] Trial 2 finished with value: 15.319369633992514 and parameters: {'max_bin': 382, 'eta': 0.09515389884447836, 'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.6079890955071071, 'colsample_bytree': 0.9064078367772809, 'reg_lambda': 0.3551669230233678, 'reg_alpha': 0.1

==> Best SMAPE for Public: 13.2819
==> Best Params: {'max_bin': 198, 'eta': 0.1389259264336883, 'max_depth': 3, 'min_child_weight': 6, 'subsample': 0.8321282625335832, 'colsample_bytree': 0.971529650945054, 'reg_lambda': 0.6105069507898042, 'reg_alpha': 0.027505814272511236, 'alpha': 1.0471884412243635}


In [45]:
# 결과 정리
xgb_best_params = pd.DataFrame.from_dict(best_params_dict, orient='index')
xgb_best_params.to_csv("xgb_best_params_found.csv", index=True)
print("\n✅ 저장: xgb_best_params_found.csv")
xgb_best_params


✅ 저장: xgb_best_params_found.csv


Unnamed: 0,max_bin,eta,max_depth,min_child_weight,subsample,colsample_bytree,reg_lambda,reg_alpha,alpha
호텔,299,0.171359,4,4,0.704673,0.880432,5.82253e-06,2.698564e-06,1.329574
Commercial,383,0.141605,9,12,0.810131,0.983139,0.4324914,0.0002013676,0.661304
Hospital,337,0.141396,5,2,0.898258,0.988039,9.92788e-08,0.0004867693,1.026727
학교,208,0.010676,4,8,0.726598,0.931666,0.1910671,8.421429,1.788165
Other Buildings,259,0.196236,3,1,0.714305,0.792578,0.001561624,0.02851216,0.763867
Apartment,262,0.1,6,2,0.8,0.8,1.0,1e-08,1.0
Research Institute,361,0.124921,6,9,0.607584,0.946348,0.01568387,0.007455931,1.175714
백화점,301,0.149205,4,6,0.713816,0.991798,1.849544e-05,0.006786033,0.952556
IDC(전화국),272,0.023924,8,3,0.705876,0.650179,0.005562524,0.1268305,1.519085
Public,198,0.138926,3,6,0.832128,0.97153,0.610507,0.02750581,1.047188


In [46]:
xgb_best_params = pd.read_csv('xgb_best_params_found.csv')
xgb_best_params['building_type'] = type_list
xgb_best_params.set_index('building_type',inplace=True)

In [47]:
xgb_best_params

Unnamed: 0_level_0,Unnamed: 0,max_bin,eta,max_depth,min_child_weight,subsample,colsample_bytree,reg_lambda,reg_alpha,alpha
building_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
호텔,호텔,299,0.171359,4,4,0.704673,0.880432,5.82253e-06,2.698564e-06,1.329574
Commercial,Commercial,383,0.141605,9,12,0.810131,0.983139,0.4324914,0.0002013676,0.661304
Hospital,Hospital,337,0.141396,5,2,0.898258,0.988039,9.92788e-08,0.0004867693,1.026727
학교,학교,208,0.010676,4,8,0.726598,0.931666,0.1910671,8.421429,1.788165
Other Buildings,Other Buildings,259,0.196236,3,1,0.714305,0.792578,0.001561624,0.02851216,0.763867
Apartment,Apartment,262,0.1,6,2,0.8,0.8,1.0,1e-08,1.0
Research Institute,Research Institute,361,0.124921,6,9,0.607584,0.946348,0.01568387,0.007455931,1.175714
백화점,백화점,301,0.149205,4,6,0.713816,0.991798,1.849544e-05,0.006786033,0.952556
IDC(전화국),IDC(전화국),272,0.023924,8,3,0.705876,0.650179,0.005562524,0.1268305,1.519085
Public,Public,198,0.138926,3,6,0.832128,0.97153,0.610507,0.02750581,1.047188


In [None]:
kf = KFold(n_splits = 7,shuffle=True,random_state=RANDOM_SEED)
answer_df = pd.DataFrame(columns=['answer'])
pred_df = pd.DataFrame(columns=['pred'])



for i in type_list:
    
    x = X[(X.building_type == i)]
    y = Y[(Y.building_type == i)]
    X_test = test_X[test_X.building_type==i]
    
    x = pd.get_dummies(x, columns=['building_number'], drop_first=False)
    X_test = pd.get_dummies(X_test, columns=['building_number'], drop_first=False)

    x = x.drop(['building_type'],axis =1)
    X_test = X_test.drop(['building_type'],axis =1)
    y = y['power_consumption']
    x_columns = np.array(x.columns)
    x = np.array(x) ; y = np.array(y)
    
    j = 0
    xgb_fold_smape = []
    answer_list = []
    pred = pd.DataFrame(index = range(0,y.shape[0]), columns=['pred'])
    answer = pd.DataFrame(columns=['answer'])
    
    for train_index, valid_index in kf.split(x):
        j += 1
        
        

        X_train, X_valid = x[train_index], x[valid_index]
        Y_train, Y_valid = y[train_index], y[valid_index]
        Y_train = np.log(Y_train) ; Y_valid = np.log(Y_valid)


        evals = [(X_train,Y_train),(X_valid,Y_valid)]
        xgb_model = XGBRegressor(learning_rate = 0.05,n_estimators = 5000,
                             max_depth = int(xgb_best_params.loc[i]['max_depth']),
                             random_state = RANDOM_SEED,
                             subsample = xgb_best_params.loc[i]['subsample'],
                             colsample_bytree = xgb_best_params.loc[i]['colsample_bytree'],
                             min_child_weight = int(xgb_best_params.loc[i]['min_child_weight']),
                             objective=weighted_mse(xgb_best_params.loc[i]['alpha'])
                             
                             )
        
        xgb_model.fit(X_train, Y_train, early_stopping_rounds = 100,
                       eval_metric = custom_smape, eval_set = evals, verbose = False)
        xgb_pred = xgb_model.predict(X_valid)
        xgb_pred = np.exp(xgb_pred)
        pred['pred'][valid_index] = xgb_pred
        xgb_smape = smape(np.exp(Y_valid),xgb_pred)
        xgb_answer = xgb_model.predict(X_test)
        answer_list.append(np.exp(xgb_answer))
        xgb_fold_smape.append(xgb_smape)
        
        
        if j == 7:
            sorted_idx = xgb_model.feature_importances_.argsort()
            plt.figure(figsize=(8,15))
            plt.barh(x_columns[sorted_idx],  xgb_model.feature_importances_[sorted_idx])
            plt.xlabel('%s model XGB Feature Importance'%(i))
            plt.show()


    type_answer = sum(answer_list) / len(answer_list)
 
    answer.answer = type_answer
    answer_df = pd.concat([answer_df,answer],ignore_index=True)
    pred_df = pd.concat([pred_df,pred],ignore_index=True)

    avg_smape = sum(xgb_fold_smape) / len(xgb_fold_smape)
    print('Building type = %s : XGBRegressor Model SMAPE : %.4f' %(i,avg_smape))
    
    
total_score = smape(Y.power_consumption,pred_df.pred)
print('Total SMAPE : %.4f'%(total_score))

TypeError: XGBModel.fit() got an unexpected keyword argument 'callbacks'

## 정답파일 만들기

In [None]:
answer = pd.read_csv('sample_submission.csv')

In [None]:
answer.answer = answer_df.answer

In [None]:
answer.to_csv('private_재현.csv',index = False)

In [None]:
answer.head()