## 1. 데이터셋 구축

### 1. 라이브러리 불러오기

In [28]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from gplearn.genetic import SymbolicRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, train_test_split, KFold
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt
import koreanize_matplotlib
import pandas as pd
import numpy as np
import os 
import pickle

### 2. 경로 설정 및 데이터 불러오기

In [29]:
train_path = "../data/origin/train.csv"
test_path = "../data/origin/test.csv"
building_path = "../data/origin/building_info.csv"
submission_path = "../data/origin/sample_submission.csv"

ko2en_dict = {
 '건물번호': 'b_num',
 '일시': 'date',
 '기온(°C)': 'tmp',
 '강수량(mm)': 'rain',
 '풍속(m/s)': 'wind',
 '습도(%)': 'hum',
 '일조(hr)': 'sunshine',
 '일사(MJ/m2)': 'solar',
 '전력소비량(kWh)': 'power_consumption',
 '건물유형': 'b_type',
 '연면적(m2)': 'total_area',
 '냉방면적(m2)': 'cooling_area',
 '태양광용량(kW)': 'solar_capacity',
 'ESS저장용량(kWh)': 'ess_capacity',
 'PCS용량(kW)': 'pcs_capacity',
}

change_name = ['hotel', 'commercial', 'hospital', 'school', 'etc', 'apart', 'research', 'store', 'idc','public']

train = pd.read_csv(train_path, encoding='utf-8')
test = pd.read_csv(test_path, encoding='utf-8')
building = pd.read_csv(building_path, encoding='utf-8')

### 3. 시계열 변환 및 건물별 클러스터링

In [30]:
def rename_dataframe_columns(df, mapping_dict):
    return df.rename(columns=mapping_dict).copy()

def add_time(df):
    df['datetime'] = pd.to_datetime(df['date'], format='%Y%m%d %H')
    df['weekday'] = df['datetime'].dt.weekday
    df['time'] = df['datetime'].dt.hour
    
    df['month_day'] = df['datetime'].dt.strftime("%m-%d")
    return df

In [None]:
train_df = rename_dataframe_columns(train, ko2en_dict)
test_df = rename_dataframe_columns(test, ko2en_dict)
building_info_df = rename_dataframe_columns(building, ko2en_dict)

train_df = add_time(train_df)
test_df = add_time(test_df)

train_merge = pd.merge(train_df, building_info_df, on='b_num', how='left')
test_merge = pd.merge(test_df, building_info_df, on='b_num', how='left')

le = LabelEncoder()
all_values = pd.concat([train_merge['month_day'], test_merge['month_day']])
le.fit(all_values)

train_merge['month_day'] = le.transform(train_merge['month_day'])
test_merge['month_day'] = le.transform(test_merge['month_day'])

btypes = list(building_info_df['b_type'].unique())
type_map = {bt: change_name[i] for i, bt in enumerate(btypes)}
train_merge['b_type'] = train_merge['b_type'].apply(lambda x : type_map[x])
test_merge['b_type'] = test_merge['b_type'].apply(lambda x : type_map[x])

display(train_merge.head())
print(list(train_merge.columns))
display(test_merge.head())
print(list(test_merge.columns))

Unnamed: 0,num_date_time,b_num,date,tmp,rain,wind,hum,sunshine,solar,power_consumption,datetime,weekday,time,month_day,b_type,total_area,cooling_area,solar_capacity,ess_capacity,pcs_capacity
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.0,5794.8,2024-06-01 00:00:00,5,0,0,hotel,82912.71,77586.0,-,-,-
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.0,5591.85,2024-06-01 01:00:00,5,1,0,hotel,82912.71,77586.0,-,-,-
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.0,5338.17,2024-06-01 02:00:00,5,2,0,hotel,82912.71,77586.0,-,-,-
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.0,4554.42,2024-06-01 03:00:00,5,3,0,hotel,82912.71,77586.0,-,-,-
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.0,3602.25,2024-06-01 04:00:00,5,4,0,hotel,82912.71,77586.0,-,-,-


['num_date_time', 'b_num', 'date', 'tmp', 'rain', 'wind', 'hum', 'sunshine', 'solar', 'power_consumption', 'datetime', 'weekday', 'time', 'month_day', 'b_type', 'total_area', 'cooling_area', 'solar_capacity', 'ess_capacity', 'pcs_capacity']


Unnamed: 0,num_date_time,b_num,date,tmp,rain,wind,hum,datetime,weekday,time,month_day,b_type,total_area,cooling_area,solar_capacity,ess_capacity,pcs_capacity
0,1_20240825 00,1,20240825 00,26.5,0.0,0.7,80.0,2024-08-25 00:00:00,6,0,85,hotel,82912.71,77586.0,-,-,-
1,1_20240825 01,1,20240825 01,26.1,0.0,0.0,80.0,2024-08-25 01:00:00,6,1,85,hotel,82912.71,77586.0,-,-,-
2,1_20240825 02,1,20240825 02,25.9,0.0,0.3,83.0,2024-08-25 02:00:00,6,2,85,hotel,82912.71,77586.0,-,-,-
3,1_20240825 03,1,20240825 03,25.7,0.0,1.1,83.0,2024-08-25 03:00:00,6,3,85,hotel,82912.71,77586.0,-,-,-
4,1_20240825 04,1,20240825 04,25.5,0.0,1.0,86.0,2024-08-25 04:00:00,6,4,85,hotel,82912.71,77586.0,-,-,-


['num_date_time', 'b_num', 'date', 'tmp', 'rain', 'wind', 'hum', 'datetime', 'weekday', 'time', 'month_day', 'b_type', 'total_area', 'cooling_area', 'solar_capacity', 'ess_capacity', 'pcs_capacity']


## 2. 모델 학습 및 평가

### 1. 유틸 함수 및 모델 변수 설정

In [None]:
def outlier_process(df, threshold=2.0):
    '''이상치 처리 메서드'''
    df = df.copy()
    for key, group in df.groupby("b_num"):
        idx = group.index
        vals = group["power_consumption"].to_numpy()
        for i in range(1, len(vals) - 1):
            if vals[i-1] == 0: 
                continue
            ratio = vals[i] / vals[i-1]
            if ratio >= threshold or ratio <= 1/threshold:
                vals[i] = (vals[i-1] + vals[i+1]) / 2
        df.loc[idx, "power_consumption"] = vals
    return df

def is_drop(df, col):
    '''col 전체가 결측치면 True 하나라도 참값이 있다면 False'''
    if df[col].isna().all():
        return True
    else:
        return False
    
def convert_day(df, threshold=0.018):
    '''주말의 평균 전력사용량과 평일의 전력사용량의 비율 차이가 0.018 이상 차이나면 범주화'''
    gb = df.groupby(['weekday']).mean(numeric_only=True)

    workday = (sum(gb['power_consumption'].iloc[0:5])/5)/sum(gb['power_consumption']) 
    holiday = (sum(gb['power_consumption'].iloc[5:])/2)/sum(gb['power_consumption'])
    if workday - holiday > threshold:
        return True
    else:
        return False
    
def minmax_scale(df: pd.DataFrame, exclude_cols, scaler,fit):
    '''MinMax Scalering 적용'''
    target_cols = [i for i in df.columns if i not in exclude_cols]
    if fit:
        df[target_cols] = scaler.fit_transform(df[target_cols])
    else:
        df[target_cols] = scaler.transform(df[target_cols])
    return df

def smape(y_true, y_pred):
    return 100 * np.mean(
        2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred) + 1e-9)
    )
smape_scorer = make_scorer(lambda y_true, y_pred: -smape(y_true, y_pred))

default_params = {
    "XGB": {
        "n_estimators": 1000,
        "learning_rate": 0.05,
        "max_depth": 7,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42,
        "n_jobs": -1,
        "verbosity": 0  
    },
    "LGBM": {
        "n_estimators": 1000,
        "learning_rate": 0.05,
        "max_depth": -1,
        "num_leaves": 63,
        "subsample": 0.8,
        "random_state": 42,
        "n_jobs": -1,
        "verbose": -1   
    },
    "RF": {
        "n_estimators": 1000,
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "max_features": "sqrt",
        "random_state": 42,
        "n_jobs": -1
    },
    "CatBoost": {
        "iterations": 1000,
        "learning_rate": 0.05,
        "depth": 7,
        "l2_leaf_reg": 3,
        "subsample": 0.8,
        "random_seed": 42,
        "verbose": 0,
        "task_type": "CPU",
        "loss_function": "RMSE"
    }
}

### 2. 학습 및 피처 엔지니어링 수행

In [None]:
scaler = MinMaxScaler()
result = {}
best_score_ever = 0
view_results = {tp: {} for tp in change_name}

train_merge = outlier_process(train_merge)

for tp in change_name:
    exclude_list = ['time', 'tmp', 'month_day']
    drop_cols = ['num_date_time', 'b_num', 'date','datetime', 'b_type','rain', 'sunshine', 'solar']

    train_type_df = train_merge[train_merge['b_type'] == tp].reset_index(drop=True)
    test_type_df = test_merge[test_merge['b_type'] == tp].reset_index(drop=True)

    train_type_df = train_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
    test_type_df = test_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
    
    for col in train_type_df.columns:
        if is_drop(train_type_df, col) and col not in drop_cols:
            drop_cols.append(col)
    
    if convert_day(train_type_df):
        train_type_df['weekday'] = train_type_df['weekday'].apply(lambda x: 0 if x < 5 else 1)
        test_type_df['weekday'] = test_type_df['weekday'].apply(lambda x: 0 if x < 5 else 1)
        exclude_list.append('weekday')
    else:
        drop_cols.append('weekday')

    train_type_df.drop(drop_cols, axis=1, inplace=True)

    train_type_df.fillna(0, inplace=True)
    test_type_df.fillna(0, inplace=True)

    y_train = train_type_df['power_consumption'].copy()
    X_train = train_type_df.drop(columns=['power_consumption']).copy()
    X_test = test_type_df[X_train.columns]

    exclude_train = exclude_list + ['power_consumption']  
    exclude_test = exclude_list                           

    X_train = minmax_scale(X_train, exclude_train, scaler, True)
    X_test = minmax_scale(X_test, exclude_test, scaler, False)

    X_tr, X_vr, y_tr, y_vr = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    model_builders = {
        "XGB": lambda: XGBRegressor(**default_params["XGB"]),
        "LGBM": lambda: LGBMRegressor(**default_params["LGBM"]),
        "RF": lambda: RandomForestRegressor(**default_params["RF"]),
        "CatBoost": lambda: CatBoostRegressor(**default_params["CatBoost"])
    }

    best_model = None
    best_score = float('inf')
    best_name = None

    print(f"[{tp}]")

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    for name, build_model in model_builders.items():
        fold_scores = []
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
            X_tr, X_vr = X_train.iloc[train_idx], X_train.iloc[val_idx]
            y_tr, y_vr = y_train.iloc[train_idx], y_train.iloc[val_idx]

            model = build_model()
            model.fit(X_tr, y_tr)
            val_pred = model.predict(X_vr)
            score = smape(y_vr, val_pred)
            fold_scores.append(score)


        avg_score = np.mean(fold_scores)
        best_fold_score = np.min(fold_scores)
        view_results[tp][name] = avg_score
        # 로그 출력
        print(f"Model: {name}, Best Fold SMAPE: {best_fold_score:.4f}, Average SMAPE: {avg_score:.4f}")

        if avg_score < best_score:
            best_score = avg_score
            best_model = build_model()
            best_name = name

    print(f"Best model: {best_name} with Avg SMAPE {best_score:.4f}")
    best_score_ever += best_score

    # 최적 모델로 전체 train 데이터 재학습
    best_model.fit(X_train, y_train)

    # 테스트 데이터 예측
    test_pred = best_model.predict(X_test)
    train_pred = best_model.predict(X_train) # 시각화 용도
    # print(f"Test prediction sample: {test_pred[:5]}")
    result[tp] = test_pred
    print("=" * 100)

print(f"Average over all building types: {(best_score_ever/10):.3f}")

  train_type_df = train_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
  test_type_df = test_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


[hotel]
Model: XGB, Best Fold SMAPE: 7.0409, Average SMAPE: 7.2819
Model: LGBM, Best Fold SMAPE: 6.7430, Average SMAPE: 6.9411
Model: RF, Best Fold SMAPE: 8.5032, Average SMAPE: 8.7844
Model: CatBoost, Best Fold SMAPE: 7.7605, Average SMAPE: 7.8834
Best model: LGBM with Avg SMAPE 6.9411
[commercial]


  train_type_df = train_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
  test_type_df = test_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


Model: XGB, Best Fold SMAPE: 3.2310, Average SMAPE: 3.2978
Model: LGBM, Best Fold SMAPE: 3.0526, Average SMAPE: 3.2054
Model: RF, Best Fold SMAPE: 4.3356, Average SMAPE: 4.4192
Model: CatBoost, Best Fold SMAPE: 3.4043, Average SMAPE: 3.5068
Best model: LGBM with Avg SMAPE 3.2054


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


[hospital]
Model: XGB, Best Fold SMAPE: 2.9509, Average SMAPE: 3.0839
Model: LGBM, Best Fold SMAPE: 3.1845, Average SMAPE: 3.3642
Model: RF, Best Fold SMAPE: 3.9450, Average SMAPE: 4.1594
Model: CatBoost, Best Fold SMAPE: 3.4546, Average SMAPE: 3.5778
Best model: XGB with Avg SMAPE 3.0839


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


[school]
Model: XGB, Best Fold SMAPE: 3.5455, Average SMAPE: 3.6756
Model: LGBM, Best Fold SMAPE: 3.4689, Average SMAPE: 3.6474
Model: RF, Best Fold SMAPE: 5.1027, Average SMAPE: 5.2750
Model: CatBoost, Best Fold SMAPE: 4.1258, Average SMAPE: 4.2928
Best model: LGBM with Avg SMAPE 3.6474


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


[etc]
Model: XGB, Best Fold SMAPE: 6.8575, Average SMAPE: 6.9831
Model: LGBM, Best Fold SMAPE: 6.6821, Average SMAPE: 6.7761
Model: RF, Best Fold SMAPE: 8.6400, Average SMAPE: 8.7645
Model: CatBoost, Best Fold SMAPE: 8.3712, Average SMAPE: 8.7319
Best model: LGBM with Avg SMAPE 6.7761
[apart]


  train_type_df = train_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
  test_type_df = test_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


Model: XGB, Best Fold SMAPE: 19.9139, Average SMAPE: 20.8535
Model: LGBM, Best Fold SMAPE: 19.4731, Average SMAPE: 20.0332
Model: RF, Best Fold SMAPE: 27.0364, Average SMAPE: 27.5985
Model: CatBoost, Best Fold SMAPE: 24.8372, Average SMAPE: 26.0246
Best model: LGBM with Avg SMAPE 20.0332
[research]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


Model: XGB, Best Fold SMAPE: 5.0540, Average SMAPE: 5.2682
Model: LGBM, Best Fold SMAPE: 5.3967, Average SMAPE: 5.4874
Model: RF, Best Fold SMAPE: 5.9854, Average SMAPE: 6.2840
Model: CatBoost, Best Fold SMAPE: 6.3491, Average SMAPE: 6.5175
Best model: XGB with Avg SMAPE 5.2682


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


[store]
Model: XGB, Best Fold SMAPE: 7.0488, Average SMAPE: 7.3459
Model: LGBM, Best Fold SMAPE: 6.9882, Average SMAPE: 7.2132
Model: RF, Best Fold SMAPE: 8.3634, Average SMAPE: 8.6042
Model: CatBoost, Best Fold SMAPE: 8.5366, Average SMAPE: 8.7106
Best model: LGBM with Avg SMAPE 7.2132


  train_type_df = train_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
  test_type_df = test_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


[idc]
Model: XGB, Best Fold SMAPE: 1.1563, Average SMAPE: 1.1901
Model: LGBM, Best Fold SMAPE: 1.0917, Average SMAPE: 1.1338
Model: RF, Best Fold SMAPE: 1.6247, Average SMAPE: 1.7180
Model: CatBoost, Best Fold SMAPE: 1.3196, Average SMAPE: 1.3610
Best model: LGBM with Avg SMAPE 1.1338


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.transform(df[target_cols])


[public]
Model: XGB, Best Fold SMAPE: 5.8581, Average SMAPE: 6.0813
Model: LGBM, Best Fold SMAPE: 6.1131, Average SMAPE: 6.2777
Model: RF, Best Fold SMAPE: 6.7175, Average SMAPE: 6.9640
Model: CatBoost, Best Fold SMAPE: 6.7559, Average SMAPE: 6.9765
Best model: XGB with Avg SMAPE 6.0813
Average over all building types: 6.338


In [None]:
colors = {"XGB": "red", "LGBM": "blue", "RF": "green", "CatBoost": "black"}

plt.figure(figsize=(14, 7))
x = range(len(change_name))  # 건물 타입 index

for model_name, color in colors.items():
    y = [view_results[tp][model_name] for tp in change_name]
    plt.plot(x, y, marker='o', color=color, label=model_name)

plt.xticks(x, change_name, rotation=45)
plt.xlabel("Building Type")
plt.ylabel("Average SMAPE")
plt.title("SMAPE Comparison by Building Type")
plt.legend()
plt.grid(True, linestyle="--", alpha=0.5)
plt.tight_layout()
plt.show()

In [35]:
y_out = np.zeros(len(test_merge))
for tp in change_name:
    idx = test_merge.index[test_merge['b_type'] == tp].tolist()
    y_out[idx] = result[tp]

submission = pd.read_csv(submission_path)
submission['answer'] = y_out
submission.to_csv("../result/0817/ML_0817_03.csv", index=False)
print("저장 완료: baseline_submission.csv")

저장 완료: baseline_submission.csv
