## 1. 데이터셋 구축

### 1. 라이브러리 불러오기

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torch
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from gplearn.genetic import SymbolicRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import make_scorer
import pandas as pd
import numpy as np
import os 
import pickle

### 2. 경로 설정 및 데이터 불러오기

In [None]:
# Locations of the raw competition CSVs, relative to the notebook's working directory.
train_path = "../data/origin/train.csv"
test_path = "../data/origin/test.csv"
building_path = "../data/origin/building_info.csv"
submission_path = "../data/origin/sample_submission.csv"

# Maps the original Korean column headers to short English identifiers.
# The keys must match the CSV headers exactly (including units in parentheses).
ko2en_dict = {
 '건물번호': 'b_num',
 '일시': 'date',
 '기온(°C)': 'tmp',
 '강수량(mm)': 'rain',
 '풍속(m/s)': 'wind',
 '습도(%)': 'hum',
 '일조(hr)': 'sunshine',
 '일사(MJ/m2)': 'solar',
 '전력소비량(kWh)': 'power_consumption',
 '건물유형': 'b_type',
 '연면적(m2)': 'total_area',
 '냉방면적(m2)': 'cooling_area',
 '태양광용량(kW)': 'solar_capacity',
 'ESS저장용량(kWh)': 'ess_capacity',
 'PCS용량(kW)': 'pcs_capacity',
}

# English building-type labels. They are later paired by position with the
# unique b_type values of the building table, so the order of this list matters.
change_name = ['hotel', 'commercial', 'hospital', 'school', 'etc', 'apart', 'research', 'store', 'idc','public']

# Load the raw tables.
train = pd.read_csv(train_path, encoding='utf-8')
test = pd.read_csv(test_path, encoding='utf-8')
building = pd.read_csv(building_path, encoding='utf-8')

### 3. 시계열 변환 및 건물별 클러스터링

In [57]:
def rename_dataframe_columns(df, mapping_dict):
    """Return a new DataFrame with columns renamed according to ``mapping_dict``.

    Columns that have no entry in the mapping keep their name, and mapping
    keys that match no column are ignored. The input frame is left untouched.
    """
    renamed = df.rename(columns=mapping_dict)
    return renamed.copy()

def add_time(df):
    """Add timestamp-derived columns to ``df`` in place and return the same frame.

    The ``date`` column is parsed with the format ``%Y%m%d %H`` and three
    columns are written:

    * ``datetime`` - the parsed timestamp
    * ``weekday``  - day of week (Monday = 0 ... Sunday = 6)
    * ``time``     - hour of day
    """
    stamps = pd.to_datetime(df['date'], format='%Y%m%d %H')
    df['datetime'] = stamps
    df['weekday'] = stamps.dt.weekday
    df['time'] = stamps.dt.hour
    return df

In [58]:
# Rename the Korean column headers to English identifiers on all three tables.
train_df = rename_dataframe_columns(train, ko2en_dict)
test_df = rename_dataframe_columns(test, ko2en_dict)
building_info_df = rename_dataframe_columns(building, ko2en_dict)

# Derive the datetime / weekday / hour columns from the 'date' strings.
train_df = add_time(train_df)
test_df = add_time(test_df)

# Attach the static building attributes to every hourly row.
# how='left' keeps every train/test row even if a building has no info entry.
train_merge = pd.merge(train_df, building_info_df, on='b_num', how='left')
test_merge = pd.merge(test_df, building_info_df, on='b_num', how='left')

# Replace each raw building-type label with an English name.
# NOTE(review): labels are paired by order of first appearance in
# building_info_df, so this assumes that order lines up with change_name
# (and that there are no more unique types than names) - TODO confirm.
btypes = list(building_info_df['b_type'].unique())
type_map = {bt: change_name[i] for i, bt in enumerate(btypes)}
train_merge['b_type'] = train_merge['b_type'].apply(lambda x : type_map[x])
test_merge['b_type'] = test_merge['b_type'].apply(lambda x : type_map[x])

# Quick visual sanity check of the merged tables.
# `display` is not imported in this file; presumably it is the helper supplied
# by the notebook runtime.
display(train_merge.head())
print(list(train_merge.columns))
display(test_merge.head())
print(list(test_merge.columns))

Unnamed: 0,num_date_time,b_num,date,tmp,rain,wind,hum,sunshine,solar,power_consumption,datetime,weekday,time,b_type,total_area,cooling_area,solar_capacity,ess_capacity,pcs_capacity
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,0.0,0.0,5794.8,2024-06-01 00:00:00,5,0,hotel,82912.71,77586.0,-,-,-
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,0.0,0.0,5591.85,2024-06-01 01:00:00,5,1,hotel,82912.71,77586.0,-,-,-
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,0.0,0.0,5338.17,2024-06-01 02:00:00,5,2,hotel,82912.71,77586.0,-,-,-
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,0.0,0.0,4554.42,2024-06-01 03:00:00,5,3,hotel,82912.71,77586.0,-,-,-
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,0.0,0.0,3602.25,2024-06-01 04:00:00,5,4,hotel,82912.71,77586.0,-,-,-


['num_date_time', 'b_num', 'date', 'tmp', 'rain', 'wind', 'hum', 'sunshine', 'solar', 'power_consumption', 'datetime', 'weekday', 'time', 'b_type', 'total_area', 'cooling_area', 'solar_capacity', 'ess_capacity', 'pcs_capacity']


Unnamed: 0,num_date_time,b_num,date,tmp,rain,wind,hum,datetime,weekday,time,b_type,total_area,cooling_area,solar_capacity,ess_capacity,pcs_capacity
0,1_20240825 00,1,20240825 00,26.5,0.0,0.7,80.0,2024-08-25 00:00:00,6,0,hotel,82912.71,77586.0,-,-,-
1,1_20240825 01,1,20240825 01,26.1,0.0,0.0,80.0,2024-08-25 01:00:00,6,1,hotel,82912.71,77586.0,-,-,-
2,1_20240825 02,1,20240825 02,25.9,0.0,0.3,83.0,2024-08-25 02:00:00,6,2,hotel,82912.71,77586.0,-,-,-
3,1_20240825 03,1,20240825 03,25.7,0.0,1.1,83.0,2024-08-25 03:00:00,6,3,hotel,82912.71,77586.0,-,-,-
4,1_20240825 04,1,20240825 04,25.5,0.0,1.0,86.0,2024-08-25 04:00:00,6,4,hotel,82912.71,77586.0,-,-,-


['num_date_time', 'b_num', 'date', 'tmp', 'rain', 'wind', 'hum', 'datetime', 'weekday', 'time', 'b_type', 'total_area', 'cooling_area', 'solar_capacity', 'ess_capacity', 'pcs_capacity']


## 2. 모델 학습 및 평가

### 1. 유틸 함수 및 모델 변수 설정

In [None]:
def outlier_process(df, threshold=2.0):
    """Smooth abrupt jumps in ``power_consumption`` separately for each building.

    Rows are grouped by ``b_num``. Within a group every interior reading is
    compared with the reading just before it (which may itself already have
    been corrected). When the ratio ``current / previous`` is at least
    ``threshold`` or at most ``1 / threshold``, the reading is replaced by the
    mean of its previous and next neighbours. The first and last reading of a
    group are never modified, and a reading whose predecessor is exactly 0 is
    skipped because the ratio is undefined.

    Args:
        df: Frame containing at least ``b_num`` and ``power_consumption``.
        threshold: Positive ratio at or beyond which a jump counts as an outlier.

    Returns:
        A corrected copy of ``df``; the input frame is not modified.

    Raises:
        ValueError: If ``threshold`` is not strictly positive.
    """
    if threshold <= 0:
        raise ValueError("threshold must be a positive number")
    lower_bound = 1 / threshold  # loop-invariant, computed once

    df = df.copy()
    for _, group in df.groupby("b_num"):
        # An explicit copy guarantees a writable buffer: with pandas
        # Copy-on-Write enabled, to_numpy() may hand back a read-only view.
        vals = group["power_consumption"].to_numpy(copy=True)
        for i in range(1, len(vals) - 1):
            prev = vals[i - 1]
            if prev == 0:
                continue
            ratio = vals[i] / prev
            if ratio >= threshold or ratio <= lower_bound:
                vals[i] = (prev + vals[i + 1]) / 2
        df.loc[group.index, "power_consumption"] = vals
    return df

def is_drop(df, col):
    """Report whether column ``col`` of ``df`` contains nothing but missing values.

    Returns True when every entry is missing, and False as soon as at least
    one real value is present.
    """
    return bool(df[col].isna().all())
    
def convert_day(df, threshold=0.018):
    """Decide whether weekdays and weekends differ enough to be binarised.

    The mean ``power_consumption`` is computed per ``weekday`` value. The
    average of the first five group means and the average of the remaining
    means (divided by 2) are each expressed as a share of the sum of all group
    means. True is returned when the workday share exceeds the weekend share
    by strictly more than ``threshold``.

    The groups are taken by position, so the calculation expects the weekday
    codes to sort as five workdays followed by two weekend days.
    """
    daily_mean = df.groupby(['weekday']).mean(numeric_only=True)['power_consumption']
    total = sum(daily_mean)
    workday_share = (sum(daily_mean.iloc[0:5]) / 5) / total
    holiday_share = (sum(daily_mean.iloc[5:]) / 2) / total
    return bool(workday_share - holiday_share > threshold)
    
def minmax_scale(df: pd.DataFrame, exclude_cols, scaler, fit):
    """Scale every column of ``df`` that is not listed in ``exclude_cols``.

    The frame is modified in place and also returned. When ``fit`` is truthy
    the scaler's ``fit_transform`` is used; otherwise its ``transform`` is
    applied, reusing whatever the scaler learned earlier.
    """
    scaled_cols = [name for name in df.columns if name not in exclude_cols]
    apply_scaler = scaler.fit_transform if fit else scaler.transform
    df[scaled_cols] = apply_scaler(df[scaled_cols])
    return df

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``2 * |pred - true| / (|true| + |pred|)``; a tiny constant in
    the denominator avoids division by zero when both values are 0.
    """
    abs_error = np.abs(y_pred - y_true)
    magnitude = np.abs(y_true) + np.abs(y_pred) + 1e-9
    return 100 * np.mean(2 * abs_error / magnitude)
# Scorer wrapper around smape. The value is negated so that tools which
# maximise a score treat a lower SMAPE as better.
smape_scorer = make_scorer(lambda y_true, y_pred: -smape(y_true, y_pred))

# Default hyper-parameters per model family, keyed by a short model name.
default_params = {
    # Presumably for gplearn's SymbolicRegressor (imported at the top of the
    # file); no model in the training cell reads this entry.
    "SR" : {
        "population_size": 500,
        "generations": 20,
        "tournament_size": 20,
        "stopping_criteria": 0.01,
        "p_crossover": 0.7,
        "p_subtree_mutation": 0.1,
        "p_hoist_mutation": 0.05,
        "p_point_mutation": 0.1,
        "max_samples": 0.9,
        "verbose": 1,
        "parsimony_coefficient": 0.01,
        "random_state": 42
    },
    # Passed to XGBRegressor.
    "XGB": {
        "n_estimators": 1000,
        "learning_rate": 0.05,
        "max_depth": 7,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": 42,
        "n_jobs": -1,
        "verbosity": 0  
    },
    # Passed to LGBMRegressor.
    "LGBM": {
        "n_estimators": 1000,
        "learning_rate": 0.05,
        "max_depth": -1,
        "num_leaves": 63,
        "subsample": 0.8,
        "random_state": 42,
        "n_jobs": -1,
        "verbose": -1   
    },
    # Passed to RandomForestRegressor.
    "RF": {
        "n_estimators": 1000,
        "max_depth": None,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "max_features": "sqrt",
        "random_state": 42,
        "n_jobs": -1
    }
}


### 2. 학습 및 피처 엔지니어링 수행

In [None]:
# Train one model per building type: fit three candidate regressors on an
# 80/20 random split, keep the one with the lowest validation SMAPE, refit it
# on all rows of that type, and predict the matching test rows.
scaler = StandardScaler()  # NOTE(review): not used while the scaling step is disabled
result = {}                # building type -> test predictions for that type
best_score_ever = 0        # running sum of the best validation SMAPE per type

train_merge = outlier_process(train_merge)

# Candidate model factories. They do not depend on the building type, so they
# are defined once; each call returns a fresh, unfitted estimator.
model_builders = {
    "XGB": lambda: XGBRegressor(**default_params["XGB"]),
    "LGBM": lambda: LGBMRegressor(**default_params["LGBM"]),
    "RF": lambda: RandomForestRegressor(**default_params["RF"])
}

for tp in change_name:
    # Columns that were exempt from scaling; only relevant if feature scaling
    # (minmax_scale with `scaler`) is re-enabled, which it currently is not.
    exclude_list = ['time', 'tmp', 'cooling_area']
    drop_cols = ['num_date_time', 'b_num', 'date','datetime', 'b_type', 'total_area','rain', 'sunshine', 'solar']

    train_type_df = train_merge[train_merge['b_type'] == tp].reset_index(drop=True)
    test_type_df = test_merge[test_merge['b_type'] == tp].reset_index(drop=True)

    # Also drop any column that is entirely missing for this building type.
    for col in train_type_df.columns:
        if is_drop(train_type_df, col) and col not in drop_cols:
            drop_cols.append(col)

    # Collapse weekday into workday(0)/weekend(1) when the two differ enough in
    # consumption; otherwise the weekday feature is dropped altogether.
    if convert_day(train_type_df):
        train_type_df['weekday'] = (train_type_df['weekday'] >= 5).astype(int)
        test_type_df['weekday'] = (test_type_df['weekday'] >= 5).astype(int)
        exclude_list.append('weekday')
    else:
        drop_cols.append('weekday')

    # Only the train frame is trimmed here; the test frame is later restricted
    # to the train feature columns.
    train_type_df.drop(drop_cols, axis=1, inplace=True)

    # "-" marks absent capacity values; treat it as 0 and force numeric dtypes.
    train_type_df = train_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')
    test_type_df = test_type_df.replace("-", 0).apply(pd.to_numeric, errors='coerce')

    # Divide cooling_area by 1000 to bring it to a smaller magnitude.
    train_type_df['cooling_area'] = train_type_df['cooling_area'] / 1000
    test_type_df['cooling_area'] = test_type_df['cooling_area'] / 1000

    y_train = train_type_df['power_consumption'].copy()
    X_train = train_type_df.drop(columns=['power_consumption']).copy()
    X_test = test_type_df[X_train.columns]

    # NOTE(review): this is a random split of hourly rows, so validation rows
    # are interleaved in time with training rows.
    X_tr, X_vr, y_tr, y_vr = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    best_model = None
    best_name = None
    best_score = float('inf')

    # Fit and validate the three candidate models.
    print(f"[{tp}]")
    for name, build_model in model_builders.items():
        model = build_model()
        model.fit(X_tr, y_tr)
        val_pred = model.predict(X_vr)
        score = smape(y_vr, val_pred)
        print(f"Model: {name}, Validation SMAPE: {score:.4f}")
        if score < best_score:
            best_score = score
            best_model = model
            best_name = name

    print(f"Best model: {best_name} with SMAPE {best_score:.4f}")
    best_score_ever += best_score
    # Refit the winning model on the full training data of this type.
    best_model.fit(X_train, y_train)

    # Predict the test rows of this type.
    test_pred = best_model.predict(X_test)
    print(f"Test prediction sample: {test_pred[:5]}")
    result[tp] = test_pred
    print("=" * 100)
# Average over however many building types were actually processed.
print(f"average : {(best_score_ever / len(change_name)):.3f}")

  train_type_df = train_type_df.replace("-", 0)
  test_type_df = test_type_df.replace("-", 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 11.2057
Model: LGBM, Validation SMAPE: 10.2986
Model: RF, Validation SMAPE: 13.3219
Best model: LGBM with SMAPE 10.2986
Test prediction sample: [5485.49377852 5460.27561434 4958.53495842 4497.01423325 4521.82998641]


  train_type_df = train_type_df.replace("-", 0)
  test_type_df = test_type_df.replace("-", 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 4.1838
Model: LGBM, Validation SMAPE: 4.0417
Model: RF, Validation SMAPE: 4.4043
Best model: LGBM with SMAPE 4.0417
Test prediction sample: [1381.38355613 1128.06436044 1073.78489879 1004.59345147 1008.58904713]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 4.8799
Model: LGBM, Validation SMAPE: 4.9808
Model: RF, Validation SMAPE: 5.0780
Best model: XGB with SMAPE 4.8799
Test prediction sample: [11691.582 11343.318 11049.075 11155.189 11209.917]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 6.6861
Model: LGBM, Validation SMAPE: 6.6297
Model: RF, Validation SMAPE: 6.7408
Best model: LGBM with SMAPE 6.6297
Test prediction sample: [3717.86279504 3636.86852757 3572.47880072 3532.95523164 3347.72036469]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 13.2629
Model: LGBM, Validation SMAPE: 13.1061
Model: RF, Validation SMAPE: 13.5696
Best model: LGBM with SMAPE 13.1061
Test prediction sample: [4648.27669824 4421.66122764 4751.23214201 4472.39842243 4573.47698109]


  train_type_df = train_type_df.replace("-", 0)
  test_type_df = test_type_df.replace("-", 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 27.0458
Model: LGBM, Validation SMAPE: 24.7387
Model: RF, Validation SMAPE: 37.0278
Best model: LGBM with SMAPE 24.7387
Test prediction sample: [924.20316807 757.2240533  638.64243929 588.98267243 557.51357751]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 9.1656
Model: LGBM, Validation SMAPE: 8.9606
Model: RF, Validation SMAPE: 8.2242
Best model: RF with SMAPE 8.2242
Test prediction sample: [2039.7444  1991.98224 1860.87648 1821.7578  1782.52608]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 9.8563
Model: LGBM, Validation SMAPE: 9.3252
Model: RF, Validation SMAPE: 10.9400
Best model: LGBM with SMAPE 9.3252
Test prediction sample: [533.67283928 481.65681941 477.57496527 478.14738864 460.304802  ]


  train_type_df = train_type_df.replace("-", 0)
  test_type_df = test_type_df.replace("-", 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 2.5360
Model: LGBM, Validation SMAPE: 2.4235
Model: RF, Validation SMAPE: 3.1192
Best model: LGBM with SMAPE 2.4235
Test prediction sample: [9652.73833142 9628.96373071 9584.77393914 9615.2300747  9629.4566777 ]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[target_cols] = scaler.fit_transform(df[target_cols])


Model: XGB, Validation SMAPE: 8.4045
Model: LGBM, Validation SMAPE: 8.3330
Model: RF, Validation SMAPE: 7.8274
Best model: RF with SMAPE 7.8274
Test prediction sample: [1662.84032 1304.08016 1117.0609  1062.84748 1091.74976]


In [None]:
# Assemble the per-type predictions into one vector aligned with test_merge
# and write the submission file.
output_path = "../result/0817/ML_1.csv"

y_out = np.zeros(len(test_merge))
for tp in change_name:
    # A boolean mask addresses rows by position, so the assignment stays
    # correct whatever index labels test_merge carries.
    type_mask = (test_merge['b_type'] == tp).to_numpy()
    y_out[type_mask] = result[tp]

# NOTE(review): assumes the sample submission rows are in the same order as
# the rows of test_merge - TODO confirm.
submission = pd.read_csv(submission_path)
submission['answer'] = y_out

# Create the output directory if needed so that to_csv does not fail.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
submission.to_csv(output_path, index=False)
# Report the file that was actually written.
print(f"저장 완료: {output_path}")

저장 완료: baseline_submission.csv
