In [None]:
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

In [123]:
# Locations of the raw input files, relative to the notebook's working directory.
train_path = "../data/origin/train.csv"
test_path = "../data/origin/test.csv"
building_path = "../data/origin/building_info.csv"
submission_path = "../data/origin/sample_submission.csv"

# Korean column headers of the raw CSVs mapped to short English names.
ko2en_dict = {
    '건물번호': 'b_num',
    '일시': 'date',
    '기온(°C)': 'tmp',
    '강수량(mm)': 'rain',
    '풍속(m/s)': 'wind',
    '습도(%)': 'hum',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar',
    '전력소비량(kWh)': 'power_consumption',
    '건물유형': 'b_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity',
}

# English building-type labels. Later in the notebook they are paired
# positionally with the distinct `b_type` values of the building table, so the
# order of this list matters.
change_name = [
    'hotel', 'commercial', 'hospital', 'school', 'etc',
    'apart', 'research', 'store', 'idc', 'public',
]

# Read the three raw tables in one pass (same order as before: train, test, building).
train, test, building = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (train_path, test_path, building_path)
)

In [124]:
def rename_dataframe_columns(df, mapping_dict):
    """Return an independent copy of ``df`` with its columns renamed.

    Args:
        df: Source DataFrame; it is not modified.
        mapping_dict: ``{old_name: new_name}`` mapping. Columns that are not
            keys of the mapping keep their current name.

    Returns:
        A new DataFrame carrying the renamed columns.
    """
    renamed = df.rename(columns=mapping_dict)
    return renamed.copy()

def add_time(df):
    """Add a ``datetime`` string column derived from the ``date`` column.

    ``date`` is parsed with the format ``%Y%m%d %H`` and re-rendered as
    ``%Y-%m-%d %H``, so the new column holds strings, not timestamps.

    Args:
        df: DataFrame with a ``date`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``datetime`` column.
    """
    parsed = pd.to_datetime(df['date'], format='%Y%m%d %H')
    df['datetime'] = parsed.dt.strftime("%Y-%m-%d %H")
    return df

def outlier_process(df, threshold=2.0):
    """Smooth abrupt jumps in ``power_consumption``, one building at a time.

    Within each ``b_num`` group the values are scanned in row order. An
    interior value whose ratio to its predecessor is ``>= threshold`` or
    ``<= 1 / threshold`` is replaced by the mean of its two neighbours. The
    scan is sequential, so a replaced value becomes the predecessor for the
    next comparison. The first and last value of every group are never
    modified, and a value is skipped when its predecessor is exactly 0.

    Args:
        df: DataFrame with ``b_num`` and ``power_consumption`` columns. It is
            not modified.
        threshold: Ratio above which (or below whose reciprocal) a value is
            treated as an outlier.

    Returns:
        A copy of ``df`` whose ``power_consumption`` column is float and has
        the outliers replaced.
    """
    df = df.copy()
    # Work in float so the neighbour average is never truncated (integer
    # input) and the write-back below never needs to upcast the column.
    df["power_consumption"] = df["power_consumption"].astype(float)
    for _, group in df.groupby("b_num"):
        idx = group.index
        # Explicit writable copy: to_numpy() may return a read-only view
        # (pandas Copy-on-Write), which would make the in-place edits fail.
        vals = group["power_consumption"].to_numpy(dtype=float, copy=True)
        for i in range(1, len(vals) - 1):
            if vals[i-1] == 0:
                # The ratio is undefined for a zero predecessor.
                continue
            ratio = vals[i] / vals[i-1]
            if ratio >= threshold or ratio <= 1/threshold:
                vals[i] = (vals[i-1] + vals[i+1]) / 2
        df.loc[idx, "power_consumption"] = vals
    return df

In [129]:
# Translate the Korean headers of all three raw tables to English names.
train_df, test_df, building_info_df = (
    rename_dataframe_columns(frame, ko2en_dict)
    for frame in (train, test, building)
)

# Add the formatted `datetime` string column to the time-series tables.
train_df = add_time(train_df)
test_df = add_time(test_df)

# Attach the per-building attributes to every time-series row.
train_merge = pd.merge(train_df, building_info_df, on='b_num', how='left')
test_merge = pd.merge(test_df, building_info_df, on='b_num', how='left')

# Map each distinct raw building type to an English label by position.
# NOTE(review): this relies on the order in which the types first appear in
# building_info matching the order of `change_name` — confirm against the data.
btypes = list(building_info_df['b_type'].unique())
type_map = {raw_type: change_name[pos] for pos, raw_type in enumerate(btypes)}
train_merge['b_type'] = train_merge['b_type'].map(lambda raw_type: type_map[raw_type])
test_merge['b_type'] = test_merge['b_type'].map(lambda raw_type: type_map[raw_type])

# Smooth abrupt jumps in the training target, building by building.
train_merge = outlier_process(train_merge)

# "-" cells are replaced with 0 in both tables.
train_merge = train_merge.replace("-", 0)
test_merge = test_merge.replace("-", 0)

# These two columns are removed from the training table only.
train_merge = train_merge.drop(columns=['sunshine', 'solar'])
train_merge.head()

Unnamed: 0,num_date_time,b_num,date,tmp,rain,wind,hum,power_consumption,datetime,b_type,total_area,cooling_area,solar_capacity,ess_capacity,pcs_capacity
0,1_20240601 00,1,20240601 00,18.3,0.0,2.6,82.0,5794.8,2024-06-01 00,hotel,82912.71,77586.0,0,0,0
1,1_20240601 01,1,20240601 01,18.3,0.0,2.7,82.0,5591.85,2024-06-01 01,hotel,82912.71,77586.0,0,0,0
2,1_20240601 02,1,20240601 02,18.1,0.0,2.6,80.0,5338.17,2024-06-01 02,hotel,82912.71,77586.0,0,0,0
3,1_20240601 03,1,20240601 03,18.0,0.0,2.6,81.0,4554.42,2024-06-01 03,hotel,82912.71,77586.0,0,0,0
4,1_20240601 04,1,20240601 04,17.8,0.0,1.3,81.0,3602.25,2024-06-01 04,hotel,82912.71,77586.0,0,0,0


In [137]:
def scalering(df: pd.DataFrame, exclude_cols, scaler, fit):
    """Scale every column of ``df`` that is not listed in ``exclude_cols``.

    Args:
        df: DataFrame to scale. It is modified in place.
        exclude_cols: Column names to leave untouched.
        scaler: Object exposing ``fit_transform`` and ``transform``.
        fit: When truthy, ``scaler.fit_transform`` is used; otherwise
            ``scaler.transform`` is applied with the scaler's existing state.

    Returns:
        The same DataFrame object with the selected columns overwritten.
    """
    target_cols = [col for col in df.columns if col not in exclude_cols]
    apply_scaler = scaler.fit_transform if fit else scaler.transform
    df[target_cols] = apply_scaler(df[target_cols])
    return df

def train_validation_split(df, seq_len, ratio=0.8):
    """Split ``df`` by row position into a training part and a validation part.

    The first ``int(len(df) * ratio)`` rows form the training part. The
    validation part starts ``seq_len`` rows before that boundary, so the two
    parts overlap by up to ``seq_len`` rows.

    Args:
        df: DataFrame to split; rows are taken in their existing order.
        seq_len: Number of rows before the boundary that are also included in
            the validation part.
        ratio: Fraction of rows assigned to the training part.

    Returns:
        ``(train_set, test_set)`` as positional slices of ``df``.
    """
    train_size = int(df.shape[0] * ratio)
    # Clamp at 0: a negative start would be read by iloc as "count from the
    # end" and return only a tail of the frame when train_size < seq_len.
    val_start = max(train_size - seq_len, 0)

    train_set = df.iloc[:train_size]
    test_set = df.iloc[val_start:]
    return train_set, test_set

def make_dataset(data, seq_length):
    """Build sliding-window samples for the LSTM from a row-ordered frame.

    Window ``i`` covers rows ``i .. i + seq_length - 1`` of every column except
    ``power_consumption``. When that column is present, the target of window
    ``i`` is its value at row ``i + seq_length``. ``len(data) - seq_length``
    windows are produced (none when the frame is too short).

    Args:
        data: DataFrame of features, optionally with ``power_consumption``.
        seq_length: Number of consecutive rows per window.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_windows, seq_length, n_features)``
        and ``y`` has shape ``(n_windows, 1)``, or ``y`` is ``None`` when
        ``data`` has no ``power_consumption`` column.
    """
    target_col = 'power_consumption'
    has_target = target_col in data.columns

    # Convert to NumPy once instead of slicing the DataFrame and dropping the
    # target column again on every loop iteration.
    feature_frame = data.drop(columns=[target_col]) if has_target else data
    features = feature_frame.to_numpy()
    n_windows = max(data.shape[0] - seq_length, 0)

    if n_windows > 0:
        dataX = np.stack([features[i:i + seq_length] for i in range(n_windows)])
    else:
        # Keep a 3-D shape even without samples so callers can still read
        # the feature dimension from dataX.shape[2].
        dataX = np.empty((0, seq_length, features.shape[1]), dtype=features.dtype)

    if not has_target:
        return dataX, None

    targets = data[target_col].to_numpy()
    dataY = targets[seq_length:seq_length + n_windows].reshape(-1, 1)
    return dataX, dataY

def smape_loss(y_true, y_pred):
    """Symmetric mean absolute percentage error (SMAPE), in percent.

    Computes ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred| + 1e-9))``.
    The small constant keeps the denominator non-zero when both values are 0.

    Args:
        y_true: Tensor of reference values.
        y_pred: Tensor of predictions.

    Returns:
        A scalar tensor holding the SMAPE.
    """
    numerator = 2 * torch.abs(y_pred - y_true)
    denominator = torch.abs(y_true) + torch.abs(y_pred) + 1e-9
    return 100 * torch.mean(numerator / denominator)

class LSTM(nn.Module):
    """LSTM followed by a linear head applied to the last time step.

    Input is batch-first, ``(batch, seq, input_dim)``; output is
    ``(batch, output_dim)``.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Size of the linear head's output.
        seq_length: Window length. It is stored and used only as the default
            batch dimension in ``reset_hidden_state``.
        layers: Number of stacked LSTM layers.
    """

    def __init__(
            self,input_dim,hidden_dim,output_dim,seq_length,layers
    ):
        super().__init__()
        # Keep the constructor arguments as attributes.
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.seq_length = seq_length
        self.layers = layers

        # Recurrent layer (batch-first) and the linear output head.
        self.lstm = nn.LSTM(
            self.input_dim,
            self.hidden_dim,
            num_layers=self.layers,
            batch_first=True
        )
        self.linear = nn.Linear(self.hidden_dim, self.output_dim, bias=True)

    def reset_hidden_state(self, batch_size=None):
        """Store a zero ``(h_0, c_0)`` pair in ``self.hidden``.

        ``forward`` does not read ``self.hidden``; it lets ``nn.LSTM`` start
        from its default zero state, so this method is optional.

        Args:
            batch_size: Middle dimension of the state tensors, which
                ``nn.LSTM`` expects to be the batch size. When omitted,
                ``seq_length`` is used, matching the shape this method has
                always produced.
        """
        if batch_size is None:
            batch_size = self.seq_length
        # Allocate on the same device/dtype as the weights so the state can be
        # passed to the LSTM after the model has been moved (e.g. to CUDA).
        reference = next(self.parameters())
        state_shape = (self.layers, batch_size, self.hidden_dim)
        self.hidden = (
            torch.zeros(*state_shape, dtype=reference.dtype, device=reference.device),
            torch.zeros(*state_shape, dtype=reference.dtype, device=reference.device)
        )

    def forward(self, x):
        """Run the LSTM over ``x`` and project the last time step's output."""
        x, _ = self.lstm(x)
        # Only the output of the final time step feeds the linear head.
        x = self.linear(x[:,-1])

        return x


In [None]:
# ---- Experiment configuration ----
submission_result = {}   # building type -> 1-D array of test predictions
drop_cols = ['num_date_time', 'b_num', 'date', 'b_type','total_area']
exclude_cols = []        # nothing is excluded from scaling on the test side
seq_length = 24          # rows per input window
batch_size = 50
scaler = StandardScaler()
device = "cuda" if torch.cuda.is_available() else "cpu"
epochs = 50
criterion = nn.MSELoss()

# Fix the seed so weight initialisation is repeatable across runs.
torch.manual_seed(42)

train_dict = {}
test_dict = {}

# One model is trained per building type.
# NOTE(review): each per-type frame holds the rows of every building of that
# type and `b_num` is dropped, so windows built below can span two buildings,
# and the row-wise train/validation split is not a per-building time split.
for name in change_name:
    # Build the per-type frames without in-place edits on a filtered slice.
    train_dict[name] = (
        train_merge[train_merge['b_type'] == name]
        .set_index('datetime')
        .drop(columns=drop_cols)
    )
    test_dict[name] = (
        test_merge[test_merge['b_type'] == name]
        .set_index('datetime')
        .drop(columns=drop_cols)
    )

    # The scaler is fitted on the training features only and reused for test.
    # NOTE(review): the target is left unscaled, so the MSE is in squared raw
    # units (the logged losses are in the 1e7 range).
    train_ex = ['power_consumption']
    train_dict[name] = scalering(train_dict[name],train_ex,scaler, fit = True)
    test_dict[name] = scalering(test_dict[name],exclude_cols,scaler, fit = False)

    tr, vr = train_validation_split(train_dict[name], seq_length)
    X_tr, y_tr = make_dataset(tr,seq_length)
    X_vr, y_vr = make_dataset(vr,seq_length)
    # NOTE(review): this yields len(test rows) - seq_length windows, so there
    # are fewer predictions than test rows for this building type.
    X_test, _ = make_dataset(test_dict[name], seq_length)

    train_X_ts = torch.FloatTensor(X_tr)
    train_y_ts = torch.FloatTensor(y_tr)
    test_X_ts = torch.FloatTensor(X_test)

    val_X_ts = torch.FloatTensor(X_vr)
    val_y_ts = torch.FloatTensor(y_vr)

    dataset = TensorDataset(train_X_ts, train_y_ts)
    dataset_val = TensorDataset(val_X_ts, val_y_ts)

    train_loader = DataLoader(dataset, batch_size=batch_size)
    val_loader = DataLoader(dataset_val, batch_size=batch_size)

    input_dim = train_X_ts.shape[2]
    model = LSTM(
        input_dim = input_dim,
        hidden_dim = 30,
        output_dim = 1,
        seq_length = seq_length,
        layers = 1
    )
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    loss_history = []   # (mean train loss, mean val loss) per epoch
    print(f"[{name} is training]")
    for epoch in range(epochs):
        # -------- Train --------
        model.train()
        train_loss = 0
        train_smape = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_smape += smape_loss(y_batch, outputs).item()

        # -------- Validation --------
        model.eval()
        val_loss = 0
        val_smape = 0
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                outputs = model(X_batch)
                loss = criterion(outputs, y_batch)

                val_loss += loss.item()
                val_smape += smape_loss(y_batch, outputs).item()

        loss_history.append((train_loss/len(train_loader), val_loss/len(val_loader)))
        print(f"[Epoch {epoch+1}/{epochs}] "
            f"Train Loss: {train_loss/len(train_loader):.4f}, "
            f"Train SMAPE: {train_smape/len(train_loader):.4f}, "
            f"Val Loss: {val_loss/len(val_loader):.4f}, "
            f"Val SMAPE: {val_smape/len(val_loader):.4f}")

    # -------- Test inference --------
    model.eval()
    preds = []
    with torch.no_grad():
        for i in range(0, len(test_X_ts), batch_size):
            # Move the batch to the model's device; feeding a CPU tensor to a
            # CUDA model raises a device-mismatch error.
            batch = test_X_ts[i:i+batch_size].to(device)
            output = model(batch)
            preds.extend(output.cpu().numpy())

    preds = np.array(preds).reshape(-1)

    # Store the predictions for this building type.
    submission_result[name] = preds
    print(f"[{name}] Test 예측 shape: {preds.shape}")
    print("=" * 200)

[hotel is training]
[Epoch 1/50] Train Loss: 21747430.8098, Train SMAPE: 189.8904, Val Loss: 3646786.8796, Val SMAPE: 181.6964
[Epoch 2/50] Train Loss: 21202662.7537, Train SMAPE: 172.6993, Val Loss: 3374129.4192, Val SMAPE: 166.1855
[Epoch 3/50] Train Loss: 20697630.8294, Train SMAPE: 158.4260, Val Loss: 3120355.5198, Val SMAPE: 152.3655
[Epoch 4/50] Train Loss: 20215439.5622, Train SMAPE: 146.0171, Val Loss: 2882370.5152, Val SMAPE: 139.8676
[Epoch 5/50] Train Loss: 19752627.9604, Train SMAPE: 135.0550, Val Loss: 2658826.7591, Val SMAPE: 128.4744
[Epoch 6/50] Train Loss: 19307553.5945, Train SMAPE: 125.3058, Val Loss: 2448855.5861, Val SMAPE: 118.0277
[Epoch 7/50] Train Loss: 18879154.0461, Train SMAPE: 116.6614, Val Loss: 2251791.2957, Val SMAPE: 108.4039
[Epoch 8/50] Train Loss: 18466634.9138, Train SMAPE: 109.2015, Val Loss: 2067078.3243, Val SMAPE: 99.5031
[Epoch 9/50] Train Loss: 18069353.0527, Train SMAPE: 102.7587, Val Loss: 1894230.3251, Val SMAPE: 91.2431
[Epoch 10/50] Train

In [None]:
# Load the submission template.
sample_submission = pd.read_csv(submission_path)

# Decide which rows belong to which building type.
if 'b_type' in sample_submission.columns:
    type_labels = sample_submission['b_type']
else:
    # NOTE(review): falls back to the labels of test_merge; this assumes the
    # template rows are in the same order as test.csv — confirm.
    type_labels = test_merge['b_type']

if len(type_labels) != len(sample_submission):
    raise ValueError(
        f"building-type labels ({len(type_labels)} rows) do not line up with "
        f"the submission template ({len(sample_submission)} rows)"
    )

# Predictions are floats; make the target column float up front so the
# assignment below never has to upcast an integer column.
if 'answer' in sample_submission.columns:
    sample_submission['answer'] = sample_submission['answer'].astype(float)

# Fill in the predictions one building type at a time.
for name in change_name:
    type_mask = (type_labels == name).to_numpy()
    type_preds = submission_result[name]
    n_rows = int(type_mask.sum())
    if len(type_preds) != n_rows:
        # Fail with an explicit message instead of pandas' generic
        # length-mismatch error.
        raise ValueError(
            f"[{name}] {len(type_preds)} predictions for {n_rows} submission rows; "
            "the test windows must yield one prediction per row"
        )
    sample_submission.loc[type_mask, 'answer'] = type_preds

# Save the result.
sample_submission.to_csv("../result/0816/LSTM.csv", index=False)
print("LSTM.csv 파일이 저장되었습니다!")

(16296, 24, 13) (16296, 1)
(4080, 24, 13) (4080, 1)
