# Import

In [11]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore') 

# Fixed Random-Seed

In [158]:
def seed_everything(seed):
    """Seed every source of randomness this notebook controls.

    Sets the PYTHONHASHSEED environment variable and seeds both Python's
    built-in ``random`` module and NumPy's global random generator.

    Args:
        seed: Integer seed applied to all of the above.
    """
    # NOTE(review): presumably this only influences child processes, since the
    # running interpreter has already initialised its hashing — verify.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(1004)  # fix the seed

# Load Data

In [159]:
# Read the training set, the test set and the per-building metadata table.
train_df, test_df, building_info = (
    pd.read_csv(csv_path)
    for csv_path in ('./train.csv', './test.csv', './building_info.csv')
)

# Data Preprocessing

1. Building info 데이터를 repeat로 증강
2. 원본 데이터에 결합

In [160]:
# Fill missing values with 0
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

# Expose the time-series structure to the model by splitting the timestamp
# string into month, day and hour-of-day columns.
# NOTE(review): the slice positions assume a 'YYYYMMDD HH' layout for '일시' —
# confirm against the raw data.
datetime_parts = {'month': slice(4, 6), 'day': slice(6, 8), 'time': slice(9, 11)}
for frame in (train_df, test_df):
    for part_name, char_range in datetime_parts.items():
        frame[part_name] = frame['일시'].apply(lambda stamp, rng=char_range: int(stamp[rng]))

In [161]:
# Target column to predict.
train_y = train_df['전력소비량(kWh)']

# Model inputs: drop the identifier and raw timestamp from both sets; for the
# training set also drop the target and the sunshine / solar-radiation columns.
train_x = train_df.drop(
    ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'],
    axis=1,
)
test_x = test_df.drop(['num_date_time', '일시'], axis=1)

In [162]:
# Build one row of building attributes per training row.
# Looking the attributes up by building number (instead of repeating every
# building a hard-coded 2040 times) keeps the rows correctly aligned even if a
# building has a different number of records or the files are ordered differently.
if not train_x['건물번호'].isin(building_info['건물번호']).all():
    raise ValueError('train_x contains building numbers missing from building_info')

repeat_data = (
    train_x[['건물번호']]
    # validate='many_to_one' raises if building_info lists a building twice.
    .merge(building_info, on='건물번호', how='left', validate='many_to_one')
    .drop(columns=['건물번호', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)'])
)
# Share train_x's index so a later column-wise concat lines up row for row.
repeat_data.index = train_x.index

In [163]:
# Attach the building attributes column-wise. pd.concat(axis=1) aligns on the
# index, so a mismatch would silently introduce NaN rows; fail loudly instead.
if not train_x.index.equals(repeat_data.index):
    raise ValueError('train_x and repeat_data must share the same index before concatenation')
# Dropping columns attached by a previous run makes re-executing this cell safe
# (otherwise every re-run would append duplicate columns).
train_x = pd.concat(
    [train_x.drop(columns=repeat_data.columns, errors='ignore'), repeat_data],
    axis=1,
)

In [164]:
# Build one row of building attributes per test row.
# Looking the attributes up by building number (instead of repeating every
# building a hard-coded 168 times) keeps the rows correctly aligned even if a
# building has a different number of records or the files are ordered differently.
if not test_x['건물번호'].isin(building_info['건물번호']).all():
    raise ValueError('test_x contains building numbers missing from building_info')

repeat_data = (
    test_x[['건물번호']]
    # validate='many_to_one' raises if building_info lists a building twice.
    .merge(building_info, on='건물번호', how='left', validate='many_to_one')
    .drop(columns=['건물번호', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)'])
)
# Share test_x's index so a later column-wise concat lines up row for row.
repeat_data.index = test_x.index

In [165]:
# Attach the building attributes column-wise. pd.concat(axis=1) aligns on the
# index, so a mismatch would silently introduce NaN rows; fail loudly instead.
if not test_x.index.equals(repeat_data.index):
    raise ValueError('test_x and repeat_data must share the same index before concatenation')
# Dropping columns attached by a previous run makes re-executing this cell safe
# (otherwise every re-run would append duplicate columns).
test_x = pd.concat(
    [test_x.drop(columns=repeat_data.columns, errors='ignore'), repeat_data],
    axis=1,
)

In [166]:
# Preview the assembled training features (weather readings, calendar parts
# and the attached building attributes).
train_x

Unnamed: 0,건물번호,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time,건물유형,연면적(m2),냉방면적(m2)
0,1,18.6,0.0,0.9,42.0,6,1,0,건물기타,110634.00,39570.00
1,1,18.0,0.0,1.1,45.0,6,1,1,건물기타,110634.00,39570.00
2,1,17.7,0.0,1.5,45.0,6,1,2,건물기타,110634.00,39570.00
3,1,16.7,0.0,1.4,48.0,6,1,3,건물기타,110634.00,39570.00
4,1,18.4,0.0,2.8,43.0,6,1,4,건물기타,110634.00,39570.00
...,...,...,...,...,...,...,...,...,...,...,...
203995,100,23.1,0.0,0.9,86.0,8,24,19,호텔및리조트,57497.84,40035.23
203996,100,22.4,0.0,1.3,86.0,8,24,20,호텔및리조트,57497.84,40035.23
203997,100,21.3,0.0,1.0,92.0,8,24,21,호텔및리조트,57497.84,40035.23
203998,100,21.0,0.0,0.3,94.0,8,24,22,호텔및리조트,57497.84,40035.23


In [167]:
from sklearn.preprocessing import LabelEncoder

# Convert the categorical (string) columns into integer codes
qual_col = ['건물유형']

for col in qual_col:
    le = LabelEncoder()
    # Learn the categories from the training data and encode it in one step.
    train_x[col] = le.fit_transform(train_x[col])

    # Register any category that occurs only in the test set so that
    # transform() does not raise on it.
    unseen = [label for label in np.unique(test_x[col]) if label not in le.classes_]
    if unseen:
        le.classes_ = np.append(le.classes_, unseen)
    test_x[col] = le.transform(test_x[col])
print('Done.')

Done.


# Define Evaluate Function (custom SMAPE)

refer 1. https://stackoverflow.com/questions/54067683/custom-scoring-function-in-sklearn-cross-validate

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import make_scorer

def custom_SMAPE(y_test, y_pred):
    """Symmetric mean absolute percentage error on a 0-100 scale.

    Computes ``mean(|y_test - y_pred| / (|y_test| + |y_pred|)) * 100``.

    A pair where both the actual and the predicted value are 0 is a perfect
    prediction and contributes an error of 0. Without this rule the 0/0 term
    is NaN, which either turns the whole score into NaN (NumPy inputs) or is
    silently skipped (pandas inputs).

    Args:
        y_test: Array-like of actual values.
        y_pred: Array-like of predicted values, broadcastable against ``y_test``.

    Returns:
        The score as a float in [0, 100]; lower is better.
    """
    # Plain float arrays: accepts lists/Series and avoids index alignment surprises.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(actual - predicted)
    scale = np.abs(actual) + np.abs(predicted)
    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0 they were initialised with.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio) * 100

# SMAPE is an error (lower is better), so it is registered with
# greater_is_better=False. scikit-learn then reports the score negated, which
# keeps "higher is better" true if this scorer is ever reused for model selection.
scorer = {'custom': make_scorer(custom_SMAPE, greater_is_better=False)}

def test_func(test_model, X_train, y_train):
    """Print the 5-fold cross-validated average SMAPE of ``test_model``.

    The features are standardised inside a pipeline, so the scaler is fit on
    each training fold only and never sees the held-out fold.

    Args:
        test_model: Unfitted scikit-learn regressor to evaluate.
        X_train: Feature matrix.
        y_train: Target values.

    Returns:
        None. The average score is printed.
    """
    # Local import so this cell does not depend on an import added elsewhere.
    from sklearn.pipeline import make_pipeline

    pipeline = make_pipeline(StandardScaler(), test_model)
    # Only the held-out scores are used, so train scores are not requested.
    cv_result = cross_validate(pipeline, X_train, y_train, scoring=scorer, cv=5, n_jobs=-1)
    # Undo the sign flip applied by the scorer to recover the actual SMAPE.
    avg_smape = round(-np.mean(cv_result['test_custom']), 4)
    print(f"평균 SMAPE :{avg_smape :.4f}")

# Define Regression Model

In [185]:
# Cross-validate a random forest baseline. A fixed random_state (the same seed
# passed to seed_everything) makes the score reproducible; the global NumPy
# seed alone is not guaranteed to reach the parallel workers used by the CV.
test_func(RandomForestRegressor(random_state=1004), train_x, train_y)

KeyboardInterrupt: 

# Inference

In [10]:
# Train the final model on the full training set before predicting. No earlier
# cell defines `model`, so without this step the notebook fails on a fresh
# kernel (Restart & Run All).
if list(train_x.columns) != list(test_x.columns):
    raise ValueError('train_x and test_x must have identical columns in the same order')

# Same seed as seed_everything(1004) so the submission is reproducible.
model = RandomForestRegressor(random_state=1004, n_jobs=-1)
model.fit(train_x, train_y)
preds = model.predict(test_x)

In [11]:
# Load the submission template (one row per num_date_time with a placeholder answer).
submission = pd.read_csv('./sample_submission.csv')
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [12]:
# Write the model predictions into the answer column.
# NOTE(review): assumes preds is in the same row order as the template — confirm.
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2127.2688
1,1_20220825 01,2090.5008
2,1_20220825 02,2009.9712
3,1_20220825 03,1981.9440
4,1_20220825 04,1946.7744
...,...,...
16795,100_20220831 19,893.0712
16796,100_20220831 20,784.7448
16797,100_20220831 21,748.2216
16798,100_20220831 22,654.1728


# Submission

In [13]:
# Save the filled-in template as the submission file, without the DataFrame index.
submission.to_csv('./baseline_submission.csv', index=False)