# Data 개별 처리

In [176]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_squared_log_error as MSLE
from tqdm.auto import tqdm
from prophet.plot import add_changepoints_to_plot, plot_seasonality
from prophet import Prophet
import koreanize_matplotlib

In [108]:

def seed_everything(seed):
    """Seed the random sources this notebook relies on.

    Exports ``PYTHONHASHSEED`` to the process environment and seeds both
    Python's ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Seed value applied to every target.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix all seeds before any data is touched.
seed_everything(42)

# Read the three input tables from the working directory, in the original order.
building_df, train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('building_info.csv', 'train.csv', 'test.csv')
)


In [109]:
# The three capacity columns contain the string '-' as a placeholder
# (presumably "not installed" -- confirm against the data description).
# Replace it with 0 and cast each column to float.
for capacity_col in ('태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)'):
    building_df[capacity_col] = building_df[capacity_col].replace('-', 0).astype(float)

In [110]:
# Attach the per-building attributes to every row of both frames
# (left join on the building number keeps all original rows).
train_df, test_df = (
    hourly_df.merge(building_df, on="건물번호", how="left")
    for hourly_df in (train_df, test_df)
)
# The sunshine and solar-radiation columns are removed from the training frame only.
# The matching drop for test_df was left disabled by the author:
# test_df = test_df.drop(columns = ["일조(hr)" , "일사(MJ/m2)"], axis=1)
train_df = train_df.drop(columns=["일조(hr)", "일사(MJ/m2)"])

In [111]:
# Source column names -> short names used by the rest of the notebook.
# rename() skips keys that are absent from a frame, so one mapping serves both frames.
column_renames = {
    "건물번호": "b_num",
    "일시": "D&T",
    "기온(C)": "temp",
    "강수량(mm)": "precip",
    "풍속(m/s)": "w_s",
    "습도(%)": "hum",
    "전력소비량(kWh)": "power",
    "건물유형": "b_type",
    "연면적(m2)": "f_area",
    "냉방면적(m2)": "c_area",
    "태양광용량(kW)": "SUN_light",
    "ESS저장용량(kWh)": "ESS_save",
    "PCS용량(kW)": "PCS",
}
train_df.rename(columns=column_renames, inplace=True)
test_df.rename(columns=column_renames, inplace=True)

# Fill gaps in wind speed and humidity by linear interpolation.
# NOTE(review): interpolation runs over the whole frame, so a gap at a building
# boundary is filled from the neighbouring building's rows.
train_df["w_s"] = train_df["w_s"].interpolate()
train_df["hum"] = train_df["hum"].interpolate()

# Clamp negative readings to 0 in their own column only.
# A row-only mask (`train_df.loc[mask] = 0`) would overwrite EVERY column of the
# matching rows -- timestamp, building number and target included.
train_df.loc[train_df["w_s"] < 0, "w_s"] = 0
train_df.loc[train_df["hum"] < 0, "hum"] = 0

# Round the two area columns to 4 decimal places before encoding them.
train_df["c_area"] = train_df["c_area"].round(4)
train_df["f_area"] = train_df["f_area"].round(4)

# Replace areas and building type by integer codes. The same encoder object is
# refitted for each column, so afterwards `lf` only remembers the b_type classes.
lf = LabelEncoder()
train_df["c_area"] = lf.fit_transform(train_df["c_area"])
train_df["f_area"] = lf.fit_transform(train_df["f_area"])
train_df["b_type"] = lf.fit_transform(train_df["b_type"])

# Any value still missing becomes 0.
train_df = train_df.fillna(0)

def CDH(xs):
    """Return a trailing windowed sum of ``xs - 26`` for every position.

    For position ``i`` the window ends at ``i`` and reaches back at most 11
    earlier positions (12 values at most); windows near the start are shorter.

    Args:
        xs: 1-D NumPy array of numbers (it is sliced and ``26`` is subtracted
            element-wise).

    Returns:
        NumPy array holding one windowed sum per element of ``xs``.
    """
    window_sums = [
        np.sum(xs[max(0, pos - 11):pos + 1] - 26)
        for pos in range(len(xs))
    ]
    return np.array(window_sums)
def is_weekend(ds):
    """Return 1 when ``ds`` falls on a Saturday or Sunday, otherwise 0.

    Args:
        ds: Anything ``pd.to_datetime`` can convert to a single timestamp.
    """
    moment = pd.to_datetime(ds)
    # weekday() counts Monday as 0, so 5 and 6 are the weekend days.
    return 1 if moment.weekday() >= 5 else 0
def is_noon(ds):
    """Return 1 when the hour of ``ds`` lies in 07..19 inclusive, otherwise 0.

    Despite the name, the flag covers the whole 07:00-19:59 span, not just midday.

    Args:
        ds: Anything ``pd.to_datetime`` can convert to a single timestamp.
    """
    moment = pd.to_datetime(ds)
    return 1 if 7 <= moment.hour < 20 else 0
# Derived features (train data)
train_df['D&T'] = pd.to_datetime(train_df['D&T'])
train_df['day'] = train_df['D&T'].dt.day
train_df['month'] = train_df['D&T'].dt.month
train_df['hour'] = train_df['D&T'].dt.hour
# Vectorised equivalents of is_weekend / is_noon (same 0/1 values, no per-row
# Python call): Saturday/Sunday flag and the 07..19 o'clock flag.
train_df['weekend'] = (train_df['D&T'].dt.weekday >= 5).astype(int)
train_df['noon'] = ((train_df['hour'] >= 7) & (train_df['hour'] < 20)).astype(int)
train_df['weekday'] = train_df['D&T'].dt.weekday
# Encode the hour on a circle so that 23h and 0h end up next to each other.
train_df['sin_time'] = np.sin(2*np.pi*train_df.hour/24)
train_df['cos_time'] = np.cos(2*np.pi*train_df.hour/24)
# Index built from temperature and humidity (formula kept exactly as authored).
train_df['THI'] = 9/5*train_df['temp'] - 0.55*(1-train_df['hum']/100)*(9/5*train_df['temp']-26)+32

# Trailing CDH sum, computed separately inside each building. transform() writes
# the values back by row index, so this does not depend on the frame being sorted
# by building or on there being exactly 100 buildings.
train_df['CDH'] = train_df.groupby('b_num')['temp'].transform(lambda temps: CDH(temps.values))

# K-Means > train data
# One row per building: median power per weekday (7 columns) and per hour (24 columns).
# pivot() is called with keyword arguments; positional arguments are deprecated
# and rejected by pandas 2.x.
by_weekday = (
    train_df.groupby(['b_num', 'weekday'])['power'].median()
    .reset_index()
    .pivot(index='b_num', columns='weekday', values='power')
    .reset_index()
)
by_hour = (
    train_df.groupby(['b_num', 'hour'])['power'].median()
    .reset_index()
    .pivot(index='b_num', columns='hour', values='power')
    .reset_index()
    .drop('b_num', axis=1)
)

df = pd.concat([by_weekday, by_hour], axis=1)
columns = ['b_num'] + ['weekday'+str(i) for i in range(7)] + ['hour'+str(i) for i in range(24)]
df.columns = columns
# Cluster buildings on their 31 median-consumption columns (b_num itself is excluded).
kmeans = KMeans(n_clusters=5, random_state=42)
km_cluster = kmeans.fit_predict(df.iloc[:, 1:])

df_clust = df.copy()
df_clust['km_cluster'] = km_cluster

# Broadcast each building's cluster id to its hourly rows and add one-hot columns.
train_df = train_df.merge(df_clust[['b_num', 'km_cluster']], on='b_num', how='left')
km_d = pd.get_dummies(train_df['km_cluster'], prefix='km', drop_first=False)

train_df = pd.concat([train_df, km_d], axis=1)
# Switch to Prophet's naming convention: timestamp -> ds, target -> y.
train_df.rename(columns={"D&T": "ds", "power": "y"}, inplace=True)

# NOTE(review): b_type already holds integer codes at this point, so this second
# encoding leaves the values unchanged but refits `lf` on the codes.
train_df["b_type"] = lf.fit_transform(train_df["b_type"])
train_df.head()

  by_weekday = train_df.groupby(['b_num','weekday'])['power'].median().reset_index().pivot('b_num','weekday','power').reset_index()
  by_hour = train_df.groupby(['b_num','hour'])['power'].median().reset_index().pivot('b_num','hour','power').reset_index().drop('b_num', axis = 1)


Unnamed: 0,num_date_time,b_num,ds,temp,precip,w_s,hum,y,b_type,f_area,...,sin_time,cos_time,THI,CDH,km_cluster,km_0,km_1,km_2,km_3,km_4
0,1_20220601 00,1,2022-06-01 00:00:00,18.6,0.0,0.9,42.0,1085.28,0,59,...,0.0,1.0,63.09388,-7.4,0,1,0,0,0,0
1,1_20220601 01,1,2022-06-01 01:00:00,18.0,0.0,1.1,45.0,1047.36,0,59,...,0.258819,0.965926,62.464,-15.4,0,1,0,0,0,0
2,1_20220601 02,1,2022-06-01 02:00:00,17.7,0.0,1.5,45.0,974.88,0,59,...,0.5,0.866025,62.08735,-23.7,0,1,0,0,0,0
3,1_20220601 03,1,2022-06-01 03:00:00,16.7,0.0,1.4,48.0,953.76,0,59,...,0.707107,0.707107,60.89884,-33.0,0,1,0,0,0,0
4,1_20220601 04,1,2022-06-01 04:00:00,18.4,0.0,2.8,43.0,986.4,0,59,...,0.866025,0.5,62.88788,-40.6,0,1,0,0,0,0


In [113]:
def get_score_splited_train(model, xtrain, xtest, ytrain, ytest):
    """Print a fitted model's train/test scores and its SMAPE on the test part.

    Args:
        model: Fitted estimator exposing ``score(X, y)`` and ``predict(X)``.
        xtrain: Training features.
        xtest: Test features.
        ytrain: Training targets.
        ytest: Test targets.

    Returns:
        None. The three numbers are only printed; the "ACC" label in the message
        is whatever ``model.score`` returns.
    """
    train_score = model.score(xtrain, ytrain)
    test_score = model.score(xtest, ytest)
    test_smape = SMAPE(ytest, model.predict(xtest))

    print(f"ACC train : {train_score:.4f}, test : {test_score:.4f}, SMAPE : {test_smape:.4f}" )
def SMAPE(true, pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(|true - pred| / (|true| + |pred|)) * 100``. This variant has
    no factor 2 in the numerator, so the result lies between 0 and 100.

    A term where both the target and the prediction are exactly 0 is a perfect
    prediction; it contributes 0 instead of the NaN that 0/0 would produce and
    that would otherwise turn the whole mean into NaN.

    Args:
        true: Array-like of observed values.
        pred: Array-like of predicted values, broadcastable against ``true``.
            Values are compared by position.

    Returns:
        The score as a NumPy float.
    """
    true = np.asarray(true, dtype=float)
    pred = np.asarray(pred, dtype=float)
    abs_error = np.abs(true - pred)
    scale = np.abs(true) + np.abs(pred)
    # Divide only where the denominator is non-zero; the remaining slots keep 0.
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return np.mean(ratio) * 100

## Models

In [125]:
def linear_1(x_train, y_train, x_test, y_test):
    """Fit a ``LinearRegression`` and report its scores and test SMAPE.

    Args:
        x_train: Training features.
        y_train: Training targets.
        x_test: Test features.
        y_test: Test targets.

    Returns:
        Tuple ``(y_pred, smape)``: the sign-corrected test predictions and the
        SMAPE computed on them.
    """
    regressor = LinearRegression()
    regressor.fit(x_train, y_train)
    raw_pred = regressor.predict(x_test)
    # Negative predictions are mirrored to positive values. The author's note cites
    # the MSLE error "cannot be used when targets contain negative values".
    y_pred = np.where(raw_pred < 0, -raw_pred, raw_pred)
    # The two scores come from the model's own (unmirrored) predictions.
    train_score = regressor.score(x_train, y_train)
    test_score = regressor.score(x_test, y_test)
    smape = SMAPE(y_test, y_pred)
    print(f'train_score : {train_score:.4f}, test_score : {test_score:.4f}, SMAPE : {smape:.4f}')
    return y_pred, smape

In [197]:
def decision_1(x_train, y_train, x_test, y_test):
    """Fit a default ``DecisionTreeRegressor`` and report its scores and test SMAPE.

    Args:
        x_train: Training features.
        y_train: Training targets.
        x_test: Test features.
        y_test: Test targets.

    Returns:
        Tuple ``(y_pred, smape)``: the sign-corrected test predictions and the
        SMAPE computed on them.
    """
    regressor = DecisionTreeRegressor()
    regressor.fit(x_train, y_train)
    raw_pred = regressor.predict(x_test)
    # Negative predictions are mirrored to positive values. The author's note cites
    # the MSLE error "cannot be used when targets contain negative values".
    y_pred = np.where(raw_pred < 0, -raw_pred, raw_pred)
    # The two scores come from the model's own (unmirrored) predictions.
    train_score = regressor.score(x_train, y_train)
    test_score = regressor.score(x_test, y_test)
    smape = SMAPE(y_test, y_pred)
    print(f'train_score : {train_score:.4f}, test_score : {test_score:.4f}, SMAPE : {smape:.4f}')
    return y_pred, smape

In [198]:
def RandomF_1(x_train, y_train, x_test, y_test):
    """Fit a default ``RandomForestRegressor`` and report its scores and test SMAPE.

    Args:
        x_train: Training features.
        y_train: Training targets.
        x_test: Test features.
        y_test: Test targets.

    Returns:
        Tuple ``(y_pred, smape)``: the sign-corrected test predictions and the
        SMAPE computed on them.
    """
    regressor = RandomForestRegressor()
    regressor.fit(x_train, y_train)
    raw_pred = regressor.predict(x_test)
    # Negative predictions are mirrored to positive values. The author's note cites
    # the MSLE error "cannot be used when targets contain negative values".
    y_pred = np.where(raw_pred < 0, -raw_pred, raw_pred)
    # The two scores come from the model's own (unmirrored) predictions.
    train_score = regressor.score(x_train, y_train)
    test_score = regressor.score(x_test, y_test)
    smape = SMAPE(y_test, y_pred)
    print(f'train_score : {train_score:.4f}, test_score : {test_score:.4f}, SMAPE : {smape:.4f}')
    return y_pred, smape

In [173]:
# XGB

In [183]:
# CAT


In [202]:
# Empty frames kept for later cells; nothing in this cell fills them.
test_submission = pd.DataFrame(columns=['num_date_time', 'answer'])
holidays = pd.DataFrame(columns=['holiday', 'ds'])

# Per-building hold-out evaluation: the last 168 rows of every building
# (168 = 7 * 24) are held out and predicted by a forest fitted on the rest.
holdout_rows = 168
smape_list = []   # one SMAPE per building
answer_list = []  # hold-out predictions, building after building
num_list = []     # num_date_time ids aligned one-to-one with answer_list
for i in tqdm(range(1,101)):
    building_rows = train_df[train_df["b_num"] == i]
    train = building_rows[:-holdout_rows]
    test = building_rows[-holdout_rows:].reset_index(drop=True)
    # The id, the target and the raw timestamp are not model inputs.
    x_train = train.drop(columns=["num_date_time", "y", "ds"])
    y_train = train["y"]
    x_test = test.drop(columns=["num_date_time", "y", "ds"])
    y_test = test["y"]
    answer, smape_data = RandomF_1(x_train, y_train, x_test, y_test)
    answer_list.extend(answer)
    # Collect the ids of the rows that were actually predicted (the hold-out rows).
    # Taking them from `train` would give a list of a different length that does
    # not correspond to the predictions.
    num_list.extend(test["num_date_time"])
    smape_list.append(smape_data)
# Average over however many buildings were evaluated rather than a fixed 100.
print(f"SMAPE score is {sum(smape_list)/len(smape_list)}")

  0%|          | 0/100 [00:00<?, ?it/s]

train_score : 0.9959, test_score : 0.9482, SMAPE : 3.4899
train_score : 0.9925, test_score : 0.8776, SMAPE : 3.8402
train_score : 0.9854, test_score : 0.8651, SMAPE : 4.8068
train_score : 0.9967, test_score : 0.9645, SMAPE : 2.3334
train_score : 0.9968, test_score : 0.9805, SMAPE : 2.5913
train_score : 0.9979, test_score : 0.9653, SMAPE : 2.2400
train_score : 0.9920, test_score : 0.9149, SMAPE : 3.6649
train_score : 0.9739, test_score : 0.7295, SMAPE : 2.6524
train_score : 0.9982, test_score : 0.9722, SMAPE : 2.1246
train_score : 0.9891, test_score : -0.0244, SMAPE : 5.0295
train_score : 0.9896, test_score : 0.8890, SMAPE : 1.8363
train_score : 0.9942, test_score : 0.8454, SMAPE : 2.4486
train_score : 0.9780, test_score : 0.2215, SMAPE : 3.4052
train_score : 0.9581, test_score : 0.2226, SMAPE : 8.3352
train_score : 0.9910, test_score : 0.9022, SMAPE : 1.5368
train_score : 0.9977, test_score : 0.9768, SMAPE : 2.3528
train_score : 0.9960, test_score : 0.9473, SMAPE : 2.8218
train_score :

In [190]:
# Average of the per-building SMAPE values in smape_list (divides by a fixed 100).
last_smape = sum(smape_list) / 100
print(f"The Last SMAPE score is {last_smape}")

The Last SMAPE score is 3.4293720314802414


In [192]:
# Random-forest baseline score. "Banyla" in the label is presumably "vanilla"
# (default hyperparameters) -- the printed text is kept exactly as authored.
best_smape = sum(smape_list) / 100
print(f"The Best SMAPE score is {best_smape}(Banyla RandomForest)")

The Best SMAPE score is 3.4293720314802414(Banyla RandomForest)


# 데이터 결합 처리

In [0]:
# Feature-importance extraction: build the design matrix and a random hold-out split.
# The model inputs are SELECTED by name. Dropping this list instead would remove
# every feature and leave only num_date_time, ds and the target y in train_x.
feature_cols = [
    'b_num',
    'temp',
    'precip',
    'w_s',
    'hum',
    'b_type',
    'f_area',
    'c_area',
    'SUN_light',
    'ESS_save',
    'PCS',
    'day',
    'month',
    'hour',
    'weekend',
    'noon',
    'weekday',
    'sin_time',
    'cos_time',
    'THI',
    'CDH',
    'km_cluster',
    'km_0',
    'km_1',
    'km_2',
    'km_3',
    'km_4',
]
train_x = train_df[feature_cols]
train_y = train_df['y']
# Explicit seed so the split does not depend on how much global random state
# earlier cells have consumed.
# NOTE(review): this is a random split over hourly rows of all buildings, so
# neighbouring hours of one building can land on both sides of the split.
xtrain, xtest, ytrain, ytest = train_test_split(train_x, train_y, random_state=42)

In [250]:
# Display the column index of the prepared training frame.
train_df.columns

Index(['num_date_time', 'b_num', 'ds', 'temp', 'precip', 'w_s', 'hum', 'y',
       'b_type', 'f_area', 'c_area', 'SUN_light', 'ESS_save', 'PCS', 'day',
       'month', 'hour', 'weekend', 'noon', 'weekday', 'sin_time', 'cos_time',
       'THI', 'CDH', 'km_cluster', 'km_0', 'km_1', 'km_2', 'km_3', 'km_4'],
      dtype='object')

In [201]:
# Random forest with its hyperparameters spelled out, fitted on the random split.
best_rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=42,
)
best_rf.fit(xtrain, ytrain)
# Prints the train/test scores and the SMAPE on the test part.
get_score_splited_train(best_rf, xtrain, xtest, ytrain, ytest)
prediction = best_rf.predict(xtest)
# Not the last expression of the cell, so this value is computed but never shown.
SMAPE(ytest, prediction)
# One row per input column, most important first.
feature_importances = pd.DataFrame(
    {'importance': best_rf.feature_importances_},
    index=xtrain.columns,
).sort_values('importance', ascending=False)
print(feature_importances)

ACC train : 0.9994, test : 0.9956, SMAPE : 2.1337
            importance
km_3          0.380670
km_1          0.312507
km_2          0.094559
noon          0.035857
f_area        0.021027
c_area        0.020925
b_num         0.015429
cos_time      0.012944
hour          0.011429
km_4          0.011308
km_cluster    0.011065
weekday       0.010876
THI           0.010289
SUN_light     0.008487
sin_time      0.008243
b_type        0.007413
weekend       0.007038
km_0          0.005612
CDH           0.004565
day           0.002701
month         0.001848
w_s           0.001426
temp          0.001388
hum           0.001095
PCS           0.000631
ESS_save      0.000546
precip        0.000121


In [193]:
# Linear-regression baseline on the random split; the call prints its scores and
# the cell displays the returned (predictions, SMAPE) tuple.
linear_1(xtrain,ytrain,xtest,ytest)

train_score : 0.8716, test_score : 0.8739, SMAPE : 16.2055


(array([4176.81327936, 2859.88728734, 3427.78612563, ..., 2659.50790586,
         821.11103217, 3219.59140154]),
 16.205465119156678)

In [199]:
# Decision-tree baseline on the random split; the call prints its scores and
# the cell displays the returned (predictions, SMAPE) tuple.
decision_1(xtrain,ytrain,xtest,ytest)

train_score : 1.0000, test_score : 0.9913, SMAPE : 2.7917


(array([ 973.92, 1280.88,  636.72, ..., 2310.3 , 1457.4 , 8815.68]),
 2.7917264263987636)

In [200]:
# Default random forest on the random split; the call prints its scores and
# the cell displays the returned (predictions, SMAPE) tuple.
RandomF_1(xtrain,ytrain,xtest,ytest)

train_score : 0.9993, test_score : 0.9956, SMAPE : 2.1648


(array([ 986.151 , 1206.036 ,  661.122 , ..., 2330.3292, 1595.535 ,
        8816.2416]),
 2.1647580994938456)