# Data 개별 처리

In [219]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_squared_log_error as MSLE
from tqdm.auto import tqdm
from prophet.plot import add_changepoints_to_plot, plot_seasonality
from prophet import Prophet
import koreanize_matplotlib

In [233]:

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)


seed_everything(42)  # Seed 고정
building_df = pd.read_csv('building_info.csv')
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


In [234]:
building_df['태양광용량(kW)'] = building_df['태양광용량(kW)'].replace('-', 0).astype(float)
building_df['ESS저장용량(kWh)'] = building_df['ESS저장용량(kWh)'].replace('-', 0).astype(float)
building_df['PCS용량(kW)'] = building_df['PCS용량(kW)'].replace('-', 0).astype(float)

In [235]:
train_df = train_df.merge(building_df, on="건물번호", how="left")
test_df = test_df.merge(building_df, on="건물번호", how="left")
train_df = train_df.drop(columns = ["일조(hr)" , "일사(MJ/m2)"], axis=1)
# test_df = test_df.drop(columns = ["일조(hr)" , "일사(MJ/m2)"], axis=1)

In [236]:
train_df.rename(columns={"num_date_time" : "num_date_time", "건물번호":"b_num", "일시" : "D&T", "기온(C)" : "temp", "강수량(mm)" : "precip", "풍속(m/s)" : "w_s" , "습도(%)" : "hum", "전력소비량(kWh)":"power", "건물유형":"b_type", "연면적(m2)":"f_area", "냉방면적(m2)":"c_area", "태양광용량(kW)": "SUN_light", "ESS저장용량(kWh)": "ESS_save", "PCS용량(kW)":"PCS" }, inplace=True)
test_df.rename(columns={"num_date_time" : "num_date_time", "건물번호":"b_num", "일시" : "D&T", "기온(C)" : "temp", "강수량(mm)" : "precip", "풍속(m/s)" : "w_s" , "습도(%)" : "hum", "전력소비량(kWh)":"power", "건물유형":"b_type", "연면적(m2)":"f_area", "냉방면적(m2)":"c_area", "태양광용량(kW)": "SUN_light", "ESS저장용량(kWh)": "ESS_save", "PCS용량(kW)":"PCS"} , inplace= True)

# 선형보간 실시
train_df["w_s"] = train_df["w_s"].interpolate()
train_df["hum"] = train_df["hum"].interpolate()
train_df.loc[train_df["w_s"]<0] = 0
train_df.loc[train_df["hum"]<0] = 0
# 소수점 단위 절삭
train_df["c_area"] = train_df["c_area"].round(4)
train_df["f_area"] = train_df["f_area"].round(4)
test_df["c_area"] = test_df["c_area"].round(4)
test_df["f_area"] = test_df["f_area"].round(4)

lf = LabelEncoder()
train_df["c_area"] = lf.fit_transform(train_df["c_area"])
train_df["f_area"] = lf.fit_transform(train_df["f_area"])
train_df["b_type"] = lf.fit_transform(train_df["b_type"])
train_df = train_df.fillna(0)

test_df["c_area"] = lf.fit_transform(test_df["c_area"])
test_df["f_area"] = lf.fit_transform(test_df["f_area"])
test_df["b_type"] = lf.fit_transform(test_df["b_type"])

def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)
def is_weekend(ds):
    date = pd.to_datetime(ds)
    if date.weekday() >= 5:
        return 1
    else:
        return 0
def is_noon(ds):
    date = pd.to_datetime(ds)
    if date.hour >= 7 and date.hour<20:
        return 1
    else:
        return 0
# 파생변수 생성 (train data)
train_df['D&T'] = pd.to_datetime(train_df['D&T'])
# train_df['month'] = train_df['num_date_time'].apply(lambda x : int(x[4:6]))
train_df['day'] = train_df['D&T'].dt.day
train_df['month'] = train_df['D&T'].dt.month
train_df['hour'] = train_df['D&T'].dt.hour
train_df['weekend'] = train_df['D&T'].apply(is_weekend)
train_df['noon'] = train_df['D&T'].apply(is_noon)
train_df['weekday'] = train_df['D&T'].dt.weekday
train_df['sin_time'] = np.sin(2*np.pi*train_df.hour/24)
train_df['cos_time'] = np.cos(2*np.pi*train_df.hour/24)
train_df['THI'] = 9/5*train_df['temp'] - 0.55*(1-train_df['hum']/100)*(9/5*train_df['temp']-26)+32

# 파생변수 생성 (test data)
test_df['D&T'] = pd.to_datetime(test_df['D&T'])
# test_df['month'] = test_df['num_date_time'].apply(lambda x : int(x[4:6]))
test_df['day'] = test_df['D&T'].dt.day
test_df['month'] = test_df['D&T'].dt.month
test_df['hour'] = test_df['D&T'].dt.hour
test_df['weekend'] = test_df['D&T'].apply(is_weekend)
test_df['noon'] = test_df['D&T'].apply(is_noon)
test_df['weekday'] = test_df['D&T'].dt.weekday
test_df['sin_time'] = np.sin(2*np.pi*test_df.hour/24)
test_df['cos_time'] = np.cos(2*np.pi*test_df.hour/24)
test_df['THI'] = 9/5*test_df['temp'] - 0.55*(1-test_df['hum']/100)*(9/5*test_df['temp']-26)+32
# train cdh
cdhs = np.array([])
for num in range(1,101,1):
    temp = train_df[train_df['b_num'] == num]
    cdh = CDH(temp['temp'].values)
    cdhs = np.concatenate([cdhs, cdh])

train_df['CDH'] = cdhs
# train_df.drop("D&T", axis =1 , inplace=True)
# K-Means > train data
by_weekday = train_df.groupby(['b_num','weekday'])['power'].median().reset_index().pivot('b_num','weekday','power').reset_index()
by_hour = train_df.groupby(['b_num','hour'])['power'].median().reset_index().pivot('b_num','hour','power').reset_index().drop('b_num', axis = 1)
# test cdh
cdhs = np.array([])
for num in range(1,101,1):
    temp = test_df[test_df['b_num'] == num]
    cdh = CDH(temp['temp'].values)
    cdhs = np.concatenate([cdhs, cdh])

test_df['CDH'] = cdhs
# train_df.drop("D&T", axis =1 , inplace=True)
# K-Means > train data
# by_weekday = train_df.groupby(['b_num','weekday'])['power'].median().reset_index().pivot('b_num','weekday','power').reset_index()
# by_hour = train_df.groupby(['b_num','hour'])['power'].median().reset_index().pivot('b_num','hour','power').reset_index().drop('b_num', axis = 1)

df = pd.concat([by_weekday, by_hour], axis= 1)
columns = ['b_num'] + ['weekday'+str(i) for i in range(7)] + ['hour'+str(i) for i in range(24)]
df.columns = columns
kmeans = KMeans(n_clusters=5, random_state = 42)
km_cluster = kmeans.fit_predict(df.iloc[:,1:])

df_clust = df.copy()
df_clust['km_cluster'] = km_cluster
# df_clust['km_cluster'] = df_clust['km_cluster'].map({})
# train cluster
train_df = train_df.merge(df_clust[['b_num','km_cluster']], on='b_num', how='left')
km_d = pd.get_dummies(train_df['km_cluster'], prefix='km', drop_first=False)

train_df = pd.concat([train_df, km_d], axis=1)

# test cluster
test_df = test_df.merge(df_clust[['b_num','km_cluster']], on='b_num', how='left')
km_d = pd.get_dummies(test_df['km_cluster'], prefix='km', drop_first=False)

test_df = pd.concat([test_df, km_d], axis=1)
# prophet 형태로 전환
train_df.rename(columns={"D&T": "ds", "power": "y"}, inplace=True)
test_df.rename(columns={"D&T": "ds", "power": "y"}, inplace=True)

test_df.head()

  by_weekday = train_df.groupby(['b_num','weekday'])['power'].median().reset_index().pivot('b_num','weekday','power').reset_index()
  by_hour = train_df.groupby(['b_num','hour'])['power'].median().reset_index().pivot('b_num','hour','power').reset_index().drop('b_num', axis = 1)


Unnamed: 0,num_date_time,b_num,ds,temp,precip,w_s,hum,b_type,f_area,c_area,...,sin_time,cos_time,THI,CDH,km_cluster,km_0,km_1,km_2,km_3,km_4
0,1_20220825 00,1,2022-08-25 00:00:00,23.5,0.0,2.2,72,0,59,35,...,0.0,1.0,71.7898,-2.5,0,1,0,0,0,0
1,1_20220825 01,1,2022-08-25 01:00:00,23.0,0.0,0.9,72,0,59,35,...,0.258819,0.965926,71.0284,-5.5,0,1,0,0,0,0
2,1_20220825 02,1,2022-08-25 02:00:00,22.7,0.0,1.5,75,0,59,35,...,0.5,0.866025,70.81675,-8.8,0,1,0,0,0,0
3,1_20220825 03,1,2022-08-25 03:00:00,22.1,0.0,1.3,78,0,59,35,...,0.707107,0.707107,70.11262,-12.7,0,1,0,0,0,0
4,1_20220825 04,1,2022-08-25 04:00:00,21.8,0.0,1.0,77,0,59,35,...,0.866025,0.5,69.56514,-16.9,0,1,0,0,0,0


In [224]:
def get_score_splited_train(model, xtrain, xtest, ytrain, ytest):
    A = model.score(xtrain, ytrain)
    B = model.score(xtest,ytest)
    pred = model.predict(xtest)
    C = SMAPE(ytest, pred)

    print(f"ACC train : {A:.4f}, test : {B:.4f}, SMAPE : {C:.4f}" )
def SMAPE(true, pred):
    return np.mean((np.abs(true-pred))/(np.abs(true) + np.abs(pred))) * 100

## Models

In [229]:
def linear_1(x_train, y_train, x_test, y_test = None):

    model1 = LinearRegression().fit(x_train, y_train)
    y_pred = model1.predict(x_test)   #  y_pred -> 1D ndarray
    y_pred = np.where(y_pred<0, -y_pred, y_pred)
    if y_test == None:
        return y_pred
    # ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.
    #RMSLE = lambda y_true, y_pred : MSLE(y_true, y_pred) ** 0.5
    A,B,C = model1.score(x_train, y_train), model1.score(x_test, y_test), SMAPE(y_test, y_pred)
    print(f'train_score : {A:.4f}, test_score : {B:.4f}, SMAPE : {C:.4f}')
    return y_pred, C

In [230]:
def decision_1(x_train, y_train, x_test, y_test = None):
    model2 = DecisionTreeRegressor().fit(x_train, y_train)
    y_pred = model2.predict(x_test)   #  y_pred -> 1D ndarray
    y_pred = np.where(y_pred<0, -y_pred, y_pred)
    if y_test == None:
        return y_pred
    # ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.
    #RMSLE = lambda y_true, y_pred : MSLE(y_true, y_pred) ** 0.5
    A,B,C = model2.score(x_train, y_train), model2.score(x_test, y_test), SMAPE(y_test, y_pred)
    print(f'train_score : {A:.4f}, test_score : {B:.4f}, SMAPE : {C:.4f}')
    return y_pred, C

In [246]:
def RandomF_1(x_train, y_train, x_test, y_test= None):
    model3 = RandomForestRegressor(random_state=42, bootstrap=True, max_depth=None, min_samples_leaf=1,
                                min_samples_split=2, n_estimators=200).fit(x_train, y_train)
    y_pred = model3.predict(x_test)   #  y_pred -> 1D ndarray
    y_pred = np.where(y_pred<0, -y_pred, y_pred)
    if y_test == None:
        return y_pred
    # ValueError: Mean Squared Logarithmic Error cannot be used when targets contain negative values.
    #RMSLE = lambda y_true, y_pred : MSLE(y_true, y_pred) ** 0.5
    A,B,C = model3.score(x_train, y_train), model3.score(x_test, y_test), SMAPE(y_test, y_pred)
    print(f'train_score : {A:.4f}, test_score : {B:.4f}, SMAPE : {C:.4f}')
    return y_pred, C

In [173]:
# XGB

In [183]:
# CAT


In [238]:
test_submission = pd.DataFrame(columns=['num_date_time', 'answer'])
# 나머지 0처리

# holidays = pd.DataFrame(columns=['holiday', 'ds'])
# holidays["lower_window"] = 1
# holidays["upper_window"] = 1
smape_list = []
answer_list = []
num_list = []
for i in tqdm(range(1,101)):
    # 한건물당 168개씩 예측한다
    train = train_df.loc[(train_df["b_num"] == i), ]
    test = test_df.loc[(test_df["b_num"] == i), ].reset_index(drop=True)

    x_train = train.drop(columns= ["num_date_time", "y", "ds"])
    y_train = train["y"]
    x_test = test.drop(columns= ["num_date_time", "ds"])

    answer = RandomF_1(x_train,y_train, x_test)

    answer_list.extend(answer)

    num_list.extend(test["num_date_time"])


  0%|          | 0/100 [00:00<?, ?it/s]

In [243]:
test_submission["num_date_time"] = num_list
test_submission["answer"] = answer_list
test_submission.to_csv("results/2023-08-21.csv", index = False)

The Last SMAPE score is 3.4293720314802414


In [192]:
# Banyla RandomForest
print(f"The Best SMAPE score is {sum(smape_list)/100}(Banyla RandomForest)")

The Best SMAPE score is 3.4293720314802414(Banyla RandomForest)


# 데이터 결합 처리

In [244]:
# Feature Importance 추출
train_x = train_df.drop(columns=["y","ds","num_date_time"])
train_y = train_df['y']
xtrain, xtest, ytrain, ytest = train_test_split(train_x, train_y)
best_rf = RandomForestRegressor(random_state=42, bootstrap=True, max_depth=None, min_samples_leaf=1,
                                min_samples_split=2, n_estimators=200)
best_rf.fit(xtrain, ytrain)
get_score_splited_train(best_rf, xtrain, xtest, ytrain, ytest)
prediction = best_rf.predict(xtest)
SMAPE(ytest, prediction)
feature_importances = pd.DataFrame(best_rf.feature_importances_,

                                   index=xtrain.columns,
                                   columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)

ACC train : 0.9994, test : 0.9952, SMAPE : 2.1349
            importance
km_3          0.381904
km_1          0.311694
km_2          0.094021
noon          0.037312
f_area        0.021006
c_area        0.020534
b_num         0.015508
hour          0.011470
cos_time      0.011281
THI           0.010786
weekday       0.010352
km_4          0.010183
km_cluster    0.010011
SUN_light     0.009251
sin_time      0.007713
km_0          0.007657
b_type        0.007398
weekend       0.007264
CDH           0.004740
day           0.002687
month         0.001745
temp          0.001527
w_s           0.001380
hum           0.001221
PCS           0.000644
ESS_save      0.000567
precip        0.000143


In [193]:
linear_1(xtrain,ytrain,xtest,ytest)

train_score : 0.8716, test_score : 0.8739, SMAPE : 16.2055


(array([4176.81327936, 2859.88728734, 3427.78612563, ..., 2659.50790586,
         821.11103217, 3219.59140154]),
 16.205465119156678)

In [199]:
decision_1(xtrain,ytrain,xtest,ytest)

train_score : 1.0000, test_score : 0.9913, SMAPE : 2.7917


(array([ 973.92, 1280.88,  636.72, ..., 2310.3 , 1457.4 , 8815.68]),
 2.7917264263987636)

In [247]:
xtrain = train_df.drop(columns=["y","ds","num_date_time"])
ytrain = train_df['y']
xtest = test_df.drop(columns= ["num_date_time", "ds"])
answer = RandomF_1(xtrain,ytrain,xtest)

In [248]:
test_submission["num_date_time"] = test_df["num_date_time"]
test_submission["answer"] = answer_list
test_submission.to_csv("results/2023-08-21_2.csv", index = False)

In [249]:
test_submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2155.8912
1,1_20220825 01,2172.5712
2,1_20220825 02,1956.8784
3,1_20220825 03,1940.2080
4,1_20220825 04,1972.3680
...,...,...
16795,100_20220831 19,786.7248
16796,100_20220831 20,729.9864
16797,100_20220831 21,693.3552
16798,100_20220831 22,618.5904
