In [136]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import scipy.stats as stats
import matplotlib
matplotlib.rcParams['font.family'] ='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] =False
from sklearn.metrics import mean_absolute_error

In [137]:
df = pd.read_csv('C:/Users/user/github/Data/energy/train.csv', encoding='cp949')

In [138]:
eda_df = df.copy()

In [139]:
eda_df['month'] = 0
eda_df['hour'] = 0
eda_df['year'] = 0
eda_df['day'] = 0
eda_df['요일'] = 0

In [140]:
eda_df.date_time = pd.to_datetime(eda_df.date_time)

In [141]:
days = {0: '월요일', 1: '화요일', 2: '수요일', 3: '목요일', 4: '금요일', 5: '토요일', 6: '일요일'}

In [142]:
eda_df['month'] = eda_df.date_time.dt.month
eda_df['hour'] = eda_df.date_time.dt.hour
eda_df['year'] = eda_df.date_time.dt.year
eda_df['day'] = eda_df.date_time.dt.day
eda_df['요일'] = eda_df['date_time'].dt.weekday.map(days)

In [143]:
input = eda_df.drop(['기온(°C)','풍속(m/s)','습도(%)','강수량(mm)','일조(hr)','비전기냉방설비운영','태양광보유','month'], axis=1)

In [144]:
mean = pd.DataFrame(input.groupby(['num','요일','hour']).mean()['전력사용량(kWh)'])

In [145]:
std = pd.DataFrame(input.groupby(['num','요일','hour']).std()['전력사용량(kWh)'])

In [146]:
total_df = pd.merge(mean, std, how='inner', left_on = ['num','요일','hour'], right_on=['num','요일','hour'])

In [147]:
total_df.reset_index(inplace=True)

In [148]:
total_df.columns = ['num' ,'요일', 'hour', 'mean', 'std']

In [149]:
def get_weekday_indices(index):
    datetime_index = pd.DatetimeIndex(index)
    weekday_indices = datetime_index[datetime_index.weekday < 5]
    return weekday_indices

In [150]:
df_1 = df[df['num']==1]

In [151]:
df_1.date_time = pd.to_datetime(df_1.date_time)

In [152]:
df_1 = df_1.set_index(['date_time'])

In [153]:
weekday = get_weekday_indices(df_1.index)

In [154]:
weekday

DatetimeIndex(['2020-06-01 00:00:00', '2020-06-01 01:00:00',
               '2020-06-01 02:00:00', '2020-06-01 03:00:00',
               '2020-06-01 04:00:00', '2020-06-01 05:00:00',
               '2020-06-01 06:00:00', '2020-06-01 07:00:00',
               '2020-06-01 08:00:00', '2020-06-01 09:00:00',
               ...
               '2020-08-24 14:00:00', '2020-08-24 15:00:00',
               '2020-08-24 16:00:00', '2020-08-24 17:00:00',
               '2020-08-24 18:00:00', '2020-08-24 19:00:00',
               '2020-08-24 20:00:00', '2020-08-24 21:00:00',
               '2020-08-24 22:00:00', '2020-08-24 23:00:00'],
              dtype='datetime64[ns]', name='date_time', length=1464, freq=None)

In [155]:
input_1 = df_1.loc[weekday]

In [156]:
input_1.reset_index(inplace=True)

In [157]:
input_1['hour'] = input_1.date_time.dt.hour

In [158]:
input_1

Unnamed: 0,date_time,num,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,hour
0,2020-06-01 00:00:00,1,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0
1,2020-06-01 01:00:00,1,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1
2,2020-06-01 02:00:00,1,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2
3,2020-06-01 03:00:00,1,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3
4,2020-06-01 04:00:00,1,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...
1459,2020-08-24 19:00:00,1,8714.952,29.4,3.4,66.0,0.0,0.2,0.0,0.0,19
1460,2020-08-24 20:00:00,1,8740.224,28.7,1.9,69.0,0.0,0.0,0.0,0.0,20
1461,2020-08-24 21:00:00,1,8730.504,28.3,1.1,71.0,0.0,0.0,0.0,0.0,21
1462,2020-08-24 22:00:00,1,8725.968,28.3,2.4,72.0,0.0,0.0,0.0,0.0,22


In [159]:
train_data = input_1[:24*30].iloc[:,2]

In [160]:
test_data = input_1[24*30:].iloc[:,2]

In [161]:
from sklearn.ensemble import RandomForestRegressor

In [162]:
input_1

Unnamed: 0,date_time,num,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,hour
0,2020-06-01 00:00:00,1,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0
1,2020-06-01 01:00:00,1,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1
2,2020-06-01 02:00:00,1,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2
3,2020-06-01 03:00:00,1,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3
4,2020-06-01 04:00:00,1,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...
1459,2020-08-24 19:00:00,1,8714.952,29.4,3.4,66.0,0.0,0.2,0.0,0.0,19
1460,2020-08-24 20:00:00,1,8740.224,28.7,1.9,69.0,0.0,0.0,0.0,0.0,20
1461,2020-08-24 21:00:00,1,8730.504,28.3,1.1,71.0,0.0,0.0,0.0,0.0,21
1462,2020-08-24 22:00:00,1,8725.968,28.3,2.4,72.0,0.0,0.0,0.0,0.0,22


In [163]:
train_rf = input_1[:24*30]
test_rf = input_1[24*30:]

In [164]:
train_rf['hour'] = train_rf.date_time.dt.hour
train_rf['month'] = train_rf.date_time.dt.month
train_rf['dayofweek'] = train_rf.date_time.dt.dayofweek

test_rf['hour'] = test_rf.date_time.dt.hour
test_rf['month'] = test_rf.date_time.dt.month
test_rf['dayofweek'] = test_rf.date_time.dt.dayofweek

In [165]:
train_rf

Unnamed: 0,date_time,num,전력사용량(kWh),기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,hour,month,dayofweek
0,2020-06-01 00:00:00,1,8179.056,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,6,0
1,2020-06-01 01:00:00,1,8135.640,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,6,0
2,2020-06-01 02:00:00,1,8107.128,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,6,0
3,2020-06-01 03:00:00,1,8048.808,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,6,0
4,2020-06-01 04:00:00,1,8043.624,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
715,2020-07-10 19:00:00,1,8536.104,26.2,2.7,66.0,0.0,0.1,0.0,0.0,19,7,4
716,2020-07-10 20:00:00,1,8529.624,25.1,1.3,70.0,0.0,0.3,0.0,0.0,20,7,4
717,2020-07-10 21:00:00,1,8519.904,24.0,2.7,70.0,0.0,0.0,0.0,0.0,21,7,4
718,2020-07-10 22:00:00,1,8523.144,23.4,1.9,73.0,0.0,0.0,0.0,0.0,22,7,4


In [166]:
train_rf_x = train_rf.drop(['date_time', 'num', '전력사용량(kWh)'], axis=1)
test_rf_x = train_rf.drop(['date_time', 'num', '전력사용량(kWh)'], axis=1)

train_rf_y = train_rf['전력사용량(kWh)']
test_rf_y = test_rf['전력사용량(kWh)']

In [167]:
from sklearn.tree import DecisionTreeRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [168]:
params = {
    'n_estimators' : [5,50,100],
    'max_depth': [3,5,7],
    'min_samples_leaf':[4,6],
    'min_samples_split':[4,8,16]
}

In [169]:
lg_params = {
    'n_estimators' : [5,10,15],
    'max_depth': [3,5,8],
    'learning_rate' : [0.01, 0.1, 0.3],
    'num_leaves' : [16, 32, 64]
}

In [170]:
rf = RandomForestRegressor()
lg = LGBMRegressor(n_jobs = -1)

In [171]:
train_rf_x

Unnamed: 0,기온(°C),풍속(m/s),습도(%),강수량(mm),일조(hr),비전기냉방설비운영,태양광보유,hour,month,dayofweek
0,17.6,2.5,92.0,0.8,0.0,0.0,0.0,0,6,0
1,17.7,2.9,91.0,0.3,0.0,0.0,0.0,1,6,0
2,17.5,3.2,91.0,0.0,0.0,0.0,0.0,2,6,0
3,17.1,3.2,91.0,0.0,0.0,0.0,0.0,3,6,0
4,17.0,3.3,92.0,0.0,0.0,0.0,0.0,4,6,0
...,...,...,...,...,...,...,...,...,...,...
715,26.2,2.7,66.0,0.0,0.1,0.0,0.0,19,7,4
716,25.1,1.3,70.0,0.0,0.3,0.0,0.0,20,7,4
717,24.0,2.7,70.0,0.0,0.0,0.0,0.0,21,7,4
718,23.4,1.9,73.0,0.0,0.0,0.0,0.0,22,7,4


In [172]:
grid_cv_lg = GridSearchCV(lg, param_grid=lg_params,
                      cv=3, n_jobs=2)
grid_cv_lg.fit(train_rf_x, train_rf_y)

grid_cv_rf = GridSearchCV(rf, param_grid=params,
                      cv=3, n_jobs=2)
grid_cv_rf.fit(train_rf_x, test_rf_x)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 300
[LightGBM] [Info] Number of data points in the train set: 720, number of used features: 8
[LightGBM] [Info] Start training from score 8507.449807


In [173]:
from tensorflow.keras import layers, models

In [174]:
model = models.Sequential()
model.add(layers.Dense(512, activation='relu', input_shape=(train_rf_x.shape[1],)))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(1))

model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [175]:
dnn_his = model.fit(train_rf_x, train_rf_y, epochs=240, batch_size=24, verbose=0)

In [176]:
ann_pred = model.predict(test_rf_x)



In [177]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential

In [178]:
train_rf_x.shape

(720, 10)

In [179]:
# LSTM 모델 정의
lstm_model = Sequential()
lstm_model.add(LSTM(64, input_shape=(train_rf_x.shape[1],1), return_sequences=True))
lstm_model.add(LSTM(128))
lstm_model.add(layers.Dense(1))

# Compile the model
lstm_model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [180]:
lstm_his = lstm_model.fit(train_rf_x.values.reshape(-1,10,1), train_rf_y, epochs=240, batch_size=24, verbose=0)

In [181]:
model_mse = []
model_mae = []
model_mape = []
rf_reuslt = []
lstm_result = []
ann_result = []
lg_result = []
first = True

for i in range(30):
    if first:
        predictions_rf =  grid_cv_rf.best_estimator_.predict(test_rf_x.iloc[:24])
        predictions_lg = grid_cv_lg.best_estimator_.predict(test_rf_x.iloc[:24])
        predictions_ann = model.predict(test_rf_x.iloc[:24])
        predictions_lstm = lstm_model.predict(test_rf_x.iloc[:24].reshape(-1,10,1))
        first = False

    else:

        new_data_x = train_rf_x.iloc[: 24 * (i+1) + 30 * 24, :]
        new_data_y = train_rf_y.iloc[: 24 * (i+1) + 30 * 24, 2]

        input_new_x = new_data_x[['hour', 'month', 'dayofweek']]

        # 모델 학습
        grid_cv_rf.best_estimator_.fit(new_data_x, new_data_y)
        grid_cv_lg.best_estimator_.fit(new_data_x, new_data_y)
        model.fit(new_data_x, new_data_y, epochs=240, batch_size=24, verbose=0)
        lstm_model.fit(new_data_x.reshape(-1,10,1), new_data_y, epochs=240, batch_size=24, verbose=0)


        # 베이지안 업데이트와 동일한 방식으로 테스트 데이터 세트 분리
        test_data_x = test_rf_x.iloc[24 * i: 24 * (i+1)]
        test_data_y = test_rf_y.iloc[24 * i: 24 * (i+1)]

        # 예측 수행
        predictions_rf = grid_cv_rf.best_estimator_.fit.predict(test_data_x)
        predictions_lg = grid_cv_lg.best_estimator_.predict(test_data_x)
        predictions_ann = model.predict(test_data_x)
        predictions_lstm = lstm_model.predict(test_data_x.reshape(-1,10,1))

        # 오차 계산
        mae_rf = mean_absolute_error(test_data_y, predictions_rf)
        mae_lg = mean_absolute_error(test_data_y, predictions_lg)
        mae_ann = mean_absolute_error(test_data_y, predictions_ann)
        mae_lstm = mean_absolute_error(test_data_y, predictions_lstm)


        # 오차 저장
        model_mae.append([mae_rf, mae_lg, mae_ann, mae_lstm])
        
        # model_mape.append([mae_rf, mae_auto_arima])

    rf_reuslt.append(predictions_rf, predictions_lg, predictions_ann, predictions_lstm)




AttributeError: 'DataFrame' object has no attribute 'reshape'