In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

import warnings
warnings.filterwarnings('ignore')

In [3]:
def NMAE_FUNC(true, pred, S):
    """Return the normalized mean absolute error (NMAE, in %) of a forecast.

    Only rows whose actual generation exceeds 10% of the capacity ``S`` are
    scored. The absolute errors of those rows are summed, divided by ``S``
    and averaged over the number of scored rows.

    Parameters
    ----------
    true : pandas.DataFrame
        Actual generation; the first column is the one that is scored.
    pred : pandas.DataFrame
        Predicted generation; must contain every index label selected from
        ``true`` (rows are looked up by label).
    S : float
        Generation capacity (kW) used for both the threshold and the
        normalization.

    Returns
    -------
    float
        NMAE in percent, or ``nan`` when no row exceeds the threshold
        (the original raised ZeroDivisionError in that case).
    """
    # A 1-D mask (one entry per row) avoids relying on how .loc treats a
    # 2-D (n, 1) boolean array.
    mask = true.iloc[:, 0].to_numpy() > S * 0.1
    true_values = true.loc[mask, :]
    pred_values = pred.loc[true_values.index, :]

    n_scored = len(true_values)
    if n_scored == 0:
        return float('nan')

    abs_err = np.abs(
        true_values.iloc[:, 0].to_numpy() - pred_values.iloc[:, 0].to_numpy()
    )
    return float(100 * abs_err.sum() / S / n_scored)

In [4]:
# Weather data: read every CSV in the folder and stack them into one frame
weather_path = './기상데이터'

# os.listdir also returns hidden files and sub-folders (e.g. .ipynb_checkpoints)
# in arbitrary order, so keep only CSV files and sort them for a stable read order.
weather_files = sorted(
    name for name in os.listdir(weather_path)
    if name.lower().endswith('.csv')
)
weather_list = [
    pd.read_csv(os.path.join(weather_path, name), encoding='cp949')
    for name in weather_files
]

weather = pd.concat(weather_list)
weather.index = pd.to_datetime(weather['일시'])
# Drop the timestamp/station columns and the measurements not used as features
weather = weather.drop(columns=['일시','지점', '지점명','일사(MJ/m2)',
                                '기온(°C)','강수량(mm)','적설(cm)','풍속(m/s)'])
weather = weather.sort_index()
# Missing readings in the remaining columns are replaced with 0
weather = weather.fillna(0)
weather

Unnamed: 0_level_0,습도(%),일조(hr),전운량(10분위)
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-01-01 00:00:00,18.0,0.0,0.0
2020-01-01 01:00:00,26.0,0.0,0.0
2020-01-01 02:00:00,32.0,0.0,0.0
2020-01-01 03:00:00,36.0,0.0,0.0
2020-01-01 04:00:00,39.0,0.0,0.0
...,...,...,...
2023-03-31 19:00:00,83.0,0.0,0.0
2023-03-31 20:00:00,89.0,0.0,0.0
2023-03-31 21:00:00,87.0,0.0,0.0
2023-03-31 22:00:00,84.0,0.0,0.0


In [5]:
# Solar power generation data, indexed by its parsed timestamp column
solar_raw = pd.read_csv('./DATA/태양광 발전량 데이터.csv')
solar_timestamps = pd.to_datetime(solar_raw['일시'])
solar = solar_raw.set_index(solar_timestamps).drop(columns=['일시'])
solar

Unnamed: 0_level_0,AC_출력_1,AC_출력_2
일시,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-01-01 00:00:00,,
2020-01-01 01:00:00,,
2020-01-01 02:00:00,,
2020-01-01 03:00:00,,
2020-01-01 04:00:00,,
...,...,...
2023-03-31 19:00:00,,
2023-03-31 20:00:00,,
2023-03-31 21:00:00,,
2023-03-31 22:00:00,,


In [6]:
# Join weather and solar rows that share the same timestamp index (inner join)
result = weather.merge(solar, left_index=True, right_index=True)
result

Unnamed: 0_level_0,습도(%),일조(hr),전운량(10분위),AC_출력_1,AC_출력_2
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01 00:00:00,18.0,0.0,0.0,,
2020-01-01 01:00:00,26.0,0.0,0.0,,
2020-01-01 02:00:00,32.0,0.0,0.0,,
2020-01-01 03:00:00,36.0,0.0,0.0,,
2020-01-01 04:00:00,39.0,0.0,0.0,,
...,...,...,...,...,...
2023-03-31 19:00:00,83.0,0.0,0.0,,
2023-03-31 20:00:00,89.0,0.0,0.0,,
2023-03-31 21:00:00,87.0,0.0,0.0,,
2023-03-31 22:00:00,84.0,0.0,0.0,,


In [7]:
# Preprocessing for the correlation check: treat missing values as 0,
# then keep only the rows where AC_출력_1 is non-zero
pearson = result.fillna(0)
has_output = pearson['AC_출력_1'] != 0.0
pearson = pearson.loc[has_output]
pearson.head()

Unnamed: 0_level_0,습도(%),일조(hr),전운량(10분위),AC_출력_1,AC_출력_2
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01 07:00:00,49.0,0.0,0.0,3.67875,3.832857
2020-01-01 08:00:00,46.0,0.2,0.0,10.6875,13.486667
2020-01-01 09:00:00,38.0,1.0,0.0,21.321667,25.706666
2020-01-01 10:00:00,35.0,1.0,0.0,28.073333,32.9975
2020-01-01 11:00:00,33.0,1.0,0.0,29.853333,36.303334


In [8]:
# Pearson correlation between every weather feature and every AC output column
tt = ['AC_출력_1', 'AC_출력_2']
# Feature columns = everything except the two output columns.
# (The former bare `pearson_.dropna()` discarded its result and had no effect.)
pearson_ = pearson.drop(columns=tt)
pearson_columns = pearson_.columns

for i in tt:
    print(i)
    for j in pearson_columns:
        # corrcoef returns the 2x2 correlation matrix; element [0, 1] is the
        # coefficient between column i and column j (row [0] alone also
        # carries the trivial 1.0 self-correlation).
        print(j, np.corrcoef(pearson[i], pearson[j])[0, 1])

AC_출력_1
습도(%) [ 1.         -0.32768935]
일조(hr) [1.         0.64786661]
전운량(10분위) [ 1.         -0.36056833]
AC_출력_2
습도(%) [ 1.         -0.33993899]
일조(hr) [1.         0.63611376]
전운량(10분위) [ 1.         -0.36707918]


In [9]:
# LSTM에 필요한 모듈
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR
from xgboost import XGBRegressor
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import LSTM,LeakyReLU,PReLU
import tensorflow as tf
from sklearn.model_selection import train_test_split




In [10]:
# dataset을 만들기 위한 함수
def make_dataset(data, label, window_size=20):
    """Build sliding-window samples for sequence models.

    Sample ``i`` consists of rows ``i .. i + window_size - 1`` of ``data``;
    its target is row ``i + window_size`` of ``label``.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Feature rows in sequence order.
    label : pandas.DataFrame, pandas.Series or array-like
        Targets, positionally aligned with ``data``.
    window_size : int, default 20
        Number of consecutive rows per sample.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels)`` with ``len(data) - window_size`` samples each.
        Both are empty arrays when ``data`` has no more than ``window_size``
        rows.
    """
    # Convert once up front: slicing the DataFrame with .iloc for every window
    # repeats loop-invariant pandas overhead and yields the same values.
    data_arr = np.asarray(data)
    label_arr = np.asarray(label)

    n_windows = len(data_arr) - window_size
    feature_list = [data_arr[i:i + window_size] for i in range(n_windows)]
    label_list = [label_arr[i + window_size] for i in range(n_windows)]
    return np.array(feature_list), np.array(label_list)

In [11]:
# Modelling frame: drop every row that still has a missing value
# (dropna returns a new frame, so `result` itself is left untouched)
raw_data = result.dropna(axis=0, how='any')
raw_data

Unnamed: 0_level_0,습도(%),일조(hr),전운량(10분위),AC_출력_1,AC_출력_2
일시,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-01 07:00:00,49.0,0.0,0.0,3.678750,3.832857
2020-01-01 08:00:00,46.0,0.2,0.0,10.687500,13.486667
2020-01-01 09:00:00,38.0,1.0,0.0,21.321667,25.706666
2020-01-01 10:00:00,35.0,1.0,0.0,28.073333,32.997500
2020-01-01 11:00:00,33.0,1.0,0.0,29.853333,36.303334
...,...,...,...,...,...
2023-03-31 14:00:00,56.0,1.0,5.0,27.386667,29.505833
2023-03-31 15:00:00,65.0,1.0,1.0,17.643334,19.521667
2023-03-31 16:00:00,70.0,1.0,0.0,7.915833,8.749167
2023-03-31 17:00:00,80.0,1.0,0.0,4.352500,4.498333


In [12]:
# Dictionaries that will hold the prediction results (train / test)
y_pred_tr_dict = dict()
y_pred_te_dict = dict()

In [13]:
# Split into train / test by date: rows up to and including `test_date`
# are train, later rows are test
test_date = '2022-12-31 23:00:00'

train = raw_data.loc[raw_data.index <= test_date, :] # 11632 rows in the recorded run
test = raw_data.loc[raw_data.index > test_date, :] # 1069 rows in the recorded run

# Create the output folder first so the cell also works on a fresh checkout
os.makedirs('./data_set', exist_ok=True)
train.to_csv('./data_set/train_data.csv',encoding='utf-8-sig')
test.to_csv('./data_set/test_data.csv',encoding='utf-8-sig')

train.shape,test.shape

((11632, 5), (1069, 5))

In [None]:
import pickle

feature_cols = list(train.drop(columns=['AC_출력_1','AC_출력_2']).columns) # input variables
label_cols = ['AC_출력_1','AC_출력_2'] # output variables

WINDOW_SIZE = 20  # number of consecutive rows per sample passed to make_dataset
SEED = 42         # fixed seed for the validation split and the NumPy/TensorFlow RNGs

np.random.seed(SEED)
tf.random.set_seed(SEED)

# The checkpoint callback and the pickle dump both write into folders that
# may not exist yet; create them up front.
model_path = 'model'
os.makedirs(model_path, exist_ok=True)
os.makedirs('./모델', exist_ok=True)
filename = os.path.join(model_path, 'tmp_checkpoint.h5')


def _evaluate_split(model, y_scaler, features, labels, title):
    """Print metrics, plot observed vs. predicted, return predictions in original units.

    MSE / MAE / R2 are computed between the scaled labels and the raw model
    output; the plot and the returned frame use values mapped back through
    ``y_scaler.inverse_transform``.
    """
    y_pred = np.array(model.predict(features)).reshape(-1, 1)

    print('MSE : ', round(mean_squared_error(labels, y_pred),3))
    print('MAE : ', round(mean_absolute_error(labels, y_pred),3))
    print('R2 : ', round(r2_score(labels, y_pred),3))

    # Inverse-transform once and reuse it for both the plot and the result
    y_pred_ori = y_scaler.inverse_transform(y_pred)

    plt.figure(figsize=(20, 5))
    plt.plot(y_scaler.inverse_transform(labels), label='Observed')
    plt.plot(y_pred_ori, label='Predicted')
    plt.title(title)
    plt.legend()
    plt.grid()
    plt.show()

    return pd.DataFrame(data=y_pred_ori, index=range(len(features)))


for target in label_cols:
    # Preprocessing: select features and the current target column
    train_feature_origin = train[feature_cols]
    train_label_origin = train[[target]]

    test_feature_origin = test[feature_cols]
    test_label_origin = test[[target]]

    # Scalers are fitted on the training rows only and then applied to test
    scaler = MinMaxScaler()
    y_scaler = MinMaxScaler()

    x_train_scaled = scaler.fit_transform(train_feature_origin)
    y_train_scaled = y_scaler.fit_transform(train_label_origin)

    x_test_scaled = scaler.transform(test_feature_origin)
    y_test_scaled = y_scaler.transform(test_label_origin)

    train_feature_df = pd.DataFrame(x_train_scaled,index=train_feature_origin.index)
    train_label_df = pd.DataFrame(y_train_scaled,index=train_label_origin.index)

    test_feature_df = pd.DataFrame(x_test_scaled,index=test_feature_origin.index)
    test_label_df = pd.DataFrame(y_test_scaled,index=test_label_origin.index)

    train_feature, train_label = make_dataset(train_feature_df, train_label_df, WINDOW_SIZE)
    test_feature, test_label = make_dataset(test_feature_df, test_label_df, WINDOW_SIZE)

    # NOTE(review): train_test_split shuffles by default, so validation windows
    # overlap rows with training windows; consider a chronological split.
    x_train, x_valid, y_train, y_valid = train_test_split(
        train_feature, train_label, test_size=0.2, random_state=SEED)

    # LSTM model definition
    model = Sequential()
    model.add(LSTM(32,
                input_shape=(train_feature.shape[1], train_feature.shape[2]),
                activation='relu',
                return_sequences=False)
            )
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer='adam')

    # restore_best_weights: without it the model keeps the weights of the last
    # epoch (up to `patience` epochs after the best val_loss), and the best
    # checkpoint written below would never be used.
    early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

    history = model.fit(x_train, y_train,
                        epochs=200,
                        batch_size=16,
                        validation_data=(x_valid, y_valid),
                        callbacks=[early_stop, checkpoint])

    # Save the trained model.
    # NOTE(review): whether a Keras model can be pickled depends on the
    # installed Keras version; model.save() is the documented API. Pickle is
    # kept so the output file name and format stay unchanged.
    with open(f'./모델/{target}_model.pickle', 'wb') as fw:
        pickle.dump(model, fw)

    # Evaluate on train windows and store predictions (original units)
    y_pred_tr_dict[target] = _evaluate_split(
        model, y_scaler, train_feature, train_label, f'{target} - Train')

    # Evaluate on test windows and store predictions (original units)
    y_pred_te_dict[target] = _evaluate_split(
        model, y_scaler, test_feature, test_label, f'{target} - Test')

In [467]:
# Collect the per-target prediction frames; iterating the train dict keeps
# train and test in the same target order
tr_list = [y_pred_tr_dict[key] for key in y_pred_tr_dict]
te_list = [y_pred_te_dict[key] for key in y_pred_tr_dict]

pred_tr = pd.concat(tr_list, axis=1)
pred_te = pd.concat(te_list, axis=1)

pred_tr.columns = ['예측_발전량_1', '예측_발전량_2']
pred_te.columns = ['예측_발전량_1', '예측_발전량_2']
# Total predicted generation = sum of the two predicted outputs
pred_tr['예측총발전량'] = pred_tr.sum(axis=1)
pred_te['예측총발전량'] = pred_te.sum(axis=1)

# Take explicit copies: adding a column to a chained `[[...]].loc[...]` slice
# raises SettingWithCopyWarning (hidden here by the global warnings filter).
output_cols = ['AC_출력_1', 'AC_출력_2']
true_tr = raw_data.loc[raw_data.index <= test_date, output_cols].copy()
true_te = raw_data.loc[raw_data.index > test_date, output_cols].copy()

true_tr['총발전량'] = true_tr.sum(axis=1)
true_te['총발전량'] = true_te.sum(axis=1)

# Drop the first window_size (20) rows: they are consumed as the first input
# window and have no prediction. Must match the window size used for training.
true_tr = true_tr.iloc[20:]
true_te = true_te.iloc[20:]

# Predictions were stored with a 0..n-1 index; re-label them with timestamps
pred_tr.index = true_tr.index
pred_te.index = true_te.index

S_total = 100 # total installed capacity: 100 kW

# NMAE(%) of the predicted total generation on train
tr_NMAE = NMAE_FUNC(true=true_tr[['총발전량']], pred=pred_tr[['예측총발전량']], S=S_total)
print('Train NMAE(%) :', round(tr_NMAE,3), '%')

# NMAE(%) of the predicted total generation on test
te_NMAE = NMAE_FUNC(true=true_te[['총발전량']], pred=pred_te[['예측총발전량']], S=S_total)
print('Test NMAE(%) :', round(te_NMAE,3), '%')



Train NMAE(%) : 9.58 %
Test NMAE(%) : 9.673 %


In [468]:
# Save the train / test prediction frames
for frame, csv_path in (
    (pred_tr, './예측 결과 제출 양식/Train 예측 결과.csv'),
    (pred_te, './예측 결과 제출 양식/Test 예측 결과.csv'),
):
    frame.to_csv(csv_path, encoding='utf-8-sig')

# Save the train / test NMAE(%) values as a single-row table
nmae_summary = pd.DataFrame({'Train_NMAE(%)': [tr_NMAE], 'Test_NMAE(%)': [te_NMAE]})
nmae_summary.to_csv('./예측 결과 제출 양식/NMAE_결과.csv', index=False, encoding='utf-8-sig')