# 필요한 라이브러리 로드

In [16]:
import pandas as pd
import numpy as np

import sklearn
from sklearn import tree
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import  mean_squared_error ,mean_absolute_percentage_error
from sklearn.ensemble import RandomForestRegressor, VotingRegressor 

from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor


# 전처리 된 태양광 데이터 로드

## 태양광 데이터 일 단위분석

In [17]:
df = pd.read_csv('./data/sun.csv') # 시간 단위 데이터 로드
df['일시'] = pd.to_datetime(df['일시'])  # 데이터 타입을 날짜 형식으로 변환
df['년'] = df['일시'].dt.year
df['월'] = df['일시'].dt.month
df['일'] = df['일시'].dt.day
df_1 = df.drop('일시', axis = 1 )
result = df_1.groupby(['년', '월', '일']).agg({
    ' 태양광 발전량(MWh) ': 'sum',
    '기온(°C)': 'mean',
    '풍향(deg)': 'mean',
    '풍속(m/s)': 'mean',
    '강수량(mm)': 'mean',
    '습도(%)': 'mean'
     
}).reset_index() # 시간단위로 기후 데이터는 평균, 풍력 발전량은 총합을 사용

In [18]:
df_1

Unnamed: 0,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%),태양광 발전량(MWh),solar_altitude,_강원도,_경기도,_경상남도,...,_부산시,_울산시,_인천시,_전라남도,_전라북도,_제주도,_충청남도,년,월,일
0,-8.2,0.0,0.3,0.0,77.0,0.24,-71.412722,0,0,0,...,0,0,0,0,1,0,0,2020,1,1
1,-8.6,0.0,0.0,0.0,79.0,0.50,-75.237641,0,0,0,...,0,0,0,0,1,0,0,2020,1,1
2,-7.9,284.8,0.7,0.0,78.0,0.37,-69.496024,0,0,0,...,0,0,0,0,1,0,0,2020,1,1
3,-7.8,215.2,1.4,0.0,80.0,0.05,-59.190126,0,0,0,...,0,0,0,0,1,0,0,2020,1,1
4,-6.7,209.7,1.4,0.0,78.0,0.02,-47.642389,0,0,0,...,0,0,0,0,1,0,0,2020,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271780,3.6,322.3,1.4,0.0,45.0,0.24,-15.177272,0,0,0,...,0,1,0,0,0,0,0,2022,12,31
271781,3.1,296.8,2.7,0.0,46.0,0.43,-26.685467,0,0,0,...,0,1,0,0,0,0,0,2022,12,31
271782,3.0,290.7,2.7,0.0,45.0,0.42,-38.473794,0,0,0,...,0,1,0,0,0,0,0,2022,12,31
271783,2.7,264.6,2.5,0.0,42.0,0.23,-50.283065,0,0,0,...,0,1,0,0,0,0,0,2022,12,31


In [19]:
# Decision Tree
def DT_anaytics(X_train, X_test,y_train, y_test):    
    #Decision Tree 객체 생성
    DT = DecisionTreeRegressor()
    #GridSearch 에 사용할 그리드의 범위 설정
    params = {'max_depth': [5], 'min_samples_split': [100, 1000, 10000, 100000]}
    # 5-fold교차 검증, 모델 평가 지표로 neg_mean_squared_error 사용
    DT_search = GridSearchCV(DT, params, cv=5, scoring='neg_mean_squared_error')
    # GridSearch 의 결과로 사용된 하이퍼 파라미타가 적용된 DT 분석 진행
    DT_search.fit(X_train,y_train) # 학습
    best_clf = DT_search.best_estimator_ # 최적의 하이퍼 파라미터
    y_pred = DT_search.predict(X_test) #예측

    # 성능 지료 rmse, mse, mape사용
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    mape= mean_absolute_percentage_error(y_test, y_pred)
    print(f'''
rmse for Decision Tree :{rmse}
mse for Decision Tree :{mse}
mape for Decision Tree :{mape}
''')

def RF_anaytics(X_train, X_test,y_train, y_test):    
    # RandomForestRegressor객체 생성, 깊이 20
    rfc = RandomForestRegressor(max_depth=20, random_state=24)
    rfc.fit(X_train,y_train) # 학습
    y_pred = rfc.predict(X_test) # 예측

    # 성능 지료 rmse, mse, mape사용
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    mape= mean_absolute_percentage_error(y_test, y_pred)
    print(f'''
rmse for RF_anaytics :{rmse}
mse for RF_anaytics :{mse}
mape for Decision Tree :{mape}
''')


def MLP_anaytics(X_train, X_test,y_train, y_test): 
    # MLPRegressor 객체 생성 은닉층3개, 1500 iterations,30% dorpout 설정
    regr = MLPRegressor(hidden_layer_sizes=(100, 100, 100), max_iter=1500, random_state=42, alpha=0.03)
    regr.fit(X_train, y_train)
    regr.predict(X_test)
    y_pred = regr.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    mape= mean_absolute_percentage_error(y_test, y_pred)
    # 성능 지료 rmse, mse, mape사용
    print(f'''
rmse for MLP_anaytics :{rmse}
mse for MLP_anaytics :{mse}
mape for MLP_anaytics :{mape}
''')
    
    
def ensemble(X_train, X_test, y_train, y_test):
    # Individual regressors
    rf_regressor = RandomForestRegressor(max_depth=20, random_state=24)
    mlp_regressor = MLPRegressor(hidden_layer_sizes=(100, 100, 100), max_iter=1500, random_state=42, alpha=0.03)

    # Ensemble regressor
    ensemble_regressor = VotingRegressor(estimators=[
        ('rf', rf_regressor),
        ('mlp', mlp_regressor)
    ])

    ensemble_regressor.fit(X_train, y_train)
    y_pred = ensemble_regressor.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    print(f'''
rmse for Ensemble :{rmse}
mse for Ensemble :{mse}
mape for Ensemble :{mape}
''')

## 예측결과 성능 지표 출력

In [None]:
predicts = [' 풍력 발전량(MWh) ',' 태양광 발전량(MWh) ']

sun_df1 = result[[' 태양광 발전량(MWh) ', '기온(°C)', '풍향(deg)', '풍속(m/s)','강수량(mm)', '습도(%)']].dropna()

X_train, X_test, y_train, y_test = train_test_split(sun_df1.drop(predicts[1], axis=1),sun_df1[predicts[1]], random_state= 24)
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on your data and transform it
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

DT_anaytics(X_train, X_test,y_train, y_test)
RF_anaytics(X_train, X_test,y_train, y_test)
MLP_anaytics(X_train, X_test,y_train, y_test)
ensemble(X_train, X_test,y_train, y_test)


rmse for Decision Tree :5284.5827760172415
mse for Decision Tree :27926815.11657809
mape for Decision Tree :0.5043212708724814


rmse for RF_anaytics :4507.84740438573
mse for RF_anaytics :20320688.22122717
mape for Decision Tree :0.3760523736651505






rmse for MLP_anaytics :4521.508683566669
mse for MLP_anaytics :20444040.775568787
mape for MLP_anaytics :0.3288731241185372






rmse for Ensemble :4438.050526682119
mse for Ensemble :19696292.477383435
mape for Ensemble :0.35079054035781854


rmse for Stacking Ensemble :4391.371681804705
mse for Stacking Ensemble :19284145.24775628
mape for Stacking Ensemble :0.3129642585303651



## 시간단위 분석 진행

In [22]:
df_hour = pd.read_csv('./data/sun.csv')
df_hour = df_hour.drop(['일시'], axis =1 )

In [24]:
df_hour

Unnamed: 0,기온(°C),풍향(deg),풍속(m/s),강수량(mm),습도(%),태양광 발전량(MWh),solar_altitude,_강원도,_경기도,_경상남도,_경상북도,_부산시,_울산시,_인천시,_전라남도,_전라북도,_제주도,_충청남도
0,-8.2,0.0,0.3,0.0,77.0,0.24,-71.412722,0,0,0,0,0,0,0,0,1,0,0
1,-8.6,0.0,0.0,0.0,79.0,0.50,-75.237641,0,0,0,0,0,0,0,0,1,0,0
2,-7.9,284.8,0.7,0.0,78.0,0.37,-69.496024,0,0,0,0,0,0,0,0,1,0,0
3,-7.8,215.2,1.4,0.0,80.0,0.05,-59.190126,0,0,0,0,0,0,0,0,1,0,0
4,-6.7,209.7,1.4,0.0,78.0,0.02,-47.642389,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
271780,3.6,322.3,1.4,0.0,45.0,0.24,-15.177272,0,0,0,0,0,1,0,0,0,0,0
271781,3.1,296.8,2.7,0.0,46.0,0.43,-26.685467,0,0,0,0,0,1,0,0,0,0,0
271782,3.0,290.7,2.7,0.0,45.0,0.42,-38.473794,0,0,0,0,0,1,0,0,0,0,0
271783,2.7,264.6,2.5,0.0,42.0,0.23,-50.283065,0,0,0,0,0,1,0,0,0,0,0


In [25]:
def DT_anaytics(X_train, X_test,y_train, y_test):    
    DT = DecisionTreeRegressor()
    params = {'max_depth': [5], 'min_samples_split': [100, 1000, 10000, 100000]}
    DT_search = GridSearchCV(DT, params, cv=5, scoring='neg_mean_squared_error')
    DT_search.fit(X_train,y_train)
    best_clf = DT_search.best_estimator_
    y_pred = DT_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)
# Calculate MAPE
    print(f'''
rmse for Decision Tree :{rmse}
mse for Decision Tree :{mse}
''')

    
def RF_analytics(X_train, X_test, y_train, y_test):
    # RandomForestRegressor 모델 정의
    rfc = RandomForestRegressor(random_state=24)

    # 탐색할 하이퍼파라미터 그리드 정의
    param_grid = {'max_depth': [10,15,20]}

    # GridSearchCV 객체 생성
    grid_search = GridSearchCV(rfc, param_grid, cv=5, scoring='neg_mean_squared_error')
    
    # Grid Search를 사용하여 모델 훈련
    grid_search.fit(X_train, y_train)

    # 최적의 모델 추출
    best_rfc = grid_search.best_estimator_

    # 테스트 데이터로 예측 및 성능 평가
    y_pred = best_rfc.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)

    print(f'''
rmse for RF_analytics: {rmse}
mse for RF_analytics: {mse}
Best Parameters for RF_analytics: {grid_search.best_params_}
''')

def MLP_analytics(X_train, X_test, y_train, y_test):
    # MLPRegressor 모델 정의
    regr = MLPRegressor(max_iter=1000, random_state=42)

    # 탐색할 하이퍼파라미터 그리드 정의
    param_grid = {
        'hidden_layer_sizes': [(100, 100, 100), (50, 50, 50)],  
        'alpha': [0.01, 0.03, 0.1]
    }

    # GridSearchCV 객체 생성
    grid_search = GridSearchCV(regr, param_grid, cv=5, scoring='neg_mean_squared_error')

    # Grid Search를 사용하여 모델 훈련
    grid_search.fit(X_train, y_train)

    # 최적의 모델 추출
    best_regr = grid_search.best_estimator_

    # 테스트 데이터로 예측 및 성능 평가
    y_pred = best_regr.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mse = mean_squared_error(y_test, y_pred)

    print(f'''
rmse for MLP_analytics: {rmse}
mse for MLP_analytics: {mse}
Best Parameters for MLP_analytics: {grid_search.best_params_}
''')



In [26]:
predicts = [' 풍력 발전량(MWh) ',' 태양광 발전량(MWh) ']

wind_1 = df_hour.dropna()

X_train, X_test, y_train, y_test = train_test_split(wind_1.drop(predicts[1], axis=1),wind_1[predicts[1]], random_state= 24)
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on your data and transform it
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

DT_anaytics(X_train, X_test,y_train, y_test)
RF_anaytics(X_train, X_test,y_train, y_test)
MLP_anaytics(X_train, X_test,y_train, y_test)


rmse for Decision Tree :70.27027793761859
mse for Decision Tree :4937.911961430165


rmse for RF_anaytics :54.828692821166555
mse for RF_anaytics :3006.185556477841
mape for Decision Tree :6455194899730903.0


rmse for MLP_anaytics :60.282055498048095
mse for MLP_anaytics :3633.926215069751
mape for MLP_anaytics :8354769046238150.0



# 2023년 1월 1일에 대한 예측 진행

In [None]:
predicts = [' 풍력 발전량(MWh) ',' 태양광 발전량(MWh) ']

sun_df1 = result[[' 태양광 발전량(MWh) ', '기온(°C)', '풍속(m/s)','강수량(mm)', '습도(%)']].dropna()

X_train, X_test, y_train, y_test = train_test_split(sun_df1.drop(predicts[1], axis=1),sun_df1[predicts[1]], random_state= 24)
# Create a StandardScaler object
scaler = StandardScaler()

# Fit the scaler on your data and transform it
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)


# Fit the scaler on your data and transform it
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)
rfc = RandomForestRegressor(max_depth=20, random_state=24)
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mse = mean_squared_error(y_test, y_pred)
mape= mean_absolute_percentage_error(y_test, y_pred)

asdf = pd.DataFrame({'기온(°C)':0.3375, '풍속(m/s)': 2.65, '강수량(mm)': 0, '습도(%)': 55.33}, index=[0])


# Select only the relevant features from asdf that match X_test
new_row = asdf.iloc[0, :].values

# Add a single row to X_test
X_test_with_new_row = np.vstack([X_test, new_row])
a = rfc.predict(X_test_with_new_row)
print(a[-1])

7061.291099999993
