In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# conda install pytorch torchvision -c pytorch 
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut


In [14]:
# data/output_df의 파일들과 data/weather의 파일들을 읽어온다.
seoul_output_df = pd.read_csv('data/output_df/seoul_spring_df.csv')
seoul_spring_df = pd.read_csv('data/weather/seoul_spring.csv')

# 위에 파일들을 합친다.
seoul_df = pd.merge(seoul_output_df, seoul_spring_df, on='일시', how='left')

# 출력
seoul_df



Unnamed: 0,일시,일반봄배추:면적 (ha),생산량 (톤),평균기온(°C),최고기온(°C),최저기온(°C),월합강수량(00~24h만)(mm),합계 일사량(MJ/m2)
0,2000,72,2660,6.3,18.9,-4.8,3.1,389.50
1,2000,72,2660,11.9,23.5,1.8,30.7,440.90
2,2000,72,2660,17.5,30.5,7.9,75.2,443.69
3,2001,8,353,5.0,21.0,-6.1,18.1,396.53
4,2001,8,353,13.6,28.2,2.1,12.3,492.04
...,...,...,...,...,...,...,...,...
61,2020,0,1,11.1,23.8,1.9,16.9,637.52
62,2020,0,1,18.0,30.0,9.2,112.4,564.83
63,2021,5,228,9.0,22.9,-1.2,110.9,476.94
64,2021,5,228,14.2,28.2,3.1,124.1,558.28


In [15]:
# 입력데이터 정규화
scaler = MinMaxScaler()
seoul_df.iloc[:, 1:] = scaler.fit_transform(seoul_df.iloc[:, 1:])
seoul_df





Unnamed: 0,일시,일반봄배추:면적 (ha),생산량 (톤),평균기온(°C),최고기온(°C),최저기온(°C),월합강수량(00~24h만)(mm),합계 일사량(MJ/m2)
0,2000,0.818182,1.000000,0.167702,0.244792,0.142857,0.000000,0.178047
1,2000,0.818182,1.000000,0.515528,0.484375,0.479592,0.126085,0.313506
2,2000,0.818182,1.000000,0.863354,0.848958,0.790816,0.329374,0.320859
3,2001,0.090909,0.132707,0.086957,0.354167,0.076531,0.068524,0.196574
4,2001,0.090909,0.132707,0.621118,0.729167,0.494898,0.042028,0.448280
...,...,...,...,...,...,...,...,...
61,2020,0.000000,0.000376,0.465839,0.500000,0.484694,0.063042,0.831677
62,2020,0.000000,0.000376,0.894410,0.822917,0.857143,0.499315,0.640111
63,2021,0.056818,0.085714,0.335404,0.453125,0.326531,0.492462,0.408486
64,2021,0.056818,0.085714,0.658385,0.729167,0.545918,0.552764,0.622849


In [16]:
# sequence to sequence
def split_sequence(sequence, n_steps):
    X, y = list(), list()
    for i in range(len(sequence)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the sequence
        if end_ix > len(sequence)-1:
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequence[i:end_ix, :-1], sequence[end_ix, -1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

# 데이터셋 생성
n_steps = 7
X, y = split_sequence(seoul_df.values, n_steps)

# 데이터셋 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 데이터셋 변환
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2])
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2])

In [19]:
# 모델 생성
model = Sequential()
model.add(LSTM(128, activation='relu', input_shape=(n_steps, X.shape[2])))
model.add(Dense(1))
model.summary()

# 모델 컴파일
model.compile(optimizer='adam', loss='mse')

# 모델 학습
model.fit(X_train, y_train, epochs=100, verbose=0)


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_1 (LSTM)               (None, 128)               69632     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 69,761
Trainable params: 69,761
Non-trainable params: 0
_________________________________________________________________


2022-12-19 14:41:19.609183: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:114] Plugin optimizer for device_type GPU is enabled.


<keras.callbacks.History at 0x2a5387c40>

In [23]:
# 모델 평가 
y_pred = model.predict(X_test)
print('MSE :', mean_squared_error(y_test, y_pred))
print('MAE :', mean_absolute_error(y_test, y_pred))
print('R2 :', r2_score(y_test, y_pred))

# 모델 저장
model.save('model/model.h5')



MSE : 0.07554681819222175
MAE : 0.22988191283016998
R2 : -0.5291262608810663


In [22]:
# 모델 예측
y_pred = model.predict(X_test)

# 예측값 역정규화
y_pred = scaler.inverse_transform(y_pred)

# 실제값 역정규화
y_test = scaler.inverse_transform(y_test.reshape(-1, 1))

# 예측값 출력
y_pred



array([[0.38470647],
       [0.5162678 ],
       [0.51538277],
       [0.4500675 ],
       [0.27719685],
       [0.29741475],
       [0.5450001 ],
       [0.513258  ],
       [0.5332203 ],
       [0.3103008 ],
       [0.33635518],
       [0.2701244 ]], dtype=float32)