In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
# conda install pytorch torchvision -c pytorch 
import tensorflow as tf
import torch
from torch.utils.data import TensorDataset # 텐서데이터셋
from torch.utils.data import DataLoader # 데이터로더
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import LeavePOut

# import MLPRegressor
# XGBRegressor
# LGBMRegressor
# CatBoostRegressor
# KNeighborsRegressor
# Lasso
# Ridge
# ElasticNet
# SGDRegressor

from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor


In [23]:
# data/output_df의 파일들과 data/weather의 파일들을 읽어온다.
seoul_output_df = pd.read_csv('data/output_df/seoul_spring_df.csv')
seoul_spring_df = pd.read_csv('data/weather/seoul_spring.csv')

# 위에 파일들을 합친다.
seoul_df = pd.merge(seoul_output_df, seoul_spring_df, on='일시', how='left')

# 출력
seoul_df

Unnamed: 0,일시,일반봄배추:면적 (ha),생산량 (톤),평균기온(°C),최고기온(°C),최저기온(°C),월합강수량(00~24h만)(mm),합계 일사량(MJ/m2)
0,2000,72,2660,6.3,18.9,-4.8,3.1,389.50
1,2000,72,2660,11.9,23.5,1.8,30.7,440.90
2,2000,72,2660,17.5,30.5,7.9,75.2,443.69
3,2001,8,353,5.0,21.0,-6.1,18.1,396.53
4,2001,8,353,13.6,28.2,2.1,12.3,492.04
...,...,...,...,...,...,...,...,...
61,2020,0,1,11.1,23.8,1.9,16.9,637.52
62,2020,0,1,18.0,30.0,9.2,112.4,564.83
63,2021,5,228,9.0,22.9,-1.2,110.9,476.94
64,2021,5,228,14.2,28.2,3.1,124.1,558.28


In [24]:
# 일반봄배추:면적 (ha)	생산량 (톤)	평균기온(°C) 컬럼만 추출
seoul_df = seoul_df[['일반봄배추:면적 (ha)', '생산량 (톤)', '평균기온(°C)']]
seoul_df

Unnamed: 0,일반봄배추:면적 (ha),생산량 (톤),평균기온(°C)
0,72,2660,6.3
1,72,2660,11.9
2,72,2660,17.5
3,8,353,5.0
4,8,353,13.6
...,...,...,...
61,0,1,11.1
62,0,1,18.0
63,5,228,9.0
64,5,228,14.2


In [25]:
# 입력데이터 정규화 (StandardScaler)
scaler = StandardScaler()
scaler.fit(seoul_df.iloc[:, 1:])
seoul_df.iloc[:, 1:] = scaler.transform(seoul_df.iloc[:, 1:])
seoul_df


Unnamed: 0,일반봄배추:면적 (ha),생산량 (톤),평균기온(°C)
0,72,2.410005,-1.198366
1,72,2.410005,-0.102853
2,72,2.410005,0.992660
3,8,-0.304781,-1.452681
4,8,-0.304781,0.229714
...,...,...,...
61,0,-0.719000,-0.259354
62,0,-0.719000,1.090474
63,5,-0.451876,-0.670172
64,5,-0.451876,0.347090


In [26]:
# 머신러닝 모델을 만들기 위해 데이터를 나눈다.
# train : test = 7 : 3
train = seoul_df[:int(len(seoul_df)*0.7)]   
test = seoul_df[int(len(seoul_df)*0.7):]

# train 데이터를 x_train, y_train으로 나눈다.
x_train = train.iloc[:, 1:]
y_train = train.iloc[:, 0]

# 출력
x_train

Unnamed: 0,생산량 (톤),평균기온(°C)
0,2.410005,-1.198366
1,2.410005,-0.102853
2,2.410005,0.99266
3,-0.304781,-1.452681
4,-0.304781,0.229714
5,-0.304781,1.305664
6,1.189704,-0.94405
7,1.189704,0.229714
8,1.189704,1.1296
9,0.724884,-1.15924


In [27]:
# 회귀모델 생성 (LinearRegression, RandomForestRegressor, GradientBoostingRegressor, SVR, MLPRegressor, XGBRegressor, LGBMRegressor, CatBoostRegressor, KNeighborsRegressor, Lasso, Ridge, ElasticNet, DecisionTreeRegressor, ExtraTreeRegressor, AdaBoostRegressor, BaggingRegressor, GaussianProcessRegressor, HuberRegressor, PassiveAggressiveRegressor, RANSACRegressor, SGDRegressor, TheilSenRegressor, KernelRidge)

# 회귀모델 중 성능이 가장 좋은 모델을 선택하는 함수를 만든다.
def get_best_model_and_accuracy(model, params, x_train, y_train):
    grid_model = GridSearchCV(model, param_grid=params, cv=5, n_jobs=-1, verbose=1)
    grid_model.fit(x_train, y_train)
    rmse = np.sqrt(-1 * grid_model.best_score_)
    print('{0} 5 CV 시 최적 평균 RMSE 값: {1}, 최적 alpha: {2}'.format(grid_model.best_estimator_, rmse, grid_model.best_params_))
    return grid_model.best_estimator_

# LinearRegression
lr_params = {'fit_intercept':[True, False], 'normalize':[True, False], 'copy_X':[True, False]}
lr_model = get_best_model_and_accuracy(LinearRegression(), lr_params, x_train, y_train)

# RandomForestRegressor
rf_params = {'n_estimators':[100, 300], 'max_depth':[6, 8, 10, 12], 'min_samples_leaf':[8, 12, 18], 'min_samples_split':[8, 16, 20]}
rf_model = get_best_model_and_accuracy(RandomForestRegressor(random_state=0, n_jobs=-1), rf_params, x_train, y_train)

# GradientBoostingRegressor
gb_params = {'n_estimators':[100, 300], 'learning_rate':[0.05, 0.1], 'max_depth':[3, 5, 7], 'min_samples_leaf':[3, 5, 7, 10], 'min_samples_split':[2, 3, 5, 10]}
gb_model = get_best_model_and_accuracy(GradientBoostingRegressor(random_state=0), gb_params, x_train, y_train)

# SVR
svr_params = {'kernel':['linear', 'poly', 'rbf', 'sigmoid'], 'C':[0.01, 0.1, 1, 10, 100], 'gamma':['scale', 'auto']}
svr_model = get_best_model_and_accuracy(SVR(), svr_params, x_train, y_train)

# MLPRegressor
mlp_params = {'hidden_layer_sizes':[(100, ), (300, ), (100, 100), (300, 300)], 'activation':['identity', 'logistic', 'tanh', 'relu'], 'solver':['lbfgs', 'sgd', 'adam'], 'alpha':[0.0001, 0.05], 'learning_rate':['constant', 'invscaling', 'adaptive']}
mlp_model = get_best_model_and_accuracy(MLPRegressor(random_state=0), mlp_params, x_train, y_train)

# XGBRegressor
xgb_params = {'n_estimators':[100, 300], 'learning_rate':[0.05, 0.1], 'max_depth':[3, 5, 7], 'min_child_weight':[1, 3, 5], 'colsample_bytree':[0.5, 0.75, 1]}
xgb_model = get_best_model_and_accuracy(XGBRegressor(random_state=0, n_jobs=-1), xgb_params, x_train, y_train)

# CatBoostRegressor
cat_params = {'iterations':[100, 300], 'learning_rate':[0.05, 0.1], 'depth':[3, 5, 7], 'l2_leaf_reg':[1, 3, 5, 7, 9]}
cat_model = get_best_model_and_accuracy(CatBoostRegressor(random_state=0, silent=True), cat_params, x_train, y_train)

# KNeighborsRegressor
knn_params = {'n_neighbors':[3, 5, 7, 9], 'weights':['uniform', 'distance'], 'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'], 'leaf_size':[10, 20, 30, 40]}
knn_model = get_best_model_and_accuracy(KNeighborsRegressor(n_jobs=-1), knn_params, x_train, y_train)

# Lasso
lasso_params = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100], 'fit_intercept':[True, False], 'normalize':[True, False], 'precompute':[True, False], 'copy_X':[True, False], 'max_iter':[1000, 2000, 3000], 'tol':[0.0001, 0.001, 0.01], 'warm_start':[True, False], 'positive':[True, False], 'random_state':[0, 100], 'selection':['cyclic', 'random']}
lasso_model = get_best_model_and_accuracy(Lasso(), lasso_params, x_train, y_train)

# Ridge
ridge_params = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100], 'fit_intercept':[True, False], 'normalize':[True, False], 'copy_X':[True, False], 'max_iter':[1000, 2000, 3000], 'tol':[0.0001, 0.001, 0.01], 'solver':['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
ridge_model = get_best_model_and_accuracy(Ridge(), ridge_params, x_train, y_train)

# ElasticNet
enet_params = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100], 'l1_ratio':[0.1, 0.3, 0.5, 0.7, 0.9], 'fit_intercept':[True, False], 'normalize':[True, False], 'precompute':[True, False], 'max_iter':[1000, 2000, 3000], 'copy_X':[True, False], 'tol':[0.0001, 0.001, 0.01], 'warm_start':[True, False], 'positive':[True, False], 'random_state':[0, 100], 'selection':['cyclic', 'random']}
enet_model = get_best_model_and_accuracy(ElasticNet(), enet_params, x_train, y_train)

# SGDRegressor
sgd_params = {'loss':['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'], 'penalty':['none', 'l2', 'l1', 'elasticnet'], 'alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 'l1_ratio':[0.15, 0.3, 0.5, 0.7, 0.85], 'fit_intercept':[True, False], 'max_iter':[1000, 2000, 3000], 'tol':[0.001, 0.01, 0.1], 'shuffle':[True, False], 'epsilon':[0.1, 0.01, 0.001], 'learning_rate':['constant', 'optimal', 'invscaling', 'adaptive'], 'eta0':[0.01, 0.1, 1], 'power_t':[0.25, 0.5, 0.75], 'early_stopping':[True, False], 'validation_fraction':[0.1, 0.2, 0.3], 'n_iter_no_change':[5, 10, 20], 'random_state':[0, 100], 'warm_start':[True, False], 'average':[True, False]}  
sgd_model = get_best_model_and_accuracy(SGDRegressor(random_state=0, n_jobs=-1), sgd_params, x_train, y_train)





Fitting 5 folds for each of 8 candidates, totalling 40 fits


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline wi

LinearRegression(normalize=False) 5 CV 시 최적 평균 RMSE 값: 2.8335233849865205, 최적 alpha: {'copy_X': True, 'fit_intercept': True, 'normalize': False}
Fitting 5 folds for each of 72 candidates, totalling 360 fits
RandomForestRegressor(max_depth=6, min_samples_leaf=8, min_samples_split=8,
                      n_estimators=300, n_jobs=-1, random_state=0) 5 CV 시 최적 평균 RMSE 값: 11.3323658908461, 최적 alpha: {'max_depth': 6, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 300}
Fitting 5 folds for each of 192 candidates, totalling 960 fits
GradientBoostingRegressor(learning_rate=0.05, min_samples_leaf=3,
                          n_estimators=300, random_state=0) 5 CV 시 최적 평균 RMSE 값: nan, 최적 alpha: {'learning_rate': 0.05, 'max_depth': 3, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 300}
Fitting 5 folds for each of 40 candidates, totalling 200 fits
SVR(C=100) 5 CV 시 최적 평균 RMSE 값: nan, 최적 alpha: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Fitting 5 folds for each of 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

MLPRegressor(alpha=0.05, hidden_layer_sizes=(300, 300),
             learning_rate='adaptive', random_state=0, solver='sgd') 5 CV 시 최적 평균 RMSE 값: nan, 최적 alpha: {'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (300, 300), 'learning_rate': 'adaptive', 'solver': 'sgd'}
Fitting 5 folds for each of 108 candidates, totalling 540 fits
XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.1, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=5, max_leaves=0, min_child_weight=3, missing=nan,
             monotone_constraints='()', n_estimators=300, n_jobs=-1,
             num_parallel_tree=1, 