In [65]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from xgboost import XGBRegressor

In [48]:
df = pd.read_csv("../data/merge_all/merge_all.csv", index_col=0)
# df.info()

In [49]:
# 인덱스를 datetime으로 변경
df.index = pd.to_datetime(df.index,format="%Y-%m-%d")
# df.index
# df.info()

DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07', '2000-01-08',
               '2000-01-09', '2000-01-10',
               ...
               '2022-11-21', '2022-11-22', '2022-11-23', '2022-11-24',
               '2022-11-25', '2022-11-26', '2022-11-27', '2022-11-28',
               '2022-11-29', '2022-11-30'],
              dtype='datetime64[ns]', name='거래년월일', length=8370, freq=None)

In [50]:
# rolling 메소드를 이용한 이동평균 만들기
mov_list = [7,28]
for mov in mov_list :
    globals()["df_mov_avr"+str(mov)] = df["평균"].rolling(mov, min_periods=1).mean()

In [56]:
df_yester = df["평균"].shift(1)      # 전날 가격을 추출
df_yester[0] = (6040+5160)/2        # 1999년 12월 31일 배추 평균값
# df_yester

거래년월일
2000-01-01    5600.0
2000-01-02    5600.0
2000-01-03    5600.0
2000-01-04    5600.0
2000-01-05    5600.0
               ...  
2022-11-26    6100.0
2022-11-27    6100.0
2022-11-28    6100.0
2022-11-29    5664.0
2022-11-30    5592.0
Name: 평균, Length: 8370, dtype: float64

In [57]:
df = pd.concat([df, df_yester ,df_mov_avr7, df_mov_avr28],axis=1)
# df

Unnamed: 0_level_0,자동차용 경유 가격 (원),전월비(%),평균,거래량,평균,평균,평균
거래년월일,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-01,583.35,4.0,5600.0,102285.0,5600.0,5600.000000,5600.000000
2000-01-02,583.35,4.0,5600.0,102285.0,5600.0,5600.000000,5600.000000
2000-01-03,583.35,4.0,5600.0,102285.0,5600.0,5600.000000,5600.000000
2000-01-04,583.35,4.0,5600.0,715338.0,5600.0,5600.000000,5600.000000
2000-01-05,583.35,4.0,5620.0,340568.0,5600.0,5604.000000,5604.000000
...,...,...,...,...,...,...,...
2022-11-26,1879.15,-8.0,6100.0,296250.0,6100.0,6282.571429,6898.071429
2022-11-27,1879.15,-8.0,6100.0,296250.0,6100.0,6151.428571,6833.214286
2022-11-28,1879.15,-8.0,5664.0,201478.0,6100.0,6060.571429,6757.196429
2022-11-29,1879.15,-8.0,5592.0,77130.0,5664.0,5965.142857,6694.410714


In [58]:
df.columns = ["자동차용 경유 가격 (원)", "전월비(%)", "평균 가격(원)", "거래량 (kg)", "전일 평균가격" ,"이동 평균가격_7일","이동 평균가격_28일"]
# df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8370 entries, 2000-01-01 to 2022-11-30
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   자동차용 경유 가격 (원)  8370 non-null   float64
 1   전월비(%)          8370 non-null   float64
 2   평균 가격(원)        8370 non-null   float64
 3   거래량 (kg)        8370 non-null   float64
 4   전일 평균가격         8370 non-null   float64
 5   이동 평균가격_7일      8370 non-null   float64
 6   이동 평균가격_28일     8370 non-null   float64
dtypes: float64(7)
memory usage: 523.1 KB


In [59]:
feature = df[["자동차용 경유 가격 (원)", "전월비(%)", "거래량 (kg)", "전일 평균가격" ,"이동 평균가격_7일","이동 평균가격_28일"]]
target = df["평균 가격(원)"]

In [60]:
std = StandardScaler()
std.fit(feature)
feature_std = std.transform(feature)

In [63]:
# feature_std

In [62]:
std2 = StandardScaler()
std2.fit(target.values.reshape(-1,1))
target_std = std2.transform(target.values.reshape(-1,1))
# target_std.shape

In [14]:
target_std.shape

(8370, 1)

In [24]:
# 딥러닝용 tensor로 변환
X_train, X_test = feature_std[:6696].reshape(-1,3,1.), feature_std[6696:].reshape(-1,3,1)
y_train, y_test = target_std[:6696], target_std[6696:]

## LSTM

In [25]:
model = Sequential()
model.add(LSTM(units=3, input_shape=(X_train.shape[1],1), activation="tanh", ))
model.add(Dense(1))
model.compile(loss="mean_squared_error",optimizer = "adam", metrics=["mse"])

In [26]:
model.fit(X_train, y_train ,epochs=300, verbose=2)

Epoch 1/300
210/210 - 4s - loss: 0.6320 - mse: 0.6320 - 4s/epoch - 19ms/step
Epoch 2/300
210/210 - 1s - loss: 0.5932 - mse: 0.5932 - 533ms/epoch - 3ms/step
Epoch 3/300
210/210 - 1s - loss: 0.5767 - mse: 0.5767 - 541ms/epoch - 3ms/step
Epoch 4/300
210/210 - 1s - loss: 0.5688 - mse: 0.5688 - 514ms/epoch - 2ms/step
Epoch 5/300
210/210 - 1s - loss: 0.5641 - mse: 0.5641 - 515ms/epoch - 2ms/step
Epoch 6/300
210/210 - 1s - loss: 0.5604 - mse: 0.5604 - 506ms/epoch - 2ms/step
Epoch 7/300
210/210 - 1s - loss: 0.5571 - mse: 0.5571 - 508ms/epoch - 2ms/step
Epoch 8/300
210/210 - 1s - loss: 0.5542 - mse: 0.5542 - 513ms/epoch - 2ms/step
Epoch 9/300
210/210 - 1s - loss: 0.5520 - mse: 0.5520 - 539ms/epoch - 3ms/step
Epoch 10/300
210/210 - 1s - loss: 0.5497 - mse: 0.5497 - 514ms/epoch - 2ms/step
Epoch 11/300
210/210 - 1s - loss: 0.5484 - mse: 0.5484 - 509ms/epoch - 2ms/step
Epoch 12/300
210/210 - 1s - loss: 0.5467 - mse: 0.5467 - 514ms/epoch - 2ms/step
Epoch 13/300
210/210 - 1s - loss: 0.5450 - mse: 0.5

<keras.callbacks.History at 0x19daeee8640>

In [27]:
model.evaluate(X_test, y_test)



[2.099684238433838, 2.099684238433838]

In [28]:
y_pred = model.predict(X_test)
print("r2 score :", r2_score(y_test, y_pred))

r2 score : -0.20122443902116371


In [30]:
y_pred_in = std2.inverse_transform(y_pred)

In [None]:
fig = plt.figure((15,9))
plt.plot(df)

## GridSearchCV

In [None]:
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

In [None]:
param = {
    "n_estimators" : [10,20,30,50,70,100,120,150,200,300],
    "learning_rate" : [0.001,0.05,0.02,0.01,0.5,0.2,0.1,1],
    "max_depth" : [1,2,3,4,5,6,7]
}
grid = GridSearchCV(xgb, param_grid=param, cv=kfold, n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_estimator_)
print(grid.best_params_)

In [None]:
features = iris.data
target = iris.target
standardizer = StandardScaler() # 표준화 객체 생성
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1) # KNN 분류기 객체 생성
pipe = Pipeline([("standardizer", standardizer), ("knn", knn)]) # 파이프라인 생성
# 탐색 영역의 후보를 만듭니다.
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]
# 그리드 서치 객체 생성
classifier = GridSearchCV( pipe, search_space, cv=5, verbose=0).fit(features, target)
# 최선의 이웃 개수 (k)
classifier.best_estimator_.get_params()["knn__n_neighbors"]

In [None]:
### 딕셔너리 형태로 저장    {"param":[후보1, 후보2, ......]}
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
k_values = np.array([1,3,5,7,9,11,13,15,17,19,21])
param_grid = dict(n_neighbors = k_values)
model = KNeighborsRegressor()
kfold = KFold(n_splits=10, random_state=3, shuffle=True)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring="neg_mean_squared_error")
grid_result = grid.fit(X_train_scaled, y_train)
print("Best Score :", grid_result.best_score_)
print("Best Parameters :", grid_result.best_params_)

iteration은 1 epoch를 시행할 때 필요한 미니배치 갯수를 의미합니다. 다른 말로는 1epoch을 마치는데 필요한 파라미터 업데이트 횟수

만약 10,000개의 데이터셋을 학습시킨다고 치자. (여기서 학습은 순방향 역방향 둘 다 포함)

메모리 한계 및 성능을 고려하여 나눠서 학습을 시킬 겁니다. 

이 때, 한 턴에 1,000개씩 10번 , 5턴을 학습시킨다고 하면, batch_size = 1,000 / iteration = 10 /epoch = 5입니다.

In [None]:
alpha = np.logspace(-4,2,7)   # lasso, ridge 해당
learning_rate = np.logspace(-4,2,7)   # adaboost, gradientboost, xgboost 해당
eta = np.logspace(-4,2,7)   # xgboost 해당
max_iteration = [1,3,5,10,20,50,100,150,200]   # lasso, ridge 해당
solver = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga', 'lbfgs']   # ridge 해당, 계산에 사용할 알고리즘
n_estimators = [1,3,5,10,15,20]   # randomforesteregressor(사용할 tree의 개수), gradientboost, adaboost, xgboost 해당
max_depth = [2,3,5,7,10,12]  # randomforesteregressor(tree의 깊이), adaboost(boosting이 끝났을 때 estimator의 최대수), gradientboost, xgboost 해당
criterion_rf = ["squared_error", "absolute_error", "friedman_mse", "poisson"]  # randomforesteregressor 해당
criterion_gb = ["squared_error", "friedman_mse"]  # gradientboost 해당
loss_ada = ["linear", "square", "exponential"]    # adaboost 해당
loss_gb = ["squared_error", "absolute_error", "huber", "quantile"]  # gradientboost 해당


In [None]:
model_lasso = ["Lasso"]
model_ridge = ["Ridge"]
model_rf = ["RandomForestRegressor"]
model_ada = ["AdaBoostRegressor"] 
model_grad ["GradientBoostingRegressor"]
model_xgb = ["XGBRegressor"]

params_lasso = {"alpha" : alpha, 
                "max_iteration" : max_iteration}

params_ridge = {"alpha" : alpha, 
                "max_iteration" : max_iteration, 
                "solver" : solver}

params_rf = {"n_estimators" : n_estimators, 
                "max_depth" : max_depth, 
                "criterion" : criterion_rf}

params_ensem = {"learning_rate" : learning_rate, 
                "n_estimators" : n_estimators, 
                "max_depth" : max_depth, 
                }

In [None]:

params = {"alpha":alpha_learningRate,
"max_iter" : []}

-------------------------------------------------

In [20]:
# 데이터 스케일링
scaler = MinMaxScaler()
scaler.fit(feature)
feature = scaler.transform(feature)
scaler.fit(target)
target = scaler.transform(target)

# 데이터셋 생성
def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    return np.array(dataX), np.array(dataY)

# 데이터셋 생성
look_back = 1
trainX, trainY = create_dataset(feature, look_back)
testX, testY = create_dataset(target, look_back)

# 데이터셋 형태 변환
trainX = np.reshape(trainX, (trainX.shape[0], 1, trainX.shape[1]))
testX = np.reshape(testX, (testX.shape[0], 1, testX.shape[1]))

# 모델 생성
model = Sequential()
model.add(LSTM(32, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)

# 예측
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

# 데이터 스케일링 복원
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])

# 모델 평가
trainScore = model.evaluate(trainX, trainY, verbose=0)
print('Train Score: %.2f MSE (%.2f RMSE)' % (trainScore, np.sqrt(trainScore)))
testScore = model.evaluate(testX, testY, verbose=0)
print('Test Score: %.2f MSE (%.2f RMSE)' % (testScore, np.sqrt(testScore)))

# 예측 결과 시각화
plt.figure(figsize=(12, 9))
plt.plot(testY[0], label='actual')
plt.plot(testPredict, label='prediction')
plt.legend()
plt.show()

# 예측 결과 저장
predic_df = pd.DataFrame(testPredict)

# 예측 결과 저장


ValueError: Expected 2D array, got 1D array instead:
array=[5600. 5600. 5600. ... 5664. 5592. 5242.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.