In [48]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import re
import joblib
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, get_scorer_names
from sklearn.model_selection import KFold, train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM
from xgboost import XGBRegressor
from copy import deepcopy
warnings.filterwarnings("ignore")

## 데이터 전처리

In [49]:
df = pd.read_csv("./data/merge_all/merge_drop.csv")
# df.head()

In [50]:
move_avr = [5, 6, 20, 24, 60, 72, 300]
for avr in move_avr : 
    df["이동평균가_"+str(avr)+"일"] = df["평균"].rolling(avr).mean()

In [51]:
df["거래년월일"] = pd.to_datetime(df["거래년월일"])
df.set_index("거래년월일", inplace=True)

In [52]:
df.dropna(how="any", inplace=True)
# df.head()

In [53]:
df.columns

Index(['자동차용 경유 가격 (원)', '전월비(%)', '평균', '거래량', '이동평균가_5일', '이동평균가_6일',
       '이동평균가_20일', '이동평균가_24일', '이동평균가_60일', '이동평균가_72일', '이동평균가_300일'],
      dtype='object')

In [54]:
df = df[['자동차용 경유 가격 (원)', '전월비(%)', '거래량', '이동평균가_5일', '이동평균가_6일',
       '이동평균가_20일', '이동평균가_24일', '이동평균가_60일', '이동평균가_72일', '이동평균가_300일', '평균']]

In [55]:
df.rename(columns={"평균":"배추가격"},inplace=True)
# df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5483 entries, 2001-01-10 to 2022-11-30
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   자동차용 경유 가격 (원)  5483 non-null   float64
 1   전월비(%)          5483 non-null   float64
 2   거래량             5483 non-null   float64
 3   이동평균가_5일        5483 non-null   float64
 4   이동평균가_6일        5483 non-null   float64
 5   이동평균가_20일       5483 non-null   float64
 6   이동평균가_24일       5483 non-null   float64
 7   이동평균가_60일       5483 non-null   float64
 8   이동평균가_72일       5483 non-null   float64
 9   이동평균가_300일      5483 non-null   float64
 10  배추가격            5483 non-null   float64
dtypes: float64(11)
memory usage: 514.0 KB


In [57]:
feature = df.drop(columns=["거래량", "배추가격", "이동평균가_6일", "이동평균가_24일", "이동평균가_72일"])
target = df["배추가격"]

In [None]:
## StandardScaler를 이용한 정규화

In [58]:
std_feat = StandardScaler()
std_tar = StandardScaler()

std_feat.fit(feature)
std_tar.fit(target.values.reshape(-1,1))

X = std_feat.transform(feature)
y = std_tar.transform(target.values.reshape(-1,1))

In [None]:
## trainset과 testset을 분리

In [59]:
len_x = int(len(X)*0.8)
len_y = int(len(y)*0.8)

X_train, X_test = X[:len_x], X[len_x:]
y_train, y_test = y[:len_y], y[len_y:]

## 12월 테스트용 데이터 전처리

In [72]:
## 12월 테스트용 데이터 프레임 만들기
df_price = pd.read_csv("./data/price/price_baechoo.csv")
df_price.set_index("구분", inplace=True)   # 구분 column을 인덱스로 설정
df_price.rename(columns={"평균" : "배추가격"})
df_price = df_price[df_price.index >= "2022-12-01"]
df_price.index.name = ""
df_price


Unnamed: 0,평균
,
2022-12-01,5462.0
2022-12-05,5982.0
2022-12-06,5502.0
2022-12-07,5488.0
2022-12-08,5408.0
2022-12-09,5408.0
2022-12-12,5244.5
2022-12-13,5245.5
2022-12-14,5430.5


In [73]:
local_path = "C:/workspace/datas/"
df_oil = pd.read_csv(local_path+"2022년_12월_주유소_경유가격.csv", encoding="cp949")
df_oil["구분"] = df_oil["구분"].apply(lambda x : datetime.strptime(str(x), "%Y년%m월%d일"))
df_oil.set_index("구분", inplace=True)
df_oil.index.name = ""
df_oil

Unnamed: 0,자동차용경유
,
2022-12-01,1857.47
2022-12-02,1855.96
2022-12-03,1854.52
2022-12-04,1853.31
2022-12-05,1850.98
2022-12-06,1847.19
2022-12-07,1842.17
2022-12-08,1835.07
2022-12-09,1827.11


In [74]:
df_supply = pd.read_csv("./data/transaction_supply/transaction_supply_baechoo.csv")
df_supply["거래년월일"] = pd.to_datetime(df_supply["거래년월일"])
df_supply.set_index("거래년월일", inplace=True)
df_supply = df_supply[df_supply.index >= "2022-12-01"]
df_supply.index.name = ""
df_supply

Unnamed: 0,거래량
,
2022-12-01,404610.0
2022-12-02,293540.0
2022-12-03,448820.0
2022-12-05,135374.0
2022-12-06,83130.0
2022-12-07,351610.0
2022-12-08,417400.0
2022-12-09,422810.0
2022-12-10,359760.0


In [75]:
df_test = pd.concat([df_oil, df_price, df_supply],)
# df_test = df_test.join(df_supply, how="inner")
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   자동차용경유  0 non-null      float64
 1   평균      0 non-null      float64
dtypes: float64(2)
memory usage: 0.0+ bytes


## xgboostregressor를 이용한 모델 학습

In [79]:
# {'colsample_bytree': 1, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 0, 'subsample': 0.5}


xgb = XGBRegressor(eta = 0.1, max_depth = 3, n_estimators = 100, colsample_bytree = 1, reg_alpha = 0, reg_lambda = 0, subsample = 0.5)
xgb.fit(X_train, y_train)

In [80]:
y_pred = xgb.predict(X_test)
print("rmse :", round(np.sqrt(mean_squared_error(y_test, y_pred)),3))
print("r2_score :", round(r2_score(y_test, y_pred),3))

rmse : 0.396
r2_score : 0.909


In [77]:
y_pred = xgb.predict(X_train)
print("r2_score :", round(r2_score(y_train, y_pred),3))

r2_score : 0.991


## 12월 예측