In [32]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings(action='ignore')

# pip install holidays
import holidays

In [33]:
def seed_everything(seed):
    """Seed the random sources this notebook relies on.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and
    writes the seed into the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value applied to every source.

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here presumably only affects child processes - confirm.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(42)  # fix the seed for reproducibility

In [45]:
!pip install catboost


Collecting catboost
  Using cached catboost-1.2.2-cp39-cp39-macosx_11_0_universal2.whl (25.8 MB)
Collecting plotly
  Using cached plotly-5.18.0-py3-none-any.whl (15.6 MB)
Installing collected packages: plotly, catboost
Successfully installed catboost-1.2.2 plotly-5.18.0


In [34]:
# Load the training data from train.csv in the current working directory.
train_df = pd.read_csv('./train.csv')

In [35]:
# Preview the raw training frame.
train_df

Unnamed: 0,ID,timestamp,item,corporation,location,supply(kg),price(원/kg)
0,TG_A_J_20190101,2019-01-01,TG,A,J,0.0,0.0
1,TG_A_J_20190102,2019-01-02,TG,A,J,0.0,0.0
2,TG_A_J_20190103,2019-01-03,TG,A,J,60601.0,1728.0
3,TG_A_J_20190104,2019-01-04,TG,A,J,25000.0,1408.0
4,TG_A_J_20190105,2019-01-05,TG,A,J,32352.0,1250.0
...,...,...,...,...,...,...,...
59392,RD_F_J_20230227,2023-02-27,RD,F,J,452440.0,468.0
59393,RD_F_J_20230228,2023-02-28,RD,F,J,421980.0,531.0
59394,RD_F_J_20230301,2023-03-01,RD,F,J,382980.0,574.0
59395,RD_F_J_20230302,2023-03-02,RD,F,J,477220.0,523.0


In [36]:
# To let the model pick up time-series (calendar) effects, split the
# 'YYYY-MM-DD' timestamp string into integer year, month and day columns.
train_df['year'] = train_df['timestamp'].str[0:4].astype('int64')
train_df['month'] = train_df['timestamp'].str[5:7].astype('int64')
train_df['day'] = train_df['timestamp'].str[8:10].astype('int64')

In [37]:
# Build a real datetime column from the year/month/day parts. pd.to_datetime
# assembles the three columns in one vectorized call, replacing the row-by-row
# DataFrame.apply(axis=1) that constructed one pd.Timestamp per row.
train_df['ts'] = pd.to_datetime(train_df[['year', 'month', 'day']])
# Day of week as an integer (pandas convention: Monday=0 ... Sunday=6).
train_df['weekday'] = train_df['ts'].dt.weekday
# Baseline holiday flag: 1 where weekday == 6 (Sunday), 0 otherwise.
train_df['holiday'] = (train_df['weekday'] == 6).astype('int64')

In [38]:
# Korean holiday calendar from the `holidays` package.
kr_holidays = holidays.KR()
# Set the flag to 1 on every row whose date is in the calendar; all other rows
# keep the holiday value they already have.
train_df.loc[
    train_df['timestamp'].apply(lambda date_str: date_str in kr_holidays),
    'holiday',
] = 1

In [39]:
# Remove the columns that are not used as features: the row ID, the raw
# timestamp string, the supply column and the target itself.
# 'ts' is dropped too: it is a datetime64 helper column whose information is
# already carried by year/month/day/weekday, and a datetime column in the
# feature matrix makes the estimators fitted below fail.
train_x = train_df.drop(columns=['ID', 'timestamp', 'ts', 'supply(kg)', 'price(원/kg)'])
# Target: the price column.
train_y = train_df['price(원/kg)']

In [7]:
# Preview the feature frame.
train_x

Unnamed: 0,item,corporation,location,year,month,day
0,TG,A,J,2019,1,1
1,TG,A,J,2019,1,2
2,TG,A,J,2019,1,3
3,TG,A,J,2019,1,4
4,TG,A,J,2019,1,5
...,...,...,...,...,...,...
59392,RD,F,J,2023,2,27
59393,RD,F,J,2023,2,28
59394,RD,F,J,2023,3,1
59395,RD,F,J,2023,3,2


In [43]:
# Load the international trade statistics and preview the raw table.
trade = pd.read_csv('international_trade.csv')
trade

Unnamed: 0,기간,품목명,수출 중량,수출 금액,수입 중량,수입 금액,무역수지
0,2019-01,토마토(신선한 것이나 냉장한 것으로 한정한다),356571,990,0,0,990
1,2019-01,양파,821330,222,4003206,1118,-896
2,2019-01,쪽파,60,1,93405,128,-127
3,2019-01,꽃양배추와 브로콜리(broccoli),160,1,638913,563,-562
4,2019-01,방울다다기 양배추,0,0,7580,38,-38
...,...,...,...,...,...,...,...
1269,2023-02,포포(papaw)[파파야(papaya)],0,0,23830,71,-71
1270,2023-02,사과,135165,351,0,0,351
1271,2023-02,배,2206012,5411,1,0,5411
1272,2023-02,신 체리[프루너스 체라서스(Prunus cerasus)],5,0,0,0,0


In [40]:
# Reload the trade table and give its seven columns short positional names:
# period ('기간'), item ('x') and five numeric columns c1..c5.
trade = pd.read_csv('international_trade.csv')
trade.columns = ['기간', 'x', 'c1', 'c2', 'c3', 'c4', 'c5']

# Keep only period, item and c5 (the last column), then pivot to one row per
# period and one column per item, summing c5. Period/item combinations that do
# not occur become 0.
trade = (
    trade[['기간', 'x', 'c5']]
    .pivot_table(index='기간', columns='x', values=['c5'], aggfunc='sum')
    .fillna(0)
)

# Flatten the (value, item) column MultiIndex into positional names c0, c1, ...
trade.columns = [f'c{pos}' for pos in range(len(trade.columns))]
trade_columns = trade.columns

In [41]:
# Preview the pivoted trade table (one row per period, one column per item).
trade

Unnamed: 0_level_0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c24,c25,c26,c27,c28,c29,c30,c31,c32,c33
기간,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-01,172.0,-70.0,0.0,-3503.0,-562.0,-2934.0,-16.0,-4011.0,-7719.0,-82.0,...,28.0,0.0,-1.0,0.0,-127.0,0.0,0.0,990.0,-4461.0,-123.0
2019-02,33.0,-167.0,0.0,-1907.0,-398.0,-2585.0,-118.0,-4795.0,-6366.0,-282.0,...,2.0,0.0,-1.0,0.0,-66.0,0.0,0.0,926.0,-3978.0,-52.0
2019-03,-2.0,-106.0,0.0,-1576.0,-503.0,-4941.0,-82.0,-3363.0,-11360.0,-185.0,...,8.0,0.0,-3.0,0.0,-109.0,0.0,0.0,1369.0,-5052.0,-73.0
2019-04,-27.0,71.0,0.0,-1922.0,-1207.0,-3479.0,-253.0,-3216.0,-12311.0,-1218.0,...,15.0,0.0,-1.0,0.0,-159.0,0.0,0.0,1360.0,-5943.0,-65.0
2019-05,21.0,131.0,0.0,-1966.0,-1028.0,-2144.0,-286.0,-3247.0,-8516.0,-831.0,...,32.0,0.0,-1.0,1.0,-167.0,0.0,0.0,1041.0,-5766.0,-73.0
2019-06,21.0,955.0,0.0,-2321.0,-684.0,-3587.0,-12.0,-2414.0,-4901.0,-526.0,...,23.0,0.0,0.0,0.0,-147.0,0.0,0.0,1060.0,-4517.0,-61.0
2019-07,27.0,-1216.0,0.0,-2809.0,-671.0,-3806.0,-48.0,-6220.0,-1428.0,-356.0,...,25.0,0.0,-1.0,3.0,-130.0,0.0,0.0,1596.0,-5342.0,-70.0
2019-08,31.0,-3151.0,0.0,-1745.0,-1618.0,-2679.0,-2.0,-2817.0,-2010.0,-378.0,...,6.0,0.0,-2.0,0.0,-153.0,0.0,0.0,1339.0,-4032.0,-61.0
2019-09,39.0,-1324.0,0.0,-724.0,-1558.0,-3829.0,-9.0,-1702.0,-3284.0,-34.0,...,17.0,0.0,-2.0,0.0,-148.0,0.0,1.0,1268.0,-3862.0,-35.0
2019-10,14.0,-497.0,0.0,-713.0,-1378.0,-3139.0,-2.0,-3304.0,-3023.0,0.0,...,15.0,0.0,-1.0,0.0,-158.0,0.0,0.0,1337.0,-5456.0,-62.0


In [8]:
# Convert the qualitative (categorical) columns to integer labels.
qual_col = ['item', 'corporation', 'location']

for col in qual_col:
    # A fresh encoder per column, fitted on the training values only.
    le = LabelEncoder().fit(train_x[col])
    train_x[col] = le.transform(train_x[col])
    # test_x[col] = le.transform(test_x[col])  # only transform test data; fitting on it would be data leakage

print('Done.')

Done.


In [9]:
from sklearn.model_selection import train_test_split

In [12]:
# Split the data into training and validation sets at an 8:2 ratio.
# random_state pins the shuffle so the split does not depend on the current
# state of NumPy's global RNG (i.e. on which cells ran before this one).
# NOTE: this cell rebinds train_x / train_y, so running it twice splits the
# already-reduced training set again - re-run the feature cells first.
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

In [24]:
# Baseline: random forest with default hyper-parameters.
# random_state makes the fitted forest independent of the global RNG state;
# n_jobs=-1 uses all cores, matching the boosting models in this notebook.
model = RandomForestRegressor(n_jobs=-1, random_state=42)
model.fit(train_x, train_y)

In [14]:
from sklearn.metrics import mean_squared_error

In [22]:
def result_report(model, name):
    """Print the train and validation RMSE of a fitted model.

    Predictions are made on the notebook-level ``train_x`` / ``val_x`` frames
    and scored against ``train_y`` / ``val_y``.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: Label printed above the two scores.
    """
    # RMSE is computed as sqrt(MSE) rather than with ``squared=False``: that
    # keyword is deprecated and later removed in newer scikit-learn releases,
    # while sqrt(MSE) works on every version. For the single target column
    # scored here the two are the same quantity.
    train_rmse = np.sqrt(mean_squared_error(train_y, model.predict(train_x)))
    val_rmse = np.sqrt(mean_squared_error(val_y, model.predict(val_x)))

    print(name)
    print('Train RMSE :', train_rmse)
    print('Validation RMSE :', val_rmse)

In [25]:
# Report train/validation RMSE for the model currently bound to `model` (the random forest).
result_report(model, 'Random Forest')

Random Forest
Train RMSE : 439.79034575698597
Validation RMSE : 1199.5721039917535


In [26]:
from xgboost import XGBRegressor

# early_stopping_rounds is given to the constructor: as a fit() argument it is
# deprecated and no longer accepted by recent XGBoost releases.
model = XGBRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1, early_stopping_rounds=100)
# Evaluate on the validation split each round, stop once it has not improved
# for 100 rounds, and log the metric every 100 rounds.
model.fit(train_x, train_y, eval_set=[(val_x, val_y)], verbose=100)

[0]	validation_0-rmse:2251.62511
[100]	validation_0-rmse:1232.17228
[200]	validation_0-rmse:1205.57148
[300]	validation_0-rmse:1191.92982
[400]	validation_0-rmse:1179.54287
[500]	validation_0-rmse:1166.08458
[600]	validation_0-rmse:1155.65529
[700]	validation_0-rmse:1147.99722
[800]	validation_0-rmse:1139.02033
[900]	validation_0-rmse:1132.09019
[999]	validation_0-rmse:1126.23905


In [27]:
# Report train/validation RMSE for the model currently bound to `model` (XGBoost).
result_report(model, 'XGBoost')

XGBoost
Train RMSE : 1015.3071411058783
Validation RMSE : 1126.2390493304895


In [40]:
# conda install lightgbm

In [41]:
from lightgbm import LGBMRegressor

In [43]:
# Early stopping and periodic logging are configured through callbacks: the
# early_stopping_rounds / verbose arguments of fit() were removed in
# LightGBM 4.0.
from lightgbm import early_stopping, log_evaluation

model = LGBMRegressor(n_estimators=1000, learning_rate=0.05, n_jobs=-1)
model.fit(
    train_x,
    train_y,
    eval_set=[(val_x, val_y)],
    callbacks=[
        early_stopping(stopping_rounds=100),  # stop after 100 rounds without improvement
        log_evaluation(period=100),           # log the validation metric every 100 rounds
    ],
)

[100]	valid_0's l2: 1.51024e+06
[200]	valid_0's l2: 1.45589e+06
[300]	valid_0's l2: 1.42425e+06
[400]	valid_0's l2: 1.3961e+06
[500]	valid_0's l2: 1.37313e+06
[600]	valid_0's l2: 1.35144e+06
[700]	valid_0's l2: 1.32915e+06
[800]	valid_0's l2: 1.31092e+06
[900]	valid_0's l2: 1.29673e+06
[1000]	valid_0's l2: 1.28612e+06


In [44]:
# Report train/validation RMSE for the model currently bound to `model` (LightGBM).
result_report(model, 'LightGBM')

LightGBM
Train RMSE : 994.2266918184708
Validation RMSE : 1134.072719818625
