In [205]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import holidays


from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [206]:
train = pd.read_csv(r'./data/train.csv')
international_trade = pd.read_csv(r'./data/international_trade.csv')
test = pd.read_csv(r'./data/test.csv')

In [207]:
def group_season(df):
    df.loc[(df['month'] == 3) | (df['month'] == 4) | (df['month'] == 5), 'season'] = '봄'
    df.loc[(df['month'] == 6) | (df['month'] == 7) | (df['month'] == 8), 'season'] = '여름'
    df.loc[(df['month'] == 9) | (df['month'] == 10) | (df['month'] == 11), 'season'] = '가을'
    df.loc[(df['month'] == 12) | (df['month'] == 1) | (df['month'] == 2), 'season'] = '겨울'
    return df['season']

def holiday(df):
    kr_holidays = holidays.KR()
    df['holiday'] = df.timestamp.apply(lambda x: 'holiday' if x in kr_holidays else 'non-holiday')
    return df['holiday']

In [208]:
train['year'] = train['timestamp'].apply(lambda x : int(x[0:4]))
train['month'] = train['timestamp'].apply(lambda x : int(x[5:7]))
train['day'] = train['timestamp'].apply(lambda x : int(x[8:10]))
train['Weekday'] = pd.to_datetime(train['timestamp']).dt.weekday
train['is_weekend'] = train['Weekday'].apply(lambda x: 1 if x >= 5 else 0)


test['year'] = test['timestamp'].apply(lambda x : int(x[0:4]))
test['month'] = test['timestamp'].apply(lambda x : int(x[5:7]))
test['day'] = test['timestamp'].apply(lambda x : int(x[8:10]))
test['Weekday'] = pd.to_datetime(test['timestamp']).dt.weekday
test['is_weekend'] = test['Weekday'].apply(lambda x: 1 if x >= 5 else 0)


train['season'] = group_season(train)
test['season'] = group_season(test)

train['holiday'] = holiday(train)
test['holiday'] = holiday(test)

x = train.drop(columns=['ID', 'timestamp', 'supply(kg)', 'price(원/kg)'])
y = train['price(원/kg)']

x_test = test.drop(columns=['ID', 'timestamp'])

qual_col = ['item', 'corporation', 'location', 'season', 'holiday']

for i in qual_col:      
    le = LabelEncoder()
    x[i]=le.fit_transform(x[i])
    x_test[i]=le.transform(x_test[i])

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=1103)

In [209]:
xgb = XGBRegressor()
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_val)
rmse = mean_squared_error(y_val, xgb_pred, squared=False)
rmse

876.9377080315774

In [135]:
cat_col = ['item', 'corporation', 'location', 'year', 'month', 'day', 'Weekday', 'is_weekend', 'season', 'holiday']
cat = CatBoostRegressor(cat_features=cat_col, verbose=100)
cat.fit(x_train, y_train)
cat_pred = cat.predict(x_val)
rmse = mean_squared_error(y_val, cat_pred, squared=False)
rmse

Learning rate set to 0.075357
0:	learn: 1944.5664084	total: 25ms	remaining: 25s
100:	learn: 999.8997629	total: 2.56s	remaining: 22.8s
200:	learn: 959.7222024	total: 5.22s	remaining: 20.7s
300:	learn: 942.5196092	total: 7.93s	remaining: 18.4s
400:	learn: 928.8785060	total: 10.8s	remaining: 16.1s
500:	learn: 918.2622981	total: 13.6s	remaining: 13.5s
600:	learn: 907.1047097	total: 16.6s	remaining: 11s
700:	learn: 897.5341786	total: 19.8s	remaining: 8.45s
800:	learn: 890.3444058	total: 23.1s	remaining: 5.73s
900:	learn: 883.5608619	total: 26.4s	remaining: 2.9s
999:	learn: 875.0861388	total: 29.4s	remaining: 0us


917.6474170957279

In [142]:
lgbm = LGBMRegressor()
lgbm.fit(x_train, y_train)
lgbm_pred = lgbm.predict(x_val)
rmse = mean_squared_error(y_val, lgbm_pred, squared=False)
rmse

887.9576513290516

In [82]:
cat_col = ['item', 'corporation', 'location', 'year', 'month', 'day', 'season']
cat = CatBoostRegressor(cat_features=cat_col, verbose=100)
cat.fit(x, y)
cat_pred = cat.predict(x_test)

Learning rate set to 0.078061
0:	learn: 1954.9125716	total: 29.6ms	remaining: 29.6s
100:	learn: 1038.7122630	total: 2.44s	remaining: 21.7s
200:	learn: 997.6960760	total: 4.77s	remaining: 19s
300:	learn: 985.1579908	total: 7.21s	remaining: 16.7s
400:	learn: 975.3662248	total: 9.7s	remaining: 14.5s
500:	learn: 966.2400826	total: 12s	remaining: 12s
600:	learn: 957.9605702	total: 14.5s	remaining: 9.6s
700:	learn: 951.7497680	total: 17s	remaining: 7.25s
800:	learn: 945.0388660	total: 19.5s	remaining: 4.85s
900:	learn: 938.1562504	total: 21.9s	remaining: 2.4s
999:	learn: 931.2016044	total: 24.3s	remaining: 0us


In [47]:
cat_pred = np.round(cat_pred)

In [50]:
submission = pd.read_csv(r'data\sample_submission.csv')
submission['answer'] = cat_pred
submission.to_csv('./baseline_submission.csv', index=False)