In [7]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm

### Data

In [8]:
# Read the three competition tables from ../data in a single pass;
# the names are bound in the same order as the paths below.
train, test, international_trade = map(
    pd.read_csv,
    (
        "../data/train.csv",
        "../data/test.csv",
        "../data/international_trade.csv",
    ),
)

In [9]:
# For the time series model: keep the first six characters of ID as a key.
# The vectorized .str accessor replaces a per-row Python lambda.
train["case"] = train["ID"].str[:6]

## For the machine learning model: split the parsed timestamp into
## calendar parts, then drop the columns that are no longer used.
train["timestamp"] = pd.to_datetime(train["timestamp"])
train = train.assign(
    year=train["timestamp"].dt.year,
    month=train["timestamp"].dt.month,
    day=train["timestamp"].dt.day,
).drop(columns=["ID", "supply(kg)"])

# Feature frame and regression target used by the cells below.
X_train = train[["case", "timestamp", "item", "corporation", "location", "year", "month", "day"]]
y_train = train["price(원/kg)"]

In [10]:
## OneHotEncoding
# Each categorical column is replaced by one indicator column per category,
# named after the category value itself.
for catfeature in ["item", "corporation", "location"]:
    # The default encoder returns a sparse matrix; densifying it with
    # .toarray() avoids the `sparse=` keyword, which newer scikit-learn
    # releases renamed to `sparse_output=` and then removed.
    encoder = OneHotEncoder()
    fitted_encoder = encoder.fit(X_train[[catfeature]])
    cat = pd.DataFrame(
        # Transform the same DataFrame slice that was used for fitting so the
        # encoder sees consistent feature names.
        fitted_encoder.transform(X_train[[catfeature]]).toarray(),
        columns=[str(col) for col in fitted_encoder.categories_[0]],
        # Reuse X_train's index so the column-wise concat below lines up row
        # by row even when the index is not a default RangeIndex.
        index=X_train.index,
    )
    X_train = X_train.drop(columns=catfeature)
    X_train = pd.concat([X_train, cat], axis=1)

### KFold

In [11]:
# "case" (string key) and "timestamp" (raw datetime) are not passed to the
# model; year/month/day and the one-hot indicator columns remain.
X_train = X_train.drop(["case", "timestamp"], axis="columns")

In [12]:
def kfold(X, y, n_splits:int, shuffle:bool, random_state:int=None, verbose:bool=False):
    '''
    Cross-validate a default LightGBM regressor and return its mean RMSE.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Rows are selected by position, so any index works.
    y : array-like
        Target values, aligned row by row with ``X``.
    n_splits : int
        Number of folds.
    shuffle : bool
        Whether to shuffle the rows before splitting. When True, pass
        ``random_state`` as well so the folds are reproducible.
    random_state : int, optional
        Seed for the shuffle. Forced to None when ``shuffle`` is False,
        because KFold does not accept a seed without shuffling.
    verbose : bool, default False
        When True, print the size and RMSE of every fold and the mean RMSE.

    Returns
    -------
    float
        Mean of the per-fold validation RMSE values.
    '''
    if not shuffle:
        random_state = None

    # KFold yields positional row numbers, so index positionally: a plain
    # array for y and .iloc for X. Label-based lookup (.loc / y[...]) would
    # pick wrong rows or raise whenever the index is not 0..n-1.
    y_values = np.asarray(y)

    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)
    scores = []
    for i, (index_train, index_valid) in enumerate(kf.split(X)):
        X_tr, y_tr = X.iloc[index_train], y_values[index_train]
        X_val, y_val = X.iloc[index_valid], y_values[index_valid]

        # verbose=-1 silences LightGBM's own logging. No eval_set is passed:
        # without early stopping its results were never read, and it needed
        # fit(verbose=False), which LightGBM 4 no longer accepts.
        model = lightgbm.LGBMRegressor(verbose=-1)
        model.fit(X_tr, y_tr)
        y_val_pred = model.predict(X_val)

        # Take the square root explicitly instead of relying on the
        # deprecated `squared=False` flag of mean_squared_error.
        each_fold_rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_val_pred))
        scores.append(each_fold_rmse)

        if verbose:
            print(f"Fold: {i}")
            print(f"     Train Set len: {len(index_train)}")
            print(f"     Validation Set len: {len(index_valid)}")
            print(f"     Rmse: {each_fold_rmse}")

    score = np.mean(scores)
    if verbose:
        print("-----------------------------------")
        print(f"Mean Rmse: {score}")

    return score

In [13]:
# 4-fold shuffled cross-validation with a fixed seed; the value displayed by
# this cell is the mean RMSE returned by kfold.
kfold(
    X=X_train,
    y=y_train,
    n_splits=4,
    shuffle=True,
    random_state=0,
)

Fold: 0


     Train Set len: 44547
     Validation Set len: 14850
     Rmse: 1190.121824323517
Fold: 1
     Train Set len: 44548
     Validation Set len: 14849
     Rmse: 1218.571740261907
Fold: 2
     Train Set len: 44548
     Validation Set len: 14849
     Rmse: 1181.7742074509133
Fold: 3
     Train Set len: 44548
     Validation Set len: 14849
     Rmse: 1207.2078747610612
-----------------------------------
Mean Rmse: 1199.4189116993498


1199.4189116993498