In [1]:
# Optional tooling: try to (re)load the `lab_black` IPython extension
# (presumably the Black auto-formatter for JupyterLab -- confirm).
# If loading raises ImportError, the message is printed and execution
# continues, so the extension is not required to run the cells below.
try:
    %reload_ext lab_black
except ImportError as error:
    print(error)

import numpy as np
import pandas as pd

import xgboost as xgb
from sklearn.model_selection import KFold

In [2]:
# Read the sample-submission template and the raw test/train tables.
submission = pd.read_csv("data/sample_submission.csv")
test_df = pd.read_csv("data/test.csv")
train_df = pd.read_csv("data/train.csv")

# Show the (rows, columns) shape of each frame as a quick sanity check.
tuple(frame.shape for frame in (train_df, test_df, submission))

((10000, 76), (10000, 72), (10000, 5))

In [3]:
def train_model(x_data, y_data, k=5, num_boost_round=1000):
    """Train one XGBoost regressor per K-fold split and return all of them.

    The data is split with a shuffled ``KFold`` (``random_state=0``); for every
    split a booster is trained on the training part while MAE is reported on
    both the training and the held-out part.

    Parameters
    ----------
    x_data : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y_data : array-like of shape (n_samples,)
        Target values aligned row-by-row with ``x_data`` (Series, ndarray or
        list). Rows are selected by position, so the index of a Series is
        irrelevant.
    k : int, default 5
        Number of folds, and therefore the number of boosters returned.
    num_boost_round : int, default 1000
        Number of boosting rounds per fold.

    Returns
    -------
    list of xgboost.Booster
        One trained booster per fold, in fold order.
    """
    models = []

    # KFold.split yields *positional* indices. Converting the target to a plain
    # ndarray guarantees positional selection; `y_data[train_idx]` on a Series
    # would be a label lookup and breaks for any non-default index.
    labels = np.asarray(y_data)

    # Identical for every fold, so build it once outside the loop.
    params = {"objective": "reg:squarederror", "eval_metric": "mae", "seed": 0}

    k_fold = KFold(n_splits=k, shuffle=True, random_state=0)

    for train_idx, eval_idx in k_fold.split(x_data):
        X_train, y_train = x_data.iloc[train_idx], labels[train_idx]
        X_eval, y_eval = x_data.iloc[eval_idx], labels[eval_idx]

        dtrain = xgb.DMatrix(data=X_train, label=y_train)
        dval = xgb.DMatrix(data=X_eval, label=y_eval)

        # Both sets are monitored so the train/eval MAE gap is visible in the log.
        wlist = [(dtrain, "train"), (dval, "eval")]

        # NOTE(review): no early stopping is configured, so every fold runs all
        # rounds even when the eval metric has stopped improving.
        model = xgb.train(
            params=params,
            dtrain=dtrain,
            num_boost_round=num_boost_round,
            verbose_eval=1000,
            evals=wlist,
        )
        models.append(model)

    return models

In [4]:
import os

# Make sure the output directory for the per-encoding result CSVs exists.
# exist_ok=True makes this idempotent and avoids the check-then-create race
# of testing os.path.exists() before calling makedirs().
os.makedirs("encoded_result", exist_ok=True)

In [5]:
# For every encoded feature set n = 4..10: train K-fold models per target column,
# average the fold models' predictions, and export test and train predictions.
for n in range(4, 10 + 1):
    encoded_train = pd.read_csv(f"encoded_data/encoded_rlt_train_{n}.csv", index_col=0)
    encoded_test = pd.read_csv(f"encoded_data/encoded_rlt_test_{n}.csv", index_col=0)

    X_train = encoded_train
    # Ground-truth targets. `train_df` must stay untouched inside this loop,
    # otherwise later values of n would train on earlier predictions.
    y_train = train_df.loc[:, "hhb":"na"]

    print(n)
    models = {}
    for label in y_train.columns:
        print("train column : ", label)
        models[label] = train_model(X_train, y_train[label])
        print()

    # The prediction inputs do not depend on the model, so build each DMatrix
    # once per encoding instead of once per fold model.
    dtest = xgb.DMatrix(encoded_test)
    dtrain_full = xgb.DMatrix(encoded_train)

    # Test predictions: mean over the fold models, written into `submission`
    # in place (the columns are overwritten on every iteration).
    for col in models:
        preds = [model.predict(dtest) for model in models[col]]
        submission[col] = np.mean(preds, axis=0)

    submission.to_csv(f"encoded_result/encoded_{n}.csv", index=False)

    # Train-set predictions go into a copy so the true labels in `train_df`
    # are preserved for the next encoding. These are in-sample predictions:
    # the fold models were trained on splits of this same data.
    train_pred_df = train_df.copy()
    for col in models:
        preds = [model.predict(dtrain_full) for model in models[col]]
        train_pred_df[col] = np.mean(preds, axis=0)

    train_pred_df.to_csv(f"encoded_result/train_encoded_{n}.csv", index=False)

4
train column :  hhb
[0]	train-mae:5.35348	eval-mae:5.26427
[999]	train-mae:0.31755	eval-mae:2.71913
[0]	train-mae:5.33550	eval-mae:5.38564
[999]	train-mae:0.30608	eval-mae:2.70105
[0]	train-mae:5.33358	eval-mae:5.38950
[999]	train-mae:0.30169	eval-mae:2.69816
[0]	train-mae:5.32920	eval-mae:5.41263
[999]	train-mae:0.30094	eval-mae:2.70751
[0]	train-mae:5.35471	eval-mae:5.25258
[999]	train-mae:0.30678	eval-mae:2.72927

train column :  hbo2
[0]	train-mae:2.45483	eval-mae:2.49652
[999]	train-mae:0.10223	eval-mae:0.92713
[0]	train-mae:2.45878	eval-mae:2.47459
[999]	train-mae:0.10619	eval-mae:0.89456
[0]	train-mae:2.46534	eval-mae:2.43288
[999]	train-mae:0.10243	eval-mae:0.90316
[0]	train-mae:2.46220	eval-mae:2.45119
[999]	train-mae:0.10117	eval-mae:0.91585
[0]	train-mae:2.46275	eval-mae:2.44811
[999]	train-mae:0.10645	eval-mae:0.90520

train column :  ca
[0]	train-mae:6.01293	eval-mae:6.00948
[999]	train-mae:0.30106	eval-mae:2.70681
[0]	train-mae:6.00767	eval-mae:6.04479
[999]	train-mae:0