In [5]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import RandomForestRegressor

def seed_everything(seed):
    """Seed the random number sources used by this script.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the standard-library ``random`` module and NumPy's global
    random state with ``seed``.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``.
    """
    # NOTE(review): hash randomization is presumably configured at
    # interpreter startup, so this assignment likely only affects child
    # processes -- confirm if hash-order reproducibility matters.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

# Fix the global seeds before any data handling.
seed_everything(42)

# Load the training and test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Separate the regression target from the features; 'ID' is dropped from
# both feature frames.
train_y = train['Calories_Burned']
train_x = train.drop(columns=['ID', 'Calories_Burned'])
test_x = test.drop(columns='ID')

# Columns that are integer-encoded with a LabelEncoder fitted on the
# training data.
ordinal_features = ['Weight_Status', 'Gender']

for feature in ordinal_features:
    le = LabelEncoder()
    # Learn the classes from the training column and encode it in one step.
    train_x[feature] = le.fit_transform(train_x[feature])

    # Labels that occur only in the test column are appended to the end of
    # the known classes (in np.unique order) so that transform() does not
    # raise on them.
    unseen_labels = [
        label
        for label in np.unique(test_x[feature])
        if label not in le.classes_
    ]
    if unseen_labels:
        le.classes_ = np.append(le.classes_, unseen_labels)
    test_x[feature] = le.transform(test_x[feature])

# CatBoost model hyperparameters
cat_params = dict(
    learning_rate=0.20909079092170735,
    depth=6,
    od_pval=0.236844398775451,
    model_size_reg=0.30614059763442997,
    l2_leaf_reg=5.535171839105427,
    loss_function="RMSE",
    random_seed=42,
    verbose=1,
)

# Random forest model hyperparameters
rf_params = dict(
    n_estimators=100,
    max_depth=6,
    random_state=91,
    n_jobs=-1,
)

# The two base models that are blended into the ensemble
cat_model = CatBoostRegressor(**cat_params)
rf_model = RandomForestRegressor(**rf_params)

# Shuffled K-fold splitter with a fixed seed
n_splits = 7
kf = KFold(n_splits=n_splits, shuffle=True, random_state=91)

# Out-of-fold prediction buffers: one slot per training row
cat_val_preds = np.zeros(train_x.shape[0])
rf_val_preds = np.zeros(train_x.shape[0])

# Test prediction accumulators: one slot per test row
test_preds_cat = np.zeros(test_x.shape[0])
test_preds_rf = np.zeros(test_x.shape[0])

# For every fold: refit both models on the training part, store their
# predictions for the held-out rows, and add 1/n_splits of their test
# predictions to the running test averages.
for train_idx, val_idx in kf.split(train_x, train_y):
    X_train, y_train = train_x.iloc[train_idx], train_y.iloc[train_idx]
    X_val, y_val = train_x.iloc[val_idx], train_y.iloc[val_idx]

    # NOTE(review): the held-out fold is also passed as CatBoost's eval_set
    # with use_best_model=True, so the CatBoost predictions stored below come
    # from an iteration selected on that same fold; the resulting validation
    # score is likely optimistic.
    cat_model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        use_best_model=True,
        verbose=0,
    )
    cat_val_preds[val_idx] = cat_model.predict(X_val)
    test_preds_cat += cat_model.predict(test_x) / n_splits

    rf_model.fit(X_train, y_train)
    rf_val_preds[val_idx] = rf_model.predict(X_val)
    test_preds_rf += rf_model.predict(test_x) / n_splits

def _rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# RMSE of the CatBoost out-of-fold predictions
cat_val_rmse = _rmse(train_y, cat_val_preds)
print(f'CatBoost Validation RMSE: {cat_val_rmse:.4f}')

# RMSE of the random forest out-of-fold predictions
rf_val_rmse = _rmse(train_y, rf_val_preds)
print(f'RandomForest Validation RMSE: {rf_val_rmse:.4f}')

# RMSE of the weighted blend (0.6 CatBoost + 0.4 random forest)
ensemble_val_preds = 0.6 * cat_val_preds + 0.4 * rf_val_preds
ensemble_rmse = _rmse(train_y, ensemble_val_preds)
print(f'Ensemble Validation RMSE: {ensemble_rmse:.4f}')

# Write one submission file per prediction source: CatBoost alone, random
# forest alone, and the 0.6 / 0.4 weighted ensemble (written last, so
# `submission` ends up holding the ensemble frame).
submission_outputs = {
    'n_submission_cat.csv': test_preds_cat,
    'n_submission_rf.csv': test_preds_rf,
    'n_submission_ensemble.csv': 0.6 * test_preds_cat + 0.4 * test_preds_rf,
}

for output_path, predictions in submission_outputs.items():
    submission = pd.DataFrame({'ID': test['ID'], 'Calories_Burned': predictions})
    submission.to_csv(output_path, index=False)

CatBoost Validation RMSE: 0.9498
RandomForest Validation RMSE: 7.6964
Ensemble Validation RMSE: 3.2190
