In [1]:
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import os
from os.path import join
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error



In [2]:
# 데이터 로드
data_dir = "~/aiffel/kaggle_kakr_housing/data"
train_data_path = join(data_dir, 'train.csv')
sub_data_path = join(data_dir, 'test.csv')

In [3]:
data = pd.read_csv(train_data_path)
sub = pd.read_csv(sub_data_path)

In [4]:
# 1. 피처 엔지니어링
# 'price'를 타겟 변수로 분리
y = data['price']
X = data.drop(['price'], axis=1)

In [5]:
# 카테고리형 변수를 원-핫 인코딩
X = pd.get_dummies(X)
sub = pd.get_dummies(sub)
sub = sub.reindex(columns=X.columns, fill_value=0)

In [6]:
# 훈련/검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# 2. 모델 정의 및 하이퍼파라미터 튜닝
models = {
    'RandomForest': RandomForestRegressor(random_state=2019),
    'XGBRegressor': XGBRegressor(random_state=2019),
    'LGBMRegressor': LGBMRegressor(random_state=2019)
}

In [8]:
# 하이퍼파라미터 그리드 설정
param_grids = {
    'RandomForest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    'XGBRegressor': {
        'n_estimators': [100, 200],
        'max_depth': [3, 6],
        'learning_rate': [0.05, 0.1],
        'subsample': [0.8, 1.0]
    },
    'LGBMRegressor': {
        'n_estimators': [100, 200],
        'num_leaves': [31, 62],
        'learning_rate': [0.05, 0.1],
        'feature_fraction': [0.8, 1.0]
    }
}

In [9]:
# 그리드 서치 및 모델 학습
best_estimators = {}
for name, model in models.items():
    print(f"Training and tuning {name}...")
    grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_estimators[name] = grid_search.best_estimator_
    print(f"Best {name}: {grid_search.best_params_}")

Training and tuning RandomForest...
Best RandomForest: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Training and tuning XGBRegressor...




Best XGBRegressor: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Training and tuning LGBMRegressor...
Best LGBMRegressor: {'feature_fraction': 0.8, 'learning_rate': 0.05, 'n_estimators': 200, 'num_leaves': 62}


In [10]:
# 3. 모델 블렌딩 (간단한 평균 블렌딩)
predictions = np.zeros(sub.shape[0])
for name, model in best_estimators.items():
    print(f"Predicting with {name}...")
    pred = model.predict(sub)
    predictions += pred / len(best_estimators)

Predicting with RandomForest...
Predicting with XGBRegressor...
Predicting with LGBMRegressor...


In [11]:
# 검증 데이터에 대한 성능 평가
for name, model in best_estimators.items():
    val_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    print(f"{name} Validation MAE: {mae}")

RandomForest Validation MAE: 75412.98417559279
XGBRegressor Validation MAE: 71083.49356449535
LGBMRegressor Validation MAE: 69566.23641810025


In [12]:
# 4. 제출 파일 생성
submission = pd.DataFrame({
    'id': sub['id'],
    'price': predictions
})

submission.to_csv('submission.csv', index=False)
print("Submission saved!")

Submission saved!


In [None]:
# Private Score: 116160.24594