In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor  # 회귀 문제로 수정
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error
from math import sqrt

In [2]:
train = pd.read_csv('./train_data.csv', index_col=0)
test = pd.read_csv('./test_data.csv', index_col=0)

In [3]:
X = train.drop('Price', axis=1)  # 특성 (Features)
y = train['Price']  # 타겟 (Target)

# 학습 데이터와 테스트 데이터로 분할 (80% 학습, 20% 테스트)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

# XGBoost 모델 초기화
xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42)

# 하이퍼파라미터 그리드 정의
param_grid_xgb = {
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1]
}

# 그리드 서치 수행
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1, verbose=1)
grid_search_xgb.fit(X, y)

# 최적 파라미터 출력
print(f"Best parameters (XGBoost): {grid_search_xgb.best_params_}")
print(f"Best RMSE (XGBoost): {-grid_search_xgb.best_score_}")


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters (XGBoost): {'colsample_bytree': 0.9, 'learning_rate': 0.3, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 1}
Best RMSE (XGBoost): 1.4029183009358164


In [6]:
# 최적 모델로 예측 수행
best_model_xgb = grid_search_xgb.best_estimator_
y_pred_xgb = best_model_xgb.predict(test)

In [7]:
submission = pd.read_csv('./sample_submission.csv')
submission['가격(백만원)'] = y_pred_xgb
submission.to_csv('./Grid_XGB_sample.csv', index=False)

In [12]:
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.model_selection import ParameterGrid
from tqdm import tqdm
# LightGBM 모델 초기화
lgb_model = lgb.LGBMRegressor(objective='regression', random_state=42)

# 하이퍼파라미터 그리드 정의
param_grid_lgb = {
    'learning_rate': [0.01, 0.1, 0.3],
    'num_leaves': [31, 50, 100],
    'max_depth': [-1, 5, 10],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1]
}

# 그리드 서치 수행
grid_search_lgb = GridSearchCV(estimator=lgb_model, param_grid=param_grid_lgb, scoring='neg_root_mean_squared_error', cv=3, n_jobs=-1, verbose=1)
grid_search_lgb.fit(X, y)

print(f"Best parameters (LightGBM): {grid_search_lgb.best_params_}")
print(f"Best RMSE (LightGBM): {-grid_search_lgb.best_score_}")


Fitting 3 folds for each of 243 candidates, totalling 729 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 7497, number of used features: 42
[LightGBM] [Info] Start training from score 62.331949
Best parameters (LightGBM): {'colsample_bytree': 1, 'learning_rate': 0.1, 'max_depth': -1, 'num_leaves': 31, 'subsample': 0.8}
Best RMSE (LightGBM): 1.3558846841260266


In [13]:
# 최적 모델로 예측 수행
best_model_lgb = grid_search_lgb.best_estimator_
y_pred_lgb = best_model_lgb.predict(test)

In [14]:
submission = pd.read_csv('./sample_submission.csv')
submission['가격(백만원)'] = y_pred_lgb
submission.to_csv('./Grid_LGBM_sample.csv', index=False)