## XGBoost

In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")
diamonds = sns.load_dataset("diamonds") ## data
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [2]:
from sklearn.model_selection import train_test_split
X, y = diamonds.drop('price', axis=1), diamonds[['price']]

cats = X.select_dtypes(exclude=np.number).columns.tolist()
# Convert to Pandas category
for col in cats:
   X[col] = X[col].astype('category')

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
import xgboost as xgb

In [None]:
dtrain_reg = xgb.DMatrix(X_train, y_train, enable_categorical=True)
dtest_reg = xgb.DMatrix(X_test, y_test, enable_categorical=True)
params = {"objective": "reg:squarederror"} ## 목적함수. 손실함수랑 똑같은 말이라고 볼 수 있음.

n = 500
model = xgb.train(
   params = params,
   dtrain = dtrain_reg, ## data
   num_boost_round = n, ## 몇 개의 나무를 쓸 것이냐.
)

> `DMatrix`는 써도 되고, 안써도 되나, 쓰는 쪽이 더 좋음. 약간 데이터프레임 인코딩하는 느낌으로다가.
>
> 2차 미분까지 사용하므로, `loss_function`은 반드시 `convex`한 형태여야 함.
>
> 존나 오래걸리네

In [None]:
from sklearn.metrics import mean_squared_error

preds = model.predict(dtest_reg)
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 564.712


In [None]:
n=100
params = {"objective": "reg:squarederror", "num_boost_round": 14}
evals = [(dtrain_reg, "train"), (dtest_reg, "validation")]

model = xgb.train(
   params=params,
   dtrain=dtrain_reg,
   evals=evals
)

[0]	train-rmse:2874.49146	validation-rmse:2817.90814
[1]	train-rmse:2092.16823	validation-rmse:2054.95423
[2]	train-rmse:1552.12189	validation-rmse:1527.41146
[3]	train-rmse:1187.31114	validation-rmse:1177.15463
[4]	train-rmse:945.00210	validation-rmse:946.17688
[5]	train-rmse:787.62126	validation-rmse:798.39908
[6]	train-rmse:688.66889	validation-rmse:710.32109
[7]	train-rmse:626.37436	validation-rmse:654.59324
[8]	train-rmse:589.02723	validation-rmse:624.32385
[9]	train-rmse:565.28857	validation-rmse:604.03193


In [None]:
preds = model.predict(dtest_reg)
rmse = mean_squared_error(y_test, preds, squared=False)

print(f"RMSE of the base model: {rmse:.3f}")

RMSE of the base model: 604.032


In [None]:
from sklearn.model_selection import GridSearchCV
from hyperopt import fmin, tpe, hp, STATUS_OK ## 외부 함수인가봄?
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV


X_train_encoded = X_train.copy()
X_test_encoded = X_test.copy()

yc_train = y_train.copy()
yc_train[:] = 0
yc_train[y_train[:] > np.mean(y_train[:])] = 1

yc_test = y_test.copy()
yc_test[:] = 0
yc_test[y_test[:] > np.mean(y_test[:])] = 1

yc_test = y_test
yc_train = y_train

le = LabelEncoder()

# Encode categorical features
for col in ['cut', 'color', 'clarity']:
    X_train_encoded[col] = le.fit_transform(X_train_encoded[col])
    X_test_encoded[col] = le.transform(X_test_encoded[col]) # Use the same encoder for test data

## 그리드를 설정함. 3*3*2개의 경우를 서치함.
param_grid = {"max_depth":    [4, 5, 6],
              "n_estimators": [500, 600, 700],
              "learning_rate": [0.01, 0.015]}
xgb_model = xgb.XGBClassifier()
xgb_model = xgb.XGBRegressor()

# try out every combination of the above values
search = GridSearchCV(xgb_model, param_grid, cv=2).fit(X_train_encoded, yc_train.values.ravel()) ## 하이퍼 파라미터 튜닝. 그리드 서치로 찾음. 2-fold
print("The best hyperparameters are ",search.best_params_)

## random하게 찾아보는 것.
# space = {
#    'max_depth': hp.quniform('max_depth', 2, 8, 1),
#    'learning_rate': hp.loguniform('learning_rate', -5, -2),
#    'subsample': hp.uniform('subsample', 0.5, 1)
#}
#def objective(params):
#    params['max_depth'] = int(params['max_depth'])
#    xgb_model = xgb.XGBRegressor(eval_metric='rmse', **params)
#    xgb_model.fit(X_train_encoded, yc_train.values.ravel())
#    y_pred = xgb_model.predict(X_test_encoded)
#    score = accuracy_score(yc_test, y_pred)
#    return {'loss': -score, 'status': STATUS_OK}

# Perform the optimization
#best_params = fmin(objective, space, algo=tpe.suggest, max_evals=100)
#print("Best set of hyperparameters: ", best_params)

The best hyperparameters are  {'learning_rate': 0.015, 'max_depth': 6, 'n_estimators': 700}


`-` 최적의 파라미터를 모형에 적용

In [32]:
b_param = {'max_depth': 6, 'learning_rate': 0.015, 'n_estimators': 700}
#b_param = {'max_depth': 8, 'learning_rate': 0.078, 'subsample': 0.98}
xgb_model = xgb.XGBClassifier(**b_param)
xgb_model = xgb.XGBRegressor(**b_param)
xgb_model.fit(X_train_encoded, yc_train.values.ravel())

y_pred = xgb_model.predict(X_test_encoded)
#score = accuracy_score(yc_test, y_pred)
score = np.mean((np.array(yc_test) - np.array(y_pred))**2)

print(score)

30566674.262810536


https://velog.io/@hyunicecream/GridSearchCV%EB%9E%80-%EC%96%B4%EB%96%BB%EA%B2%8C-%EC%82%AC%EC%9A%A9%ED%95%A0%EA%B9%8C