In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

from sklearn.svm import SVC
from xgboost import XGBClassifier

# Read the data

In [41]:
#와인의 속성으로 와인을 맞추는 데이터
data = pd.read_csv('./data/winequality-red.csv', encoding='utf8')

In [42]:
data.shape

(1599, 12)

In [43]:
data.head(20)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
5,7.4,0.66,0.0,1.8,0.075,13.0,40.0,0.9978,3.51,0.56,9.4,5
6,7.9,0.6,0.06,1.6,0.069,15.0,59.0,0.9964,3.3,0.46,9.4,5
7,7.3,0.65,0.0,1.2,0.065,15.0,21.0,0.9946,3.39,0.47,10.0,7
8,7.8,0.58,0.02,2.0,0.073,9.0,18.0,0.9968,3.36,0.57,9.5,7
9,7.5,0.5,0.36,6.1,0.071,17.0,102.0,0.9978,3.35,0.8,10.5,5


In [44]:
print(data['quality'].value_counts())

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64


In [45]:
# Making binary classificaion for the response variable.
bins = (2, 6.5, 8) # 2~6.5, 6.5~8로 나누겠다
group_names = [0, 1] # 0:Bad, 1:Good
data['quality'] = pd.cut(data['quality'], bins=bins, labels=group_names)

In [46]:
print(data['quality'].value_counts())

0    1382
1     217
Name: quality, dtype: int64


# Split the data

In [47]:
X = data.drop('quality', axis=1)
y = data['quality']

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

# Standardize the data

In [49]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Parameter tuning by `GridSearchCV`

## 1. SVC

In [50]:
# 비효율적 하이퍼파라미터 탐색: 54 Candidates
# param_grid = {
#     'C': [0.1, 1, 10],
#     'gamma': [0.0001, 0.001, 0.01],
#     'degree' : [2,3],
#     'kernel': ['linear', 'rbf', 'poly']
# }

param_grid = [
    {'C' : [0.1, 1, 10], 'degree': [2,3], 'kernel':['ploy']},
    {'C' : [0.1, 1, 10], 'gamma' : [0.0001, 0.001, 0.01], 'kernel':['linear', 'rbf']}
]

In [51]:
svc_grid = GridSearchCV(estimator=SVC(random_state=1234), param_grid=param_grid, cv=5, refit=True, verbose=2, n_jobs=-1)

In [52]:
svc_grid.fit(X_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


TypeError: object of type 'CategoricalDtype' has no len()

In [None]:
# Best parameter for SVC
svc_grid.best_params_

In [None]:
y_test_pred = svc_grid.predict(X_test)

In [None]:
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('=' * 60)
print('<Classification Report>')
print(metrics.classification_report(y_test, y_test_pred))
print('=' * 60)
print('<AUC>')
print(metrics.roc_auc_score(y_test, y_test_pred))

## 2. XGBoost

XGBoost는 매우 많은 하이퍼파라미터를 제공합니다. 그에 대한 설명은 아래 XGBoost 공식 문서를 참조해주세요.
- https://xgboost.readthedocs.io/en/latest/parameter.html

저는 아래 하이퍼파라미터를 주로 세팅합니다.
- max_depth: 개별 나무의 깊이
- learning_rate: Boosting 단계 별 학습율
- n_estimators: 나무의 수
- subsample: 각 나무를 학습할 때 사용하는 포인트 수의 비율 (0 초과 1 이하의 숫자로 비율 설정)
- colsample_bytree: 각 나무에서 사용하는 features 수의 비율 (0 초과 1 이하의 숫자로 비율 설정)
- gamma: 각 나무에서 분할하는데 필요한 최소 손실감소량
- reg_lambda: 각 나무에 배정되는 weights에 대하여 L2-regularization 강도

In [None]:
param_grid = {
    'max_depth': [3, 4, 5], # 얕게 학습하기 위해 작은 값을 줘야 함
    'learning_rate': [0.001, 0.01, 0.05],
    'n_estimators': [100, 200],
    'subsample': [0.7, 0.8],
#     'colsample_bytree': [0.8, 1],
#     'gamma': [0.1, 0.3, 0.5, 0.7, 0.9],
    'reg_lambda': [0.01, 0.05]
}

In [None]:
xgb = XGBClassifier(random_state=1234)
xgb_grid = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, verbose=2, n_jobs=-1)

In [None]:
xgb_grid.fit(X_train, y_train)

In [None]:
# Best parameter for XGBoost
xgb_grid.best_params_

In [None]:
y_test_pred = xgb_grid.predict(X_test)

In [None]:
print('<Confusion Matrix>')
print(metrics.confusion_matrix(y_test, y_test_pred))
print('=' * 60)
print('<Classification Report>')
print(metrics.classification_report(y_test, y_test_pred))
print('=' * 60)
print('<AUC>')
print(metrics.roc_auc_score(y_test, y_test_pred))

# Parameter tuning by `BayesianOptimization`
- https://github.com/fmfn/BayesianOptimization

In [None]:
# sklearn.model_selection.cross_val_score 
xgb = XGBClassifier()
cross_val_score(xgb, X_train, y_train, scoring='roc_auc', cv=5)

In [None]:
# !pip install bayesian-optimization

In [None]:
from bayes_opt import BayesianOptimization

In [None]:
?BayesianOptimization

In [None]:
?XGBClassifier

In [None]:
def xgboostcv(max_depth, 
              learning_rate, 
              n_estimators, 
              subsample, 
              reg_lambda,
#               colsample_bytree,
#               gamma
             ):
    xgb = XGBClassifier(n_jobs=-1,
                        max_depth=int(max_depth),
                        learning_rate=learning_rate,
                        n_estimators=int(n_estimators),
                        subsample=subsample,
                        reg_lambda=reg_lambda,
#                         colsample_bytree=colsample_bytree,
#                         gamma=gamma
                       )
    return cross_val_score(xgb, X_train, y_train, cv=5).mean()

In [None]:
xgboostBO = BayesianOptimization(xgboostcv,
                                 {'max_depth': [3, 10],
                                  'learning_rate': [0.001, 0.1],
                                  'n_estimators': [100, 200],
                                  'subsample': [0.7, 0.8],
                                  'reg_lambda': [0.01, 0.1],
#                                   'colsample_bytree': [0.8, 1],
#                                   'gamma': [0.1, 0.3, 0.5, 0.7, 0.9]
                                  })

In [None]:
xgboostBO.maximize(init_points=2, n_iter=5)

In [None]:
xgboostBO.max

In [None]:
best_params = xgboostBO.max['params']

In [None]:
best_params['max_depth']

In [None]:
xgb_final = XGBClassifier(max+depth = int(best_params['max_depth']),
                         learning_rate = best_params['learning_rate'],
                         n_estimators = int(best_params['n_estimators']),
                         reg_lambda = best_params['reg_lambda'],
                         subsample = best_params['subsample'])