In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score

from hyperopt import hp, STATUS_OK, fmin, tpe, Trials
from xgboost import XGBClassifier

In [None]:
# Report the installed hyperopt and NumPy versions, one per line.
import hyperopt

for module in (hyperopt, np):
    print(module.__version__)

#!pip install -U numpy

0.2
1.22.4


In [None]:
def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable and
    seeds both Python's ``random`` module and NumPy's global RNG.

    Args:
        seed: Integer seed value.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(37)  # fix the seed

In [None]:
# Base directory on the mounted drive; later cells also write results under it.
data_path = '/content/drive/MyDrive/Colab Notebooks/'

# Read the three CSV files in one pass (train, test, submission template).
train, test, sample_submission = (
    pd.read_csv(data_path + rel_path)
    for rel_path in (
        'dataset/train.csv',
        'dataset/test.csv',
        'dataset/sample_submission.csv',
    )
)

# **Preprocessing**

## **LabelEncoder**

In [None]:
cols = ['LINE', 'PRODUCT_CODE']

for col in cols:
    # The encoder is fitted on the training values only; train is encoded in place.
    le = LabelEncoder().fit(train[col])
    train[col] = le.transform(train[col])

    # Labels found only in test are appended to the encoder's classes_
    # (after the fitted ones) before the test column is transformed.
    known = le.classes_
    unseen = [label for label in np.unique(test[col]) if label not in known]
    if unseen:
        le.classes_ = np.append(known, unseen)
    test[col] = le.transform(test[col])

## **drop**

In [None]:
cols = ['PRODUCT_ID', 'TIMESTAMP']
# Remove the PRODUCT_ID and TIMESTAMP columns from both frames, in place.
for frame in (train, test):
    frame.drop(columns=cols, inplace=True)

## **fillna**

In [None]:
# Replace every missing value with 0 in both frames, in place.
for frame in (train, test):
    frame.fillna(0, inplace=True)

In [None]:
# Show the (rows, columns) of both frames after preprocessing.
print(f'train shape : {train.shape} \ntest shape : {test.shape}')

train shape : (598, 2879) 
test shape : (310, 2877)


In [None]:
# Preview the first training row after encoding, dropping and filling.
train.head(1)

Unnamed: 0,Y_Class,Y_Quality,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,1,0.533433,2,0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.34,40.89,32.56,34.09,77.77,0.0,0.0,0.0,0.0,0.0


In [None]:
# Preview the first test row after encoding, dropping and filling.
test.head(1)

Unnamed: 0,LINE,PRODUCT_CODE,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,...,X_2866,X_2867,X_2868,X_2869,X_2870,X_2871,X_2872,X_2873,X_2874,X_2875
0,5,2,2.0,94.0,0.0,45.0,10.0,0.0,51.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# **Model**

## **split**

In [None]:
# Both Y_* columns are removed from the features; only Y_Class is used as the target.
label_cols = ['Y_Class', 'Y_Quality']
X = train.drop(label_cols, axis=1)  # features
y = train['Y_Class']                # target

# The test frame is used as-is for the final prediction.
X_tst = test

In [None]:
# Hold out 7% of the training rows as a validation set, stratified on the
# target so each class keeps its proportion in both parts.
# random_state is pinned (same value as seed_everything(37)) so the split is
# identical every time this cell runs, regardless of how much of the global
# NumPy RNG state earlier cells have consumed.
X_trn, X_val, y_trn, y_val = train_test_split(
    X, y, 
    test_size=0.07,
    stratify=y,
    random_state=37
)

In [None]:
# Show the shapes of the training and validation parts.
print(
    f'X_trn (data) : {X_trn.shape} \n'
    f'y_trn (target) : {y_trn.shape} \n'
    f'X_val (data) : {X_val.shape} \n'
    f'y_val (target) : {y_val.shape}'
)

X_trn (data) : (556, 2877) 
y_trn (target) : (556,) 
X_val (data) : (42, 2877) 
y_val (target) : (42,)


## **xgb parameters**

- **booster**
  - 사용할 부스터 선택
  - 'gbtree' : 트리 모델
  - 'gblinear' : 선형 모델
- **n_estimators**
  - 트리 모델 개수
  - 100
- **learning_rate**
  - 학습 단계별로 이전 결과를 얼마나 반영할지 설정 (작을수록 오래 걸림)
  - 0.3 `[0, 1] --- 0.01 ~ 0.2`
- min_split_loss
  - 트리의 리프 노드에서 추가 분할을 만드는 데 필요한 최소 손실 감소 (클수록 보수적임)
  - 0 `[0, ∞]`
  - ***과적합이 심하다면, learning_rate 감소(0.01 ~ 0.1), n_estimators 증가***
- **max_depth**
  - 트리 최대 깊이 (클수록 과적합 가능성 높아짐)
  - 6 `[0, ∞] --- 3 ~ 10`
  - ***과적합이 심하다면, 감소***
- **min_child_weight**
  - child에 필요한 instance weight (hessian)의 최소 합계 (클수록 보수적임)
  - 1 `[0, ∞]`
  - ***과적합이 심하다면, 증가***
- max_delta_step
  - 클래스가 극도로 불균형할 경우, 로지스틱 회귀에 도움이 됨
  - 0 `[0, ∞] --- 1 ~ 10`
- **subsample**
  - training instances의 subsample 비율로, 과적합 방지 역할 (0.5로 설정하면 학습 데이터의 절반을 랜덤하게 샘플링)
  - 1 `(0, 1] --- 0.5 ~ 1`
  - ***과적합이 심하다면, 조정***
- sampling_method
  - training instances를 샘플링하는 데 사용할 method
  - uniform : 각 training instances가 선택될 확률이 동일
- colsample_by* 매개 변수는 누적적으로 작동
  - **colsample_bytree**
    - 각 트리를 구성할 때 열의 하위 샘플 비율
    - 1 `(0, 1] --- 0.5 ~ 1`
    - ***과적합이 심하다면, 조정***
  - colsample_bylevel
    - 각 수준에 대한 열의 부분표본 비율
    - 1 `(0, 1]`
  - colsample_bynode
    - 각 노드에 대한 열의 하위 표본 비율
    - 1 `(0, 1]`
- **scale_pos_weight**
  - 불균형 클래스에 유용 `음성 데이터 수 / 양성 데이터 수` (이진 분류용 파라미터이며, 다중 분류에서는 적용되지 않을 수 있음)
- **eval_metric**
  - logloss (이진 분류), 다중 분류에서는 mlogloss

## **HyperOpt**

In [None]:
# Hyperopt search space for the XGBClassifier hyperparameters.
# The quniform entries are sampled as floats; objective_func casts them to int.
space = dict(
    learning_rate=hp.uniform('learning_rate', 0.01, 0.2),
    min_split_loss=hp.uniform('min_split_loss', 0, 1),
    max_depth=hp.quniform('max_depth', 3, 15, 1),
    min_child_weight=hp.quniform('min_child_weight', 1, 10, 1),
    subsample=hp.uniform('subsample', 0.5, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    scale_pos_weight=hp.quniform('scale_pos_weight', 1, 6, 2),
)
# Not searched: 'n_estimators' : hp.quniform('n_estimators', 1000, 2000, 100),

def objective_func(space):
  """Hyperopt objective: score one sampled hyperparameter set.

  Builds an XGBClassifier from the sampled values and evaluates it with
  2-fold cross-validated accuracy on X_trn / y_trn.

  Args:
    space: Mapping with the keys 'learning_rate', 'min_split_loss',
      'max_depth', 'min_child_weight', 'subsample', 'colsample_bytree'
      and 'scale_pos_weight'.

  Returns:
    Dict with 'loss' (the negated mean accuracy, so that a lower loss means
    a higher accuracy) and 'status' (STATUS_OK).
  """
  float_params = ('learning_rate', 'min_split_loss', 'subsample', 'colsample_bytree')
  int_params = ('max_depth', 'min_child_weight', 'scale_pos_weight')

  params = {name: space[name] for name in float_params}
  # These values arrive as floats from the sampler and are truncated to int.
  params.update({name: int(space[name]) for name in int_params})

  model = XGBClassifier(**params)
  fold_scores = cross_val_score(model, X_trn, y_trn, scoring='accuracy', cv=2)

  return {'loss' : -1 * np.mean(fold_scores), 'status' : STATUS_OK}

In [None]:
# TPE search over `space` with 100 evaluations; per-trial results go into `trials`.
# NOTE(review): no rstate is passed to fmin, so the search is presumably not
# covered by seed_everything — confirm if reproducible tuning is required.
trials = Trials()
best = fmin(objective_func, space, algo=tpe.suggest, max_evals=100, trials=trials)
print('best :', best)

100%|██████████| 100/100 [1:53:20<00:00, 68.00s/it, best loss: -0.7553956834532374]
best : {'colsample_bytree': 0.7033642809778637, 'learning_rate': 0.14042298986174134, 'max_depth': 5.0, 'min_child_weight': 2.0, 'min_split_loss': 0.7002219052816658, 'scale_pos_weight': 2.0, 'subsample': 0.9753902224969269}


In [None]:
# Final model built from the hyperparameters returned by fmin (`best`) rather
# than from values hand-copied out of a previous run's printed output, so the
# model always matches the search that was actually executed.
# Continuous values keep the 5-decimal rounding; quantized values come back
# from hyperopt as floats (e.g. 5.0) and are cast to int.
xgb_best = XGBClassifier(
    learning_rate = round(best['learning_rate'], 5),
    min_split_loss = round(best['min_split_loss'], 5),
    max_depth = int(best['max_depth']),
    min_child_weight = int(best['min_child_weight']),
    subsample = round(best['subsample'], 5),
    colsample_bytree = round(best['colsample_bytree'], 5),
    scale_pos_weight = int(best['scale_pos_weight']),
    n_estimators = 500  # not part of the search; fixed upper bound on boosting rounds
)

# Evaluation sets in order: training part first, validation part second.
# The logged output reports that the second one (validation_1) drives early stopping.
evals = list(zip((X_trn, X_val), (y_trn, y_val)))
xgb_best.fit(
    X_trn, y_trn,
    eval_set=evals,
    eval_metric='mlogloss',  # 'logloss' does not work here
    early_stopping_rounds=100,
    verbose=True,
)

[0]	validation_0-mlogloss:0.974124	validation_1-mlogloss:1.01004
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 100 rounds.
[1]	validation_0-mlogloss:0.881661	validation_1-mlogloss:0.925151
[2]	validation_0-mlogloss:0.804188	validation_1-mlogloss:0.870941
[3]	validation_0-mlogloss:0.7369	validation_1-mlogloss:0.820177
[4]	validation_0-mlogloss:0.680612	validation_1-mlogloss:0.774169
[5]	validation_0-mlogloss:0.635839	validation_1-mlogloss:0.739376
[6]	validation_0-mlogloss:0.594031	validation_1-mlogloss:0.703743
[7]	validation_0-mlogloss:0.556154	validation_1-mlogloss:0.680655
[8]	validation_0-mlogloss:0.526571	validation_1-mlogloss:0.656782
[9]	validation_0-mlogloss:0.496107	validation_1-mlogloss:0.633227
[10]	validation_0-mlogloss:0.467338	validation_1-mlogloss:0.611973
[11]	validation_0-mlogloss:0.438742	validation_1-mlogloss:0.607481
[12]	validation_0-mlogloss:0.417062	valid

XGBClassifier(colsample_bytree=0.70336, learning_rate=0.14042, max_depth=5,
              min_child_weight=2, min_split_loss=0.70022, n_estimators=500,
              objective='multi:softprob', scale_pos_weight=2,
              subsample=0.97539)

In [None]:
# Predict the class of every test row and store it in the submission template.
pred = xgb_best.predict(X_tst)
sample_submission['Y_Class'] = pred

# Write the submission next to the dataset folder, without the index column.
submission_path = data_path + 'xgb_hyperopt_1.csv'
sample_submission.to_csv(submission_path, index=False)

# **Result**

In [None]:
# Read the saved submission back and count how many rows were assigned to each class.
xgb_hyperopt_1 = pd.read_csv(data_path + 'xgb_hyperopt_1.csv')
xgb_hyperopt_1['Y_Class'].value_counts()

1    264
0     43
2      3
Name: Y_Class, dtype: int64