## 匯入所需套件

In [None]:
import pandas as pd
import numpy as np
import random

In [None]:
import xgboost as xgb

In [None]:
from xgboost import XGBClassifier

## 匯入資料

將在 R 裡進行過 undersampling、「Y」與「N」比例為 1:1 的資料 (one.csv) 匯入 Python，以尋找最佳參數

In [None]:
# Load the dataset from one.csv (per the notebook text: the Y/N data that was
# undersampled to a 1:1 ratio in R) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="one.csv")

## 資料的轉換與選取
進行資料的轉換及選取，使資料成為可以尋找最佳參數的形式

In [None]:
# Encode the "Y1" column as integers with Series.map: 'N' -> 0 and 'Y' -> 1.
mapping = {'N': 0, 'Y': 1}
data["Y1"] = data["Y1"].map(mapping)

In [None]:
# Select the modelling inputs by column position.
# NOTE(review): the column at position 129 ends up in neither `a` nor `b` --
# confirm that skipping it is intentional and that position 130 really is Y1.
a = data.iloc[:, :129]    # positions 0..128: the columns other than Y1
b = data.iloc[:, [130]]   # position 130 only, kept as a one-column DataFrame (Y1)

## 尋找最佳參數

In [None]:
# Random search over XGBoost hyper-parameters.
#
# Each trial draws a random parameter set, scores it with k-fold cross
# validation, and keeps the set with the lowest mean test log-loss.
# Results are left in: best_param, best_seednumber, best_logloss,
# best_logloss_index (0-based row of the best round) and nround.
best_param = {}
best_seednumber = 123
best_logloss = np.inf          # np.Inf was removed in NumPy 2.0
best_logloss_index = 0

dtrain = xgb.DMatrix(a, label=b, feature_names=list(a.columns))

# Loop-invariant CV settings.
cv_nround = 100
cv_nfold = 5

# Draw from a private generator rather than the global np.random state.
# NOTE(review): xgb.cv reseeds NumPy's *global* RNG with its `seed` argument
# when it builds the folds, so sampling from np.random directly would make
# every trial after the first redraw the same values -- confirm against the
# installed xgboost version.
sampler = np.random.RandomState()

for _ in range(1000):
    param = {
        'objective': "binary:logistic",
        'max_depth': sampler.randint(6, 30),
        'eta': sampler.uniform(.01, .3),
        'gamma': sampler.uniform(0.0, 0.2),
        'subsample': sampler.uniform(.6, 1),
        'colsample_bytree': sampler.uniform(.5, .8),
        'min_child_weight': sampler.randint(1, 41),
        'max_delta_step': sampler.randint(1, 11),
    }

    seed_number = sampler.randint(0, 100)

    # Pass the seed to xgb.cv itself; seeding Python's `random` module does
    # not affect how xgb.cv splits the folds.
    # "logloss" is deliberately last in `metrics`: per the xgboost docs the
    # last metric is the one early stopping monitors.
    mdcv = xgb.cv(
        params=param,
        dtrain=dtrain,
        metrics=["auc", "rmse", "error", "logloss"],
        nfold=cv_nfold,
        num_boost_round=cv_nround,
        verbose_eval=None,
        early_stopping_rounds=8,
        maximize=False,
        seed=seed_number,
    )

    test_logloss = mdcv['test-logloss-mean']
    min_logloss = test_logloss.min()
    min_logloss_index = test_logloss.idxmin()   # first row achieving the minimum

    if min_logloss < best_logloss:
        best_logloss = min_logloss
        best_logloss_index = min_logloss_index
        best_seednumber = seed_number
        best_param = param


# Kept from the original so any later code using `random` sees the same seed.
random.seed(best_seednumber)
# Row index i of the CV table is the result after i + 1 boosting rounds, so
# the number of rounds to train is the 0-based best index plus one.
nround = best_logloss_index + 1
print('best_round = %d, best_seednumber = %d' %(nround,best_seednumber))
print('best_param : ------------------------------')
print(best_param)