# Kaggle Study 8일차(porto)

코드출처 : https://www.kaggle.com/aharless/xgboost-cv-lb-284

## 1회차

In [1]:
MAX_ROUNDS = 400
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50 

처음에는 MAX_ROUNDS를 상당히 높게 설정하고 OPTIMIZE_ROUNDS를 사용하여 적절한 수의 라운드를 설정하려고 했다.(내 판단으로는 모든 폴드 중에서 best_ntree_limit의 최대 값에 가까워야 하며, 모델이 적절히 정규화되었다면 조금 더 높을 수도 있고... 또는 상세 =을 설정할 수도 있습니다.참이고 세부 사항을 살펴본 후 모든 접힘에 적합한 여러 라운드를 찾아보십시오.) 그런 다음 Optimize_ROUNDs를 끄고 MAX_ROUNDs를 적절한 총 라운드 수로 설정하려 했다.  
  
각 폴드에 가장 적합한 라운드를 선택하여 "조기 중지"할 때의 문제는 검증 데이터에 지나치게 적합하다는 것이다.  
  
따라서 테스트 데이터를 예측하기 위한 최적의 모델을 생성하지 않을 수 있으며, 다른 모델과의 스택/결합을 위한 검증 데이터를 생성하는 데 사용될 경우 이 모델의 앙상블에 너무 많은 무게가 실리게 된다.  
  
또 다른 가능성(XGBoost의 경우 기본값인 것 같음)은 최적 라운드가 아닌 조기 스톱이 실제로 발생하는 라운드(개선 부족을 확인하는 지연 시간 포함)를 사용하는 것이다. 이렇게 하면 과적합 문제가 해결되지만(지체가 충분히 길다면) 아직까지는 도움이 되지 않은 것 같다. (모든 접힘에 대해 일정한 회차 수보다 접힘당 20라운드 조기 정지 점수를 더 못 받아 조기 정지가 실제로 맞지 않는 것 같았다.)

In [2]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc

In [3]:
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [22]:
def gini_xgb(pred,dtrain):
    labels = dtrain.get_label()
    gini_score=-eval_gini(labels,preds)
    return[('gini',gini_score)]

def add_noise(series,noise_level):
    return series * (1+noise_level*np.random.randn(len(series)))

def target_encode(trn_series=None,
                 val_series=None,
                 tst_series=None,
                 target=None,
                 min_samples_leaf=1,
                 smoothing=1,
                 noise_level=0):
    assert len(trn_series)==len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series,target],axis=1)
    
    averages = temp.groupby(by=trn_series.name)[target.name].agg(['mean','count'])
    
    smoothing = 1 / (1+np.exp(-(averages['count']-min_samples_leaf)/smoothing))
    
    prior = target.mean()
    
    averages[target.name] = prior * ( 1- smoothing) + averages['mean'] * smoothing
    averages.drop(['mean','count'],axis=1,inplace=True)
    
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_trn_series.index = trn_series.index
    
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_val_series.index = val_series.index
    
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_tst_series.index = tst_series.index
    
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)

In [5]:
train_df = pd.read_csv('C:/Users/이동훈/Desktop/github/kaggle/kagglestudy/Data/porto/train.csv',na_values='-1')
test_df = pd.read_csv('C:/Users/이동훈/Desktop/github/kaggle/kagglestudy/Data/porto/test.csv',na_values='-1')

In [9]:
train_features = ["ps_car_13","ps_reg_03","ps_ind_05_cat","ps_ind_03",
                  "ps_ind_15","ps_reg_02","ps_car_14","ps_car_12",
                  "ps_car_01_cat","ps_car_07_cat","ps_ind_17_bin","ps_car_03_cat",
                  "ps_reg_01","ps_car_15","ps_ind_01","ps_ind_16_bin",
                  "ps_ind_07_bin","ps_car_06_cat","ps_car_04_cat","ps_ind_06_bin",
                  "ps_car_09_cat","ps_car_02_cat","ps_ind_02_cat","ps_car_11",
                  "ps_car_05_cat","ps_calc_09","ps_calc_05","ps_ind_08_bin",
                  "ps_car_08_cat","ps_ind_09_bin","ps_ind_04_cat","ps_ind_18_bin",
                  "ps_ind_12_bin","ps_ind_14"]
combs = [('ps_reg_01','ps_car_02_cat'),
        ('ps_reg_01','ps_car_04_cat')]

In [10]:
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    train_df[name1] = train_df[f1].apply(lambda x: str(x)) + "_" + train_df[f2].apply(lambda x: str(x))
    test_df[name1] = test_df[f1].apply(lambda x: str(x)) + "_" + test_df[f2].apply(lambda x: str(x))
    
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    train_features.append(name1)
    
X = train_df[train_features]
test_df = test_df[train_features]

f_cats = [f for f in X.columns if "_cat" in f]

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.1

In [11]:
y_valid_pred=0*y
y_test_pred=0

In [13]:
K=5
kf = KFold(n_splits=K,random_state=1,shuffle=True)
np.random.seed(0)

In [15]:
model = XGBClassifier(n_estimators=MAX_ROUNDS,
                     max_depth=4,
                     objective='binary:logistic',
                     learning_rate=LEARNING_RATE,
                     subsample=.8,
                     min_child_weight=6,
                     colsample_bytree=.8,
                     scale_pos_weight=1.6,
                     gamma=10,
                     reg_alpha=8,
                     reg_lambda=1.3)

In [24]:
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )
    
    if OPTIMIZE_ROUNDS:
        eval_set=[(X_valid,y_valid)]
        fit_model = model.fit( X_train, y_train, 
                               eval_set=eval_set,
                               eval_metric=gini_xgb,
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        fit_model = model.fit( X_train, y_train )
        
    
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    del X_test, X_train, X_valid, y_train
    
y_test_pred /= K  

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)


Fold  0


Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1m[1] During: typing of argument at <ipython-input-3-e1bbcc8b7298> (3)[0m
[1m
File "<ipython-input-3-e1bbcc8b7298>", line 3:[0m
[1mdef eval_gini(y_true, y_prob):
[1m    y_true = np.asarray(y_true)
[0m    [1m^[0m[0m
[0m
  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mcannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>[0m
[1m
File "<ipython-input-3-e1bbcc8b7298>", line 9:[0m
[1mdef eval_gini(y_true, y_prob):
    <source elided>
    n = len(y_true)
[1m    for i in range(n-1, -1, -1):
[0m    [1m^[0m[0m
[0m[0m
  @jit
[1m
File "<ipython-input-3-e1bbcc8b7298>", line 3:[0m
[1mdef eval_gini(y_true, y_prob):
[1m    y_true = np.asarray(y_true)
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


  Gini =  0.2856959531750338

Fold  1
  Gini =  0.2825270426290394

Fold  2
  Gini =  0.2744124272744569

Fold  3
  Gini =  0.29925913576337726

Fold  4
  Gini =  0.28468083823013424

Gini for full training set:


Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1m[1] During: typing of argument at <ipython-input-3-e1bbcc8b7298> (3)[0m
[1m
File "<ipython-input-3-e1bbcc8b7298>", line 3:[0m
[1mdef eval_gini(y_true, y_prob):
[1m    y_true = np.asarray(y_true)
[0m    [1m^[0m[0m
[0m
  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mcannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>[0m
[1m
File "<ipython-input-3-e1bbcc8b7298>", line 9:[0m
[1mdef eval_gini(y_true, y_prob):
    <source elided>
    n = len(y_true)
[1m    for i in range(n-1, -1, -1):
[0m    [1m^[0m[0m
[0m[0m
  @jit
[1m
File "<ipython-input-3-e1bbcc8b7298>", line 3:[0m
[1mdef eval_gini(y_true, y_prob):
[1m    y_true = np.asarray(y_true)
[0m    [1m^[0m[0m
[0m
  state.func_ir.loc))


0.28509183378958614

In [25]:
val = pd.DataFrame()
val['id'] = id_train
val['target'] = y_valid_pred.values
val.to_csv('xgb_valid.csv', float_format='%.6f', index=False)

In [26]:
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_test_pred
sub.to_csv('xgb_submit.csv', float_format='%.6f', index=False)

## 2회차

In [28]:
MAX_ROUNDS = 400
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50

처음에는 MAX_ROUNDS를 상당히 높게 설정하고 OPTIMIZE_ROUNDS를 사용하여 적절한 수의 라운드를 설정하려고 했다.(내 판단으로는 모든 폴드 중에서 best_ntree_limit의 최대 값에 가까워야 하며, 모델이 적절히 정규화되었다면 조금 더 높을 수도 있고... 또는 상세 =을 설정할 수도 있습니다.참이고 세부 사항을 살펴본 후 모든 접힘에 적합한 여러 라운드를 찾아보십시오.) 그런 다음 Optimize_ROUNDs를 끄고 MAX_ROUNDs를 적절한 총 라운드 수로 설정하려 했다.  
  
각 폴드에 가장 적합한 라운드를 선택하여 "조기 중지"할 때의 문제는 검증 데이터에 지나치게 적합하다는 것이다.  
  
따라서 테스트 데이터를 예측하기 위한 최적의 모델을 생성하지 않을 수 있으며, 다른 모델과의 스택/결합을 위한 검증 데이터를 생성하는 데 사용될 경우 이 모델의 앙상블에 너무 많은 무게가 실리게 된다.  
  
또 다른 가능성(XGBoost의 경우 기본값인 것 같음)은 최적 라운드가 아닌 조기 스톱이 실제로 발생하는 라운드(개선 부족을 확인하는 지연 시간 포함)를 사용하는 것이다. 이렇게 하면 과적합 문제가 해결되지만(지체가 충분히 길다면) 아직까지는 도움이 되지 않은 것 같다. (모든 접힘에 대해 일정한 회차 수보다 접힘당 20라운드 조기 정지 점수를 더 못 받아 조기 정지가 실제로 맞지 않는 것 같았다.)

In [29]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc

In [30]:
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

In [31]:
def gini_xgb(pred,dtrain):
    labels = dtrain.get_label()
    gini_score=-eval_gini(labels,preds)
    return[('gini',gini_score)]

def add_noise(series,noise_level):
    return series * (1+noise_level*np.random.randn(len(series)))

def target_encode(trn_series=None,
                 val_series=None,
                 tst_series=None,
                 target=None,
                 min_samples_leaf=1,
                 smoothing=1,
                 noise_level=0):
    assert len(trn_series)==len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series,target],axis=1)
    
    averages = temp.groupby(by=trn_series.name)[target.name].agg(['mean','count'])
    
    smoothing = 1 / (1+np.exp(-(averages['count']-min_samples_leaf)/smoothing))
    
    prior = target.mean()
    
    averages[target.name] = prior * ( 1- smoothing) + averages['mean'] * smoothing
    averages.drop(['mean','count'],axis=1,inplace=True)
    
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_trn_series.index = trn_series.index
    
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_val_series.index = val_series.index
    
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    ft_tst_series.index = tst_series.index
    
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)

In [32]:
train_df = pd.read_csv('C:/Users/이동훈/Desktop/github/kaggle/kagglestudy/Data/porto/train.csv',na_values='-1')
test_df = pd.read_csv('C:/Users/이동훈/Desktop/github/kaggle/kagglestudy/Data/porto/test.csv',na_values='-1')

In [33]:
train_features = ["ps_car_13","ps_reg_03","ps_ind_05_cat","ps_ind_03",
                  "ps_ind_15","ps_reg_02","ps_car_14","ps_car_12",
                  "ps_car_01_cat","ps_car_07_cat","ps_ind_17_bin","ps_car_03_cat",
                  "ps_reg_01","ps_car_15","ps_ind_01","ps_ind_16_bin",
                  "ps_ind_07_bin","ps_car_06_cat","ps_car_04_cat","ps_ind_06_bin",
                  "ps_car_09_cat","ps_car_02_cat","ps_ind_02_cat","ps_car_11",
                  "ps_car_05_cat","ps_calc_09","ps_calc_05","ps_ind_08_bin",
                  "ps_car_08_cat","ps_ind_09_bin","ps_ind_04_cat","ps_ind_18_bin",
                  "ps_ind_12_bin","ps_ind_14"]
combs = [('ps_reg_01','ps_car_02_cat'),
        ('ps_reg_01','ps_car_04_cat')]

In [37]:
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

start = time.time()
for n_c, (f1,f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'%(name1,n_c+1,(time.time()-start)/60),end='')
    print('\r'*75,end='')
    train_df[name1] = train_df[f1].apply(lambda x:str(x)) + '_' +train_df[f2].apply(lambda x:str(x))
    test_df[name1] = test_df[f1].apply(lambda x:str(x)) + '_' +test_df[f2].apply(lambda x:str(x))
    
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values)+list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))
    
    train_features.append(name1)
    
X= train_df[train_features]
test_df = test_df[train_features]

f_cats = [f for f in X.columns if '_cat'in f]

current feature                                 ps_reg_01_plus_ps_car_04_cat    2 in   0.1

In [38]:
y_valid_pred = 0*y
y_test_pred = 0

In [39]:
K=5
kf = KFold(n_splits=K,random_state=1,shuffle=True)
np.random.seed(0)

In [41]:
model = XGBClassifier(n_estimators=MAX_ROUNDS,
                     max_depth=4,
                     objective='binary:logistic',
                     learning_rate=LEARNING_RATE,
                     subsample=.8,
                     min_child_weight=6,
                     colsample_bytree=.8,
                     scale_pos_weight=1.6,
                     gamma=10,
                     reg_alpha=8,
                     reg_lambda=1.3)

In [42]:
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )
    
    if OPTIMIZE_ROUNDS:
        eval_set=[(X_valid,y_valid)]
        fit_model = model.fit( X_train, y_train, 
                               eval_set=eval_set,
                               eval_metric=gini_xgb,
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        fit_model = model.fit( X_train, y_train )
        
    
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    del X_test, X_train, X_valid, y_train
    
y_test_pred /= K  

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)


Fold  0


Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1m[1] During: typing of argument at <ipython-input-30-e1bbcc8b7298> (3)[0m
[1m
File "<ipython-input-30-e1bbcc8b7298>", line 3:[0m
[1mdef eval_gini(y_true, y_prob):
[1m    y_true = np.asarray(y_true)
[0m    [1m^[0m[0m
[0m
  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mcannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>[0m
[1m
File "<ipython-input-30-e1bbcc8b7298>", line 9:[0m
[1mdef eval_gini(y_true, y_prob):
    <source elided>
    n = len(y_true)
[1m    for i in range(n-1, -1, -1):
[0m    [1m^[0m[0m
[0m[0m
  @jit
[1m
File "<ipython-input-30-e1bbcc8b7298>", line 3:[0m
[1mdef eval_gini(y_true, y_prob):
[1m    y_true = np.asarray(y_true)
[0m    [1m^[0m[0m
[0m
  state.func_ir.lo

  Gini =  0.2856959531750338

Fold  1
  Gini =  0.2825270426290394

Fold  2
  Gini =  0.2744124272744569

Fold  3
  Gini =  0.29925913576337726

Fold  4
  Gini =  0.28468083823013424

Gini for full training set:


Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mnon-precise type pyobject[0m
[0m[1m[1] During: typing of argument at <ipython-input-30-e1bbcc8b7298> (3)[0m
[1m
File "<ipython-input-30-e1bbcc8b7298>", line 3:[0m
[1mdef eval_gini(y_true, y_prob):
[1m    y_true = np.asarray(y_true)
[0m    [1m^[0m[0m
[0m
  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: [1m[1mcannot determine Numba type of <class 'numba.dispatcher.LiftedLoop'>[0m
[1m
File "<ipython-input-30-e1bbcc8b7298>", line 9:[0m
[1mdef eval_gini(y_true, y_prob):
    <source elided>
    n = len(y_true)
[1m    for i in range(n-1, -1, -1):
[0m    [1m^[0m[0m
[0m[0m
  @jit
[1m
File "<ipython-input-30-e1bbcc8b7298>", line 3:[0m
[1mdef eval_gini(y_true, y_prob):
[1m    y_true = np.asarray(y_true)
[0m    [1m^[0m[0m
[0m
  state.func_ir.lo

0.28509183378958614

In [43]:
val = pd.DataFrame()
val['id'] = id_train
val['target'] = y_valid_pred.values
val.to_csv('xgb_valid.csv', float_format='%.6f', index=False)

In [44]:
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_test_pred
sub.to_csv('xgb_submit.csv', float_format='%.6f', index=False)