## Model Fitting and Prediction

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import matplotlib.pyplot as plt 
import xgboost as xgb
from sklearn.metrics import auc
from sklearn.model_selection import GridSearchCV
import pickle

In [4]:
# Load the training table from disk; the path is relative to the notebook's working directory.
TRAIN_FE_PATH = 'input/train_fe.csv'
train = pd.read_csv(TRAIN_FE_PATH)

In [5]:
# Split the training table into model inputs (X) and the binary label (y).
# 'Unnamed: 0' is dropped together with the label so it is not used as a feature.
label_column = 'is_attributed'
X = train.drop(columns=['Unnamed: 0', label_column])
y = train[label_column].astype('uint8')
# Release the full table; only X and y exist from this point on.
del train

In [17]:
# Hyper-parameter grid explored by the cross-validated search below
# (3 * 4 * 4 * 4 * 3 = 576 candidate combinations).
param_set = dict(
    n_estimators=[30, 40, 50],
    learning_rate=[0.01, 0.05, 0.1, 1],
    num_leaves=[3, 5, 7, 9],   # should stay below 2^(max_depth)
    max_depth=[4, 5, 7, 8],    # -1 would mean no limit
    max_bin=[50, 100, 200],    # number of bucketed bins for feature values
)

# Settings that stay fixed for every candidate in the grid.
fixed_lgb_settings = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='auc',
    min_split_gain=0,
    min_child_weight=5,
    min_child_samples=10,
    subsample=0.7,          # subsample ratio of the training instances
    subsample_freq=1,       # subsampling frequency; <= 0 disables it
    colsample_bytree=0.7,   # subsample ratio of columns when constructing each tree
    reg_alpha=1,
    reg_lambda=0,
    seed=410,
    nthread=4,
    silent=True,
)
estimator = lgb.LGBMClassifier(**fixed_lgb_settings)

# Exhaustive search scored by ROC AUC with cv=10; candidates are evaluated
# one at a time (n_jobs=1) while each LightGBM fit uses 4 threads.
gsearch = GridSearchCV(
    estimator,
    param_grid=param_set,
    scoring='roc_auc',
    n_jobs=1,
    iid=False,
    cv=10,
)

# fit() returns the search object itself, so lgb_model is the fitted GridSearchCV.
lgb_model = gsearch.fit(X.values, y.values)
lgb_model.best_params_, lgb_model.best_score_



([mean: 0.95685, std: 0.01548, params: {'learning_rate': 0.01, 'max_bin': 50, 'max_depth': 4, 'n_estimators': 30, 'num_leaves': 3},
  mean: 0.96505, std: 0.01327, params: {'learning_rate': 0.01, 'max_bin': 50, 'max_depth': 4, 'n_estimators': 30, 'num_leaves': 5},
  mean: 0.96856, std: 0.01129, params: {'learning_rate': 0.01, 'max_bin': 50, 'max_depth': 4, 'n_estimators': 30, 'num_leaves': 7},
  mean: 0.97043, std: 0.01097, params: {'learning_rate': 0.01, 'max_bin': 50, 'max_depth': 4, 'n_estimators': 30, 'num_leaves': 9},
  mean: 0.95752, std: 0.01592, params: {'learning_rate': 0.01, 'max_bin': 50, 'max_depth': 4, 'n_estimators': 40, 'num_leaves': 3},
  mean: 0.96521, std: 0.01369, params: {'learning_rate': 0.01, 'max_bin': 50, 'max_depth': 4, 'n_estimators': 40, 'num_leaves': 5},
  mean: 0.96912, std: 0.01116, params: {'learning_rate': 0.01, 'max_bin': 50, 'max_depth': 4, 'n_estimators': 40, 'num_leaves': 7},
  mean: 0.97098, std: 0.01076, params: {'learning_rate': 0.01, 'max_bin': 50

In [None]:
# Train a single LightGBM booster through the native (non-sklearn) API.
lgb_params = {'boosting_type': 'gbdt',
              'objective': 'binary',
              'metric':'auc',
              'learning_rate': 0.05,
              'num_leaves': 9,
              'max_depth': 7,
              'min_child_samples': 10,
              'max_bin': 100,
              'subsample': 0.7,
              'subsample_freq': 1,
              'colsample_bytree': 0.7,
              'min_child_weight': 5,
              'min_split_gain': 0,
              'reg_alpha': 1,
              'reg_lambda': 0,
              'nthread': 4,
              'verbose': 0,
             }

# `columns` is an attribute (a pandas Index), not a method, so it must not be called.
predictors = list(X.columns)
categorical_features = []
# The training frame was split into X / y earlier and `train` was deleted,
# so the Dataset is built from X (there is no `dtrain` in this notebook).
xgtrain = lgb.Dataset(X[predictors].values, label=y.values,
                      feature_name=predictors,
                      categorical_feature=categorical_features
                     )

# Filled by lgb.train with the metric history of every entry in valid_sets.
evals_results = {}

# The training set is registered as a validation set so that `evals_result`
# and `verbose_eval` have something to record / print; without valid_sets
# both are no-ops. The built-in 'auc' metric from lgb_params is used, so no
# custom `feval` is needed (none is defined in this notebook).
lightGBM_model = lgb.train(lgb_params,
                     xgtrain,
                     valid_sets=[xgtrain],
                     valid_names=['train'],
                     evals_result=evals_results,
                     verbose_eval=10)

In [19]:
# Persist the fitted grid-search object to disk.
filename = 'lgb_model.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling fails part-way through.
with open(filename, 'wb') as model_file:
    pickle.dump(lgb_model, model_file)
# some time later.. load the model from disk (only unpickle files you trust):
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)

### Use LightGBM to predict each test_fe file and merge the predictions into one CSV

In [47]:
# Score every test chunk with the tuned LightGBM model and stitch the
# per-chunk results into one array `p`, in file order.
n_test_files = 127
chunk_predictions = []   # one 1-D array per chunk; concatenated once after the loop
n_predicted = 0          # running row count, used only for the progress message
for i in range(1, n_test_files + 1):
    test_path = 'input/test_fe/test_' + str(i) + 'fe.csv'
    print('Begin to forecast for file: ' + test_path)
    test = pd.read_csv(test_path, header=0)
    test = test.drop(columns=['Unnamed: 0'])
    # The model is tuned and scored on ROC AUC, which ranks by score, so keep
    # the positive-class probability rather than the hard 0/1 label that
    # predict() would return.
    chunk_predictions.append(lgb_model.predict_proba(test.values)[:, 1])
    n_predicted += len(chunk_predictions[-1])
    print('forecast finished: ' + str(n_predicted) + '. Total Work finished: ' + str(i) + '/' + str(n_test_files))

# A single concatenate avoids re-copying the growing array on every iteration,
# which np.append inside the loop would do.
p = np.concatenate(chunk_predictions)

Begin to forecast for file: input/test_fe/test_1fe.csv


  if diff:


forecast finished: 148000. Total Work finished: 1/127
Begin to forecast for file: input/test_fe/test_2fe.csv


  if diff:


forecast finished: 296000. Total Work finished: 2/127
Begin to forecast for file: input/test_fe/test_3fe.csv


  if diff:


forecast finished: 444000. Total Work finished: 3/127
Begin to forecast for file: input/test_fe/test_4fe.csv


  if diff:


forecast finished: 592000. Total Work finished: 4/127
Begin to forecast for file: input/test_fe/test_5fe.csv


  if diff:


forecast finished: 740000. Total Work finished: 5/127
Begin to forecast for file: input/test_fe/test_6fe.csv


  if diff:


forecast finished: 888000. Total Work finished: 6/127
Begin to forecast for file: input/test_fe/test_7fe.csv


  if diff:


forecast finished: 1036000. Total Work finished: 7/127
Begin to forecast for file: input/test_fe/test_8fe.csv


  if diff:


forecast finished: 1184000. Total Work finished: 8/127
Begin to forecast for file: input/test_fe/test_9fe.csv


  if diff:


forecast finished: 1332000. Total Work finished: 9/127
Begin to forecast for file: input/test_fe/test_10fe.csv


  if diff:


forecast finished: 1480000. Total Work finished: 10/127
Begin to forecast for file: input/test_fe/test_11fe.csv


  if diff:


forecast finished: 1628000. Total Work finished: 11/127
Begin to forecast for file: input/test_fe/test_12fe.csv


  if diff:


forecast finished: 1776000. Total Work finished: 12/127
Begin to forecast for file: input/test_fe/test_13fe.csv


  if diff:


forecast finished: 1924000. Total Work finished: 13/127
Begin to forecast for file: input/test_fe/test_14fe.csv


  if diff:


forecast finished: 2072000. Total Work finished: 14/127
Begin to forecast for file: input/test_fe/test_15fe.csv


  if diff:


forecast finished: 2220000. Total Work finished: 15/127
Begin to forecast for file: input/test_fe/test_16fe.csv


  if diff:


forecast finished: 2368000. Total Work finished: 16/127
Begin to forecast for file: input/test_fe/test_17fe.csv


  if diff:


forecast finished: 2516000. Total Work finished: 17/127
Begin to forecast for file: input/test_fe/test_18fe.csv


  if diff:


forecast finished: 2664000. Total Work finished: 18/127
Begin to forecast for file: input/test_fe/test_19fe.csv


  if diff:


forecast finished: 2812000. Total Work finished: 19/127
Begin to forecast for file: input/test_fe/test_20fe.csv


  if diff:


forecast finished: 2960000. Total Work finished: 20/127
Begin to forecast for file: input/test_fe/test_21fe.csv


  if diff:


forecast finished: 3108000. Total Work finished: 21/127
Begin to forecast for file: input/test_fe/test_22fe.csv


  if diff:


forecast finished: 3256000. Total Work finished: 22/127
Begin to forecast for file: input/test_fe/test_23fe.csv


  if diff:


forecast finished: 3404000. Total Work finished: 23/127
Begin to forecast for file: input/test_fe/test_24fe.csv


  if diff:


forecast finished: 3552000. Total Work finished: 24/127
Begin to forecast for file: input/test_fe/test_25fe.csv


  if diff:


forecast finished: 3700000. Total Work finished: 25/127
Begin to forecast for file: input/test_fe/test_26fe.csv


  if diff:


forecast finished: 3848000. Total Work finished: 26/127
Begin to forecast for file: input/test_fe/test_27fe.csv


  if diff:


forecast finished: 3996000. Total Work finished: 27/127
Begin to forecast for file: input/test_fe/test_28fe.csv


  if diff:


forecast finished: 4144000. Total Work finished: 28/127
Begin to forecast for file: input/test_fe/test_29fe.csv


  if diff:


forecast finished: 4292000. Total Work finished: 29/127
Begin to forecast for file: input/test_fe/test_30fe.csv


  if diff:


forecast finished: 4440000. Total Work finished: 30/127
Begin to forecast for file: input/test_fe/test_31fe.csv


  if diff:


forecast finished: 4588000. Total Work finished: 31/127
Begin to forecast for file: input/test_fe/test_32fe.csv


  if diff:


forecast finished: 4736000. Total Work finished: 32/127
Begin to forecast for file: input/test_fe/test_33fe.csv


  if diff:


forecast finished: 4884000. Total Work finished: 33/127
Begin to forecast for file: input/test_fe/test_34fe.csv


  if diff:


forecast finished: 5032000. Total Work finished: 34/127
Begin to forecast for file: input/test_fe/test_35fe.csv


  if diff:


forecast finished: 5180000. Total Work finished: 35/127
Begin to forecast for file: input/test_fe/test_36fe.csv


  if diff:


forecast finished: 5328000. Total Work finished: 36/127
Begin to forecast for file: input/test_fe/test_37fe.csv


  if diff:


forecast finished: 5476000. Total Work finished: 37/127
Begin to forecast for file: input/test_fe/test_38fe.csv


  if diff:


forecast finished: 5624000. Total Work finished: 38/127
Begin to forecast for file: input/test_fe/test_39fe.csv


  if diff:


forecast finished: 5772000. Total Work finished: 39/127
Begin to forecast for file: input/test_fe/test_40fe.csv


  if diff:


forecast finished: 5920000. Total Work finished: 40/127
Begin to forecast for file: input/test_fe/test_41fe.csv


  if diff:


forecast finished: 6068000. Total Work finished: 41/127
Begin to forecast for file: input/test_fe/test_42fe.csv


  if diff:


forecast finished: 6216000. Total Work finished: 42/127
Begin to forecast for file: input/test_fe/test_43fe.csv


  if diff:


forecast finished: 6364000. Total Work finished: 43/127
Begin to forecast for file: input/test_fe/test_44fe.csv


  if diff:


forecast finished: 6512000. Total Work finished: 44/127
Begin to forecast for file: input/test_fe/test_45fe.csv


  if diff:


forecast finished: 6660000. Total Work finished: 45/127
Begin to forecast for file: input/test_fe/test_46fe.csv


  if diff:


forecast finished: 6808000. Total Work finished: 46/127
Begin to forecast for file: input/test_fe/test_47fe.csv


  if diff:


forecast finished: 6956000. Total Work finished: 47/127
Begin to forecast for file: input/test_fe/test_48fe.csv


  if diff:


forecast finished: 7104000. Total Work finished: 48/127
Begin to forecast for file: input/test_fe/test_49fe.csv


  if diff:


forecast finished: 7252000. Total Work finished: 49/127
Begin to forecast for file: input/test_fe/test_50fe.csv


  if diff:


forecast finished: 7400000. Total Work finished: 50/127
Begin to forecast for file: input/test_fe/test_51fe.csv


  if diff:


forecast finished: 7548000. Total Work finished: 51/127
Begin to forecast for file: input/test_fe/test_52fe.csv


  if diff:


forecast finished: 7696000. Total Work finished: 52/127
Begin to forecast for file: input/test_fe/test_53fe.csv


  if diff:


forecast finished: 7844000. Total Work finished: 53/127
Begin to forecast for file: input/test_fe/test_54fe.csv


  if diff:


forecast finished: 7992000. Total Work finished: 54/127
Begin to forecast for file: input/test_fe/test_55fe.csv


  if diff:


forecast finished: 8140000. Total Work finished: 55/127
Begin to forecast for file: input/test_fe/test_56fe.csv


  if diff:


forecast finished: 8288000. Total Work finished: 56/127
Begin to forecast for file: input/test_fe/test_57fe.csv


  if diff:


forecast finished: 8436000. Total Work finished: 57/127
Begin to forecast for file: input/test_fe/test_58fe.csv


  if diff:


forecast finished: 8584000. Total Work finished: 58/127
Begin to forecast for file: input/test_fe/test_59fe.csv


  if diff:


forecast finished: 8732000. Total Work finished: 59/127
Begin to forecast for file: input/test_fe/test_60fe.csv


  if diff:


forecast finished: 8880000. Total Work finished: 60/127
Begin to forecast for file: input/test_fe/test_61fe.csv


  if diff:


forecast finished: 9028000. Total Work finished: 61/127
Begin to forecast for file: input/test_fe/test_62fe.csv


  if diff:


forecast finished: 9176000. Total Work finished: 62/127
Begin to forecast for file: input/test_fe/test_63fe.csv


  if diff:


forecast finished: 9324000. Total Work finished: 63/127
Begin to forecast for file: input/test_fe/test_64fe.csv


  if diff:


forecast finished: 9472000. Total Work finished: 64/127
Begin to forecast for file: input/test_fe/test_65fe.csv


  if diff:


forecast finished: 9620000. Total Work finished: 65/127
Begin to forecast for file: input/test_fe/test_66fe.csv


  if diff:


forecast finished: 9768000. Total Work finished: 66/127
Begin to forecast for file: input/test_fe/test_67fe.csv


  if diff:


forecast finished: 9916000. Total Work finished: 67/127
Begin to forecast for file: input/test_fe/test_68fe.csv


  if diff:


forecast finished: 10064000. Total Work finished: 68/127
Begin to forecast for file: input/test_fe/test_69fe.csv


  if diff:


forecast finished: 10212000. Total Work finished: 69/127
Begin to forecast for file: input/test_fe/test_70fe.csv


  if diff:


forecast finished: 10360000. Total Work finished: 70/127
Begin to forecast for file: input/test_fe/test_71fe.csv


  if diff:


forecast finished: 10508000. Total Work finished: 71/127
Begin to forecast for file: input/test_fe/test_72fe.csv


  if diff:


forecast finished: 10656000. Total Work finished: 72/127
Begin to forecast for file: input/test_fe/test_73fe.csv


  if diff:


forecast finished: 10804000. Total Work finished: 73/127
Begin to forecast for file: input/test_fe/test_74fe.csv


  if diff:


forecast finished: 10952000. Total Work finished: 74/127
Begin to forecast for file: input/test_fe/test_75fe.csv


  if diff:


forecast finished: 11100000. Total Work finished: 75/127
Begin to forecast for file: input/test_fe/test_76fe.csv


  if diff:


forecast finished: 11248000. Total Work finished: 76/127
Begin to forecast for file: input/test_fe/test_77fe.csv


  if diff:


forecast finished: 11396000. Total Work finished: 77/127
Begin to forecast for file: input/test_fe/test_78fe.csv


  if diff:


forecast finished: 11544000. Total Work finished: 78/127
Begin to forecast for file: input/test_fe/test_79fe.csv


  if diff:


forecast finished: 11692000. Total Work finished: 79/127
Begin to forecast for file: input/test_fe/test_80fe.csv


  if diff:


forecast finished: 11840000. Total Work finished: 80/127
Begin to forecast for file: input/test_fe/test_81fe.csv


  if diff:


forecast finished: 11988000. Total Work finished: 81/127
Begin to forecast for file: input/test_fe/test_82fe.csv


  if diff:


forecast finished: 12136000. Total Work finished: 82/127
Begin to forecast for file: input/test_fe/test_83fe.csv


  if diff:


forecast finished: 12284000. Total Work finished: 83/127
Begin to forecast for file: input/test_fe/test_84fe.csv


  if diff:


forecast finished: 12432000. Total Work finished: 84/127
Begin to forecast for file: input/test_fe/test_85fe.csv


  if diff:


forecast finished: 12580000. Total Work finished: 85/127
Begin to forecast for file: input/test_fe/test_86fe.csv


  if diff:


forecast finished: 12728000. Total Work finished: 86/127
Begin to forecast for file: input/test_fe/test_87fe.csv


  if diff:


forecast finished: 12876000. Total Work finished: 87/127
Begin to forecast for file: input/test_fe/test_88fe.csv


  if diff:


forecast finished: 13024000. Total Work finished: 88/127
Begin to forecast for file: input/test_fe/test_89fe.csv


  if diff:


forecast finished: 13172000. Total Work finished: 89/127
Begin to forecast for file: input/test_fe/test_90fe.csv


  if diff:


forecast finished: 13320000. Total Work finished: 90/127
Begin to forecast for file: input/test_fe/test_91fe.csv


  if diff:


forecast finished: 13468000. Total Work finished: 91/127
Begin to forecast for file: input/test_fe/test_92fe.csv


  if diff:


forecast finished: 13616000. Total Work finished: 92/127
Begin to forecast for file: input/test_fe/test_93fe.csv


  if diff:


forecast finished: 13764000. Total Work finished: 93/127
Begin to forecast for file: input/test_fe/test_94fe.csv


  if diff:


forecast finished: 13912000. Total Work finished: 94/127
Begin to forecast for file: input/test_fe/test_95fe.csv


  if diff:


forecast finished: 14060000. Total Work finished: 95/127
Begin to forecast for file: input/test_fe/test_96fe.csv


  if diff:


forecast finished: 14208000. Total Work finished: 96/127
Begin to forecast for file: input/test_fe/test_97fe.csv


  if diff:


forecast finished: 14356000. Total Work finished: 97/127
Begin to forecast for file: input/test_fe/test_98fe.csv


  if diff:


forecast finished: 14504000. Total Work finished: 98/127
Begin to forecast for file: input/test_fe/test_99fe.csv


  if diff:


forecast finished: 14652000. Total Work finished: 99/127
Begin to forecast for file: input/test_fe/test_100fe.csv


  if diff:


forecast finished: 14800000. Total Work finished: 100/127
Begin to forecast for file: input/test_fe/test_101fe.csv


  if diff:


forecast finished: 14948000. Total Work finished: 101/127
Begin to forecast for file: input/test_fe/test_102fe.csv


  if diff:


forecast finished: 15096000. Total Work finished: 102/127
Begin to forecast for file: input/test_fe/test_103fe.csv


  if diff:


forecast finished: 15244000. Total Work finished: 103/127
Begin to forecast for file: input/test_fe/test_104fe.csv


  if diff:


forecast finished: 15392000. Total Work finished: 104/127
Begin to forecast for file: input/test_fe/test_105fe.csv


  if diff:


forecast finished: 15540000. Total Work finished: 105/127
Begin to forecast for file: input/test_fe/test_106fe.csv


  if diff:


forecast finished: 15688000. Total Work finished: 106/127
Begin to forecast for file: input/test_fe/test_107fe.csv


  if diff:


forecast finished: 15836000. Total Work finished: 107/127
Begin to forecast for file: input/test_fe/test_108fe.csv


  if diff:


forecast finished: 15984000. Total Work finished: 108/127
Begin to forecast for file: input/test_fe/test_109fe.csv


  if diff:


forecast finished: 16132000. Total Work finished: 109/127
Begin to forecast for file: input/test_fe/test_110fe.csv


  if diff:


forecast finished: 16280000. Total Work finished: 110/127
Begin to forecast for file: input/test_fe/test_111fe.csv


  if diff:


forecast finished: 16428000. Total Work finished: 111/127
Begin to forecast for file: input/test_fe/test_112fe.csv


  if diff:


forecast finished: 16576000. Total Work finished: 112/127
Begin to forecast for file: input/test_fe/test_113fe.csv


  if diff:


forecast finished: 16724000. Total Work finished: 113/127
Begin to forecast for file: input/test_fe/test_114fe.csv


  if diff:


forecast finished: 16872000. Total Work finished: 114/127
Begin to forecast for file: input/test_fe/test_115fe.csv


  if diff:


forecast finished: 17020000. Total Work finished: 115/127
Begin to forecast for file: input/test_fe/test_116fe.csv


  if diff:


forecast finished: 17168000. Total Work finished: 116/127
Begin to forecast for file: input/test_fe/test_117fe.csv


  if diff:


forecast finished: 17316000. Total Work finished: 117/127
Begin to forecast for file: input/test_fe/test_118fe.csv


  if diff:


forecast finished: 17464000. Total Work finished: 118/127
Begin to forecast for file: input/test_fe/test_119fe.csv


  if diff:


forecast finished: 17612000. Total Work finished: 119/127
Begin to forecast for file: input/test_fe/test_120fe.csv


  if diff:


forecast finished: 17760000. Total Work finished: 120/127
Begin to forecast for file: input/test_fe/test_121fe.csv


  if diff:


forecast finished: 17908000. Total Work finished: 121/127
Begin to forecast for file: input/test_fe/test_122fe.csv


  if diff:


forecast finished: 18056000. Total Work finished: 122/127
Begin to forecast for file: input/test_fe/test_123fe.csv


  if diff:


forecast finished: 18204000. Total Work finished: 123/127
Begin to forecast for file: input/test_fe/test_124fe.csv


  if diff:


forecast finished: 18352000. Total Work finished: 124/127
Begin to forecast for file: input/test_fe/test_125fe.csv


  if diff:


forecast finished: 18500000. Total Work finished: 125/127
Begin to forecast for file: input/test_fe/test_126fe.csv


  if diff:


forecast finished: 18648000. Total Work finished: 126/127
Begin to forecast for file: input/test_fe/test_127fe.csv
forecast finished: 18790469. Total Work finished: 127/127


  if diff:


In [51]:
# Build the submission table: click_id is simply the row position of each prediction in `p`.
sub = pd.DataFrame(
    {
        'click_id': np.arange(len(p)),
        'is_attributed': p,
    },
    columns=['click_id', 'is_attributed'],
)
# NOTE(review): this is an absolute path under the filesystem root, unlike the
# relative 'input/...' paths used for reading — confirm it is intended.
sub.to_csv('/output/ligGBM_forecast.csv', index=False)

### Using XGBoost

In [52]:
from xgboost.sklearn import XGBClassifier
# NOTE(review): sklearn.cross_validation is deprecated (removed in scikit-learn 0.20)
# and is not referenced by any cell visible in this notebook; kept only so
# existing references elsewhere, if any, keep working.
from sklearn import cross_validation, metrics
# Import GridSearchCV from model_selection, matching the import at the top of
# the notebook. The previous `sklearn.grid_search` import shadowed it with the
# deprecated implementation.
from sklearn.model_selection import GridSearchCV



In [56]:
#test_results = pd.read_csv('test_results.csv')
def modelfit(alg, X, y,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Fit an XGBoost sklearn-style classifier and print its training-set scores.

    Parameters
    ----------
    alg : estimator exposing get_xgb_params / get_params / set_params / fit /
        predict / predict_proba (an XGBClassifier in this notebook). It is
        modified in place: `n_estimators` may be reset and the model is fitted.
    X, y : objects exposing `.values` (features and labels).
    useTrainCV : when true, run `xgb.cv` first and set `n_estimators` to the
        number of rows in the returned CV table.
    cv_folds : number of folds passed to `xgb.cv`.
    early_stopping_rounds : early-stopping patience passed to `xgb.cv`.

    Returns nothing; accuracy and AUC on the training data are printed.

    NOTE(review): `xgb.cv` is called without an explicit `metrics` argument
    while the final fit uses eval_metric='auc' — confirm the CV metric is the
    intended one.
    """
    features = X.values
    labels = y.values

    if useTrainCV:
        # Let cross-validation with early stopping pick the number of boosting rounds.
        cv_history = xgb.cv(
            alg.get_xgb_params(),
            xgb.DMatrix(features, label=labels),
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            early_stopping_rounds=early_stopping_rounds,
        )
        alg.set_params(n_estimators=cv_history.shape[0])

    # Fit the model on the full training data.
    alg.fit(features, labels, eval_metric='auc')

    # Predict on the training set (hard labels and positive-class probabilities).
    predicted_labels = alg.predict(features)
    predicted_probs = alg.predict_proba(features)[:, 1]

    # Report accuracy and AUC measured on the training set itself.
    print("准确率 : %.4g" % metrics.accuracy_score(labels, predicted_labels))
    print("AUC 得分 (训练集): %f" % metrics.roc_auc_score(y, predicted_probs))

In [57]:
# Baseline XGBoost classifier. n_estimators=1000 is only an upper bound:
# modelfit() resets it from the xgb.cv result before the final fit.
xgb1_settings = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
xgb1 = XGBClassifier(**xgb1_settings)
modelfit(xgb1, X, y)

  if diff:


准确率 : 0.9558
AUC 得分 (训练集): 0.993632


In [None]:
# Coarse search for the best max_depth / min_child_weight combination.
param_test1 = dict(
    max_depth=np.arange(3, 10, 2),
    min_child_weight=np.arange(1, 6, 2),
)
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch1.fit(X, y)
gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Search max_depth / min_child_weight again over a narrower set of values.
param_test2 = dict(
    max_depth=[4, 5, 6],
    min_child_weight=[4, 5, 6],
)
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch2.fit(X, y)
gsearch2.best_params_, gsearch2.best_score_

In [None]:
# Cross-validate larger min_child_weight values with max_depth held at 4.
param_test2b = dict(
    min_child_weight=[6, 8, 10, 12],
)
gsearch2b = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=4,
        min_child_weight=2,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test2b,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch2b.fit(X, y)
gsearch2b.best_params_, gsearch2b.best_score_

In [None]:
# Grid search for a suitable gamma (0.0 to 0.4 in steps of 0.1).
param_test3 = dict(
    gamma=[i/10.0 for i in range(0,5)],
)
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=4,
        min_child_weight=6,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch3.fit(X, y)
gsearch3.best_params_, gsearch3.best_score_

In [None]:
# Grid search for the best subsample / colsample_bytree pair (0.6 to 0.9 in steps of 0.1).
param_test4 = {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
                                        min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
# Fit on X / y like the earlier searches: `train` was deleted once X and y
# were built, and `target` is not defined anywhere in this notebook.
gsearch4.fit(X, y)
gsearch4.best_params_, gsearch4.best_score_

In [None]:
# Finer search over subsample / colsample_bytree (0.75, 0.80, 0.85).
param_test5 = {
    'subsample':[i/100.0 for i in range(75,90,5)],
    'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
                                        min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
# Fit on X / y like the earlier searches: `train` was deleted once X and y
# were built, and `target` is not defined anywhere in this notebook.
gsearch5.fit(X, y)
gsearch5.best_params_, gsearch5.best_score_

In [None]:
# Grid search for a suitable reg_alpha over a wide, roughly logarithmic range.
param_test6 = {
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
                                        min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test6, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
# Fit on X / y like the earlier searches: `train` was deleted once X and y
# were built, and `target` is not defined anywhere in this notebook.
gsearch6.fit(X, y)
gsearch6.best_params_, gsearch6.best_score_

In [None]:
# Grid search reg_alpha again with a different, smaller-valued set of candidates.
param_test7 = {
    'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05]
}
gsearch7 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
                                        min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
                                        objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
                       param_grid = param_test7, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
# Fit on X / y like the earlier searches: `train` was deleted once X and y
# were built, and `target` is not defined anywhere in this notebook.
gsearch7.fit(X, y)
gsearch7.best_params_, gsearch7.best_score_

### Blend predictions from different models

In [None]:
# All credits go to original authors!
import pandas as pd

# Submission files to blend, one per model.
test_files = [
    '../input/lewis-undersampler-9562-version/pred.csv',
    '../input/weighted-app-chanel-os/subnew.csv',
    '../input/single-xgboost-lb-0-9639/xgb_sub.csv',
    '../input/lightgbm-with-count-features/sub_lgb_balanced99.csv',
]

# Read every submission into memory, in the same order as test_files.
model_test_data = []
for test_file in test_files:
    print('read ' + test_file)
    model_test_data.append(pd.read_csv(test_file, encoding='utf-8'))
n_models = len(model_test_data)

# One weight per file, matched by position; the four weights sum to 1.0.
weights = [0.10, 0.15, 0.25, 0.50]
column_name = 'is_attributed'

print('predict')
# Start from zeros and accumulate each model's weighted prediction column.
# NOTE(review): rows are combined by index, which presumably relies on every
# file listing the same click_ids in the same order — confirm.
test_predict_column = [0.] * len(model_test_data[0][column_name])
for ind, model_frame in enumerate(model_test_data):
    test_predict_column += model_frame[column_name] * weights[ind]

print('make result')
# click_id is taken from the first file and paired with the blended column.
click_ids = model_test_data[0]['click_id']
blended = pd.DataFrame({column_name: test_predict_column})
final_result = pd.concat([click_ids, blended], axis=1)
final_result.to_csv("average_result.csv", index=False)