In [13]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.model_selection import GridSearchCV,train_test_split,RandomizedSearchCV

import warnings

warnings.filterwarnings('ignore')


In [14]:
def metricas(model,Xt,Xv,yt,yv):
    print "ROC train:%.3f | ROC test:%.3f " %(roc_auc_score(y_true=yt,y_score=model.predict_proba(Xt)[:,1]),roc_auc_score(y_true=yv,y_score=model.predict_proba(Xv)[:,1]))
    print "ACC train:%.3f | ACC test:%.3f " %(accuracy_score(y_true=yt,y_pred=model.predict(Xt)),accuracy_score(y_true=yv,y_pred=model.predict(Xv)))

In [15]:
df = pickle.load(open('churn_con_woe','rb'))

In [16]:
df.head()

Unnamed: 0,V_ACCT_LGHT,C_IP,C_VMP,V_N_VM_MESS,V_T_DAY_CHG,V_T_EVE_CALLS,V_T_EVE_CHG,V_T_NIG_CALLS,V_T_NIG_CHG,V_T_INT_CALLS,...,W_V_ACCT_LGHT,W_V_N_VM_MESS,W_V_T_DAY_CHG,W_V_T_EVE_CALLS,W_V_T_EVE_CHG,W_V_T_NIG_CALLS,W_V_T_NIG_CHG,W_V_T_INT_CALLS,W_V_T_INT_CHG,W_V_CS_CALLS
0,128,No,Yes,25,45.07,99,16.78,91,11.01,3,...,0.061033,1.219037,-1.864202,0.068283,0.065696,-0.089148,-0.05179,-0.29035,0.007263,0.295956
1,156,No,No,0,29.67,116,16.78,86,10.73,2,...,0.061033,-0.147174,0.561314,-0.053201,0.065696,-0.089148,-0.05179,-0.29035,0.007263,0.295956
2,57,No,No,0,14.6,127,16.48,93,10.42,2,...,0.053118,-0.147174,0.178109,-0.053201,0.065696,-0.089148,-0.05179,-0.29035,0.007263,0.295956
3,58,No,Yes,20,33.07,89,18.16,92,10.65,2,...,0.053118,1.219037,0.561314,0.031576,0.065696,-0.089148,-0.05179,-0.29035,0.007263,0.295956
4,149,No,No,0,31.89,81,17.12,79,11.89,1,...,0.061033,-0.147174,0.561314,0.031576,0.065696,-0.089148,-0.05179,-0.29035,0.007263,0.295956


In [17]:
X = df[[v for v in df.columns if v[:2]=='V_']+[v for v in df.columns if v[:4]=='W_C_']].copy()
y = df['TARGET'].copy()

In [18]:
Xt,Xv,yt,yv = train_test_split(X,y,train_size=0.7)

In [19]:
modelos = [RandomForestClassifier(),AdaBoostClassifier(),XGBClassifier()]

In [20]:
for model in modelos:
    model.fit(Xt,yt)
    metricas(model,Xt,Xv,yt,yv)

ROC train:1.000 | ROC test:0.959 
ACC train:0.995 | ACC test:0.952 
ROC train:0.909 | ROC test:0.848 
ACC train:0.887 | ACC test:0.895 
ROC train:0.964 | ROC test:0.957 
ACC train:0.964 | ACC test:0.966 


In [21]:
param_grid = dict ( max_depth=range(2,10),
       n_estimators=range(10,250,10),
      booster = ['gbtree', 'gblinear', 'dart'],
      gamma = np.arange(0.1,1,0.1)
     )

In [22]:
grid = RandomizedSearchCV(cv=3,
                          error_score='accuracy',
                          estimator=modelos[-1],
                          n_iter=300,
                          n_jobs=-1,
                          param_distributions=param_grid,verbose=True)

In [23]:
grid.fit(Xt,yt)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  77 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done 230 tasks      | elapsed:   32.9s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 838 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  2.4min finished


RandomizedSearchCV(cv=3, error_score='accuracy',
          estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1),
          fit_params=None, iid=True, n_iter=300, n_jobs=-1,
          param_distributions={'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240], 'max_depth': [2, 3, 4, 5, 6, 7, 8, 9], 'gamma': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]), 'booster': ['gbtree', 'gblinear', 'dart']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=True)

In [24]:
metricas(grid.best_estimator_,Xt,Xv,yt,yv)

ROC train:1.000 | ROC test:0.954 
ACC train:0.999 | ACC test:0.966 


In [25]:
grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0.1, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)