# <center>Data Science Training</center>
<center><b>Grid Search and Cross-validation Template</b><br>
Leo, 2016</center>

In [None]:
from __future__ import division
from __future__ import print_function

import random

import numpy as np
import pandas as pd
from scipy import interp

import sklearn as sk
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split,KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score,accuracy_score,average_precision_score,roc_curve,auc,precision_recall_curve
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import log_loss

import xgboost as xgb

%matplotlib inline
import matplotlib.pyplot as plt

# Read files and prepare data before cross-validation

In [None]:
# Load the pipe-separated input; a literal backslash-N and "NULL" are read as missing values.
# The raw string keeps the backslash literal on Python 3, where "\N" is a syntax error.
dataset_modelA_clean = pd.read_csv("input_file_pat'", sep="|", na_values=[r"\N", "NULL"])

In [None]:
# Label column(s) and identifier column(s); both are dropped from the feature matrix later on.
ALL_TARGETS, IDS = ['target'], ['masking']

In [None]:
# The metadata file lists the modelling columns in its `targeting_features` column.
targeting_features_file = 'feature_columns_metadata_path'
# Raw string: "\N" is an invalid escape sequence (SyntaxError) on Python 3.
feature_df_targeting = pd.read_csv(targeting_features_file, na_values=[r"\N", "NULL"])

feature_columns = feature_df_targeting['targeting_features'].tolist()

print(len(feature_columns))

In [None]:
# Restrict the cleaned dataset to the listed feature columns plus the target column(s).
targeting_columns = np.concatenate([feature_columns, ALL_TARGETS])
dataset_modelA_clean_targeting = dataset_modelA_clean[targeting_columns].copy()

In [None]:
print(ALL_TARGETS)

# Report the sum and the mean of every target column.
for target in ALL_TARGETS:
    target_values = dataset_modelA_clean_targeting[target]
    print("target number %s"%target)
    print(target_values.sum())
    print("target proportion %s"%target)
    print(target_values.mean())

In [None]:
# 70% of the rows go to the training set; the remainder is kept for evaluation.
train, test = train_test_split(dataset_modelA_clean_targeting, train_size=0.7)

n_train_rows, n_train_cols = train.shape
n_test_rows, n_test_cols = test.shape
print('Train data has %i rows and %i columns'%(n_train_rows, n_train_cols))
print('Test data has %i rows and %i columns'%(n_test_rows, n_test_cols))

In [None]:
# Targets and identifiers are removed so that only model inputs remain.
COLS_TO_DROP = ALL_TARGETS + IDS

X_train, X_test = (frame.drop(COLS_TO_DROP, axis=1) for frame in (train, test))

In [None]:
# Standardise the features using statistics fitted on the training rows only.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)

# Wrap the scaled arrays back into DataFrames so row labels and column names are kept.
X_train_scaled = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

# Grid Search + CrossVal for LR

In [None]:
target = 'target'

# Keep ids and labels aside so that predictions can be attached to them afterwards.
results_train = train[['masking','target']].copy()
results_test = test[['masking','target']].copy()

# Single-point grid. Candidate values for a wider search:
# 'penalty':['l1','l2'] 'C':[1,0.1,0.01] 'class_weight':[None,'balanced']
grid_parameters = {'penalty':['l2'], 'C':[0.5], 'class_weight':[None]}

print("Start training with Grid Search")
print('Start LR training for target %s'%(target))

clf_lg_base = LogisticRegression(n_jobs=1, random_state=27, verbose=0)
clf = GridSearchCV(clf_lg_base, grid_parameters, scoring='roc_auc', cv=3, verbose=1, n_jobs=1)

y_train = np.array(train[target].astype(np.uint8))
clf.fit(X_train_scaled, y_train)

print("Best parameters set found on development set:\n")
print(clf.best_estimator_)

print("\n Grid scores on development set:")
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

# Score the held-out rows with the fitted search object.
y_test = np.array(test[target].astype(np.uint8))
y_true = y_test
y_pred = clf.predict_proba(X_test_scaled)
proba = y_pred[:,1]  # second predict_proba column
print("Scores on the evaluation dataset")
print("ROC AUC SCORE\t:\t" + str(roc_auc_score(y_true, proba)))
print("ACCURACY SCORE\t:\t" + str(accuracy_score(y_true, clf.predict(X_test_scaled))))
print("PRECISION SCORE\t:\t" + str(average_precision_score(y_true, proba)))

cols = ['proba_'+target]

# Attach the predicted probability to the id/label frame of the test rows.
proba_df = pd.DataFrame(data=proba, index=test[feature_columns].index, columns=cols)
results_test = pd.concat([results_test, proba_df], axis=1)


In [None]:
results_roc = pd.DataFrame([])
results_lift = pd.DataFrame(range(0,101), columns=['quantiles'])

# Compute the ROC curve and the area under it on the hold-out predictions
fpr, tpr, thresholds = roc_curve(results_test[target], results_test['proba_'+target])
roc_auc = auc(fpr, tpr)
# Condense the curve to one row per FPR rounded to two decimals, keeping the
# highest TPR and the lowest threshold seen for that rounded FPR.
temp = pd.DataFrame(np.vstack((fpr,tpr,thresholds))).T
temp.columns = ['fpr','tpr','thresholds']
temp['fpr'] = temp['fpr'].round(2)
temp = temp.groupby('fpr', as_index=False).agg({'tpr' : 'max', 'thresholds' : 'min'})
temp = temp[['fpr','tpr','thresholds']]
temp.loc[temp.fpr==0,'tpr'] = 0.0
temp.columns = ['fpr_%s' %(target), 'tpr_%s' %(target),'thresholds_%s' %(target)]
temp['roc_auc_%s' %(target)] = roc_auc
results_roc = pd.concat([results_roc,temp],axis=1)

# Compute the cumulative-gain ("lift") curve: percentage of all positives found
# in the top q% of rows ranked by predicted probability, for q = 0..100.
sorted_proba = np.argsort(results_test['proba_'+target].values)[::-1]
xtestshape0 = int(results_test[target].count())
y_test = results_test[target]
centile = xtestshape0//100  # whole rows per percentile; not used for the bin edges below
positives = y_test.sum()
ranked_labels = np.array(y_test)[sorted_proba]
lift = [0]
for q in range(1,101):
    # Bin edges are floor(q*n/100). Using q*floor(n/100) would push up to 99
    # remainder rows into the last percentile and yields empty bins when n < 100.
    lower = (q-1)*xtestshape0//100
    upper = q*xtestshape0//100
    tp = ranked_labels[lower:upper].sum()
    lift.append(lift[q-1]+100*tp/float(positives))
quantiles = range(0,101)
results_lift['lift_%s' %(target)] = lift
results_lift['lift_10_%s' %(target)] = lift[10]/10.

print("Model auc: %f, lift at 10: %f" %(roc_auc, lift[10]/10.))

In [None]:
# Pair every input column with the matching coefficient of the best estimator.
features = X_test.columns
feature_importances_data = [
    {'feature': feature_name, 'importance': feature_importance}
    for feature_name, feature_importance in zip(features, clf.best_estimator_.coef_.ravel())
]

feature_importances = pd.DataFrame(feature_importances_data)

In [None]:
# Rank features by absolute importance, scaled so that the strongest one equals 100.
feature_importances['abs_imp'] = feature_importances['importance'].abs()
feature_importances_sort = feature_importances.sort_values(by='abs_imp', ascending=False)
strongest_abs_imp = feature_importances_sort['abs_imp'].max()
feature_importances_sort['relative_imp'] = 100.0 * (feature_importances_sort['abs_imp'] / strongest_abs_imp)
# Reverse the order so the strongest feature gets the largest y position.
feature_importances_sort = feature_importances_sort.iloc[::-1].reset_index(drop=True)

bar_positions = feature_importances_sort.index
bar_lengths = feature_importances_sort['relative_imp']

plt.figure(figsize=(10, 20))
plt.title("Feature importances for Model")
plt.barh(bar_positions, bar_lengths,
         color='#348ABD', align="center", lw='3', edgecolor='#348ABD', alpha=0.6)
plt.yticks(bar_positions, feature_importances_sort['feature'], fontsize=12,)
plt.ylim([-1, bar_positions.max()+1])
plt.xlim([0, bar_lengths.max()*1.1])
plt.show()

In [None]:
def run_cross_validation(_df, _classifier, _features_columns):
    """Evaluate a linear classifier with repeated customer-level shuffle splits.

    For each of 3 shuffle splits (50% of the unique `masking` ids held out) the
    features are standardised on the training part, `_classifier` is refitted,
    and the validation ROC curve, cumulative-gain ("lift") curve, contact-volume
    table and coefficients are collected. Per-fold and mean ROC / gain curves
    are drawn on a matplotlib figure and shown.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain `masking`, `target` and every column of `_features_columns`.
    _classifier : estimator
        Needs `fit`, `predict_proba` and, once fitted, `coef_`.
    _features_columns : list of str
        Columns selected from `_df`. Must include `masking`, which is used to
        build the splits and is removed before fitting.

    Returns
    -------
    results_cv_targeting : pandas.DataFrame
        Validation rows of every fold: `masking`, `target`, `fold`, `prob_of_all`.
    feature_importances : pandas.DataFrame
        `feature`, `mean` and `std` of the coefficients across folds, sorted by mean.
    nb_calls_cv : pandas.DataFrame
        Fold-averaged number of top-ranked contacts needed to reach 100, 200, 500,
        1000, 2000, 3000 and all positives, with the derived rates.
    """
    target = 'target'
    prob_of = 'prob_of_all'

    # cross validation type can be changed here
    customer_ids = np.asarray(_df.masking.unique())
    ss = sk.cross_validation.ShuffleSplit(len(customer_ids), n_iter=3, test_size=.5, random_state=0)
    model_columns = np.concatenate([_features_columns, [target]], axis=0)

    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros(len(mean_fpr))
    mean_fp = range(0, 101)
    mean_tp = np.zeros(len(mean_fp))
    mean_lift = []
    quantiles = range(0, 101)

    milestones = [100, 200, 500, 1000, 2000, 3000]
    milestone_labels = ['nb_contacts_%d' % m for m in milestones] + ['nb_contacts_all']

    # Per-fold frames are collected and concatenated once after the loop.
    fold_predictions, fold_contacts, fold_importances = [], [], []

    fig = plt.figure(figsize=(6, 12))
    fig.subplots_adjust(bottom=-0.5, left=-0.5, top=0.5, right=1.5)

    print ('modeling started')

    for i, (train_index, valid_index) in enumerate(ss):
        # ShuffleSplit already yields seeded random index sets; index the ids
        # directly so that random_state=0 really makes the folds reproducible.
        train_customer_id = customer_ids[train_index]
        valid_customer_id = customer_ids[valid_index]

        train = _df.loc[_df.masking.isin(train_customer_id), model_columns].copy().reset_index(drop=True)
        valid = _df.loc[_df.masking.isin(valid_customer_id), model_columns].copy().reset_index(drop=True)

        fold_result = valid[['masking', target]].copy()
        fold_result['fold'] = i

        # modeling
        train_X = train.drop(['masking', target], axis=1)
        valid_X = valid.drop(['masking', target], axis=1)

        # Scaling statistics come from the training part of the fold only.
        scaler = preprocessing.StandardScaler().fit(train_X)
        train_X_scaled = pd.DataFrame(scaler.transform(train_X), index=train_X.index, columns=train_X.columns)
        valid_X_scaled = pd.DataFrame(scaler.transform(valid_X), index=valid_X.index, columns=valid_X.columns)

        train_Y = np.array(train[target].astype(np.uint8))
        valid_Y = np.array(valid[target].astype(np.uint8))

        probas_ = _classifier.fit(train_X_scaled, train_Y).predict_proba(valid_X_scaled)
        scores = probas_[:, 1]
        # `valid` was re-indexed 0..n-1, so positional assignment matches the rows.
        fold_result[prob_of] = scores
        fold_predictions.append(fold_result)

        ###############################################################################
        # Compute the ROC curve and the area under it
        fpr, tpr, thresholds = sk.metrics.roc_curve(valid_Y, scores)
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = sk.metrics.auc(fpr, tpr)

        plt.subplot(2, 2, 1)
        plt.plot(fpr, tpr, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        ###############################################################################
        # Cumulative gain: percentage of positives found in the top q% of ranked rows.
        n_valid = valid_X.shape[0]
        ranking = np.argsort(scores)[::-1]
        # int64 accumulation: the uint8 labels must not wrap around at 255.
        cum_hits = np.concatenate(([0], np.cumsum(valid_Y[ranking], dtype=np.int64)))
        positives = cum_hits[-1]
        # Integer percentile edges; `/` yields floats under `from __future__ import division`.
        edges = [q * n_valid // 100 for q in range(101)]
        lift = 100.0 * cum_hits[edges] / float(positives)
        mean_tp += lift
        mean_tp[0] = 0.0
        mean_lift.append(lift[10] / 10.)

        plt.subplot(2, 2, 2)
        plt.plot(quantiles, lift, label='Lift fold %d at 10 = %0.2f' % (i, lift[10] / 10.))
        print ('shuffle: %i, AUC: %f, lift at 10 percent: %f' % (i, roc_auc, lift[10] / 10.))

        ###############################################################################
        # Calculate nb contacts to make to reach each milestone of positives
        ranked = fold_result.sort_values(by=prob_of, ascending=False)
        cum_positives = np.cumsum(ranked[target].values)
        total_positives = fold_result[target].sum()
        n_rows = fold_result.shape[0]

        contacts = []
        for goal in milestones + [cum_positives.max()]:
            reached = np.flatnonzero(cum_positives == goal)
            # Positions are 0-based, so the number of contacts made is position + 1;
            # NaN marks a milestone this fold never reaches.
            contacts.append(reached[0] + 1.0 if reached.size else np.nan)

        nb_calls = pd.DataFrame({'nb_contacts': contacts}, index=milestone_labels)
        nb_calls['total_population'] = n_rows
        nb_calls['total_pos_targets'] = total_positives
        nb_calls['nb_pos_targets'] = milestones + [total_positives]
        nb_calls['pos_rate'] = nb_calls.nb_pos_targets / nb_calls.nb_contacts
        nb_calls['Percentage_of_pos_targets_found'] = nb_calls.nb_pos_targets / float(total_positives)
        nb_calls['Percentage_of_Population'] = nb_calls.nb_contacts / float(n_rows)
        nb_calls['Lift'] = nb_calls.Percentage_of_pos_targets_found / nb_calls.Percentage_of_Population
        fold_contacts.append(nb_calls)

        ###############################################################################
        # Coefficients of the model fitted on this fold
        coef_frame = pd.DataFrame([
            {'feature': feature_name, 'importance': feature_importance}
            for feature_name, feature_importance in zip(train_X.columns, _classifier.coef_.ravel())
        ])
        coef_frame['fold'] = i
        fold_importances.append(coef_frame)

    # Average the contact table over folds, one row per milestone label.
    nb_calls_cv = pd.concat(fold_contacts)
    nb_calls_cv = nb_calls_cv.reset_index().groupby('index').mean().sort_values(by='nb_pos_targets')
    results_cv_targeting = pd.concat(fold_predictions).reset_index(drop=True)

    feature_importances = pd.concat(fold_importances)
    feature_importances = feature_importances.groupby('feature')['importance'].agg(['mean', 'std'])
    feature_importances = feature_importances.sort_values(by='mean')
    feature_importances = feature_importances.reset_index()

    plt.subplot(2, 2, 1)
    mean_tpr /= len(ss)
    mean_tpr[-1] = 1.0
    mean_auc = sk.metrics.auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC', fontsize=14)
    plt.grid(True)
    plt.legend(loc="lower right")

    plt.subplot(2, 2, 2)
    mean_tp /= len(ss)
    mean_tp[-1] = 100.0
    mean_lift10 = np.mean(mean_lift)
    print('Mean AUC: %f, Mean lift at 10 percent: %f' % (mean_auc, mean_lift10))
    plt.plot(mean_fp, mean_tp, 'k--', label='Mean lift at 10 = %0.2f' % mean_lift10, lw=2)

    # Diagonal reference line (random ranking); colour given once, via `color`.
    plt.plot([0, 100], [0, 100], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-5, 105])
    plt.ylim([-5, 105])
    plt.xlabel('Percentage of population')
    plt.ylabel('Cumulative gain')
    plt.title('Lift', fontsize=14)
    plt.grid(True)
    plt.legend(loc="lower right")

    plt.show()

    return results_cv_targeting, feature_importances, nb_calls_cv

In [None]:
#Logistic regression
# Cross-validate a logistic regression with the parameter values from the single-point grid above.
classifier = LogisticRegression(penalty='l2', C=0.5, class_weight=None, random_state=27)

cv_outputs = run_cross_validation(dataset_modelA_clean_targeting, classifier, feature_columns)
results_cv_targeting, feature_importances, nb_calls = cv_outputs

In [None]:
# Rank features by the absolute value of their fold-averaged importance,
# scaled so that the strongest one equals 100.
feature_importances['abs_imp'] = feature_importances['mean'].abs()
feature_importances_sort = feature_importances.sort_values(by='abs_imp', ascending=False)
strongest_abs_imp = feature_importances_sort['abs_imp'].max()
feature_importances_sort['relative_imp'] = 100.0 * (feature_importances_sort['abs_imp'] / strongest_abs_imp)
# Reverse the order so the strongest feature gets the largest y position.
feature_importances_sort = feature_importances_sort.iloc[::-1].reset_index(drop=True)

bar_positions = feature_importances_sort.index
bar_lengths = feature_importances_sort['relative_imp']

plt.figure(figsize=(10, 20))
plt.title("Feature importances for Model")
plt.barh(bar_positions, bar_lengths,
         color='#348ABD', align="center", lw='3', edgecolor='#348ABD', alpha=0.6)
plt.yticks(bar_positions, feature_importances_sort['feature'], fontsize=12,)
plt.ylim([-1, bar_positions.max()+1])
plt.xlim([0, bar_lengths.max()*1.1])
plt.show()

# Grid Search + CrossVal for RF

In [None]:
target = 'target'

# Keep ids and labels aside so that predictions can be attached to them afterwards.
results_train = train[['masking','target']].copy()
results_test = test[['masking','target']].copy()

# Single-point grid. Other RandomForestClassifier parameters that could be searched:
#n_estimators, criterion='gini',max_depth=None, min_samples_split=2, min_samples_leaf=1,max_features='auto',class_weight=None,
grid_parameters = {'n_estimators':[200], 'min_samples_split':[5], 'min_samples_leaf':[5]}

print("Start training with Grid Search")

print('Start RF training for target %s'%(target))

clf_rf_base = RandomForestClassifier(n_jobs=4, random_state=27, verbose=0)
clf = GridSearchCV(clf_rf_base, grid_parameters, scoring='roc_auc', cv=3, verbose=1, n_jobs=4)

# The forest is fitted on the unscaled features.
y_train = np.array(train[target].astype(np.uint8))
clf.fit(X_train, y_train)

print("Best parameters set found on development set:\n")
print(clf.best_estimator_)

print("\n Grid scores on development set:")
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

# Score the held-out rows with the fitted search object.
y_test = np.array(test[target].astype(np.uint8))
y_true = y_test
y_pred = clf.predict_proba(X_test)
proba = y_pred[:,1]  # second predict_proba column
print("Scores on the evaluation dataset")
print("ROC AUC SCORE\t:\t" + str(roc_auc_score(y_true, proba)))
print("ACCURACY SCORE\t:\t" + str(accuracy_score(y_true, clf.predict(X_test))))
print("PRECISION SCORE\t:\t" + str(average_precision_score(y_true, proba)))

cols = ['proba_'+target]

# Attach the predicted probability to the id/label frame of the test rows.
proba_df = pd.DataFrame(data=proba, index=test[feature_columns].index, columns=cols)
results_test = pd.concat([results_test, proba_df], axis=1)

In [None]:
results_roc = pd.DataFrame([])
results_lift = pd.DataFrame(range(0,101), columns=['quantiles'])

# Compute the ROC curve and the area under it on the hold-out predictions
fpr, tpr, thresholds = roc_curve(results_test[target], results_test['proba_'+target])
roc_auc = auc(fpr, tpr)
# Condense the curve to one row per FPR rounded to two decimals, keeping the
# highest TPR and the lowest threshold seen for that rounded FPR.
temp = pd.DataFrame(np.vstack((fpr,tpr,thresholds))).T
temp.columns = ['fpr','tpr','thresholds']
temp['fpr'] = temp['fpr'].round(2)
temp = temp.groupby('fpr', as_index=False).agg({'tpr' : 'max', 'thresholds' : 'min'})
temp = temp[['fpr','tpr','thresholds']]
temp.loc[temp.fpr==0,'tpr'] = 0.0
temp.columns = ['fpr_%s' %(target), 'tpr_%s' %(target),'thresholds_%s' %(target)]
temp['roc_auc_%s' %(target)] = roc_auc
results_roc = pd.concat([results_roc,temp],axis=1)

# Compute the cumulative-gain ("lift") curve: percentage of all positives found
# in the top q% of rows ranked by predicted probability, for q = 0..100.
sorted_proba = np.argsort(results_test['proba_'+target].values)[::-1]
xtestshape0 = int(results_test[target].count())
y_test = results_test[target]
centile = xtestshape0//100  # whole rows per percentile; not used for the bin edges below
positives = y_test.sum()
ranked_labels = np.array(y_test)[sorted_proba]
lift = [0]
for q in range(1,101):
    # Bin edges are floor(q*n/100). Using q*floor(n/100) would push up to 99
    # remainder rows into the last percentile and yields empty bins when n < 100.
    lower = (q-1)*xtestshape0//100
    upper = q*xtestshape0//100
    tp = ranked_labels[lower:upper].sum()
    lift.append(lift[q-1]+100*tp/float(positives))
quantiles = range(0,101)
results_lift['lift_%s' %(target)] = lift
results_lift['lift_10_%s' %(target)] = lift[10]/10.

print("Model auc: %f, lift at 10: %f" %(roc_auc, lift[10]/10.))

In [None]:
# Pair every input column with the matching importance of the best estimator.
features = X_test.columns
feature_importances_data = [
    {'feature': feature_name, 'importance': feature_importance}
    for feature_name, feature_importance in zip(features, clf.best_estimator_.feature_importances_)
]

feature_importances = pd.DataFrame(feature_importances_data)

In [None]:
# Rank features by absolute importance, scaled so that the strongest one equals 100.
feature_importances['abs_imp'] = feature_importances['importance'].abs()
feature_importances_sort = feature_importances.sort_values(by='abs_imp', ascending=False)
strongest_abs_imp = feature_importances_sort['abs_imp'].max()
feature_importances_sort['relative_imp'] = 100.0 * (feature_importances_sort['abs_imp'] / strongest_abs_imp)
# Reverse the order so the strongest feature gets the largest y position.
feature_importances_sort = feature_importances_sort.iloc[::-1].reset_index(drop=True)

bar_positions = feature_importances_sort.index
bar_lengths = feature_importances_sort['relative_imp']

plt.figure(figsize=(10, 20))
plt.title("Feature importances for Model")
plt.barh(bar_positions, bar_lengths,
         color='#348ABD', align="center", lw='3', edgecolor='#348ABD', alpha=0.6)
plt.yticks(bar_positions, feature_importances_sort['feature'], fontsize=12,)
plt.ylim([-1, bar_positions.max()+1])
plt.xlim([0, bar_lengths.max()*1.1])
plt.show()

In [None]:
def run_cross_validation(_df, _classifier, _features_columns):
    """Evaluate a tree-based classifier with repeated customer-level shuffle splits.

    For each of 5 shuffle splits (30% of the unique `masking` ids held out)
    `_classifier` is refitted on the unscaled training features, and the
    validation ROC curve, cumulative-gain ("lift") curve, contact-volume table
    and feature importances are collected. Per-fold and mean ROC / gain curves
    are drawn on a matplotlib figure and shown.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain `masking`, `target` and every column of `_features_columns`.
    _classifier : estimator
        Needs `fit`, `predict_proba` and, once fitted, `feature_importances_`.
    _features_columns : list of str
        Columns selected from `_df`. Must include `masking`, which is used to
        build the splits and is removed before fitting.

    Returns
    -------
    results_cv_targeting : pandas.DataFrame
        Validation rows of every fold: `masking`, `target`, `fold`, `prob_of_all`.
    feature_importances : pandas.DataFrame
        `feature`, `mean` and `std` of the importances across folds, sorted by mean.
    nb_calls_cv : pandas.DataFrame
        Fold-averaged number of top-ranked contacts needed to reach 100, 200, 500,
        1000, 2000, 3000 and all positives, with the derived rates.
    """
    target = 'target'
    prob_of = 'prob_of_all'

    # cross validation type can be changed here
    customer_ids = np.asarray(_df.masking.unique())
    ss = sk.cross_validation.ShuffleSplit(len(customer_ids), n_iter=5, test_size=.3, random_state=0)
    model_columns = np.concatenate([_features_columns, [target]], axis=0)

    mean_fpr = np.linspace(0, 1, 100)
    mean_tpr = np.zeros(len(mean_fpr))
    mean_fp = range(0, 101)
    mean_tp = np.zeros(len(mean_fp))
    mean_lift = []
    quantiles = range(0, 101)

    milestones = [100, 200, 500, 1000, 2000, 3000]
    milestone_labels = ['nb_contacts_%d' % m for m in milestones] + ['nb_contacts_all']

    # Per-fold frames are collected and concatenated once after the loop.
    fold_predictions, fold_contacts, fold_importances = [], [], []

    fig = plt.figure(figsize=(6, 12))
    fig.subplots_adjust(bottom=-0.5, left=-0.5, top=0.5, right=1.5)

    print ('modeling started')

    for i, (train_index, valid_index) in enumerate(ss):
        # ShuffleSplit already yields seeded random index sets; index the ids
        # directly so that random_state=0 really makes the folds reproducible.
        train_customer_id = customer_ids[train_index]
        valid_customer_id = customer_ids[valid_index]

        train = _df.loc[_df.masking.isin(train_customer_id), model_columns].copy().reset_index(drop=True)
        valid = _df.loc[_df.masking.isin(valid_customer_id), model_columns].copy().reset_index(drop=True)

        fold_result = valid[['masking', target]].copy()
        fold_result['fold'] = i

        # modeling
        train_X = train.drop(['masking', target], axis=1)
        valid_X = valid.drop(['masking', target], axis=1)

        train_Y = np.array(train[target].astype(np.uint8))
        valid_Y = np.array(valid[target].astype(np.uint8))

        probas_ = _classifier.fit(train_X, train_Y).predict_proba(valid_X)
        scores = probas_[:, 1]
        # `valid` was re-indexed 0..n-1, so positional assignment matches the rows.
        fold_result[prob_of] = scores
        fold_predictions.append(fold_result)

        ###############################################################################
        # Compute the ROC curve and the area under it
        fpr, tpr, thresholds = sk.metrics.roc_curve(valid_Y, scores)
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = sk.metrics.auc(fpr, tpr)

        plt.subplot(2, 2, 1)
        plt.plot(fpr, tpr, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        ###############################################################################
        # Cumulative gain: percentage of positives found in the top q% of ranked rows.
        n_valid = valid_X.shape[0]
        ranking = np.argsort(scores)[::-1]
        # int64 accumulation: the uint8 labels must not wrap around at 255.
        cum_hits = np.concatenate(([0], np.cumsum(valid_Y[ranking], dtype=np.int64)))
        positives = cum_hits[-1]
        # Integer percentile edges; `/` yields floats under `from __future__ import division`.
        edges = [q * n_valid // 100 for q in range(101)]
        lift = 100.0 * cum_hits[edges] / float(positives)
        mean_tp += lift
        mean_tp[0] = 0.0
        mean_lift.append(lift[10] / 10.)

        plt.subplot(2, 2, 2)
        plt.plot(quantiles, lift, label='Lift fold %d at 10 = %0.2f' % (i, lift[10] / 10.))
        print ('shuffle: %i, AUC: %f, lift at 10 percent: %f' % (i, roc_auc, lift[10] / 10.))

        ###############################################################################
        # Calculate nb contacts to make to reach each milestone of positives
        ranked = fold_result.sort_values(by=prob_of, ascending=False)
        cum_positives = np.cumsum(ranked[target].values)
        total_positives = fold_result[target].sum()
        n_rows = fold_result.shape[0]

        contacts = []
        for goal in milestones + [cum_positives.max()]:
            reached = np.flatnonzero(cum_positives == goal)
            # Positions are 0-based, so the number of contacts made is position + 1;
            # NaN marks a milestone this fold never reaches.
            contacts.append(reached[0] + 1.0 if reached.size else np.nan)

        nb_calls = pd.DataFrame({'nb_contacts': contacts}, index=milestone_labels)
        nb_calls['total_population'] = n_rows
        nb_calls['total_pos_targets'] = total_positives
        nb_calls['nb_pos_targets'] = milestones + [total_positives]
        nb_calls['pos_rate'] = nb_calls.nb_pos_targets / nb_calls.nb_contacts
        nb_calls['Percentage_of_pos_targets_found'] = nb_calls.nb_pos_targets / float(total_positives)
        nb_calls['Percentage_of_Population'] = nb_calls.nb_contacts / float(n_rows)
        nb_calls['Lift'] = nb_calls.Percentage_of_pos_targets_found / nb_calls.Percentage_of_Population
        fold_contacts.append(nb_calls)

        ###############################################################################
        # Feature importances of the model fitted on this fold
        importance_frame = pd.DataFrame([
            {'feature': feature_name, 'importance': feature_importance}
            for feature_name, feature_importance in zip(train_X.columns, _classifier.feature_importances_)
        ])
        importance_frame['fold'] = i
        fold_importances.append(importance_frame)

    # Average the contact table over folds, one row per milestone label.
    nb_calls_cv = pd.concat(fold_contacts)
    nb_calls_cv = nb_calls_cv.reset_index().groupby('index').mean().sort_values(by='nb_pos_targets')
    results_cv_targeting = pd.concat(fold_predictions).reset_index(drop=True)

    feature_importances = pd.concat(fold_importances)
    feature_importances = feature_importances.groupby('feature')['importance'].agg(['mean', 'std'])
    feature_importances = feature_importances.sort_values(by='mean')
    feature_importances = feature_importances.reset_index()

    plt.subplot(2, 2, 1)
    mean_tpr /= len(ss)
    mean_tpr[-1] = 1.0
    mean_auc = sk.metrics.auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC', fontsize=14)
    plt.grid(True)
    plt.legend(loc="lower right")

    plt.subplot(2, 2, 2)
    mean_tp /= len(ss)
    mean_tp[-1] = 100.0
    mean_lift10 = np.mean(mean_lift)
    print('Mean AUC: %f, Mean lift at 10 percent: %f' % (mean_auc, mean_lift10))
    plt.plot(mean_fp, mean_tp, 'k--', label='Mean lift at 10 = %0.2f' % mean_lift10, lw=2)

    # Diagonal reference line (random ranking); colour given once, via `color`.
    plt.plot([0, 100], [0, 100], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-5, 105])
    plt.ylim([-5, 105])
    plt.xlabel('Percentage of population')
    plt.ylabel('Cumulative gain')
    plt.title('Lift', fontsize=14)
    plt.grid(True)
    plt.legend(loc="lower right")

    plt.show()

    return results_cv_targeting, feature_importances, nb_calls_cv

In [None]:
#Random Forest
# Cross-validate a random forest with the parameter values from the single-point grid above.
classifier = RandomForestClassifier(n_estimators=200, n_jobs=4, random_state=27,
                                    min_samples_split=5, min_samples_leaf=5)

cv_outputs = run_cross_validation(dataset_modelA_clean_targeting, classifier, feature_columns)
results_cv_targeting, feature_importances, nb_calls = cv_outputs

In [None]:
# Rank features by the absolute value of their fold-averaged importance,
# scaled so that the strongest one equals 100.
feature_importances['abs_imp'] = feature_importances['mean'].abs()
feature_importances_sort = feature_importances.sort_values(by='abs_imp', ascending=False)
strongest_abs_imp = feature_importances_sort['abs_imp'].max()
feature_importances_sort['relative_imp'] = 100.0 * (feature_importances_sort['abs_imp'] / strongest_abs_imp)
# Reverse the order so the strongest feature gets the largest y position.
feature_importances_sort = feature_importances_sort.iloc[::-1].reset_index(drop=True)

bar_positions = feature_importances_sort.index
bar_lengths = feature_importances_sort['relative_imp']

plt.figure(figsize=(10, 20))
plt.title("Feature importances for Model")
plt.barh(bar_positions, bar_lengths,
         color='#348ABD', align="center", lw='3', edgecolor='#348ABD', alpha=0.6)
plt.yticks(bar_positions, feature_importances_sort['feature'], fontsize=12,)
plt.ylim([-1, bar_positions.max()+1])
plt.xlim([0, bar_lengths.max()*1.1])
plt.show()

# Grid Search + CrossVal for XGBOOST

In [None]:
# Keep ids and labels aside so that predictions can be attached to them afterwards.
results_test = test[['masking','target']].copy()
results_train = train[['masking','target']].copy()
target = 'target'

# Single-point grid. Each key appears once: the original literal listed
# 'max_delta_step' twice, and a repeated dict key silently overrides the first.
# 'scale_pos_weight':[1,n] n neg/pos
grid_parameters = {'max_depth': [6],'n_estimators': [200],'learning_rate':[0.05],'max_delta_step':[1],
                   'min_child_weight':[25],'gamma':[0.1],'scale_pos_weight':[1],
                   'colsample_bytree':[0.85],'subsample':[0.85],'colsample_bylevel':[0.85]}

print("Start training with Grid Search")

print('Start Xgboost training for target %s'%(target))

clf_xgb_base = xgb.XGBClassifier(nthread=10, seed=27)
clf = GridSearchCV(clf_xgb_base,grid_parameters,scoring='roc_auc',cv=3,verbose=1,n_jobs=4)

# The booster is fitted on the unscaled features.
y_train = np.array(train[target].astype(np.uint8))
clf.fit(X_train, y_train) #eval_metric='logloss', here is for early_stop if specify

print("Best parameters set found on development set:\n")
print(clf.best_estimator_)

print("Best Scores")
print(clf.best_score_)

print("\n Grid scores on development set:")
for params, mean_score, scores in clf.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() / 2, params))

# Score the held-out rows with the fitted search object.
y_test = np.array(test[target].astype(np.uint8))
y_true, y_pred = y_test, clf.predict_proba(X_test)
print("Scores on the evaluation dataset")
print("ROC AUC SCORE\t:\t" + str(roc_auc_score(y_true, y_pred[:,1])))
print("ACCURACY SCORE\t:\t" + str(accuracy_score(y_true, clf.predict(X_test))))
print("PRECISION SCORE\t:\t" + str(average_precision_score(y_true, y_pred[:,1])))

proba = y_pred[:,1]  # second predict_proba column

cols = ['proba_'+target]

# Attach the predicted probability to the id/label frame of the test rows.
proba_df = pd.DataFrame(data=proba,index=test.index,columns=cols)
results_test = pd.concat([results_test,proba_df],axis=1)

In [None]:
results_roc = pd.DataFrame([])
results_lift = pd.DataFrame(range(0,101), columns=['quantiles'])

# Compute the ROC curve and the area under it on the hold-out predictions
fpr, tpr, thresholds = roc_curve(results_test[target], results_test['proba_'+target])
roc_auc = auc(fpr, tpr)
# Condense the curve to one row per FPR rounded to two decimals, keeping the
# highest TPR and the lowest threshold seen for that rounded FPR.
temp = pd.DataFrame(np.vstack((fpr,tpr,thresholds))).T
temp.columns = ['fpr','tpr','thresholds']
temp['fpr'] = temp['fpr'].round(2)
temp = temp.groupby('fpr', as_index=False).agg({'tpr' : 'max', 'thresholds' : 'min'})
temp = temp[['fpr','tpr','thresholds']]
temp.loc[temp.fpr==0,'tpr'] = 0.0
temp.columns = ['fpr_%s' %(target), 'tpr_%s' %(target),'thresholds_%s' %(target)]
temp['roc_auc_%s' %(target)] = roc_auc
results_roc = pd.concat([results_roc,temp],axis=1)

# Compute the cumulative-gain ("lift") curve: percentage of all positives found
# in the top q% of rows ranked by predicted probability, for q = 0..100.
sorted_proba = np.argsort(results_test['proba_'+target].values)[::-1]
xtestshape0 = int(results_test[target].count())
y_test = results_test[target]
centile = xtestshape0//100  # whole rows per percentile; not used for the bin edges below
positives = y_test.sum()
ranked_labels = np.array(y_test)[sorted_proba]
lift = [0]
for q in range(1,101):
    # Bin edges are floor(q*n/100). Using q*floor(n/100) would push up to 99
    # remainder rows into the last percentile and yields empty bins when n < 100.
    lower = (q-1)*xtestshape0//100
    upper = q*xtestshape0//100
    tp = ranked_labels[lower:upper].sum()
    lift.append(lift[q-1]+100*tp/float(positives))
quantiles = range(0,101)
results_lift['lift_%s' %(target)] = lift
results_lift['lift_10_%s' %(target)] = lift[10]/10.

print("Model auc: %f, lift at 10: %f" %(roc_auc, lift[10]/10.))

In [None]:
def get_importance(_bst, _importance_type):
    """Derive per-feature importance by parsing the text dump of a booster.

    ``_bst`` only needs a ``get_dump(fmap, with_stats=...)`` method returning
    one string per tree, with one node per line. Split nodes are the lines
    containing ``[feature<threshold]``; lines without ``[`` are skipped.

    With ``_importance_type == 'weight'`` the result maps each feature name to
    the number of split nodes that use it. For any other value the statistic
    written as ``<_importance_type>=<number>`` after the closing bracket
    (e.g. ``gain=`` or ``cover=``) is averaged over the feature's split nodes.
    """
    if _importance_type == 'weight':
        # Counting splits needs no statistics, so ask for the cheaper dump.
        split_counts = {}
        for tree_dump in _bst.get_dump('', with_stats=False):
            for node_line in tree_dump.split('\n'):
                pieces = node_line.split('[')
                if len(pieces) == 1:
                    # no opening bracket: not a split node
                    continue
                # feature name is the text between '[' and '<'
                feature = pieces[1].split(']')[0].split('<')[0]
                split_counts[feature] = split_counts.get(feature, 0) + 1
        return split_counts

    stat_marker = _importance_type + '='
    split_counts = {}
    stat_totals = {}
    for tree_dump in _bst.get_dump('', with_stats=True):
        for node_line in tree_dump.split('\n'):
            pieces = node_line.split('[')
            if len(pieces) == 1:
                # no opening bracket: not a split node
                continue

            bracket_parts = pieces[1].split(']')
            # the statistic value runs from the marker up to the next comma
            value = float(bracket_parts[1].split(stat_marker)[1].split(',')[0])
            feature = bracket_parts[0].split('<')[0]

            if feature in split_counts:
                split_counts[feature] += 1
                stat_totals[feature] += value
            else:
                split_counts[feature] = 1
                stat_totals[feature] = value

    # turn the per-feature totals into per-split averages
    for feature in stat_totals:
        stat_totals[feature] = stat_totals[feature] / split_counts[feature]

    return stat_totals

In [None]:
# Collect the average gain of every feature used by the best estimator of the
# grid search into a two-column frame (feature, importance).
features = X_test.columns
# .items() works on Python 2 and 3, unlike .iteritems()
feature_importances_data = [
    {'feature': feature_name, 'importance': feature_importance}
    for feature_name, feature_importance
    in get_importance(clf.best_estimator_.booster(), 'gain').items()
]

# Explicit columns keep the frame usable even when no feature was split on.
feature_importances = pd.DataFrame(feature_importances_data, columns=['feature', 'importance'])

In [None]:
# Rank features by the magnitude of their importance and rescale so that the
# strongest feature scores 100.
feature_importances['abs_imp'] = feature_importances['importance'].abs()
feature_importances_sort = feature_importances.sort_values(by='abs_imp', ascending=False)
feature_importances_sort['relative_imp'] = 100.0 * (
    feature_importances_sort['abs_imp'] / feature_importances_sort['abs_imp'].max())
# Reverse the order so the strongest feature ends up with the highest index.
feature_importances_sort = feature_importances_sort[::-1].reset_index(drop=True)

In [None]:
# Horizontal bar chart of the relative importances, one bar per feature,
# positioned by the row index of feature_importances_sort.
plt.figure(figsize=(10, 20))
plt.title("Feature importances for Model")
# line width is numeric; it was previously passed as the string '3'
plt.barh(feature_importances_sort.index, feature_importances_sort['relative_imp'],
         color='#348ABD', align="center", lw=3, edgecolor='#348ABD', alpha=0.6)
plt.yticks(feature_importances_sort.index, feature_importances_sort['feature'], fontsize=12)
plt.ylim([-1, feature_importances_sort.index.max() + 1])
plt.xlim([0, feature_importances_sort['relative_imp'].max() * 1.1])
plt.show()

In [None]:
def run_cross_validation(_df, _classifier, _features_columns):
    """Cross-validate ``_classifier`` on shuffle splits of the ``masking`` ids and plot ROC / gain curves.

    The unique values of ``_df.masking`` are split five times (30% of the ids
    held out), so every row of a given id falls on the same side of a split.
    For each split the classifier is refitted with early stopping, then the
    ROC curve, the cumulative-gain ("lift") curve, the number of contacts
    needed to reach fixed counts of positive targets, and the gain-based
    feature importances are computed. Both curves are drawn per fold together
    with their mean, and the figure is shown.

    Parameters
    ----------
    _df : pandas.DataFrame
        Needs a ``masking`` column, a ``target`` column and every column of
        ``_features_columns``.
    _classifier : estimator
        Its ``fit`` must accept ``eval_metric``, ``eval_set`` and
        ``early_stopping_rounds``; ``predict_proba`` and ``booster()`` are
        also called.
    _features_columns : sequence of str
        Columns selected for modelling. ``masking`` must be among them: it is
        read from the selected frame and dropped before fitting.

    Returns
    -------
    tuple
        ``(results_cv_targeting, feature_importances, nb_calls_cv)``:
        per-row validation scores with their fold, mean/std of the importance
        per feature across folds, and the contact table averaged across folds.
    """
    # cross validation type can be changed here
    customer_ids = _df.masking.unique()
    ss = sk.cross_validation.ShuffleSplit(len(customer_ids), n_iter=5, test_size=.3, random_state=0)
    target = 'target'
    prob_of = 'prob_of_all'
    model_columns = np.concatenate([_features_columns, [target]], axis=0)
    pos_target_levels = [100, 200, 500, 1000, 2000, 3000]

    results_cv_targeting = pd.DataFrame([], columns=['masking', target, 'fold', prob_of])

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    mean_lift = []
    mean_tp = 0.0
    mean_fp = range(0, 101)

    nb_calls_cv = pd.DataFrame([], columns=['nb_contacts', 'total_population', 'total_pos_targets', 'nb_pos_targets', 'pos_rate',
                                            'Percentage_of_pos_targets_found', 'Percentage_of_Population', 'Lift'])
    feature_importances = pd.DataFrame([], columns=['feature', 'importance', 'fold'])

    fig = plt.figure(figsize=(6, 12))
    fig.subplots_adjust(bottom=-0.5, left=-0.5, top=0.5, right=1.5)

    print('modeling started')

    for i, (train_index, valid_index) in enumerate(ss):

        # NOTE: the ids are re-shuffled with the unseeded `random` module, so
        # the folds differ between runs even though `ss` has a fixed random_state.
        shuffled_customer_id = np.array(sorted(customer_ids, key=lambda k: random.random()))
        train_customer_id = shuffled_customer_id[train_index]
        valid_customer_id = shuffled_customer_id[valid_index]

        train = _df.loc[_df.masking.isin(train_customer_id), model_columns].copy().reset_index(drop=True)
        valid = _df.loc[_df.masking.isin(valid_customer_id), model_columns].copy().reset_index(drop=True)

        fold_results = valid[['masking', target]].copy()
        fold_results['fold'] = i

        # modeling#
        train_X = train.drop(['masking', target], axis=1)
        valid_X = valid.drop(['masking', target], axis=1)

        train_Y = np.array(train[target].astype(np.uint8))
        valid_Y = np.array(valid[target].astype(np.uint8))

        # NOTE: the validation rows drive early stopping and are also the rows scored below.
        probas_ = _classifier.fit(train_X, train_Y, eval_metric='auc', eval_set=[(valid_X, valid_Y)],
                                  early_stopping_rounds=40).predict_proba(valid_X)
        probabilities = pd.DataFrame(data=probas_[:, 1], index=valid_X.index, columns=[prob_of])

        fold_results = fold_results.join(probabilities, how='left')
        results_cv_targeting = results_cv_targeting.append(fold_results)

        ###############################################################################
        # Compute the ROC curve and the area under the curve
        fpr, tpr, thresholds = sk.metrics.roc_curve(valid_Y, probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = sk.metrics.auc(fpr, tpr)

        plt.subplot(2, 2, 1)
        plt.plot(fpr, tpr, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        ###############################################################################
        # compute lift at 10%#
        sorted_proba = np.argsort(probas_[:, 1])[::-1]
        n_valid = valid_X.shape[0]
        positives = valid_Y.sum()
        lift = [0]
        for q in range(1, 101):
            # Floor division keeps the slice bounds integral under
            # `from __future__ import division`; at q == 100 the upper bound
            # is exactly n_valid.
            start = (q - 1) * n_valid // 100
            stop = q * n_valid // 100
            tp = valid_Y[sorted_proba[start:stop]].sum()
            lift.append(lift[q - 1] + 100 * tp / float(positives))
        quantiles = range(0, 101)
        mean_tp += interp(mean_fp, mean_fp, lift)
        mean_tp[0] = 0.0
        mean_lift.append(lift[10] / 10.)

        plt.subplot(2, 2, 2)
        plt.plot(quantiles, lift, label='Lift fold %d at 10 = %0.2f' % (i, lift[10] / 10.))
        print('shuffle: %i, AUC: %f, lift at 10 percent: %f' % (i, roc_auc, lift[10] / 10.))

        ###############################################################################
        # Calculate nb contacts to make
        nb_calls = fold_results[['target', 'prob_of_all', 'fold']].copy()
        nb_calls = nb_calls.sort_values(by='prob_of_all', ascending=False).reset_index(drop=True)
        nb_calls['cum_Xsellers'] = np.cumsum(nb_calls.target)
        nb_calls = nb_calls.reset_index(drop=False)
        nb_calls = nb_calls.rename(columns={'index': 'rank'})
        # The reset index is 0-based; the row at position r is the (r + 1)-th contact.
        nb_calls['rank'] = nb_calls['rank'] + 1
        contact_columns = []
        for level in pos_target_levels:
            column = 'nb_contacts_%d' % level
            # first contact at which the cumulative positives reach `level` (NaN if never reached)
            nb_calls[column] = nb_calls.loc[nb_calls.cum_Xsellers == level, 'rank'].min()
            contact_columns.append(column)
        nb_calls['nb_contacts_all'] = nb_calls.loc[nb_calls.cum_Xsellers == nb_calls.cum_Xsellers.max(), 'rank'].min()
        contact_columns.append('nb_contacts_all')
        nb_calls = nb_calls[contact_columns].min()
        nb_calls = pd.DataFrame(nb_calls, columns=['nb_contacts'])
        nb_calls['total_population'] = fold_results.shape[0]
        nb_calls['total_pos_targets'] = fold_results.target.sum()
        nb_calls['nb_pos_targets'] = pos_target_levels + [fold_results.target.sum()]
        nb_calls['pos_rate'] = nb_calls.nb_pos_targets / nb_calls.nb_contacts
        nb_calls['Percentage_of_pos_targets_found'] = nb_calls.nb_pos_targets / nb_calls.total_pos_targets
        nb_calls['Percentage_of_Population'] = nb_calls.nb_contacts / nb_calls.total_population
        nb_calls['Lift'] = nb_calls.Percentage_of_pos_targets_found / nb_calls.Percentage_of_Population

        nb_calls_cv = nb_calls_cv.append(nb_calls)

        ###############################################################################
        # gain-based feature importances of the model fitted on this fold
        feature_importances_data = [
            {'feature': feature_name, 'importance': feature_importance}
            for feature_name, feature_importance
            in get_importance(_classifier.booster(), 'gain').items()
        ]

        fold_importances = pd.DataFrame(feature_importances_data, columns=['feature', 'importance'])
        fold_importances['fold'] = i
        feature_importances = feature_importances.append(fold_importances)

    # average the contact table over the folds, one row per nb_contacts_* label
    nb_calls_cv = nb_calls_cv.reset_index().groupby('index').mean().sort_values(by='nb_pos_targets')
    results_cv_targeting = results_cv_targeting.reset_index(drop=True)

    feature_importances = feature_importances.groupby('feature')['importance'].agg([np.mean, np.std])
    feature_importances = feature_importances.sort_values(by='mean')
    feature_importances = feature_importances.reset_index()

    plt.subplot(2, 2, 1)
    mean_tpr /= len(ss)
    mean_tpr[-1] = 1.0
    mean_auc = sk.metrics.auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC', fontsize=14)
    plt.grid(True)
    plt.legend(loc="lower right")

    plt.subplot(2, 2, 2)
    mean_tp /= len(ss)
    mean_tp[-1] = 100.0
    mean_lift10 = np.mean(mean_lift)
    print('Mean AUC: %f, Mean lift at 10 percent: %f' % (mean_auc, mean_lift10))
    plt.plot(mean_fp, mean_tp, 'k--', label='Mean lift at 10 = %0.2f' % mean_lift10, lw=2)

    # grey diagonal; the colour is given once, through `color`
    plt.plot([0, 100], [0, 100], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-5, 105])
    plt.ylim([-5, 105])
    plt.xlabel('Percentage of population')
    plt.ylabel('Cumulative gain')
    plt.title('Lift', fontsize=14)
    plt.grid(True)
    plt.legend(loc="lower right")

    plt.show()

    return results_cv_targeting, feature_importances, nb_calls_cv

In [None]:
# parameters of the classifier need to be changed according to datasets
# Hyper-parameters of the classifier; they need to be changed according to datasets.
xgb_params = dict(
    objective='binary:logistic',
    max_depth=6,
    n_estimators=300,
    learning_rate=0.05,
    max_delta_step=1,
    min_child_weight=25,
    gamma=0.1,
    scale_pos_weight=1,
    colsample_bytree=0.85,
    subsample=0.85,
    colsample_bylevel=0.85,
    nthread=10,
    seed=27,
)
classifier = xgb.XGBClassifier(**xgb_params)

# Cross-validate on the targeting dataset and keep the three result tables.
cv_outputs = run_cross_validation(dataset_modelA_clean_targeting, classifier, feature_columns)
results_cv_targeting, feature_importances, nb_calls_cv = cv_outputs

In [None]:
# Rank features by the magnitude of their mean cross-validated importance and
# rescale so that the strongest feature scores 100.
feature_importances['abs_imp'] = feature_importances['mean'].abs()
feature_importances_sort = feature_importances.sort_values(by='abs_imp', ascending=False)
feature_importances_sort['relative_imp'] = 100.0 * (
    feature_importances_sort['abs_imp'] / feature_importances_sort['abs_imp'].max())
# Reverse the order so the strongest feature ends up with the highest index.
feature_importances_sort = feature_importances_sort[::-1].reset_index(drop=True)

# Horizontal bar chart, one bar per feature, positioned by row index.
plt.figure(figsize=(10, 20))
plt.title("Feature importances for Model")
# line width is numeric; it was previously passed as the string '3'
plt.barh(feature_importances_sort.index, feature_importances_sort['relative_imp'],
         color='#348ABD', align="center", lw=3, edgecolor='#348ABD', alpha=0.6)
plt.yticks(feature_importances_sort.index, feature_importances_sort['feature'], fontsize=12)
plt.ylim([-1, feature_importances_sort.index.max() + 1])
plt.xlim([0, feature_importances_sort['relative_imp'].max() * 1.1])
plt.show()

# Optional: Choose thresholds

In [None]:
def run_cross_validation(_df, _classifier, _features_columns):
    """Cross-validate ``_classifier`` with a shuffled 3-fold split of the rows and plot ROC / gain curves.

    Rows of ``_df`` are split by position into three shuffled folds. For each
    fold the classifier is refitted on the other rows, then the ROC curve, the
    cumulative-gain ("lift") curve, the number of contacts needed to reach
    fixed counts of positive targets, and the gain-based feature importances
    are computed. Both curves are drawn per fold together with their mean, and
    the figure is shown.

    Parameters
    ----------
    _df : pandas.DataFrame
        Needs a ``target`` column and every column of ``_features_columns``.
    _classifier : estimator
        ``fit``, ``predict_proba`` and ``booster()`` are called on it.
    _features_columns : sequence of str
        Columns selected for modelling. ``masking`` must be among them: it is
        read from the selected frame and dropped before fitting.

    Returns
    -------
    tuple
        ``(results_cv_targeting, feature_importances, nb_calls_cv)``:
        per-row validation scores with their fold, mean/std of the importance
        per feature across folds, and the contact table averaged across folds.
    """
    # cross validation type can be changed here
    kf = KFold(len(_df), n_folds=3, shuffle=True)
    target = 'target'
    prob_of = 'prob_of_all'
    pos_target_levels = [100, 200, 500, 1000, 2000, 3000]

    results_cv_targeting = pd.DataFrame([], columns=['masking', target, 'fold', prob_of])

    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)
    mean_lift = []
    mean_tp = 0.0
    mean_fp = range(0, 101)

    nb_calls_cv = pd.DataFrame([], columns=['nb_contacts', 'total_population', 'total_pos_targets', 'nb_pos_targets', 'pos_rate',
                                            'Percentage_of_pos_targets_found', 'Percentage_of_Population', 'Lift'])
    feature_importances = pd.DataFrame([], columns=['feature', 'importance', 'fold'])

    fig = plt.figure(figsize=(6, 12))
    fig.subplots_adjust(bottom=-0.5, left=-0.5, top=0.5, right=1.5)

    X = _df[np.concatenate([_features_columns, [target]])]
    y = _df[target]

    print('modeling started')

    for i, (train_index, valid_index) in enumerate(kf):

        train_X, valid_X = X.iloc[train_index], X.iloc[valid_index]
        train_Y, valid_Y = y.iloc[train_index], y.iloc[valid_index]

        fold_results = valid_X[['masking', target]].copy()
        fold_results['fold'] = i

        # modeling#
        train_X = train_X.drop(['masking', target], axis=1)
        valid_X = valid_X.drop(['masking', target], axis=1)

        train_Y = np.array(train_Y.astype(np.uint8))
        valid_Y = np.array(valid_Y.astype(np.uint8))

        probas_ = _classifier.fit(train_X, train_Y).predict_proba(valid_X)
        probabilities = pd.DataFrame(data=probas_[:, 1], index=valid_X.index, columns=[prob_of])

        fold_results = fold_results.join(probabilities, how='left')
        results_cv_targeting = results_cv_targeting.append(fold_results)

        ###############################################################################
        # Compute the ROC curve and the area under the curve
        fpr, tpr, thresholds = sk.metrics.roc_curve(valid_Y, probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = sk.metrics.auc(fpr, tpr)

        plt.subplot(2, 2, 1)
        plt.plot(fpr, tpr, label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

        ###############################################################################
        # compute lift at 10%#
        sorted_proba = np.argsort(probas_[:, 1])[::-1]
        n_valid = valid_X.shape[0]
        positives = valid_Y.sum()
        lift = [0]
        for q in range(1, 101):
            # Floor division keeps the slice bounds integral under
            # `from __future__ import division`; at q == 100 the upper bound
            # is exactly n_valid.
            start = (q - 1) * n_valid // 100
            stop = q * n_valid // 100
            tp = valid_Y[sorted_proba[start:stop]].sum()
            lift.append(lift[q - 1] + 100 * tp / float(positives))
        quantiles = range(0, 101)
        mean_tp += interp(mean_fp, mean_fp, lift)
        mean_tp[0] = 0.0
        mean_lift.append(lift[10] / 10.)

        plt.subplot(2, 2, 2)
        plt.plot(quantiles, lift, label='Lift fold %d at 10 = %0.2f' % (i, lift[10] / 10.))
        print('shuffle: %i, AUC: %f, lift at 10 percent: %f' % (i, roc_auc, lift[10] / 10.))

        ###############################################################################
        # Calculate nb contacts to make
        nb_calls = fold_results[['target', 'prob_of_all', 'fold']].copy()
        nb_calls = nb_calls.sort_values(by='prob_of_all', ascending=False).reset_index(drop=True)
        nb_calls['cum_Xsellers'] = np.cumsum(nb_calls.target)
        nb_calls = nb_calls.reset_index(drop=False)
        nb_calls = nb_calls.rename(columns={'index': 'rank'})
        # The reset index is 0-based; the row at position r is the (r + 1)-th contact.
        nb_calls['rank'] = nb_calls['rank'] + 1
        contact_columns = []
        for level in pos_target_levels:
            column = 'nb_contacts_%d' % level
            # first contact at which the cumulative positives reach `level` (NaN if never reached)
            nb_calls[column] = nb_calls.loc[nb_calls.cum_Xsellers == level, 'rank'].min()
            contact_columns.append(column)
        nb_calls['nb_contacts_all'] = nb_calls.loc[nb_calls.cum_Xsellers == nb_calls.cum_Xsellers.max(), 'rank'].min()
        contact_columns.append('nb_contacts_all')
        nb_calls = nb_calls[contact_columns].min()
        nb_calls = pd.DataFrame(nb_calls, columns=['nb_contacts'])
        nb_calls['total_population'] = fold_results.shape[0]
        nb_calls['total_pos_targets'] = fold_results.target.sum()
        nb_calls['nb_pos_targets'] = pos_target_levels + [fold_results.target.sum()]
        nb_calls['pos_rate'] = nb_calls.nb_pos_targets / nb_calls.nb_contacts
        nb_calls['Percentage_of_pos_targets_found'] = nb_calls.nb_pos_targets / nb_calls.total_pos_targets
        nb_calls['Percentage_of_Population'] = nb_calls.nb_contacts / nb_calls.total_population
        nb_calls['Lift'] = nb_calls.Percentage_of_pos_targets_found / nb_calls.Percentage_of_Population

        nb_calls_cv = nb_calls_cv.append(nb_calls)

        ###############################################################################
        # gain-based feature importances of the model fitted on this fold
        feature_importances_data = [
            {'feature': feature_name, 'importance': feature_importance}
            for feature_name, feature_importance
            in get_importance(_classifier.booster(), 'gain').items()
        ]

        fold_importances = pd.DataFrame(feature_importances_data, columns=['feature', 'importance'])
        fold_importances['fold'] = i
        feature_importances = feature_importances.append(fold_importances)

    # average the contact table over the folds, one row per nb_contacts_* label
    nb_calls_cv = nb_calls_cv.reset_index().groupby('index').mean().sort_values(by='nb_pos_targets')
    # fold sizes can differ by one row, so the averaged population is rounded
    nb_calls_cv['total_population'] = nb_calls_cv['total_population'].apply(lambda x: round(x, 0))
    results_cv_targeting = results_cv_targeting.reset_index(drop=True)

    feature_importances = feature_importances.groupby('feature')['importance'].agg([np.mean, np.std])
    feature_importances = feature_importances.sort_values(by='mean')
    feature_importances = feature_importances.reset_index()

    plt.subplot(2, 2, 1)
    mean_tpr /= len(kf)
    mean_tpr[-1] = 1.0
    mean_auc = sk.metrics.auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--', label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC', fontsize=14)
    plt.grid(True)
    plt.legend(loc="lower right")

    plt.subplot(2, 2, 2)
    mean_tp /= len(kf)
    mean_tp[-1] = 100.0
    mean_lift10 = np.mean(mean_lift)
    print('Mean AUC: %f, Mean lift at 10 percent: %f' % (mean_auc, mean_lift10))
    plt.plot(mean_fp, mean_tp, 'k--', label='Mean lift at 10 = %0.2f' % mean_lift10, lw=2)

    # grey diagonal; the colour is given once, through `color`
    plt.plot([0, 100], [0, 100], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-5, 105])
    plt.ylim([-5, 105])
    plt.xlabel('Percentage of population')
    plt.ylabel('Cumulative gain')
    plt.title('Lift', fontsize=14)
    plt.grid(True)
    plt.legend(loc="lower right")

    plt.show()

    return results_cv_targeting, feature_importances, nb_calls_cv

In [None]:
# parameters of the classifier need to be changed according to datasets
# Hyper-parameters of the classifier; they need to be changed according to datasets.
xgb_params = dict(
    objective='binary:logistic',
    max_depth=6,
    n_estimators=200,
    learning_rate=0.05,
    max_delta_step=1,
    min_child_weight=25,
    gamma=0.1,
    scale_pos_weight=1,
    colsample_bytree=0.85,
    subsample=0.85,
    colsample_bylevel=0.85,
    nthread=10,
    seed=27,
)
classifier = xgb.XGBClassifier(**xgb_params)

# Cross-validate on the targeting dataset and keep the three result tables.
cv_outputs = run_cross_validation(dataset_modelA_clean_targeting, classifier, feature_columns)
results_cv_targeting, feature_importances, nb_calls_cv = cv_outputs

In [None]:
# Display the fold-averaged contact table returned by run_cross_validation.
nb_calls_cv

In [None]:
# Same contact table as inside run_cross_validation, but computed once over
# the validation scores of all folds pooled together.
nb_calls_all_cv = pd.DataFrame([], columns=['nb_contacts', 'total_population', 'total_pos_targets', 'nb_pos_targets', 'pos_rate',
                                            'Percentage_of_pos_targets_found', 'Percentage_of_Population', 'Lift'])

# Calculate nb calls to make
pos_target_levels = [100, 200, 500, 1000, 2000, 3000]
nb_calls_all = results_cv_targeting[['target', 'prob_of_all']].copy()
nb_calls_all = nb_calls_all.sort_values(by='prob_of_all', ascending=False).reset_index(drop=True)
nb_calls_all['cum_Xsellers'] = np.cumsum(nb_calls_all.target)
nb_calls_all = nb_calls_all.reset_index(drop=False)
nb_calls_all = nb_calls_all.rename(columns={'index': 'rank'})
# The reset index is 0-based; the row at position r is the (r + 1)-th call.
nb_calls_all['rank'] = nb_calls_all['rank'] + 1
call_columns = []
for level in pos_target_levels:
    column = 'nb_calls_%d' % level
    # first call at which the cumulative positives reach `level` (NaN if never reached)
    nb_calls_all[column] = nb_calls_all.loc[nb_calls_all.cum_Xsellers == level, 'rank'].min()
    call_columns.append(column)
nb_calls_all['nb_calls_all'] = nb_calls_all.loc[nb_calls_all.cum_Xsellers == nb_calls_all.cum_Xsellers.max(), 'rank'].min()
call_columns.append('nb_calls_all')
nb_calls_all = nb_calls_all[call_columns].min()
nb_calls_all = pd.DataFrame(nb_calls_all, columns=['nb_contacts'])
nb_calls_all['total_population'] = results_cv_targeting.shape[0]
nb_calls_all['total_pos_targets'] = results_cv_targeting.target.sum()
nb_calls_all['nb_pos_targets'] = pos_target_levels + [results_cv_targeting.target.sum()]
nb_calls_all['pos_rate'] = nb_calls_all.nb_pos_targets / nb_calls_all.nb_contacts
nb_calls_all['Percentage_of_pos_targets_found'] = nb_calls_all.nb_pos_targets / nb_calls_all.total_pos_targets
nb_calls_all['Percentage_of_Population'] = nb_calls_all.nb_contacts / nb_calls_all.total_population
nb_calls_all['Lift'] = nb_calls_all.Percentage_of_pos_targets_found / nb_calls_all.Percentage_of_Population


nb_calls_all_cv = nb_calls_all_cv.append(nb_calls_all)

In [None]:
# Display the contact table computed over the pooled validation scores.
nb_calls_all_cv

In [None]:
##objective for having 500 positive targets

# Rank the scored rows by decreasing probability and accumulate the positives.
p1 = results_cv_targeting.sort_values(by='prob_of_all', ascending=False).reset_index(drop=True)
p1['nb_pos_target'] = np.cumsum(p1.target)
# 0-based position of the first row at which 500 positives are reached
# (NaN when fewer than 500 positives exist)
index = p1.loc[p1.nb_pos_target == 500].index.min()
# position r is the (r + 1)-th customer contacted, hence the + 1
print('min number to contact %f to get 500 Xsellers' % (index + 1))

In [None]:
##objective for taking maximum of agents capability (1/10 per week, min conversion rate 10%)

# Rank the scored rows by decreasing probability and track the running
# conversion rate after each contact.
r1 = results_cv_targeting.sort_values(by='prob_of_all', ascending=False).reset_index(drop=True)
r1['nb_pos_target'] = np.cumsum(r1.target)
r1 = r1.reset_index(drop=False)
r1 = r1.rename(columns={'index': 'nb_contact'})
# make the contact counter 1-based
r1.nb_contact = r1.nb_contact + 1
r1['pos_rate'] = r1['nb_pos_target'] / r1['nb_contact']
# 0-based position of the last row whose running conversion rate is still above 10%
index = r1[(r1.pos_rate > 0.1)].index.max()
# report the 1-based contact count at that row rather than the 0-based index
max_contacts = r1.loc[r1.pos_rate > 0.1, 'nb_contact'].max()
print('max nb to contact %f customers to guarantee 10percente conversion rate' % (max_contacts))