In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
pd.set_option('display.max_columns',None)

In [10]:
train = pd.read_csv('Training_3oct_sh.csv')
target = '47'
IDcol = 'application_key'
train.shape

(80000, 49)

In [11]:
from sklearn.ensemble import GradientBoostingClassifier  #GBM algorithm
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4



def modelfit(alg, dtrain, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target])
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    
    #Perform cross-validation:
    if performCV:
        cv_score = cross_validation.cross_val_score(alg, dtrain[predictors], dtrain[target], cv=cv_folds, scoring='roc_auc')
    
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
    
    if performCV:
        print("CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))
        
    #Print Feature Importance:
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, predictors).sort_values(ascending=False)
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')

In [12]:
predictors = [x for x in train.columns if x not in [target, IDcol]]
# gbm0 = GradientBoostingClassifier(random_state=10)
# modelfit(gbm0, train, predictors)

# n_estimator 80

In [13]:
#Choose all predictors except target & IDcols
# predictors = [x for x in train.columns if x not in [target, IDcol]]
param_test1 = {"n_estimators":[80,90,100,110,120,130,140,150,160,170]}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.2, min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(train[predictors],train[target])

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.2, loss='deviance', max_depth=8,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=50, min_samples_split=500,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=10, subsample=0.8, verbose=0,
              warm_start=False),
       fit_params={}, iid=False, n_jobs=4,
       param_grid={'n_estimators': [80, 90, 100, 110, 120, 130, 140, 150, 160, 170]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [14]:
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.83617, std: 0.00780, params: {'n_estimators': 80},
  mean: 0.83576, std: 0.00775, params: {'n_estimators': 90},
  mean: 0.83555, std: 0.00784, params: {'n_estimators': 100},
  mean: 0.83513, std: 0.00783, params: {'n_estimators': 110},
  mean: 0.83501, std: 0.00789, params: {'n_estimators': 120},
  mean: 0.83454, std: 0.00780, params: {'n_estimators': 130},
  mean: 0.83390, std: 0.00806, params: {'n_estimators': 140},
  mean: 0.83358, std: 0.00812, params: {'n_estimators': 150},
  mean: 0.83342, std: 0.00831, params: {'n_estimators': 160},
  mean: 0.83310, std: 0.00832, params: {'n_estimators': 170}],
 {'n_estimators': 80},
 0.83617238564389)

# max_depth': 5,

In [19]:
param_test2 = {'max_depth':[5,7,9,11,13,15,17,19], 'min_samples_split':[600,800,1000,1200,1400,1600]}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.2, n_estimators=80, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(train[predictors],train[target])
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.83757, std: 0.00694, params: {'max_depth': 5, 'min_samples_split': 600},
  mean: 0.83764, std: 0.00713, params: {'max_depth': 5, 'min_samples_split': 800},
  mean: 0.83748, std: 0.00758, params: {'max_depth': 5, 'min_samples_split': 1000},
  mean: 0.83761, std: 0.00769, params: {'max_depth': 5, 'min_samples_split': 1200},
  mean: 0.83758, std: 0.00749, params: {'max_depth': 5, 'min_samples_split': 1400},
  mean: 0.83820, std: 0.00730, params: {'max_depth': 5, 'min_samples_split': 1600},
  mean: 0.83648, std: 0.00673, params: {'max_depth': 7, 'min_samples_split': 600},
  mean: 0.83654, std: 0.00663, params: {'max_depth': 7, 'min_samples_split': 800},
  mean: 0.83758, std: 0.00663, params: {'max_depth': 7, 'min_samples_split': 1000},
  mean: 0.83680, std: 0.00716, params: {'max_depth': 7, 'min_samples_split': 1200},
  mean: 0.83761, std: 0.00681, params: {'max_depth': 7, 'min_samples_split': 1400},
  mean: 0.83768, std: 0.00618, params: {'max_depth': 7, 'min_samples_split': 160

# min sample split 1000 and min sample leaf 80

In [21]:
param_test3 = {'min_samples_split':[1000,1200,1400,1600,1800,2000,2200,2400], 'min_samples_leaf':[30,40,50,60,70,80]}
gsearch3 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80,max_depth=5,max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test3, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch3.fit(train[predictors],train[target])
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.83823, std: 0.00731, params: {'min_samples_leaf': 30, 'min_samples_split': 1000},
  mean: 0.83804, std: 0.00704, params: {'min_samples_leaf': 30, 'min_samples_split': 1200},
  mean: 0.83768, std: 0.00748, params: {'min_samples_leaf': 30, 'min_samples_split': 1400},
  mean: 0.83803, std: 0.00711, params: {'min_samples_leaf': 30, 'min_samples_split': 1600},
  mean: 0.83808, std: 0.00710, params: {'min_samples_leaf': 30, 'min_samples_split': 1800},
  mean: 0.83797, std: 0.00731, params: {'min_samples_leaf': 30, 'min_samples_split': 2000},
  mean: 0.83772, std: 0.00751, params: {'min_samples_leaf': 30, 'min_samples_split': 2200},
  mean: 0.83758, std: 0.00701, params: {'min_samples_leaf': 30, 'min_samples_split': 2400},
  mean: 0.83823, std: 0.00727, params: {'min_samples_leaf': 40, 'min_samples_split': 1000},
  mean: 0.83786, std: 0.00694, params: {'min_samples_leaf': 40, 'min_samples_split': 1200},
  mean: 0.83775, std: 0.00726, params: {'min_samples_leaf': 40, 'min_samples_spl

# max feature 23

In [23]:
param_test4 = {'max_features':[7,9,11,13,15,17,19,21,23,25]}
gsearch4 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=80,max_depth=5, min_samples_split=1000, min_samples_leaf=80, subsample=0.8, random_state=10),
param_grid = param_test4, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch4.fit(train[predictors],train[target])
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.83820, std: 0.00707, params: {'max_features': 7},
  mean: 0.83853, std: 0.00710, params: {'max_features': 9},
  mean: 0.83837, std: 0.00731, params: {'max_features': 11},
  mean: 0.83829, std: 0.00718, params: {'max_features': 13},
  mean: 0.83862, std: 0.00709, params: {'max_features': 15},
  mean: 0.83860, std: 0.00740, params: {'max_features': 17},
  mean: 0.83835, std: 0.00696, params: {'max_features': 19},
  mean: 0.83862, std: 0.00747, params: {'max_features': 21},
  mean: 0.83873, std: 0.00703, params: {'max_features': 23},
  mean: 0.83861, std: 0.00690, params: {'max_features': 25}],
 {'max_features': 23},
 0.8387290764439094)

# subsample': 0.85

In [26]:
param_test5 = {'subsample':[0.5,0.6,0.7,0.75,0.8,0.85,0.9]}
gsearch5 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.15, n_estimators=80,max_depth=5,min_samples_split=1000, min_samples_leaf=80, subsample=0.8, random_state=10,max_features=13),
param_grid = param_test5, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch5.fit(train[predictors],train[target])
gsearch5.grid_scores_, gsearch5.best_params_, gsearch5.best_score_

([mean: 0.83869, std: 0.00679, params: {'subsample': 0.5},
  mean: 0.83869, std: 0.00680, params: {'subsample': 0.6},
  mean: 0.83870, std: 0.00704, params: {'subsample': 0.7},
  mean: 0.83885, std: 0.00687, params: {'subsample': 0.75},
  mean: 0.83868, std: 0.00703, params: {'subsample': 0.8},
  mean: 0.83916, std: 0.00708, params: {'subsample': 0.85},
  mean: 0.83865, std: 0.00719, params: {'subsample': 0.9}],
 {'subsample': 0.85},
 0.8391571851187152)

In [27]:
lead_df=pd.read_csv('Lead_3oct_sh.csv',index_col='application_key')
# lead_df.columns=colm[:-1]
# lead_df=(lead_df-lead_df.mean(axis=0))/lead_df.std(axis=0)
# lead_df=lead_df[imp_col[:-1]]
# lead_df=Pca(lead_df)
print(len(predictors),lead_df.shape, 'gbm ')


lead_pred=gsearch5.predict(lead_df)
proba=gsearch5.predict_proba(lead_df)
lead_df['pridicted']=lead_pred
lead_df['prob']=proba[:,0]
lead_df=lead_df.sort_values(by=['prob'],ascending=False)
result=lead_df['pridicted'].astype(int)
result.to_csv('Datadevils_IITGuwahati_206.csv')

47 (25000, 47) gbm 


In [24]:
# lead_df.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,pridicted,prob
application_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
369499,1873.0,0.0,0.0,0.0,0.0,4360.0,135740.0,19820.0,84204.0,173567.0,0.0,0.0,48164.0,99100.0,489849.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2196.0,5445.0,5445.0,1460.0,5870.0,2008.0,0.0,1155.0,10.6667,0.0,0.0,3.0,11.0,1.0,0.0,0.0,51.069,0.0,10.0,0.97276,0.0,0.0,1.0,0.0,0.989401
352099,1921.0,0.0919,0.0,0.0,0.0,5002.0,123875.0,23585.0,204939.0,217215.0,0.0,3188.0,25194.0,123875.0,300192.0,0.0,0.0,0.0,0.0,0.0,4.467,457.11,480.4,2.32,1223.0,4684.0,4684.0,30386.0,5962.0,3589.0,0.0,1086.0,5.25,0.0,0.0,5.0,13.0,6.0,0.0,0.0,0.0,0.0,13.0,0.90942,0.0,0.0,1.0,0.0,0.989387
368180,1861.0,0.1036,0.0,0.0,0.0,2973.0,182925.0,31154.0,135553.0,280601.0,0.0,3367.0,802961.0,247751.0,499812.0,0.0,0.0,0.0,0.0,0.0,5.507,950.0,0.0,0.39,1816.0,3315.0,3315.0,1703.0,6722.0,3833.0,0.0,1015.0,2.0833,0.0,0.0,2.0,13.0,4.0,0.0,0.0,3.8,0.025,13.0,0.98292,0.0,0.0,1.0,0.0,0.98925
369170,1950.0,0.0,0.0,0.0,0.0,6918.0,49631.0,19634.0,300720.0,108487.0,0.0,2274.0,35339.0,128830.0,409205.0,0.0,0.0,0.0,0.0,0.0,3.891,298.35,0.0,2.39,2100.0,5901.0,5901.0,30386.0,13353.0,0.0,0.0,2070.0,19.0833,0.0,0.0,5.0,20.0,5.0,0.0,0.0,0.0,0.0,14.0,0.98846,0.0,0.0,1.0,0.0,0.98919
354652,1943.0,0.0544,0.0,0.0,0.0,1982.0,37454.0,20811.0,136064.0,103542.0,0.0,2968.0,41201.0,123875.0,322075.0,0.0,0.0,0.0,0.0,0.0,8.319,933.64,0.0,2.01,2623.0,10250.0,10250.0,30386.0,10250.0,0.0,0.0,2241.0,20.8333,0.0,0.0,4.0,19.0,2.0,0.0,0.0,0.0,0.0,12.0,0.9002,0.0,0.0,1.0,0.0,0.98919


# best columns 

In [28]:
modelfit(gsearch5, train, predictors)


Model Report
Accuracy : 0.8209
AUC Score (Train): 0.854807
CV Score : Mean - 0.8387382 | Std - 0.006893564 | Min - 0.828587 | Max - 0.8472151


AttributeError: 'GridSearchCV' object has no attribute 'feature_importances_'