#### Continuation from previous notebook

Recall from second notebook: 'Find a base and add on new features'. 

Created a small set of new features in the third notebook. 

In the final (fourth) notebook, run models on the 'final' dataset.

Run models: 
   1. Random Forest 
   2. Logistic Regression  
   3. SGD
   4. GBC
   5. AdaBoost Decision Tree.
   6. Ensemble of ensembles (voting=soft).
   7. Ensemble of 'best' (voting=soft).

In [44]:
## IMPORTS ##

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib for plotting
import matplotlib.pyplot as plt

# garbage collector
import gc

In [45]:
# Load data fcn
def load_credit_data(data_path, data_dir="data"):
    """Read one CSV file from the data directory into a DataFrame.

    Parameters
    ----------
    data_path : str
        File name (or relative path) of the CSV inside ``data_dir``.
    data_dir : str, optional
        Directory holding the CSV files. Defaults to ``"data"``, which is
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    csv_path = os.path.join(data_dir, data_path)
    return pd.read_csv(csv_path)

In [46]:
# Base training set: the "_id" variant of the file, since SK_ID_CURR is
# needed as the join key for the feature merge further down.
training_df = load_credit_data("training_top_id.csv")
print(training_df.shape)

(307511, 268)


In [47]:
# New-feature table for the training rows (v1 feature file)
features_training_df = load_credit_data("training_new_features_v1.csv")
print(features_training_df.shape)
features_training_df.head()

(307511, 92)


Unnamed: 0,SK_ID_CURR,APP_CREDIT_TO_ANNUITY_RATIO,APP_CREDIT_TO_GOODS_RATIO,APP_INCOME_PER_CHLD,APP_CREDIT_TO_INCOME_RATIO,APP_ANNUITY_TO_INCOME_RATIO,APP_SOURCES_PROD,APP_EXT_SOURCES_MEAN,APP_EXT_SCORES_STD,APP_CAR_TO_BIRTH_RATIO,...,INSTALL_DAYS_ENTRY_PAYMENT_MEDIAN,INSTALL_DAYS_INSTALMENT_MAX,INSTALL_DAYS_INSTALMENT_MIN,INSTALL_DAYS_INSTALMENT_MEDIAN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MAX,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MIN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MEAN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MAX,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MIN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MEAN
0,100002,16.461104,1.158397,202500.0,2.007889,0.121977,0.003043,0.161787,0.092026,-0.000627,...,-312.0,-25.0,-565.0,-295.0,1.0,1.0,1.0,31.0,12.0,20.421053
1,100003,36.234085,1.145199,270000.0,4.79075,0.132216,0.119932,0.466757,0.219895,-0.000627,...,-806.0,-536.0,-2310.0,-797.0,1.0,1.0,1.0,14.0,1.0,7.16
2,100004,20.0,1.0,67500.0,2.0,0.099999,0.119932,0.642739,0.122792,-0.001365,...,-763.0,-724.0,-784.0,-754.0,1.0,1.0,1.0,11.0,3.0,7.666667
3,100006,10.532818,1.052803,135000.0,2.316167,0.219898,0.119932,0.650442,0.136021,-0.000627,...,-211.0,-11.0,-545.0,-206.0,1.0,1.0,1.0,77.0,1.0,19.375
4,100007,23.461618,1.0,121500.0,4.222222,0.179961,0.119932,0.322738,0.136021,-0.000627,...,-852.5,-14.0,-2326.0,-851.0,1.0,5e-05,0.954545,31.0,-12.0,3.636364


In [48]:
# Build the full training set: left-join the new features onto the base rows
# using SK_ID_CURR as the key (a key-based join rather than a positional
# column-wise concat, so row order in the two files does not matter).
training_df_merged = pd.merge(
    training_df,
    features_training_df,
    how='left',
    on='SK_ID_CURR',
)

print(training_df_merged.shape)
training_df_merged.head()

(307511, 359)


Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,INSTALL_DAYS_ENTRY_PAYMENT_MEDIAN,INSTALL_DAYS_INSTALMENT_MAX,INSTALL_DAYS_INSTALMENT_MIN,INSTALL_DAYS_INSTALMENT_MEDIAN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MAX,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MIN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MEAN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MAX,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MIN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MEAN
0,100002,0,202500.0,406597.5,24700.5,0.018801,-9461,-637,-3648.0,-2120,...,-312.0,-25.0,-565.0,-295.0,1.0,1.0,1.0,31.0,12.0,20.421053
1,100003,0,270000.0,1293502.5,35698.5,0.003541,-16765,-1188,-1186.0,-291,...,-806.0,-536.0,-2310.0,-797.0,1.0,1.0,1.0,14.0,1.0,7.16
2,100004,0,67500.0,135000.0,6750.0,0.010032,-19046,-225,-4260.0,-2531,...,-763.0,-724.0,-784.0,-754.0,1.0,1.0,1.0,11.0,3.0,7.666667
3,100006,0,135000.0,312682.5,29686.5,0.008019,-19005,-3039,-9833.0,-2437,...,-211.0,-11.0,-545.0,-206.0,1.0,1.0,1.0,77.0,1.0,19.375
4,100007,0,121500.0,513000.0,21865.5,0.028663,-19932,-3038,-4311.0,-3458,...,-852.5,-14.0,-2326.0,-851.0,1.0,5e-05,0.954545,31.0,-12.0,3.636364


In [49]:
# Base testing set: the "_id" variant, which carries the SK_ID_CURR join key
testing_df = load_credit_data("testing_top_id.csv")
print(testing_df.shape)

(48744, 268)


In [50]:
# New-feature table for the testing rows (v1 feature file)
features_testing_df = load_credit_data("testing_new_features_v1.csv")
print(features_testing_df.shape)
features_testing_df.head()

(48744, 92)


Unnamed: 0,SK_ID_CURR,APP_CREDIT_TO_ANNUITY_RATIO,APP_CREDIT_TO_GOODS_RATIO,APP_INCOME_PER_CHLD,APP_CREDIT_TO_INCOME_RATIO,APP_ANNUITY_TO_INCOME_RATIO,APP_SOURCES_PROD,APP_EXT_SOURCES_MEAN,APP_EXT_SCORES_STD,APP_CAR_TO_BIRTH_RATIO,...,INSTALL_DAYS_ENTRY_PAYMENT_MEDIAN,INSTALL_DAYS_INSTALMENT_MAX,INSTALL_DAYS_INSTALMENT_MIN,INSTALL_DAYS_INSTALMENT_MEDIAN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MAX,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MIN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MEAN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MAX,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MIN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MEAN
0,100001,27.664697,1.264,135000.0,4.213333,0.152299,0.094803,0.567263,0.353601,-0.000624,...,-1715.0,-1619.0,-2916.0,-1709.0,1.0,1.0,1.0,36.0,-11.0,7.285714
1,100005,12.82487,1.2376,99000.0,2.250182,0.175453,0.071345,0.429869,0.136694,-0.000624,...,-585.0,-466.0,-706.0,-586.0,1.0,1.0,1.0,37.0,-1.0,23.555556
2,100013,9.505482,1.0528,202500.0,3.275378,0.344576,0.119686,0.655389,0.062788,-0.00025,...,-1383.0,-14.0,-2705.0,-1383.0,1.0,0.000266,0.935484,38.0,-21.0,5.180645
3,100028,32.130726,1.0,105000.0,5.0,0.155614,0.164177,0.549372,0.055432,-0.000624,...,-812.0,-27.0,-1773.0,-812.0,1.0,0.030496,0.911504,19.0,-7.0,3.0
4,100038,19.506034,1.0,90000.0,3.475,0.178149,0.119686,0.313916,0.158068,-0.001227,...,-634.0,-457.0,-787.0,-622.0,1.0,1.0,1.0,18.0,9.0,12.25


In [51]:
# Build the full testing set: left-join the new features onto the base rows
# using SK_ID_CURR as the key (key-based join instead of a positional concat).
testing_df_merged = pd.merge(
    testing_df,
    features_testing_df,
    how='left',
    on='SK_ID_CURR',
)

print(testing_df_merged.shape)
testing_df_merged.head()

(48744, 359)


Unnamed: 0,SK_ID_CURR,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,INSTALL_DAYS_ENTRY_PAYMENT_MEDIAN,INSTALL_DAYS_INSTALMENT_MAX,INSTALL_DAYS_INSTALMENT_MIN,INSTALL_DAYS_INSTALMENT_MEDIAN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MAX,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MIN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MEAN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MAX,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MIN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MEAN
0,100001,0,135000.0,568800.0,20560.5,0.01885,-19241,-2329,-5170.0,-812,...,-1715.0,-1619.0,-2916.0,-1709.0,1.0,1.0,1.0,36.0,-11.0,7.285714
1,100005,0,99000.0,222768.0,17370.0,0.035792,-18064,-4469,-9118.0,-1623,...,-585.0,-466.0,-706.0,-586.0,1.0,1.0,1.0,37.0,-1.0,23.555556
2,100013,0,202500.0,663264.0,69777.0,0.019101,-20038,-4458,-2175.0,-3503,...,-1383.0,-14.0,-2705.0,-1383.0,1.0,0.000266,0.935484,38.0,-21.0,5.180645
3,100028,2,315000.0,1575000.0,49018.5,0.026392,-13976,-1866,-2000.0,-4208,...,-812.0,-27.0,-1773.0,-812.0,1.0,0.030496,0.911504,19.0,-7.0,3.0
4,100038,1,180000.0,625500.0,32067.0,0.010032,-13040,-2191,-4000.0,-4262,...,-634.0,-457.0,-787.0,-622.0,1.0,1.0,1.0,18.0,9.0,12.25


In [52]:
# Remove the SK_ID_CURR key now that the join is done, so it is not fed to the models.
# NOTE(review): this rebinds `training_df` / `testing_df` to the merged, ID-less
# frames; re-running the merge cells afterwards without reloading the CSVs would
# presumably fail because the join key is gone - confirm and consider new names.
training_df = training_df_merged.drop(labels=['SK_ID_CURR'], axis='columns')
testing_df = testing_df_merged.drop(labels=['SK_ID_CURR'], axis='columns')

In [53]:
# Preview the first rows of the merged training frame after dropping SK_ID_CURR
training_df.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,...,INSTALL_DAYS_ENTRY_PAYMENT_MEDIAN,INSTALL_DAYS_INSTALMENT_MAX,INSTALL_DAYS_INSTALMENT_MIN,INSTALL_DAYS_INSTALMENT_MEDIAN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MAX,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MIN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MEAN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MAX,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MIN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MEAN
0,0,202500.0,406597.5,24700.5,0.018801,-9461,-637,-3648.0,-2120,9.0,...,-312.0,-25.0,-565.0,-295.0,1.0,1.0,1.0,31.0,12.0,20.421053
1,0,270000.0,1293502.5,35698.5,0.003541,-16765,-1188,-1186.0,-291,9.0,...,-806.0,-536.0,-2310.0,-797.0,1.0,1.0,1.0,14.0,1.0,7.16
2,0,67500.0,135000.0,6750.0,0.010032,-19046,-225,-4260.0,-2531,26.0,...,-763.0,-724.0,-784.0,-754.0,1.0,1.0,1.0,11.0,3.0,7.666667
3,0,135000.0,312682.5,29686.5,0.008019,-19005,-3039,-9833.0,-2437,9.0,...,-211.0,-11.0,-545.0,-206.0,1.0,1.0,1.0,77.0,1.0,19.375
4,0,121500.0,513000.0,21865.5,0.028663,-19932,-3038,-4311.0,-3458,9.0,...,-852.5,-14.0,-2326.0,-851.0,1.0,5e-05,0.954545,31.0,-12.0,3.636364


In [54]:
# Preview the first rows of the merged testing frame after dropping SK_ID_CURR
testing_df.head()

Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,...,INSTALL_DAYS_ENTRY_PAYMENT_MEDIAN,INSTALL_DAYS_INSTALMENT_MAX,INSTALL_DAYS_INSTALMENT_MIN,INSTALL_DAYS_INSTALMENT_MEDIAN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MAX,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MIN,INSTALL_INSTPAY_PAY_TO_INSTALL_RATIO_MEAN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MAX,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MIN,INSTALL_INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY_MEAN
0,0,135000.0,568800.0,20560.5,0.01885,-19241,-2329,-5170.0,-812,9.0,...,-1715.0,-1619.0,-2916.0,-1709.0,1.0,1.0,1.0,36.0,-11.0,7.285714
1,0,99000.0,222768.0,17370.0,0.035792,-18064,-4469,-9118.0,-1623,9.0,...,-585.0,-466.0,-706.0,-586.0,1.0,1.0,1.0,37.0,-1.0,23.555556
2,0,202500.0,663264.0,69777.0,0.019101,-20038,-4458,-2175.0,-3503,5.0,...,-1383.0,-14.0,-2705.0,-1383.0,1.0,0.000266,0.935484,38.0,-21.0,5.180645
3,2,315000.0,1575000.0,49018.5,0.026392,-13976,-1866,-2000.0,-4208,9.0,...,-812.0,-27.0,-1773.0,-812.0,1.0,0.030496,0.911504,19.0,-7.0,3.0
4,1,180000.0,625500.0,32067.0,0.010032,-13040,-2191,-4000.0,-4262,16.0,...,-634.0,-457.0,-787.0,-622.0,1.0,1.0,1.0,18.0,9.0,12.25


In [55]:
# Training labels: read the label file and keep an independent copy of TARGET
# NOTE(review): rows are assumed to be in the same order as training_df - confirm.
labels_df = load_credit_data("y_labels.csv")
print(labels_df.shape)

y_train = labels_df.loc[:, 'TARGET'].copy()

(307511, 1)


In [None]:
# ROC curve
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(fpr, tpr, label=None):
    """Draw one ROC curve on the current matplotlib axes.

    Plots ``tpr`` against ``fpr`` with the given legend ``label``, adds the
    dashed black diagonal from (0, 0) to (1, 1), fixes both axes to [0, 1]
    and sets the axis labels. The caller creates the figure and the legend.
    """
    unit_range = [0, 1]
    plt.plot(fpr, tpr, label=label, linewidth=2)
    plt.plot(unit_range, unit_range, 'k--')
    plt.axis(unit_range + unit_range)
    label_size = 16
    plt.xlabel('False Positive Rate', fontsize=label_size)
    plt.ylabel('True Positive Rate', fontsize=label_size)

### Random Forest

In [None]:
# Grid-search the number of trees of a Random Forest, scored by ROC AUC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

final_rf = RandomForestClassifier(random_state=123)

# Candidate forest sizes: a small one, the 250 used before, and a slightly bigger 300
param_grid = {'n_estimators': [20, 250, 300]}

# Only 3 folds, to keep the computation time down
grid_search_final_rf = GridSearchCV(
    estimator=final_rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=100,
)
grid_search_final_rf.fit(training_df, y_train)

# Best n_estimators found by the search
print(grid_search_final_rf.best_params_)
print ("------------")

# Mean cross-validated score of every candidate
cvres = grid_search_final_rf.cv_results_
for idx, params in enumerate(cvres["params"]):
    print(cvres["mean_test_score"][idx], params)

In [None]:
# Find BEST model
# Fix: cross_val_predict was used here before any cell had imported it, so this
# cell raised NameError on a fresh kernel. Import it where it is first needed.
from sklearn.model_selection import cross_val_predict

# Estimator refit by the grid search with its best parameters
forest_final_clf = grid_search_final_rf.best_estimator_

# Out-of-fold class probabilities (3-fold) so the ROC curve is not computed on
# data the model was fitted on.
y_probas_final_forest = cross_val_predict(forest_final_clf, training_df, y_train, cv=3, method="predict_proba")
# Column 1 is the second class in classes_ (presumably TARGET == 1 - confirm)
y_scores_final_forest = y_probas_final_forest[:, 1]
fpr_final_forest, tpr_final_forest, thresholds_final_forest = roc_curve(y_train, y_scores_final_forest)

print ("AUC: ", auc(fpr_final_forest, tpr_final_forest))

In [None]:
# Score the test set with the tuned Random Forest and write the submission file.
# Fix: the original called `forest_merge_clf`, a name that is never defined in
# this notebook; the tuned estimator produced above is `forest_final_clf`.
predictions = forest_final_clf.predict_proba(testing_df)[:, 1]
submit = load_credit_data ("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('random_forest_features_final.csv', index = False)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# The grid below includes penalty='l1', which only some solvers support.
# Pin 'liblinear' (handles both 'l1' and 'l2') so the search does not depend on
# the library's default solver.
# NOTE(review): the features are passed unscaled here - confirm that is intended.
log_rf = LogisticRegression(random_state=123, solver='liblinear')

# Inverse regularisation strength C over several orders of magnitude, for both penalties
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],'penalty': ['l1', 'l2']}

grid_search_log_rf = GridSearchCV(estimator=log_rf, param_grid=param_grid , cv=5, scoring='roc_auc', verbose=100)

grid_search_log_rf.fit(training_df, y_train)

# Best C / penalty combination found by the search
print(grid_search_log_rf.best_params_)
print ("------------")

# Mean cross-validated ROC AUC of every candidate
cvres = grid_search_log_rf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

Fitting 5 folds for each of 14 candidates, totalling 70 fits
[CV] C=0.001, penalty=l1 .............................................
[CV] ..... C=0.001, penalty=l1, score=0.730554573229131, total= 2.8min
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.8min remaining:    0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] .... C=0.001, penalty=l1, score=0.7275172951734359, total= 3.8min
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  6.6min remaining:    0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] .... C=0.001, penalty=l1, score=0.7271524403958223, total= 2.4min
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  9.1min remaining:    0.0s
[CV] C=0.001, penalty=l1 .............................................
[CV] .... C=0.001, penalty=l1, score=0.7303388893736781, total= 3.8min
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 12.9min remaining:    0.0s
[CV] C=0.001, penalty=l1 ..........................

In [None]:
# Cross-validated ROC / AUC for the tuned logistic regression
from sklearn.model_selection import cross_val_predict

# Estimator refit by the grid search with its best parameters
log_final_clf = grid_search_log_rf.best_estimator_

# Out-of-fold probabilities from 3-fold CV; column 1 is used as the score
y_probas_log_final = cross_val_predict(
    log_final_clf,
    training_df,
    y_train,
    method="predict_proba",
    cv=3,
)
y_scores_log_final = y_probas_log_final[:, 1]
fpr_log_final, tpr_log_final, thresholds_log_final = roc_curve(y_train, y_scores_log_final)

print ("AUC: ", auc(fpr_log_final, tpr_log_final))

In [None]:
# Score the test set with the tuned logistic regression and write the submission file
test_probas = log_final_clf.predict_proba(testing_df)
predictions = test_probas[:, 1]
submit = load_credit_data("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('log_features_final.csv', index=False)

### SGD

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training set only, then apply the same
# transform to both sets so the test data does not influence the scaling.
scaler = StandardScaler().fit(training_df)

X_tr, X_te = (scaler.transform(frame) for frame in (training_df, testing_df))

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

model_sgd = SGDClassifier(random_state=123)

# Both losses support predict_proba, which the evaluation and submission cells rely on
param_distributions = {'loss': ['log', 'modified_huber'], 'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], 'penalty': ['l1', 'l2']}

# Fix: select hyper-parameters by ROC AUC, like the Random Forest and Logistic
# Regression searches and the AUC reported for every model. Without `scoring`
# the search falls back to the estimator's default score (accuracy).
grid_search_sgd_cv = GridSearchCV(model_sgd, param_distributions, cv=2, scoring='roc_auc', verbose=100)
grid_search_sgd_cv.fit(X_tr, y_train)

# Best loss / alpha / penalty combination found by the search
print(grid_search_sgd_cv.best_params_)
print ("------------")

# Mean cross-validated ROC AUC of every candidate
cvres = grid_search_sgd_cv.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
# Cross-validated ROC / AUC for the tuned SGD classifier (on the scaled features)
from sklearn.model_selection import cross_val_predict

# Estimator refit by the grid search with its best parameters
sgd_final_clf = grid_search_sgd_cv.best_estimator_

# Out-of-fold probabilities from 3-fold CV; column 1 is used as the score
y_probas_sgd_final = cross_val_predict(
    sgd_final_clf,
    X_tr,
    y_train,
    method="predict_proba",
    cv=3,
)
y_scores_sgd_final = y_probas_sgd_final[:, 1]
fpr_sgd_final, tpr_sgd_final, thresholds_sgd_final = roc_curve(y_train, y_scores_sgd_final)

print ("AUC: ", auc(fpr_sgd_final, tpr_sgd_final))

In [None]:
# Score the scaled test set through the fitted grid search (which predicts with
# its refit best estimator) and write the submission file
test_probas = grid_search_sgd_cv.predict_proba(X_te)
predictions = test_probas[:, 1]
submit = load_credit_data("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('sgd_features_final.csv', index=False)

### GBC

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

model_gbc = GradientBoostingClassifier(random_state=123)

# 5 depths x 4 ensemble sizes x 5 learning rates = 100 candidates
param_distributions = {'max_depth': np.arange(1, 6), 'n_estimators': [20, 150, 250, 300], 'learning_rate': [0.01, 0.1, 0.5, 0.7, 0.8] }

# Fix 1: the original fitted on `top_training_df`, which is never defined in
# this notebook; the merged training frame used by every other model is `training_df`.
# Fix 2: score by ROC AUC for consistency with the other searches and the AUC
# reported for every model (the default would be accuracy).
grid_search_gbc_cv = GridSearchCV(model_gbc, param_distributions, cv=2, scoring='roc_auc', verbose=100)
grid_search_gbc_cv.fit(training_df, y_train)

# Best max_depth / n_estimators / learning_rate combination found by the search
print(grid_search_gbc_cv.best_params_)
print ("------------")

# Mean cross-validated ROC AUC of every candidate
cvres = grid_search_gbc_cv.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
from sklearn.model_selection import cross_val_predict

# Estimator refit by the grid search with its best parameters
gbc_final_clf = grid_search_gbc_cv.best_estimator_

# Fix: the original evaluated on `top_training_df`, which is never defined in
# this notebook; use the merged training frame `training_df` like the other models.
y_probas_gbc_final = cross_val_predict(gbc_final_clf, training_df, y_train, cv=3, method="predict_proba")
# Column 1 of the out-of-fold probabilities is used as the score
y_scores_gbc_final = y_probas_gbc_final[:, 1]
fpr_gbc_final, tpr_gbc_final, thresholds_gbc_final = roc_curve(y_train, y_scores_gbc_final)

print ("AUC: ", auc(fpr_gbc_final, tpr_gbc_final))

In [None]:
# Score the test set with the tuned gradient boosting model and write the submission file.
# Fix: the original predicted on `top_testing_df`, which is never defined in
# this notebook; the merged test frame is `testing_df`.
predictions = grid_search_gbc_cv.predict_proba(testing_df)[:, 1]
submit = load_credit_data ("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('gbc_features_final.csv', index = False)

### AdaBoost on DecisionTree

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# No base estimator is passed, so AdaBoost uses its default weak learner
model_ada = AdaBoostClassifier(random_state=123)

param_distributions = {'n_estimators': [20, 250, 300], 'learning_rate': [0.2, 0.5, 0.7]}
# Fix: score by ROC AUC for consistency with the Random Forest / Logistic
# Regression searches and the AUC reported for every model (default is accuracy).
grid_search_ada_cv = GridSearchCV(model_ada, param_distributions, cv=2, scoring='roc_auc', verbose=100)

grid_search_ada_cv.fit(training_df, y_train)

# Best n_estimators / learning_rate combination found by the search
print(grid_search_ada_cv.best_params_)
print ("------------")

# Mean cross-validated ROC AUC of every candidate
cvres = grid_search_ada_cv.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
# Cross-validated ROC / AUC for the tuned AdaBoost model
from sklearn.model_selection import cross_val_predict

# Estimator refit by the grid search with its best parameters
ada_final_clf = grid_search_ada_cv.best_estimator_

# Out-of-fold probabilities from 3-fold CV; column 1 is used as the score
y_probas_ada_final = cross_val_predict(
    ada_final_clf,
    training_df,
    y_train,
    method="predict_proba",
    cv=3,
)
y_scores_ada_final = y_probas_ada_final[:, 1]
fpr_ada_final, tpr_ada_final, thresholds_ada_final = roc_curve(y_train, y_scores_ada_final)

print ("AUC: ", auc(fpr_ada_final, tpr_ada_final))

In [None]:
# Score the test set with the tuned AdaBoost model and write the submission file
test_probas = ada_final_clf.predict_proba(testing_df)
predictions = test_probas[:, 1]
submit = load_credit_data("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('ada_features_final_dt.csv', index=False)

### Ensembles of 'Best'

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
# Fix: DecisionTreeClassifier was used below without ever being imported (NameError).
from sklearn.tree import DecisionTreeClassifier
# Imported here as well so this cell does not depend on the long-running
# grid-search cells having been executed first.
from sklearn.linear_model import LogisticRegression, SGDClassifier

# To make this notebook's output stable across runs
np.random.seed(123)

b_clf1 = RandomForestClassifier(n_estimators=300)
# Fix: penalty='l1' needs a solver that supports it; pin 'liblinear' explicitly
# instead of relying on the library's default solver.
b_clf2 = LogisticRegression(C=10, penalty='l1', solver='liblinear')
b_clf3 = DecisionTreeClassifier (max_depth=1, criterion='gini')
b_clf4 = SGDClassifier (alpha=0.1, loss='log', penalty='l2')
# NOTE(review): Gaussian process classification scales poorly with the number
# of samples; fitting it on the full training set is likely infeasible in time
# and memory - confirm, and subsample or drop this member if so.
b_clf5 = GaussianProcessClassifier () # Only 'new' one

# Soft voting averages the members' predicted probabilities with equal weights
b_eclf = VotingClassifier(estimators=[('rfor', b_clf1), ('logreg', b_clf2), ('dt', b_clf3),('sgd', b_clf4),('gpc', b_clf5)], weights=[1,1,1,1,1], voting='soft')
# Fitted on the standardised features because the linear members are scale-sensitive
b_eclf = b_eclf.fit(X_tr, y_train)

In [None]:
# Cross-validated ROC / AUC for the "best models" voting ensemble (scaled features)
from sklearn.model_selection import cross_val_predict

# Out-of-fold probabilities from 3-fold CV; column 1 is used as the score
y_probas_bvote_final = cross_val_predict(
    b_eclf,
    X_tr,
    y_train,
    method="predict_proba",
    cv=3,
)
y_scores_bvote_final = y_probas_bvote_final[:, 1]
fpr_bvote_final, tpr_bvote_final, thresholds_bvote_final = roc_curve(y_train, y_scores_bvote_final)

print ("AUC: ", auc(fpr_bvote_final, tpr_bvote_final))

In [None]:
# Score the scaled test set with the "best models" ensemble and write the submission file
test_probas = b_eclf.predict_proba(X_te)
predictions = test_probas[:, 1]
submit = load_credit_data("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('best_voting_classifier_features_final.csv', index=False)

### Ensemble of Ensembles

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import (
    ExtraTreesClassifier,
    RandomForestClassifier,
    VotingClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
)

# conda install py-xgboost
import xgboost as xgb

# Seed numpy's global RNG so repeated runs give the same output
np.random.seed(123)

# Four ensemble learners, each with 300 estimators
clf1 = AdaBoostClassifier(n_estimators=300)
clf2 = ExtraTreesClassifier(n_estimators=300, n_jobs=-1, criterion='gini',max_depth=5)
clf3 = xgb.XGBClassifier(n_estimators=300, nthread=-1, max_depth=5)
clf4 = GradientBoostingClassifier(n_estimators=300, max_depth=5)

# Equal-weight soft vote over the four members
ensemble_members = [('ab', clf1), ('etc', clf2), ('xgb', clf3), ('gbc', clf4)]
eclf = VotingClassifier(
    estimators=ensemble_members,
    weights=[1, 1, 1, 1],
    voting='soft',
)
eclf = eclf.fit(training_df, y_train)

In [None]:
# Cross-validated ROC / AUC for the ensemble-of-ensembles voting classifier
from sklearn.model_selection import cross_val_predict

# Out-of-fold probabilities from 3-fold CV; column 1 is used as the score
y_probas_vote_final = cross_val_predict(
    eclf,
    training_df,
    y_train,
    method="predict_proba",
    cv=3,
)
y_scores_vote_final = y_probas_vote_final[:, 1]
fpr_vote_final, tpr_vote_final, thresholds_vote_final = roc_curve(y_train, y_scores_vote_final)

print ("AUC: ", auc(fpr_vote_final, tpr_vote_final))

In [None]:
# Score the test set with the ensemble of ensembles and write the submission file
test_probas = eclf.predict_proba(testing_df)
predictions = test_probas[:, 1]
submit = load_credit_data("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('voting_classifier_ensem_features_final.csv', index=False)

### Plot ROC curves

In [7]:
# Overlay the cross-validated ROC curves of all models on one figure.
# Every model's evaluation cell must have been run first: the fpr_* / tpr_*
# arrays are created there.
plt.figure(figsize=(8, 6))

roc_results = [
    (fpr_final_forest, tpr_final_forest, "Random Forest"),
    (fpr_log_final, tpr_log_final, "Log Reg"),
    (fpr_sgd_final, tpr_sgd_final, "SGD"),
    (fpr_gbc_final, tpr_gbc_final, "GBC"),
    (fpr_ada_final, tpr_ada_final, "AdaBoost"),
    (fpr_bvote_final, tpr_bvote_final, "Ensemble - Best"),
    (fpr_vote_final, tpr_vote_final, "Ensemble - Ensemble"),
]
for fpr, tpr, model_name in roc_results:
    plot_roc_curve(fpr, tpr, model_name)

plt.legend(loc="lower right", fontsize=16)
plt.title("Comparing Models")
plt.show()

NameError: name 'fpr_final_forest' is not defined

<matplotlib.figure.Figure at 0x1a0a0262b0>

### Submission scores:

   1. Random Forest 
   2. Logistic Regression  
   3. SGD
   4. GBC
   5. AdaBoost Decision Tree.
   6. Ensemble of ensembles (voting=soft).
   7. Ensemble of 'best' (voting=soft).