#### Continuation from previous notebook

Recall from second notebook: 'Find a base and add on new features'. 

Created a small set of new features in the third notebook. 

In the final (fourth) notebook, run models on the 'final' dataset.

Run models: 
   1. Random Forest 
   2. Logistic Regression  
   3. SGD
   4. GBC
   5. AdaBoost Decision Tree.
   6. Ensemble of ensembles (voting=soft).
   7. Ensemble of 'best' (voting=soft).

In [1]:
## IMPORTS ##

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system manangement
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib for plotting
import matplotlib.pyplot as plt

# garbage collector
import gc

In [2]:
# Load data fcn
def load_credit_data(data_path):
    csv_path = os.path.join("data", data_path)
    return pd.read_csv(csv_path)

In [3]:
# Load training data
training_df = load_credit_data ("training_top.csv")
print (training_df.shape)

(307511, 267)


In [9]:
# Load features for training
features_training_df = load_credit_data ("training_new_features_v1.csv")
features_training_df =features_training_df.drop ('SK_ID_CURR', axis=1)
print (features_training_df.shape)
features_training_df.head()

(307511, 20)


Unnamed: 0,APP_NEW_CREDIT_TO_ANNUITY_RATIO,APP_NEW_CREDIT_TO_GOODS_RATIO,APP_NEW_INC_PER_CHLD,APP_NEW_ANNUITY_TO_INCOME_RATIO,APP_NEW_SOURCES_PROD,APP_NEW_EXT_SOURCES_MEAN,APP_NEW_SCORES_STD,APP_NEW_CAR_TO_BIRTH_RATIO,APP_NEW_CAR_TO_EMPLOY_RATIO,APP_NEW_PHONE_TO_BIRTH_RATIO,APP_NEW_PHONE_TO_EMPLOY_RATIO,APP_NEW_CREDIT_TO_INCOME_RATIO,PREV_NEW_PAYMENT_TO_CREDIT_RATIO,PREV_NEW_CREDIT_TO_APPLICATION_RATIO,PREV_NEW_CREDIT_TO_ANNUITY_RATIO,CREDIT_BAL_TO_LIMIT_RATIO,CREDIT_PAYMENT_TO_MININSTALLRATIO,CASH_FUTURE_TO_TERM_RATIO,INSTPAY_INSTALL_TO_PAY_RATIO,INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY
0,16.461104,1.158397,202500.0,0.121977,0.003043,0.161787,0.092026,-0.000627,-0.004357,0.11986,1.78022,2.007889,0.0,1.0,19.353584,0.0,0.0,0.625,1.0,20.421053
1,36.234085,1.145199,270000.0,0.132216,0.119932,0.466757,0.219895,-0.000627,-0.004357,0.049389,0.69697,4.79075,0.050304,1.057664,8.677472,0.0,0.0,0.544643,1.0,7.16
2,20.0,1.0,67500.0,0.099999,0.119932,0.642739,0.122792,-0.001365,-0.115556,0.042791,3.622222,2.0,0.241719,0.828021,3.753045,0.0,0.0,0.5625,1.0,7.666667
3,10.532818,1.052803,135000.0,0.219898,0.119932,0.650442,0.136021,-0.000627,-0.004357,0.032465,0.203027,2.316167,0.078823,1.008456,15.206011,0.0,1.067054,0.571429,1.0,19.375
4,23.461618,1.0,121500.0,0.179961,0.119932,0.322738,0.136021,-0.000627,-0.004357,0.055489,0.364055,4.222222,0.091961,1.046356,12.644075,0.0,0.0,0.557561,333.751175,3.636364


In [11]:
# Merge for full set
training_df = pd.concat ([training_df, features_training_df], axis=1)
print (training_df.shape)
training_df.head()

In [4]:
# Load testing data
testing_df = load_credit_data ("testing_top.csv")
print (testing_df.shape)

(48744, 267)


In [13]:
# Load features for testing
features_testing_df = load_credit_data ("testing_new_features_v1.csv")
features_testing_df = features_testing_df.drop ('SK_ID_CURR', axis=1)
print (features_testing_df.shape)
features_testing_df.head()

(48744, 20)


Unnamed: 0,APP_NEW_CREDIT_TO_ANNUITY_RATIO,APP_NEW_CREDIT_TO_GOODS_RATIO,APP_NEW_INC_PER_CHLD,APP_NEW_ANNUITY_TO_INCOME_RATIO,APP_NEW_SOURCES_PROD,APP_NEW_EXT_SOURCES_MEAN,APP_NEW_SCORES_STD,APP_NEW_CAR_TO_BIRTH_RATIO,APP_NEW_CAR_TO_EMPLOY_RATIO,APP_NEW_PHONE_TO_BIRTH_RATIO,APP_NEW_PHONE_TO_EMPLOY_RATIO,APP_NEW_CREDIT_TO_INCOME_RATIO,PREV_NEW_PAYMENT_TO_CREDIT_RATIO,PREV_NEW_CREDIT_TO_APPLICATION_RATIO,PREV_NEW_CREDIT_TO_ANNUITY_RATIO,CREDIT_BAL_TO_LIMIT_RATIO,CREDIT_PAYMENT_TO_MININSTALLRATIO,CASH_FUTURE_TO_TERM_RATIO,INSTPAY_INSTALL_TO_PAY_RATIO,INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY
0,27.664697,1.264,135000.0,0.152299,0.094803,0.567263,0.353601,-0.000624,-0.004,0.090432,0.747102,4.213333,0.10594,0.957782,6.020501,0.0,0.0,0.361111,1.0,7.285714
1,12.82487,1.2376,99000.0,0.175453,0.071345,0.429869,0.136694,-0.000624,-0.004,-0.0,-0.0,2.250182,0.080457,0.949975,9.212916,0.0,0.0,0.598485,1.0,23.555556
2,9.505482,1.0528,202500.0,0.344576,0.119686,0.655389,0.062788,-0.00025,-0.001122,0.042719,0.192014,3.275378,0.060075,1.039272,11.163349,0.115301,1.688193,0.631173,47.584916,5.180645
3,32.130726,1.0,105000.0,0.155614,0.164177,0.549372,0.055432,-0.000624,-0.004,0.12915,0.96731,5.0,0.057698,inf,14.073381,0.035934,inf,0.491129,1.486138,3.0
4,19.506034,1.0,90000.0,0.178149,0.119686,0.313916,0.158068,-0.001227,-0.007303,0.06296,0.374715,3.475,0.06858,1.131358,14.564047,0.0,0.0,0.487179,1.0,12.25


In [14]:
# Merge for full set
testing_df = pd.concat ([testing_df, features_testing_df], axis=1)
print (testing_df.shape)
testing_df.head()

(48744, 287)


Unnamed: 0,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,...,APP_NEW_PHONE_TO_EMPLOY_RATIO,APP_NEW_CREDIT_TO_INCOME_RATIO,PREV_NEW_PAYMENT_TO_CREDIT_RATIO,PREV_NEW_CREDIT_TO_APPLICATION_RATIO,PREV_NEW_CREDIT_TO_ANNUITY_RATIO,CREDIT_BAL_TO_LIMIT_RATIO,CREDIT_PAYMENT_TO_MININSTALLRATIO,CASH_FUTURE_TO_TERM_RATIO,INSTPAY_INSTALL_TO_PAY_RATIO,INSTPAY_DIFF_DAYS_INSTALL_DAYS_PAY
0,0,135000.0,568800.0,20560.5,0.01885,-19241,-2329,-5170.0,-812,9.0,...,0.747102,4.213333,0.10594,0.957782,6.020501,0.0,0.0,0.361111,1.0,7.285714
1,0,99000.0,222768.0,17370.0,0.035792,-18064,-4469,-9118.0,-1623,9.0,...,-0.0,2.250182,0.080457,0.949975,9.212916,0.0,0.0,0.598485,1.0,23.555556
2,0,202500.0,663264.0,69777.0,0.019101,-20038,-4458,-2175.0,-3503,5.0,...,0.192014,3.275378,0.060075,1.039272,11.163349,0.115301,1.688193,0.631173,47.584916,5.180645
3,2,315000.0,1575000.0,49018.5,0.026392,-13976,-1866,-2000.0,-4208,9.0,...,0.96731,5.0,0.057698,inf,14.073381,0.035934,inf,0.491129,1.486138,3.0
4,1,180000.0,625500.0,32067.0,0.010032,-13040,-2191,-4000.0,-4262,16.0,...,0.374715,3.475,0.06858,1.131358,14.564047,0.0,0.0,0.487179,1.0,12.25


In [5]:
# Load labels data
labels_df = load_credit_data ("y_labels.csv")
print (labels_df.shape)

y_train = labels_df['TARGET'].copy()

(307511, 1)


In [6]:
# ROC curve
from sklearn.metrics import roc_curve, auc

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

### Random Forest

In [None]:
# Run GridSearch cross validation with Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

final_rf = RandomForestClassifier(random_state=123)

# Random forest (similar to before with 250 + a bit bigger 300) 
param_grid = {'n_estimators': [20, 250, 300]}

# CV = 3 to cut short computational time
grid_search_final_rf = GridSearchCV(estimator=final_rf, param_grid=param_grid , cv=3, scoring='roc_auc', verbose=100)

grid_search_final_rf.fit(training_df, y_train)

# Results of the grid search for best n_estimator
print(grid_search_final_rf.best_params_)
print ("------------")

# Results of the grid search in general
cvres = grid_search_final_rf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
# Find BEST model

forest_final_clf = grid_search_final_rf.best_estimator_
y_probas_final_forest = cross_val_predict(forest_final_clf, training_df, y_train, cv=3, method="predict_proba")
y_scores_final_forest = y_probas_final_forest[:, 1] 
fpr_final_forest, tpr_final_forest, thresholds_final_forest = roc_curve(y_train, y_scores_final_forest)

print ("AUC: ", auc(fpr_final_forest, tpr_final_forest))

In [None]:
predictions = forest_merge_clf.predict_proba(testing_df)[:, 1]
submit = load_credit_data ("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('random_forest_features_final.csv', index = False)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

log_rf = LogisticRegression(random_state=123)

param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],'penalty': ['l1', 'l2']}

# CV = 3 to cut short computational time
grid_search_log_rf = GridSearchCV(estimator=log_rf, param_grid=param_grid , cv=3, scoring='roc_auc', verbose=100)

grid_search_log_rf.fit(training_df, y_train)

# Results of the grid search for best n_estimator
print(grid_search_log_rf.best_params_)
print ("------------")

# Results of the grid search in general
cvres = grid_search_log_rf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
# Find BEST model
from sklearn.model_selection import cross_val_predict

log_final_clf = grid_search_log_rf.best_estimator_
y_probas_log_final = cross_val_predict(log_final_clf, training_df, y_train, cv=3, method="predict_proba")
y_scores_log_final = y_probas_log_final[:, 1] 
fpr_log_final, tpr_log_final, thresholds_log_final = roc_curve(y_train, y_scores_log_final)

print ("AUC: ", auc(fpr_log_final, tpr_log_final))

In [None]:
predictions = log_final_clf.predict_proba(testing_df)[:, 1]
submit = load_credit_data ("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('log_features_final.csv', index = False)

### SGD

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(training_df)

X_tr = scaler.transform(training_df)
X_te = scaler.transform(testing_df)

In [None]:
from sklearn.linear_model import SGDClassifier

model_sgd = SGDClassifier(random_state=123)

param_distributions = {'loss': ['log', 'modified_huber'], 'alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], 'penalty': ['l1', 'l2']}

grid_search_sgd_cv = GridSearchCV(model_sgd, param_distributions, cv=2, verbose=100) 
grid_search_sgd_cv.fit(X_tr, y_train)

# Results of the grid search for best n_estimator
print(grid_search_sgd_cv.best_params_)
print ("------------")

# Results of the grid search in general
cvres = grid_search_sgd_cv.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
from sklearn.model_selection import cross_val_predict

sgd_final_clf = grid_search_sgd_cv.best_estimator_
y_probas_sgd_final = cross_val_predict(sgd_final_clf, X_tr, y_train, cv=3, method="predict_proba")
y_scores_sgd_final = y_probas_sgd_final[:, 1] 
fpr_sgd_final, tpr_sgd_final, thresholds_sgd_final = roc_curve(y_train, y_scores_sgd_final)

print ("AUC: ", auc(fpr_sgd_final, tpr_sgd_final))

In [None]:
predictions = grid_search_sgd_cv.predict_proba(X_te)[:, 1]
submit = load_credit_data ("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('sgd_features_final.csv', index = False)

### GBC

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

model_gbc = GradientBoostingClassifier(random_state=123)

param_distributions = {'max_depth': np.arange(1, 6), 'n_estimators': [20, 150, 250, 300], 'learning_rate': [0.01, 0.1, 0.5, 0.7, 0.8] }

grid_search_gbc_cv = GridSearchCV(model_gbc, param_distributions, cv=2, verbose=100) 
grid_search_gbc_cv.fit(top_training_df, y_train)

# Results of the grid search for best n_estimator
print(grid_search_gbc_cv.best_params_)
print ("------------")

# Results of the grid search in general
cvres = grid_search_gbc_cv.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
from sklearn.model_selection import cross_val_predict

gbc_final_clf = grid_search_gbc_cv.best_estimator_
y_probas_gbc_final = cross_val_predict(gbc_final_clf, top_training_df, y_train, cv=3, method="predict_proba")
y_scores_gbc_final = y_probas_gbc_final[:, 1] 
fpr_gbc_final, tpr_gbc_final, thresholds_gbc_final = roc_curve(y_train, y_scores_gbc_final)

print ("AUC: ", auc(fpr_gbc_final, tpr_gbc_final))

In [None]:
predictions = grid_search_gbc_cv.predict_proba(top_testing_df)[:, 1]
submit = load_credit_data ("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('gbc_features_final.csv', index = False)

### AdaBoost on DecisionTree

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

model_ada = AdaBoostClassifier(random_state=123)

param_distributions = {'n_estimators': [20, 250, 300], 'learning_rate': [0.2, 0.5, 0.7]}
grid_search_ada_cv = GridSearchCV(model_ada, param_distributions, cv=2, verbose=100) 

grid_search_ada_cv.fit(training_df, y_train)

# Results of the grid search for best n_estimator
print(grid_search_ada_cv.best_params_)
print ("------------")

# Results of the grid search in general
cvres = grid_search_ada_cv.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
# Find BEST model
from sklearn.model_selection import cross_val_predict

ada_final_clf = grid_search_ada_cv.best_estimator_
y_probas_ada_final = cross_val_predict(ada_final_clf, training_df, y_train, cv=3, method="predict_proba")
y_scores_ada_final = y_probas_ada_final[:, 1] 
fpr_ada_final, tpr_ada_final, thresholds_ada_final = roc_curve(y_train, y_scores_ada_final)

print ("AUC: ", auc(fpr_ada_final, tpr_ada_final))

In [None]:
predictions = ada_final_clf.predict_proba(testing_df)[:, 1]
submit = load_credit_data ("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('ada_features_final_dt.csv', index = False)

### Ensembles of 'Best'

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

# To make this notebook's output stable across runs
np.random.seed(123)

b_clf1 = RandomForestClassifier(n_estimators=300)
b_clf2 = LogisticRegression(C=10, penalty='l1')
b_clf3 = DecisionTreeClassifier (max_depth=1, criterion='gini')
b_clf4 = SGDClassifier (alpha=0.1, loss='log', penalty='l2')
b_clf5 = GaussianProcessClassifier () # Only 'new' one
b_eclf = VotingClassifier(estimators=[('rfor', b_clf1), ('logreg', b_clf2), ('dt', b_clf3),('sgd', b_clf4),('gpc', b_clf5)], weights=[1,1,1,1,1], voting='soft')
b_eclf = b_eclf.fit(X_tr, y_train) 

In [None]:
from sklearn.model_selection import cross_val_predict

y_probas_bvote_final = cross_val_predict(b_eclf, X_tr, y_train, cv=3, method="predict_proba")
y_scores_bvote_final = y_probas_bvote_final[:, 1] 
fpr_bvote_final, tpr_bvote_final, thresholds_bvote_final = roc_curve(y_train, y_scores_bvote_final)

print ("AUC: ", auc(fpr_bvote_final, tpr_bvote_final))

In [None]:
predictions = b_eclf.predict_proba(X_te)[:, 1]
submit = load_credit_data ("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('best_voting_classifier_features_final.csv', index = False)

### Ensemble of Ensembles

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier

# conda install py-xgboost
import xgboost as xgb

# classifier from xgboost
clf1 = AdaBoostClassifier(n_estimators=300)
clf2 = ExtraTreesClassifier(n_estimators=300, n_jobs=-1, criterion='gini',max_depth=5)
clf3 = xgb.XGBClassifier(n_estimators=300, nthread=-1, max_depth = 5, seed=1234)
clf4 = GradientBoostingClassifier(n_estimators=300)
eclf = VotingClassifier(estimators=[('ab', clf1), ('etc', clf2), ('xgb', clf3),('gbc', clf4)], weights=[1,1,1,1], voting='soft')
eclf = eclf.fit(top_training_df, y_train)

In [None]:
from sklearn.model_selection import cross_val_predict

y_probas_vote_final = cross_val_predict(eclf, top_training_df, y_train, cv=3, method="predict_proba")
y_scores_vote_final = y_probas_vote_final[:, 1] 
fpr_vote_final, tpr_vote_final, thresholds_vote_final = roc_curve(y_train, y_scores_vote_final)

print ("AUC: ", auc(fpr_vote_final, tpr_vote_final))

In [None]:
predictions = eclf.predict_proba(top_testing_df)[:, 1]
submit = load_credit_data ("submit_labels.csv")
submit['TARGET'] = predictions
submit.to_csv('voting_classifier_ensem_features_final.csv', index = False)

### Plot  curves

In [7]:
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr_final_forest, tpr_final_forest, "Random Forest")
plot_roc_curve(fpr_log_final, tpr_log_final, "Log Reg")
plot_roc_curve(fpr_sgd_final, tpr_sgd_final, "SGD")
plot_roc_curve(fpr_gbc_final, tpr_gbc_final, "GBC")
plot_roc_curve(fpr_ada_final, tpr_ada_final, "AdaBoost")
plot_roc_curve(fpr_bvote_final, tpr_bvote_final, "Ensemble - Best")
plot_roc_curve(fpr_vote_final, tpr_vote_final, "Ensemble - Ensemble")
plt.legend(loc="lower right", fontsize=16)
plt.title("Comparing Models")
plt.show()

NameError: name 'fpr_final_forest' is not defined

<matplotlib.figure.Figure at 0x1a0a0262b0>

### Submission scores:

   1. Random Forest 
   2. Logistic Regression  
   3. SGD
   4. GBC
   5. AdaBoost Decision Tree.
   6. Ensemble of ensembles (voting=soft).
   7. Ensemble of 'best' (voting=soft).