#### Continuation from previous notebook

1. More feature engineering on merged dataset
2. Run model tuning on:
   1. Random Forest (tune max_features)
   2. KNN
   3. SVC 
   4. Decision Tree
   5. Ensemble model
3. If time permits, attempt a neural network.

In [1]:
## IMPORTS ##

# numpy and pandas for data manipulation
import numpy as np
import pandas as pd 

# sklearn preprocessing for dealing with categorical variables
from sklearn.preprocessing import LabelEncoder

# File system management
import os

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

# matplotlib for plotting
import matplotlib.pyplot as plt

# garbage collector
import gc

In [2]:
# Load data fcn
def load_credit_data(data_path, data_dir="data"):
    """Read one CSV file from the data directory into a DataFrame.

    Parameters
    ----------
    data_path : str
        File name (or path relative to ``data_dir``) of the CSV to read.
    data_dir : str, optional
        Directory containing the CSV files. Defaults to ``"data"``, which is
        resolved relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    csv_path = os.path.join(data_dir, data_path)
    return pd.read_csv(csv_path)

In [3]:
# Read the merged, preprocessed training features and report (rows, columns)
training_df = load_credit_data("training_merged_preprocessed.csv")
print(training_df.shape)

(307511, 509)


In [4]:
# Read the merged, preprocessed test features and report (rows, columns)
testing_df = load_credit_data("testing_merged_preprocessed.csv")
print(testing_df.shape)

(48744, 509)


In [36]:
# Read the label file (it provides the TARGET column used as y_train below),
# report its (rows, columns) and preview the first rows
labels_df = load_credit_data("y_labels.csv")
print(labels_df.shape)
labels_df.head(n=5)

(307511, 1)


Unnamed: 0,TARGET
0,1
1,0
2,0
3,0
4,0


In [38]:
# Look at just important features
y_train = labels_df['TARGET'].copy()

# Single source of truth for the selected columns, so the train and test
# matrices are always built from the same features in the same order.
IMPORTANT_FEATURES = [
    'EXT_SOURCE_2',
    'EXT_SOURCE_3',
    'DAYS_BIRTH',
    'EXT_SOURCE_1',
    'DAYS_ID_PUBLISH',
    'DAYS_REGISTRATION',
    'df_avg_bureau_full_DAYS_CREDIT',
    'df_avg_bureau_full_DAYS_CREDIT_ENDDATE',
    'AMT_PAYMENT_df_avg_install',
    'AMT_ANNUITY',
    'DAYS_EMPLOYED',
    'df_avg_bureau_full_DAYS_CREDIT_UPDATE',
    'df_avg_pos_cash_CNT_INSTALMENT_FUTURE',
    'AMT_INSTALMENT_df_avg_install',
    'DAYS_LAST_PHONE_CHANGE',
    'AMT_CREDIT',
    'DAYS_ENTRY_PAYMENT_df_avg_install',
    'DAYS_INSTALMENT_df_avg_install',
    'df_avg_previous_app_DAYS_FIRST_DUE',
    'df_avg_previous_app_DAYS_DECISION',
    'df_avg_previous_app_HOUR_APPR_PROCESS_START',
    'df_avg_previous_app_AMT_ANNUITY',
    'df_avg_previous_app_AMT_CREDIT',
    'df_avg_previous_app_AMT_GOODS_PRICE',
    'df_avg_previous_app_AMT_APPLICATION',
    'df_avg_previous_app_SELLERPLACE_AREA',
    'REGION_POPULATION_RELATIVE',
    'df_avg_previous_app_DAYS_LAST_DUE_1ST_VERSION',
    'df_avg_bureau_full_AMT_CREDIT_SUM',
    'AMT_INCOME_TOTAL',
]

X_train_important = training_df[IMPORTANT_FEATURES]
X_test_important = testing_df[IMPORTANT_FEATURES]

In [7]:
# Preview the first five rows of the selected training features
X_train_important.head(n=5)

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH,EXT_SOURCE_1,DAYS_ID_PUBLISH,DAYS_REGISTRATION,df_avg_bureau_full_DAYS_CREDIT,df_avg_bureau_full_DAYS_CREDIT_ENDDATE,AMT_PAYMENT_df_avg_install,AMT_ANNUITY,...,df_avg_previous_app_HOUR_APPR_PROCESS_START,df_avg_previous_app_AMT_ANNUITY,df_avg_previous_app_AMT_CREDIT,df_avg_previous_app_AMT_GOODS_PRICE,df_avg_previous_app_AMT_APPLICATION,df_avg_previous_app_SELLERPLACE_AREA,REGION_POPULATION_RELATIVE,df_avg_previous_app_DAYS_LAST_DUE_1ST_VERSION,df_avg_bureau_full_AMT_CREDIT_SUM,AMT_INCOME_TOTAL
0,0.262949,0.139376,-9461,0.083037,-2120,-3648.0,-874.0,-344.25,11559.247105,24700.5,...,9.0,9251.775,179055.0,179055.0,179055.0,500.0,0.018801,125.0,108131.945625,202500.0
1,0.622246,0.535276,-16765,0.311267,-291,-1186.0,-1400.75,-544.5,64754.586,35698.5,...,14.666667,56553.99,484191.0,435436.5,435436.5,533.0,0.003541,-1004.333333,254350.125,270000.0
2,0.555912,0.729567,-19046,0.505998,-2531,-4260.0,-867.0,-488.5,7096.155,6750.0,...,5.0,5357.25,20106.0,24282.0,24282.0,30.0,0.010032,-694.0,94518.9,67500.0
3,0.650442,0.535276,-19005,0.505998,-2437,-9833.0,0.0,0.0,62947.088438,29686.5,...,14.666667,19517.45,291695.5,309643.26,272203.26,894.222222,0.008019,40503.444444,0.0,135000.0
4,0.322738,0.535276,-19932,0.505998,-3458,-4311.0,-1149.0,-783.0,12214.060227,21865.5,...,12.333333,12278.805,166638.75,150530.25,150530.25,409.166667,0.028663,-757.833333,146250.0,121500.0


In [8]:
# Preview the first five rows of the selected test features
X_test_important.head(n=5)

Unnamed: 0,EXT_SOURCE_2,EXT_SOURCE_3,DAYS_BIRTH,EXT_SOURCE_1,DAYS_ID_PUBLISH,DAYS_REGISTRATION,df_avg_bureau_full_DAYS_CREDIT,df_avg_bureau_full_DAYS_CREDIT_ENDDATE,AMT_PAYMENT_df_avg_install,AMT_ANNUITY,...,df_avg_previous_app_HOUR_APPR_PROCESS_START,df_avg_previous_app_AMT_ANNUITY,df_avg_previous_app_AMT_CREDIT,df_avg_previous_app_AMT_GOODS_PRICE,df_avg_previous_app_AMT_APPLICATION,df_avg_previous_app_SELLERPLACE_AREA,REGION_POPULATION_RELATIVE,df_avg_previous_app_DAYS_LAST_DUE_1ST_VERSION,df_avg_bureau_full_AMT_CREDIT_SUM,AMT_INCOME_TOTAL
0,0.789654,0.15952,-19241,0.752614,-812,-5170.0,-735.0,82.428571,5885.132143,20560.5,...,13.0,3951.0,23787.0,24835.5,24835.5,23.0,0.01885,-1499.0,207623.571429,135000.0
1,0.291656,0.432962,-18064,0.56499,-1623,-9118.0,-190.666667,439.333333,6240.205,17370.0,...,10.5,8031.6,20076.75,78468.75,22308.75,18.0,0.035792,-368.5,219042.0,99000.0
2,0.699787,0.610991,-20038,0.506771,-3503,-2175.0,-1737.5,-1068.0,9740.235774,69777.0,...,14.5,11421.14625,146134.125,158951.25,130871.25,82.0,0.019101,-477.0,518070.015,202500.0
3,0.509677,0.612704,-13976,0.525734,-4208,-2000.0,-1401.75,1934.75,4356.731549,49018.5,...,10.8,9354.951,92920.5,94135.5,49207.5,1409.6,0.026392,72588.4,126739.59,315000.0
4,0.425687,0.519097,-13040,0.202145,-4262,-4000.0,0.0,0.0,11100.3375,32067.0,...,5.5,17782.155,300550.5,267727.5,267727.5,13.0,0.010032,-409.0,0.0,180000.0


In [24]:
# Expand the top 30 important features with degree-2 polynomial terms.
# NOTE: an earlier version of this comment said degree 3; the code uses
# degree 2, and the comment now matches the code.
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
# Fit on the training features only, then apply the same expansion to both sets
poly.fit(X_train_important)

X_tr = poly.transform(X_train_important)
X_te = poly.transform(X_test_important)

In [27]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the expanded training matrix only
scaler = StandardScaler().fit(X_tr)

# Apply those training-set statistics to both the training and test matrices
X_tr = scaler.transform(X_tr)
X_te = scaler.transform(X_te)

In [None]:
# Random forest (similar to before, but also tune max_features)
# Grid-search n_estimators x max_features with 5-fold CV, scored by ROC AUC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

model_rf = RandomForestClassifier(random_state=123)
# 'sqrt' is used instead of the legacy 'auto' alias: for classifiers 'auto'
# meant sqrt(n_features), and newer scikit-learn releases reject 'auto'.
# Warnings are silenced in this notebook, so the deprecation would go unnoticed.
param_grid = {'n_estimators': [150, 200, 250], 'max_features': ['sqrt', 'log2', None]}

grid_search_train_rf = GridSearchCV(estimator=model_rf, param_grid=param_grid, cv=5, scoring='roc_auc')

grid_search_train_rf.fit(X_tr, y_train)

# Best hyper-parameter combination found by the grid search
print(grid_search_train_rf.best_params_)
print("------------")

# Mean cross-validated score for every combination that was tried
cvres = grid_search_train_rf.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)

In [None]:
# ROC curve
from sklearn.metrics import roc_curve

def plot_roc_curve(fpr, tpr, label=None):
    """Draw one ROC curve on the current matplotlib axes.

    Plots ``tpr`` against ``fpr`` with the given legend ``label``, adds the
    dashed diagonal from (0, 0) to (1, 1), fixes both axes to [0, 1] and
    labels them. The caller is responsible for creating the figure and for
    calling ``plt.legend`` / ``plt.show``.
    """
    ax = plt.gca()
    ax.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed black diagonal for visual reference
    ax.plot([0, 1], [0, 1], 'k--')
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel('False Positive Rate', fontsize=16)
    ax.set_ylabel('True Positive Rate', fontsize=16)

In [None]:
# Take the best random forest from the grid search and compute its
# cross-validated class probabilities, then the ROC curve on the training labels
from sklearn.model_selection import cross_val_predict

forest_train_clf = grid_search_train_rf.best_estimator_
y_probas_forest = cross_val_predict(
    forest_train_clf,
    X_tr,
    y_train,
    cv=5,
    method="predict_proba",
)
# Second probability column is used as the score
y_scores_forest = y_probas_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train, y_scores_forest)

In [None]:
# K-nearest neighbours: grid-search n_neighbors with 5-fold CV, scored by ROC AUC
from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier()
param_grid = {'n_neighbors': [20, 50, 100, 150, 200]}

grid_search_knn = GridSearchCV(
    estimator=model_knn,
    param_grid=param_grid,
    cv=5,
    scoring='roc_auc',
)
grid_search_knn.fit(X_tr, y_train)

# Best n_neighbors found by the grid search
print(grid_search_knn.best_params_)
print("------------")

# Mean cross-validated score for each candidate value
cvres = grid_search_knn.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(mean_score, params)

In [None]:
# Take the best KNN model from the grid search and compute its
# cross-validated class probabilities, then the ROC curve on the training labels
knn_clf = grid_search_knn.best_estimator_
y_probas_knn = cross_val_predict(
    knn_clf,
    X_tr,
    y_train,
    cv=5,
    method="predict_proba",
)
# Second probability column is used as the score
y_scores_knn = y_probas_knn[:, 1]
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_train, y_scores_knn)

In [None]:
# Overlay the ROC curves of the best random forest and the best KNN model
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plot_roc_curve(fpr_knn, tpr_knn, "KNN")
plt.legend(loc="lower right", fontsize=16)
# Title previously started with a stray apostrophe
plt.title("(BEST) Random Forest vs. KNN")
plt.show()

In [None]:
# SVC

In [None]:
# Look at adding new features 

In [None]:
# Decision Tree

In [None]:
# Ensemble model

###  Other things to consider
1. Look at dropping columns with NULL > threshold
2. Feature engineering:
    1. 
    2.
    3.
3. Modelling:
    1.
    2.
    3.
4.     