In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, roc_auc_score, SCORERS 
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
import eli5
from eli5.sklearn import PermutationImportance
import numpy as np
import seaborn as sns
import pandas as pd
import category_encoders as ce
from glob import glob
from xgboost import XGBClassifier
from pdpbox.pdp import pdp_isolate, pdp_plot, pdp_interact, pdp_interact_plot
from zipfile import ZipFile
from sklearn.impute import SimpleImputer
from scipy.stats import randint, uniform

In [None]:
# Let pandas render up to 150 columns before it starts eliding them.
pd.set_option('display.max_columns', 150)

# Datasets

In [None]:
# Load the notes CSV into `current`; its column set is intersected with
# `historical` further down to decide which features the models may use.
current= pd.read_csv('primaryMarketNotes_browseNotes_1-RETAIL.csv')

In [None]:
# First sheet (index 0) of the data-dictionary workbook, kept for column look-ups.
dictionary1= pd.read_excel('LCDataDictionary.xlsx', sheet_name=0)

In [None]:
# Second sheet (index 1) of the same data-dictionary workbook.
dictionary2= pd.read_excel('LCDataDictionary.xlsx', sheet_name=1)

In [None]:
# Load the historical table; it supplies `loan_status`, the target column
# that the splits and models below are built around.
historical= pd.read_csv('historical.csv')

# Data Wrangling / Preprocessing

In [None]:
# Normalise job titles to lower case, then derive one boolean flag per keyword.
# Missing titles count as "does not contain" (na=False).
historical['emp_title'] = historical['emp_title'].str.lower()
for keyword in ('teacher', 'manager', 'owner'):
    historical[f'emp_title_{keyword}'] = historical['emp_title'].str.contains(keyword, na=False)

In [None]:
# Same job-title features as built for `historical`, so both frames share
# the emp_title_* columns.
current['emp_title'] = current['emp_title'].str.lower()
for keyword in ('teacher', 'manager', 'owner'):
    current[f'emp_title_{keyword}'] = current['emp_title'].str.contains(keyword, na=False)

In [None]:
def string_to_float(df):
    """Convert the percent-formatted ``int_rate`` and ``revol_util`` columns to floats.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``int_rate`` and ``revol_util`` columns, holding either
        strings such as ``'13.56%'`` or already-numeric values.

    Notes
    -----
    Safe to call more than once on the same frame: a column that is already
    numeric is left untouched, so re-running the notebook cell does not fail
    on the missing ``.str`` accessor. Missing values stay missing (NaN).
    """
    for column in ('int_rate', 'revol_util'):
        if pd.api.types.is_numeric_dtype(df[column]):
            # Already converted (or stored as numbers in the source file).
            continue
        # Strip surrounding whitespace first so a trailing blank after the
        # percent sign cannot hide the '%' from the second strip.
        df[column] = df[column].str.strip().str.strip('%').astype(float)

In [None]:
# Convert the percent strings of `historical` in place.
# NOTE(review): `current` is not passed through this conversion in the visible
# cells — confirm its int_rate / revol_util are already numeric before scoring it.
string_to_float(historical)

In [None]:
# Compare the two schemas: columns shared by both frames, and those unique to each.
historical_cols, current_cols = set(historical.columns), set(current.columns)
common_columns = historical_cols.intersection(current_cols)
just_historical = historical_cols.difference(current_cols)
just_current = current_cols.difference(historical_cols)

In [None]:
# Model inputs are the columns both frames share, plus the target.
# The set is sorted because iteration order over a set of strings changes from
# one interpreter session to the next, which would shuffle the column order
# (and with it the fitted models) on every kernel restart.
# 'loan_status' is filtered out first so it can never appear twice.
features = [column for column in sorted(common_columns) if column != 'loan_status']
features.append('loan_status')

In [None]:
# Restrict the historical data to the selected feature columns plus the target.
training_historical = historical.loc[:, features]

In [None]:
# Hold out 20% as the test set, keeping the loan_status proportions equal
# in both parts (stratified) and the shuffle reproducible.
train, test = train_test_split(
    training_historical,
    train_size=0.8,
    stratify=training_historical['loan_status'],
    random_state=42,
)

In [None]:
# Sanity check: row/column counts of the 80/20 train/test split.
train.shape, test.shape

In [None]:
# Carve a validation set (20%) out of the training portion, again stratified
# on the target. Despite the X_ prefix, both frames still contain 'loan_status'.
X_train, X_val = train_test_split(
    train,
    train_size=0.8,
    stratify=train['loan_status'],
    random_state=42,
)

In [None]:
# Sanity check: row/column counts of the train/validation split.
X_train.shape, X_val.shape

In [None]:
# Separate the target column from the predictors for each of the three splits.
train_target, val_target, test_target = (
    split['loan_status'] for split in (X_train, X_val, test)
)

# Every missing value, in every column, is replaced by the string 'Unknown'.
# NOTE(review): this also applies to numeric columns; presumably any numeric
# column that had NaNs becomes object-typed and is then ordinal-encoded as a
# category, losing its numeric ordering — confirm whether that is intended.
train_features, val_features, test_features = (
    split.drop(columns='loan_status').fillna('Unknown') for split in (X_train, X_val, test)
)

In [None]:
# Learn the category -> integer mapping on the training split only, then
# reuse that mapping unchanged for validation and test.
encoder = ce.OrdinalEncoder()

train_encoded = encoder.fit_transform(train_features)
val_encoded, test_encoded = (
    encoder.transform(frame) for frame in (val_features, test_features)
)

# Random Forest Model Pre Optimization

In [None]:
# Baseline random forest: library defaults, all CPU cores, fixed seed.
forest = RandomForestClassifier(n_jobs=-1, random_state=42)

In [None]:
# Fit the baseline forest on the encoded training split.
forest.fit(X=train_encoded, y=train_target)

# Scores for Random Forest

In [None]:
# Column 1 of predict_proba is the probability of the second class in
# forest.classes_; ROC AUC is computed from that score on the validation split.
val_pred_proba = forest.predict_proba(val_encoded)[:, 1]
val_roc_auc = roc_auc_score(y_true=val_target, y_score=val_pred_proba)
print(f'Val ROC AUC: {val_roc_auc}')

In [None]:
# Same ROC AUC measurement for the baseline forest, on the held-out test split.
test_pred_proba = forest.predict_proba(test_encoded)[:, 1]
test_roc_auc = roc_auc_score(y_true=test_target, y_score=test_pred_proba)
print(f'Test ROC AUC: {test_roc_auc}')

In [None]:
# Hard class predictions of the baseline forest on the validation split.
# (They were never computed before, so the metric call raised NameError.)
val_pred = forest.predict(val_encoded)

# The metric expects (y_true, y_pred); swapping them swaps precision and recall.
# Each result is an array with one entry per class.
precision, recall, _, _ = precision_recall_fscore_support(val_target, val_pred)
print(f'Precision: {precision}\nRecall: {recall}')

In [None]:
# Hard class predictions of the baseline forest on the test split.
# (They were never computed before, so the metric call raised NameError.)
test_pred = forest.predict(test_encoded)

# The metric expects (y_true, y_pred); swapping them swaps precision and recall.
# Each result is an array with one entry per class.
precision, recall, _, _ = precision_recall_fscore_support(test_target, test_pred)
print(f'Precision: {precision}\nRecall: {recall}')

# Hyper Parameter Tuning for RandomForest

In [None]:
# Search space for the random forest.
param_distributions = {
    'n_estimators': randint(100, 1000),
    'max_depth': randint(3, 10),
    'min_samples_leaf': [1, 2, 4, 6, 8]
}

# Randomised search: 2 sampled candidates, each scored by 10-fold CV ROC AUC.
# random_state pins which candidates are sampled; without it every run draws
# different hyperparameters and the reported "best" cannot be reproduced.
search = RandomizedSearchCV(
    forest,
    param_distributions= param_distributions,
    n_iter=2,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42
)

search.fit(train_encoded, train_target)

In [None]:
# Report the winning candidate and its mean cross-validated ROC AUC.
for label, value in (
    ('Best hyperparameters', search.best_params_),
    ('Best roc auc', search.best_score_),
):
    print(label, value)

# Random Forest Post Parameter Optimization

In [None]:
# Random forest with tuned settings. The values are hard-coded here,
# presumably copied from an earlier search run — TODO confirm they still
# match search.best_params_.
hyper_forest = RandomForestClassifier(
    n_estimators=113,
    max_depth=7,
    min_samples_leaf=8,
    random_state=42,
)

In [None]:
# Fit the tuned forest on the encoded training split.
hyper_forest.fit(X=train_encoded, y=train_target)

# Random Forest scores Post Parameter Optimization

In [None]:
# Validation ROC AUC of the tuned forest (score = probability of class index 1).
val_pred_proba = hyper_forest.predict_proba(val_encoded)[:, 1]
val_roc_auc = roc_auc_score(y_true=val_target, y_score=val_pred_proba)
print(f'Val ROC AUC: {val_roc_auc}')

In [None]:
# Test ROC AUC of the tuned forest (score = probability of class index 1).
test_pred_proba = hyper_forest.predict_proba(test_encoded)[:, 1]
test_roc_auc = roc_auc_score(y_true=test_target, y_score=test_pred_proba)
print(f'Test ROC AUC: {test_roc_auc}')

In [None]:
# Hard class predictions of the tuned forest on the validation split.
# (They were never computed before, so the metric call raised NameError.)
val_pred = hyper_forest.predict(val_encoded)

# The metric expects (y_true, y_pred); results are per-class arrays.
precision, recall, _, _ = precision_recall_fscore_support(val_target, val_pred)

print(f'Precision: {precision}\nRecall: {recall}')

In [None]:
# Hard class predictions of the tuned forest on the test split.
# (They were never computed before, so the metric call raised NameError.)
test_pred = hyper_forest.predict(test_encoded)

# The metric expects (y_true, y_pred); results are per-class arrays.
precision, recall, _, _ = precision_recall_fscore_support(test_target, test_pred)

print(f'Precision: {precision}\nRecall: {recall}')

# XGB model Pre Optimization

In [None]:
# Baseline gradient-boosted model: library defaults, all CPU cores, fixed seed.
booster = XGBClassifier(n_jobs=-1, random_state=42)

In [None]:
# Fit the baseline XGBoost model on the encoded training split.
booster.fit(X=train_encoded, y=train_target)

# Scores for XGB

In [None]:
# Validation ROC AUC of the baseline XGBoost model (score = probability of class index 1).
val_pred_proba = booster.predict_proba(val_encoded)[:, 1]
val_roc_auc = roc_auc_score(y_true=val_target, y_score=val_pred_proba)
print(f'Val ROC AUC: {val_roc_auc}')

In [None]:
# Test ROC AUC of the baseline XGBoost model (score = probability of class index 1).
test_pred_proba = booster.predict_proba(test_encoded)[:, 1]
test_roc_auc = roc_auc_score(y_true=test_target, y_score=test_pred_proba)
print(f'Test ROC AUC: {test_roc_auc}')

In [None]:
# Hard class predictions of the baseline XGBoost model on the validation split.
# (They were never computed before, so the metric call raised NameError.)
val_pred = booster.predict(val_encoded)

# The metric expects (y_true, y_pred); results are per-class arrays.
precision, recall, _, _ = precision_recall_fscore_support(val_target, val_pred)

print(f'Precision: {precision}\nRecall: {recall}')

In [None]:
# Hard class predictions of the baseline XGBoost model on the test split.
# (They were never computed before, so the metric call raised NameError.)
test_pred = booster.predict(test_encoded)

# The metric expects (y_true, y_pred); results are per-class arrays.
precision, recall, _, _ = precision_recall_fscore_support(test_target, test_pred)

print(f'Precision: {precision}\nRecall: {recall}')

# Hyper Parameter Tuning for XGB

In [None]:
# Search space for XGBoost. This rebinds `param_distributions` and `search`
# from the random-forest search above.
# NOTE(review): max_depth presumably has no effect when booster='gblinear' — confirm.
param_distributions = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(3, 8),
    'booster' : ['gbtree', 'gblinear', 'dart']
}

# Randomised search: 3 sampled candidates, each scored by 10-fold CV ROC AUC.
# random_state pins which candidates are sampled so the result is repeatable.
search = RandomizedSearchCV(
    booster,
    param_distributions= param_distributions,
    n_iter=3,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
    random_state=42
)

search.fit(train_encoded, train_target)

In [None]:
# Report the winning XGBoost candidate and its mean cross-validated ROC AUC.
for label, value in (
    ('Best hyperparameters', search.best_params_),
    ('Best roc auc', search.best_score_),
):
    print(label, value)

# XGB Model Post Hyper Parameter Optimization

In [None]:
# XGBoost with tuned settings. The values are hard-coded here, presumably
# copied from an earlier search run — TODO confirm against search.best_params_.
# random_state and n_jobs mirror the baseline `booster` so the two models are
# compared under the same seed.
hyper_booster = XGBClassifier(
    booster='dart',
    max_depth=7,
    n_estimators=261,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Fit the tuned XGBoost model on the encoded training split.
hyper_booster.fit(X=train_encoded, y=train_target)

# Scores for XGB Model Post Hyper Parameter Optimization

In [None]:
# Validation ROC AUC of the tuned XGBoost model (score = probability of class index 1).
val_pred_proba = hyper_booster.predict_proba(val_encoded)[:, 1]
val_roc_auc = roc_auc_score(y_true=val_target, y_score=val_pred_proba)
print(f'Val ROC AUC: {val_roc_auc}')

In [None]:
# Test ROC AUC of the tuned XGBoost model. This scores `hyper_booster` on the
# full encoded test split: the baseline `booster` is not the model under
# evaluation in this section, and the reduced `test_final` frame is only
# created later in the notebook.
test_pred_proba = hyper_booster.predict_proba(test_encoded)[:, 1]

test_roc_auc = roc_auc_score(test_target, test_pred_proba)

print(f'Test ROC AUC: {test_roc_auc}')

# Permutations to find feature weights

In [None]:
# Permutation importance of the already-fitted baseline booster (cv='prefit'):
# each feature is shuffled 3 times on the validation split.
permuter = PermutationImportance(
    booster,
    n_iter=3,
    cv='prefit',
    random_state=42,
)

permuter.fit(val_encoded, val_target)

In [None]:
# Show the permutation weights for every feature (top=None disables truncation),
# labelled with the encoded column names.
feature_names = list(val_encoded.columns)

eli5.show_weights(permuter, feature_names=feature_names, top=None)

In [None]:
print('Shapes before removing features:', train_encoded.shape, val_encoded.shape, test_encoded.shape)

# Keep only the columns whose permutation importance is strictly positive.
# Note that this rebinds `features` from the earlier column list to a pandas Index.
mask = permuter.feature_importances_ > 0
features = train_encoded.columns[mask]

# Reduced copies of all three splits, restricted to the surviving columns.
train_final, val_final, test_final = (
    frame[features] for frame in (train_encoded, val_encoded, test_encoded)
)

print('Shapes after removing features:', train_final.shape, val_final.shape, test_final.shape)

# XGB model Pre Hyper Parameter Optimization, Post Feature Reduction

In [None]:
# Refit the same `booster` object on the reduced feature set; from here on it
# is scored on the *_final frames.
booster.fit(X=train_final, y=train_target)

# XGB Scores Post Feature Reduction

In [None]:
# Validation ROC AUC of the booster refitted on the reduced feature set.
val_pred_proba = booster.predict_proba(val_final)[:, 1]
val_roc_auc = roc_auc_score(y_true=val_target, y_score=val_pred_proba)
print(f'Val ROC AUC: {val_roc_auc}')

In [None]:
# Test ROC AUC of the booster refitted on the reduced feature set.
test_pred_proba = booster.predict_proba(test_final)[:, 1]
test_roc_auc = roc_auc_score(y_true=test_target, y_score=test_pred_proba)
print(f'Test ROC AUC: {test_roc_auc}')