In [50]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
import eli5
from eli5.sklearn import PermutationImportance
import numpy as np
import seaborn as sns
import pandas as pd
import category_encoders as ce
from glob import glob
from xgboost import XGBClassifier
from pdpbox.pdp import pdp_isolate, pdp_plot, pdp_interact, pdp_interact_plot
from zipfile import ZipFile
from sklearn.impute import SimpleImputer
from scipy.stats import randint, uniform

In [2]:
pd.options.display.max_columns= 150

In [3]:
current= pd.read_csv('primaryMarketNotes_browseNotes_1-RETAIL.csv')

In [4]:
dictionary1= pd.read_excel('LCDataDictionary.xlsx', sheet_name=0)

In [5]:
dictionary2= pd.read_excel('LCDataDictionary.xlsx', sheet_name=1)

In [None]:
files= ['LoanStats3a.csv',
        'LoanStats3b.csv',
        'LoanStats3c.csv',
        'LoanStats3d.csv',
        'LoanStats_2016Q1.csv',
        'LoanStats_2016Q2.csv',
        'LoanStats_2016Q3.csv',
        'LoanStats_2017Q4.csv',
        'LoanStats_2018Q1.csv',
        'LoanStats_2018Q2.csv',
        'LoanStats_2018Q3.csv',
        'LoanStats_2018Q4.csv',
        'LoanStats_2019Q1.csv']

In [6]:
def a_to_d_done(df):
    a_to_d = df['grade'].isin(['A', 'B', 'C', 'D'])
    done = df['loan_status'].isin(['Fully Paid', 'Charged Off'])
    return a_to_d & done

In [7]:
dfs = (pd.read_csv(file, skiprows=1, skipfooter=2, engine='python')
       .where(a_to_d_done)
       .dropna(subset=['grade', 'loan_status'])
       for file in files)

NameError: name 'files' is not defined

In [8]:
df= pd.concat(dfs)

NameError: name 'dfs' is not defined

In [None]:
historical, _ = train_test_split(df.copy(), train_size=0.10, 
                             stratify=df['loan_status'])

In [None]:
historical.to_csv('historical.csv', index=False)

In [9]:
historical= pd.read_csv('historical.csv')

In [10]:
common_columns = set(historical.columns) & set(current.columns)
just_historical = set(historical.columns) - set(current.columns)
just_current = set(current.columns) - set(historical.columns)

In [11]:
common_columns_with_labels= []
for column in common_columns:
    common_columns_with_labels.append(column)

In [12]:
common_columns_with_labels.append('loan_status')

In [13]:
training_historical= historical[common_columns_with_labels]

In [14]:
train, test= train_test_split(training_historical, train_size= .8, stratify= historical['loan_status'], random_state=42)

In [15]:
train.shape, test.shape

((87412, 103), (21854, 103))

In [16]:
train, val= train_test_split(train, train_size= .8, stratify= train['loan_status'], random_state=42)

In [17]:
train.shape, val.shape

((69929, 103), (17483, 103))

In [18]:
train_target= train['loan_status']
val_target= val['loan_status']
test_target= test['loan_status']

train_features= train.drop(columns='loan_status').fillna('Unknown')
val_features= val.drop(columns='loan_status').fillna('Unknown')
test_features=  test.drop(columns='loan_status').fillna('Unknown')

In [19]:
encoder= ce.OrdinalEncoder()

train_encoded= encoder.fit_transform(train_features)
val_encoded= encoder.transform(val_features)
test_encoded= encoder.transform(test_features)

In [24]:
model= RandomForestClassifier(random_state=42, n_jobs=-1)

In [25]:
model.fit(train_encoded, train_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [26]:
val_pred= model.predict(val_encoded)

In [27]:
accuracy_score(val_target, val_pred)

0.7729222673454212

In [28]:
test_pred= model.predict(test_encoded)

In [29]:
accuracy_score(test_pred, test_target)

0.7762423354992221

In [30]:
precision, recall, _, _= precision_recall_fscore_support(val_pred, val_target)
print(f'Precision: {precision}\nRecall: {recall}')

Precision: [0.17262513 0.89412897]
Recall: [0.24767953 0.8425758 ]


In [31]:
precision, recall, _, _= precision_recall_fscore_support(test_pred, test_target)
print(f'Precision: {precision}\nRecall: {recall}')

Precision: [0.17052574 0.8985316 ]
Recall: [0.25333873 0.84290358]


In [32]:
permuter = PermutationImportance(model,cv='prefit', n_iter=3, random_state=42)
permuter.fit(val_encoded, val_target)

PermutationImportance(cv='prefit',
           estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
           n_iter=3, random_state=42, refit=True, scoring=None)

In [33]:
feature_names = val_encoded.columns.tolist()

eli5.show_weights(permuter, top=None, feature_names=feature_names)

Weight,Feature
0.0069  ± 0.0025,home_ownership
0.0068  ± 0.0019,term
0.0065  ± 0.0017,installment
0.0060  ± 0.0035,mort_acc
0.0049  ± 0.0024,loan_amnt
0.0045  ± 0.0004,all_util
0.0032  ± 0.0022,funded_amnt
0.0024  ± 0.0018,grade
0.0023  ± 0.0016,sub_grade
0.0022  ± 0.0018,revol_bal


In [34]:
print('Shapes before removing features:', train_encoded.shape, val_encoded.shape, test_encoded.shape)
 
mask = permuter.feature_importances_ > 0
#grabbing features using bools
features = train_encoded.columns[mask]

#dataframes with only weighted features
train_final = train_encoded[features]
val_final = val_encoded[features]
test_final= test_encoded[features]

print('Shapes after removing features:', train_final.shape, val_final.shape, test_final.shape)

Shapes before removing features: (69929, 102) (17483, 102) (21854, 102)
Shapes after removing features: (69929, 102) (17483, 102) (21854, 102)


In [39]:
model.fit(train_final, train_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [41]:
val_pred= model.predict(val_final)

In [42]:
test_pred= model.predict(test_final)

In [44]:
accuracy_score(val_pred, val_target), accuracy_score(test_pred, test_target)

(0.7828175942343991, 0.7847533632286996)