In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
y2015 = pd.read_csv(
    'LoanStats3d.csv',
    skipinitialspace=True,
    header=1
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Convert ID and Interest Rate to numeric.
y2015['id'] = pd.to_numeric(y2015['id'], errors='coerce')
y2015['int_rate'] = pd.to_numeric(y2015['int_rate'].str.strip('%'), errors='coerce')

# Drop other columns with many unique variables
y2015.drop(['url', 'emp_title', 'zip_code', 'earliest_cr_line', 'revol_util',
            'sub_grade', 'addr_state', 'desc', 'policy_code', 'member_id', 'id'], 1, inplace=True)

In [4]:
y2015 = y2015[:-2]

#### Assignment starts here

Goal is to remove as much data as possible while still staying above an average of .9 for a 10-fold cross-validation.

In [5]:
# Grab the continuous variables from the original DataFrame

cont_y2015 = y2015.select_dtypes(include=['float64'])
cat_y2015 = y2015.select_dtypes(exclude=['float64'])

#### Preparing the continuous data

In [6]:
# Figure out where what's going on with those NaNs

y2015_na = cont_y2015.isnull().sum(axis = 0)
y2015_na.sort_values(ascending=False)

dti_joint                         420586
annual_inc_joint                  420584
il_util                           402478
mths_since_rcnt_il                400285
inq_last_12m                      399723
open_acc_6m                       399723
open_il_6m                        399723
open_il_12m                       399723
open_il_24m                       399723
total_bal_il                      399723
open_rv_12m                       399723
open_rv_24m                       399723
all_util                          399723
inq_fi                            399723
total_cu_tl                       399723
max_bal_bc                        399723
mths_since_last_record            346680
mths_since_recent_bc_dlq          312495
mths_since_last_major_derog       298366
mths_since_recent_revol_delinq    269358
mths_since_last_delinq            203962
mths_since_recent_inq              44599
num_tl_120dpd_2m                   19230
mo_sin_old_il_acct                 12254
percent_bc_gt_75

In [7]:
# Drop the columns with loads of NaNs. Caution, some NaNs may be meaningful predictors.
cont_y2015 = cont_y2015.dropna(thresh=len(cont_y2015) - 100000, axis=1)

In [8]:
# Remaining NaNs were negligible. Dropped 'em.
cont_y2015.dropna(inplace=True)

Before I rescale, I should get the corresponding rows from the categorical and outcome variables saved in a dataframe
then after I rescale the continuous and get dummies for the categorical, I can rejoin them for the model!

In [9]:
# This line of code saves the categoricl data that goes with the data that I didn't drop above.
cat_y2015_corresponding = cat_y2015.iloc[list(cont_y2015.index), :]

In [10]:
# Rescaling the continuous variables. Potential issue could be the bankruptcies. Maybe make that one categorical
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
rescaled_y2015 = scaler.fit_transform(cont_y2015)


In [11]:
# Getting rid of the columns with lower than .01 variance
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=.01)
filtered_rescaled_y2015 = selector.fit_transform(rescaled_y2015)

In [12]:
# Changing it to a dataframe so I can rejoin it to my categorical dummies in future
filtered_rescaled_y2015 = pd.DataFrame(filtered_rescaled_y2015)

#### Preparing the Categorical Data

In [13]:
# Getting the dummies because it is currently categorical, removing loan_status because that's cheating.
dummies_cat_y2015_corresponding = pd.get_dummies(cat_y2015_corresponding.drop('loan_status', 1))

In [14]:
# Resetting the indices so that I can concatenate these back together nicely.
dummies_cat_y2015_corresponding = dummies_cat_y2015_corresponding.reset_index()
filtered_rescaled_y2015 = filtered_rescaled_y2015.reset_index()

In [15]:
# Setting X and Y for the model.
X = pd.concat([dummies_cat_y2015_corresponding, filtered_rescaled_y2015], axis=1)
Y = cat_y2015_corresponding['loan_status']

In [16]:
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

rfc = ensemble.RandomForestClassifier()

cross_val_score(rfc, X, Y, cv=10)

array([0.97827537, 0.97900049, 0.97989964, 0.98068277, 0.94161156,
       0.97659105, 0.91019319, 0.97731624, 0.97862095, 0.43045282])

In [18]:
# OK, we dropped a lot of data and it is still over 90% accurate
np.mean([0.97827537, 0.97900049, 0.97989964, 0.98068277, 0.94161156,
       0.97659105, 0.91019319, 0.97731624, 0.97862095, 0.43045282])

0.9132644079999999

I dropped some data that might have been very useful. Specifically the data that says how many months it has been since last delinquency. This was because there were many NaN values. I suspect that the NaN values meant that they had never had any delinquencies. And this could be a valuable predictor. I may try it but putting in a huge value to represent never having any delinquencies.

Also, there are some pieces of continuous data that might do better as categorical data. For excample the number of bankruptcies.