In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

## Import other dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
## Looks like the resources folder already contains the data so the GenerateData
## Is not necessary. Good thing because it doesn't work as is.
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
## Looking at our dataFrames to get an idea of what the columns contain
train_df["loan_status"].unique()

array(['low_risk', 'high_risk'], dtype=object)

In [4]:
test_df["loan_status"].unique()

array(['low_risk', 'high_risk'], dtype=object)

In [5]:
## Before we split our dummies, we're going to make our low & high risk = 0 & 1 respectively
train_df["loan_status"] = train_df["loan_status"].map({'high_risk': 1, 'low_risk': 0})
test_df["loan_status"] = test_df["loan_status"].map({'high_risk': 1, 'low_risk': 0})

In [6]:
# Convert categorical data to numeric and separate target feature for training data
dummyTrain = pd.get_dummies(train_df)
dummyTrain.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,57107,57107,13375.0,0.1797,483.34,223000.0,0,29.99,0.0,0.0,...,0,1,0,1,1,0,1,0,1,0
1,141451,141451,21000.0,0.1308,478.68,123000.0,0,11.26,2.0,0.0,...,0,1,0,1,1,0,1,0,1,0
2,321143,321143,20000.0,0.124,448.95,197000.0,0,11.28,0.0,0.0,...,0,1,0,1,1,0,1,0,1,0
3,11778,11778,3000.0,0.124,100.22,45000.0,0,18.08,0.0,0.0,...,0,1,0,1,1,0,1,0,1,0
4,169382,169382,30000.0,0.1612,1056.49,133000.0,0,27.77,0.0,2.0,...,0,1,0,1,1,0,1,0,1,0


In [7]:
# Convert categorical data to numeric and separate target feature for testing data
dummyTest = pd.get_dummies(test_df)
dummyTest.head()

Unnamed: 0.1,Unnamed: 0,index,loan_amnt,int_rate,installment,annual_inc,loan_status,dti,delinq_2yrs,inq_last_6mths,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,67991,67991,40000.0,0.0819,814.7,140000.0,0,19.75,0.0,1.0,...,0,0,1,0,1,1,0,1,0,1
1,25429,25429,6000.0,0.1524,208.7,55000.0,0,11.52,2.0,0.0,...,0,0,1,0,1,1,0,1,0,1
2,38496,38496,3600.0,0.1695,128.27,42000.0,0,6.74,0.0,0.0,...,0,0,1,0,1,1,0,1,0,1
3,19667,19667,20000.0,0.1524,478.33,100000.0,0,12.13,0.0,2.0,...,0,0,1,0,1,1,0,1,0,1
4,37505,37505,3600.0,0.124,120.27,50000.0,0,16.08,0.0,3.0,...,0,0,1,0,1,1,0,1,0,1


## Find the missing column

In [8]:
## We know the train has one more column than the test so let's find it.
trainColumns = dummyTrain.columns
testColumns = dummyTest.columns
for i in trainColumns:
    if i not in testColumns:
        print(i)
    else:
        pass
## Here we find that the train column has an additional debt_settlement_flag_Y column.
## In order to not skew our data and to make it work, it may be worth dropping this
## as well as dropping the N version. If it is dropped later it is because it made
## The model better

debt_settlement_flag_Y


In [9]:
# add missing dummy variables to testing set
## Instead of adding the missing dummy variable, we're going to drop the additional variable
dummyTrain = dummyTrain.drop('debt_settlement_flag_Y', axis = 1)
dummyTrain.columns

Index(['Unnamed: 0', 'index', 'loan_amnt', 'int_rate', 'installment',
       'annual_inc', 'loan_status', 'dti', 'delinq_2yrs', 'inq_last_6mths',
       'open_acc', 'pub_rec', 'revol_bal', 'total_acc', 'out_prncp',
       'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
       'total_rec_int', 'total_rec_late_fee', 'recoveries',
       'collection_recovery_fee', 'last_pymnt_amnt',
       'collections_12_mths_ex_med', 'policy_code', 'acc_now_delinq',
       'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il',
       'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il',
       'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
       'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m',
       'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util',
       'chargeoff_within_12_mths', 'delinq_amnt', 'mo_sin_old_il_acct',
       'mo_sin_old_rev_tl_op', 'mo_sin_rcnt_rev_tl_op', 'mo_sin_rcnt_tl',
       'mort_acc', 

In [12]:
## set my X and y train and test
X_train = dummyTrain.drop('loan_status', axis = 1)
y_train = dummyTrain['loan_status']
X_test = dummyTest.drop('loan_status', axis = 1)
y_test = dummyTest['loan_status']

### I don't think we'll need to do the train_test_split with this since its two separate documents

In [13]:
# Train the Logistic Regression model on the unscaled data and print the model score
## Instantiate
logisticModel = LogisticRegression()
## Fit
logisticModel.fit(X_train, y_train)
## Score
logisticModel.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5253083794130158

In [14]:
# Train a Random Forest Classifier model and print the model score
## Instantiate
randomForestModel = RandomForestClassifier(random_state=1)
## fit
randomForestModel.fit(X_train, y_train)
## Score
randomForestModel.score(X_test, y_test)

0.5980433857932794

### Looks like the random forest works better but let's scale our data and see.

In [15]:
# Scale the data

## Instatiate the scaler
scaler = StandardScaler().fit(X_train)
## Fit the scaler
X_train_scaled = scaler.transform(X_train)
## Display
X_train_scaled

array([[-1.31172014, -1.31172014, -0.39311205, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [-0.46579523, -0.46579523,  0.35168119, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [ 1.3364188 ,  1.3364188 ,  0.25400339, ...,  0.17149859,
        -0.17149859,  0.02026518],
       ...,
       [ 1.67571549,  1.67571549, -1.34791257, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [ 1.67600634,  1.67600634, -0.23438563, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [ 1.67906533,  1.67906533, -0.23438563, ...,  0.17149859,
        -0.17149859,  0.02026518]])

In [16]:
## and the same for the test!
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-1.20255948, -1.20255948,  2.20755943, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [-1.62943343, -1.62943343, -1.11348584, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [-1.49837845, -1.49837845, -1.34791257, ...,  0.17149859,
        -0.17149859,  0.02026518],
       ...,
       [-1.10927546, -1.10927546, -0.72277464, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [-1.10922531, -1.10922531, -0.91813024, ...,  0.17149859,
        -0.17149859,  0.02026518],
       [-1.1091551 , -1.1091551 ,  1.23078141, ...,  0.17149859,
        -0.17149859,  0.02026518]])

In [17]:
# Train the Logistic Regression model on the scaled data and print the model score

## Instantiate
logisticModel = LogisticRegression()
## Fit
logisticModel.fit(X_train_scaled, y_train)
## Score
logisticModel.score(X_test_scaled, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7218205019140791

In [18]:
# Train a Random Forest Classifier model on the scaled data and print the model score

## Instantiate
randomForestModel = RandomForestClassifier(random_state=1)
## fit
randomForestModel.fit(X_train_scaled, y_train)
## Score
randomForestModel.score(X_test_scaled, y_test)

0.5978307103360272

## I'm surprised at how much better the linear regression model did when scaled compared to the unscaled data. Even surpassing the random forest. The random forest actually decreased by a very small amount in scoring after scaling.