In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

In [3]:
train_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,25000.0,0.2,662.35,RENT,45000.0,Verified,n,22.4,2.0,0.0,...,25.0,0.0,0.0,41532.0,12941.0,18000.0,20632.0,N,N,low_risk
1,35000.0,0.1862,900.62,MORTGAGE,130000.0,Not Verified,n,17.71,0.0,0.0,...,88.9,0.0,0.0,401978.0,69537.0,65200.0,19056.0,N,N,low_risk
2,6400.0,0.0881,202.96,RENT,70000.0,Not Verified,n,3.27,0.0,0.0,...,0.0,1.0,0.0,23400.0,3571.0,15800.0,4000.0,N,N,low_risk
3,7000.0,0.2,260.15,RENT,65000.0,Verified,n,15.66,0.0,0.0,...,50.0,0.0,0.0,64853.0,53589.0,5300.0,57453.0,N,N,low_risk
4,8500.0,0.143,291.75,OWN,41000.0,Not Verified,n,24.18,0.0,0.0,...,25.0,1.0,0.0,37469.0,19623.0,16000.0,18169.0,N,N,low_risk


In [4]:
test_df.head()

Unnamed: 0,loan_amnt,int_rate,installment,home_ownership,annual_inc,verification_status,pymnt_plan,dti,delinq_2yrs,inq_last_6mths,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,debt_settlement_flag,target
0,8000.0,0.0819,251.4,MORTGAGE,53000.0,Not Verified,n,30.87,0.0,1.0,...,0.0,0.0,0.0,214538.0,52345.0,500.0,67066.0,N,N,low_risk
1,30000.0,0.1102,652.58,MORTGAGE,120000.0,Not Verified,n,21.53,0.0,0.0,...,0.0,0.0,0.0,554901.0,101234.0,42500.0,119678.0,N,N,low_risk
2,16000.0,0.0819,325.88,MORTGAGE,95000.0,Verified,n,23.37,0.0,1.0,...,42.9,1.0,0.0,292025.0,76609.0,29900.0,62902.0,N,N,low_risk
3,3000.0,0.1524,104.35,MORTGAGE,50000.0,Not Verified,n,22.3,0.0,0.0,...,75.0,0.0,0.0,472470.0,219678.0,48100.0,171741.0,N,N,low_risk
4,10000.0,0.2305,282.2,OWN,34000.0,Not Verified,n,5.58,0.0,2.0,...,0.0,0.0,0.0,14729.0,5309.0,12800.0,1929.0,N,N,low_risk


In [5]:
# Convert categorical data to numeric and separate target feature for training data
y_train = train_df["target"]
X_train = train_df.drop(columns = ["target"])
X_train = pd.get_dummies(X_train)
X_train.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N
0,25000.0,0.2,662.35,45000.0,22.4,2.0,0.0,9.0,0.0,6637.0,...,0,1,1,0,1,0,1,1,0,1
1,35000.0,0.1862,900.62,130000.0,17.71,0.0,0.0,14.0,0.0,58513.0,...,0,0,1,0,1,1,0,1,0,1
2,6400.0,0.0881,202.96,70000.0,3.27,0.0,0.0,6.0,1.0,1238.0,...,0,0,1,0,1,1,0,1,0,1
3,7000.0,0.2,260.15,65000.0,15.66,0.0,0.0,12.0,0.0,4578.0,...,0,1,1,0,1,0,1,1,0,1
4,8500.0,0.143,291.75,41000.0,24.18,0.0,0.0,9.0,1.0,6540.0,...,0,0,1,0,1,1,0,1,0,1


In [6]:
# Convert categorical data to numeric and separate target feature for testing data
y_test = test_df["target"]
X_test = test_df.drop(columns = ["target"])
X_test = pd.get_dummies(X_test)
X_test.head()

Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,debt_settlement_flag_N
0,8000.0,0.0819,251.4,53000.0,30.87,0.0,1.0,11.0,0.0,13802.0,...,1,0,0,1,0,1,0,1,1,1
1,30000.0,0.1102,652.58,120000.0,21.53,0.0,0.0,13.0,0.0,19427.0,...,1,0,0,1,0,1,0,1,1,1
2,16000.0,0.0819,325.88,95000.0,23.37,0.0,1.0,16.0,1.0,20786.0,...,0,0,1,1,0,1,1,0,1,1
3,3000.0,0.1524,104.35,50000.0,22.3,0.0,0.0,14.0,0.0,30125.0,...,1,0,0,1,0,1,1,0,1,1
4,10000.0,0.2305,282.2,34000.0,5.58,0.0,2.0,9.0,0.0,4166.0,...,1,0,0,1,0,1,1,0,1,1


In [7]:
# add missing dummy variables to testing set
for column in X_train.columns:
    if column not in X_test.columns:
        X_test[column] = 0

In [8]:
# Train the Logistic Regression model on the unscaled data and print the model score
reg = LogisticRegression().fit(X_train, y_train)
reg.score(X_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Feature names must be in the same order as they were in fit.



0.4889086069210293

In [11]:
# Train a Random Forest Classifier model and print the model score
randomForestClass = RandomForestClassifier(random_state=0)
randomForestClass.fit(X_train, y_train)

print(f"Training score: {randomForestClass.score(X_train,y_train)}")
print(f"Testing score: {randomForestClass.score(X_test,y_test)}")

Training score: 1.0


Feature names must be in the same order as they were in fit.



Testing score: 0.6343537414965986


In [10]:
# Scale the data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Feature names must be in the same order as they were in fit.



In [12]:
# Train the Logistic Regression model on the scaled data and print the model score
logistic_scaled = LogisticRegression(
    solver='lbfgs',
    max_iter=100,
    random_state=0
)
logistic_scaled.fit(X_train_scaled, y_train)
print(f"LogisticRegression testing score (scaled): {logistic_scaled.score(X_test_scaled, y_test)}")

LogisticRegression testing score (scaled): 0.5564921620822242


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Train a Random Forest Classifier model on the scaled data and print the model score
randomForestClass_scaled = RandomForestClassifier(random_state=0)
randomForestClass_scaled.fit(X_train_scaled, y_train)
print(f"RandomForestClassifier scaled score: {randomForestClass_scaled.score(X_test_scaled, y_test)}")

RandomForestClassifier scaled score: 0.6349452824608104
