In [44]:
import numpy as np
import pandas as pd
from pathlib import Path
# Additional imports:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [45]:
# Load the two loan datasets: the 2019 file is used for training and the
# 2020 Q1 file is held out for testing.
train_df = pd.read_csv(filepath_or_buffer=Path('Resources/2019loans.csv'))
test_df = pd.read_csv(filepath_or_buffer=Path('Resources/2020Q1loans.csv'))

In [46]:
# Split the training frame into features and target.
# The two bookkeeping columns ("Unnamed: 0", "index") and the target itself
# are removed from the feature matrix; "loan_status" is the label to predict.
X_train = train_df.drop(columns=["Unnamed: 0", "index", "loan_status"])
y_train = train_df["loan_status"]

In [47]:
# One-hot encode the categorical training features. drop_first=True drops
# the first level of each encoded column.
X_train = pd.get_dummies(data=X_train, drop_first=True)
# Print the column summary to confirm the resulting feature set.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12180 entries, 0 to 12179
Data columns (total 85 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            12180 non-null  float64
 1   int_rate                             12180 non-null  float64
 2   installment                          12180 non-null  float64
 3   annual_inc                           12180 non-null  float64
 4   dti                                  12180 non-null  float64
 5   delinq_2yrs                          12180 non-null  float64
 6   inq_last_6mths                       12180 non-null  float64
 7   open_acc                             12180 non-null  float64
 8   pub_rec                              12180 non-null  float64
 9   revol_bal                            12180 non-null  float64
 10  total_acc                            12180 non-null  float64
 11  out_prncp                   

In [48]:
# Split the testing frame into features and target, mirroring the training
# split: drop the bookkeeping columns and the "loan_status" label from the
# feature matrix and keep the label separately.
X_test = test_df.drop(columns=["Unnamed: 0", "index", "loan_status"])
y_test = test_df["loan_status"]

In [49]:
# One-hot encode the categorical testing features with the same settings
# used for the training features.
X_test = pd.get_dummies(data=X_test, drop_first=True)
# Print the column summary so it can be compared with the training frame.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 84 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            4702 non-null   float64
 1   int_rate                             4702 non-null   float64
 2   installment                          4702 non-null   float64
 3   annual_inc                           4702 non-null   float64
 4   dti                                  4702 non-null   float64
 5   delinq_2yrs                          4702 non-null   float64
 6   inq_last_6mths                       4702 non-null   float64
 7   open_acc                             4702 non-null   float64
 8   pub_rec                              4702 non-null   float64
 9   revol_bal                            4702 non-null   float64
 10  total_acc                            4702 non-null   float64
 11  out_prncp                     

In [50]:
# Align the testing features with the training features.
# The two frames were one-hot encoded separately, so their dummy columns can
# differ (here the test set ends up without "debt_settlement_flag_Y").
# Reindexing on the training columns:
#   * creates every column the test set is missing and fills it with 0,
#   * drops any test-only column the models were never trained on,
#   * puts the columns in exactly the same order as X_train,
# and, unlike a hard-coded assignment, never overwrites a column that
# already holds real data.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4702 entries, 0 to 4701
Data columns (total 85 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   loan_amnt                            4702 non-null   float64
 1   int_rate                             4702 non-null   float64
 2   installment                          4702 non-null   float64
 3   annual_inc                           4702 non-null   float64
 4   dti                                  4702 non-null   float64
 5   delinq_2yrs                          4702 non-null   float64
 6   inq_last_6mths                       4702 non-null   float64
 7   open_acc                             4702 non-null   float64
 8   pub_rec                              4702 non-null   float64
 9   revol_bal                            4702 non-null   float64
 10  total_acc                            4702 non-null   float64
 11  out_prncp                     

In [53]:
# Baseline: logistic regression fitted on the raw (unscaled) features.
# max_iter is raised to 15000 for the fit on unscaled inputs.
classifier = LogisticRegression(max_iter=15000).fit(X_train, y_train)

# Report mean accuracy on the training set and on the held-out test set.
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.7020525451559935
Testing Data Score: 0.5759251382390472


In [54]:
# Baseline: random forest (50 trees, fixed seed for reproducibility) fitted
# on the raw (unscaled) features.
clf = RandomForestClassifier(random_state=1, n_estimators=50)
clf.fit(X_train, y_train)

# Report mean accuracy on the training set and on the held-out test set.
print(f'Training Score: {clf.score(X_train, y_train):.3f}')
print(f'Testing Score: {clf.score(X_test, y_test):.3f}')

Training Score: 1.000
Testing Score: 0.631


In [55]:
# Standardize the features. The scaler is fitted on the training data only
# and the same fitted scaler is then applied to the testing data, so the
# test set does not influence the scaling parameters.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [56]:
# Logistic regression fitted on the standardized features, for comparison
# with the unscaled run.
classifier = LogisticRegression(max_iter=15000).fit(X_train_scaled, y_train)

# Report mean accuracy on the scaled training and testing sets.
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train):.3f}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test):.3f}")

Training Data Score: 0.708
Testing Data Score: 0.768


In [57]:
# Random forest (50 trees, fixed seed) fitted on the standardized features,
# for comparison with the unscaled run.
clf = RandomForestClassifier(random_state=1, n_estimators=50)
clf.fit(X_train_scaled, y_train)

# Report mean accuracy on the scaled training and testing sets.
print(f'Training Score: {clf.score(X_train_scaled, y_train):.3f}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test):.3f}')

Training Score: 1.000
Testing Score: 0.631
