In [3]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

# Split the Data into Training and Testing Sets

### Step 1: Read the lending_data.csv data from the Resources folder into a Pandas DF

In [4]:
# Load the lending data from the Resources folder into a DataFrame
df = pd.read_csv("Resources/lending_data.csv")

# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### Step 2: Create the labels set from the "loan_status" column, and then create the features DF from the remaining columns

In [5]:
# Create the labels set (y) from the 'loan_status' column
y = df['loan_status']

# Create the features set (X) from every remaining column.
# drop() without inplace=True returns a new DataFrame, so `df` is left
# untouched and this cell can be re-run without a KeyError on 'loan_status'.
X = df.drop(columns='loan_status')


In [6]:
# Display the labels Series to confirm it was extracted from the DataFrame
y

0        0
1        0
2        0
3        0
4        0
        ..
77531    1
77532    1
77533    1
77534    1
77535    1
Name: loan_status, Length: 77536, dtype: int64

### Step 3: Check the balance of the labels variable by using the value_counts function

In [7]:

# Keep a handle on the feature column labels
# NOTE(review): feature_names is not referenced by any later cell shown here — confirm it is still needed
feature_names = X.columns
# Count the rows per label value to check how balanced the classes are
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

### Step 4: Split the data into training and testing datasets by using the train_test_split function

In [8]:
# Bring in scikit-learn's train/test splitting helper
from sklearn.model_selection import train_test_split

# Hold out a test partition; random_state=1 keeps the shuffle reproducible.
# test_size=0.25 spells out the library default, which is the split the
# original call already produced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# Create a Logistic Regression Model with the Original Data

### Step 1: Fit a logistic regression model by using the training data (X_train and y_train)

In [9]:
# Import the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Instantiate the model with random_state=1 and fit it on the training data
# in a single expression; fit() returns the fitted estimator itself
classifier = LogisticRegression(solver='lbfgs', random_state=1).fit(X_train, y_train)

# Show the fitted estimator as the cell output
classifier

LogisticRegression(random_state=1)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitting model

In [10]:
# Predict a label for every row of the held-out test features
# using the classifier fitted in the previous cell
predictions = classifier.predict(X_test)

### Step 3: Evaluate the model's performance

In [11]:
# Calculate the accuracy score of the model
# (this import is also what the later resampled-model accuracy cell relies on)
from sklearn.metrics import accuracy_score
# Display the accuracy score for the test dataset.
# NOTE(review): the labels are heavily imbalanced (75,036 vs 2,500 rows), so
# plain accuracy is dominated by the majority class; balanced_accuracy_score is
# already imported in the first cell and may be the more informative metric.
accuracy_score(y_test, predictions)

0.9918489475856377

In [12]:
# Build the confusion matrix for the test set
# (rows = actual labels, columns = predicted labels).
# confusion_matrix is already imported in the notebook's first cell.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[18663,   102],
       [   56,   563]])

In [13]:
# Summarize per-class precision, recall and F1 on the test set.
# classification_report is already imported in the notebook's first cell.
target_names = ["healthy", "default"]
print(
    classification_report(
        y_true=y_test,
        y_pred=predictions,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

     healthy       1.00      0.99      1.00     18765
     default       0.85      0.91      0.88       619

    accuracy                           0.99     19384
   macro avg       0.92      0.95      0.94     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Question 1

How well does the logistic regression model predict both the '0' (healthy loan) and '1' (high-risk loan) labels?

ANSWER: The logistic regression model does an okay job of predicting both the healthy loans and the high-risk loans. However, the precision for the high-risk loans is 85%, which means that when the model flags a loan as high-risk, there is roughly a 15% chance that it is actually a healthy loan being mislabeled as risky. The recall for the high-risk loans is 91%, compared with a precision of 100% and a recall of 99% for the healthy loans. This indicates that there are indeed loans predicted as healthy that are not actually healthy (false positives for the healthy class; 56 in the test set), but the number is very low compared to the amount of healthy loans. The number of "high risk" loans is much lower than the number of "healthy" loans, which is why the precision and recall for the two categories are so different.

# Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the RandomOverSampler module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points

In [15]:
# Import the RandomOverSampler module from imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)

# Resample only the training split; the test split is left untouched so the
# evaluation still reflects the original class distribution
X_res, y_res = ros.fit_resample(X_train, y_train)

# Confirm that both labels now have an equal number of data points
y_res.value_counts()

### Step 2: Use the LogisticRegression classifier and the resampled data to fit the model and make predictions

In [16]:
# Create a separate classifier for the resampled data. It gets its own name
# (matching X_res / y_res / predictions_res) so the model fitted on the
# original training data stays bound to `classifier`; re-running the earlier
# prediction cell therefore still uses the baseline model.
classifier_res = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the resampled training data
classifier_res.fit(X_res, y_res)

# Predict outcomes for the (non-resampled) test data set
predictions_res = classifier_res.predict(X_test)

### Step 3: Evaluate the model's performance

In [17]:
# Display the accuracy score on the test dataset for the model
# trained on the oversampled data (accuracy_score is imported in an earlier cell).
accuracy_score(y_test, predictions_res)

0.9938093272802311

In [18]:
# Build the confusion matrix for the resampled-model predictions
# (rows = actual labels, columns = predicted labels).
# confusion_matrix is already imported in the notebook's first cell.
confusion_matrix(y_true=y_test, y_pred=predictions_res)

array([[18649,   116],
       [    4,   615]])

In [19]:
# Summarize per-class precision, recall and F1 for the resampled model.
# classification_report is already imported in the notebook's first cell.
target_names = ["healthy", "default"]
print(
    classification_report(
        y_true=y_test,
        y_pred=predictions_res,
        target_names=target_names,
    )
)

              precision    recall  f1-score   support

     healthy       1.00      0.99      1.00     18765
     default       0.84      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Question 2

How well does the logistic regression model fit with oversampled data?

ANSWER: While the precision for the high-risk loans got slightly worse (85% to 84%), their recall was significantly better, rising from 91% to 99%. This indicates that there are fewer risky loans being mislabeled as healthy (false negatives for risky loans): 4 in the test set, down from 56. However, the precision for the risky loans is still in the mid 80s. This indicates that there are still loans that are being mislabeled as risky but are in fact healthy. A bank may prefer that healthy loans be mislabeled as risky rather than risky loans be mislabeled as healthy, so this resampling of the data would support the use of this model.