In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

### Step 1: Read the lending_data.csv data from the Resources folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
# YOUR CODE HERE!
loan_df = pd.read_csv(Path('../Resources/lending_data.csv'))
# Review the DataFrame
# YOUR CODE HERE!
loan_df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Identifying shape of the data
loan_df.shape

(77536, 8)

In [4]:
# Getting additional information on the data
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   borrower_income   77536 non-null  int64  
 3   debt_to_income    77536 non-null  float64
 4   num_of_accounts   77536 non-null  int64  
 5   derogatory_marks  77536 non-null  int64  
 6   total_debt        77536 non-null  int64  
 7   loan_status       77536 non-null  int64  
dtypes: float64(3), int64(5)
memory usage: 4.7 MB


In [5]:
# Looking at column headers
loan_df.columns

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')

In [6]:
# Data statistics for evaluation purposes.
loan_df.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.032243
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.176646
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0


In [7]:
# Calculate the sum total of the outstanding loans
total = loan_df['loan_size'].sum()
total

760284100.0

In [8]:
# Calculate the total of the loans in default
defaults = loan_df.loc[loan_df['loan_status'] == 1, ['loan_size']].sum()
defaults


loan_size    46269500.0
dtype: float64

In [9]:
 #alculating the percent of oustanding loans in default
percent_default = defaults / total *100
percent_default

loan_size    6.085817
dtype: float64

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [10]:
# Separate the data into labels and features

# Separate the y variable, the labels
# YOUR CODE HERE!]
y = loan_df['loan_status']
# Separate the X variable, the features
# YOUR CODE HERE!
X = loan_df.drop(columns='loan_status')

In [11]:
# Review the y variable Series
# YOUR CODE HERE!
y.head()

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [12]:
# Review the X variable DataFrame
# YOUR CODE HERE!
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [13]:
# Additional code added to scale the features data for comparison purposes

# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X)

# Transform the training data using the scaler
X_scaled = X_scaler.transform(X)
# Results of scaling
X_scaled

array([[ 0.42730427,  0.4268375 ,  0.42740435, ...,  0.61614258,
         1.04399575,  0.42740435],
       [-0.67148676, -0.67491817, -0.67155173, ..., -0.43404935,
        -0.67397306, -0.67155173],
       [-0.38484562, -0.370249  , -0.37292236, ..., -0.43404935,
        -0.67397306, -0.37292236],
       ...,
       [ 3.72367737,  3.71299241,  3.71232744, ...,  3.76671836,
         2.76196455,  3.71232744],
       [ 3.10262157,  3.12051768,  3.1150687 , ...,  3.24162239,
         2.76196455,  3.1150687 ],
       [ 2.76820691,  2.75401528,  2.75671345, ...,  2.71652643,
         2.76196455,  2.75671345]])

### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [14]:
# Check the balance of our target values
# YOUR CODE HERE!
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [15]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
# YOUR CODE HERE!
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, random_state=1)

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [16]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!
lr_model_scaled = LogisticRegression(solver='lbfgs', random_state=1)
# Fit the model using training data
# YOUR CODE HERE!
lr_model_scaled.fit(X_train, y_train)
# Validating the model using score.
print(f"Training Data Score: {lr_model_scaled.score(X_train, y_train)}")
print(f"Testing Data Score: {lr_model_scaled.score(X_test, y_test)}")

Training Data Score: 0.9942908240473243
Testing Data Score: 0.9936545604622369


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [17]:
# Make a prediction using the testing data
# YOUR CODE HERE!
loan_predictions = lr_model_scaled.predict(X_test)
pd.DataFrame({"Prediction": loan_predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
60914,0,0
36843,0,0
1966,0,0
70137,0,0
27237,0,0
...,...,...
45639,0,0
11301,0,0
51614,0,0
4598,0,0


### Step 3: Evaluate the model’s performance by doing the following:
Calculate the accuracy score of the model.

Generate a confusion matrix.

Print the classification report.

In [18]:

# Print the balanced_accuracy score of the model
# YOUR CODE HERE!
loan_acc_scaled = balanced_accuracy_score(y_test, loan_predictions)

# Generate a confusion matrix for the model
# YOUR CODE HERE!
c_matrix_scaled = confusion_matrix(y_test, loan_predictions)


# Print the classification report for the model
# YOUR CODE HERE!
loan_test_report_scaled = classification_report(y_test, loan_predictions)
print(loan_test_report_scaled)
print(c_matrix_scaled)
loan_acc_scaled

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.84      0.98      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384

[[18652   113]
 [   10   609]]


0.9889115309798473

### Step 4: Answer the following question.

Question: How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

Answer: Prediction model using scaled data demonstrates better results over raw data, balanced accuracy score increased to .370, recall increased by 7% while precision dropped by 1%. The confusion matrix false positives dropped drastically for the data set. However, false negatives increased, true positives dropped equally to false negatives. Predicting model with re-sampled data could provide better information on whether scaled provides better predictive mode.

## Predict a Logistic Regression Model with Resampled Training Data

#### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [20]:
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_scaled, y)


# Fit the original training data to the random_oversampler model
# YOUR CODE HERE!
ros_scaled.fit(X_train, y_train)
# Count the distinct values of the resampled labels data
# YOUR CODE HERE!
y_resampled.value_counts()

0    75036
1    75036
Name: loan_status, dtype: int64

#### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [21]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
# YOUR CODE HERE!
ros_model = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using the resampled training data
# YOUR CODE HERE!
ros_model.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
# YOUR CODE HERE!
ros_predictions = ros_model.predict(X_test)

#### Step 3: Evaluate the model’s performance by doing the following:
Calculate the accuracy score of the model.

Generate a confusion matrix.

Print the classification report.

In [22]:
# Print the balanced_accuracy score of the model 
# YOUR CODE HERE!
ros_acc_scaled = balanced_accuracy_score(y_test, ros_predictions)
ros_acc_scaled

0.9934383134311076

In [23]:
# Generate a confusion matrix for the model
# YOUR CODE HERE!
ros_c_matrix_scaled = confusion_matrix(y_test, ros_predictions)
print(ros_c_matrix_scaled)

[[18640   125]
 [    4   615]]


In [24]:
# Print the classification report for the model
# YOUR CODE HERE!
loan_ros_report_scaled = classification_report(y_test, ros_predictions)
print(loan_ros_report_scaled)

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     18765
           1       0.83      0.99      0.91       619

    accuracy                           0.99     19384
   macro avg       0.92      0.99      0.95     19384
weighted avg       0.99      0.99      0.99     19384



### Step 4: Answer the following question

Question: How well does the logistic regression model, fit with oversampled data, predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

Answer: Oversampled data represents high degree of accuracy, however shows minute improvement over raw data.False negatives increased from 125 to 116 therfore it may be useful to use raw data when prediciting.