In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
data = Path('Resources/lending_data.csv')
df_lending_data = pd.read_csv(data)


# Review the DataFrame
df_lending_data.sample(10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
4247,8700.0,6.834,44900,0.331849,3,0,14900,0
36428,9500.0,7.145,47800,0.372385,4,0,17800,0
44504,10300.0,7.49,51100,0.412916,4,1,21100,0
17524,10900.0,7.752,53500,0.439252,5,1,23500,0
30973,11200.0,7.893,54900,0.453552,5,1,24900,0
67349,9800.0,7.272,49000,0.387755,4,0,19000,0
13874,9900.0,7.323,49500,0.393939,4,0,19500,0
76862,16700.0,10.239,77000,0.61039,10,2,47000,1
54860,9200.0,7.019,46700,0.357602,3,0,16700,0
53604,8900.0,6.905,45600,0.342105,3,0,15600,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = df_lending_data["loan_status"]

# Separate the X variable, the features
X = df_lending_data.drop(["loan_status"], axis=1)

In [4]:
# Review the y variable Series
y.sample(10)

42443    0
23795    0
9636     0
24369    0
34951    0
59532    0
71153    0
5900     0
40509    0
62580    0
Name: loan_status, dtype: int64

In [5]:
# Review the X variable DataFrame
X.sample(10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
26022,11000.0,7.782,53800,0.442379,5,1,23800
75826,18500.0,10.969,83800,0.642005,12,2,53800
58563,10300.0,7.501,51200,0.414062,4,1,21200
3668,9600.0,7.185,48200,0.377593,4,0,18200
74614,10300.0,7.495,51100,0.412916,4,1,21100
71397,7500.0,6.324,40100,0.25187,2,0,10100
72056,10100.0,7.427,50500,0.405941,4,1,20500
29731,6500.0,5.9,36100,0.168975,1,0,6100
60503,8500.0,6.758,44200,0.321267,3,0,14200
10623,11000.0,7.787,53900,0.443414,5,1,23900


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [6]:
# Check the balance of our target values
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split (X, y, random_state=1, train_size=0.7)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [8]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state = 1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)



In [9]:
# Generate training predictions on the data
training_predictions = logistic_regression_model.predict(X_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model and score the model.

In [10]:
# Make a prediction using the testing data
testing_predictions = logistic_regression_model.predict(X_test)
df_results = pd.DataFrame({
    "Prediction" : testing_predictions,
    "Actual" : y_test
})
df_results.sample(10)

Unnamed: 0,Prediction,Actual
16492,0,0
60741,0,0
6895,0,0
52803,0,0
25408,0,0
56082,0,0
63296,0,0
19550,0,0
66250,0,0
10489,0,0


In [11]:
#Print the model score for the Training and Testing 
print(f"Training Data Score: {round(lr_model.score(X_train, y_train),4)}")
print(f"Testing Data Score: {round(lr_model.score(X_test, y_test),4)}")

Training Data Score: 0.9921
Testing Data Score: 0.992


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [32]:
# Print the balanced_accuracy score of the model
acc_score = balanced_accuracy_score(y_test, testing_predictions)


In [33]:
# Generate a confusion matrix for the model
training_matrix = confusion_matrix(y_train, training_predictions)


In [34]:
# Print the classification report for the model
training_report = classification_report(y_train, training_predictions)


In [35]:
 # Displaying results
print("Confusion Matrix")
display(training_matrix)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(training_report)

Confusion Matrix


array([[52258,   263],
       [  167,  1587]])

Accuracy Score : 0.950507049515396
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     52521
           1       0.86      0.90      0.88      1754

    accuracy                           0.99     54275
   macro avg       0.93      0.95      0.94     54275
weighted avg       0.99      0.99      0.99     54275



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression modle was 95% accurate in predicting the healthy vs high risk loans

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [22]:
# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = ros.fit_resample(X, y)

In [23]:
# Count the distinct values of the resampled labels data
y_resampled.value_counts()

0    75036
1    75036
Name: loan_status, dtype: int64

### Step 2: Split the over sampled data into training and testing datasets by using `train_test_split`.

In [24]:
# Split the over sampled data using train_test_split
# Assign a random_state of 1 to the function
X_resampled_train, X_resampled_test, y_resampled_train, y_resampled_test = train_test_split(X_resampled,
                                                                                             y_resampled,
                                                                                             random_state=1,
                                                                                             train_size=0.7)

### Step 3: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [25]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
ros_model = LogisticRegression(random_state = 1)

# Fit the model using over sampled training data
ros_model.fit(X_resampled_train, y_resampled_train)

# Generate training predictions on the over sampled data data
ros_training_predictions = ros_model.predict(X_resampled_train)




In [26]:
# Make a prediction using the over sampled testing data
ros_testing_predictions = ros_model.predict(X_resampled_test)
df_ros_results = pd.DataFrame({
    "Prediction" : ros_testing_predictions,
    "Actual" : y_resampled_test
})
df_ros_results.sample(10)

Unnamed: 0,Prediction,Actual
128171,1,1
109356,1,1
24776,0,0
79332,1,1
144215,1,1
51492,0,0
93813,1,1
132256,1,1
35956,0,0
96956,1,1


### Step 4: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [36]:
# Print the balanced_accuracy score of the model 
ros_acc_score = balanced_accuracy_score(y_resampled_test, ros_testing_predictions)


In [37]:
# Generate a confusion matrix for the model
ros_matrix = confusion_matrix(y_resampled_train, ros_training_predictions)


In [38]:
# Print the classification report for the model
ros_report = classification_report(y_resampled_train, ros_training_predictions)


In [39]:
 # Displaying results
print("Confusion Matrix")
print(ros_matrix)
print(f"Accuracy Score : {ros_acc_score}")
print("Classification Report")
print(ros_report)

Confusion Matrix
[[52111   309]
 [  293 52337]]
Accuracy Score : 0.9950470690253296
Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     52420
           1       0.99      0.99      0.99     52630

    accuracy                           0.99    105050
   macro avg       0.99      0.99      0.99    105050
weighted avg       0.99      0.99      0.99    105050



### Step 5: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer: The logistic regression model predicts the oversampled data with near-perfect accuracy (>99% accurate)

# Credit Risk Analysis Report

Explain the purpose of this analysis:

The purpose of this analysis is to create and evaluate the accuracy of a data model that predicts the credity worthiness of potential borrowers from peer-to-peer lending services

Using a bulleted list, describe the accuracy score, the precision score, and recall score of the machine learning model.
    Balanced Accuracy Score: 95.1% --> this means that when taking into account the sensitivity (recall and/or true positive rate) and specificity (true negative rate) of the model, the balanced prediction accuracy was 95.1%
    Precision Score: 93% --> This means 93% of predicted positives were correct
    Recall Score: 95% --> this means that the model was 95% precise in measuring true positive values our of all positive predictions made

Summarize the results from the machine learning model. 
Include your justification for recommending the model for use by the company. If you don’t recommend the model, justify your reasoning.

I would recommend using this model to predict the creditworthiness of borrowers, because it has over 95% accuracy in predicting the outcome of the repayment of the initial loan. That accuracy range could be easily molded into a business risk profile to ensure sufficient capital flow for the lenders to remain in business/make a profit.