In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import RandomOverSampler

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
data = Path('Resources/lending_data.csv')
df_lending_data = pd.read_csv(data)


# Review the DataFrame
df_lending_data.sample(10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
61000,11000.0,7.794,53900,0.443414,5,1,23900,0
35878,8500.0,6.747,44100,0.319728,3,0,14100,0
59194,9900.0,7.349,49800,0.39759,4,0,19800,0
74775,10400.0,7.545,51600,0.418605,4,1,21600,0
30047,10300.0,7.521,51400,0.416342,4,1,21400,0
24820,11300.0,7.915,55100,0.455535,5,1,25100,0
30640,11500.0,8.008,56000,0.464286,5,1,26000,0
67346,8400.0,6.683,43500,0.310345,3,0,13500,0
62496,10200.0,7.445,50700,0.408284,4,1,20700,0
12426,11100.0,7.822,54200,0.446494,5,1,24200,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = df_lending_data["loan_status"]

# Separate the X variable, the features
X = df_lending_data.drop(["loan_status"], axis=1)

In [4]:
# Review the y variable Series
y.sample(10)

18556    0
22926    0
52598    0
33177    0
11057    0
57546    0
9665     0
15236    0
49174    0
60273    0
Name: loan_status, dtype: int64

In [5]:
# Review the X variable DataFrame
X.sample(10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
67634,9100.0,6.979,46300,0.352052,3,0,16300
15311,8100.0,6.583,42600,0.295775,2,0,12600
45409,7700.0,6.404,40900,0.266504,2,0,10900
64354,10900.0,7.751,53500,0.439252,5,1,23500
41098,10400.0,7.557,51700,0.419729,4,1,21700
51725,10000.0,7.354,49800,0.39759,4,0,19800
62313,9400.0,7.099,47400,0.367089,3,0,17400
73569,9700.0,7.265,49000,0.387755,4,0,19000
60620,10200.0,7.44,50600,0.407115,4,1,20600
27547,10700.0,7.664,52700,0.43074,5,1,22700


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [6]:
# Check the balance of our target values
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split (X, y, random_state=1, train_size=0.7)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [8]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_regression_model = LogisticRegression(random_state = 1)

# Fit the model using training data
lr_model = logistic_regression_model.fit(X_train, y_train)



In [9]:
# Generate training predictions on the data
training_predictions = logistic_regression_model.predict(X_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model and score the model.

In [10]:
# Make a prediction using the testing data
testing_predictions = logistic_regression_model.predict(X_test)
df_results = pd.DataFrame({
    "Prediction" : testing_predictions,
    "Actual" : y_test
})
df_results.sample(10)

Unnamed: 0,Prediction,Actual
20059,0,0
45841,0,0
47592,0,0
31236,0,0
18417,0,0
11145,0,0
60011,0,0
10234,0,0
42643,0,0
44033,0,0


In [11]:
#Print the model score for the Training and Testing 
print(f"Training Data Score: {round(lr_model.score(X_train, y_train),4)}")
print(f"Testing Data Score: {round(lr_model.score(X_test, y_test),4)}")

Training Data Score: 0.9921
Testing Data Score: 0.992


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [25]:
# Print the balanced_accuracy score of the model
acc_score = round(balanced_accuracy_score(y_test, testing_predictions), 4)


In [26]:
# Generate a confusion matrix for the model
training_matrix = confusion_matrix(y_train, training_predictions)


In [27]:
# Print the classification report for the model
training_report = classification_report(y_train, training_predictions)


In [28]:
 # Displaying results
print("Confusion Matrix")
display(training_matrix)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(training_report)

Confusion Matrix


array([[52258,   263],
       [  167,  1587]])

Accuracy Score : 0.9505
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     52521
           1       0.86      0.90      0.88      1754

    accuracy                           0.99     54275
   macro avg       0.93      0.95      0.94     54275
weighted avg       0.99      0.99      0.99     54275



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression modle was 95% accurate in predicting the healthy vs high risk loans

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [16]:
# Instantiate the random oversampler model
# # Assign a random_state parameter of 1 to the model
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = ros.fit_resample(X, y)

In [17]:
# Count the distinct values of the resampled labels data
y_resampled.value_counts()

0    75036
1    75036
Name: loan_status, dtype: int64

### Step 2: Split the over sampled data into training and testing datasets by using `train_test_split`.

In [18]:
# Split the over sampled data using train_test_split
# Assign a random_state of 1 to the function
X_resampled_train, X_resampled_test, y_resampled_train, y_resampled_test = train_test_split(X_resampled,
                                                                                             y_resampled,
                                                                                             random_state=1,
                                                                                             train_size=0.7)

### Step 3: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [19]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
ros_model = LogisticRegression(random_state = 1)

# Fit the model using over sampled training data
ros_model.fit(X_resampled_train, y_resampled_train)

# Generate training predictions on the over sampled data data
ros_training_predictions = ros_model.predict(X_resampled_train)




In [20]:
# Make a prediction using the over sampled testing data
ros_testing_predictions = ros_model.predict(X_resampled_test)
df_ros_results = pd.DataFrame({
    "Prediction" : ros_testing_predictions,
    "Actual" : y_resampled_test
})
df_ros_results.sample(10)

Unnamed: 0,Prediction,Actual
37646,0,0
84805,1,1
43001,0,0
109046,1,1
50093,0,0
39181,0,0
45104,0,0
102730,1,1
14293,0,0
44149,0,0


### Step 4: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [29]:
# Print the balanced_accuracy score of the model 
ros_acc_score = round(balanced_accuracy_score(y_resampled_test, ros_testing_predictions),4)


In [30]:
# Generate a confusion matrix for the model
ros_matrix = confusion_matrix(y_resampled_train, ros_training_predictions)


In [31]:
# Print the classification report for the model
ros_report = classification_report(y_resampled_train, ros_training_predictions)


In [32]:
 # Displaying results
print("Over Sampled Confusion Matrix")
print(ros_matrix)
print(f"Over Sampled Accuracy Score : {ros_acc_score}")
print("Over Sampled Classification Report")
print(ros_report)

Over Sampled Confusion Matrix
[[52111   309]
 [  293 52337]]
Over Sampled Accuracy Score : 0.995
Over Sampled Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     52420
           1       0.99      0.99      0.99     52630

    accuracy                           0.99    105050
   macro avg       0.99      0.99      0.99    105050
weighted avg       0.99      0.99      0.99    105050



### Step 5: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer: The logistic regression model predicts the oversampled data with near-perfect accuracy (>99% accurate)

# Credit Risk Analysis Report

Explain the purpose of this analysis:

The purpose of this analysis is to create and evaluate the accuracy of a data model that predicts the credity worthiness of potential borrowers from peer-to-peer lending services

Describe the accuracy score, the precision score, and recall score of the machine learning model.
* Balanced Accuracy Score: 95.1% --> this means that when taking into account the sensitivity (recall and/or true positive rate) and specificity (true negative rate) of the model, the balanced prediction accuracy was 95.1%
* Precision Score: 93% --> This means 93% of predicted positives were correct
* Recall Score: 95% --> this means that the model was 95% precise in measuring true positive values our of all positive predictions made

Summarize the results from the machine learning model. 
Include your justification for recommending the model for use by the company. If you don’t recommend the model, justify your reasoning.

I would recommend using this model to predict the credit worthiness of borrowers, because it has over 95% accuracy in predicting the outcome of the repayment of the initial loan. That accuracy range could be easily molded into a business risk profile to ensure sufficient capital flow for the lenders to remain in business/make a profit. Although I recommend this model a further loss analysis should be performed to assess the loss in the incorrect predicts to detemine if losses is within the banks tolerance.