In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Location of the lending data, relative to this notebook, so the notebook runs
# on any machine instead of depending on one user's absolute directory layout.
file_path = Path("Resources/lending_data.csv")

# Read the CSV file from the Resources folder into a Pandas DataFrame
df = pd.read_csv(file_path)

# Review the DataFrame
print(df.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  loan_status  
0                 1       22800            0  
1                 0       13600            0  
2                 0       16100            0  
3                 1       22700            0  
4                 1       23000            0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [3]:
# Features (X): every column except the loan_status target
X = df.drop('loan_status', axis=1)

# Labels (y): the loan_status column on its own
y = df['loan_status']

# Preview the first rows of each piece
print("Labels (y):", y.head(), sep="\n")
print("\nFeatures (X):", X.head(), sep="\n")

Labels (y):
0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

Features (X):
   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
0    10700.0          7.672            52800        0.431818                5   
1     8400.0          6.692            43600        0.311927                3   
2     9000.0          6.963            46100        0.349241                3   
3    10700.0          7.664            52700        0.430740                5   
4    10800.0          7.698            53000        0.433962                5   

   derogatory_marks  total_debt  
0                 1       22800  
1                 0       13600  
2                 0       16100  
3                 1       22700  
4                 1       23000  


In [4]:
# Review the y variable Series: summary statistics, a short preview, and the
# number of rows carrying each label value.
print("Labels (y) overview:", y.describe(), sep="\n")
print("\nFirst 10 values of y:", y.head(10), sep="\n")
print("\nValue counts of y:", y.value_counts(), sep="\n")

Labels (y) overview:
count    77536.000000
mean         0.032243
std          0.176646
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: loan_status, dtype: float64

First 10 values of y:
0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: loan_status, dtype: int64

Value counts of y:
loan_status
0    75036
1     2500
Name: count, dtype: int64


In [5]:
# Review the X variable DataFrame: summary statistics, a short preview, and
# the column/dtype report.
print("Features (X) overview:")
print(X.describe())
print("\nFirst 10 rows of X:")
print(X.head(10))
print("\nInformation about X:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; print(X.info()) would append a stray "None" line.
X.info()

Features (X) overview:
          loan_size  interest_rate  borrower_income  debt_to_income  \
count  77536.000000   77536.000000     77536.000000    77536.000000   
mean    9805.562577       7.292333     49221.949804        0.377318   
std     2093.223153       0.889495      8371.635077        0.081519   
min     5000.000000       5.250000     30000.000000        0.000000   
25%     8700.000000       6.825000     44800.000000        0.330357   
50%     9500.000000       7.172000     48100.000000        0.376299   
75%    10400.000000       7.528000     51400.000000        0.416342   
max    23800.000000      13.235000    105200.000000        0.714829   

       num_of_accounts  derogatory_marks    total_debt  
count     77536.000000      77536.000000  77536.000000  
mean          3.826610          0.392308  19221.949804  
std           1.904426          0.582086   8371.635077  
min           0.000000          0.000000      0.000000  
25%           3.000000          0.000000  14800.0000

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [6]:
# train_test_split is already imported in the notebook's first (imports) cell,
# so it is not imported again here.

# Split the data using train_test_split and assign a random_state of 1
# (test_size=0.2 holds out 20% of the rows for testing).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Sanity check: the two pieces together account for every row exactly once
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)

# Verify the shape of the resulting datasets
print("Training feature set shape:", X_train.shape)
print("Testing feature set shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Testing labels shape:", y_test.shape)

Training feature set shape: (62028, 7)
Testing feature set shape: (15508, 7)
Training labels shape: (62028,)
Testing labels shape: (15508,)


---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [7]:
# LogisticRegression is already imported in the notebook's first (imports)
# cell, so it is not imported again here.

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_model = LogisticRegression(random_state=1)

# Fit the model using training data
logistic_model.fit(X_train, y_train)

# Verify the fit by checking the coefficients and intercept
print("Model coefficients:", logistic_model.coef_)
print("Model intercept:", logistic_model.intercept_)


Model coefficients: [[-1.07343332e-05 -1.11821247e-07 -3.86442644e-04 -2.57250652e-09
   1.61411871e-07  5.41492664e-08  6.42898333e-04]]
Model intercept: [-3.43113659e-08]


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [8]:
# Predict a label for every row of the held-out test features
y_pred = logistic_model.predict(X_test)

# Show the first ten predicted labels as a quick check
print("Predictions for the testing data:", y_pred[:10], sep="\n")

Predictions for the testing data:
[0 0 0 0 0 0 0 0 0 0]


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [9]:
# confusion_matrix and classification_report are already imported in the
# notebook's first (imports) cell, so they are not imported again here.

# Generate a confusion matrix for the model
# (scikit-learn convention: rows are actual labels, columns are predicted labels)
cm = confusion_matrix(y_test, y_pred)

# Print the confusion matrix
print("Confusion Matrix:")
print(cm)

# Generate and print the classification report
cr = classification_report(y_test, y_pred)

print("\nClassification Report:")
print(cr)

Confusion Matrix:
[[14926    75]
 [   46   461]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15001
           1       0.86      0.91      0.88       507

    accuracy                           0.99     15508
   macro avg       0.93      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508



In [10]:
# Print the classification report for the model.
# classification_report is already imported in the notebook's first (imports)
# cell, so it is not imported again here.

# Generate and print the classification report
classification_rep = classification_report(y_test, y_pred)

print("Classification Report:")
print(classification_rep)


Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15001
           1       0.86      0.91      0.88       507

    accuracy                           0.99     15508
   macro avg       0.93      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** 

The logistic regression model predicts the `0` (healthy loan) label almost perfectly and the `1` (high-risk loan) label well, though noticeably less reliably, as evidenced by the confusion matrix and the classification report.

#### For Healthy Loans (0)

* **Precision:** The precision for predicting healthy loans is 1.00 (rounded), indicating that nearly all loans predicted as healthy are indeed healthy: 14,926 of 14,972. Only 46 high-risk loans were mislabeled as healthy.
* **Recall:** The recall for predicting healthy loans is 1.00 (rounded), meaning the model identifies nearly all actual healthy loans: 14,926 of 15,001. Only 75 healthy loans were mislabeled as high-risk.
* **F1-Score:** The F1-score for healthy loans is 1.00 (rounded), demonstrating a near-perfect balance between precision and recall for this class.
#### For High-Risk Loans (1)

* **Precision:** The precision for predicting high-risk loans is 0.86, indicating that 86% of the loans predicted as high-risk are actually high-risk (461 of 536). The other 75 are healthy loans incorrectly flagged as high-risk (false positives).
* **Recall:** The recall for predicting high-risk loans is 0.91, which means the model correctly identifies 91% of the actual high-risk loans (461 of 507). The other 46 high-risk loans are missed and labeled as healthy (false negatives).
* **F1-Score:** The F1-score for high-risk loans is 0.88, which reflects a good balance between precision and recall for this class, though not perfect.
#### Overall Model Performance

* **Accuracy:** The overall accuracy of the model is 0.99, indicating that 99% of all predictions (both healthy and high-risk) are correct. Because roughly 97% of the loans are healthy, this figure is dominated by the healthy class and should not be read on its own.
* **Macro Average:** The macro average F1-score is 0.94, representing the arithmetic mean of the F1-scores for both classes, providing an overall measure of model performance that weights each class equally.
* **Weighted Average:** The weighted average F1-score is 0.99, which accounts for the proportion of each class in the dataset, indicating that the model is highly effective in making predictions for the typical (healthy) loan.
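#### Summary

The logistic regression model demonstrates excellent performance in predicting healthy loans, with precision and recall that both round to 1.00. It also performs well in predicting high-risk loans (precision 0.86, recall 0.91), although there is room for improvement in reducing both false positives (healthy loans flagged as high-risk) and false negatives (high-risk loans that are missed). Overall, the model achieves a high accuracy rate, but because the dataset is heavily imbalanced toward healthy loans, the precision and recall for the high-risk class are the more informative measures of how well it performs.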




---