In [13]:
# Import of libraries and dependencies 

import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Read the `loan_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [14]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("Resources/clean_loan_data.csv")

loan_df = pd.read_csv(file_path)

# Review the DataFrame
loan_df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22,female,Master,71948,0,RENT,35000,PERSONAL,16.02,0.49,3,561,No,1
1,21,female,High School,12282,0,OWN,1000,EDUCATION,11.14,0.08,2,504,Yes,0
2,25,female,High School,12438,3,MORTGAGE,5500,MEDICAL,12.87,0.44,3,635,No,1
3,23,female,Bachelor,79753,0,RENT,35000,MEDICAL,15.23,0.44,2,675,No,1
4,24,male,Master,66135,1,RENT,35000,MEDICAL,14.27,0.53,4,586,No,1


### Created the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [15]:
# Separate the data into labels and features

    # Separate the y variable, the labels
    
y= loan_df['loan_status']

    # Separate the X variable, the features
    
X= loan_df.drop(columns=['loan_status'])

In [16]:
# Review the y variable Series
    # Provided statstical description of the y variable Series data

y.describe()

count    44995.000000
mean         0.222247
std          0.415761
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: loan_status, dtype: float64

In [17]:
# Review the X variable DataFrame
X.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file
0,22,female,Master,71948,0,RENT,35000,PERSONAL,16.02,0.49,3,561,No
1,21,female,High School,12282,0,OWN,1000,EDUCATION,11.14,0.08,2,504,Yes
2,25,female,High School,12438,3,MORTGAGE,5500,MEDICAL,12.87,0.44,3,635,No
3,23,female,Bachelor,79753,0,RENT,35000,MEDICAL,15.23,0.44,2,675,No
4,24,male,Master,66135,1,RENT,35000,MEDICAL,14.27,0.53,4,586,No


In [18]:
# remove id coolumn from data

In [19]:
# reviewed the data types
loan_df.dtypes

person_age                          int64
person_gender                      object
person_education                   object
person_income                       int64
person_emp_exp                      int64
person_home_ownership              object
loan_amnt                           int64
loan_intent                        object
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length          int64
credit_score                        int64
previous_loan_defaults_on_file     object
loan_status                         int64
dtype: object

In [20]:
# Transformed columns person_gender, person_education, person_home_ownership, loan_intent, previous_loan_defaults_on_file using get_dummies
# to change select columns from object in order to run the regression model effectivley.

loans_transformed_df = pd.get_dummies(loan_df, columns=['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file'])

loans_transformed_df.dtypes

person_age                              int64
person_income                           int64
person_emp_exp                          int64
loan_amnt                               int64
loan_int_rate                         float64
loan_percent_income                   float64
cb_person_cred_hist_length              int64
credit_score                            int64
loan_status                             int64
person_gender_female                     bool
person_gender_male                       bool
person_education_Associate               bool
person_education_Bachelor                bool
person_education_Doctorate               bool
person_education_High School             bool
person_education_Master                  bool
person_home_ownership_MORTGAGE           bool
person_home_ownership_OTHER              bool
person_home_ownership_OWN                bool
person_home_ownership_RENT               bool
loan_intent_DEBTCONSOLIDATION            bool
loan_intent_EDUCATION             

### Created new labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the loans_transformed_df.

In [21]:
# Separate the data into labels and features

    # Separate the y variable, the labels
    
y= loans_transformed_df['loan_status']

    # Separate the X variable, the features
    
X= loans_transformed_df.drop(columns=['loan_status'])

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [22]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

---

## Create a Logistic Regression Model with the Original Data

###  Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [23]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
    # Assign a random_state parameter of 1 to the model
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit the model using training data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# The following evaluates the accuracy of the model prior to making predictions 

print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.839358738813489
Testing Data Score: 0.8420304027024624


### Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [25]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)

results=pd.DataFrame({"Prediction": predictions, "Actual": y_test})

results.head()

Unnamed: 0,Prediction,Actual
37927,0,0
19978,0,0
41568,0,0
4214,0,0
28797,0,1


### Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [26]:
# Generate a confusion matrix for the model

confusion_matrix = confusion_matrix(y_test, predictions)

confusion_matrix_df = pd.DataFrame(confusion_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1']) # Creates a confusion matrix and converts it into a pandas DataFrame for easier visualization and analysis. 

display(confusion_matrix_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,8426,323
Actual 1,1454,1046


In [27]:
# Print the classification report for the model

    # The classification report visualizer displays the precision, recall, F1, and support scores for the model
        #  Precision is the ability of a classifier not to label an instance positive that is actually negative
        #  Recall is the ability of a classifier to find all positive instances
        #  The F1 score is a weighted harmonic mean of precision and recall such that the best score is 1.0 and the worst is 0.0
        #  Support is the number of actual occurrences of the class in the specified dataset
            # Reference: (https://medium.com/@kohlishivam5522/understanding-a-classification-report-for-your-machine-learning-model-88815e2ce397)

print("The Classification report")
print("-------------------------")

target_names = ["low-risk loan", "high-risk loan"]

print(classification_report(y_test, predictions, target_names=target_names))

The Classification report
-------------------------
                precision    recall  f1-score   support

 low-risk loan       0.85      0.96      0.90      8749
high-risk loan       0.76      0.42      0.54      2500

      accuracy                           0.84     11249
     macro avg       0.81      0.69      0.72     11249
  weighted avg       0.83      0.84      0.82     11249



### Key Takeaways

The model performs well on predicting low-risk loans, as indicated by the high precision, recall, and F1-score.
The performance on high-risk loans could be improved, as shown by the relatively lower precision, recall, and F1-score for that class.
The accuracy of 86% indicates a generally strong performance, but there's an imbalance between how well the model predicts each class.



---