In [None]:
# Import of libraries and dependencies 

import pandas as pd
import numpy as np
import hvplot.pandas

from pathlib import Path
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


---

## Split the Data into Training and Testing Sets

### Step 1: Read the `loan_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [9]:
# Define the file path
file_path = Path("Resources/loan_data.csv")

# Read the CSV file into a DataFrame
loans_df = pd.read_csv(file_path)

# Display the DataFrame
loans_df

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,27.0,male,Associate,47971.0,6,RENT,15000.0,MEDICAL,15.66,0.31,3.0,645,No,1
44996,37.0,female,Associate,65800.0,17,RENT,9000.0,HOMEIMPROVEMENT,14.07,0.14,11.0,621,No,1
44997,33.0,male,Associate,56942.0,7,RENT,2771.0,DEBTCONSOLIDATION,10.02,0.05,10.0,668,No,1
44998,29.0,male,Bachelor,33164.0,4,RENT,12000.0,EDUCATION,13.23,0.36,6.0,604,No,1


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [13]:
# Check the DataFrame data types
loans_df.dtypes

person_age                        float64
person_gender                      object
person_education                   object
person_income                     float64
person_emp_exp                      int64
person_home_ownership              object
loan_amnt                         float64
loan_intent                        object
loan_int_rate                     float64
loan_percent_income               float64
cb_person_cred_hist_length        float64
credit_score                        int64
previous_loan_defaults_on_file     object
loan_status                         int64
dtype: object

In [52]:
# Encoding select dtypes

loans_df_encoded = pd.get_dummies(loans_df, columns=['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file'])
loans_df_encoded.columns

Index(['person_age', 'person_income', 'person_emp_exp', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'loan_status', 'person_gender_female',
       'person_gender_male', 'person_education_Associate',
       'person_education_Bachelor', 'person_education_Doctorate',
       'person_education_High School', 'person_education_Master',
       'person_home_ownership_MORTGAGE', 'person_home_ownership_OTHER',
       'person_home_ownership_OWN', 'person_home_ownership_RENT',
       'loan_intent_DEBTCONSOLIDATION', 'loan_intent_EDUCATION',
       'loan_intent_HOMEIMPROVEMENT', 'loan_intent_MEDICAL',
       'loan_intent_PERSONAL', 'loan_intent_VENTURE',
       'previous_loan_defaults_on_file_No',
       'previous_loan_defaults_on_file_Yes'],
      dtype='object')

In [53]:
# Multi-Output Classification:
#from sklearn.multioutput import MultiOutputClassifier
#from sklearn.ensemble import RandomForestClassifier

# Separate the data into labels and features
    # Separate the y variable, the labels
    
# y= loans_df_encoded[['loan_status_0', 'loan_status_1']]
y= loans_df_encoded['loan_status']
    # Separate the X variable, the features
    
X= loans_df_encoded.drop(columns=['loan_status'])

In [54]:
# Review the y variable Series
    # Provided statstical description of the y variable Series data

y.describe()

count    45000.000000
mean         0.222222
std          0.415744
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: loan_status, dtype: float64

In [None]:
# Review the X variable DataFrame
X.columns

Index(['person_age', 'person_income', 'person_emp_exp', 'loan_amnt',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'person_gender_female', 'person_gender_male',
       'person_education_Associate', 'person_education_Bachelor',
       'person_education_Doctorate', 'person_education_High School',
       'person_education_Master', 'person_home_ownership_MORTGAGE',
       'person_home_ownership_OTHER', 'person_home_ownership_OWN',
       'person_home_ownership_RENT', 'loan_intent_DEBTCONSOLIDATION',
       'loan_intent_EDUCATION', 'loan_intent_HOMEIMPROVEMENT',
       'loan_intent_MEDICAL', 'loan_intent_PERSONAL', 'loan_intent_VENTURE',
       'previous_loan_defaults_on_file_No',
       'previous_loan_defaults_on_file_Yes'],
      dtype='object')

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [71]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [91]:
# Import the LogisticRegression module from SKLearn
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
    # Assign a random_state parameter of 1 to the model
# Initialize the classifier

class_weights = {0: 2, 1: 5}  # 1 for low-risk, 3 for high-risk

classifier = LogisticRegression(solver='lbfgs', class_weight=class_weights, random_state=1)

# Fit the model using training data
classifier .fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [92]:
# The following evaluates the accuracy of the model prior to making predictions 

print(f"Training Data Score: {classifier .score(X_train, y_train)}")
print(f"Testing Data Score: {classifier .score(X_test, y_test)}")

Training Data Score: 0.817037037037037
Testing Data Score: 0.8192


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [93]:
# Make a prediction using the testing data
predictions = classifier.predict(X_test)
results=pd.DataFrame({"Prediction": predictions, "Actual": y_test})

results.head()

Unnamed: 0,Prediction,Actual
26978,0,0
41476,0,0
15643,1,0
8163,0,0
12462,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [94]:
# Generate a confusion matrix for the model
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix = confusion_matrix(y_test, predictions)
confusion_matrix_df = pd.DataFrame(confusion_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1']) # Creates a confusion matrix and converts it into a pandas DataFrame for easier visualization and analysis. 

display(confusion_matrix_df)

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,7455,1295
Actual 1,739,1761


In [None]:
# Breakdown of the results of the confusion matrix:

    # True Negatives :  8167 (Predicted 0, Actual 0)
    # False Positives :  583 (Predicted 1, Actual 0)
    # False Negatives : 1029 (Predicted 0, Actual 1)
    # True Positives :  1471 (Predicted 1, Actual 1)

In [95]:
# Print the classification report for the model

print("The Classification report")
print("-------------------------")

target_names = ["low-risk loan", "high-risk loan"]

print(classification_report(y_test, predictions, target_names=target_names))

The Classification report
-------------------------
                precision    recall  f1-score   support

 low-risk loan       0.91      0.85      0.88      8750
high-risk loan       0.58      0.70      0.63      2500

      accuracy                           0.82     11250
     macro avg       0.74      0.78      0.76     11250
  weighted avg       0.84      0.82      0.83     11250



---