In [1]:
#*****************************************
#** UC Bekeley Data Analytics Bootcamp  **
#**       January 2024 Cohort           **
#**                                     **
#**        Project 4, Group 4           **
#**                                     **
#** N. Shevchenko, T. Brown, B. Beachtel**
#** C. Kamler, J. Haugen, J. Bein       **
#**                                     **
#*****************************************

In [1]:
#*****************************
#** Import the dependencies **
#*****************************

import numpy as np                                                     #this package provides support for large arrays and matrices and mathematical functions
import pandas as pd                                                    #this package provides data structures and data analysis tools.
from pathlib import Path                                               #the pathlib module provides a way to interact with  file system paths (operating system independent) 
from sklearn.metrics import confusion_matrix                           #scikit-learn is a machine learning library. The confusion matrix provides a summary of correct and incorrect predictions
from sklearn.metrics import classification_report                      #classification report provides a summary of the model's peformance, including precision, recall, and F1 score
from sklearn.metrics import accuracy_score                             #this function is used to calculate the accuracy of a classification model by comparing the true labels of the data 
                                                                          #with the predicted labels generated by the model
from sklearn.metrics import ConfusionMatrixDisplay                     #This class is used to display a confusion matrix in a visually appealing way
                                                                        
from sklearn.model_selection import train_test_split                   #the train_test_split function is used to split a dataset into training and testing sets for
                                                                           #model training and evaluation
from sklearn.linear_model import LogisticRegression                    #the LogisticRegression clas is used for performing logistic regression
from sklearn.preprocessing import StandardScaler                       #The StandardScaler class is used for standardizing features by removing the mean and scaling to unit variance.

import tensorflow as tf                                                #TensorFlow is an open-source machine learning framework developed by Google that is widely
                                                                           #used for building and training deep learning models.
from tensorflow.keras.models import Sequential                         #In TensorFlow's Keras API, the Sequential class is used to create a linear stack of layers for building neural network models. 
from tensorflow.keras.layers import Dense, Dropout                     #The Dense class is used to create fully connected layers in a neural network. Each Dense layer 
                                                                            #represents a set of neurons where each neuron is connected to every neuron in the previous layer.
                                                                       #The Dropout class is used to apply dropout regularization to a neural network. Dropout is a 
                                                                            #technique where randomly selected neurons are ignored during training, which helps prevent 
                                                                            #overfitting by improving the generalization of the model. 
from lazypredict.Supervised import LazyClassifier                      #The LazyClassifier class in LazyPredict is a simplified version of a classifier that automatically builds and 
                                                                            #evaluates multiple classification models on a given dataset, providing a quick overview of how different 
                                                                            #models perform without the need for extensive manual configuration.

#Acknowledgement: Xpert Learning Assistant

## Data Preprocessing



### Step 1: Read the `loan.csv` data from the `Resources` folder into a Pandas DataFrame.

In [3]:
#*************************************************************************
#** Read the CSV file from the Resources folder into a Pandas DataFrame **
#*************************************************************************

path = './Resources/loan.csv'
loan_df = pd.read_csv(path)

# Review the DataFrame
loan_df
data_types=loan_df.dtypes
print(data_types)

Loan_ID               object
Gender                object
Married               object
Dependents            object
Education             object
Self_Employed         object
ApplicantIncome        int64
CoapplicantIncome    float64
LoanAmount           float64
Loan_Amount_Term     float64
Credit_History       float64
Property_Area         object
Loan_Status           object
dtype: object


Note:
1. The variable that is the target for our model is "Loan_Status"
2. Currently, all other variables are features for the model.

In [4]:
#***************************************
#** Drop the non-beneficial variables **
#***************************************

columns_to_delete = ['Loan_ID']
loan_df = loan_df.drop(columns = columns_to_delete)
loan_df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [5]:
#**********************************************************
#** Determine the number of unique values in each column **
#**********************************************************

#******************************************************************************
#** For the columns containing categorical data with numerous unique values, **
#** determine the number of data points for each unique value. This might be **
#** an opportunity to use binning.                                           **
#******************************************************************************

unique_counts = loan_df.nunique()
print(unique_counts)

Gender                 2
Married                2
Dependents             4
Education              2
Self_Employed          2
ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           203
Loan_Amount_Term      10
Credit_History         2
Property_Area          3
Loan_Status            2
dtype: int64


In [6]:
#**************************************************
#** breakpoint - make a copy so if things go awry**
#** I don't have to start from the beginning.    **
#**************************************************

#copied_df= loan_df.copy()
#loan_df = copied_df.copy()


In [7]:
#***********************
#**   Data Conversion **
#***********************

#Convert categorical data to numeric with `pd.get_dummies` (One Hot Encoding - OHE)
loan_df_new = pd.get_dummies(loan_df, drop_first=True)
loan_df_new.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Male,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,5849,0.0,,360.0,1.0,True,False,False,False,False,False,False,False,True,True
1,4583,1508.0,128.0,360.0,1.0,True,True,True,False,False,False,False,False,False,False
2,3000,0.0,66.0,360.0,1.0,True,True,False,False,False,False,True,False,True,True
3,2583,2358.0,120.0,360.0,1.0,True,True,False,False,False,True,False,False,True,True
4,6000,0.0,141.0,360.0,1.0,True,False,False,False,False,False,False,False,True,True


In [8]:
#******************************************************
#** determine the number of records with null values **
#******************************************************
#null_df=loan_df_new.isnull()
#null_values=loan_df_new[null_df.any(axis=1)]
#print(null_values)
null_count=loan_df_new.isnull().sum()
print(null_count)

ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Gender_Male                 0
Married_Yes                 0
Dependents_1                0
Dependents_2                0
Dependents_3+               0
Education_Not Graduate      0
Self_Employed_Yes           0
Property_Area_Semiurban     0
Property_Area_Urban         0
Loan_Status_Y               0
dtype: int64


In [9]:
#****************************************************
#drop records for which the loan amount is missing **
#****************************************************
df_cleaned = loan_df_new.dropna(subset=['LoanAmount'])

In [10]:
#**********************************************************************
#split the preprocessed data into our features array and target array**
#**********************************************************************

y=df_cleaned['Loan_Status_Y']
X=df_cleaned.drop(columns='Loan_Status_Y')

In [11]:
#********************************************************
#split the arrays into trainging data and testing data **
#********************************************************
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

In [12]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.

nn_model = tf.keras.models.Sequential()

# First hidden layer
nn_model.add(tf.keras.layers.Dense(units=15, activation="relu", input_dim=14))
nn_model.add(Dropout(0.5))

# Second hidden layer
nn_model.add(tf.keras.layers.Dense(units=10, activation="relu"))
nn_model.add(Dropout(0.5))

# Output layer
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn_model.summary()

In [14]:
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [15]:
# Train the model
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6840 - loss: 0.6989   
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7003 - loss: 0.6847 
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6827 - loss: 0.6783 
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6966 - loss: 0.6730 
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6985 - loss: 0.6644 
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7070 - loss: 0.6576 
Epoch 7/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6867 - loss: 0.6571 
Epoch 8/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6851 - loss: 0.6539 
Epoch 9/100
[1m14/14[0m [32m━━━━━━━

In [16]:
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

5/5 - 0s - 32ms/step - accuracy: 0.6892 - loss: 0.6198
Loss: 0.6197662353515625, Accuracy: 0.6891891956329346


In [17]:
#*****************************
#** Now, use LazyClassifier **
#*****************************
classifier = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit and evaluate models
models, predictions = classifier.fit(X_train, X_test, y_train, y_test)

# Display the results
print(models)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29/29 [00:01<00:00, 22.00it/s]

[LightGBM] [Info] Number of positive: 309, number of negative: 135
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 335
[LightGBM] [Info] Number of data points in the train set: 444, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.695946 -> initscore=0.828066
[LightGBM] [Info] Start training from score 0.828066
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 0.80               0.72     0.72      0.78   
SGDClassifier                      0.77               0.71     0.71      0.77   
LGBMClassifier                     0.79               0.71     0.71      0.78   
CalibratedClassifierCV             0.81        




Try an alternative:
HistGradientBoostingClassifier
AdaBoostClassifier

In [29]:
from sklearn.ensemble import HistGradientBoostingClassifier
hist_gb_classifier = HistGradientBoostingClassifier()
hist_gb_classifier.fit(X_train, y_train)
predictions = hist_gb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7635135135135135
