# __Problem Statement__
Identify the customers eligible for loan amounts.
# __Hypothesis__


# __Getting the system ready and loading the data (Prepare Data)__

## Import libraries

In [1]:
# Supress warnings
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

import pandas as pd
import skimpy as sk #for data profiling

### Collect data

In [2]:
df = pd.read_csv('./data-for-project-1/raw_data.csv') #read the raw data from the raw_data.csv file
df.head() #show all features with first few rows of data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## Preprocessing and cleaning the data

### Removing irrelevant features

The Loan_ID feature will be irrelevant to our models, thus we can remove it.

In [3]:
irrelevant_features = 'Loan_ID' #feature to be removed
#removing of feature
df.drop(
    columns=irrelevant_features,
    inplace=True
)
#display data without Loan_ID
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


### Change the target, Loan_Status, and Credit_History to int values

In [4]:

df['Loan_Status'].replace({'Y': 1, 'N': 0},inplace = True)

df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,1
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,0
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,1
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,1
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,1


### Checking the cardinality of the features

In [5]:
# checking the cardinality of features
feature_cardinality = df.select_dtypes("object").nunique()
feature_cardinality

Gender           2
Married          2
Dependents       4
Education        2
Self_Employed    2
Property_Area    3
dtype: int64

There is no need to handle the cardinality of the features as no features have very low cardinality or very high cardinality.

### Understanding and profiling the data
Use the skimpy library to profile the data

In [6]:
sk.skim(df) #profile the dataframe of raw data

According to the skimpy summary above, we notice that there are 3 numeric features and 9 categorical features (which we will be encoding for better data understanding). The features LoanAmount, Loan-Amount_Term, Credit_History, Gender, Married, Dependents, and Self_Employed have missing values. Credit_History must also be changed to a string feature as it is categorical.

### Handling missing values

In [7]:

df['LoanAmount'].fillna(
    df['LoanAmount']
    .dropna()
    .mean(),
    inplace=True
)

col_cat = ['Gender','Married','Dependents','Self_Employed','Credit_History','Loan_Amount_Term']
for col in col_cat:
    df[col].fillna(
        df[col]
        .mode()[0],
        inplace=True
    )

sk.skim(df)

### Encoding categorical variables

In [8]:
from category_encoders import OneHotEncoder

ohe = OneHotEncoder(
    use_cat_names=True,
    cols=['Gender','Married','Dependents','Self_Employed','Education','Property_Area','Credit_History','Loan_Amount_Term']
)

encoded_df = ohe.fit_transform(df)
encoded_df.head()

Unnamed: 0,Gender_Male,Gender_Female,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,...,Loan_Amount_Term_480.0,Loan_Amount_Term_36.0,Loan_Amount_Term_84.0,Loan_Amount_Term_12.0,Credit_History_1.0,Credit_History_0.0,Property_Area_Urban,Property_Area_Rural,Property_Area_Semiurban,Loan_Status
0,1,0,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,1
1,1,0,0,1,0,1,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
2,1,0,0,1,1,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,1
3,1,0,0,1,1,0,0,0,0,1,...,0,0,0,0,1,0,1,0,0,1
4,1,0,1,0,1,0,0,0,1,0,...,0,0,0,0,1,0,1,0,0,1


In [9]:
encoded_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender_Male              614 non-null    int64  
 1   Gender_Female            614 non-null    int64  
 2   Married_No               614 non-null    int64  
 3   Married_Yes              614 non-null    int64  
 4   Dependents_0             614 non-null    int64  
 5   Dependents_1             614 non-null    int64  
 6   Dependents_2             614 non-null    int64  
 7   Dependents_3+            614 non-null    int64  
 8   Education_Graduate       614 non-null    int64  
 9   Education_Not Graduate   614 non-null    int64  
 10  Self_Employed_No         614 non-null    int64  
 11  Self_Employed_Yes        614 non-null    int64  
 12  ApplicantIncome          614 non-null    int64  
 13  CoapplicantIncome        614 non-null    float64
 14  LoanAmount               6

### Creating a prepare data function

In [10]:
def prepare_data(path):
    prep_df = pd.read_csv(path) #read the raw data from the raw_data.csv file
    
    irrelevant_features = 'Loan_ID' #feature to be removed
    #removing of feature
    prep_df.drop(
        columns=irrelevant_features,
        inplace=True
    )

    prep_df['Loan_Status'].replace({'Y': 1, 'N': 0},inplace = True)#replace loan status values with int values

    #Handling of missing data
    prep_df['LoanAmount'].fillna(
    prep_df['LoanAmount']
    .dropna()
    .mean(),
    inplace=True
    )

    col_cat = ['Gender','Married','Dependents','Self_Employed','Credit_History','Loan_Amount_Term']
    for col in col_cat:
        prep_df[col].fillna(
            prep_df[col]
            .mode()[0],
            inplace=True
        )

    #clean column names
    from skimpy import clean_columns
    return clean_columns(prep_df)

### Calling the prepare_data function and writing to a file

In [11]:
prepared_df = prepare_data('./data-for-project-1/raw_data.csv')
prepared_df.to_csv('./data-for-project-1/prepared_data.csv')

## __Data exploration__

In [12]:
# For Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sn

### High Collinearity

In [13]:
corr_df = df.select_dtypes("number").corr() #checking the correlation between the number features
corr_df

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status
ApplicantIncome,1.0,-0.116605,0.56562,-0.046531,-0.018615,-0.00471
CoapplicantIncome,-0.116605,1.0,0.187828,-0.059383,0.011134,-0.059187
LoanAmount,0.56562,0.187828,1.0,0.036475,-0.001431,-0.036416
Loan_Amount_Term,-0.046531,-0.059383,0.036475,1.0,-0.004705,-0.022549
Credit_History,-0.018615,0.011134,-0.001431,-0.004705,1.0,0.540556
Loan_Status,-0.00471,-0.059187,-0.036416,-0.022549,0.540556,1.0


In [14]:
fig = px.imshow(corr_df, color_continuous_scale='Spectral')
fig.update_layout(title='Heat Map: Correlation of Features', font=dict(size=12))
fig.show()

From the correlation matrix displayed, we can see that there is high multi-collinearity on Credit_History against the Loan_Status. As Loan_Status is our target, we could look to drop the Credit_History feature. We also notice high multi-collinearity on LoanAmount against ApplicantIncome. Since income is more important for loan eligibility, we could look to drop the LoanAmount feature.

### Univariate analysis

In [15]:
# Prepare data to display
labels = (
    df['Loan_Status']
    .astype('str')
    .str.replace('0','No', regex=True)
    .str.replace('1','Yes', regex=True)
    .value_counts()
)

# Create figure using Plotly
fig = px.bar(
    data_frame=labels, 
    x=labels.index, 
    y=labels.values, 
    title=f'Class Imbalance', 
    color=labels.index
)

# Add titles & Display figure
fig.update_layout(xaxis_title='Loan Status', yaxis_title='Number of Customers')
fig.show()

The BC Finance company would like to decrease the number of customers who are not eligible for a loan, which is depicted by the orange bar labeled N in the graph above.

### Bivariate/Multivariate analysis

#### Analysis of numeric features 

In [16]:
#Gaining insights on which features to use as numeric and categorical
df.select_dtypes('number').nunique()

ApplicantIncome      505
CoapplicantIncome    287
LoanAmount           204
Loan_Amount_Term      10
Credit_History         2
Loan_Status            2
dtype: int64

In [17]:
# Select features to plot
plot_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

# Plot numeric features against target
plt.Figure(figsize=(3,4))
for col in plot_cols:
    fig = px.box(data_frame=df[plot_cols], x=col, color=df['Loan_Status'], title=f'BoxPlot for {col} Feature against the Target')
    fig.update_layout(xaxis_title=f'{col} Feature')
    fig.show()

##### Visualization of numeric features without outliers

In [18]:
mask_appincome = df['ApplicantIncome'] < 6000 #mask for filtering the ApplicantIncome feature
df_mask1 = df[mask_appincome] #filtered dataframe from the applicant income mask 
mask_coincome = df['CoapplicantIncome'] < 2300 #mask for filtering the CoapplicantIncome feature
df_mask2 = df[mask_coincome] #filtered dataframe from the coapplicant income mask
mask_loanamount_upper = df['LoanAmount'] < 175 #mask for filtering the LoanAmount feature
mask_loanamount_lower = df['LoanAmount'] > 35
df_mask3 = df[mask_loanamount_upper & mask_loanamount_lower] #filtered dataframe from the loan amount mask

plot_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount']

plt.Figure(figsize=(3,4))
for col in plot_cols:
    if col == 'ApplicantIncome':
        fig = px.box(data_frame=df_mask1, x=col, color= df_mask1['Loan_Status'],
        title=f'BoxPlot for {col} Feature against the Target without outliers')
        fig.update_layout(xaxis_title=f'{col} Feature')
        fig.show()
    elif col == 'CoapplicantIncome':
        fig = px.box(data_frame=df_mask2, x=col, color= df_mask2['Loan_Status'],
        title=f'BoxPlot for {col} Feature against the Target without outliers')
        fig.update_layout(xaxis_title=f'{col} Feature')
        fig.show()
    else:
        fig = px.box(data_frame=df_mask3, x=col, color= df_mask3['Loan_Status'],
        title=f'BoxPlot for {col} Feature against the Target without outliers')
        fig.update_layout(xaxis_title=f'{col} Feature')
        fig.show()

#### Analysis of categorical features

In [19]:
cat_col = ['Loan_Amount_Term','Credit_History','Gender','Married','Dependents','Education','Self_Employed','Property_Area']
for col in cat_col:
    # Aggregate Category Feature
    new_df = pd.DataFrame(
        df[[col, 'Loan_Status']]
        .groupby(['Loan_Status'])
        .value_counts()
        .reset_index()
        )
    
    # Plot Category feature vs label
    fig = px.bar(
        data_frame=new_df, 
        x=col, 
        y='count', 
        facet_col='Loan_Status', 
        color=new_df['Loan_Status'].astype(str), # convert it to string to avoid continuous scale on legend
        title=f'{col} vs Target'
    )

    fig.update_layout(xaxis_title=col, yaxis_title='Number of Customers')
    fig.show()

## __Evaluation metrics for this classification problem__

Libraries to be used during classification evalutions:

In [20]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from category_encoders import OneHotEncoder

# Metric variables to be used for output
accuracy_scores = []
precisions = []
f1_scores = []
recalls = []

Datasets to be used for the Logistic Regression and Decision Tree alogrithms:

In [21]:
# Split loan status data
label = 'loan_status'
x = prepared_df.drop(columns=[label], inplace=False)
y = prepared_df[label]

x_Train, x_Val, y_Train, y_Val = train_test_split(x, y, test_size=0.4, random_state=42)

print(
    f'Training dataset \
    \nx_Train: {x_Train.shape[0]/len(x)*100:.0f}% \ny_Train: {y_Train.shape[0]/len(x)*100:.0f}% \
    \n\nValidation dataset \
    \nx_Val: {x_Val.shape[0]/len(x)*100:.0f}% \ny_Val: {y_Val.shape[0]/len(x)*100:.0f}%'
)

Training dataset     
x_Train: 60% 
y_Train: 60%     

Validation dataset     
x_Val: 40% 
y_Val: 40%


### Baseline accuracy

In [22]:
accuracy_Base = y_Train.value_counts(normalize=True).max()

print("Baseline Accuracy:", round(accuracy_Base, 2))

Baseline Accuracy: 0.71


### Logistic Regression evaluation

In [23]:
# Model building and fitting
regression_Model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    LogisticRegression(max_iter=2500)
)
regression_Model.fit(x_Train, y_Train)

# Display accuracy scores
lr_train_acc = regression_Model.score(x_Train, y_Train)
lr_val_acc = regression_Model.score(x_Val, y_Val)
print("Logistic Regression training accuracy:", lr_train_acc)
print("Logistic Regression validation accuracy:", lr_val_acc)

y_val_pred_reg = regression_Model.predict(x_Val)

accuracy_scores.append(accuracy_score(y_Val, y_val_pred_reg).round(4)), 
precisions.append(precision_score(y_Val, y_val_pred_reg).round(4)), 
recalls.append(recall_score(y_Val, y_val_pred_reg).round(4)), 
f1_scores.append(f1_score(y_Val, y_val_pred_reg).round(4))

Logistic Regression training accuracy: 0.8179347826086957
Logistic Regression validation accuracy: 0.7926829268292683


### Decision Tree evaluation

In [24]:
depth_hyperpar = range(1, 8)

# List of scores per each set for visualization purpose
training_Scores = []
validation_Scores = []

for d in depth_hyperpar:
    # Model building and fitting
    tree_Model = make_pipeline(
        OneHotEncoder(use_cat_names=True),
        DecisionTreeClassifier(max_depth=d, random_state=42)
    )
    tree_Model.fit(x_Train, y_Train)
    
    # Calculate training accuracy score and append to `training_scores`
    training_Scores.append(tree_Model.score(x_Train, y_Train))
    
    # Calculate validation accuracy score and append to `validation_scores`
    validation_Scores.append(tree_Model.score(x_Val, y_Val))

tune_data = pd.DataFrame(
    data = {'Training': training_Scores, 'Validation': validation_Scores}, 
    index=depth_hyperpar
)

fig = px.line(
    data_frame=tune_data, 
    x=depth_hyperpar, 
    y=['Training', 'Validation'], 
    title="Decision Tree Model training & validation curves"
)
fig.update_layout(xaxis_title ="Maximum Depth", yaxis_title="Accuracy Score")
fig.show()

y_val_pred_tree = tree_Model.predict(x_Val)

accuracy_scores.append(accuracy_score(y_Val, y_val_pred_tree).round(4)), 
precisions.append(precision_score(y_Val, y_val_pred_tree).round(4)), 
recalls.append(recall_score(y_Val, y_val_pred_tree).round(4)), 
f1_scores.append(f1_score(y_Val, y_val_pred_tree).round(4))

In [25]:
# Compare evaluation metrics for Logistic regression and Decision tree 
metrics = {
        'Accuracy': accuracy_scores,
        'Precision': precisions,
        'F1-Score': f1_scores, 
        'Recall': recalls
    }

pd.DataFrame(
    data=metrics, 
    index=['Logistic Regression', 'Decision Tree']
).sort_values(
    by='Accuracy', 
    ascending=False
)

Unnamed: 0,Accuracy,Precision,F1-Score,Recall
Logistic Regression,0.7927,0.767,0.861,0.9814
Decision Tree,0.7195,0.7396,0.8045,0.882


In [26]:
import joblib

# Saving first iteration as Logistic Regression model
joblib.dump(regression_Model, './artifacts/model_1.pkl')

['./artifacts/model_1.pkl']

Here we find that the Logistic Regression could be a better model to make use of, and it is saved as the first iteration.

However, the Deep Learning algorithm model is yet to be evaluated.

The building and evaluation of the Deep Learning algorithm model is done under Model Building Part 1 below, without feature engineering.

## __Model building part 1__

In [27]:
# Columns to be encoded for deep learning algorithm
categorical_columns = ['gender','married','dependents','education','self_employed','loan_amount_term','credit_history','property_area'] 

ohe = OneHotEncoder(cols=categorical_columns)

# Split loan status encoded data for deep learning
X_encoded = ohe.fit_transform(prepared_df)
y_deep = prepared_df[label]

X_train_deep, X_val_deep, y_train_deep, y_val_deep = train_test_split(X_encoded, y_deep, test_size=0.4, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_deep)
X_val_scaled = scaler.transform(X_val_deep)

# Building, compiling and training the deep learning model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train_scaled, y_train_deep, epochs=20)

# Evaluating deep learning
y_pred_deep_probability = model.predict(X_val_scaled)
y_pred_deep = (y_pred_deep_probability > 0.5).astype(int)

accuracy_scores.append(accuracy_score(y_val_deep, y_pred_deep).round(4)), 
precisions.append(precision_score(y_val_deep, y_pred_deep).round(4)), 
recalls.append(recall_score(y_val_deep, y_pred_deep).round(4)), 
f1_scores.append(f1_score(y_val_deep, y_pred_deep).round(4))

Epoch 1/20



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4918 - loss: 0.8439
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6014 - loss: 0.7129 
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6884 - loss: 0.6084 
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7025 - loss: 0.6061 
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7873 - loss: 0.5206 
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7547 - loss: 0.4851 
Epoch 7/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7562 - loss: 0.5400 
Epoch 8/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7886 - loss: 0.4654 
Epoch 9/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [28]:
# Comparing Deep Learning to previous algorithms
metrics = {
        'Accuracy': accuracy_scores,
        'Precision': precisions,
        'F1-Score': f1_scores, 
        'Recall': recalls
    }

pd.DataFrame(
    data=metrics, 
    index=['Logistic Regression', 'Decision Tree','Deep Learning']
).sort_values(
    by='Accuracy', 
    ascending=False
)

Unnamed: 0,Accuracy,Precision,F1-Score,Recall
Deep Learning,0.9715,0.9639,0.9786,0.9938
Logistic Regression,0.7927,0.767,0.861,0.9814
Decision Tree,0.7195,0.7396,0.8045,0.882


We find that the Deep Learning model is significantly more useful than the previous classification models

As such, next we save the Deep Learning model without the feature engineering as our second iteration.

In [29]:
# Saving second iteration of Deep Learning model
joblib.dump(model, './artifacts/model_2.pkl')

['./artifacts/model_2.pkl']

## __Feature engineering__

## __Model building part 2__