In [2]:
# Import the modules
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import set_random_seed

2024-07-19 07:41:05.146843: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Optional (Google Colab only): uncomment to mount Drive before reading the dataset.
# from google.colab import drive
# drive.mount('/content/drive')

# Full path of the loan dataset CSV. It is relative, so it is resolved against
# the kernel's current working directory.
path = '../Resources/loan.csv'

In [4]:
# Read the CSV file from the Resources folder into a Pandas DataFrame.
# `path` already holds the complete file path, so it is passed to read_csv as-is.
loan_data = pd.read_csv(path)

# Review the DataFrame
loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [5]:
# Drop the Loan_ID column (a per-row identifier such as 'LP001002').
# errors='ignore' keeps this cell re-runnable once the column is already gone.
columns_to_delete = ['Loan_ID']
loan_data = loan_data.drop(columns=columns_to_delete, errors='ignore')

# Encode the target separately (no one-hot encoding) so it stays a single 0/1 column.
# The mapping is applied only while the column still holds the raw 'N'/'Y' labels:
# mapping an already-encoded column would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(loan_data['Loan_Status']):
    loan_status_encoded = loan_data['Loan_Status'].map({'N': 0, 'Y': 1})

    # Series.map returns NaN for any value missing from the dict; fail loudly
    # instead of carrying NaN labels into training.
    unmapped_rows = loan_status_encoded.isna()
    if unmapped_rows.any():
        unexpected_labels = loan_data.loc[unmapped_rows, 'Loan_Status'].unique().tolist()
        raise ValueError(f"Unexpected Loan_Status values: {unexpected_labels}")

    loan_data['Loan_Status'] = loan_status_encoded.astype('int64')

# Split into train and test BEFORE imputing, so the imputer can be fitted on the
# training rows only and then merely applied to the test rows.
df_train, df_test = train_test_split(loan_data, test_size=0.2, random_state=1)

# Column groups by dtype. Note that the encoded target 'Loan_Status' is int64 and
# is therefore part of numerical_columns.
categorical_columns = loan_data.select_dtypes(include=['object', 'category']).columns
numerical_columns = loan_data.select_dtypes(include=['int64', 'float64']).columns

In [6]:
# Impute missing values in the numeric FEATURE columns with a KNN imputer.
# The target 'Loan_Status' is excluded: leaving it in would let the labels
# (including the test labels) influence the neighbour search, i.e. label leakage.
# errors='ignore' makes this safe even if the target is not in numerical_columns.
imputed_columns = numerical_columns.drop('Loan_Status', errors='ignore')

imputer = KNNImputer(n_neighbors=5)

# Work on explicit copies so the column assignments below never write into a
# view of loan_data (avoids pandas' SettingWithCopyWarning).
df_train = df_train.copy()
df_test = df_test.copy()

# Fit on the training rows only...
df_train[imputed_columns] = imputer.fit_transform(df_train[imputed_columns])

# ...and only transform the test rows, so no test information reaches the imputer.
df_test[imputed_columns] = imputer.transform(df_test[imputed_columns])

In [7]:
# One-hot encode the categorical columns with get_dummies.
dummy_columns = ['Gender', 'Married', 'Dependents', 'Education',
                 'Self_Employed', 'Property_Area']
clean_train_df = pd.get_dummies(df_train, columns=dummy_columns)
clean_test_df = pd.get_dummies(df_test, columns=dummy_columns)

# get_dummies only creates columns for categories present in the frame it is
# given, so the two splits can end up with different columns. Align the test
# frame to the training layout: categories missing from test become all-False
# columns, and categories unseen in training are dropped.
clean_test_df = clean_test_df.reindex(columns=clean_train_df.columns, fill_value=False)

# Export columns to be used in app
# NOTE(review): this list still contains the target 'Loan_Status' -- confirm the app expects it.
columns = clean_train_df.columns.tolist()

# Save the column names, one per line
with open('columns_list.csv', 'w') as f:
    f.writelines(f"{column}\n" for column in columns)

In [8]:
# Split each encoded frame into its target (y) and its features (X).
# Labels: the 'Loan_Status' column of the train and test frames.
y_train, y_test = (frame['Loan_Status'] for frame in (clean_train_df, clean_test_df))

# Features: every remaining column of the train and test frames.
X_train, X_test = (frame.drop(columns='Loan_Status') for frame in (clean_train_df, clean_test_df))

# Display the training features
X_train

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
291,4400.0,0.0,127.0,360.0,0.0,False,True,False,True,False,False,True,False,True,False,True,False,False,True,False
507,3583.0,0.0,96.0,360.0,1.0,False,False,True,False,True,False,False,False,True,False,True,False,False,False,True
328,4333.0,2451.0,110.0,360.0,1.0,True,False,False,True,True,False,False,False,True,False,True,False,False,False,True
609,2900.0,0.0,71.0,360.0,1.0,True,False,True,False,True,False,False,False,True,False,True,False,True,False,False
69,4300.0,0.0,136.0,360.0,0.0,True,False,True,False,True,False,False,False,True,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,6080.0,2569.0,182.0,360.0,1.0,False,True,False,True,True,False,False,False,True,False,True,False,True,False,False
144,11757.0,0.0,187.0,180.0,1.0,False,True,False,True,False,False,True,False,True,False,True,False,False,False,True
72,3500.0,0.0,81.0,300.0,1.0,False,True,True,False,True,False,False,False,True,False,True,False,False,True,False
235,5500.0,1260.0,170.0,360.0,1.0,False,True,False,True,False,True,False,False,True,False,True,False,True,False,False


In [12]:
# Convert Credit_History to boolean in BOTH splits so train and test are
# preprocessed identically before scaling.
# KNN imputation averages the neighbours' values, so an imputed entry can be
# fractional (e.g. 0.2). A plain astype(bool) would turn every non-zero fraction
# into True; thresholding at 0.5 acts as a majority vote instead.
# Original 0.0/1.0 values are unaffected, and re-running the cell is harmless.
X_train['Credit_History'] = X_train['Credit_History'] >= 0.5
X_test['Credit_History'] = X_test['Credit_History'] >= 0.5

# Check data types
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 491 entries, 291 to 37
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ApplicantIncome          491 non-null    float64
 1   CoapplicantIncome        491 non-null    float64
 2   LoanAmount               491 non-null    float64
 3   Loan_Amount_Term         491 non-null    float64
 4   Credit_History           491 non-null    bool   
 5   Gender_Female            491 non-null    bool   
 6   Gender_Male              491 non-null    bool   
 7   Married_No               491 non-null    bool   
 8   Married_Yes              491 non-null    bool   
 9   Dependents_0             491 non-null    bool   
 10  Dependents_1             491 non-null    bool   
 11  Dependents_2             491 non-null    bool   
 12  Dependents_3+            491 non-null    bool   
 13  Education_Graduate       491 non-null    bool   
 14  Education_Not Graduate   491 n

In [20]:
# Categorical (boolean) feature columns
cat_cols = X_train.select_dtypes(include='bool').columns

# Continuous (float64) feature columns.
# The previous filter used `!= 'float64'`, which selected the non-continuous columns.
cont_cols = X_train.select_dtypes(include='float64').columns

# Every feature starts with no predefined options (None)...
choices = {col: None for col in X_train.columns}

# ...and each categorical feature gets the list of values seen in training.
# .tolist() stores plain Python bools rather than NumPy scalars, which keeps the
# pickle and the printed dict free of NumPy-specific types.
for col in cat_cols:
    choices[col] = X_train[col].unique().tolist()

print(choices)

# Save as pkl for app.py
with open('choices.pkl', 'wb') as f:
    pickle.dump(choices, f)


{'ApplicantIncome': None, 'CoapplicantIncome': None, 'LoanAmount': None, 'Loan_Amount_Term': None, 'Credit_History': [False, True], 'Gender_Female': [False, True], 'Gender_Male': [True, False], 'Married_No': [False, True], 'Married_Yes': [True, False], 'Dependents_0': [False, True], 'Dependents_1': [False, True], 'Dependents_2': [True, False], 'Dependents_3+': [False, True], 'Education_Graduate': [True, False], 'Education_Not Graduate': [False, True], 'Self_Employed_No': [True, False], 'Self_Employed_Yes': [False, True], 'Property_Area_Rural': [False, True], 'Property_Area_Semiurban': [True, False], 'Property_Area_Urban': [False, True]}


In [8]:
# Standardise the features: learn the scaling statistics from the training
# features only, then apply that same fitted scaler to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Persist the fitted scaler to disk
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [9]:
# Build, train and evaluate the neural network.
#
# Architecture:
# - Dense(64, ReLU): ReLU is a common hidden-layer activation; it lets the model
#   learn nonlinear relationships and mitigates the vanishing gradient problem.
# - Dropout(0.5): randomly disables 50% of the units at each training step to
#   reduce overfitting.
# - Dense(32, ReLU): a second hidden layer for more complex patterns.
# - Dropout(0.5): additional overfitting prevention.
# - Dense(1, sigmoid): binary classification (Loan_Status approved or not); the
#   sigmoid output is a value in [0, 1], read as the probability of the positive class.

# Seed Python, NumPy and the backend so weight initialisation, dropout masks and
# batch shuffling are repeatable across Restart & Run All.
set_random_seed(1)

# The input shape is declared with an explicit Input object; passing `input_dim`
# to the first Dense layer is deprecated and triggers a Keras warning.
neural_network_model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

# Compiling the model
neural_network_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Training the model (20% of the training rows are held out for validation)
neural_network_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluating the model on test data
loss, accuracy = neural_network_model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.4415 - loss: 0.8371 - val_accuracy: 0.5758 - val_loss: 0.6772
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5626 - loss: 0.7231 - val_accuracy: 0.6869 - val_loss: 0.6349
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6530 - loss: 0.6737 - val_accuracy: 0.6970 - val_loss: 0.6149
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6150 - loss: 0.6961 - val_accuracy: 0.7071 - val_loss: 0.6030
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6463 - loss: 0.6796 - val_accuracy: 0.7273 - val_loss: 0.5930
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6574 - loss: 0.6595 - val_accuracy: 0.7273 - val_loss: 0.5868
Epoch 7/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━

In [10]:
# Persist the trained network as a pickle file.
#
# Alternatives that were tried and are currently disabled:
#   neural_network_model.save('model.h5')                   # whole model
#   neural_network_model.to_json() -> model_architecture.json  # architecture only
#   neural_network_model.save_weights('model.weights.h5')   # weights only
model = neural_network_model

with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)