In [3]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

---

In [4]:
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/MyDrive/Data_project4/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Preprocessing



### Step 1: Read the `loan.csv` data from the `Resources` folder into a Pandas DataFrame.

In [5]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path(f'{path}loan.csv')
loan_data = pd.read_csv(file_path)

# Review the DataFrame
loan_data

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,LP002978,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,LP002979,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,LP002983,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,LP002984,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [6]:
# Drop Loan_ID column

columns_to_delete = ['Loan_ID']

loan_data = loan_data.drop(columns = columns_to_delete)


In [7]:
# Separately transforming the Loan_Status column without using OHE, since it needs to remain in one copy

loan_data['Loan_Status'] = loan_data['Loan_Status'].map({'N': 0, 'Y': 1})

In [8]:
# Split to train and test. Splitting before knn imputing required for fitting imputer on train and fill NaNs using it on test

df_train, df_test = train_test_split(loan_data, test_size=0.2, random_state=1)

In [9]:
# Looking at the missing values
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Gender             601 non-null    object 
 1   Married            611 non-null    object 
 2   Dependents         599 non-null    object 
 3   Education          614 non-null    object 
 4   Self_Employed      582 non-null    object 
 5   ApplicantIncome    614 non-null    int64  
 6   CoapplicantIncome  614 non-null    float64
 7   LoanAmount         592 non-null    float64
 8   Loan_Amount_Term   600 non-null    float64
 9   Credit_History     564 non-null    float64
 10  Property_Area      614 non-null    object 
 11  Loan_Status        614 non-null    int64  
dtypes: float64(4), int64(2), object(6)
memory usage: 57.7+ KB


In [10]:
# get categorical and numerical columns
categorical_columns = loan_data.select_dtypes(include=['object', 'category']).columns
numerical_columns = loan_data.select_dtypes(include=['int64', 'float64']).columns



In [11]:
#Impute missing values for continuous data only (LoanAmount, Loan_Amount_Term)

from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)

# Apply the KNN imputer to the numerical columns
df_train[numerical_columns] = imputer.fit_transform(df_train[numerical_columns])

# apply to test !IMPORTANT! for prevent data leak
df_test[numerical_columns] = imputer.transform(df_test[numerical_columns])



In [12]:
# Looking at the number of unique values ​​of categorical variables; if there are few of them, this allows us to use OHE
for i in categorical_columns:
    print(i, loan_data[i].nunique())

Gender 2
Married 2
Dependents 4
Education 2
Self_Employed 2
Property_Area 3


In [13]:
# Change all categorical columns to binary using get dummies
clean_train_df = pd.get_dummies(df_train, columns=['Gender','Married','Dependents','Education',
                                                    'Self_Employed','Property_Area'])
clean_test_df = pd.get_dummies(df_test, columns=['Gender','Married','Dependents','Education',
                                                    'Self_Employed','Property_Area'])

In [14]:
clean_train_df

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Gender_Female,Gender_Male,Married_No,Married_Yes,...,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
291,4400.0,0.0,127.0,360.0,0.0,0.0,False,True,False,True,...,False,True,False,True,False,True,False,False,True,False
507,3583.0,0.0,96.0,360.0,1.0,0.0,False,False,True,False,...,False,False,False,True,False,True,False,False,False,True
328,4333.0,2451.0,110.0,360.0,1.0,0.0,True,False,False,True,...,False,False,False,True,False,True,False,False,False,True
609,2900.0,0.0,71.0,360.0,1.0,1.0,True,False,True,False,...,False,False,False,True,False,True,False,True,False,False
69,4300.0,0.0,136.0,360.0,0.0,0.0,True,False,True,False,...,False,False,False,True,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,6080.0,2569.0,182.0,360.0,1.0,0.0,False,True,False,True,...,False,False,False,True,False,True,False,True,False,False
144,11757.0,0.0,187.0,180.0,1.0,1.0,False,True,False,True,...,False,True,False,True,False,True,False,False,False,True
72,3500.0,0.0,81.0,300.0,1.0,1.0,False,True,True,False,...,False,False,False,True,False,True,False,False,True,False
235,5500.0,1260.0,170.0,360.0,1.0,1.0,False,True,False,True,...,True,False,False,True,False,True,False,True,False,False


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [15]:
# Separate the data into labels and features
# Separate the y variable, the labels
y_train = clean_train_df['Loan_Status']
y_test = clean_test_df['Loan_Status']

# Separate the X variable, the features
X_train = clean_train_df.drop('Loan_Status', axis=1)
X_test = clean_test_df.drop('Loan_Status', axis=1)


---

In [16]:
X_train

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Gender_Female,Gender_Male,Married_No,Married_Yes,Dependents_0,Dependents_1,Dependents_2,Dependents_3+,Education_Graduate,Education_Not Graduate,Self_Employed_No,Self_Employed_Yes,Property_Area_Rural,Property_Area_Semiurban,Property_Area_Urban
291,4400.0,0.0,127.0,360.0,0.0,False,True,False,True,False,False,True,False,True,False,True,False,False,True,False
507,3583.0,0.0,96.0,360.0,1.0,False,False,True,False,True,False,False,False,True,False,True,False,False,False,True
328,4333.0,2451.0,110.0,360.0,1.0,True,False,False,True,True,False,False,False,True,False,True,False,False,False,True
609,2900.0,0.0,71.0,360.0,1.0,True,False,True,False,True,False,False,False,True,False,True,False,True,False,False
69,4300.0,0.0,136.0,360.0,0.0,True,False,True,False,True,False,False,False,True,False,True,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,6080.0,2569.0,182.0,360.0,1.0,False,True,False,True,True,False,False,False,True,False,True,False,True,False,False
144,11757.0,0.0,187.0,180.0,1.0,False,True,False,True,False,False,True,False,True,False,True,False,False,False,True
72,3500.0,0.0,81.0,300.0,1.0,False,True,True,False,True,False,False,False,True,False,True,False,False,True,False
235,5500.0,1260.0,170.0,360.0,1.0,False,True,False,True,False,True,False,False,True,False,True,False,True,False,False


## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [28]:
#polynomial features
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

#fit logistic regression
from sklearn.linear_model import LogisticRegression
logistic_model = LogisticRegression(random_state=1, C=10, penalty='l1', solver='liblinear')
logistic_model.fit(X_train_poly, y_train)

#make predictions
y_pred = logistic_model.predict(X_test_poly)

class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.82      0.46      0.59        39
         1.0       0.79      0.95      0.86        84

    accuracy                           0.80       123
   macro avg       0.81      0.71      0.73       123
weighted avg       0.80      0.80      0.78       123



In [None]:
# Initiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_model = LogisticRegression(random_state=1)

# Fit the model using training data
logistic_model.fit(X_train, y_train)


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [None]:
# Make a prediction using the testing data
y_pred = logistic_model.predict(X_test)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Generate a confusion matrix for the model
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)

Confusion Matrix:
 [[16 23]
 [ 2 82]]


In [None]:
# Print the classification report for the model
class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", class_report)

Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.41      0.56        39
         1.0       0.78      0.98      0.87        84

    accuracy                           0.80       123
   macro avg       0.83      0.69      0.71       123
weighted avg       0.82      0.80      0.77       123



### Step 4: Answer the following question.

---

Create a Neural Network Model with the Original Data

In [30]:
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam


In [31]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Creating the neural network model
# The first dense layer with 64 neurons and ReLU (Rectified Linear Unit) activation function:
# The ReLU function is often used in hidden layers because it helps the model learn nonlinear relationships in the data and prevents the vanishing gradient problem.
# Dropout layer with a rate of 0.5: Dropout is used to prevent overfitting by randomly disabling neurons during training.
# A rate of 0.5 means that 50% of the neurons will be disabled at each training iteration.
# The second dense layer with 32 neurons and ReLU activation function: This layer helps the model learn more complex patterns in the data.
# Another Dropout layer with a rate of 0.5: For additional overfitting prevention.
# Output layer with 1 neuron and sigmoid activation function: Since the task is binary classification (Loan_Status - approved or not),
# the output layer with a sigmoid activation function returns a probability value (from 0 to 1), which is easily interpreted as the probability of the positive class.

neural_network_model = Sequential()
neural_network_model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
neural_network_model.add(Dropout(0.5))
neural_network_model.add(Dense(32, activation='relu'))
neural_network_model.add(Dropout(0.5))
neural_network_model.add(Dense(1, activation='sigmoid'))

# Compiling the model
neural_network_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Training the model
neural_network_model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluating the model on test data
loss, accuracy = neural_network_model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 0.80


In [29]:
# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Creating the neural network model
# The first dense layer with 64 neurons and ReLU (Rectified Linear Unit) activation function:
# The ReLU function is often used in hidden layers because it helps the model learn nonlinear relationships in the data and prevents the vanishing gradient problem.
# Dropout layer with a rate of 0.5: Dropout is used to prevent overfitting by randomly disabling neurons during training.
# A rate of 0.5 means that 50% of the neurons will be disabled at each training iteration.
# The second dense layer with 32 neurons and ReLU activation function: This layer helps the model learn more complex patterns in the data.
# Another Dropout layer with a rate of 0.5: For additional overfitting prevention.
# Output layer with 1 neuron and sigmoid activation function: Since the task is binary classification (Loan_Status - approved or not),
# the output layer with a sigmoid activation function returns a probability value (from 0 to 1), which is easily interpreted as the probability of the positive class.

neural_network_model = Sequential()
neural_network_model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
neural_network_model.add(Dropout(0.5))
neural_network_model.add(Dense(32, activation='relu'))
neural_network_model.add(Dropout(0.5))
neural_network_model.add(Dense(1, activation='sigmoid'))

# Compiling the model
neural_network_model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Training the model
neural_network_model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Evaluating the model on test data
loss, accuracy = neural_network_model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')


NameError: name 'StandardScaler' is not defined

In [41]:
# Trying to improve the model by using a different optimizer RMSprop

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import RMSprop, SGD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from tensorflow.keras.regularizers import l2

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Creating a more complex neural network model
# First dense layer with 256 neurons and ReLU activation function: More neurons to capture more complex patterns.
# BatchNormalization layer to stabilize and accelerate the learning process.
# Dropout layer with a rate of 0.5 to prevent overfitting.
# Second dense layer with 128 neurons and ReLU activation function.
# BatchNormalization layer.
# Dropout layer with a rate of 0.5.
# Third dense layer with 64 neurons and ReLU activation function.
# BatchNormalization layer.
# Dropout layer with a rate of 0.5.
# Fourth dense layer with 32 neurons and ReLU activation function.
# BatchNormalization layer.
# Dropout layer with a rate of 0.5.
# Output layer with 1 neuron and sigmoid activation function: For binary classification, returning a probability value.

neural_network_model = Sequential()
neural_network_model.add(Dense(256, input_dim=X_train.shape[1], activation='relu', kernel_regularizer=l2(0.001)))
neural_network_model.add(BatchNormalization())
neural_network_model.add(Dropout(0.5))
neural_network_model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
neural_network_model.add(BatchNormalization())
neural_network_model.add(Dropout(0.5))
neural_network_model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
neural_network_model.add(BatchNormalization())
neural_network_model.add(Dropout(0.5))
neural_network_model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
neural_network_model.add(BatchNormalization())
neural_network_model.add(Dropout(0.5))
neural_network_model.add(Dense(1, activation='sigmoid'))

# Compiling the model
neural_network_model.compile(optimizer=RMSprop(learning_rate=0.0005), loss='binary_crossentropy', metrics=['accuracy'])

# Training the model
neural_network_model.fit(X_train, y_train, epochs=50, batch_size=64, validation_split=0.15)

# Evaluating the model on test data
loss, accuracy = neural_network_model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Accuracy: 0.80


In [None]:
accuracy

0.8048780560493469

In [None]:
import joblib

# Save the logistic regression model
joblib.dump(logistic_model, 'logistic_model.pkl')

# Save the neural network model
neural_network_model.save('neural_network_model.h5')

  saving_api.save_model(


In [None]:
from google.colab.output import eval_js
print(eval_js("google.colab.kernel.proxyPort(5000)"))

https://3du3di1kri6-496ff2e9c6d22116-5000-colab.googleusercontent.com/


In [51]:
from flask import Flask, request, jsonify, render_template
import joblib
import numpy as np
from tensorflow.keras.models import load_model

# Initialize the Flask application
app = Flask(__name__, template_folder='/content/drive/MyDrive/templates')

# Load the models
logistic_model = joblib.load('logistic_model.pkl')
neural_network_model = load_model('neural_network_model.h5')

# Main page route
@app.route('/')
def home():
    return render_template('Index.html')

# API for logistic regression prediction
@app.route('/predict_logistic', methods=['POST'])
def predict_logistic():
    data = request.get_json(force=True)
    X = np.array([data['features']])
    print(X)
    prediction = logistic_model.predict(np.array([data['features']]))
    output = int(prediction[0])
    return jsonify(result=output)

# API for neural network prediction
@app.route('/predict_nn', methods=['POST'])
def predict_nn():
    data = request.get_json(force=True)
    prediction = neural_network_model.predict(np.array([data['features']]))
    output = int(prediction[0][0] > 0.5)
    return jsonify(result=output)

# Run the Flask application
if __name__ == "__main__":
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [15/Jul/2024 18:39:02] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [15/Jul/2024 18:39:02] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
INFO:werkzeug:127.0.0.1 - - [15/Jul/2024 18:39:07] "POST /predict_logistic HTTP/1.1" 200 -


[[ 0  0 50 50  0  1  1  0  1  0  0  1  0  1  0  1  0  0  1  0]]


INFO:werkzeug:127.0.0.1 - - [15/Jul/2024 18:39:12] "POST /predict_logistic HTTP/1.1" 200 -


[[      0       0      50      50 1000000       1       1       0       1
        0       0       1       0       1       0       1       0       0
        1       0]]


INFO:werkzeug:127.0.0.1 - - [15/Jul/2024 18:39:16] "POST /predict_logistic HTTP/1.1" 200 -


[[       0        0       50       50 -1000000        1        1        0
         1        0        0        1        0        1        0        1
         0        0        1        0]]


In [37]:
line = '4400.0,	0.0, 127.0, 360.0, 0.0, False, True, False,	True, False, False, True,	False, True, False, True,	False, False, True, False'

#replace all bool by 1 and 0
line = line.replace('False', '0')
line = line.replace('True', '1')
print(line)


4400.0,	0.0, 127.0, 360.0, 0.0, 0, 1, 0,	1, 0, 0, 1,	0, 1, 0, 1,	0, 0, 1, 0


In [45]:
logistic_model.coef_

array([[-7.73156956e-07, -5.72028274e-05, -8.00448672e-04,
        -5.54008187e-03,  2.89093002e+00,  1.55418855e-01,
        -3.60169345e-02, -1.14273016e-01,  2.30438088e-01,
         1.27940028e-01, -1.94362697e-01,  2.14724229e-01,
         7.15780232e-02,  3.56120796e-01, -1.93402631e-01,
        -7.39305972e-02, -8.85285738e-03, -2.01918106e-01,
         6.82647037e-01, -3.18010769e-01]])

In [50]:
list(zip(df_train.columns, logistic_model.coef_[0]))

[('Gender', -7.731569558604495e-07),
 ('Married', -5.720282744144973e-05),
 ('Dependents', -0.0008004486715148971),
 ('Education', -0.005540081874714804),
 ('Self_Employed', 2.890930021388891),
 ('ApplicantIncome', 0.15541885543745237),
 ('CoapplicantIncome', -0.03601693447988587),
 ('LoanAmount', -0.11427301630046399),
 ('Loan_Amount_Term', 0.23043808840369423),
 ('Credit_History', 0.1279400276829473),
 ('Property_Area', -0.19436269728650718),
 ('Loan_Status', 0.2147242294506017)]