In [1]:
import opendatasets as od
od.download("https://www.kaggle.com/datasets/avineshprabhakaran/loan-eligibility-prediction")

Skipping, found downloaded files in "./loan-eligibility-prediction" (use force=True to force download)


In [1]:
import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from torchsummary import summary
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

device = "cuda" if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [3]:
data_df = pd.read_csv(r"/home/namankarki/Naman/3_months_of_data-science/Deep_Learning_Pytorch/activation_functions/loan-eligibility-prediction/Loan Eligibility Prediction.csv")
data_df.head()

Unnamed: 0,Customer_ID,Gender,Married,Dependents,Education,Self_Employed,Applicant_Income,Coapplicant_Income,Loan_Amount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,569,Female,No,0,Graduate,No,2378,0.0,9,360,1,Urban,N
1,15,Male,Yes,2,Graduate,No,1299,1086.0,17,120,1,Urban,Y
2,95,Male,No,0,Not Graduate,No,3620,0.0,25,120,1,Semiurban,Y
3,134,Male,Yes,0,Graduate,Yes,3459,0.0,25,120,1,Semiurban,Y
4,556,Male,Yes,1,Graduate,No,5468,1032.0,26,360,1,Semiurban,Y


In [4]:
data_df.dropna(inplace=True)
data_df.drop(columns=["Customer_ID"], axis=1, inplace=True)
print(data_df.shape)

(614, 12)


In [5]:
data_df.head()
print("\n Checking the NAN values")
print(data_df.isnull().sum())

print("\n Checking the empty strings and unusual values")
print(data_df.isin(["", "NA", "N/A", "NONE", "none", "na", "NULL", "null", "None"]).sum())

print("\n Checking the duplicates")
print(data_df.duplicated().sum())


 Checking the NAN values
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
Applicant_Income      0
Coapplicant_Income    0
Loan_Amount           0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

 Checking the empty strings and unusual values
Gender                0
Married               0
Dependents            0
Education             0
Self_Employed         0
Applicant_Income      0
Coapplicant_Income    0
Loan_Amount           0
Loan_Amount_Term      0
Credit_History        0
Property_Area         0
Loan_Status           0
dtype: int64

 Checking the duplicates
0


In [6]:
num_cols = ['Applicant_Income', 'Coapplicant_Income', 'Loan_Amount', 'Loan_Amount_Term']
binary_cols = ["Gender", "Married", "Education", "Self_Employed"]
nominal_cols = ["Property_Area"]
target_col = "Loan_Status"

In [7]:

print(data_df[target_col].dtypes)

object


In [8]:
#encoding the target column
tgt_encoder = LabelEncoder()
data_df[target_col] = tgt_encoder.fit_transform(data_df[target_col])

#encoding the binary columns
bin_encoder = LabelEncoder()
for cols in binary_cols:
    data_df[cols] = bin_encoder.fit_transform(data_df[cols])


#column transformer for stadardinzing and ohe
preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(drop = "first"), nominal_cols)
],
remainder='passthrough')


X = preprocessor.fit_transform(data_df.drop(columns = [target_col]))
y = data_df[target_col].values

In [9]:
print(X.shape)
print(y.shape)

(614, 12)
(614,)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.3)


In [11]:
X_test, X_val, y_test, y_val = train_test_split(X_test,y_test, test_size = 0.5)

In [12]:
X_train.shape, X_test.shape, X_val.shape

((429, 12), (92, 12), (93, 12))

In [13]:
class dataset(Dataset):
    def __init__(self,X,y):
        self.X = torch.tensor(X, dtype = torch.float32).to(device)
        self.y = torch.tensor(y, dtype=torch.float32).to(device)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [14]:
training_data = dataset(X_train, y_train)
validation_data = dataset(X_val, y_val)
testing_data = dataset(X_test, y_test)

In [15]:
train_dataloader = DataLoader(training_data, batch_size = 8, shuffle = True)
test_dataloader = DataLoader(testing_data, batch_size = 8, shuffle = False)
val_dataloader = DataLoader(validation_data, batch_size = 8, shuffle = False)

In [16]:
for x , y in train_dataloader:
    print(x)
    print(y)
    break

tensor([[-0.2319, -0.5545, -1.0576, -1.4197,  0.0000,  1.0000,  1.0000,  1.0000,
          1.0000,  0.0000,  0.0000,  1.0000],
        [-0.0661,  0.6997,  1.0800,  0.3030,  1.0000,  0.0000,  1.0000,  1.0000,
          2.0000,  0.0000,  0.0000,  1.0000],
        [-0.2299,  0.2236,  0.0227,  0.3030,  1.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  1.0000],
        [ 0.6164, -0.5545,  1.1720,  0.3030,  0.0000,  1.0000,  1.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  1.0000],
        [-0.4069, -0.5490, -0.6324,  0.3030,  0.0000,  0.0000,  1.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  1.0000],
        [-0.3916,  0.4828, -0.5404, -0.5583,  0.0000,  1.0000,  1.0000,  1.0000,
          0.0000,  0.0000,  0.0000,  1.0000],
        [-0.3521, -0.5545, -1.0576,  0.3030,  0.0000,  1.0000,  1.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  1.0000],
        [-0.0348, -0.5545, -0.1152,  0.3030,  1.0000,  0.0000,  0.0000,  0.0000,
          1.0000,  1.0000,  1.

In [17]:
HIDDEN_NEURONS = 32
class MyModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.input_layer = nn.Linear(X.shape[1], HIDDEN_NEURONS)
        self.relu1 = nn.ReLU()
        self.hidden_layer = nn.Linear(HIDDEN_NEURONS, 8)
        self.relu2 = nn.ReLU()
        self.output_layer = nn.Linear(8, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu1(x)
        x = self.hidden_layer(x)
        x = self.relu2(x)
        x = self.output_layer(x)
        x = self.sigmoid(x)
        return x 
    
model = MyModel().to(device)

In [18]:
summary(model, input_size = (X.shape[1],))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 32]             416
              ReLU-2                   [-1, 32]               0
            Linear-3                    [-1, 8]             264
              ReLU-4                    [-1, 8]               0
            Linear-5                    [-1, 1]               9
           Sigmoid-6                    [-1, 1]               0
Total params: 689
Trainable params: 689
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


In [19]:
criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr=0.001)

total_loss+=batch_loss×batch_size

avg_loss=total_samples/total_loss

​accuracy=total number of samples/total correct predictions​

In [21]:
total_loss_train_plot = []
total_loss_val_plot = []
total_acc_val_plot = []
total_acc_train_plot = []

epochs = 20
for epoch in range(epochs):
    total_loss_train = 0
    total_acc_train = 0

    model.train()
    for inputs, labels in train_dataloader:
        logits = model(inputs).squeeze(1)

        # BCE requires probabilities → use sigmoid
        probs = torch.sigmoid(logits)

        batch_loss = criterion(probs, labels)

        # sum of batch losses → multiply averaged loss by batch size
        total_loss_train += batch_loss.item() * labels.size(0)

        # accuracy
        preds = (probs > 0.5).float()
        total_acc_train += (preds == labels).sum().item()

        # backprop
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # ---- averages ----
    avg_train_loss = total_loss_train / len(train_dataloader.dataset)
    avg_train_acc  = total_acc_train / len(train_dataloader.dataset)

    total_loss_train_plot.append(avg_train_loss)
    total_acc_train_plot.append(avg_train_acc)

    # ========== VALIDATION ==========
    total_loss_val = 0
    total_acc_val = 0
    model.eval()
    with torch.no_grad():
        for inputs, labels in val_dataloader:
            logits = model(inputs).squeeze(1)
            probs = torch.sigmoid(logits)
            batch_loss = criterion(probs, labels)

            total_loss_val += batch_loss.item() * labels.size(0)
            preds = (probs > 0.5).float()
            total_acc_val += (preds == labels).sum().item()

    avg_val_loss = total_loss_val / len(val_dataloader.dataset)
    avg_val_acc  = total_acc_val / len(val_dataloader.dataset)

    total_loss_val_plot.append(avg_val_loss)
    total_acc_val_plot.append(avg_val_acc)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f" Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.4f}")
    print(f" Val Loss:   {avg_val_loss:.4f}, Val Acc:   {avg_val_acc:.4f}")


Epoch 1/20
 Train Loss: 0.6252, Train Acc: 0.6830
 Val Loss:   0.5850, Val Acc:   0.7527
Epoch 2/20
 Train Loss: 0.6229, Train Acc: 0.6830
 Val Loss:   0.5827, Val Acc:   0.7527
Epoch 3/20
 Train Loss: 0.6203, Train Acc: 0.6830
 Val Loss:   0.5800, Val Acc:   0.7527
Epoch 4/20
 Train Loss: 0.6174, Train Acc: 0.6830
 Val Loss:   0.5761, Val Acc:   0.7527
Epoch 5/20
 Train Loss: 0.6139, Train Acc: 0.6830
 Val Loss:   0.5737, Val Acc:   0.7527
Epoch 6/20
 Train Loss: 0.6090, Train Acc: 0.6830
 Val Loss:   0.5710, Val Acc:   0.7527
Epoch 7/20
 Train Loss: 0.6027, Train Acc: 0.6830
 Val Loss:   0.5689, Val Acc:   0.7527
Epoch 8/20
 Train Loss: 0.5937, Train Acc: 0.6830
 Val Loss:   0.5681, Val Acc:   0.7527
Epoch 9/20
 Train Loss: 0.5830, Train Acc: 0.6830
 Val Loss:   0.5600, Val Acc:   0.7527
Epoch 10/20
 Train Loss: 0.5738, Train Acc: 0.6830
 Val Loss:   0.5544, Val Acc:   0.7527
Epoch 11/20
 Train Loss: 0.5662, Train Acc: 0.6830
 Val Loss:   0.5527, Val Acc:   0.7527
Epoch 12/20
 Train 

In [22]:
def testing(model, test_dataloader, criterion):
    model.eval()
    total_test_loss = 0
    total_test_acc = 0
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            logits = model(inputs).squeeze(1)
            prob = torch.sigmoid(logits)
            batch_loss = criterion(prob, labels)
            total_test_loss += batch_loss.item() * labels.size(0)
            pred = (prob > 0.5).float()
            total_test_acc += (pred == labels).sum().item()
        
        avg_loss = total_test_loss /len(testing_data)
        avg_acc = total_test_acc / len(testing_data)
        return avg_loss, avg_acc
            
            
            
            
    

In [24]:
avg_loss, avg_acc = testing(model,test_dataloader, criterion)

In [25]:
print(avg_loss)
print(avg_acc)

0.5860790662143541
0.6413043478260869
