In [None]:
# Import all the base libraries required for building the NN and data processing

import os
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader

Loading data and analysis

In [None]:
# Import pandas to read the CSV file in a DataFrame

import pandas as pd

In [None]:
# Defining the base path for file access

base_path = "lend-or-lose"
train_path = os.path.join(base_path, "train.csv")
test_path = os.path.join(base_path, "test.csv")

train_csv = pd.read_csv(train_path)
test_csv = pd.read_csv(test_path)

In [125]:
train_csv.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,DRIRC89L0T,18,137576,209136,846,26,2,10.47,60,0.81,High School,Self-employed,Single,Yes,No,Business,No,0
1,TS0FIUNHNU,47,57194,5970,748,30,2,19.72,36,0.73,High School,Unemployed,Divorced,No,Yes,Education,No,0
2,I0YR284A1V,26,84328,95065,453,7,2,24.25,12,0.45,Master's,Self-employed,Married,No,No,Other,Yes,0
3,WB1T7NQV8A,53,49795,229582,533,107,3,14.44,60,0.17,Bachelor's,Self-employed,Single,Yes,No,Auto,Yes,1
4,J6GU9M4G1Z,49,115450,22072,840,0,4,24.48,12,0.11,Bachelor's,Part-time,Single,No,Yes,Education,Yes,0


In [None]:
# Listing the categorical classes to change into discreter number values

df_education_map = ['High School', "Master's", "Bachelor's", 'PhD']
df_employment_type_map = ['Self-employed', 'Unemployed', 'Part-time', 'Full-time']
df_marital_status_map = ['Single', 'Divorced', 'Married']
df_has_mort_map = ['Yes', 'No']
df_has_depends_map = ['No', 'Yes']
df_loan_purpose_map = ['Business', 'Education', 'Other', 'Auto', 'Home']
df_has_co_map = ['No', 'Yes']
arr_one_hot = {"Education": df_education_map, "EmploymentType": df_employment_type_map, "MaritalStatus": df_marital_status_map,
               "HasMortgage": df_has_mort_map, "HasDependents": df_has_depends_map, "LoanPurpose": df_loan_purpose_map,
               "HasCoSigner": df_has_co_map}

In [None]:
# Using the list above to change the data row values from categorical classes into discrete numerical values for each class

for text_col in arr_one_hot:
    train_csv = train_csv.replace({text_col: {key_temp: val_temp for val_temp, key_temp in enumerate(arr_one_hot[text_col])}})

for text_col in arr_one_hot:
    test_csv = test_csv.replace({text_col: {key_temp: val_temp for val_temp, key_temp in enumerate(arr_one_hot[text_col])}})

  train_csv = train_csv.replace({text_col: {key_temp: val_temp for val_temp, key_temp in enumerate(arr_one_hot[text_col])}})
  test_csv = test_csv.replace({text_col: {key_temp: val_temp for val_temp, key_temp in enumerate(arr_one_hot[text_col])}})


In [None]:
# Dividing the items in the 'LoanTerm' class by 12 to scale it down for the NN

for row_index in range(train_csv.shape[0]):
    train_csv.iloc[row_index, 8] = int(train_csv.iloc[row_index, 8] / 12)

for row_index in range(test_csv.shape[0]):
    test_csv.iloc[row_index, 8] = int(test_csv.iloc[row_index, 8] / 12)

In [None]:
# Padding for the continuous un-bounded values for the log value calculation so that 0 is never encountered

epsilon = 1

In [None]:
# Calculating the log values for each data item to avoid exploding gradients because of unbounded values

for row_index in range(train_csv.shape[0]):
    train_csv.iloc[row_index, 1] = np.log(train_csv.iloc[row_index, 1] + epsilon)
    train_csv.iloc[row_index, 2] = np.log(train_csv.iloc[row_index, 2] + epsilon)
    train_csv.iloc[row_index, 3] = np.log(train_csv.iloc[row_index, 3] + epsilon)
    train_csv.iloc[row_index, 4] = np.log(train_csv.iloc[row_index, 4] + epsilon)
    train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5] + epsilon)

for row_index in range(test_csv.shape[0]):
    test_csv.iloc[row_index, 1] = np.log(test_csv.iloc[row_index, 1] + epsilon)
    test_csv.iloc[row_index, 2] = np.log(test_csv.iloc[row_index, 2] + epsilon)
    test_csv.iloc[row_index, 3] = np.log(test_csv.iloc[row_index, 3] + epsilon)
    test_csv.iloc[row_index, 4] = np.log(test_csv.iloc[row_index, 4] + epsilon)
    test_csv.iloc[row_index, 5] = np.log(test_csv.iloc[row_index, 5] + epsilon)

  train_csv.iloc[row_index, 1] = np.log(train_csv.iloc[row_index, 1])
  train_csv.iloc[row_index, 2] = np.log(train_csv.iloc[row_index, 2])
  train_csv.iloc[row_index, 3] = np.log(train_csv.iloc[row_index, 3])
  train_csv.iloc[row_index, 4] = np.log(train_csv.iloc[row_index, 4])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row_index, 5] = np.log(train_csv.iloc[row_index, 5])
  train_csv.iloc[row

In [None]:
# Adding a new manual feature based on two other original feature columns in the dataset

train_csv['AmountRatio_I_La'] = [train_csv.iloc[i, 2]/train_csv.iloc[i, 3] for i in range(train_csv.shape[0])]
test_csv['AmountRatio_I_La'] = [test_csv.iloc[i, 2]/test_csv.iloc[i, 3] for i in range(test_csv.shape[0])]

In [131]:
train_csv.head()

Unnamed: 0,LoanID,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default,AmountRatio_I_La
0,DRIRC89L0T,2.890372,11.831932,12.25074,6.740519,3.258097,2,10.47,5,0.81,0,0,0,0,0,0,0,0,0.965814
1,TS0FIUNHNU,3.850148,10.954204,8.694502,6.617403,3.401197,2,19.72,3,0.73,0,1,1,1,1,1,0,0,1.2599
2,I0YR284A1V,3.258097,11.342469,11.462316,6.115892,1.94591,2,24.25,1,0.45,1,0,2,1,0,2,1,0,0.989544
3,WB1T7NQV8A,3.970292,10.81567,12.344016,6.278521,4.672829,3,14.44,5,0.17,2,0,0,0,0,3,1,1,0.876187
4,J6GU9M4G1Z,3.89182,11.656593,10.002065,6.733402,-inf,4,24.48,1,0.11,2,2,0,1,1,1,1,0,1.165419


In [132]:
test_csv.shape

(51070, 18)

Defining the class

In [None]:
# Defining the PyTorch model

class NN(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 2)
        )

    def forward(self, x):
        x = self.flatten(x)
        x = self.linear_relu_stack(x)

        return x

In [134]:
model = NN(input_size=17)

In [135]:
print(model)

NN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=17, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=256, out_features=512, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=512, out_features=512, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=512, out_features=256, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.2, inplace=False)
    (15): Linear(in_features=256, out_features=128, bias=True)
    (16): ReLU()
    (17): Dropout(p=0.2, inplace=False)
    (18): Linear(in_features=128, out_features=64, bias=True)
    (19): ReLU()
    (20): Dropout(p=0.2, inplace=False)
    (21): Linear(in_features=64, out_features=2, bias=True)
  )
)


Data loading

In [None]:
# list of features to be considered as input for the NN

feats = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed',
         'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio', 'Education',
         'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents',
         'LoanPurpose', 'HasCoSigner', 'AmountRatio_I_La']

y_feats = ['Default']

In [None]:
# Arrays containing the features for the training and testing CSV files

training_feats = train_csv[feats]
testing_feats = train_csv[y_feats]

In [None]:
# Import other helping libraries from scikit-learn

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Encode the testing categorical features into one-hot vectors

encoder = OneHotEncoder(sparse_output=False)  # One-hot encoder
y_encoded = encoder.fit_transform(np.array(testing_feats).reshape(-1, 1))

In [140]:
y_encoded

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [0., 1.]])

In [None]:
# Splitting the training dataset into training and testing subsets

X_train, X_test, y_train, y_test = train_test_split(training_feats, y_encoded, test_size=0.2, random_state=7)

Training

In [None]:
# Instantiating the model

model = NN(input_size=len(feats))
print(model)

NN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=17, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=256, out_features=512, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=512, out_features=512, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=512, out_features=256, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.2, inplace=False)
    (15): Linear(in_features=256, out_features=128, bias=True)
    (16): ReLU()
    (17): Dropout(p=0.2, inplace=False)
    (18): Linear(in_features=128, out_features=64, bias=True)
    (19): ReLU()
    (20): Dropout(p=0.2, inplace=False)
    (21): Linear(in_features=64, out_features=2, bias=True)
  )
)


In [None]:
# Defining the model optimization functions

import torch.optim as optim

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()  # Suitable for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [144]:
type(X_train), type(X_test), type(y_train), type(y_test)

(pandas.core.frame.DataFrame,
 pandas.core.frame.DataFrame,
 numpy.ndarray,
 numpy.ndarray)

In [145]:
X_train = np.array(X_train)
X_test = np.array(X_test)

In [None]:
# Converting the splitted subsets from DataFrame format to Tensor format to be a valid as input for the NN

from torch.utils.data import TensorDataset

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoader for batch processing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

In [147]:
# Training loop
epochs = 30
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y.argmax(dim=1))

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

# Evaluate the model
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch_X, batch_y in test_loader:
        outputs = model(batch_X)
        _, predicted = torch.max(outputs, 1)
        total += batch_y.size(0)
        correct += (predicted == batch_y.argmax(dim=1)).sum().item()

accuracy = correct / total
print(f"Test Accuracy: {accuracy:.4f}")

Epoch 1/30, Loss: nan
Epoch 2/30, Loss: nan
Epoch 3/30, Loss: nan
Epoch 4/30, Loss: nan
Epoch 5/30, Loss: nan
Epoch 6/30, Loss: nan
Epoch 7/30, Loss: nan
Epoch 8/30, Loss: nan
Epoch 9/30, Loss: nan
Epoch 10/30, Loss: nan
Epoch 11/30, Loss: nan
Epoch 12/30, Loss: nan
Epoch 13/30, Loss: nan
Epoch 14/30, Loss: nan
Epoch 15/30, Loss: nan
Epoch 16/30, Loss: nan
Epoch 17/30, Loss: nan
Epoch 18/30, Loss: nan
Epoch 19/30, Loss: nan
Epoch 20/30, Loss: nan
Epoch 21/30, Loss: nan
Epoch 22/30, Loss: nan
Epoch 23/30, Loss: nan
Epoch 24/30, Loss: nan
Epoch 25/30, Loss: nan
Epoch 26/30, Loss: nan
Epoch 27/30, Loss: nan
Epoch 28/30, Loss: nan
Epoch 29/30, Loss: nan
Epoch 30/30, Loss: nan
Test Accuracy: 0.8859


Generating the actual predictions using the model on testing CSV

In [148]:
testing_actual = test_csv[feats]
testing_actual = np.array(testing_actual)
testing_actual = torch.tensor(testing_actual, dtype=torch.float32)

In [149]:
model.eval()

NN(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=17, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=128, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=256, out_features=512, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.2, inplace=False)
    (9): Linear(in_features=512, out_features=512, bias=True)
    (10): ReLU()
    (11): Dropout(p=0.2, inplace=False)
    (12): Linear(in_features=512, out_features=256, bias=True)
    (13): ReLU()
    (14): Dropout(p=0.2, inplace=False)
    (15): Linear(in_features=256, out_features=128, bias=True)
    (16): ReLU()
    (17): Dropout(p=0.2, inplace=False)
    (18): Linear(in_features=128, out_features=64, bias=True)
    (19): ReLU()
    (20): Dropout(p=0.2, inplace=False)
    (21): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [None]:
with torch.no_grad(): # Freezing the trained model and using it solely for predicting
    predictions = model(testing_actual)
    probs = torch.softmax(predictions, dim=1)
    pred_classes = torch.argmax(probs, dim=1)

In [151]:
# Print results
print("Raw Predictions (Logits):")
print(predictions)
print("\nProbabilities:")
print(probs)
print("\nPredicted Classes:")
print(pred_classes.numpy())  # Convert to NumPy for easy viewing

Raw Predictions (Logits):
tensor([[nan, nan],
        [nan, nan],
        [nan, nan],
        ...,
        [nan, nan],
        [nan, nan],
        [nan, nan]])

Probabilities:
tensor([[nan, nan],
        [nan, nan],
        [nan, nan],
        ...,
        [nan, nan],
        [nan, nan],
        [nan, nan]])

Predicted Classes:
[0 0 0 ... 0 0 0]


In [152]:
pred_classes.shape

torch.Size([51070])

In [153]:
pred_classes.count_nonzero()

tensor(0)

In [154]:
test_csv[feats]

Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,AmountRatio_I_La
0,4.007333,11.632094,11.433806,6.364751,4.727388,2,23.54,3,0.15,3,0,0,0,1,4,0,1.017342
1,4.025352,11.424848,11.787332,6.463029,3.988984,1,15.19,1,0.43,0,2,1,0,1,1,1,0.969248
2,3.258097,11.266628,11.230788,6.343880,4.653960,3,18.02,1,0.29,1,2,2,0,1,1,1,1.003191
3,3.258097,11.051414,9.287672,5.786897,4.770685,1,14.71,2,0.41,0,2,0,1,0,0,1,1.189901
4,3.178054,10.297723,9.960907,6.495266,4.624973,3,15.02,5,0.69,3,1,0,1,1,0,1,1.033814
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51065,3.931826,11.507642,12.045628,6.442540,3.178054,1,17.03,1,0.46,3,0,1,0,1,3,1,0.955338
51066,3.367296,10.645806,11.620110,5.916202,3.931826,4,7.10,3,0.50,3,0,2,1,0,2,0,0.916154
51067,4.204693,11.390837,11.868262,6.594413,3.931826,1,22.89,4,0.79,2,2,1,1,0,1,0,0.959773
51068,3.737670,11.666925,12.159704,6.190315,1.791759,1,10.83,5,0.32,2,3,2,1,1,2,1,0.959474


In [155]:
testing_actual

tensor([[ 4.0073, 11.6321, 11.4338,  ...,  4.0000,  0.0000,  1.0173],
        [ 4.0254, 11.4248, 11.7873,  ...,  1.0000,  1.0000,  0.9692],
        [ 3.2581, 11.2666, 11.2308,  ...,  1.0000,  1.0000,  1.0032],
        ...,
        [ 4.2047, 11.3908, 11.8683,  ...,  1.0000,  0.0000,  0.9598],
        [ 3.7377, 11.6669, 12.1597,  ...,  2.0000,  1.0000,  0.9595],
        [ 3.8918, 11.4122, 11.3380,  ...,  1.0000,  0.0000,  1.0065]])

Generating the final output file

In [156]:
import csv

data = [["LoanID", "Default"]]
for i in range(test_csv.shape[0]):
   temp_arr = [str(test_csv["LoanID"][i]), int(pred_classes[i])]
   data.append(temp_arr)

filename = "output_NN.csv"

# Write data to CSV
with open(filename, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerows(data)

print(f"Data has been written to {filename}")

Data has been written to output_NN.csv
