In [None]:
# Libraries

import pandas as pd
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import Dataset, DataLoader # Gives easier dataset managment and creates mini batches
import torchvision.datasets as datasets # Has standard datasets we can import in a nice and easy way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
from sklearn.metrics import classification_report #For getting classification report, helps in evaluating the model

In [None]:
# Fetching Data

df = pd.read_csv('train.csv')

In [None]:
# Visualizing the first few records

df.head()

Unnamed: 0,ID,Loan Amount,Funded Amount,Funded Amount Investor,Term,Batch Enrolled,Interest Rate,Grade,Sub Grade,Employment Duration,Home Ownership,Verification Status,Payment Plan,Loan Title,Debit to Income,Delinquency - two years,Inquires - six months,Open Account,Public Record,Revolving Balance,Revolving Utilities,Total Accounts,Initial List Status,Total Received Interest,Total Received Late Fee,Recoveries,Collection Recovery Fee,Collection 12 months Medical,Application Type,Last week Pay,Accounts Delinquent,Total Collection Amount,Total Current Balance,Total Revolving Credit Limit,Loan Status
0,65087372,10000,32236,12329.36286,59,BAT2522922,11.135007,B,C4,MORTGAGE,176346.6267,Not Verified,n,Debt Consolidation,16.284758,1,0,13,0,24246,74.932551,7,w,2929.646315,0.102055,2.498291,0.793724,0,INDIVIDUAL,49,0,31,311301,6619,0
1,1450153,3609,11940,12191.99692,59,BAT1586599,12.237563,C,D3,RENT,39833.921,Source Verified,n,Debt consolidation,15.412409,0,0,12,0,812,78.297186,13,f,772.769385,0.036181,2.377215,0.974821,0,INDIVIDUAL,109,0,53,182610,20885,0
2,1969101,28276,9311,21603.22455,59,BAT2136391,12.545884,F,D4,MORTGAGE,91506.69105,Source Verified,n,Debt Consolidation,28.137619,0,0,14,0,1843,2.07304,20,w,863.324396,18.77866,4.316277,1.020075,0,INDIVIDUAL,66,0,34,89801,26155,0
3,6651430,11170,6954,17877.15585,59,BAT2428731,16.731201,C,C3,MORTGAGE,108286.5759,Source Verified,n,Debt consolidation,18.04373,1,0,7,0,13819,67.467951,12,w,288.173196,0.044131,0.10702,0.749971,0,INDIVIDUAL,39,0,40,9189,60214,0
4,14354669,16890,13226,13539.92667,59,BAT5341619,15.0083,C,D4,MORTGAGE,44234.82545,Source Verified,n,Credit card refinancing,17.209886,1,3,13,1,1544,85.250761,22,w,129.239553,19.306646,1294.818751,0.368953,0,INDIVIDUAL,18,0,430,126029,22579,0


In [None]:
# Understanding the datatypes

df.dtypes

ID                                int64
Loan Amount                       int64
Funded Amount                     int64
Funded Amount Investor          float64
Term                              int64
Batch Enrolled                   object
Interest Rate                   float64
Grade                            object
Sub Grade                        object
Employment Duration              object
Home Ownership                  float64
Verification Status              object
Payment Plan                     object
Loan Title                       object
Debit to Income                 float64
Delinquency - two years           int64
Inquires - six months             int64
Open Account                      int64
Public Record                     int64
Revolving Balance                 int64
Revolving Utilities             float64
Total Accounts                    int64
Initial List Status              object
Total Received Interest         float64
Total Received Late Fee         float64


In [None]:
# Identifying unique counts

df.nunique()

ID                              67463
Loan Amount                     27525
Funded Amount                   24548
Funded Amount Investor          67441
Term                                3
Batch Enrolled                     41
Interest Rate                   67448
Grade                               7
Sub Grade                          35
Employment Duration                 3
Home Ownership                  67454
Verification Status                 3
Payment Plan                        1
Loan Title                        109
Debit to Income                 67454
Delinquency - two years             9
Inquires - six months               6
Open Account                       36
Public Record                       5
Revolving Balance               20582
Revolving Utilities             67458
Total Accounts                     69
Initial List Status                 2
Total Received Interest         67451
Total Received Late Fee         67380
Recoveries                      67387
Collection R

In [6]:
#Dropping columns that provide no information about defaulters
df = df.drop(columns=['ID','Batch Enrolled'])

#Dropping columns with only 1 value
df = df.drop(columns=['Payment Plan','Accounts Delinquent'])

#Extracting the target variable
Y = df['Loan Status']
df = df.drop(columns=['Loan Status'])

In [7]:
#Categorical Variables
cat_var = ["Grade","Sub Grade","Employment Duration","Verification Status","Loan Title","Initial List Status",
           "Application Type",]
cat_df = df[cat_var]

#One hot encoding categorical values
cat_df = pd.get_dummies(cat_df)
cat_df.head(1)

Unnamed: 0,Grade_A,Grade_B,Grade_C,Grade_D,Grade_E,Grade_F,Grade_G,Sub Grade_A1,Sub Grade_A2,Sub Grade_A3,Sub Grade_A4,Sub Grade_A5,Sub Grade_B1,Sub Grade_B2,Sub Grade_B3,Sub Grade_B4,Sub Grade_B5,Sub Grade_C1,Sub Grade_C2,Sub Grade_C3,Sub Grade_C4,Sub Grade_C5,Sub Grade_D1,Sub Grade_D2,Sub Grade_D3,Sub Grade_D4,Sub Grade_D5,Sub Grade_E1,Sub Grade_E2,Sub Grade_E3,Sub Grade_E4,Sub Grade_E5,Sub Grade_F1,Sub Grade_F2,Sub Grade_F3,Sub Grade_F4,Sub Grade_F5,Sub Grade_G1,Sub Grade_G2,Sub Grade_G3,...,Loan Title_Moving and relocation,Loan Title_My Loan,Loan Title_Other,Loan Title_Pay Off,Loan Title_Payoff,Loan Title_Personal,Loan Title_Personal Loan,Loan Title_Personal loan,Loan Title_Pool,Loan Title_Refinance,Loan Title_Refinance Loan,Loan Title_Vacation,Loan Title_Wedding Loan,Loan Title_bills,Loan Title_cards,Loan Title_conso,Loan Title_consolidate,Loan Title_consolidation,Loan Title_consolidation loan,Loan Title_credit card,Loan Title_credit card consolidation,Loan Title_credit card refinance,Loan Title_credit pay off,Loan Title_debt,Loan Title_debt consolidation,Loan Title_debt consolidation loan,Loan Title_debt loan,Loan Title_get out of debt,Loan Title_home improvement,Loan Title_loan1,Loan Title_pay off bills,Loan Title_payoff,Loan Title_personal,Loan Title_refi,Loan Title_relief,Loan Title_vacation,Initial List Status_f,Initial List Status_w,Application Type_INDIVIDUAL,Application Type_JOINT
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0


In [8]:
#Continuos Variables
conti_var = list(set(df.columns) - set(cat_var))
conti_df = df[conti_var]

#Normalizing values 
x = conti_df.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
conti_df = pd.DataFrame(x_scaled)

In [9]:
#Combining categorical and continuos variables
X = pd.concat([cat_df,conti_df],axis=1)

In [10]:
#Checking the shape on X and Y arrays
print(X.shape, Y.shape)

(67463, 184) (67463,)


In [11]:
#Resampling for dataset imbalance
X, Y = SMOTE().fit_resample(X, Y)



In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

In [13]:
y_train = pd.get_dummies(y_train)

In [14]:
X.shape

(122444, 184)

In [None]:
# Declaring Data Class for training dataset
class LoanDefaultDataset(Dataset):
    #Class constructor
    def __init__(self, X, Y):
        self.X = torch.from_numpy(X.to_numpy())
        self.Y = torch.from_numpy(Y.to_numpy())
    
    #Variable Length
    def __len__(self):
        return len(self.Y)

    #Get Item index
    def __getitem__(self, idx):
        label = self.Y[idx]
        data = self.X[idx]
        return data,label

In [None]:
# Hyperparameters for training 
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 64
input_size = 184
num_classes = 2
learning_rate = 0.001
num_epochs = 100

In [None]:
# Loading training dataset with dataloader

dataset_tr = LoanDefaultDataset(X_train,y_train)
train_loader = DataLoader(dataset=dataset_tr, batch_size=50, shuffle=True, num_workers=0)

In [None]:
# Loading testing dataset with dataloader

dataset_te = LoanDefaultDataset(X_test,y_test)
test_loader = DataLoader(dataset=dataset_te, batch_size=50, shuffle=True, num_workers=0)

In [None]:
# Basic ANN Model 

class NN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(NN, self).__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, 25)
        self.fc3 = nn.Linear(25,num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x 

In [None]:
#Initializing model and loading it to device

model = NN(input_size=input_size, num_classes=num_classes).to(device)

#Loss Function

criterion = nn.BCEWithLogitsLoss()

#Optimizer

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# Running the training loop

for epoch in range(num_epochs):
    print(f"Epoch: {epoch+1}")
    for batch_idx, (data, targets) in enumerate(train_loader):
        # Get data to cuda if possible
        data = data.to(device=device)
        targets = targets.to(device=device)

        # Get to correct shape, 28x28->784
        # -1 will flatten all outer dimensions into one
        data = data.reshape(data.shape[0], -1) 

        # forward propagation
        scores = model(data.float())
        loss = criterion(scores, targets.float())

        # zero previous gradients
        optimizer.zero_grad()
        
        # back-propagation
        loss.backward()

        # gradient descent or adam step
        optimizer.step()
    print(f"Loss at the end of Epoch {epoch+1}: {loss}")

Epoch: 1
Loss at the end of Epoch 1: 0.5015466213226318
Epoch: 2
Loss at the end of Epoch 2: 0.18662385642528534
Epoch: 3
Loss at the end of Epoch 3: 0.4147661328315735
Epoch: 4
Loss at the end of Epoch 4: 0.12102262675762177
Epoch: 5
Loss at the end of Epoch 5: 0.059692323207855225
Epoch: 6
Loss at the end of Epoch 6: 0.3180202841758728
Epoch: 7
Loss at the end of Epoch 7: 0.3003152906894684
Epoch: 8
Loss at the end of Epoch 8: 0.2025870531797409
Epoch: 9
Loss at the end of Epoch 9: 0.20444492995738983
Epoch: 10
Loss at the end of Epoch 10: 0.3070988059043884
Epoch: 11
Loss at the end of Epoch 11: 0.29775527119636536
Epoch: 12
Loss at the end of Epoch 12: 0.34217578172683716
Epoch: 13
Loss at the end of Epoch 13: 0.1919752061367035
Epoch: 14
Loss at the end of Epoch 14: 0.05762786790728569
Epoch: 15
Loss at the end of Epoch 15: 0.1032731756567955
Epoch: 16
Loss at the end of Epoch 16: 0.21386805176734924
Epoch: 17
Loss at the end of Epoch 17: 0.11640539020299911
Epoch: 18
Loss at the 

In [None]:
# Generating predictions 

preds = model(torch.from_numpy(X_test.to_numpy()).float())
y_pred = [0 if i[0]>i[1] else 1 for i in preds]

In [None]:
# Accuracy metrics

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.91      0.89     12431
           1       0.90      0.85      0.88     12058

    accuracy                           0.88     24489
   macro avg       0.88      0.88      0.88     24489
weighted avg       0.88      0.88      0.88     24489

