In [45]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np

# Data exploration

In [46]:
df = pd.read_csv("/content/loan_data.csv")

In [47]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [48]:
df.columns

Index(['person_age', 'person_gender', 'person_education', 'person_income',
       'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'previous_loan_defaults_on_file', 'loan_status'],
      dtype='object')

In [49]:
df.isna().sum()

Unnamed: 0,0
person_age,0
person_gender,0
person_education,0
person_income,0
person_emp_exp,0
person_home_ownership,0
loan_amnt,0
loan_intent,0
loan_int_rate,0
loan_percent_income,0


In [50]:
print(df.info())           # Column types, null counts
print(df.describe())       # Summary stats for numerics
print(df['loan_status'].value_counts(normalize=True))  # Class balance

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45000 entries, 0 to 44999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      45000 non-null  float64
 1   person_gender                   45000 non-null  object 
 2   person_education                45000 non-null  object 
 3   person_income                   45000 non-null  float64
 4   person_emp_exp                  45000 non-null  int64  
 5   person_home_ownership           45000 non-null  object 
 6   loan_amnt                       45000 non-null  float64
 7   loan_intent                     45000 non-null  object 
 8   loan_int_rate                   45000 non-null  float64
 9   loan_percent_income             45000 non-null  float64
 10  cb_person_cred_hist_length      45000 non-null  float64
 11  credit_score                    45000 non-null  int64  
 12  previous_loan_defaults_on_file  

In [51]:
(df["person_age"] < 18).sum()

np.int64(0)

In [52]:
print(df["loan_status"].value_counts(normalize=True))

loan_status
0    0.777778
1    0.222222
Name: proportion, dtype: float64


**the classes are imbalenced**

In [53]:
from sklearn.preprocessing import LabelEncoder

categorical_cols = [
    "person_gender",
    "person_education",
    "person_home_ownership",
    "loan_intent",
    "previous_loan_defaults_on_file"
]

In [54]:

encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [55]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,0,4,71948.0,0,3,35000.0,4,16.02,0.49,3.0,561,0,1
1,21.0,0,3,12282.0,0,2,1000.0,1,11.14,0.08,2.0,504,1,0
2,25.0,0,3,12438.0,3,0,5500.0,3,12.87,0.44,3.0,635,0,1
3,23.0,0,1,79753.0,0,3,35000.0,3,15.23,0.44,2.0,675,0,1
4,24.0,1,4,66135.0,1,3,35000.0,3,14.27,0.53,4.0,586,0,1


## Scaling the data

In [56]:
from sklearn.preprocessing import StandardScaler

X = df.drop("loan_status", axis=1)
y = df["loan_status"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [57]:
## bala

In [58]:
from sklearn.utils import class_weight
import numpy as np

class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y),
    y=y
)
class_weights = dict(enumerate(class_weights))
print(class_weights)  # e.g. {0: 0.64, 1: 2.23}


{0: np.float64(0.6428571428571429), 1: np.float64(2.25)}


In [59]:
from sklearn.model_selection import train_test_split

# Suppose X = features, y = target labels
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    test_size=0.2,   # 20% for testing, 80% for training
    random_state=42, # ensures reproducibility
    stratify=y       # optional: keeps class distribution same in train/test
)

print("Train size:", X_train.shape, y_train.shape)
print("Test size:", X_test.shape, y_test.shape)


Train size: (36000, 13) (36000,)
Test size: (9000, 13) (9000,)


In [60]:
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.float32).reshape(-1, 1)

X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32).reshape(-1, 1)

## Building the model

In [61]:

model = nn.Sequential(
    nn.Linear(13, 32),
    nn.ReLU(),
    nn.Linear(32, 1)
)

In [62]:

loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([class_weights[1]]))
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [63]:

num_entries = X_train.size(0) # number of samples
batch_size = 32

In [64]:
for i in range(0, 1000):
    for start in range(0, num_entries, batch_size):
        end = min(num_entries, start + batch_size)
        X_data = X_train[start:end]
        y_data = y_train[start:end]

        optimizer.zero_grad()
        outputs = model(X_data)
        loss = loss_fn(outputs, y_data)
        loss.backward()
        optimizer.step()

    if i % 10 == 0:
        print(loss)


tensor(0.3254, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2096, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2023, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.2002, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1901, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1857, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1837, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1860, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1822, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1795, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1767, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1714, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1671, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1642, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1657, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
tensor(0.1661, grad_fn=<BinaryCrossEntro

In [65]:


model.eval()
with torch.no_grad():
    outputs = model(X_test)
    y_pred = nn.functional.sigmoid(outputs) > 0.5
    y_pred_correct = y_pred.type(torch.float32) == y_test
    print(y_pred_correct.type(torch.float32).mean())


tensor(0.9001)


In [66]:
from sklearn.metrics import classification_report, roc_auc_score

model.eval()
with torch.no_grad():
    outputs = model(X_test)
    y_probs = torch.sigmoid(outputs).numpy()
    y_pred = (y_probs > 0.5).astype(int)
    y_true = y_test.numpy()

print(classification_report(y_true, y_pred, digits=4))
print("ROC AUC:", roc_auc_score(y_true, y_probs))


              precision    recall  f1-score   support

         0.0     0.9593    0.9101    0.9341      7000
         1.0     0.7334    0.8650    0.7938      2000

    accuracy                         0.9001      9000
   macro avg     0.8464    0.8876    0.8639      9000
weighted avg     0.9091    0.9001    0.9029      9000

ROC AUC: 0.9663442857142857
