In [12]:
from google.colab import drive

In [13]:
drive.mount('/content/drive')

Mounted at /content/drive


In [14]:
file_path = '/content/drive/My Drive/Colab Notebooks/cleaned_training_data.csv'

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [17]:
from imblearn.over_sampling import SMOTE

In [18]:
data = pd.read_csv(file_path,  header=0)

In [None]:
# # Subsample the majority class
# data_majority = data[data['bad_flag'] == 0]
# data_minority = data[data['bad_flag'] == 1]

# # Downsample majority class
# data_majority_downsampled = data_majority.sample(2*len(data_minority), random_state=42)

# data_balanced = pd.concat([data_majority_downsampled, data_minority])

# # Shuffle the balanced dataset
# data_balanced = data_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [6]:
data.columns


Index(['term', 'int_rate', 'emp_length', 'annual_inc', 'percent_bc_gt_75',
       'dti', 'inq_last_6mths', 'mths_since_recent_inq', 'total_bc_limit',
       'tot_cur_bal', 'internal_score', 'bad_flag', 'purpose_car',
       'purpose_credit_card', 'purpose_debt_consolidation',
       'purpose_home_improvement', 'purpose_house', 'purpose_major_purchase',
       'purpose_medical', 'purpose_moving', 'purpose_other',
       'purpose_renewable_energy', 'purpose_small_business',
       'purpose_vacation', 'purpose_wedding', 'home_ownership_MORTGAGE',
       'home_ownership_OTHER', 'home_ownership_OWN', 'home_ownership_RENT'],
      dtype='object')

In [19]:
y = data['bad_flag'].astype(float)
X = data.drop(columns=['bad_flag'])

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.10, random_state=1000
)

In [21]:
X_t, X_val, y_t, y_val = train_test_split(
    X_train, y_train, stratify=y_train, test_size=0.20, random_state=1000
)

In [22]:
print(y_t.value_counts())

bad_flag
0.0    126956
1.0      9452
Name: count, dtype: int64


In [23]:
print(y_val.value_counts())

bad_flag
0.0    31740
1.0     2363
Name: count, dtype: int64


In [11]:
smote = SMOTE(sampling_strategy=0.1, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_t, y_t)

In [24]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Define the pipeline: SMOTE for oversampling + RandomUnderSampler for undersampling
sampling_strategy_smote = 1.  # Increase minority to 50% of majority
sampling_strategy_under = 1. # Reduce majority to 60% of original

smote = SMOTE(sampling_strategy=sampling_strategy_smote, random_state=42)
under_sampler = RandomUnderSampler(sampling_strategy=sampling_strategy_under, random_state=42)

pipeline = Pipeline([
    ('smote', smote),
    ('under', under_sampler)
])

# Resample the training data
# X_train_smote, y_train_smote = pipeline.fit_resample(X_t, y_t)
X_train_smote, y_train_smote = X_t, y_t

In [25]:
numer = ['term', 'int_rate', 'emp_length', 'annual_inc', 'percent_bc_gt_75',
        'dti', 'inq_last_6mths', 'mths_since_recent_inq',
       'total_bc_limit', 'tot_cur_bal', 'internal_score']
dummy = X_train.columns.difference(numer)

In [26]:
scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_smote[numer])
X_val_num_scaled = scaler.transform(X_val[numer])

In [27]:
X_train_scaled1 = pd.DataFrame(X_train_num_scaled, columns=numer, index=X_train_smote.index)
X_train_scaled = pd.concat([X_train_scaled1, X_train_smote[dummy]], axis=1)

X_val_scaled1 = pd.DataFrame(X_val_num_scaled, columns=numer, index=X_val.index)
X_val_scaled = pd.concat([X_val_scaled1, X_val[dummy]], axis=1)

In [28]:

print(y_train_smote.value_counts())

bad_flag
0.0    126956
1.0      9452
Name: count, dtype: int64


In [29]:
# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_smote.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)

In [17]:
print(X_val_tensor.size(), y_val_tensor.size())

torch.Size([34103, 28]) torch.Size([34103])


In [18]:
print(X_train_tensor.shape, y_train_tensor.shape)

torch.Size([136408, 28]) torch.Size([136408])


In [30]:
class NeuralNet(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3,  output_size):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.relu = nn.ReLU()

        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.relu2 = nn.ReLU()
        self.dropout1 = nn.Dropout(p=0.2)
        self.fc3 = nn.Linear(hidden_size2, hidden_size3)
        self.relu3 = nn.ReLU()
        self.dropout2 = nn.Dropout(p=0.3)
        self.fc4 = nn.Linear(hidden_size3, output_size)
        # self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)

        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout1(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout2(x)
        x = self.fc4(x)
        # x = self.relu4(x)
        # x = self.fc5(x)
        # x = self.sigmoid(x)
        return x

# Model configuration
input_size = X_train_tensor.shape[1]
print(input_size)
hidden_size1 = 64  # Configurable
hidden_size2 = 256
hidden_size3 = 64
output_size = 1

# Instantiate the model
model = NeuralNet(input_size, hidden_size1, hidden_size2, hidden_size3, output_size)

28


In [31]:
def init_weights(m):
    if isinstance(m, nn.Linear):
        # Xavier (Glorot) initialization for weights
        nn.init.xavier_uniform_(m.weight)
        # Initialize biases to zero
        nn.init.zeros_(m.bias)

In [32]:
model.apply(init_weights)

NeuralNet(
  (fc1): Linear(in_features=28, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=256, bias=True)
  (relu2): ReLU()
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=256, out_features=64, bias=True)
  (relu3): ReLU()
  (dropout2): Dropout(p=0.3, inplace=False)
  (fc4): Linear(in_features=64, out_features=1, bias=True)
)

In [33]:

# Training loop
epochs = 200  # Configurable
batch_size = 32
clip_value = 1.0
# Loss function and optimizer
# criterion = nn.BCELoss()  # Binary Cross-Entropy Loss
pos_weight = torch.tensor([126956 / 9452], dtype=torch.float)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)  # Binary Cross-Entropy Loss
# criterion = FocalLoss(alpha=0.25, gamma=2.0, reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='max', factor=0.5, patience=5, verbose=True
)



In [34]:
patience = 10  # Number of epochs with no improvement to wait
# best_val_loss = float("inf")
early_stop_counter = 0

In [20]:
from sklearn.metrics import f1_score, recall_score, precision_score
best_threshold = 0.5
best_f1_score = 0.0

for epoch in range(epochs):

    model.train()
    total_loss = 0.0
    for i in range(0, len(X_train_tensor), batch_size):
        # Get mini-batch
        X_batch = X_train_tensor[i:i+batch_size]
        y_batch = y_train_tensor[i:i+batch_size]

        # Forward pass
        outputs = model(X_batch).squeeze(dim = -1)
        loss = criterion(outputs, y_batch)
        total_loss += loss.item()
        # print("Model output shape:", outputs.shape)
        # print("Target shape:", y_batch.shape)
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)

        optimizer.step()

    with torch.no_grad():
        train_outputs = model(X_train_tensor).squeeze()
        train_predictions = (train_outputs > 0.5).float()
        train_accuracy = accuracy_score(y_train_tensor.numpy(), train_predictions.numpy())
        train_f1 = f1_score(y_train_tensor.numpy(), train_predictions.numpy())
        train_recall = recall_score(y_train_tensor.numpy(), train_predictions.numpy())
        train_precision = precision_score(y_train_tensor.numpy(), train_predictions.numpy())

    # Validation
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor).squeeze()
        val_loss = criterion(val_outputs, y_val_tensor)

        # # Generate probabilities for validation
        val_probabilities = torch.sigmoid(val_outputs).numpy()

        # Search for the best threshold to maximize F1-score
        thresholds = np.arange(0.1, 0.9, 0.01)
        for threshold in thresholds:
            val_predictions = (val_probabilities > threshold).astype(int)
            current_f1 = f1_score(y_val_tensor.numpy(), val_predictions)

            if current_f1 > best_f1_score:
                best_f1_score = current_f1
                best_threshold = threshold

        # Apply the best threshold for current evaluation
        val_predictions = (val_probabilities > best_threshold).astype(int)
        val_accuracy = accuracy_score(y_val_tensor.numpy(), val_predictions)
        val_f1 = f1_score(y_val_tensor.numpy(), val_predictions)
        val_recall = recall_score(y_val_tensor.numpy(), val_predictions)
        val_precision = precision_score(y_val_tensor.numpy(), val_predictions)
    scheduler.step(val_f1)
    # Log epoch results
    print(f"Epoch {epoch+1}/{epochs}, "
          f"Loss: {total_loss / len(X_train_tensor):.4f}, Train Accuracy: {train_accuracy:.4f}, "
          f"Train F1: {train_f1:.4f}, Train Recall: {train_recall:.4f}, Train Precision: {train_precision:.4f}, "
          f"Val Loss: {val_loss.item():.4f}, Val Accuracy: {val_accuracy:.4f}, "
          f"Val F1: {val_f1:.4f}, Val Recall: {val_recall:.4f}, Val Precision: {val_precision:.4f}, "
          f"Best Threshold: {best_threshold:.2f}")

    if val_f1 >= best_f1_score:
        best_f1_score = val_f1
        torch.save(model.state_dict(), "/content/drive/My Drive/Colab Notebooks/neural_net_model.pth")
        print(f"Best model saved with F1-score: {val_f1:.4f}")
        early_stop_counter = 0  # Reset early stopping counter
    else:
        early_stop_counter += 1
    # print(val_f1, best_f1_score, early_stop_counter)
    # Check early stopping condition
    if early_stop_counter >= patience:
        print(f"Early stopping triggered after {epoch+1} epochs!")
        break
# Save the model
torch.save(model.state_dict(), "/content/drive/My Drive/Colab Notebooks/neural_net_model.pth")

Epoch 1/200, Loss: 0.0396, Train Accuracy: 0.8641, Train F1: 0.1790, Train Recall: 0.2137, Train Precision: 0.1539, Val Loss: 1.2047, Val Accuracy: 0.7628, Val F1: 0.2021, Val Recall: 0.4333, Val Precision: 0.1317, Best Threshold: 0.56
Best model saved with F1-score: 0.2021
Epoch 2/200, Loss: 0.0393, Train Accuracy: 0.8574, Train F1: 0.1917, Train Recall: 0.2440, Train Precision: 0.1579, Val Loss: 1.2077, Val Accuracy: 0.7697, Val F1: 0.2068, Val Recall: 0.4333, Val Precision: 0.1358, Best Threshold: 0.56
Best model saved with F1-score: 0.2068
Epoch 3/200, Loss: 0.0397, Train Accuracy: 0.8647, Train F1: 0.1944, Train Recall: 0.2355, Train Precision: 0.1655, Val Loss: 1.2223, Val Accuracy: 0.7862, Val F1: 0.2105, Val Recall: 0.4113, Val Precision: 0.1414, Best Threshold: 0.56
Best model saved with F1-score: 0.2105
Epoch 4/200, Loss: 0.0401, Train Accuracy: 0.8699, Train F1: 0.1965, Train Recall: 0.2296, Train Precision: 0.1717, Val Loss: 1.2225, Val Accuracy: 0.7781, Val F1: 0.2082, Val

In [35]:
model_path = "/content/drive/My Drive/Colab Notebooks/neural_net_model.pth"  # Path where your model was saved
model.load_state_dict(torch.load(model_path))
model.eval()

  model.load_state_dict(torch.load(model_path))


NeuralNet(
  (fc1): Linear(in_features=28, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=256, bias=True)
  (relu2): ReLU()
  (dropout1): Dropout(p=0.2, inplace=False)
  (fc3): Linear(in_features=256, out_features=64, bias=True)
  (relu3): ReLU()
  (dropout2): Dropout(p=0.3, inplace=False)
  (fc4): Linear(in_features=64, out_features=1, bias=True)
)

In [22]:
# Preprocess X_test (if not done earlier)
X_test_num_scaled = scaler.transform(X_test[numer])  # Only scale the numerical features
X_test_scaled1 = pd.DataFrame(X_test_num_scaled, columns=numer, index=X_test.index)
X_test_scaled = pd.concat([X_test_scaled1, X_test[dummy]], axis=1)

# Convert to PyTorch tensor
X_test_tensor = torch.tensor(X_test_scaled.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

In [23]:
with torch.no_grad():
    test_outputs = model(X_test_tensor).squeeze()  # Raw logits

# Convert logits to probabilities using sigmoid
test_probabilities = torch.sigmoid(test_outputs).numpy()

# Apply the best threshold for classification
test_predictions = (test_probabilities > best_threshold).astype(int)

In [24]:
test_accuracy = accuracy_score(y_test_tensor.numpy(), test_predictions)
test_f1 = f1_score(y_test_tensor.numpy(), test_predictions)
test_recall = recall_score(y_test_tensor.numpy(), test_predictions)
test_precision = precision_score(y_test_tensor.numpy(), test_predictions)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test Precision: {test_precision:.4f}")

Test Accuracy: 0.8385
Test F1 Score: 0.2015
Test Recall: 0.2940
Test Precision: 0.1532


In [36]:
def remove_outliers_iqr(df, columns, factor=1.5):
    df_cleaned = df.copy()  # Make a copy of the DataFrame to avoid modifying original data

    for column in columns:
        if column in df_cleaned.columns:
            Q1 = df_cleaned[column].quantile(0.25)  # 25th percentile
            Q3 = df_cleaned[column].quantile(0.75)  # 75th percentile
            IQR = Q3 - Q1

            lower_bound = Q1 - factor * IQR
            upper_bound = Q3 + factor * IQR

            # Filter rows within the IQR bounds
            df_cleaned[column] = df_cleaned[column].clip(lower=lower_bound, upper=upper_bound)

        else:
            print(f"Warning: Column '{column}' not found in DataFrame.")

    return df_cleaned



In [37]:
from torch.utils.data import DataLoader, TensorDataset

In [38]:
test_data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/testing_loan_data.csv")

  test_data = pd.read_csv("/content/drive/My Drive/Colab Notebooks/testing_loan_data.csv")


In [29]:
test_data.head()

Unnamed: 0,id,member_id,loan_amnt,term,int_rate,emp_length,home_ownership,annual_inc,desc,purpose,...,inq_last_6mths,mths_since_recent_inq,revol_util,total_bc_limit,mths_since_last_major_derog,tot_hi_cred_lim,tot_cur_bal,application_approved_flag,internal_score,bad_flag
0,20000001,22419852,10000,36 months,22.15%,8 years,RENT,37000.0,,debt_consolidation,...,1,3.0,73.10%,16200,,14877.17028,36809,1,131,
1,20000002,22349118,1400,36 months,18.24%,6 years,RENT,41000.0,,other,...,0,9.0,11.50%,4000,,4097.30477,19536,1,19,
2,20000003,22398818,7000,36 months,12.49%,3 years,RENT,68900.0,,debt_consolidation,...,0,11.0,48.10%,11900,80.0,12688.49516,241465,1,92,
3,20000004,22419015,18000,60 months,16.29%,9 years,MORTGAGE,41000.0,,debt_consolidation,...,1,0.0,38.10%,7600,73.0,7908.799817,179757,1,235,
4,20000005,22388614,12000,36 months,12.99%,10+ years,MORTGAGE,64000.0,,home_improvement,...,0,,57.90%,21000,,19378.56106,31953,1,157,


In [39]:
miscol = ['id', 'application_approved_flag', 'tot_hi_cred_lim', 'revol_util', 'loan_amnt', 'bc_util', 'desc', 'member_id', 'mths_since_last_major_derog']  ##drop useless columns
test_data.drop(miscol, axis=1, inplace=True)

In [40]:
test_data['term'] = test_data['term'].str.extract('(\d+)').astype(float)
test_data['emp_length'] = test_data['emp_length'].str.extract('(\d+)').astype(float)
test_data['emp_length'] = test_data['emp_length'].fillna(0)  # Assume missing values as 0 (less than a year)

In [41]:
percentage_columns = ['int_rate']
for col in percentage_columns:
    if col in test_data.columns and test_data[col].dtype == 'object':
        test_data[col] = test_data[col].str.replace('%', '').astype(float) / 100

In [44]:
test_data['mths_since_recent_inq'] = test_data['mths_since_recent_inq'].fillna(0)

In [42]:
columns_to_impute = [
       'percent_bc_gt_75',
      'total_bc_limit',  'tot_cur_bal'
]

# Replace missing values with the median for each column
for col in columns_to_impute:
    test_data[col] = test_data[col].fillna(test_data[col].median())

In [47]:
test_data['home_ownership'] = test_data['home_ownership'].replace(['OTHER', 'NONE'], 'OTHER')

In [48]:
categorical_columns = ['purpose', 'home_ownership']
test_data_encoded = pd.get_dummies(test_data, columns=categorical_columns, drop_first=False, dtype=int)

In [49]:
# Columns to clean
columns_to_check = [ 'term', 'int_rate', 'emp_length', 'annual_inc', 'percent_bc_gt_75',
        'dti', 'inq_last_6mths', 'mths_since_recent_inq',
       'total_bc_limit', 'tot_cur_bal', 'internal_score']


# Remove outliers
cleaned_data = remove_outliers_iqr(test_data_encoded, columns_to_check)

In [55]:
dummy = cleaned_data.columns.difference(numer)

In [56]:
cleaned_data_num_scaled = scaler.transform(cleaned_data[numer])
cleaned_data_scaled1 = pd.DataFrame(cleaned_data_num_scaled, columns=numer, index=cleaned_data.index)
cleaned_data_scaled = pd.concat([cleaned_data_scaled1, cleaned_data[dummy]], axis=1)

In [57]:
# Convert test data to PyTorch tensors
X_test_tensor = torch.tensor(cleaned_data_scaled.values, dtype=torch.float32)

# Create DataLoader for batch processing (optional)
test_loader = DataLoader(TensorDataset(X_test_tensor), batch_size=32, shuffle=False)


In [59]:

all_predictions = []

with torch.no_grad():
    for inputs in test_loader:
        inputs = inputs[0]  # Extract inputs from TensorDataset
        outputs = model(inputs).squeeze()  # Forward pass
        probabilities = torch.sigmoid(outputs)  # Convert logits to probabilities

        # Apply the best threshold
        predictions = (probabilities > 0.56).int()

        all_predictions.extend(predictions.tolist())

# ===========================
# 4. Fill Target Column
# ===========================
# Insert predictions into the test dataset
test_data["bad_flag"] = all_predictions  # Replace 'target' with the actual column name

# Save the updated test dataset
test_data.to_csv("/content/drive/My Drive/Colab Notebooks/test_predictions_filled.csv", index=False)

print("Predictions saved successfully!")

Predictions saved successfully!
