In [1]:
import pandas as pd
import numpy as np
import string
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from imblearn.over_sampling import SMOTE
import torchvision
from torchvision import datasets, transforms
import matplotlib as mpl
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, random_split, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve, auc

In [2]:
df = pd.read_csv('data_cleaned.csv')
df

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent,has_location,has_employment_type,has_required_experience,has_required_education,has_industry,...,city_ wilmington,city_ woodbridge,city_ woodruff,city_ worcester,city_ İstanbul,city_ Αthens,city_ Αθήνα,city_ ΕΛΛΗΝΙΚΟ,city_ 마포구 동교동,city_Unknown
0,1,0,1,0,0,1,1,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0,1,0,0,1,1,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0,1,0,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0,1,0,0,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0,1,1,0,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,0,1,1,0,1,1,1,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17876,17877,0,1,1,0,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17877,17878,0,0,0,0,1,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17878,17879,0,0,1,0,1,1,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
X = df.drop(columns=['fraudulent', 'job_id'])
y = df['fraudulent'].values

In [4]:
# Convert to PyTorch tensors
X_tensor = torch.tensor(X.values.astype(np.float32), dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

# Create a TensorDataset
full_dataset = TensorDataset(X_tensor, y_tensor)

# Split into train (80%) and test (20%) datasets
train_size = int(0.8 * len(full_dataset))
test_size = len(full_dataset) - train_size
trainset, testset = random_split(full_dataset, [train_size, test_size])

In [5]:
train_size = int(0.8 * len(trainset))
val_size = len(trainset) - train_size

# Split the trainset
train_dataset_new, val_dataset = random_split(trainset, [train_size, val_size])

# Create DataLoader objects
batch_size = 128

trainloader = DataLoader(train_dataset_new, batch_size=batch_size, shuffle=True)
valloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False)

# Check sizes to confirm splitting
print(f"Train dataset size: {len(train_dataset_new)}")
print(f"Validation dataset size: {len(val_dataset)}")
print(f"Test dataset size: {len(testset)}")

print(f"Trainloader: {len(trainloader)}")
print(f"Valloader: {len(valloader)}")
print(f"Testloader: {len(testloader)}")

Train dataset size: 11443
Validation dataset size: 2861
Test dataset size: 3576
Trainloader: 90
Valloader: 23
Testloader: 28


In [None]:
class FNN(nn.Module):
    def __init__(self, input_dim):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)  # Increased hidden layer size
        self.fc2 = nn.Linear(256, 128)        # Increased hidden layer size
        self.fc3 = nn.Linear(128, 64)         # Hidden layer
        self.fc4 = nn.Linear(64, 32)          # Hidden layer
        self.fc5 = nn.Linear(32, 1)           # Output layer

        self.dropout = nn.Dropout(0.3)        # Dropout for regularization

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        x = F.relu(self.fc4(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc5(x))  # Sigmoid for binary classification
        return x

# Model initialization
input_dim = train_dataset_new[0][0].shape[0]  # Get input dimension from dataset
model = FNN(input_dim)

# Define optimizer
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Loss function (Binary Cross-Entropy for binary classification)
loss_fn = nn.BCELoss()

# Initialize variables to store average metrics
total_train_loss = 0
total_train_accuracy = 0

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct_train = 0
    total_train = 0
    
    # Training loop
    for data, target in trainloader:
        optimizer.zero_grad() 

        # Forward pass
        outputs = model(data)

        # Calculate loss
        loss = loss_fn(outputs, target.view(-1, 1).float())  # Reshape target to (batch_size, 1)
        loss.backward()  # Backpropagate
        optimizer.step()  # Update model parameters

        epoch_loss += loss.item()

        # Calculate training accuracy, Binary classification: threshold at 0.5
        predictions = (outputs > 0.5).float()
        correct_train += (predictions.view(-1) == target.view(-1)).sum().item()
        total_train += target.size(0)

    # Calculate average training loss and accuracy
    avg_train_loss = epoch_loss / len(trainloader)
    avg_train_accuracy = 100 * correct_train / total_train

    # Accumulate values for average over all epochs
    total_train_loss += avg_train_loss
    total_train_accuracy += avg_train_accuracy

    # Print training metrics for this epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_train_loss:.4f}, Accuracy: {avg_train_accuracy:.2f}%")

    # Validation loop
    model.eval()
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for data, target in valloader:
            outputs = model(data)
            predictions = (outputs > 0.5).float()
            correct_val += (predictions.view(-1) == target.view(-1)).sum().item()
            total_val += target.size(0)

    # Calculate validation accuracy
    val_accuracy = 100 * correct_val / total_val
    print(f"Validation Accuracy: {val_accuracy:.2f}%")

# Calculate the average loss and accuracy over all epochs
avg_train_loss_over_epochs = total_train_loss / num_epochs
avg_train_accuracy_over_epochs = total_train_accuracy / num_epochs

print(f"Average Training Loss: {avg_train_loss_over_epochs:.4f}")
print(f"Average Training Accuracy: {avg_train_accuracy_over_epochs:.2f}%")

# Testing loop
model.eval()
correct_test = 0
total_test = 0
predictions_list = []
targets_list = []

with torch.no_grad():
    for data, target in testloader:
        outputs = model(data)
        predictions = (outputs > 0.5).float()
        predictions_list.extend(predictions.cpu().numpy())
        targets_list.extend(target.cpu().numpy())
        correct_test += (predictions.view(-1) == target.view(-1)).sum().item()
        total_test += target.size(0)

# Test accuracy
test_accuracy = 100 * correct_test / total_test
print(f"Test Accuracy: {test_accuracy:.2f}%")

# Calculate other metrics like precision, recall, and F1 score
precision = precision_score(targets_list, predictions_list)
recall = recall_score(targets_list, predictions_list)
f1 = f1_score(targets_list, predictions_list)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Calculate precision-recall curve and AUPRC (Area Under the Precision-Recall Curve)
precision_vals, recall_vals, _ = precision_recall_curve(targets_list, predictions_list)
auprc = auc(recall_vals, precision_vals)

print(f"AUPRC (Area Under the Precision-Recall Curve): {auprc:.2f}")


Epoch 1/10, Loss: 0.3304, Accuracy: 92.29%
Validation Accuracy: 95.04%
Epoch 2/10, Loss: 0.2974, Accuracy: 94.98%
Validation Accuracy: 95.00%
Epoch 3/10, Loss: 0.2832, Accuracy: 95.06%
Validation Accuracy: 95.00%
Epoch 4/10, Loss: 0.2876, Accuracy: 95.08%
Validation Accuracy: 95.00%
Epoch 5/10, Loss: 0.2673, Accuracy: 95.07%
Validation Accuracy: 95.00%
Epoch 6/10, Loss: 0.2575, Accuracy: 95.08%
Validation Accuracy: 95.00%
Epoch 7/10, Loss: 0.2506, Accuracy: 95.08%
Validation Accuracy: 95.00%
Epoch 8/10, Loss: 0.2395, Accuracy: 95.08%
Validation Accuracy: 95.00%
Epoch 9/10, Loss: 0.2416, Accuracy: 95.08%
Validation Accuracy: 95.00%
Epoch 10/10, Loss: 0.2356, Accuracy: 95.08%
Validation Accuracy: 95.00%
Average Training Loss: 0.2691
Average Training Accuracy: 94.79%
Test Accuracy: 95.53%
Precision: 0.00
Recall: 0.00
F1 Score: 0.00
AUPRC (Area Under the Precision-Recall Curve): 0.52


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
class FNN(nn.Module):
    def __init__(self, input_dim):
        super(FNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 256)  # Increased hidden layer size
        self.fc2 = nn.Linear(256, 128)        # Increased hidden layer size
        self.fc3 = nn.Linear(128, 64)         # Hidden layer
        self.fc4 = nn.Linear(64, 32)          # Hidden layer
        self.fc5 = nn.Linear(32, 1)           # Output layer

        self.dropout = nn.Dropout(0.3)        # Dropout for regularization

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.relu(self.fc3(x))
        x = self.dropout(x)
        x = F.relu(self.fc4(x))
        x = self.dropout(x)
        x = torch.sigmoid(self.fc5(x))  # Sigmoid for binary classification
        return x

# Model initialization
input_dim = train_dataset_new[0][0].shape[0]  # Get input dimension from dataset
model = FNN(input_dim)

# Define optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Loss function (Binary Cross-Entropy for binary classification)
loss_fn = nn.BCELoss()

# Initialize variables to store average metrics
total_train_loss = 0
total_train_accuracy = 0

num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    correct_train = 0
    total_train = 0
    
    # Training loop
    for data, target in trainloader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(data)

        # Calculate loss
        loss = loss_fn(outputs, target.view(-1, 1).float())  # Reshape target to (batch_size, 1)
        loss.backward()  # Backpropagate
        optimizer.step()  # Update model parameters

        epoch_loss += loss.item()

        # Calculate training accuracy
        predictions = (outputs > 0.5).float()
        correct_train += (predictions.view(-1) == target.view(-1)).sum().item()
        total_train += target.size(0)

    # Calculate average training loss and accuracy
    avg_train_loss = epoch_loss / len(trainloader)
    avg_train_accuracy = 100 * correct_train / total_train

    # Accumulate values for average over all epochs
    total_train_loss += avg_train_loss
    total_train_accuracy += avg_train_accuracy

    # Print training metrics for this epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_train_loss:.4f}, Accuracy: {avg_train_accuracy:.2f}%")

    # Validation loop
    model.eval()
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for data, target in valloader:
            outputs = model(data)
            predictions = (outputs > 0.5).float()
            correct_val += (predictions.view(-1) == target.view(-1)).sum().item()
            total_val += target.size(0)

    # Calculate validation accuracy
    val_accuracy = 100 * correct_val / total_val
    print(f"Validation Accuracy: {val_accuracy:.2f}%")

# Calculate the average loss and accuracy over all epochs
avg_train_loss_over_epochs = total_train_loss / num_epochs
avg_train_accuracy_over_epochs = total_train_accuracy / num_epochs

print(f"Average Training Loss: {avg_train_loss_over_epochs:.4f}")
print(f"Average Training Accuracy: {avg_train_accuracy_over_epochs:.2f}%")

# Testing loop
model.eval()
correct_test = 0
total_test = 0
predictions_list = []
targets_list = []

with torch.no_grad():
    for data, target in testloader:
        outputs = model(data)
        predictions = (outputs > 0.5).float()
        predictions_list.extend(predictions.cpu().numpy())
        targets_list.extend(target.cpu().numpy())
        correct_test += (predictions.view(-1) == target.view(-1)).sum().item()
        total_test += target.size(0)

# Test accuracy
test_accuracy = 100 * correct_test / total_test
print(f"Test Accuracy: {test_accuracy:.2f}%")

# Calculate other metrics like precision, recall, and F1 score
precision = precision_score(targets_list, predictions_list)
recall = recall_score(targets_list, predictions_list)
f1 = f1_score(targets_list, predictions_list)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Calculate precision-recall curve and AUPRC (Area Under the Precision-Recall Curve)
precision_vals, recall_vals, _ = precision_recall_curve(targets_list, predictions_list)
auprc = auc(recall_vals, precision_vals)

print(f"AUPRC (Area Under the Precision-Recall Curve): {auprc:.2f}")


Epoch 1/10, Loss: 0.2305, Accuracy: 94.26%
Validation Accuracy: 95.00%
Epoch 2/10, Loss: 0.1445, Accuracy: 95.09%
Validation Accuracy: 95.00%
Epoch 3/10, Loss: 0.1171, Accuracy: 95.36%
Validation Accuracy: 95.70%
Epoch 4/10, Loss: 0.0996, Accuracy: 96.04%
Validation Accuracy: 95.74%
Epoch 5/10, Loss: 0.0959, Accuracy: 96.61%
Validation Accuracy: 97.55%
Epoch 6/10, Loss: 0.0746, Accuracy: 97.37%
Validation Accuracy: 97.87%
Epoch 7/10, Loss: 0.0829, Accuracy: 97.15%
Validation Accuracy: 97.83%
Epoch 8/10, Loss: 0.0538, Accuracy: 98.17%
Validation Accuracy: 97.31%
Epoch 9/10, Loss: 0.0564, Accuracy: 98.18%
Validation Accuracy: 97.97%
Epoch 10/10, Loss: 0.0699, Accuracy: 97.91%
Validation Accuracy: 96.54%
Average Training Loss: 0.1025
Average Training Accuracy: 96.61%
Test Accuracy: 96.73%
Precision: 0.96
Recall: 0.28
F1 Score: 0.43
AUPRC (Area Under the Precision-Recall Curve): 0.64
