In [7]:
import numpy as np
import os


with open("train_indices.txt") as f:
    train_indices = set(f.read().strip().split())

with open("test_indices.txt") as f:
    test_indices = set(f.read().strip().split())

with open("val_indices.txt") as f:
    val_indices = set(f.read().strip().split())

train_arrays = []
test_arrays = []
val_arrays = []
for f in os.listdir("samples"):
    if f[:-6] in train_indices:
        arr = np.loadtxt(f"samples/{f}", delimiter=',')
        if arr.size == 0:
            continue
        if arr.ndim == 1:
            arr = arr.reshape(1,24)
        train_arrays.append(arr)
    elif f[:-6] in test_indices:
        arr = np.loadtxt(f"samples/{f}", delimiter=',')
        if arr.size == 0:
            continue
        if arr.ndim == 1:
            arr = arr.reshape(1,24)
        test_arrays.append(arr)
    elif f[:-6] in val_indices:
        arr = np.loadtxt(f"samples/{f}", delimiter=',')
        if arr.size == 0:
            continue
        if arr.ndim == 1:
            arr = arr.reshape(1,24)
        val_arrays.append(arr)


all_train = np.vstack(train_arrays)
all_test = np.vstack(test_arrays)
all_val = np.vstack(val_arrays)

all_train_unique = np.unique(all_train, axis=0)
all_test_unique = np.unique(all_test, axis=0)
all_val_unique = np.unique(all_val, axis=0)

labels_train = all_train_unique[:, 22]
labels_test = all_test_unique[:, 22]
labels_val = all_val_unique[:, 22]

inputs_train = np.delete(all_train_unique, -2, axis=1)
inputs_test = np.delete(all_test_unique, -2, axis=1)
inputs_val = np.delete(all_val_unique, -2, axis=1)

In [8]:
print(inputs_train.shape, inputs_test.shape, inputs_val.shape)
print(labels_train.shape, labels_test.shape, labels_val.shape)

(2196724, 23) (283543, 23) (282553, 23)
(2196724,) (283543,) (282553,)


In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix


# Initialize the scaler
scaler = StandardScaler()

# Fit on training data and transform both training and test data
X_train = scaler.fit_transform(inputs_train)
X_test = scaler.transform(inputs_test)
X_val = scaler.transform(inputs_val)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=labels_train)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)

# Convert to PyTorch tensors
X_train, X_test, X_val = torch.tensor(X_train, dtype=torch.float32), torch.tensor(X_test, dtype=torch.float32), torch.tensor(X_val, dtype=torch.float32)
y_train, y_test, y_val = torch.tensor(labels_train, dtype=torch.long), torch.tensor(labels_test, dtype=torch.long), torch.tensor(labels_val, dtype=torch.long)

# Create DataLoaders
batch_size = 32
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)
val_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)

# ---- 3. Define MLP Model ----
class MLP(nn.Module):
    def __init__(self, input_size=23, hidden_size=64):
        super(MLP, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 2)  # Output size = 2 for binary classification
        )
    
    def forward(self, x):
        return self.model(x)

# ---- 4. Initialize Model, Loss, and Optimizer ----
model = MLP().to(device)
# criterion = nn.CrossEntropyLoss()  # Works with non-one-hot labels (0,1)
criterion = nn.CrossEntropyLoss(weight=class_weights)  # Define loss function with class weights
optimizer = optim.Adam(model.parameters(), lr=0.001)

# ---- 5. Train the Model ----
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)  # Move to GPU if available
        
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)  # CrossEntropyLoss expects class indices (0 or 1)
        
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")
    
    model.eval()  # Set the model to evaluation mode
    test_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():  # Disable gradient computation for evaluation
        for batch_X, batch_y in val_loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)  # Move to GPU if available
            
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            test_loss += loss.item()

            # Calculate accuracy (optional)
            _, predicted = torch.max(outputs.data, 1)  # Get the predicted class
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()

    # Print test set metrics
    print(f"Validation Loss: {test_loss / len(test_loader):.4f}, Validation Accuracy: {100 * correct / total:.2f}%")


model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        outputs = model(batch_X)
        predicted = torch.argmax(outputs, dim=1)  # Get class predictions
        
        all_preds.extend(predicted.cpu().numpy())  # Store predictions
        all_labels.extend(batch_y.cpu().numpy())   # Store actual labels

# ---- 7. Compute Accuracy, Precision, Recall, F1-score ----
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, digits=4))

# Compute confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)

# Print confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)


Epoch [1/10], Loss: 0.6024
Test Loss: 0.5893, Test Accuracy: 62.60%
Epoch [2/10], Loss: 0.5947
Test Loss: 0.5864, Test Accuracy: 62.74%
Epoch [3/10], Loss: 0.5937
Test Loss: 0.5871, Test Accuracy: 61.41%
Epoch [4/10], Loss: 0.5925
Test Loss: 0.5837, Test Accuracy: 62.86%
Epoch [5/10], Loss: 0.5917
Test Loss: 0.5835, Test Accuracy: 62.41%
Epoch [6/10], Loss: 0.5910
Test Loss: 0.5851, Test Accuracy: 62.97%
Epoch [7/10], Loss: 0.5910
Test Loss: 0.5863, Test Accuracy: 62.43%
Epoch [8/10], Loss: 0.5901
Test Loss: 0.5867, Test Accuracy: 62.18%
Epoch [9/10], Loss: 0.5901
Test Loss: 0.5826, Test Accuracy: 62.86%
Epoch [10/10], Loss: 0.5899
Test Loss: 0.5866, Test Accuracy: 63.26%

Classification Report:
              precision    recall  f1-score   support

           0     0.8929    0.5886    0.7095    214867
           1     0.3771    0.7792    0.5082     68676

    accuracy                         0.6348    283543
   macro avg     0.6350    0.6839    0.6089    283543
weighted avg     0.7680