In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Load CSV
df = pd.read_csv('../data/PepBERT_embeddings.csv')

# Feature matrix (160 columns) and labels
X = df.loc[:, '0':'159'].values.astype(np.float32)
y = df['Class'].values

# Encode string labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y).astype(np.int64)

# Convert to PyTorch tensors
X_tensor = torch.tensor(X)
y_tensor = torch.tensor(y_encoded)

# Dataset and split
dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_ds, test_ds = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)

# Define a simple neural network
class ClassifierNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(ClassifierNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.out = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)

# Model init
input_dim = 160
hidden_dim = 128
num_classes = len(label_encoder.classes_)
model = ClassifierNN(input_dim, hidden_dim, num_classes)

# Training setup
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
epochs = 200

# Training loop
for epoch in range(epochs):
    model.train()
    for xb, yb in train_loader:
        preds = model(xb)
        loss = criterion(preds, yb)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}")

# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for xb, yb in test_loader:
        outputs = model(xb)
        _, predicted = torch.max(outputs, 1)
        all_preds.extend(predicted.numpy())
        all_labels.extend(yb.numpy())

# Accuracy and report
acc = accuracy_score(all_labels, all_preds)
print(f"\nTest Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))


Epoch 1/200, Loss: 0.6054
Epoch 2/200, Loss: 0.5458
Epoch 3/200, Loss: 0.4530
Epoch 4/200, Loss: 0.4972
Epoch 5/200, Loss: 0.6024
Epoch 6/200, Loss: 0.4133
Epoch 7/200, Loss: 0.3145
Epoch 8/200, Loss: 0.4465
Epoch 9/200, Loss: 0.6192
Epoch 10/200, Loss: 0.7287
Epoch 11/200, Loss: 0.3037
Epoch 12/200, Loss: 0.5381
Epoch 13/200, Loss: 0.3256
Epoch 14/200, Loss: 0.4255
Epoch 15/200, Loss: 0.4538
Epoch 16/200, Loss: 0.4121
Epoch 17/200, Loss: 0.5797
Epoch 18/200, Loss: 0.4453
Epoch 19/200, Loss: 0.4827
Epoch 20/200, Loss: 0.6660
Epoch 21/200, Loss: 0.3294
Epoch 22/200, Loss: 0.3979
Epoch 23/200, Loss: 0.3661
Epoch 24/200, Loss: 0.1897
Epoch 25/200, Loss: 0.3707
Epoch 26/200, Loss: 0.3539
Epoch 27/200, Loss: 0.4244
Epoch 28/200, Loss: 0.3977
Epoch 29/200, Loss: 0.3008
Epoch 30/200, Loss: 0.3682
Epoch 31/200, Loss: 0.2570
Epoch 32/200, Loss: 0.3062
Epoch 33/200, Loss: 0.3459
Epoch 34/200, Loss: 0.3375
Epoch 35/200, Loss: 0.4227
Epoch 36/200, Loss: 0.4286
Epoch 37/200, Loss: 0.2992
Epoch 38/2