In [1]:
!pip install torch pandas numpy scikit-learn

Collecting torch
  Downloading torch-2.6.0-cp310-cp310-manylinux1_x86_64.whl (766.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:03[0m
Collecting filelock (from torch)
  Downloading filelock-3.17.0-py3-none-any.whl (16 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Downloading typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0ma [36m0:0

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import json
import pandas as pd
from torch.utils.data import DataLoader, Dataset

class PSSP_Dataset(Dataset):
    def __init__(self, embeddings_file, labels_csv, q_type="q3"):
        # Load embeddings (from JSON)
        with open(embeddings_file, "r") as f:
            self.embeddings = json.load(f)

        # Load labels (from CSV)
        self.labels_df = pd.read_csv(labels_csv)
        self.q_type = q_type
        self.sequences = list(self.embeddings.keys())

        # Define label mappings
        self.label_map = {
            'q3': {'C': 0, 'H': 1, 'E': 2},  # Q3 labels
            'q8': {'H': 0, 'G': 1, 'I': 2, 'E': 3, 'B': 4, 'T': 5, 'S': 6, 'C': 7}  # Q8 labels
        }[q_type]  # Choose based on q_type

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        seq = self.sequences[idx]
        X = torch.tensor(self.embeddings[seq], dtype=torch.float32)  # (L, 1024)

        # Get labels
        label_str = self.labels_df.iloc[idx, 1]  # Labels are in the second column
        y = torch.tensor([self.label_map[l] for l in label_str], dtype=torch.long)  # (L,)

        return X, y


In [14]:
def collate_fn(batch):
    X_batch, y_batch = zip(*batch)  # Unzip inputs and labels

    # Pad sequences to the longest in the batch
    X_padded = pad_sequence(X_batch, batch_first=True, padding_value=0)  # Pad inputs
    y_padded = pad_sequence(y_batch, batch_first=True, padding_value=-1)  # Pad labels with -1 (ignored in loss)

    return X_padded, y_padded

In [16]:
batch_size = 32
train_dataset = PSSP_Dataset("cb513_embeddings.json", "CB513.csv", q_type="q3")
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

test_dataset = PSSP_Dataset("casp14_embeddings.json", "CASP14.csv", q_type="q3")
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [17]:
class TransformerModel(nn.Module):
    def __init__(self, input_dim=1024, num_classes=3, num_heads=8, num_layers=2, hidden_dim=512, dropout=0.1):
        super(TransformerModel, self).__init__()

        self.embedding = nn.Linear(input_dim, hidden_dim)  # Project input to hidden_dim
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, batch_first=True, dropout=dropout),
            num_layers=num_layers
        )
        self.fc = nn.Linear(hidden_dim, num_classes)  # Output layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)  # (batch, seq_len, hidden_dim)
        x = self.transformer(x)  # (batch, seq_len, hidden_dim)
        x = self.fc(self.dropout(x))  # (batch, seq_len, num_classes)
        return x


In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize model, loss, optimizer
model = TransformerModel(input_dim=1024, num_classes=3).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=-1)  # Ignore padding labels
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()

        outputs = model(X)  # (batch, seq_len, num_classes)
        loss = criterion(outputs.view(-1, 3), y.view(-1))  # Flatten and compute loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")


NameError: name 'pad_sequence' is not defined