In [9]:
import torch
import random
from torch.utils.data import Dataset, DataLoader
import numpy as np


In [10]:
class SparseDNABinaryDataset(Dataset):
    def __init__(self, n_samples=1000, seq_len=10):
        self.bases = ['A', 'C', 'G', 'T']
        self.seq_len = seq_len
        self.data = []

        for _ in range(n_samples):
            seq = [random.choice(self.bases) for _ in range(seq_len)]
            label = 1 if seq[2] == 'A' or seq[4] == 'T' else 0
            self.data.append((seq, label))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        seq, label = self.data[idx]
        # Encode DNA bases as integers
        base2idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
        encoded_seq = torch.tensor([base2idx[base] for base in seq], dtype=torch.long)
        label = torch.tensor(label, dtype=torch.float)
        return encoded_seq, label

In [11]:
import torch.nn as nn

In [12]:
class TransformerClassifier(nn.Module):
    def __init__(self, vocab_size, seq_len, d_model=32, nhead=2, num_layers=1, ff_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_embedding = nn.Parameter(torch.randn(1, seq_len, d_model))
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=ff_dim, batch_first=True)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.classifier = nn.Sequential(
            nn.Linear(d_model * seq_len, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, 1)
        )

    def forward(self, x):
        x = self.embedding(x) + self.pos_embedding[:, :x.size(1), :]
        x = self.transformer_encoder(x)
        x = x.flatten(start_dim=1)  # flatten all tokens
        out = self.classifier(x).squeeze(1)
        return out

In [13]:
from torchmetrics import HingeLoss

def train_model(model, dataloader, epochs=50):
    model.train()
    loss_fn = HingeLoss(task='binary')
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    for epoch in range(epochs):
        total_loss = 0
        for x, y in dataloader:
            optimizer.zero_grad()
            pred = model(x)
            loss = loss_fn(pred, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1} | Loss: {total_loss/len(dataloader):.4f}")


def evaluate_model(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in dataloader:
            out = model(x)
            preds = (out>=0.5)
            correct += (preds == y).sum().item()
            total += y.size(0)

    print(f"Accuracy: {correct / total * 100:.2f}%")


In [15]:
dna_dataset = SparseDNABinaryDataset()
dna_loader = DataLoader(dna_dataset, batch_size=32, shuffle=True)
model_dna = TransformerClassifier(vocab_size=4, seq_len=10)
print("\n--- Training on DNA Classification ---")
train_model(model_dna, dna_loader)
evaluate_model(model_dna, dna_loader)


--- Training on DNA Classification ---
Epoch 1 | Loss: 1.0089
Epoch 2 | Loss: 0.7813
Epoch 3 | Loss: 0.5993
Epoch 4 | Loss: 0.5735
Epoch 5 | Loss: 0.5841
Epoch 6 | Loss: 0.5778
Epoch 7 | Loss: 0.5893
Epoch 8 | Loss: 0.5775
Epoch 9 | Loss: 0.5833
Epoch 10 | Loss: 0.5774
Epoch 11 | Loss: 0.5861
Epoch 12 | Loss: 0.5831
Epoch 13 | Loss: 0.5890
Epoch 14 | Loss: 0.5802
Epoch 15 | Loss: 0.5860
Epoch 16 | Loss: 0.5743
Epoch 17 | Loss: 0.5889
Epoch 18 | Loss: 0.5831
Epoch 19 | Loss: 0.5831
Epoch 20 | Loss: 0.5831
Epoch 21 | Loss: 0.5801
Epoch 22 | Loss: 0.5860
Epoch 23 | Loss: 0.5801
Epoch 24 | Loss: 0.5772
Epoch 25 | Loss: 0.5742
Epoch 26 | Loss: 0.5742
Epoch 27 | Loss: 0.5830
Epoch 28 | Loss: 0.5918
Epoch 29 | Loss: 0.5830
Epoch 30 | Loss: 0.5772
Epoch 31 | Loss: 0.5830
Epoch 32 | Loss: 0.5801
Epoch 33 | Loss: 0.5830
Epoch 34 | Loss: 0.5684
Epoch 35 | Loss: 0.5830
Epoch 36 | Loss: 0.5860
Epoch 37 | Loss: 0.5859
Epoch 38 | Loss: 0.5830
Epoch 39 | Loss: 0.5801
Epoch 40 | Loss: 0.5859
Epoch 41 