# Classification de la structure secondaire des protéines avec ProtBERT + Transformer

Ce notebook montre comment prédire la structure secondaire à trois états (H = hélice, E = brin β, C = coil) pour chaque résidu d'une protéine.

Nous allons :
- Charger les datasets (embeddings ProtBERT)
- Définir le modèle Transformer
- Entraîner et évaluer le modèle
- Stocker les métriques et afficher un résumé complet


In [1]:
# Librairies Python de base
import os
import numpy as np
from itertools import product

# PyTorch pour le deep learning
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Visualisation et métriques
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, confusion_matrix

# Barre de progression pour le training
from tqdm import tqdm


In [3]:
# Dataset custom pour les embeddings ProtBERT + labels H/E/C
class ProtBERTDataset(Dataset):
    """Per-residue embeddings paired with their label sequences.

    Reads ``labels.npy`` from ``embeddings_dir`` and, for entry ``i`` of that
    array, the matching ``seq_{i}.npy`` embedding file. An entry is kept only
    when its embedding file exists and the embedding's first dimension equals
    the number of labels. Every skipped entry is reported on stdout so that
    missing or inconsistent data does not go unnoticed.

    Args:
        embeddings_dir: Directory holding ``labels.npy`` and the
            ``seq_{i}.npy`` files.

    Raises:
        FileNotFoundError: If ``labels.npy`` is not found in ``embeddings_dir``.
    """

    def __init__(self, embeddings_dir: str) -> None:
        self.embeddings, self.labels = [], []
        labels_path = os.path.join(embeddings_dir, "labels.npy")
        # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary
        # objects; only load label files that come from a trusted source.
        labels_list = np.load(labels_path, allow_pickle=True)

        for i, lab in enumerate(labels_list):
            emb_path = os.path.join(embeddings_dir, f"seq_{i}.npy")
            try:
                emb = np.load(emb_path)
            except FileNotFoundError:
                # Report the skip, like the length-mismatch case below,
                # instead of dropping the sequence silently.
                print(f"Séquence {i} ignorée (embedding introuvable : {emb_path})")
                continue
            if emb.shape[0] != len(lab):
                print(f"Séquence {i} ignorée (embedding={emb.shape[0]}, labels={len(lab)})")
                continue
            # Store both as tensors: float32 features, int64 class indices.
            self.embeddings.append(torch.tensor(emb, dtype=torch.float32))
            self.labels.append(torch.tensor(lab, dtype=torch.long))

    def __len__(self) -> int:
        """Return the number of sequences that were kept."""
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return ``(embedding, labels, length)`` for sequence ``idx``.

        ``length`` is the embedding's first dimension; it is returned so the
        collate step can build padding masks.
        """
        emb = self.embeddings[idx]
        return emb, self.labels[idx], emb.shape[0]

# Fonction de collate pour gérer les séquences de longueurs différentes
def collate_batch(batch):
    """Merge variable-length samples into one padded batch.

    Args:
        batch: Iterable of ``(embedding, labels, length)`` triples.

    Returns:
        A tuple ``(features, labels, lengths)``: the embeddings padded with
        zeros and the labels padded with ``-100``, both batch-first, plus the
        original lengths as a tensor.
    """
    embs, tags, sizes = zip(*batch)
    batch_lengths = torch.tensor(sizes)
    # -100 marks padded label positions so they can be told apart from classes.
    label_batch = pad_sequence(tags, batch_first=True, padding_value=-100)
    feature_batch = pad_sequence(embs, batch_first=True)
    return feature_batch, label_batch, batch_lengths


In [4]:
#modèle
class ProtBERT_Transformer(nn.Module):
    """Transformer encoder producing one logit vector per sequence position.

    The input features are projected linearly to ``hidden_dim``, passed
    through a stack of self-attention encoder layers, and mapped to
    ``output_dim`` logits at every position.

    Args:
        input_dim: Size of the incoming per-position feature vectors.
        hidden_dim: Model width used inside the encoder.
        num_layers: Number of stacked encoder layers.
        num_heads: Attention heads per encoder layer.
        output_dim: Number of classes predicted per position.
        dropout: Dropout rate inside the encoder layers.
    """

    def __init__(self, input_dim=1024, hidden_dim=256, num_layers=2, num_heads=4, output_dim=3, dropout=0.1):
        super().__init__()
        # Bring the input features down to the encoder width.
        self.embedding_projection = nn.Linear(in_features=input_dim, out_features=hidden_dim)

        # Batch-first self-attention stack; the feed-forward width is twice
        # the model width.
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=hidden_dim,
                nhead=num_heads,
                dim_feedforward=2 * hidden_dim,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )

        # Per-position classification head.
        self.classifier = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x, lengths):
        """Compute per-position logits for a padded batch.

        Args:
            x: Batch-first tensor ``(batch, seq, input_dim)``.
            lengths: 1-D tensor with the true length of each sequence; must be
                on the same device as ``x``.

        Returns:
            Tensor of logits with ``output_dim`` values per position.
        """
        # True wherever the position index is at or beyond the sequence's real
        # length, i.e. on padding; the encoder ignores those keys.
        positions = torch.arange(x.size(1), device=x.device)
        padding_mask = positions.unsqueeze(0) >= lengths.unsqueeze(1)

        hidden = self.encoder(
            self.embedding_projection(x),
            src_key_padding_mask=padding_mask,
        )
        return self.classifier(hidden)


In [5]:
# Entraînement d'une époque
def train_one_epoch(model, dataloader, optimizer, loss_fn, device):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module called as ``model(features, lengths)``.
        dataloader: Iterable yielding ``(features, labels, lengths)`` batches.
        optimizer: Optimizer stepping the model's parameters.
        loss_fn: Loss applied to flattened logits and flattened labels.
        device: Device every batch is moved to before the forward pass.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    running_loss = 0
    for features, targets, seq_lens in tqdm(dataloader, leave=False):
        features = features.to(device)
        targets = targets.to(device)
        seq_lens = seq_lens.to(device)

        optimizer.zero_grad()
        scores = model(features, seq_lens)
        # Flatten batch and sequence axes: one row of class scores per position.
        n_classes = scores.shape[-1]
        batch_loss = loss_fn(scores.view(-1, n_classes), targets.view(-1))
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    n_batches = len(dataloader)
    return running_loss / n_batches

# Évaluation
@torch.no_grad()
def evaluate(model, dataloader, loss_fn, device):
    """Score ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Module called as ``model(features, lengths)``.
        dataloader: Iterable yielding ``(features, labels, lengths)`` batches.
        loss_fn: Loss applied to flattened logits and flattened labels.
        device: Device every batch is moved to before the forward pass.

    Returns:
        A tuple ``(mean_loss, y_true, y_pred)``: the per-batch loss averaged
        over the batches, then the true and predicted class indices as NumPy
        arrays restricted to positions whose label is not ``-100``.
    """
    model.eval()
    running_loss = 0
    true_labels, pred_labels = [], []
    for features, targets, seq_lens in tqdm(dataloader, leave=False):
        features = features.to(device)
        targets = targets.to(device)
        seq_lens = seq_lens.to(device)

        scores = model(features, seq_lens)
        n_classes = scores.shape[-1]
        batch_loss = loss_fn(scores.view(-1, n_classes), targets.view(-1))
        running_loss += batch_loss.item()

        # Keep only real positions; -100 marks padded labels.
        keep = targets != -100
        best_class = scores.argmax(dim=-1)
        true_labels.extend(targets[keep].cpu().numpy())
        pred_labels.extend(best_class[keep].cpu().numpy())

    mean_loss = running_loss / len(dataloader)
    return mean_loss, np.array(true_labels), np.array(pred_labels)


In [9]:
# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One DataLoader per split; only the training split is shuffled.
# NOTE: these paths are relative to the current working directory.
train_loader = DataLoader(
    ProtBERTDataset("Code/ProtSecPred---Deep-Learning-Pipeline-for-Protein-Secondary-Structure-Classification/data/embeddings_train"),
    batch_size=2,
    shuffle=True,
    collate_fn=collate_batch,
)
valid_loader = DataLoader(
    ProtBERTDataset("Code/ProtSecPred---Deep-Learning-Pipeline-for-Protein-Secondary-Structure-Classification/data/embeddings_valid"),
    batch_size=2,
    shuffle=False,
    collate_fn=collate_batch,
)
test_loader = DataLoader(
    ProtBERTDataset("Code/ProtSecPred---Deep-Learning-Pipeline-for-Protein-Secondary-Structure-Classification/data/embeddings_testing"),
    batch_size=2,
    shuffle=False,
    collate_fn=collate_batch,
)

# Model, loss and optimizer. ignore_index=-100 makes the loss skip the
# positions that collate_batch pads with -100.
model = ProtBERT_Transformer().to(device)
loss_fn = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

n_epochs = 8
all_results = []

for epoch in range(n_epochs):
    print(f"\n=== Époque {epoch+1}/{n_epochs} ===")
    train_loss = train_one_epoch(model, train_loader, optimizer, loss_fn, device)
    val_loss, y_val_true, y_val_pred = evaluate(model, valid_loader, loss_fn, device)

    # Validation metrics over all non-padded positions.
    acc = accuracy_score(y_true=y_val_true, y_pred=y_val_pred)
    bal_acc = balanced_accuracy_score(y_true=y_val_true, y_pred=y_val_pred)
    macro_f1 = f1_score(y_true=y_val_true, y_pred=y_val_pred, average='macro')
    # Q3 is the fraction of positions predicted correctly.
    q3 = (y_val_true == y_val_pred).mean()

    print(f"Train loss={train_loss:.4f}, Val loss={val_loss:.4f}")
    print(f"Accuracy={acc:.4f}, Balanced={bal_acc:.4f}, Macro F1={macro_f1:.4f}, Q3={q3:.4f}")

    # Keep one record per epoch for the summary printed later.
    all_results.append(dict(
        epoch=epoch + 1,
        train_loss=train_loss,
        val_loss=val_loss,
        accuracy=acc,
        balanced_accuracy=bal_acc,
        macro_f1=macro_f1,
        Q3=q3,
    ))


FileNotFoundError: [Errno 2] No such file or directory: 'Code/ProtSecPred---Deep-Learning-Pipeline-for-Protein-Secondary-Structure-Classification/data/embeddings_train\\labels.npy'

In [None]:
# Recap of every metric recorded during training, one line per epoch.
print("\n=== Résultats complets par époque ===")
for r in all_results:
    summary = (
        f"Époque {r['epoch']}: "
        f"Train loss={r['train_loss']:.4f}, Val loss={r['val_loss']:.4f}, "
        f"Accuracy={r['accuracy']:.4f}, Balanced={r['balanced_accuracy']:.4f}, "
        f"Macro F1={r['macro_f1']:.4f}, Q3={r['Q3']:.4f}"
    )
    print(summary)


In [None]:
# Evaluate on the test split and plot the confusion matrix.
_, y_test_true, y_test_pred = evaluate(model, test_loader, loss_fn, device)

# NOTE(review): assumes class indices 0, 1, 2 correspond to H, E, C in that
# order — confirm against the label encoding used to build labels.npy.
class_names = ["H", "E", "C"]

# Pin the label set so the matrix is always 3x3 and stays aligned with the
# tick labels, even when a class is absent from both targets and predictions.
cm = confusion_matrix(y_test_true, y_test_pred, labels=list(range(len(class_names))))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_names, yticklabels=class_names)
# confusion_matrix puts true classes on rows and predicted classes on columns.
plt.xlabel("Classe prédite")
plt.ylabel("Classe réelle")
plt.show()
