<a href="https://colab.research.google.com/github/MedjialeuJordan/ceri-m1-techniques-de-test/blob/master/MLP%2BLSTMFinal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Préparation des données

In [None]:
from itertools import islice

train_path = "/content/protein-secondary-structure.train"
test_path = "/content/protein-secondary-structure.test"

# Preview the first 30 lines of each file. islice simply stops at end of file,
# so a file with fewer than 30 lines no longer raises StopIteration.
with open(train_path, 'r') as f:
    train_lines = list(islice(f, 30))
with open(test_path, 'r') as f:
    test_lines = list(islice(f, 30))

# Bare expression: displayed as the cell result in a notebook.
train_lines, test_lines

# Implémentation du parsing

In [None]:
# Implémentation du parsing

from typing import List, Tuple

def parse_protein_file(filepath: str) -> Tuple[List[str], List[str]]:
    """Parse a one-residue-per-line secondary-structure file.

    Blank lines and lines starting with ``#`` are ignored. A line equal to
    ``<>`` closes the protein currently being accumulated (if it has any
    residues) and starts a new one. Any other line is split on whitespace and
    used only when it has exactly two fields: the residue and its structure
    label. Lines with a different number of fields are skipped.

    A label is kept as-is when it is exactly one of ``h``, ``e``, ``c``
    (either case); every other label, e.g. ``_``, is stored as ``'c'``.

    Args:
        filepath: Path of the text file to read.

    Returns:
        ``(sequences, structures)``: two lists of equal length where
        ``sequences[k]`` holds the concatenated residues of protein ``k`` and
        ``structures[k]`` the concatenated labels, one label per parsed line.
    """
    # Exact single-character membership. The previous substring test
    # (`ss in "hecHEC"`) also accepted tokens such as "he" or "ec", which made
    # the structure string longer than the sequence string.
    known_labels = frozenset("hecHEC")

    sequences: List[str] = []
    structures: List[str] = []
    current_seq: List[str] = []
    current_struct: List[str] = []

    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            if line == '<>':
                # Protein separator: store what was accumulated, then reset.
                if current_seq and current_struct:
                    sequences.append(''.join(current_seq))
                    structures.append(''.join(current_struct))
                current_seq = []
                current_struct = []
                continue
            parts = line.split()
            if len(parts) != 2:
                continue
            aa, ss = parts
            current_seq.append(aa)
            current_struct.append(ss if ss in known_labels else 'c')

    # The last protein is not necessarily followed by a separator line.
    if current_seq and current_struct:
        sequences.append(''.join(current_seq))
        structures.append(''.join(current_struct))

    return sequences, structures

# Parse the training file and the test file
train_sequences, train_labels = parse_protein_file(train_path)
test_sequences, test_labels = parse_protein_file(test_path)

# Show the first five (sequence, structure) pairs of the training set
list(zip(train_sequences[:5], train_labels[:5]))






[('GVGTVPMTDYGNDVEYYGQVTIGTPGKSFNLNFDTGSSNLWVGSVQCQASGCKGGRDKFNPSDGSTFKATGYDASIGYGDGSASGVLGYDTVQVGGIDVTGGPQIQLAQRLGGGGFPGDNDGLLGLGFDTLSITPQSSTNAFDQVSAQGKVIQPVFVVYLAASNISDGDFTMPGWIDNKYGGTLLNTNIDAGEGYWALNVTGATADSTYLGAIFQAILDTGTSLLILPDEAAVGNLVGFAGAQDAALGGFVIACTSAGFKSIPWSIYSAIFEIITALGNAEDDSGCTSGIGASSLGEAILGDQFLKQQYVVFDRDNGIRLAPVA',
  'ccccccccccccccccccceecccccceecceeecccccceeccccccccccccccccccccccccccccccccccccccccceecccccccccccccccccccceeeeecccccccccccceeeccccccccccccccchhhhhhhccccccccccceeeccccceeecccccccccccccccccccccccccccceeccccccccccccccceecccccccceecccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccceechhhhccccceeeccceeeccccc'),
 ('AQCEATIESNDAMQYDLKEMVVDKSCKQFTVHLKHVGKMAKSVMGHNWVLTKEADKEGVATDGMNAGLAQDYVKAGDTRVIAHTKVIGGGESDSVTFDVSKLTPGEAYAYFCSFPGHWAMMKGTLKLSN',
  'ccceeeeeeccccccccceeeeecccceeeeeeeeccccchhhhcccceeeeccchhhhhhhhhccccccccccccccccceecccccccceeeeeeeccccccccceeeecccccccccceeeeeeec'),
 ('SVDIQGNDQMQFNTNAITVDKSCKQFTVNLSHPGNLPKNVMGHNWVLSTAADMQGVVTDGMASGL

# Création de la fenêtre glissante et encodage des labels


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from itertools import product

# Constants
WINDOW_SIZE = 13  # default number of residues per sliding window
AA_LIST = [*"ACDEFGHIKLMNPQRSTVWY"]  # amino-acid letters, in one-hot order
AA_TO_INDEX = dict(zip(AA_LIST, range(len(AA_LIST))))  # letter -> one-hot position
SS_TO_INDEX = dict(c=0, h=1, e=2)  # structure label -> class index
NUM_AA = len(AA_LIST)

# --- One-hot encoding ---
def one_hot_encode(aa: str) -> torch.Tensor:
    """Return the one-hot vector of length ``NUM_AA`` for amino acid ``aa``.

    The entry at ``AA_TO_INDEX[aa]`` is 1.0 and all others are 0.0. A symbol
    that is not in ``AA_TO_INDEX`` yields an all-zero vector.
    """
    encoding = torch.zeros(NUM_AA)
    position = AA_TO_INDEX.get(aa)
    if position is not None:
        encoding[position] = 1.0
    return encoding


def get_param_combinations(param_grid):
    """Yield one dict per point of the Cartesian product of ``param_grid``.

    ``param_grid`` maps each parameter name to an iterable of candidate
    values; each yielded dict maps every name to one chosen value.
    """
    names = tuple(param_grid.keys())
    for chosen in product(*param_grid.values()):
        yield {name: value for name, value in zip(names, chosen)}

# --- Custom dataset ---
class ProteinDataset(Dataset):
    """Sliding-window samples: one window centred on a residue, with its label.

    Each sample is ``(encoded, label)`` where ``encoded`` stacks the one-hot
    vector of every residue in the window (one row per residue) and ``label``
    is the ``SS_TO_INDEX`` class of the centre residue (0 when the label is
    not in ``SS_TO_INDEX``).

    Every sequence is padded with ``window_size // 2`` ``'X'`` characters on
    both sides, and any window containing an ``'X'`` is skipped. As a result
    the first and last ``window_size // 2`` residues of a sequence produce no
    sample, and neither does any window covering a literal ``'X'`` residue.

    Args:
        sequences: Iterable of residue strings.
        structures: Iterable of label strings, paired with ``sequences``.
        window_size: Requested window width. The actual window holds
            ``2 * (window_size // 2) + 1`` residues.
        mode: Stored on the instance; not used when building samples.
        embedding_dim: Stored on the instance; not used when building samples.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """

    def __init__(self, sequences, structures, window_size=13, mode='one_hot', embedding_dim=None):
        if window_size < 1:
            raise ValueError(f"window_size must be >= 1, got {window_size}")
        self.samples = []
        self.mode = mode
        self.embedding_dim = embedding_dim
        self.window_size = window_size
        # Derive the half-width from the argument. It used to come from the
        # module-level WINDOW_SIZE, which silently ignored `window_size`.
        pad = window_size // 2
        for seq, ss in zip(sequences, structures):
            padded_seq = 'X' * pad + seq + 'X' * pad
            padded_ss = 'c' * pad + ss + 'c' * pad
            for center in range(pad, len(padded_seq) - pad):
                window = padded_seq[center - pad:center + pad + 1]
                if 'X' in window:
                    # Window overlaps the padding (or an 'X' residue): skip.
                    continue
                encoded = torch.stack([one_hot_encode(aa) for aa in window])
                label = SS_TO_INDEX.get(padded_ss[center].lower(), 0)
                self.samples.append((encoded, label))

    def __len__(self):
        """Return the number of windows kept."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the ``(encoded, label)`` pair stored at position ``idx``."""
        return self.samples[idx]


# --- MLP ---
class MLP(nn.Module):
    """Perceptron with two hidden layers over a flattened residue window.

    Layout: Linear(window_size * num_aa, 128) -> BatchNorm -> ReLU -> Dropout
    -> Linear(128, 64) -> BatchNorm -> ReLU -> Dropout -> Linear(64, 3)
    -> log-softmax.

    Args:
        window_size: Number of rows (residues) in each input window.
            Defaults to the module-level ``WINDOW_SIZE``.
        num_aa: Number of features per residue. Defaults to the module-level
            ``NUM_AA``.
    """

    def __init__(self, window_size=None, num_aa=None):
        super().__init__()
        # Defaults are resolved here, so MLP() still builds the same network
        # as before the parameters existed.
        if window_size is None:
            window_size = WINDOW_SIZE
        if num_aa is None:
            num_aa = NUM_AA

        # First hidden layer: flattened window -> 128 units
        self.fc1 = nn.Linear(window_size * num_aa, 128)
        self.bn1 = nn.BatchNorm1d(128)
        self.drop1 = nn.Dropout(0.3)

        # Second hidden layer: 128 -> 64 units
        self.fc2 = nn.Linear(128, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.drop2 = nn.Dropout(0.3)

        # Output layer: one score per class
        self.fc3 = nn.Linear(64, 3)

    def forward(self, x):
        """Return per-class log-probabilities, shape ``(batch, 3)``.

        ``x`` is flattened to ``(batch, -1)`` first, so the flattened width
        must equal ``window_size * num_aa``.
        """
        x = x.view(x.size(0), -1)
        x = self.drop1(F.relu(self.bn1(self.fc1(x))))
        x = self.drop2(F.relu(self.bn2(self.fc2(x))))
        return F.log_softmax(self.fc3(x), dim=1)

# --- LSTM ---
class LSTM(nn.Module):
    """Single-layer LSTM classifier over a window of residue vectors.

    The final hidden state of the LSTM goes through batch normalisation and
    dropout, then a linear layer producing 3 class scores, returned as
    log-probabilities.

    Args:
        input_size: Number of features per time step. Defaults to the
            module-level ``NUM_AA``. Accepting it makes calls of the form
            ``LSTM(input_size)`` valid; they used to raise ``TypeError``.
    """

    def __init__(self, input_size=None):
        super().__init__()
        if input_size is None:
            input_size = NUM_AA
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=64, batch_first=True)
        self.bn = nn.BatchNorm1d(64)
        self.drop = nn.Dropout(0.3)
        self.fc = nn.Linear(64, 3)

    def forward(self, x):
        """Return per-class log-probabilities, shape ``(batch, 3)``.

        ``x`` is ``(batch, steps, input_size)`` because the LSTM is built
        with ``batch_first=True``.
        """
        _, (hn, _) = self.lstm(x)
        # hn[-1] is the final hidden state of the last (here only) layer.
        h = self.bn(hn[-1])
        h = self.drop(h)
        return F.log_softmax(self.fc(h), dim=1)

# Prochaine étape : entraînement et test des modèles



# Entraînement du modèle

In [None]:
# Use the GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = MLP()
model = model.to(device)
# Adam optimizer that updates the model's weights
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Model training
def train_model(model, train_loader, optimizer, device, num_epochs=9):
    """Train ``model`` in place for ``num_epochs`` passes over ``train_loader``.

    Args:
        model: Network to train. Its output is fed to ``F.nll_loss``, so it
            should return log-probabilities.
        train_loader: Iterable of ``(x, y)`` batches.
        optimizer: Optimizer bound to the model's parameters.
        device: Device the batches are moved to before the forward pass.
        num_epochs: Number of passes over ``train_loader``. The default of 9
            matches the previously hard-coded ``range(1, 10)``.
    """
    for epoch in range(1, num_epochs + 1):
        model.train()
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()  # reset gradients for every batch
            loss = F.nll_loss(model(x), y)  # compute the loss
            loss.backward()  # backpropagation
            optimizer.step()  # update the weights
        print(f"Epoch {epoch} terminée.")

# Test
def evaluate(model, test_loader, device):
    """Print and return the accuracy of ``model`` over ``test_loader``.

    Args:
        model: Network whose output has one score per class along dim 1.
        test_loader: Iterable of ``(x, y)`` batches.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Accuracy as a percentage in ``[0, 100]``; ``0.0`` when
        ``test_loader`` yields no sample.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in test_loader:
            x, y = x.to(device), y.to(device)
            pred = model(x).argmax(dim=1)
            correct += pred.eq(y).sum().item()
            # Count the samples actually seen rather than reading a global
            # `test_dataset`, which raised NameError when none was defined.
            total += y.size(0)

    accuracy = 100.0 * correct / total if total else 0.0
    print(f"Test accuracy: {accuracy:.2f}%")
    return accuracy


# Application des hyperparamètres

In [None]:
from itertools import product

def get_param_combinations(param_grid):
    """Generate every combination of the candidate values in ``param_grid``.

    ``param_grid`` maps a parameter name to an iterable of values. One dict
    (name -> chosen value) is yielded per element of the Cartesian product.
    """
    names = list(param_grid.keys())
    value_lists = list(param_grid.values())
    for chosen in product(*value_lists):
        yield dict(zip(names, chosen))

def _build_model(model_type, sample):
    """Instantiate the network for ``model_type``, sized for ``sample``.

    ``sample`` is one encoded window taken from the dataset.
    """
    if model_type == 'MLP':
        model = MLP()
        flat_features = sample.numel()
        if model.fc1.in_features != flat_features:
            # MLP() sizes its input layer for its default window; swap in a
            # matching layer when this configuration uses another width.
            model.fc1 = nn.Linear(flat_features, model.fc1.out_features)
        return model
    # Any other model_type selects the recurrent model. ProteinDataset emits
    # one-hot rows whatever the embedding mode, which is the width LSTM()
    # is built for.
    return LSTM()


def _accuracy_percent(model, loader, device):
    """Return the accuracy of ``model`` over ``loader`` as a percentage."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            correct += model(x).argmax(dim=1).eq(y).sum().item()
            total += y.size(0)
    return 100.0 * correct / total if total else 0.0


def grid_search(train_data, test_data, param_grid):
    """Train and score one model per hyper-parameter combination.

    Args:
        train_data: ``(sequences, structures)`` used for training.
        test_data: ``(sequences, structures)`` used for scoring.
        param_grid: Mapping of parameter name to candidate values. The keys
            read are ``window_size``, ``embedding_mode``, ``embedding_dim``,
            ``batch_size``, ``model_type``, ``lr``, ``weight_decay`` and
            ``sched``.

    Returns:
        A list of ``(params, accuracy_percent)`` tuples, one per combination,
        in the order the combinations were generated.

    Raises:
        ValueError: If a combination yields an empty training set.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    results = []

    for params in get_param_combinations(param_grid):
        print(f"\nTesting: {params}")
        train_dataset = ProteinDataset(*train_data, window_size=params['window_size'], mode=params['embedding_mode'], embedding_dim=params['embedding_dim'])
        test_dataset = ProteinDataset(*test_data, window_size=params['window_size'], mode=params['embedding_mode'], embedding_dim=params['embedding_dim'])
        if len(train_dataset) == 0:
            raise ValueError(f"No training window could be built for {params}")

        train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=128)

        # Size the model from a real sample so it matches whatever window
        # the dataset actually produced.
        model = _build_model(params['model_type'], train_dataset[0][0]).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'], weight_decay=params['weight_decay'])
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5) if params['sched'] else None

        # Five training rounds; each train_model call runs its own epoch loop.
        for _ in range(5):
            train_model(model, train_loader, optimizer, device)
            if scheduler is not None:
                # The scheduler was created but never stepped, so the
                # learning rate never decayed.
                scheduler.step()

        # Accuracy is computed locally so a numeric value is always available
        # for the report and the result list.
        acc = _accuracy_percent(model, test_loader, device)
        print(f"Accuracy: {acc:.2f}%")
        results.append((params, acc))

    return results

# Hyperparameter grid: every list holds the candidate values of one parameter
param_grid = dict(
    model_type=["MLP", "RNN"],
    embedding_mode=["one_hot"],
    sched=[True],
    window_size=[5, 13],
    embedding_dim=[8],
    batch_size=[4],
    lr=[1e-2],
    weight_decay=[0.1],
)

# Re-parse both files, then run the search and print its results
train_sequences, train_labels = parse_protein_file("/content/protein-secondary-structure.train")
test_sequences, test_labels = parse_protein_file("/content/protein-secondary-structure.test")
results = grid_search(
    (train_sequences, train_labels),
    (test_sequences, test_labels),
    param_grid,
)
print(results)


Testing: {'model_type': 'MLP', 'embedding_mode': 'one_hot', 'sched': True, 'window_size': 5, 'embedding_dim': 8, 'batch_size': 4, 'lr': 0.01, 'weight_decay': 0.1}
Epoch 1 terminée.
Epoch 2 terminée.
Epoch 3 terminée.
Epoch 4 terminée.
Epoch 5 terminée.
Epoch 6 terminée.
Epoch 7 terminée.
Epoch 8 terminée.
Epoch 9 terminée.
Epoch 1 terminée.
Epoch 2 terminée.
Epoch 3 terminée.
Epoch 4 terminée.
Epoch 5 terminée.
Epoch 6 terminée.
Epoch 7 terminée.
Epoch 8 terminée.
Epoch 9 terminée.
Epoch 1 terminée.
Epoch 2 terminée.
Epoch 3 terminée.
Epoch 4 terminée.
Epoch 5 terminée.
Epoch 6 terminée.
Epoch 7 terminée.
Epoch 8 terminée.
Epoch 9 terminée.
Epoch 1 terminée.
Epoch 2 terminée.
Epoch 3 terminée.
Epoch 4 terminée.
Epoch 5 terminée.
Epoch 6 terminée.
Epoch 7 terminée.
Epoch 8 terminée.
Epoch 9 terminée.
Epoch 1 terminée.
Epoch 2 terminée.
Epoch 3 terminée.
Epoch 4 terminée.
Epoch 5 terminée.
Epoch 6 terminée.
Epoch 7 terminée.
Epoch 8 terminée.
Epoch 9 terminée.


NameError: name 'test_dataset' is not defined