In [1]:
!python3.10 -m venv pytorch-env

In [3]:
!pytorch-env\Scripts\activate 

In [5]:
!pip install notebook ipykernel



In [7]:
#installations des librairies
!pip install torch --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118


In [9]:
!pip install pandas



In [11]:
!pip install sentencepiece



In [13]:
!pip install matplotlib



In [138]:
!pip install wandb
!wandb login



wandb: Currently logged in as: marius-casamian (marius-casamian-sophia-antipolis). Use `wandb login --relogin` to force relogin


In [15]:
#import des librairies pour le RNN

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import sentencepiece as spm
from torch.nn.utils.rnn import pad_sequence
import re
import matplotlib.pyplot as plt
import torch.optim as optim


In [16]:
#import des librairies pour le RNN  + Transformer
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import sentencepiece as spm
from torch.nn.utils.rnn import pad_sequence
import re
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import math
import wandb
import torch.optim as optim


In [19]:
#PARTIE 1 : CHAR-LEVEL

In [126]:
# Load the parallel corpus: only the "en" and "fr" columns, first 100,000 rows.
# NOTE(review): the path is relative, so it resolves against the kernel's current working directory.
dataset_path = "Desktop/Projet_Methodes_Apprentissages/inputs/en-fr.csv"  
df = pd.read_csv(dataset_path, usecols=["en", "fr"], nrows=100000)
# Print a preview of the first rows.
print(df.head())

                                                  en  \
0  Changing Lives | Changing Society | How It Wor...   
1                                           Site map   
2                                           Feedback   
3                                            Credits   
4                                           Français   

                                                  fr  
0  Il a transformé notre vie | Il a transformé la...  
1                                       Plan du site  
2                                        Rétroaction  
3                                            Crédits  
4                                            English  


#### Tokenizer Simple caractère par caractère

In [50]:
class SimpleTokenizer:
    """
    Character-level tokenizer: each character of a sentence is one token.

    IDs 0 and 1 are reserved for the "<bos>" and "<eos>" markers. Every other
    character receives the next free ID the first time `fit` encounters it.
    """
    def __init__(self):
        # Seed both lookup tables with the two special tokens.
        specials = ("<bos>", "<eos>")
        self.char2id = {token: index for index, token in enumerate(specials)}
        self.id2char = {index: token for index, token in enumerate(specials)}

    def fit(self, sentences):
        """
        Grow the vocabulary with every not-yet-seen character of `sentences`
        (an iterable of strings). IDs are assigned in order of first appearance.
        """
        for text in sentences:
            for symbol in text:
                if symbol in self.char2id:
                    continue
                new_id = len(self.char2id)
                self.char2id[symbol] = new_id
                self.id2char[new_id] = symbol

    def encode(self, sentence):
        """
        Turn a sentence into a list of IDs framed by <bos> and <eos>.

        Characters absent from the vocabulary are silently skipped.
        No case folding is applied.
        """
        known_ids = [self.char2id[symbol] for symbol in sentence if symbol in self.char2id]
        return [self.char2id["<bos>"], *known_ids, self.char2id["<eos>"]]

    def decode(self, ids):
        """
        Turn a list of IDs back into a string, dropping <bos>/<eos> markers.
        """
        specials = (self.char2id["<bos>"], self.char2id["<eos>"])
        return "".join(self.id2char[token_id] for token_id in ids if token_id not in specials)

    def vocab_size(self):
        """Number of entries in the vocabulary, special tokens included."""
        return len(self.char2id)

#### Dataset de paires (en - fr)

In [53]:
class SimpleTextPairDataset(Dataset):
    """
    Dataset of (source, target) sentence pairs.

    Each item is truncated to `max_len` characters, encoded with the given
    tokenizer and returned as a pair of 1-D long tensors.
    """
    def __init__(self, pairs, tokenizer, max_len=50):
        """
        pairs: sequence of (source_sentence, target_sentence) tuples
        tokenizer: object exposing `encode(str) -> list[int]`
        max_len: maximum number of characters kept from each sentence
        """
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        source_text, target_text = self.pairs[idx]

        # Truncate the raw strings first so that sequences stay short.
        clipped = (source_text[:self.max_len], target_text[:self.max_len])

        # Map each string to its list of token IDs.
        token_ids = [self.tokenizer.encode(text) for text in clipped]

        # Wrap the ID lists into long tensors.
        src_tensor, tgt_tensor = (torch.tensor(ids, dtype=torch.long) for ids in token_ids)
        return src_tensor, tgt_tensor

#### Seq2Seq RNN simple 

In [56]:
class Seq2SeqRNN(nn.Module):
    """
    Minimal RNN translation model.

    A single embedding table and a single `nn.RNN` are shared by the encoding
    pass (over the source) and the decoding pass (over the target); a linear
    layer projects the decoder outputs onto the vocabulary.
    """
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.hidden_dim = hidden_dim

        # Token embedding shared by source and target.
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Plain single-layer RNN working on [batch, seq, feature] tensors.
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)

        # Hidden state -> vocabulary logits.
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, src, tgt):
        """
        Run the encoder over `src`, then the decoder over `tgt` starting from
        the encoder's final hidden state (the caller provides the teacher-forced
        target input).

        Returns logits of shape [batch_size, tgt_len, vocab_size].
        """
        # Encoding pass: only the final hidden state is kept.
        _, context = self.rnn(self.embedding(src))

        # Decoding pass, initialised with the encoder state.
        decoded, _ = self.rnn(self.embedding(tgt), context)

        return self.fc(decoded)

#### Entraînement sur une époque

In [59]:
def train_one_epoch(model, dataloader, optimizer, device):
    """
    Train `model` for one pass over `dataloader` with teacher forcing.

    Args:
        model: module called as `model(src, tgt_input)` and returning logits
            of shape [batch, tgt_len - 1, vocab_size].
        dataloader: iterable of (src, tgt) batches of token IDs.
        optimizer: optimizer bound to the model parameters.
        device: device the batches are moved to.

    Returns:
        (avg_loss, top1_acc, top5_acc): mean loss per processed batch and
        token-level top-1 / top-5 accuracies. When the loader yields no batch,
        returns (nan, 0.0, 0.0) instead of raising ZeroDivisionError.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()

    total_loss = 0.0
    total_tokens = 0
    correct_top1 = 0
    correct_top5 = 0
    num_batches = 0

    for src, tgt in dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        # Teacher forcing: the decoder sees tokens [0 .. n-2] and must
        # predict tokens [1 .. n-1].
        tgt_input = tgt[:, :-1]
        tgt_target = tgt[:, 1:]

        logits = model(src, tgt_input)

        # Flatten to [batch * seq_len, vocab] / [batch * seq_len] for the loss.
        vocab_size = logits.size(-1)
        logits_2d = logits.reshape(-1, vocab_size)
        targets_1d = tgt_target.reshape(-1)

        loss = criterion(logits_2d, targets_1d)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        # Metrics only: no gradient tracking needed.
        with torch.no_grad():
            correct_top1 += (logits_2d.argmax(dim=1) == targets_1d).sum().item()

            # topk raises if k exceeds the vocabulary size, so clamp it.
            k = min(5, vocab_size)
            top_k_ids = logits_2d.topk(k, dim=1).indices
            hits = (top_k_ids == targets_1d.unsqueeze(1)).any(dim=1)
            correct_top5 += hits.sum().item()

        total_tokens += targets_1d.numel()

    if num_batches == 0 or total_tokens == 0:
        return float("nan"), 0.0, 0.0

    avg_loss = total_loss / num_batches
    top1_acc = correct_top1 / total_tokens
    top5_acc = correct_top5 / total_tokens

    return avg_loss, top1_acc, top5_acc

#### entraînement du modèle

In [62]:
def train_model(model, train_dataset, batch_size=2, epochs=5):
    """
    Train `model` on `train_dataset` for `epochs` epochs with Adam (lr=1e-3).

    The model is moved to CUDA when available, otherwise to CPU. The dataset is
    wrapped in a shuffling DataLoader with the default collate function, and
    one summary line (loss, top-1, top-5) is printed per epoch.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = model.to(device)

    loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    adam = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(1, epochs+1):
        epoch_metrics = train_one_epoch(model, loader, adam, device)
        avg_loss, top1, top5 = epoch_metrics
        print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, Top1: {top1:.4f}, Top5: {top5:.4f}")

#### Fonction de traduction que l'on modifiera plus tard 

In [65]:
def translate_sentence_debug(model, source_sentence, tokenizer, max_len=50, device="cuda"):
    """
    Translate one sentence by sampling, printing the probability of every
    generated token.

    Args:
        model: module exposing `embedding`, `rnn` and `fc`.
        source_sentence: text to translate.
        tokenizer: object exposing `encode`, `decode` and a `char2id` mapping
            that contains "<bos>" and "<eos>".
        max_len: maximum number of tokens to generate.
        device: device on which input tensors are created. If a CUDA device is
            requested but CUDA is unavailable, CPU is used instead of crashing.
            The model itself is not moved by this function.

    Returns:
        The decoded string of the sampled tokens (<eos> excluded).
    """
    model.eval()

    # The default is "cuda"; fall back to CPU on hosts without CUDA so the
    # function stays usable with a CPU model.
    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        device = torch.device("cpu")

    bos_id = tokenizer.char2id["<bos>"]
    eos_id = tokenizer.char2id["<eos>"]

    # Encode the source sentence as a batch of size 1.
    src_ids = tokenizer.encode(source_sentence)
    src_tensor = torch.tensor([src_ids], dtype=torch.long, device=device)

    generated_ids = []

    with torch.no_grad():
        # Encoder pass: keep only the final hidden state.
        _, hidden = model.rnn(model.embedding(src_tensor))

        # Decoding starts from <bos>.
        current_token = torch.tensor([[bos_id]], dtype=torch.long, device=device)

        for _ in range(max_len):
            output, hidden = model.rnn(model.embedding(current_token), hidden)
            logits = model.fc(output.squeeze(1))  # shape: [1, vocab_size]
            probs = torch.softmax(logits, dim=1)

            # Draw the next token at random, weighted by `probs` (not argmax).
            next_token_id = torch.multinomial(probs, num_samples=1).item()

            print(f"Token généré : {next_token_id}, Probabilité : {probs[0, next_token_id]:.4f}")

            if next_token_id == eos_id:
                break

            generated_ids.append(next_token_id)
            current_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)

    return tokenizer.decode(generated_ids)

#### Pipeline d'exécution

In [68]:
if __name__ == "__main__":
    # Character-level pipeline: build the vocabulary, train the simple RNN,
    # then sample one translation.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Appareil utilisé : {device}")
    
    english_sentences = df["en"].astype(str).tolist()
    french_sentences = df["fr"].astype(str).tolist()
    # Build the (english, french) pairs.
    pairs = list(zip(english_sentences, french_sentences))
    print(f"Nombre de paires chargées : {len(pairs)}")
    
    # Build the character-level tokenizer.
    tokenizer = SimpleTokenizer()
    # Printed before `fit`, so only the two special tokens are listed here.
    print(f"exemple voc généré: {tokenizer.char2id}")
    tokenizer.fit(english_sentences + french_sentences)
    print(f"Taille du vocabulaire : {tokenizer.vocab_size()}")
    
    # Keep pairs whose two sides are longer than 3 characters; the dataset
    # truncates each side to 10 characters.
    filtered_pairs = [(src, tgt) for src, tgt in pairs if len(src) > 3 and len(tgt) > 3]
    dataset = SimpleTextPairDataset(filtered_pairs, tokenizer, max_len=10)
    
    # Show one encoded pair.
    src, tgt = dataset[0]
    print("Source tensor:", src)
    print("Target tensor:", tgt)

    # Decode it back to text as a sanity check.
    src_decoded = tokenizer.decode(src.tolist())
    tgt_decoded = tokenizer.decode(tgt.tolist())
    print("Source (decoded):", src_decoded)
    print("Target (decoded):", tgt_decoded)

    vocab_size = tokenizer.vocab_size()
    model = Seq2SeqRNN(vocab_size=vocab_size, embed_dim=128, hidden_dim=256).to(device)

    print(model)
    
    # No train/test split is made here: the whole dataset is used for training.
    train_model(model, dataset, batch_size=1, epochs=10)
    
    # Quick test. The selected device is passed explicitly so the input tensors
    # live on the same device as the model (the function defaults to "cuda").
    test_en = "hello"
    translation = translate_sentence_debug(model, test_en, tokenizer, device=device)
    print(f"\nTraduction de '{test_en}' = '{translation}'")

Appareil utilisé : cuda
Nombre de paires chargées : 100000
exemple voc généré: {'<bos>': 0, '<eos>': 1}
Taille du vocabulaire : 203
Source tensor: tensor([0, 2, 3, 4, 5, 6, 7, 5, 6, 8, 9, 1])
Target tensor: tensor([ 0, 21, 26,  8,  4,  8, 17, 23,  4,  5, 12,  1])
Source (decoded): Changing L
Target (decoded): Il a trans
Seq2SeqRNN(
  (embedding): Embedding(203, 128)
  (rnn): RNN(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=203, bias=True)
)
Epoch 1, Loss: 1.7752, Top1: 0.5120, Top5: 0.7996
Epoch 2, Loss: 1.7926, Top1: 0.5076, Top5: 0.8005
Epoch 3, Loss: 1.9435, Top1: 0.4673, Top5: 0.7759
Epoch 4, Loss: 2.0949, Top1: 0.4226, Top5: 0.7525
Epoch 5, Loss: 2.1759, Top1: 0.3990, Top5: 0.7388
Epoch 6, Loss: 2.2226, Top1: 0.3872, Top5: 0.7311
Epoch 7, Loss: 2.2383, Top1: 0.3836, Top5: 0.7284
Epoch 8, Loss: 2.2842, Top1: 0.3682, Top5: 0.7217
Epoch 9, Loss: 2.2818, Top1: 0.3690, Top5: 0.7226
Epoch 10, Loss: 2.2934, Top1: 0.3643, Top5: 0.7211
Token généré : 9, Probabil

#### Partie 2 Tokenizers 

In [164]:

# PART 2: SENTENCEPIECE TOKENIZERS

# Draw a reproducible (random_state=42) 10,000-row sample of `df` and write each
# language column to its own text file, used below as tokenizer training input.
# NOTE(review): `to_csv` may apply CSV quoting to lines containing commas or
# quotes — confirm this is acceptable for the tokenizer training text.
subset_size = 10000
df_subset = df.sample(n=subset_size, random_state=42)  
df_subset["en"].to_csv("en_sub.txt", index=False, header=False)
df_subset["fr"].to_csv("fr_sub.txt", index=False, header=False)

# Train the English tokenizer (writes files under the 'spm_en' prefix).
spm.SentencePieceTrainer.train(
    input='en_sub.txt',
    model_prefix='spm_en',
    vocab_size=5000, 
    model_type='bpe', # BPE rather than the default model type; see the project report
    user_defined_symbols=['<pad>','<bos>', '<eos>']
)
# Train the French tokenizer with the same settings (prefix 'spm_fr').
spm.SentencePieceTrainer.train(
    input='fr_sub.txt',
    model_prefix='spm_fr',
    vocab_size=5000,
    model_type='bpe',
    user_defined_symbols=['<pad>','<bos>', '<eos>']
)

In [166]:
#Tokenizer SentencePiece

class SentencePieceTokenizer:
    """
    Thin wrapper around a trained SentencePiece model.

    Exposes the vocabulary size and the IDs of the "<pad>", "<bos>" and "<eos>"
    pieces, and frames every encoded sentence with <bos>/<eos>.
    """
    def __init__(self, model_path):
        processor = spm.SentencePieceProcessor(model_file=model_path)
        self.sp = processor
        self.vocab_size = processor.get_piece_size()

        # Look up the IDs of the special pieces.
        self.pad_id = processor.piece_to_id("<pad>")
        self.bos_id = processor.piece_to_id("<bos>")
        self.eos_id = processor.piece_to_id("<eos>")

    def encode(self, text):
        """
        Encode `text` into a list of IDs: <bos>, the SentencePiece IDs, <eos>.
        """
        ids = [self.bos_id]
        ids.extend(self.sp.encode(text, out_type=int))
        ids.append(self.eos_id)
        return ids

    def decode(self, tokens):
        """
        Decode a list of IDs into text after removing padding IDs.
        """
        kept = list(filter(lambda token: token != self.pad_id, tokens))
        return self.sp.decode(kept)

In [168]:
#Dataset avec SentencePiece et filtrage par longueur max

class SPTextPairDataset(Dataset):
    """
    Dataset of (english, french) pairs pre-tokenized with SentencePiece.

    At construction the characters |, ~ and # are stripped from both texts,
    both sides are encoded, and a pair is kept only when neither encoded side
    is longer than `max_len` (pairs are filtered out, not truncated).
    """
    def __init__(self, pairs, sp_en, sp_fr, max_len=80):
        self.sp_en = sp_en
        self.sp_fr = sp_fr
        self.max_len = max_len
        self.pairs = []

        noise = re.compile(r"[|~#]")
        for en_text, fr_text in pairs:
            # Strip the unwanted symbols from both sides before encoding.
            en_clean, fr_clean = (noise.sub("", text) for text in (en_text, fr_text))

            src_ids = sp_en.encode(en_clean)
            tgt_ids = sp_fr.encode(fr_clean)

            # Skip the pair when either side exceeds the length budget.
            if max(len(src_ids), len(tgt_ids)) > max_len:
                continue
            self.pairs.append((src_ids, tgt_ids))

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        src_ids, tgt_ids = self.pairs[idx]
        return (
            torch.tensor(src_ids, dtype=torch.long),
            torch.tensor(tgt_ids, dtype=torch.long),
        )


In [170]:
#Modèle Seq2Seq RNN un peu plus complexe avec plus de fonctionnalités
class Seq2SeqRNN(nn.Module):
    """
    Encoder/decoder RNN translation model with:
      - separate embedding tables for the source and target vocabularies
      - dropout applied to both embedded sequences
      - a 4-layer encoder RNN (ReLU nonlinearity) and a 4-layer decoder RNN
        (default nonlinearity), the decoder being initialised with the
        encoder's final hidden state
    """
    def __init__(self, src_vocab_size, tgt_vocab_size, embed_dim=512, hidden_dim=1024, dropout=0.2):
        super().__init__()
        self.hidden_dim = hidden_dim

        # One embedding table per language.
        self.embedding_src = nn.Embedding(src_vocab_size, embed_dim)
        self.embedding_tgt = nn.Embedding(tgt_vocab_size, embed_dim)

        self.dropout = nn.Dropout(p=dropout)

        # Settings common to both recurrent stacks.
        rnn_settings = dict(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=4,
            batch_first=True,
            dropout=dropout,
        )

        # The encoder uses ReLU; the decoder keeps nn.RNN's default nonlinearity.
        self.encoder = nn.RNN(nonlinearity='relu', **rnn_settings)
        self.decoder = nn.RNN(**rnn_settings)

        # Decoder outputs -> target-vocabulary logits.
        self.fc_out = nn.Linear(hidden_dim, tgt_vocab_size)

    def forward(self, src, tgt):
        """
        src: [batch_size, src_len] source token IDs
        tgt: [batch_size, tgt_len] teacher-forced target input IDs

        Returns logits of shape [batch_size, tgt_len, tgt_vocab_size].
        """
        # Encoder: embed, apply dropout, keep the final hidden state
        # ([num_layers, batch_size, hidden_dim]).
        src_vectors = self.dropout(self.embedding_src(src))
        _, context = self.encoder(src_vectors)

        # Decoder (teacher forcing), started from the encoder state.
        tgt_vectors = self.dropout(self.embedding_tgt(tgt))
        decoded, _ = self.decoder(tgt_vectors, context)

        return self.fc_out(decoded)


In [172]:

# 4) Fonction collate_fn pour le padding

def my_collate_fn(batch):
    """
    Collate a list of variable-length (src, tgt) tensor pairs into two padded
    tensors of shape [batch_size, max_seq_len].

    Padding uses the pad IDs of the module-level `sp_en` / `sp_fr` tokenizers,
    which must therefore exist before a DataLoader calls this function.
    """
    sources = [src for src, _ in batch]
    targets = [tgt for _, tgt in batch]

    return (
        pad_sequence(sources, batch_first=True, padding_value=sp_en.pad_id),
        pad_sequence(targets, batch_first=True, padding_value=sp_fr.pad_id),
    )

In [174]:

# 5) Boucle d'entraînement
def train_one_epoch(model, dataloader, optimizer, device, pad_id):
    """
    Train `model` for one pass over padded batches, with teacher forcing and
    gradient clipping (max norm 1.0).

    Args:
        model: module called as `model(src, tgt_input)` and returning logits
            of shape [batch, tgt_len - 1, vocab_size].
        dataloader: iterable of padded (src, tgt) batches of token IDs.
        optimizer: optimizer bound to the model parameters.
        device: device the batches are moved to.
        pad_id: target padding ID; ignored by the loss AND by the accuracies.

    Returns:
        (avg_loss, top1_acc, top5_acc). The loss is averaged over the batches
        actually processed; accuracies are computed over non-padding target
        tokens only. Returns (nan, 0.0, 0.0) if no batch was processed.

    Raises:
        AssertionError: if the model produces non-finite logits.
    """
    model.train()
    criterion = nn.CrossEntropyLoss(ignore_index=pad_id)

    total_loss = 0.0
    total_tokens = 0
    correct_top1 = 0
    correct_top5 = 0
    batches_done = 0

    for src, tgt in dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        # Shift for teacher forcing: input drops the last token, target the first.
        tgt_input = tgt[:, :-1]
        tgt_target = tgt[:, 1:]

        logits = model(src, tgt_input)  # [B, seq_len, vocab_size]

        # Flatten for the cross entropy.
        vocab_size = logits.size(-1)
        logits_2d = logits.reshape(-1, vocab_size)   # [B*seq_len, vocab_size]
        targets_1d = tgt_target.reshape(-1)          # [B*seq_len]

        # Only the logits can be non-finite: src / tgt are integer ID tensors,
        # for which an isfinite check can never fail.
        assert torch.isfinite(logits).all(), "NaN ou inf dans les logits"

        loss = criterion(logits_2d, targets_1d)

        if not torch.isfinite(loss):
            print("NaN détectée dans la perte. Vérifie les entrées et les logits.")
            print(f"src: {src}")
            print(f"tgt_input: {tgt_input}")
            print(f"logits: {logits}")
            break

        optimizer.zero_grad()
        loss.backward()

        # Gradient clipping.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        batches_done += 1

        # Metrics: mask out padding so it neither counts as a correct
        # prediction nor inflates the denominator.
        with torch.no_grad():
            is_real = targets_1d != pad_id

            top1_hits = (logits_2d.argmax(dim=1) == targets_1d) & is_real
            correct_top1 += top1_hits.sum().item()

            # topk raises if k exceeds the vocabulary size, so clamp it.
            k = min(5, vocab_size)
            top_k_ids = logits_2d.topk(k, dim=1).indices
            top5_hits = (top_k_ids == targets_1d.unsqueeze(1)).any(dim=1) & is_real
            correct_top5 += top5_hits.sum().item()

            total_tokens += int(is_real.sum().item())

    if batches_done == 0:
        return float("nan"), 0.0, 0.0

    avg_loss = total_loss / batches_done
    top1_acc = correct_top1 / total_tokens if total_tokens else 0.0
    top5_acc = correct_top5 / total_tokens if total_tokens else 0.0

    return avg_loss, top1_acc, top5_acc

def train_model(model, train_dataset, batch_size=16, epochs=30):
    """
    Train `model` on `train_dataset` for `epochs` epochs with Adam (lr=1e-4).

    The model is moved to CUDA when available, otherwise to CPU. Batches are
    shuffled and padded by `my_collate_fn`; the padding ID given to the loss is
    read from the module-level `sp_fr` tokenizer. One summary line is printed
    per epoch.
    """
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model.to(device)

    loader_options = dict(batch_size=batch_size, shuffle=True, collate_fn=my_collate_fn)
    loader = DataLoader(train_dataset, **loader_options)

    adam = torch.optim.Adam(model.parameters(), lr=1e-4)

    for epoch in range(1, epochs + 1):
        epoch_metrics = train_one_epoch(model, loader, adam, device, sp_fr.pad_id)
        avg_loss, top1, top5 = epoch_metrics
        print(f"Epoch {epoch}/{epochs} - Loss: {avg_loss:.4f} - Top1: {top1:.4f} - Top5: {top5:.4f}")

In [176]:

# Fonction de traduction (décodage)
def translate_sentence_debug(model, source_sentence, sp_en, sp_fr, max_len=80, device="cuda"):
    """
    English -> French translation by sampling:
      - encode the source sentence with `sp_en`
      - generate target tokens one at a time, sampling from the softmax
      - decode the generated IDs with `sp_fr`

    Args:
        model: module exposing `embedding_src`, `embedding_tgt`, `dropout`,
            `encoder`, `decoder` and `fc_out`.
        source_sentence: text to translate.
        sp_en: source tokenizer exposing `encode`.
        sp_fr: target tokenizer exposing `bos_id`, `eos_id` and `decode`.
        max_len: maximum number of tokens to generate.
        device: device the model and tensors are placed on. If a CUDA device is
            requested but CUDA is unavailable, CPU is used instead of crashing.

    Returns:
        The decoded translation (generation stops at <eos>, which is excluded).
    """
    # The default is "cuda"; fall back to CPU on hosts without CUDA.
    device = torch.device(device)
    if device.type == "cuda" and not torch.cuda.is_available():
        device = torch.device("cpu")

    model.eval()
    model.to(device)

    bos_id = sp_fr.bos_id
    eos_id = sp_fr.eos_id

    # Encode the source sentence as a batch of size 1.
    src_ids = sp_en.encode(source_sentence)
    src_tensor = torch.tensor([src_ids], dtype=torch.long, device=device)

    generated_ids = []

    with torch.no_grad():
        # Encoder. Dropout is a no-op here because the model is in eval mode.
        emb_src = model.dropout(model.embedding_src(src_tensor))
        _, hidden = model.encoder(emb_src)  # [num_layers, 1, hidden_dim]

        # The decoder starts from <bos>.
        current_token = torch.tensor([[bos_id]], dtype=torch.long, device=device)

        for _ in range(max_len):
            emb_tgt = model.dropout(model.embedding_tgt(current_token))
            output, hidden = model.decoder(emb_tgt, hidden)  # output: [1, 1, hidden_dim]
            logits = model.fc_out(output.squeeze(1))         # [1, vocab_size]

            # Sample the next token from the predicted distribution.
            probs = torch.softmax(logits, dim=1)
            next_token_id = torch.multinomial(probs, num_samples=1).item()

            if next_token_id == eos_id:
                break
            generated_ids.append(next_token_id)

            current_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)

    # Decode the generated IDs back to text.
    translation = sp_fr.decode(generated_ids)
    return translation

In [182]:
def clean_and_filter_dataset(df, tokenizer_en, tokenizer_fr, max_len=80, min_len=3, max_ratio=1.5):
    """
    Clean an English/French DataFrame and drop unusable sentence pairs.

    Steps, in order: drop rows with a missing side, drop rows with a blank
    side, drop exact duplicate pairs, clean the text (remove |, ~, #, HTML-like
    tags and repeated whitespace), keep pairs whose encoded lengths are both
    within [min_len, max_len], keep pairs whose length ratio is at most
    `max_ratio`, and finally reset the index.

    The input DataFrame is left untouched; a new DataFrame is returned.

    Args:
        df: DataFrame with string columns "en" and "fr".
        tokenizer_en, tokenizer_fr: objects exposing `encode(str)` and
            returning a sized sequence; lengths are measured on that output.
        max_len, min_len: inclusive bounds on the encoded length of each side.
        max_ratio: maximum allowed ratio between the longer and shorter side.

    Returns:
        The cleaned and filtered DataFrame with a fresh 0..n-1 index.
    """
    def clean_text(text):
        text = re.sub(r"[|~#]", "", text)               # remove some symbols
        text = re.sub(r"<[^>]+>", "", text)             # remove HTML tags
        text = re.sub(r"\s+", " ", text)                # collapse whitespace runs
        return text.strip()

    # Drop missing values without mutating the caller's frame.
    frame = df.dropna(subset=["en", "fr"])

    # Drop rows where either side is blank, then exact duplicate pairs.
    non_blank = (frame["en"].str.strip() != "") & (frame["fr"].str.strip() != "")
    frame = frame[non_blank].drop_duplicates(subset=["en", "fr"])

    # Clean the text; `assign` returns a new frame, so no slice is modified.
    frame = frame.assign(
        en=frame["en"].apply(clean_text),
        fr=frame["fr"].apply(clean_text),
    )

    # Tokenize each row once and reuse the lengths for both filters.
    en_len = pd.Series(
        [len(tokenizer_en.encode(text)) for text in frame["en"]],
        index=frame.index, dtype="int64",
    )
    fr_len = pd.Series(
        [len(tokenizer_fr.encode(text)) for text in frame["fr"]],
        index=frame.index, dtype="int64",
    )

    # Length filter: both sides within [min_len, max_len].
    length_ok = en_len.between(min_len, max_len) & fr_len.between(min_len, max_len)

    # Balance filter: longer / shorter must not exceed max_ratio. A zero length
    # yields inf or NaN, both of which fail the comparison and drop the row.
    ratio = pd.concat([en_len / fr_len, fr_len / en_len], axis=1).max(axis=1)
    balanced = ratio <= max_ratio

    return frame[length_ok & balanced].reset_index(drop=True)

In [None]:

# 7) Main script: clean the corpus, build the SentencePiece dataset, train the
#    encoder/decoder RNN and sample one translation.
if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[INFO] Appareil utilisé : {device}")

    dataset_path = "Desktop/Projet_Methodes_Apprentissages/inputs/en-fr.csv"

    # Load the raw data (first 100,000 rows, "en" and "fr" columns only).
    df_init = pd.read_csv(dataset_path, usecols=["en", "fr"], nrows=100000)
    print(f"[INFO] Dataset initial chargé avec {len(df_init)} lignes.")

    # Clean and filter the data; the tokenizers are needed for the length filters.
    sp_en = SentencePieceTokenizer("spm_en.model")
    sp_fr = SentencePieceTokenizer("spm_fr.model")
    cleaned_df = clean_and_filter_dataset(df_init, tokenizer_en=sp_en, tokenizer_fr=sp_fr, max_len=80)
    cleaned_df.to_csv("cleaned_dataset.csv", index=False)
    print(f"[INFO] Dataset nettoyé sauvegardé dans 'cleaned_dataset.csv'.")
    print(f"[INFO] Nombre de lignes après nettoyage : {len(cleaned_df)}")
    
    # Reload the cleaned dataset from disk (this rebinds the notebook-level `df`).
    df = pd.read_csv("cleaned_dataset.csv", usecols=["en", "fr"])
    print(f"[INFO] Dataset nettoyé chargé avec {len(df)} lignes.")
    print(f"[INFO] Premières lignes : {df.head()}")

    # Build the (english, french) pairs, keeping only rows where both sides
    # are strings longer than 3 characters.
    pairs = []
    for en_text, fr_text in zip(df["en"], df["fr"]):
        if isinstance(en_text, str) and isinstance(fr_text, str) and len(en_text) > 3 and len(fr_text) > 3:
            pairs.append((en_text, fr_text))

    print(f"[INFO] Nombre de paires générées : {len(pairs)}")
    print("[INFO] Exemple d'une paire (anglais -> français) :")
    print("  ", pairs[0])

    # Load the tokenizers (same model files as above, loaded a second time).
    # `my_collate_fn` and `train_model` read these module-level names.
    sp_en = SentencePieceTokenizer("spm_en.model")
    sp_fr = SentencePieceTokenizer("spm_fr.model")
    print(f"[INFO] Vocab anglais : {sp_en.vocab_size} tokens.")
    print(f"[INFO] Vocab français : {sp_fr.vocab_size} tokens.")
    print("[DEBUG] Vérification du token <pad> ID :")
    print(f"PAD_ID anglais : {sp_en.pad_id}")
    print(f"PAD_ID français : {sp_fr.pad_id}")


    # Build the dataset; pairs with an encoded side longer than max_len are dropped.
    sp_dataset = SPTextPairDataset(pairs, sp_en, sp_fr, max_len=80)
    print(f"[INFO] Nombre de paires après filtrage (max_len=80) : {len(sp_dataset)}")

    # Instantiate the Seq2Seq RNN model.
    model = Seq2SeqRNN(
        src_vocab_size=sp_en.vocab_size, 
        tgt_vocab_size=sp_fr.vocab_size,
        embed_dim=512,   
        hidden_dim=1024,  
        dropout=0.2      
    )
    print("[INFO] Modèle Seq2SeqRNN instancié :")
    print(model)

    # Train the model.
    print("[INFO] Démarrage de l'entraînement...")
    train_model(model, sp_dataset, batch_size=16, epochs=30)
    print("[INFO] Entraînement terminé.")

    # Test the model on one sentence.
    test_en = "Hello world, how are you?"
    translation = translate_sentence_debug(model, test_en, sp_en, sp_fr, max_len=80, device=device)
    print(f"\n[INFO] Traduction de '{test_en}' = '{translation}'")


[INFO] Appareil utilisé : cuda
[INFO] Dataset initial chargé avec 100000 lignes.
[INFO] Dataset nettoyé sauvegardé dans 'cleaned_dataset.csv'.
[INFO] Nombre de lignes après nettoyage : 77793
[INFO] Dataset nettoyé chargé avec 77793 lignes.
[INFO] Premières lignes :                                                   en  \
0  Changing Lives Changing Society How It Works T...   
1                                           Site map   
2                                           Feedback   
3                                            Credits   
4                                           Français   

                                                  fr  
0  Il a transformé notre vie Il a transformé la s...  
1                                       Plan du site  
2                                        Rétroaction  
3                                            Crédits  
4                                            English  
[INFO] Nombre de paires générées : 77295
[INFO] Exemple d'une paire

#### Transformer

In [144]:

#TRANSFORMER

class PositionalEncoding(nn.Module):
    """
    Adds fixed sinusoidal position information to a batch-first sequence of
    embeddings, then applies dropout.

    The table is stored as the non-trainable buffer `pe` of shape
    [1, max_len, d_model]: even feature indices hold sines, odd ones cosines.
    """
    def __init__(self, d_model: int, dropout: float, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Angle for every (position, even feature index) combination.
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        even_dims = torch.arange(0, d_model, 2).float()
        inv_freq = torch.exp(even_dims * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Leading batch axis so the table broadcasts over the batch.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Add the first x.size(1) rows of the table to `x`, then apply dropout."""
        seq_len = x.size(1)
        return self.dropout(x + self.pe[:, :seq_len])

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is scaled by sqrt(emb_size)."""
    def __init__(self, vocab_size: int, emb_size: int):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: torch.Tensor):
        # IDs are cast to long before the lookup, so any integer dtype is accepted.
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

class Seq2SeqTransformer(nn.Module):
    """
    Translation model built on `nn.Transformer` (batch-first).

    Source and target tokens go through their own scaled embedding followed by
    a shared positional encoding; the transformer output is projected onto the
    target vocabulary by `generator`.
    """
    def __init__(self, num_encoder_layers, num_decoder_layers, emb_size, nhead, src_vocab_size, tgt_vocab_size, dim_feedforward=2048, dropout=0.2):
        super().__init__()
        transformer_settings = dict(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.transformer = nn.Transformer(**transformer_settings)

        # Transformer features -> target-vocabulary logits.
        self.generator = nn.Linear(emb_size, tgt_vocab_size)

        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout)

    def forward(self, src, tgt, src_mask, tgt_mask, src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        """
        Embed `src` and `tgt`, run the transformer with the given attention and
        padding masks (no memory mask), and return the vocabulary logits.
        """
        encoder_input = self.positional_encoding(self.src_tok_emb(src))
        decoder_input = self.positional_encoding(self.tgt_tok_emb(tgt))

        features = self.transformer(
            encoder_input,
            decoder_input,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(features)

def generate_square_subsequent_mask(sz):
    """
    Build the [sz, sz] boolean causal mask: entry (i, j) is True when j > i
    (a later position) and False on and below the diagonal.

    The tensor is created on the module-level `device`.
    """
    # Strictly-upper-triangular ones, converted to booleans.
    ones = torch.ones((sz, sz), device=device)
    return torch.triu(ones, diagonal=1).type(torch.bool)

def create_mask(src, tgt, device):
    """
    Build the four boolean masks used by the transformer for one batch.

    Returns (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask):
      - src_mask: all-False [src_len, src_len] tensor
      - tgt_mask: causal mask from `generate_square_subsequent_mask`
      - src_padding_mask / tgt_padding_mask: True where the token equals the
        pad ID of the module-level `sp_en` / `sp_fr` tokenizer
    """
    src_len, tgt_len = src.shape[1], tgt.shape[1]

    causal_mask = generate_square_subsequent_mask(tgt_len).type(torch.bool)

    src_is_pad = (src == sp_en.pad_id).to(torch.bool)
    tgt_is_pad = (tgt == sp_fr.pad_id).to(torch.bool)

    # No restriction on source self-attention: an all-False mask.
    open_mask = torch.zeros((src_len, src_len), device=device).type(torch.bool)

    return open_mask, causal_mask, src_is_pad, tgt_is_pad



#### wandb pour le suivi des runs

In [146]:
# Initialise the Weights & Biases run used to track the transformer training.
# The `config` dict records the hyperparameters of the run.
# NOTE(review): the visible transformer main cell passes literal values
# (batch_size=32, 6 encoder/decoder layers, emb_size=512) instead of reading
# them from `config`; keep both in sync or read from `config` there.
wandb.init(
    project="transformer-pytorch-traduction",
    entity="marius-casamian-sophia-antipolis",
    config={
        "batch_size": 32,
        "epochs":  50,
        "learning_rate": 0.0001,
        "embedding_size": 512,
        "num_encoder_layers": 6,
        "num_decoder_layers": 6,
    }
)
# Handle on the run configuration created above.
config = wandb.config

wandb: Currently logged in as: marius-casamian (marius-casamian-sophia-antipolis). Use `wandb login --relogin` to force relogin
wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

In [148]:
#ENTRAÎNEMENT & ÉVALUATION

def train_epoch_transformer(model, optimizer, loss_fn, dataloader, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean batch loss.

    Each batch is a ``(src, tgt)`` pair of token-id tensors whose dimension 1
    is the sequence axis. The decoder is fed ``tgt`` without its last token
    and is trained to predict ``tgt`` without its first token (teacher forcing).

    Args:
        model: Module called as ``model(src, tgt_input, src_mask, tgt_mask,
            src_padding_mask, tgt_padding_mask, memory_key_padding_mask)``.
        optimizer: Optimizer stepping the model parameters.
        loss_fn: Callable taking flattened logits ``(N, vocab)`` and flattened
            target ids ``(N,)`` and returning a scalar loss tensor.
        dataloader: Iterable of ``(src, tgt)`` batches. It does not need to
            support ``len()``.
        device: Device to which every batch is moved.

    Returns:
        The average of the per-batch losses as a Python float, or ``nan`` when
        the dataloader yields no batch (the mean is undefined in that case).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0

    for src, tgt in dataloader:
        src = src.to(device)
        tgt = tgt.to(device)

        # Shift by one position: inputs drop the last token, labels drop the first.
        tgt_input = tgt[:, :-1]
        tgt_out = tgt[:, 1:]

        # create_mask already returns boolean masks, so no further cast is needed.
        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input, device)

        optimizer.zero_grad()
        logits = model(
            src,
            tgt_input,
            src_mask,
            tgt_mask,
            src_padding_mask,
            tgt_padding_mask,
            src_padding_mask,  # encoder memory shares the source padding mask
        )

        # reshape (unlike view) also handles the non-contiguous slice tgt_out.
        loss = loss_fn(logits.reshape(-1, logits.size(-1)), tgt_out.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Avoid ZeroDivisionError on an empty loader; NaN flags "no data".
        return float("nan")
    return total_loss / num_batches


In [150]:

# SCRIPT PRINCIPAL

if __name__ == "__main__":
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Read hyperparameters from the active wandb run so the values logged in
    # the run config and the values actually used for training stay in sync.
    run_config = wandb.config

    # Load the cleaned parallel corpus as (english, french) pairs.
    df = pd.read_csv("cleaned_dataset.csv")
    # zip over the two columns avoids the per-row Series creation of iterrows().
    pairs = list(zip(df["en"], df["fr"]))

    sp_en = SentencePieceTokenizer("spm_en.model")
    sp_fr = SentencePieceTokenizer("spm_fr.model")

    dataset = SPTextPairDataset(pairs, sp_en, sp_fr, max_len=50)
    dataloader = DataLoader(
        dataset,
        batch_size=run_config.batch_size,
        collate_fn=my_collate_fn,
        shuffle=True,
    )

    # Instantiate the Transformer model.
    transformer = Seq2SeqTransformer(
        num_encoder_layers=run_config.num_encoder_layers,
        num_decoder_layers=run_config.num_decoder_layers,
        emb_size=run_config.embedding_size,
        nhead=8,
        src_vocab_size=sp_en.vocab_size,
        tgt_vocab_size=sp_fr.vocab_size
    ).to(device)

    total_params = sum(p.numel() for p in transformer.parameters())
    print(f"{total_params:,} total parameters.")
    total_trainable_params = sum(
        p.numel() for p in transformer.parameters() if p.requires_grad)
    print(f"{total_trainable_params:,} training parameters.")
    print(transformer)

    # Padding positions of the target are excluded from the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=sp_fr.pad_id)
    optimizer = torch.optim.Adam(
        transformer.parameters(),
        lr=run_config.learning_rate,
        betas=(0.9, 0.98),
        eps=1e-9,
    )

    # Train the Transformer model.
    for epoch in range(run_config.epochs):
        train_loss = train_epoch_transformer(transformer, optimizer, loss_fn, dataloader, device)
        wandb.log({
            "epoch": epoch,
            "train_loss": train_loss,
        })

        print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}")

    # Upload the file that was actually written: the previous code saved
    # 'model.pth' but asked wandb to upload 'model_weights.pth', which was
    # never created.
    model_path = 'model.pth'
    torch.save(transformer, model_path)
    wandb.save(model_path)

71,806,544 total parameters.
71,806,544 training parameters.
Seq2SeqTransformer(
  (transformer): Transformer(
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0-5): 6 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
          )
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (dropout): Dropout(p=0.2, inplace=False)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.2, inplace=False)
          (dropout2): Dropout(p=0.2, inplace=False)
        )
      )
      (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    )
    (decoder): TransformerDecoder(
      (layers): ModuleList(
        (0-5): 6 x 

#### top-p sampling

In [303]:


# top-p sampling

def top_p_sampling(logits: torch.Tensor, p: float, temperature: float) -> int:
    """Sample one token id from a 1-D logits vector using top-p (nucleus) sampling.

    Steps:
      1) Scale the logits by ``1 / temperature``.
      2) Apply softmax to obtain a probability distribution.
      3) Sort tokens by decreasing probability.
      4) Keep the shortest prefix whose cumulative probability reaches ``p``
         (the token that crosses the threshold is included).
      5) Renormalize the kept probabilities and sample from them.

    Args:
        logits: 1-D tensor of unnormalized scores, one per vocabulary entry.
        p: Cumulative-probability threshold. Values ``<= 0`` keep only the
            most likely token; values ``>= 1`` keep the whole vocabulary.
        temperature: Softmax temperature. ``0`` selects the arg-max token
            (greedy decoding) instead of dividing by zero.

    Returns:
        The sampled token id as a Python ``int``.

    Raises:
        ValueError: If ``temperature`` is negative.
    """
    if temperature < 0:
        raise ValueError(f"temperature must be non-negative, got {temperature}")
    if temperature == 0:
        # Dividing by zero would produce inf/nan logits and make
        # torch.multinomial fail; the temperature -> 0 limit is greedy decoding.
        return int(torch.argmax(logits, dim=-1).item())

    # Temperature scaling followed by normalization.
    probs = F.softmax(logits / temperature, dim=-1)

    # Sort in decreasing order and accumulate the probability mass.
    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
    cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

    # First sorted position at which the cumulative mass is >= p.
    cutoff_idx = int(torch.searchsorted(cumulative_probs, p).item())
    nucleus_size = cutoff_idx + 1  # slicing clamps this to the vocabulary size
    nucleus_probs = sorted_probs[:nucleus_size]
    nucleus_indices = sorted_indices[:nucleus_size]

    # Renormalize over the nucleus and draw one sample.
    nucleus_probs = nucleus_probs / nucleus_probs.sum()
    sampled_idx = torch.multinomial(nucleus_probs, 1).item()
    return int(nucleus_indices[sampled_idx].item())


# Generate a translation with top-p sampling

def generate_translation_top_p(
    model: nn.Module,
    src_sentence: str,
    tokenizer_src: SentencePieceTokenizer,
    tokenizer_tgt: SentencePieceTokenizer,
    device: torch.device,
    p: float = 0.85,
    max_len: int = 50,
    temperature: float = 1.0
) -> str:
    """Translate ``src_sentence`` by decoding token by token with top-p sampling.

    Procedure:
      1) Encode the source sentence and wrap it with the source tokenizer's
         ``bos_id`` / ``eos_id``.
      2) Start the target sequence with the target tokenizer's ``bos_id``.
      3) At each step, run the full model on the source and the tokens
         generated so far, and sample the next token from the logits of the
         last position with ``top_p_sampling``.
      4) Stop when the target ``eos_id`` is sampled or after ``max_len`` steps.

    The model is switched to evaluation mode and gradients are disabled
    during decoding.

    Args:
        model: Sequence-to-sequence model called with source ids, target ids
            and the four masks from ``create_mask`` (source padding mask is
            reused as memory key-padding mask).
        src_sentence: Sentence to translate.
        tokenizer_src: Tokenizer for the source language.
        tokenizer_tgt: Tokenizer for the target language.
        device: Device on which the input tensors are created.
        p: Nucleus threshold forwarded to ``top_p_sampling``.
        max_len: Maximum number of decoding steps.
        temperature: Sampling temperature forwarded to ``top_p_sampling``.

    Returns:
        The decoded target string (without the leading ``bos`` token).
    """
    model.eval()

    # Source side: a batch of one sequence, <bos> ... <eos>.
    encoded_source = tokenizer_src.encode(src_sentence)
    src_tensor = torch.tensor(
        [[tokenizer_src.bos_id, *encoded_source, tokenizer_src.eos_id]],
        device=device,
    )

    # Target side starts with <bos> and grows by one token per step.
    output_ids = [tokenizer_tgt.bos_id]

    with torch.no_grad():
        for _step in range(max_len):
            decoder_input = torch.tensor([output_ids], dtype=torch.long, device=device)

            # (src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)
            masks = create_mask(src_tensor, decoder_input, device)
            memory_padding_mask = masks[2]  # memory uses the source padding mask

            step_logits = model(src_tensor, decoder_input, *masks, memory_padding_mask)

            # Only the distribution at the last target position matters.
            candidate = top_p_sampling(step_logits[0, -1, :], p=p, temperature=temperature)

            if candidate == tokenizer_tgt.eos_id:
                break
            output_ids.append(candidate)

    # Drop the leading <bos> before decoding back to text.
    return tokenizer_tgt.decode(output_ids[1:])


In [311]:
#Exemple de test

if __name__ == "__main__":
    # A few short English inputs to eyeball the sampled translations.
    test_sentences = [
        "Change the world",
        "Someone who live in my country",
        "Near the sea",
        "World is difficult",
    ]

    print("[INFO] Tests de traduction (top-p sampling) :")
    for sentence in test_sentences:
        translation = generate_translation_top_p(
            transformer, sentence, sp_en, sp_fr, device,
            p=0.7, max_len=50, temperature=1.0,
        )
        # One print call emits the same three lines as before.
        print(
            f"Anglais : {sentence}",
            f"Français (top-p) : {translation}",
            "-" * 50,
            sep="\n",
        )

[INFO] Tests de traduction (top-p sampling) :
Anglais : Change the world
Français (top-p) : Pôle
--------------------------------------------------
Anglais : Someone who live in my country
Français (top-p) : Les noms d’information
--------------------------------------------------
Anglais : Near the sea
Français (top-p) : la population
--------------------------------------------------
Anglais : World is difficult
Français (top-p) : L'Inde est le deuxième pays importateur
--------------------------------------------------
