In [205]:
import torch
import torchvision
import matplotlib.pyplot as plt
from torch import nn
import math

# Création de la classe Transformers

In [208]:
# Fonction pour le débuggage vérifie qu'il n'y a pas de nan dans les tenseurs
def check_for_nans(tensor, tensor_name):
    if torch.isnan(tensor).any():
        print(f"NaN detected in {tensor_name}")

Création des différentes couches

In [211]:
class FeedForwardNetwork(nn.Module):
    
    def __init__(self, d_model, d_ff):
        super(FeedForwardNetwork, self).__init__()
        
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [213]:
def create_mask(x):
    len = x.size(0)
    mask = torch.triu(torch.ones(len,len), diagonal = 1) * (-1e9) # Matrice triangulaire supérieur de valeur -inf
    return mask

In [215]:
class PositionalEncoding(nn.Module):

    def __init__(self, max_length, d_model):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_length, d_model)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]
        

In [217]:
class Encoder(nn.Module):

    def __init__(self, d_model, n_heads, d_ff):
        super(Encoder, self).__init__()

        self.attention = torch.nn.MultiheadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)
        

    def forward(self, x):
        attention_output, wei = self.attention(x, x, x)
        x = self.norm1(x + attention_output)
        ffn_output = self.ffn(x)
        x = self.norm2(x + ffn_output)
        return x



In [219]:
class Decoder(nn.Module):

    def __init__(self, d_model, n_heads, d_ff):
        super(Decoder, self).__init__()

        self.attention1 = torch.nn.MultiheadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.attention2 = torch.nn.MultiheadAttention(d_model, n_heads)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, enc_output, mask):
        attention_output1, wei = self.attention1(x, x, x,attn_mask = mask)
        x = self.norm1(x + attention_output1) 
        attention_output2, wei = self.attention2(x, enc_output, enc_output)
        x = self.norm2(x + attention_output2)
        ffn_output = self.ffn(x)
        x = self.norm3(x + ffn_output)
        return x



In [221]:
class Transformer(nn.Module):

    def __init__(self, vocab_size, target_size, max_length, d_model, num_heads, d_ff, n_layers):
        super(Transformer, self).__init__()
        
        self.enc_embedding = nn.Embedding(vocab_size, d_model)
        self.dec_embedding = nn.Embedding(target_size, d_model)
        self.positional_encoding = PositionalEncoding(max_length, d_model)
        
        self.encoder_layers = [Encoder(d_model, num_heads,d_ff) for i in range(n_layers)]
        self.decoder_layers = [Decoder(d_model, num_heads, d_ff) for i in range(n_layers)]

        self.linear = nn.Linear(d_model, target_size)

    def check_for_nans(self):
        for name, param in self.named_parameters():
            if torch.isnan(param).any():
                print(f"NaN detected in parameter: {name}")

    def forward(self,inp,out):

        check_for_nans(inp,"inp")
        check_for_nans(out, "out")
        
        mask = create_mask(out)

        check_for_nans(mask, "mask")
        
        out_embedded = self.positional_encoding(self.dec_embedding(out))
        inp_embedded = self.positional_encoding(self.enc_embedding(inp))

        check_for_nans(out_embedded, "out_embedded")
        check_for_nans(inp_embedded, "inp_embedded")
        
        enc_output = inp_embedded
        for encoder in self.encoder_layers:
            enc_output = encoder(enc_output)

        check_for_nans(enc_output, "enc_output")
        
        dec_output = out_embedded
        for decoder in self.decoder_layers:
            dec_output = decoder(dec_output, enc_output, mask)

        check_for_nans(dec_output, "dec_output")
        
        output = self.linear(dec_output)

        check_for_nans(output, "output")
        return output   

# Mise en forme des données

In [224]:
import nltk
import csv
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import string
punctuation = string.punctuation

In [226]:
seq_len = 10

In [228]:
# On importe le dataset
x = []
y = []
with open("eng_-french.csv",encoding = "utf-8") as file:
    reader = csv.reader(file)
    for row in reader:
        y.append(row[0])
        x.append(row[-1])
    x.pop(0)
    y.pop(0)

In [230]:
# On n'utilise qu'une partie du dataset
x_red = x[:40000]
y_red = y[:40000]

In [232]:
from sklearn.model_selection import train_test_split

In [234]:
x_train, x_test, y_train, y_test = train_test_split(x_red, y_red, test_size = 0.2)

In [236]:
# On tokenize les phrase
x_train = [word_tokenize(word) for word in x_train]
y_train = [word_tokenize(word) for word in y_train]
x_test = [word_tokenize(word) for word in x_test]
y_test = [word_tokenize(word) for word in y_test]

In [238]:
# On retire la ponctuation
for i in range(len(x_train)):
    for j in range(len(x_train[i])-1,-1,-1):
        if x_train[i][j] in punctuation:
            x_train[i].pop(j)

for i in range(len(y_train)):
    for j in range(len(y_train[i])-1,-1,-1):
        if y_train[i][j] in punctuation:
            y_train[i].pop(j)

for i in range(len(x_test)):
    for j in range(len(x_test[i])-1,-1,-1):
        if x_test[i][j] in punctuation:
            x_test[i].pop(j)

for i in range(len(y_test)):
    for j in range(len(y_test[i])-1,-1,-1):
        if y_test[i][j] in punctuation:
            y_test[i].pop(j)


In [240]:
# On lemmise tous nos mots
stemmer = PorterStemmer()
for i in range(len(y_train)):
    y_train[i] = [stemmer.stem(word) for word in y_train[i]]

for i in range(len(x_train)):
    x_train[i] = [stemmer.stem(word) for word in x_train[i]]

for i in range(len(y_test)):
    y_test[i] = [stemmer.stem(word) for word in y_test[i]]

for i in range(len(x_test)):
    x_test[i] = [stemmer.stem(word) for word in x_test[i]]


In [242]:
# On récupère la taille des 2 vocabulaires (francais/anglais)
vocab = []
target_vocab = []
for sentence in x_train:
    for word in sentence:
        if word not in vocab:
            vocab.append(word)

for sentence in y_train:
    for word in sentence:
        if word not in target_vocab:
            target_vocab.append(word)

print(len(vocab))
print(len(target_vocab))

8110
3610


In [244]:
from collections import Counter

In [246]:
# On crée les dictionnaires des 2 vocabulaires
cnt = Counter()
for sentence in x_train:
    for word in sentence:
        cnt[word] += 1

li = cnt.most_common(len(vocab))
vocab = {}
for i in range(len(li)):
    word, n = li[i]
    vocab[word] = i + 4

In [248]:
cnt = Counter()
for sentence in y_train:
    for word in sentence:
        cnt[word] += 1

li = cnt.most_common(len(target_vocab))
target_vocab = {}
for i in range(len(li)):
    word, n = li[i]
    target_vocab[word] = i + 4

In [250]:
# On enlève les mots qui n'appartiennent pas au vocab et remplace les autre par un entier
for sentence in x_train:
    for i in range(len(sentence) - 1, -1, -1):
        if sentence[i] not in vocab:
            sentence.pop(i)
        else:
            sentence[i] = vocab[sentence[i]]

for sentence in x_test:
    for i in range(len(sentence) - 1, -1, -1):
        if sentence[i] not in vocab:
            sentence.pop(i)
        else:
            sentence[i] = vocab[sentence[i]]

In [252]:
for sentence in y_train:
    for i in range(len(sentence) - 1, -1, -1):
        if sentence[i] not in target_vocab:
            sentence.pop(i)
        else:
            sentence[i] = target_vocab[sentence[i]]

for sentence in y_test:
    for i in range(len(sentence) - 1, -1, -1):
        if sentence[i] not in target_vocab:
            sentence.pop(i)
        else:
            sentence[i] = target_vocab[sentence[i]]

In [254]:
# Permet de transformer chaque phrase en un vecteur de dim max_length + 2
def make_vector(li, max_length):
    if len(li) > max_length:
        return [1] + li[:max_length] + [2]
    else:
        return [1] + li + [3 for i in range(max_length - len(li))] + [2]

In [256]:
# On applique la fonction a nos phrases
for i in range(len(x_train)):
    x_train[i] = make_vector(x_train[i], seq_len)
for i in range(len(y_train)):
    y_train[i] = make_vector(y_train[i], seq_len)

for i in range(len(x_test)):
    x_test[i] = make_vector(x_test[i], seq_len)
for i in range(len(y_test)):
    y_test[i] = make_vector(y_test[i], seq_len)

In [258]:
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import IntTensor

In [260]:
# Création des Dataloader
trainset = TensorDataset(IntTensor(x_train), IntTensor(y_train))
train_dataloader = DataLoader(trainset, batch_size=32, shuffle=True, drop_last=True)

testset = TensorDataset(IntTensor(x_test), IntTensor(y_test))
test_dataloader = DataLoader(testset, batch_size=1, shuffle=True, drop_last=True)

# Entrainement du transformer

In [263]:
from torch.optim import Adam

In [265]:
# Définition des hyperparamètres
tf1 = Transformer(len(vocab) + 4,len(target_vocab) + 4, seq_len + 2, 512, 8, 1024, 6)

In [267]:
adam = Adam(tf1.parameters(),lr=1e-5)
loss_fn = nn.CrossEntropyLoss(reduction='mean')

In [269]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)

    model.train()
    for batch, (X, y) in enumerate(dataloader):

        batch_size = len(y)
        pred = model(X, y)
        pred = pred.view(-1, pred.size(-1))  
        y = y.view(-1)  
        y = y.long()
        loss = loss_fn(pred, y)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [271]:
# Entrainement sur 3 epochs
nb_epoch = 3
for i in range(nb_epoch):
    train_loop(train_dataloader, tf1,loss_fn, adam)

loss: 7.963542  [   32/32000]
loss: 7.371623  [ 3232/32000]
loss: 6.790514  [ 6432/32000]
loss: 6.258421  [ 9632/32000]
loss: 5.786921  [12832/32000]
loss: 5.276005  [16032/32000]
loss: 4.723555  [19232/32000]
loss: 4.324573  [22432/32000]
loss: 3.934635  [25632/32000]
loss: 3.510591  [28832/32000]
loss: 3.081415  [   32/32000]
loss: 2.749336  [ 3232/32000]
loss: 2.539872  [ 6432/32000]
loss: 2.242742  [ 9632/32000]
loss: 2.076568  [12832/32000]
loss: 1.973610  [16032/32000]
loss: 1.909057  [19232/32000]
loss: 1.760130  [22432/32000]
loss: 1.733293  [25632/32000]
loss: 1.685493  [28832/32000]
loss: 1.626862  [   32/32000]
loss: 1.587809  [ 3232/32000]
loss: 1.470157  [ 6432/32000]
loss: 1.462643  [ 9632/32000]
loss: 1.457907  [12832/32000]
loss: 1.531557  [16032/32000]
loss: 1.334028  [19232/32000]
loss: 1.384270  [22432/32000]
loss: 1.323455  [25632/32000]
loss: 1.300388  [28832/32000]


# Prédictions

In [305]:
def greedy_decode(model, src_tensor, max_len=seq_len + 2, start_token=1, end_token=2):
    output = []

    batch_size = src_tensor.shape[0]  
    dec_input = torch.full((1, batch_size), start_token, dtype=torch.long)  

    # On transpose pour respecter l'ordre des dimensions
    src_tensor = src_tensor.transpose(0, 1)  
    
    for i in range(max_len):
        # On recupère l'output du modèle
        dec_output = model(src_tensor, dec_input) 

        # On sélectionne le token avec la meilleur probabilité
        next_token = dec_output[-1, :, :].argmax(dim=-1)  
        
        output.extend(next_token.tolist())  
        
        # On le convertit pour pouvoir l'ajouter à l'input du decodeur
        next_token = next_token.unsqueeze(0)  
        next_token = next_token.transpose(0,1)
        
        # On concatene le token avec l'input du décodeur
        dec_input = torch.cat([dec_input, next_token], dim=0)  


        # Si le token de fin est généré on s'arrete
        if (next_token == end_token).all():  
            break

    return output


In [307]:
# Transforme une liste de token en string
def decode(x):
    for i in range(len(x)):
        if x[i] <= 3:
            x[i] = ""
        else:    
            for keys in target_vocab.keys():
                if x[i] == target_vocab[keys]:
                    x[i] = keys + " "
    sentence = ""
    for i in range(len(x)):
        sentence += x[i] 
    return sentence

In [277]:
from nltk.translate.bleu_score import sentence_bleu

In [279]:
# Donne la moyenne du score BLEU du model sur les données d'un dataloader
def evaluate_model(model, dataloader):
    model.eval()
    total_bleu = 0
    for  batch, (X, y) in enumerate(dataloader): 
        predicted_tokens = greedy_decode(model, X)
        predicted_translation = decode(predicted_tokens)
        
        target = decode(y.tolist()[0])
        bleu_score = sentence_bleu(target,predicted_translation )
        total_bleu += bleu_score

    avg_bleu = total_bleu / len(dataloader)
    print(f"Average BLEU Score: {avg_bleu}")

In [281]:
evaluate_model(tf1, test_dataloader)

Average BLEU Score: 0.0


In [309]:
# Prend un string et renvoie le string qui est donnée par le model
def translate(x,model = tf1):
    x = word_tokenize(x)
    for i in range(len(x) - 1, -1, -1):
        if x[i] in punctuation:
            x.pop(i)
    x = [stemmer.stem(word) for word in x]
    for i in range(len(x) - 1, -1, -1):
        if x[i] not in vocab:
            x.pop(i)
        else:
            x[i] = vocab[x[i]]
    x = make_vector(x, seq_len)
    x = [x]
    xset = TensorDataset(IntTensor(x), IntTensor(x))
    x_dataloader = DataLoader(xset, batch_size=1, shuffle=True, drop_last=True)
    
    for  batch, (X, y) in enumerate(x_dataloader):
        predicted_tokens = greedy_decode(model, X)
        print(predicted_tokens)
        print(batch)
        x = decode(predicted_tokens)   
    return x
    

In [311]:
translate("Etre ou ne pas etre")

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
0


''