In [5]:
import torch
import torchvision
import matplotlib.pyplot as plt
from torch import nn
import math

# Création de la classe Transformers

Création des différentes couches

In [343]:
class FeedForwardNetwork(nn.Module):
    
    def __init__(self, d_model, d_ff):
        super(FeedForwardNetwork, self).__init__()
        
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [345]:
# Optionel : Coder notre propre couche multi-attention head (pour ne pas utiliser celle fournit par torch.nn)

In [347]:
def create_mask(x):
    len = x.size(0)
    mask = torch.triu(torch.ones(len,len), diagonal = 1) * (-1) * float("inf") # Matrice triangulaire supérieur de valeur -inf
    return mask

In [349]:
class PositionalEncoding(nn.Module):

    def __init__(self, max_length, d_model):
        super(PositionalEncoding, self).__init__()
        
        pe = torch.zeros(max_length, d_model)
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        
        self.register_buffer('pe', pe.unsqueeze(0))
        
    def forward(self, x):
        return x + self.pe[:, :x.size(1)]
        

In [351]:
class Encoder(nn.Module):

    def __init__(self, d_model, n_heads, d_ff):
        super(Encoder, self).__init__()

        self.attention = torch.nn.MultiheadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm2 = nn.LayerNorm(d_model)
        

    def forward(self, x):
        attention_output, wei = self.attention(x, x, x)
        x = self.norm1(x + attention_output)
        ffn_output = self.ffn(x)
        x = self.norm2(x + ffn_output)
        return x



In [353]:
class Decoder(nn.Module):

    def __init__(self, d_model, n_heads, d_ff):
        super(Decoder, self).__init__()

        self.attention1 = torch.nn.MultiheadAttention(d_model, n_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.attention2 = torch.nn.MultiheadAttention(d_model, n_heads)
        self.norm2 = nn.LayerNorm(d_model)
        self.ffn = FeedForwardNetwork(d_model, d_ff)
        self.norm3 = nn.LayerNorm(d_model)

    def forward(self, x, enc_output, mask):
        attention_output1, wei = self.attention1(x, x, x,attn_mask = mask)
        x = self.norm1(x + attention_output1)
        attention_output2, wei = self.attention2(x, enc_output, enc_output)
        x = self.norm2(x + attention_output2)
        ffn_output = self.ffn(x)
        x = self.norm3(x + ffn_output)
        return x



In [355]:
class Transformer(nn.Module):

    def __init__(self, vocab_size, target_size, max_length, d_model, num_heads, d_ff, n_layers):
        super(Transformer, self).__init__()
        
        self.enc_embedding = nn.Embedding(vocab_size, d_model)
        self.dec_embedding = nn.Embedding(target_size, d_model)
        self.positional_encoding = PositionalEncoding(max_length, d_model)
        
        self.encoder_layers = [Encoder(d_model, num_heads,d_ff) for i in range(n_layers)]
        self.decoder_layers = [Decoder(d_model, num_heads, d_ff) for i in range(n_layers)]

        self.linear = nn.Linear(d_model, target_size)

    def forward(self,inp,out):
        mask = create_mask(out)
        out_embedded = self.positional_encoding(self.dec_embedding(out))
        inp_embedded = self.positional_encoding(self.enc_embedding(inp))

        enc_output = inp_embedded
        for encoder in self.encoder_layers:
            enc_output = encoder(enc_output)

        dec_output = out_embedded
        for decoder in self.decoder_layers:
            dec_output = decoder(dec_output, enc_output, mask)

        output = self.linear(dec_output)

        return output   

# Mise en forme des données

In [205]:
import nltk
import csv
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
import string
punctuation = string.punctuation

In [207]:
x = []
y = []
with open("eng_-french.csv",encoding = "utf-8") as file:
    reader = csv.reader(file)
    for row in reader:
        y.append(row[0])
        x.append(row[-1])
    x.pop(0)
    y.pop(0)

In [304]:
x_red = x[:10000]
y_red = y[:10000]

In [306]:
x_train = [word_tokenize(word) for word in x_red]
y_train = [word_tokenize(word) for word in y_red]

In [308]:
for i in range(len(x_train)):
    for j in range(len(x_train[i])-1,-1,-1):
        if x_train[i][j] in punctuation:
            x_train[i].pop(j)

for i in range(len(y_train)):
    for j in range(len(y_train[i])-1,-1,-1):
        if y_train[i][j] in punctuation:
            y_train[i].pop(j)


In [310]:
stemmer = PorterStemmer()
for i in range(len(y_train)):
    y_train[i] = [stemmer.stem(word) for word in y_train[i]]

for i in range(len(x_train)):
    x_train[i] = [stemmer.stem(word) for word in x_train[i]]


In [312]:
vocab = []
target_vocab = []
for sentence in x_train:
    for word in sentence:
        if word not in vocab:
            vocab.append(word)

for sentence in y_train:
    for word in sentence:
        if word not in target_vocab:
            target_vocab.append(word)

print(len(vocab))
print(len(target_vocab))

3796
1719


In [314]:
from collections import Counter

In [316]:
cnt = Counter()
for sentence in x_train:
    for word in sentence:
        cnt[word] += 1

li = cnt.most_common(len(vocab))
vocab = {}
for i in range(len(li)):
    word, n = li[i]
    vocab[word] = i + 3

In [318]:
cnt = Counter()
for sentence in y_train:
    for word in sentence:
        cnt[word] += 1

li = cnt.most_common(len(target_vocab))
target_vocab = {}
for i in range(len(li)):
    word, n = li[i]
    target_vocab[word] = i + 3

In [320]:
for sentence in x_train:
    for i in range(len(sentence) - 1, -1, -1):
        if sentence[i] not in vocab:
            sentence.pop(i)
        else:
            sentence[i] = vocab[sentence[i]]
            

In [322]:
for sentence in y_train:
    for i in range(len(sentence) - 1, -1, -1):
        if sentence[i] not in target_vocab:
            sentence.pop(i)
        else:
            sentence[i] = target_vocab[sentence[i]]

In [324]:
def make_vector(li, max_length):
    if len(li) > max_length:
        return [1] + li[:max_length] + [2]
    else:
        return [1] + li + [0 for i in range(max_length - len(li))] + [2]

In [326]:
for i in range(len(x_train)):
    x_train[i] = make_vector(x_train[i], 40)
for i in range(len(y_train)):
    y_train[i] = make_vector(y_train[i], 40)

In [328]:
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import IntTensor

In [387]:
# Création des Dataloader
trainset = TensorDataset(IntTensor(x_train), IntTensor(y_train))
train_dataloader = DataLoader(trainset, batch_size=16, shuffle=True, drop_last=True)


# Entrainement du transformer

In [390]:
from torch.optim import Adam

In [392]:
tf1 = Transformer(len(vocab) + 3,len(target_vocab) + 3, 42, 512, 8, 1024, 6)

In [408]:
adam = Adam(tf1.parameters(),lr=1e-5)
loss_fn = nn.CrossEntropyLoss(reduction='mean')

In [414]:
def train_loop(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        batch_size = len(y)
        pred = model(X, y)
        pred = pred.view(-1, pred.size(-1))  # [batch_size * seq_len, num_classes]
        y = y.view(-1)  # [batch_size * seq_len]
        y = y.long()
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 10 == 0:
            print(torch.min(y), torch.max(y))
            print(torch.min(pred), torch.max(pred))
            print(torch.isnan(X).any())
            for name, param in model.named_parameters():
                if torch.isnan(param).any():
                    print(f"NaN detected in parameter: {name}")
            loss, current = loss.item(), batch * batch_size + len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [416]:
nb_epoch = 3
for i in range(nb_epoch):
    train_loop(train_dataloader, tf1,loss_fn, adam)

tensor(0) tensor(925)
tensor(nan, grad_fn=<MinBackward1>) tensor(nan, grad_fn=<MaxBackward1>)
tensor(False)
NaN detected in parameter: enc_embedding.weight
NaN detected in parameter: dec_embedding.weight
NaN detected in parameter: linear.weight
NaN detected in parameter: linear.bias
loss:     nan  [   16/10000]


KeyboardInterrupt: 