In [1]:
# %load C:\Users\jacqu\Scolarite\M2A\AMAL\student_tp6\src\tp6-traduction.py
import datamaestro
from torch.autograd import Variable
import numpy as np
import logging
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch
import unicodedata
import string
from tqdm import tqdm
from pathlib import Path
from typing import List

import time
import re
from torch.utils.tensorboard import SummaryWriter
logging.basicConfig(level=logging.INFO)

FILE = r"C:\Users\jacqu\Scolarite\M2A\AMAL\data\en-fra.txt"

#writer = SummaryWriter("runs/tag-"+time.asctime())

def normalize(s):
    return re.sub(' +',' ', "".join(c if c in string.ascii_letters else " "
         for c in unicodedata.normalize('NFD', s.lower().strip())
         if  c in string.ascii_letters+" "+string.punctuation)).strip()


class Vocabulary:
    """Permet de gérer un vocabulaire.

    En test, il est possible qu'un mot ne soit pas dans le
    vocabulaire : dans ce cas le token "__OOV__" est utilisé.
    Attention : il faut tenir compte de cela lors de l'apprentissage !

    Utilisation:

    - en train, utiliser v.get("blah", adding=True) pour que le mot soit ajouté
      automatiquement
    - en test, utiliser v["blah"] pour récupérer l'ID du mot (ou l'ID de OOV)
    """
    PAD = 0
    EOS = 1
    SOS = 2
    OOVID = 3

    def __init__(self, oov: bool):
        self.oov = oov
        self.id2word = ["PAD", "EOS", "SOS"]
        self.word2id = {"PAD": Vocabulary.PAD, "EOS": Vocabulary.EOS, "SOS": Vocabulary.SOS}
        if oov:
            self.word2id["__OOV__"] = Vocabulary.OOVID
            self.id2word.append("__OOV__")

    def __getitem__(self, word: str):
        if self.oov:
            return self.word2id.get(word, Vocabulary.OOVID)
        return self.word2id[word]

    def get(self, word: str, adding=True):
        try:
            return self.word2id[word]
        except KeyError:
            if adding:
                wordid = len(self.id2word)
                self.word2id[word] = wordid
                self.id2word.append(word)
                return wordid
            if self.oov:
                return Vocabulary.OOVID
            raise

    def __len__(self):
        return len(self.id2word)

    def getword(self, idx: int):
        if idx < len(self):
            return self.id2word[idx]
        return None

    def getwords(self, idx: List[int]):
        return [self.getword(i) for i in idx]



class TradDataset():
    def __init__(self,data,vocOrig,vocDest,adding=True,max_len=10):
        self.sentences =[]
        for s in tqdm(data.split("\n")):
            if len(s)<1:continue
            orig,dest=map(normalize,s.split("\t")[:2])
            if len(orig)>max_len: continue
            self.sentences.append((torch.tensor([vocOrig.get(o) for o in orig.split(" ")]+[Vocabulary.EOS]),torch.tensor([vocDest.get(o) for o in dest.split(" ")]+[Vocabulary.EOS])))
    def __len__(self):return len(self.sentences)
    def __getitem__(self,i): return self.sentences[i]



def collate(batch):
    orig,dest = zip(*batch)
    o_len = torch.tensor([len(o) for o in orig])
    d_len = torch.tensor([len(d) for d in dest])
    return pad_sequence(orig),o_len,pad_sequence(dest),d_len


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


with open(FILE) as f:
    lines = f.readlines()

lines = [lines[x] for x in torch.randperm(len(lines))]
idxTrain = int(0.8*len(lines))

vocEng = Vocabulary(True)
vocFra = Vocabulary(True)

BATCH_SIZE = 100
MAX_LEN = 200

datatrain = TradDataset("".join(lines[:idxTrain]),vocEng,vocFra,max_len=MAX_LEN)
datatest = TradDataset("".join(lines[idxTrain:]),vocEng,vocFra,max_len=MAX_LEN)

train_loader = DataLoader(datatrain, collate_fn=collate, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(datatest, collate_fn=collate, batch_size=BATCH_SIZE, shuffle=True)

#  TODO:  Implémenter l'encodeur, le décodeur et la boucle d'apprentissage


100%|███████████████████████████████████████████████████████████████████████| 136521/136521 [00:07<00:00, 18385.65it/s]
100%|█████████████████████████████████████████████████████████████████████████| 34132/34132 [00:01<00:00, 17069.06it/s]


In [2]:
print(len(vocEng),len(vocFra))

13787 25684


# Encoder/Decoder

In [3]:
class Encoder(nn.Module):
    def __init__(self, dim_emb, dim_hidden):
        super(Encoder, self).__init__()
        self.dim_hidden = dim_hidden
        self.dim_emb = dim_emb
        self.n_layers = 2
        self.emb = nn.Embedding(len(vocEng), dim_emb)
        self.gru = nn.GRU(dim_emb, dim_hidden,num_layers = self.n_layers)
        
    def forward(self, x, h):        
        z = self.emb(x) 
        output, h_n = self.gru(z,h)        
        return(output,h_n)        

    def init_hidden(self, batch_size):
        return(Variable(next(self.parameters()).data.new(self.n_layers, batch_size, self.dim_hidden)))

    
class Decoder(nn.Module):
    def __init__(self,dim_emb_fr,dim_hidden):
        super(Decoder, self).__init__()
        self.dim_emb_fr = dim_emb_fr
        self.dim_hidden = dim_hidden
        
        self.emb = nn.Embedding(len(vocFra), dim_emb_fr)
        self.relu = nn.ReLU()
        self.gru = nn.GRU(dim_emb_eng, dim_hidden,num_layers = 2)
        self.decoder = nn.Linear(dim_hidden,len(vocFra))
        self.softmax = nn.Softmax(dim=1) ##Check la dim
                
    def forward(self,x,h):
        x_emb_relu = self.relu(self.emb(x))
        output,h_n = self.gru(x_emb_relu,h)
        return(self.softmax(self.decoder(output)),h_n)
   
    
    
    def generate(self, hidden, lenseq=None, SOS_IX = 2, EOS_IX = 1, PAD = 0, random_gen = False):
        BATCH_SIZE = hidden.shape[1]
        seq = torch.tensor([SOS_IX]).long().repeat(BATCH_SIZE).unsqueeze(0).to(device) #Initialize empty sequence with SOS as first token
        seq_probs = torch.tensor([]).to(device)
        done_mask = torch.ones(BATCH_SIZE).long().to(device)
 
        while len(seq) <= lenseq and done_mask.sum() > 0:
            #encoded_seq, hidden = self.gru(embed_seq, hidden)
            #decoded_seq = self.decode(encoded_seq)
            decoded_seq, h_n = self.forward(seq,hidden)
 
            if random_gen:
                probs = decoded_seq.cpu().numpy()
                nxt_wd = np.random.choice(range(len(decoded_seq[-1][0])), p = probs[0])
                nxt_wd = torch.tensor(nxt_wd)
            else:
                nxt_wd = decoded_seq[-1].argmax(dim=1) # Most likely next word
 
            seq = torch.cat((seq, nxt_wd.unsqueeze(0)))
            done_mask = done_mask * (seq[-1] != EOS_IX)
            seq = seq * done_mask
            seq_probs = torch.cat((seq_probs, decoded_seq[-1].unsqueeze(0)))
 
        return seq[1:], seq_probs

## Train

In [4]:
# ENCODERS
 
dim_emb, dim_emb_eng = 40,45
latent_dim = 70
encoder = Encoder(dim_emb, latent_dim).to(device)
decoder = Decoder(dim_emb_eng, latent_dim).to(device)
 
# HYPERPARAMETERS

n_epoch = 1
iter = 0
lr = 1e-2
constrain_prob = 0.75
 
    
# LEARNING
 
PAD = 0
optim_encod = torch.optim.Adam(params = encoder.parameters(), lr=lr)
optim_decod = torch.optim.Adam(params = decoder.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss(ignore_index=PAD)
 
    
start = time.time()
list_loss,epoch_loss,Big_list_loss = [],[],[]
for epoch in range(n_epoch):

    Big_list_loss += list_loss
    if list_loss != []:
        epoch_loss.append(np.mean(list_loss))
    list_loss = []
    
    for eng_seq, len_eng_seq, fr_seq, len_fr_seq in train_loader:
        optim_encod.zero_grad()
        optim_decod.zero_grad()
        eng_seq, fr_seq = eng_seq.to(device), fr_seq.to(device) 
        hidden_enc = encoder.init_hidden(100).to(device)
        encoded_original_seq, hidden = encoder(eng_seq,hidden_enc)
 
        toss = torch.rand(1)
 
        if toss <= constrain_prob:
            # TEACHER FORCING
            loss = 0
            gen_seq_probs = torch.tensor([]).to(device)
            for k in range(len(fr_seq)):
                nxt_wd, nxt_wd_prob = decoder.generate(hidden, lenseq = 1)
                gen_seq_probs = torch.cat((gen_seq_probs, nxt_wd_prob))
                _, hidden = decoder(fr_seq[:k+1], hidden)

        else:
            gen_seq, gen_seq_probs = decoder.generate(hidden, lenseq = len_fr_seq.max())
        print(gen_seq_probs.shape,fr_seq.shape)
        loss = criterion(gen_seq_probs.view(fr_seq.view(-1).size(0),-1), fr_seq.detach().view(-1))
        #print(loss)
  
        if iter%10 == 0:
            #print(f'iter {iter} loss: {loss / len(fr_seq)}')
            if iter != 0:
                break
 
        loss.backward()
        optim_decod.step()
        optim_encod.step()
        list_loss.append(float(loss))
        iter += 1
        
stop = time.time()
print(f"Done {nbr_epoch} epoch in {(stop-start)/60 :.2} minutes.")    

torch.Size([19, 100, 25684]) torch.Size([19, 100])
torch.Size([17, 100, 25684]) torch.Size([17, 100])


RuntimeError: CUDA out of memory. Tried to allocate 168.00 MiB (GPU 0; 6.00 GiB total capacity; 1.12 GiB already allocated; 95.44 MiB free; 1.81 GiB reserved in total by PyTorch)