In [1]:
# Importing the libraries

import torch as th
import torch as tn
from torchtext import vocab
import pickle as pl
from Transformer_Builtin import src_mask,tgt_mask
from Transformer import Transformer
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Load English data and French data
with open('data/English_encodings.pkl', 'rb') as f:
    english_encodings,english_sentences,Paddings_en,Vocab_en = pl.load(f)
with open('data/French_encodings.pkl', 'rb') as f:
    french_encodings,french_sentences,Paddings_fr,Vocab_fr = pl.load(f)

# Get the vocabulary size
src_vocab_size = Vocab_fr.__len__()
tgt_vocab_size = Vocab_en.__len__()


In [24]:
d_model = 128
# Define Train/test split and Masking
X = tn.tensor(french_encodings)
Y = tn.tensor(english_encodings)


n_train = 1000
X_train = X[:n_train]
Y_train = Y[:n_train]
X_vali = X[10001:12000]
Y_vali = Y[10001:12000]
X_test = X[12001:]
Y_test = Y[12001:]

# Create batches
batch_size = 50

# Initialize the Batch tensors
X_train_batches = tn.zeros((int(n_train/batch_size),batch_size,27),dtype = tn.int64)
Y_train_batches = tn.zeros((int(n_train/batch_size),batch_size,27),dtype = tn.int64)

# Create the batches
for batch in range(int(n_train/batch_size)):

    # Fill Data batches
    X_train_batches[batch] = X_train[batch*batch_size:(batch+1)*batch_size]
    Y_train_batches[batch] = Y_train[batch*batch_size:(batch+1)*batch_size]

# Initialize the Mask tensors
#src_mask_test = tn.zeros((int(n_train/batch_size),batch_size*8,27,27))
#tgt_mask_test = tn.zeros((int(n_train/batch_size),batch_size*8,27,27))

src_key_masks = tn.ones((int(n_train/batch_size),batch_size,27,d_model))
tgt_key_masks = tn.ones((int(n_train/batch_size),batch_size,27,d_model))

idx_sample = 0
for batch in range(int(n_train/batch_size)):
    for sample in range(batch_size):
        
        src_key_masks[batch,sample][-Paddings_fr[idx_sample]:] = 0
        tgt_key_masks[batch,sample][-Paddings_en[idx_sample]:] = 0

        # for i in range(27):
        #     if np.random.rand() < 0.1:
        #         tgt_key_masks[batch,sample][i] = True

        #tgt_mask_test[batch,idx_sample*8:idx_sample*8+8] = tgt_mask(27,Paddings_en[idx_sample])
        #src_mask_test[batch,idx_sample*8:idx_sample*8+8] = src_mask(27,Paddings_fr[idx_sample])

        idx_sample += 1



In [15]:
# Intialize the model with set hyperparameters
T = 27
d_model = 128 # Dimension of the model (Embedding size)
d_ff = 256 # Dimension of the feedforward network model in transformer
nhead = 3 # Number of heads in the multiheadattention models
dk = 64
dv = 64
num_layers = 3

src_padding_idx = Vocab_fr.__getitem__('<Pad>')
tgt_padding_idx = Vocab_en.__getitem__('<Pad>')

# Initialize the model
Model = Transformer(
    T = T,
    d_model = d_model,
    nhead = nhead,
    d_ff = d_ff,
    dk = 64,
    dv = 64,
    num_layers = num_layers,
    src_vocab_size = src_vocab_size,
    tgt_vocab_size = tgt_vocab_size,
    src_padding_idx = src_padding_idx,
    tgt_padding_idx = tgt_padding_idx,
    dropout=0.1
)
    

In [None]:
tgt_mask = th.full((T,T),float('-inf')).triu(diagonal=1)
out = Model(X_train[0],Y_train[0],src_padding_mask = src_key_masks[0][0],tgt_padding_mask = tgt_key_masks[0][0],tgt_mask = tgt_mask)
out.argmax(dim=1)

In [19]:
loss_fn = tn.nn.CrossEntropyLoss()
optimizer = tn.optim.Adam(Model.parameters(), lr=0.0001)

# Def the training function
def trainmodel(epochs):
    if tn.cuda.is_available():
        device = tn.device("cuda")
    elif tn.backends.mps.is_available():
        device = "cpu"
    else:
        device = "cpu"

    Model.to(device)
    X_train_batches_mps = X_train_batches.to(device)
    Y_train_batches_mps = Y_train_batches.to(device)
    tgt_mask = th.full((T,T),float('-inf')).triu(diagonal=1).to(device)


    for epoch in tqdm(range(epochs)):
        loss_epoch = 0

        for X_batch,Y_batch,src_m,tgt_m in zip(X_train_batches_mps,Y_train_batches_mps,src_key_masks.to(device),tgt_key_masks.to(device)):
            
            
            optimizer.zero_grad()
            # out = Model(X_batch,Y_batch,
            #             #tgt_mask = no_ahead_mask_mps,
            #             #src_key_padding_mask = src_m,
            #             #tgt_key_padding_mask = tgt_m
            #             )

            loss = 0
            for i in range(batch_size):
                out = Model(X_batch[i],Y_batch[i],
                        tgt_mask = tgt_mask,
                         src_key_padding_mask = src_m[i],
                         tgt_key_padding_mask = tgt_m[i]
                         )

                loss += loss_fn(out,Y_batch[i])
            #loss = loss_fn(out,Y_batch)
            loss_epoch += loss.item()

            loss.backward()
            
            optimizer.step()
        print(f"Epoch: {epoch} Loss: {loss_epoch/n_train}")    

#%%

trainmodel(10)

tn.save(Model.state_dict(), "Transformer.pt")

 10%|█         | 1/10 [00:07<01:11,  7.90s/it]

Epoch: 0 Loss: 5.1924647521972656


 20%|██        | 2/10 [00:15<01:02,  7.85s/it]

Epoch: 1 Loss: 4.989856658935547


 30%|███       | 3/10 [00:23<00:54,  7.84s/it]

Epoch: 2 Loss: 4.956603897094727


 40%|████      | 4/10 [00:30<00:45,  7.65s/it]

Epoch: 3 Loss: 4.941916427612305


 50%|█████     | 5/10 [00:38<00:38,  7.60s/it]

Epoch: 4 Loss: 4.932976272583008


 60%|██████    | 6/10 [00:46<00:30,  7.62s/it]

Epoch: 5 Loss: 4.927104293823242


 70%|███████   | 7/10 [00:53<00:22,  7.64s/it]

Epoch: 6 Loss: 4.923023880004883


 80%|████████  | 8/10 [01:01<00:15,  7.58s/it]

Epoch: 7 Loss: 4.920069320678711


 90%|█████████ | 9/10 [01:08<00:07,  7.63s/it]

Epoch: 8 Loss: 4.917857818603515


100%|██████████| 10/10 [01:16<00:00,  7.63s/it]

Epoch: 9 Loss: 4.916156799316406





In [21]:
tgt_mask = th.full((T,T),float('-inf')).triu(diagonal=1)
out = Model(X_train[0],Y_train[0],tgt_mask = tgt_mask)
out.argmax(dim=1)

tensor([205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205,
        205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205])

In [23]:
Model.PositionalEncoding(Model.Embedding_src(X_train[0]))

tensor([[ 1.0354,  0.1280, -1.0998,  ...,  1.0204,  1.3559,  1.3815],
        [ 0.5106,  1.5056, -0.2125,  ...,  0.2947,  1.5198,  1.3693],
        [ 1.1103, -1.5857, -0.1996,  ..., -0.2834,  0.9721,  0.9523],
        ...,
        [-0.9056,  0.4242,  0.9349,  ...,  1.0000,  0.0028,  1.0000],
        [-0.1324,  0.9912,  0.3354,  ...,  1.0000,  0.0029,  1.0000],
        [ 0.7626,  0.6469, -0.5003,  ...,  1.0000,  0.0030,  1.0000]],
       grad_fn=<AddBackward0>)