In [1]:
# Importing the libraries
import torch as tn
import torch as th
from torchtext import vocab
import pickle as pl
from Models.Transformer import Transformer
from tqdm import tqdm
import numpy as np
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pickled encodings, sentences, padding information and vocabularies.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load these files if they come from a trusted source.
with open('data/English_encodings.pkl', 'rb') as f:
    english_encodings, english_sentences, Paddings_en, Vocab_en = pl.load(f)
with open('data/French_encodings.pkl', 'rb') as f:
    french_encodings, french_sentences, Paddings_fr, Vocab_fr = pl.load(f)

# Vocabulary sizes and padding indices: French is the source, English the target.
src_vocab_size = len(Vocab_fr)
tgt_vocab_size = len(Vocab_en)
src_padding_idx = Vocab_fr['<Pad>']
tgt_padding_idx = Vocab_en['<Pad>']


# Load the train, validation and test splits.
# Only the training split ships a target padding mask.
with open('data/Train_data.pkl', 'rb') as f:
    X_train, Y_train, src_padding_mask, tgt_padding_mask = pl.load(f)

with open('data/Validation_data.pkl', 'rb') as f:
    X_vali, Y_vali, src_padding_mask_vali = pl.load(f)

with open('data/Test_data.pkl', 'rb') as f:
    X_test, Y_test, src_padding_mask_test = pl.load(f)

# Look-ahead ("no look") mask: -inf strictly above the diagonal, 0 elsewhere.
# NOTE(review): the size 27 is hard-coded here; presumably it is the padded
# sequence length and must agree with the model's T - confirm.
tgt_mask = th.triu(
    th.full((27, 27), float('-inf')),
    diagonal=1,
)

In [3]:
# Function to train the model
def train(model, src_data, tgt_data, src_padding_mask, tgt_padding_mask, tgt_mask, optimizer, loss_fn, epochs):
    """Train ``model`` for ``epochs`` passes over pre-batched data.

    The model is called one sample at a time; the per-sample losses of a batch
    are summed, back-propagated once, and followed by a single optimizer step.

    Parameters
    ----------
    model : callable module
        Called as ``model(src, tgt, tgt_mask=..., src_padding_mask=...,
        tgt_padding_mask=...)``. It is moved to the selected device and put
        into training mode.
    src_data : torch.Tensor
        3-D tensor; the first dimension indexes batches and the second indexes
        the samples inside a batch.
    tgt_data, src_padding_mask, tgt_padding_mask : torch.Tensor
        Iterated in lock-step with ``src_data`` along the first dimension and
        indexed per sample along the second (assumed to share those two
        leading dimensions with ``src_data`` - TODO confirm).
    tgt_mask : torch.Tensor
        Passed unchanged to every model call.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to the model parameters.
    loss_fn : callable
        Called as ``loss_fn(out, tgt_sample)``; must return a scalar tensor.
    epochs : int
        Number of passes over the data.

    Returns
    -------
    tuple
        ``(loss_train, model)`` where ``loss_train`` is a list holding the
        summed loss of every processed batch (as Python floats), in order.

    Raises
    ------
    ValueError
        If ``src_data`` contains no batches or the batches contain no samples.
    """
    # Use CUDA when available, otherwise the CPU.
    # NOTE(review): the previous code also probed for MPS but selected the CPU
    # in that case as well; that choice is kept (without the probe, which fails
    # on torch builds lacking `backends.mps`). Confirm whether MPS should be used.
    device = tn.device("cuda") if tn.cuda.is_available() else tn.device("cpu")

    # Get the number of batches and the batch size (src_data must be 3-D)
    n_batches, batch_size, _ = src_data.shape
    if n_batches == 0 or batch_size == 0:
        # Without this guard an empty batch fails on `int.backward()` and an
        # empty dataset fails on the division in the epoch report.
        raise ValueError(
            f"src_data must contain at least one non-empty batch, got shape {tuple(src_data.shape)}"
        )

    # Move the model to the device and make sure training-only layers
    # (e.g. dropout) are active even if the caller left the model in eval mode.
    model.to(device)
    model.train()

    # Move the data to the device
    src_data = src_data.to(device)
    tgt_data = tgt_data.to(device)
    src_padding_mask = src_padding_mask.to(device)
    tgt_padding_mask = tgt_padding_mask.to(device)
    tgt_mask = tgt_mask.to(device)

    # Summed loss of every batch, across all epochs
    loss_train = []

    # Train the model
    for epoch in range(epochs):
        # Running sum of the batch losses for this epoch
        epoch_loss = 0.0

        # Go through each batch
        for src_batch, tgt_batch, src_pad, tgt_pad in zip(src_data, tgt_data, src_padding_mask, tgt_padding_mask):

            # Zero the gradients
            optimizer.zero_grad()

            # Batch loss (becomes a tensor after the first sample is added)
            loss = 0

            # Go through each sample of the batch
            # NOTE(review): tgt_batch[i] is used both as the decoder input and
            # as the label. Unless the model shifts the target internally, and
            # if tgt_mask only hides strictly-future positions, every position
            # can attend to its own label - confirm against the model.
            for i in range(batch_size):
                out = model(src_batch[i], tgt_batch[i],
                        tgt_mask = tgt_mask,
                        src_padding_mask = src_pad[i],
                        tgt_padding_mask = tgt_pad[i]
                        )

                loss = loss + loss_fn(out, tgt_batch[i])

            # Backpropagate the summed batch loss
            loss.backward()

            # Update the weights
            optimizer.step()

            # Record the loss (read it back from the device only once)
            batch_loss = loss.item()
            epoch_loss += batch_loss
            loss_train.append(batch_loss)

        # Report the average loss per sample
        print(f'Epoch {epoch+1}/{epochs} Loss: {epoch_loss/(n_batches*batch_size)}')


    return loss_train, model


In [4]:
# Initialize the model with set hyperparameters
T = 27 # Passed to the Transformer as T (presumably the padded sequence length - confirm)
d_model = 512 # Dimension of the model (Embedding size)
d_ff = 2048 # Dimension of the feedforward network model in transformer
nhead = 8 # Number of heads in the multiheadattention models
dk = d_model//nhead # Per-head key dimension
dv = d_model//nhead # Per-head value dimension
num_layers = 6

# Training configuration
SEED = 0 # Fixed seed so weight initialisation and dropout are repeatable
LEARNING_RATE = 0.0001
EPOCHS = 10

# Seed torch before the model is built so a fresh "Restart & Run All"
# starts from the same initial weights.
tn.manual_seed(SEED)

# Initialize the model
Model = Transformer(
    T = T,
    d_model = d_model,
    nhead = nhead,
    d_ff = d_ff,
    dk = dk,
    dv = dv,
    num_layers = num_layers,
    src_vocab_size = src_vocab_size,
    tgt_vocab_size = tgt_vocab_size,
    src_padding_idx = src_padding_idx,
    tgt_padding_idx = tgt_padding_idx,
    dropout=0.1
)

# Initialize the loss function and optimizer
# NOTE(review): the loss is built without ignore_index, so any '<Pad>' target
# positions contribute to the loss - confirm this is intended
# (tgt_padding_idx is available if they should be skipped).
loss_fn = tn.nn.CrossEntropyLoss()
optimizer = tn.optim.Adam(Model.parameters(), lr=LEARNING_RATE)

# Train the model
loss_train, Model = train(Model, X_train, Y_train, src_padding_mask, tgt_padding_mask, tgt_mask, optimizer, loss_fn, epochs=EPOCHS)

# Save the model weights
tn.save(Model.state_dict(), "Models/Transformer.pt")

# Save the per-batch training loss (stored as a one-element list)
with open('Models/TransformerLoss.pkl', 'wb') as f:
    pl.dump([loss_train] ,f)


Epoch 1/10 Loss: 0.7038267760276794
Epoch 2/10 Loss: 0.09483484647274017
Epoch 3/10 Loss: 0.028080702769756317
Epoch 4/10 Loss: 0.012959643244743347
Epoch 5/10 Loss: 0.007870923562347889
Epoch 6/10 Loss: 0.005108638553321361
Epoch 7/10 Loss: 0.0037165282789617775
Epoch 8/10 Loss: 0.0027161712914705277
Epoch 9/10 Loss: 0.0019443184860050678
Epoch 10/10 Loss: 0.0014907017659395933
