In [1]:
# Importing the libraries
import torch as tn
import torch as th
from torchtext import vocab
import pickle as pl
from Models.Transformer import Transformer
from tqdm import tqdm
import numpy as np
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load Data
with open('data/English_encodings.pkl', 'rb') as f:
    english_encodings,english_sentences,Paddings_en,Vocab_en = pl.load(f)
with open('data/French_encodings.pkl', 'rb') as f:
    french_encodings,french_sentences,Paddings_fr,Vocab_fr = pl.load(f)

# Get the vocabulary size
src_vocab_size = Vocab_fr.__len__()
tgt_vocab_size = Vocab_en.__len__()
src_padding_idx = Vocab_fr.__getitem__('<Pad>')
tgt_padding_idx = Vocab_en.__getitem__('<Pad>')


# Load Train,Vaildation and Test data
with open('data/Train_data.pkl', 'rb') as f:
    X_train,Y_train,src_padding_mask,tgt_padding_mask = pl.load(f)

with open('data/Validation_data.pkl', 'rb') as f:
    X_vali,Y_vali,src_padding_mask_vali = pl.load(f)

with open('data/Test_data.pkl', 'rb') as f:
    X_test,Y_test,src_padding_mask_test = pl.load(f)

tgt_mask = th.triu(th.full((27, 27), float('-inf')), diagonal=1)

In [3]:

# Function to train the model
def train(model, src_data, tgt_data, src_padding_mask, tgt_padding_mask, tgt_mask, optimizer, loss_fn, epochs):
    
    # Check if CUDA/mps is available
    if tn.cuda.is_available():
        device = tn.device("cuda")
    elif tn.backends.mps.is_available():
        device = "cpu"
    else:
        device = "cpu"
    
    # Move the model to the device
    model.to(device)

    # Move the data to the device
    src_data = src_data.to(device)
    tgt_data = tgt_data.to(device)
    src_padding_mask = src_padding_mask.to(device)
    tgt_padding_mask = tgt_padding_mask.to(device)
    tgt_mask = tgt_mask.to(device)



    
    # Initialize the loss
    loss_train = []
    
    n_batches,batch_size,_ = src_data.shape

   
    # Train the model
    for epoch in range(epochs):
        # Initialize the loss
        epoch_loss = 0
        # Train the model
        
        for src_batch,tgt_batch,src_pad,tgt_pad in zip(src_data,tgt_data,src_padding_mask,tgt_padding_mask):
            
            # Zero the gradients
            optimizer.zero_grad()

            # Batch loss
            loss = 0

            # Go trough each batch
            for i in range(batch_size):
                out = model(src_batch[i],tgt_batch[i],
                        tgt_mask = tgt_mask,
                        src_padding_mask = src_pad[i],
                        tgt_padding_mask = tgt_pad[i]
                        )

                loss += loss_fn(out,tgt_batch[i])

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()

            # Add the loss
            epoch_loss += loss.item()
            loss_train.append(loss.item())
        
        print(f'Epoch {epoch+1}/{epochs} Loss: {epoch_loss/(n_batches*batch_size)}')

    
    return loss_train, model


In [4]:
# Intialize the model with set hyperparameters
T = 27
d_model = 512 # Dimension of the model (Embedding size)
d_ff = 2048 # Dimension of the feedforward network model in transformer
nhead = 8 # Number of heads in the multiheadattention models
dk = d_model//nhead
dv = d_model//nhead
num_layers = 6


# Initialize the model
Model = Transformer(
    T = T,
    d_model = d_model,
    nhead = nhead,
    d_ff = d_ff,
    dk = dk,
    dv = dv,
    num_layers = num_layers,
    src_vocab_size = src_vocab_size,
    tgt_vocab_size = tgt_vocab_size,
    src_padding_idx = src_padding_idx,
    tgt_padding_idx = tgt_padding_idx,
    dropout=0.1
)

loss_fn = tn.nn.CrossEntropyLoss()
optimizer = tn.optim.Adam(Model.parameters(), lr=0.0001)

# Train the model
loss_train, Model = train(Model, X_train, Y_train, src_padding_mask, tgt_padding_mask, tgt_mask, optimizer, loss_fn, epochs=10)


tn.save(Model.state_dict(), "Models/Transformer.pt")

with open('Models/TransformerLoss.pkl', 'wb') as f:
    pl.dump([loss_train] ,f)


Epoch 1/10 Loss: 0.7038267760276794
Epoch 2/10 Loss: 0.09483484647274017
Epoch 3/10 Loss: 0.028080702769756317
Epoch 4/10 Loss: 0.012959643244743347
Epoch 5/10 Loss: 0.007870923562347889
Epoch 6/10 Loss: 0.005108638553321361
Epoch 7/10 Loss: 0.0037165282789617775
Epoch 8/10 Loss: 0.0027161712914705277
Epoch 9/10 Loss: 0.0019443184860050678
Epoch 10/10 Loss: 0.0014907017659395933


In [5]:
def Predict(X,print_sentence = True):
    Prediction = th.zeros(27,dtype=th.int32)
    Prediction[:] = Vocab_en.__getitem__('<Pad>')
    Prediction_mask = th.zeros((27,128))

    Prediction[0] = Vocab_en.__getitem__('<Start>')
    Prediction_mask[0] = 1

    src_mask = tn.ones((27,128))
    src_mask[-(X == Vocab_fr.__getitem__('<Pad>')).sum():] = 0

    for i in range(1,27):
        out = Model(X_test[0],Prediction,src_padding_mask = src_mask,tgt_mask=tgt_mask,tgt_padding_mask = Prediction_mask)
        Prediction[i] = out.argmax(1)[i]
        Prediction_mask[i] = 1

    
    if print_sentence:
        Senctence = ""
        for word in Vocab_en.lookup_tokens(Prediction.tolist()):
            Senctence += " " + word

        print(Senctence)

    
    return Prediction,src_mask

def Predict_loss(X,Y,print_sentence = True):

    Prediction = th.zeros(27,dtype=th.int32)
    Prediction[:] = Vocab_en.__getitem__('<Pad>')
    
    Prediction_mask = th.zeros((27,d_model))

    Prediction_vectors = th.zeros((27,tgt_vocab_size))
    Prediction_vectors[0][204] = 1


    Prediction[0] = Vocab_en.__getitem__('<Start>')
    Prediction_mask[0] = 1

    src_mask = tn.ones((27,d_model))
    src_mask[-(X == Vocab_fr.__getitem__('<Pad>')).sum():] = 0

    for i in range(1,27):
        out = Model(X,Prediction,src_padding_mask = src_mask,tgt_mask=tgt_mask,tgt_padding_mask = Prediction_mask)
        Prediction[i] = out.argmax(1)[i]
        Prediction_vectors[i] = out[i]
        Prediction_mask[i] = 1

    
    if print_sentence:
        Senctence_pred = ""
        for word in Vocab_en.lookup_tokens(Prediction.tolist()):
            Senctence_pred += " " + word

        print("Predicted Senctence:")
        print(Senctence_pred)
        print("")
        Senctence_true = ""
        for word in Vocab_en.lookup_tokens(Y.tolist()):
            Senctence_true += " " + word
        print("True Senctence:")
        print(Senctence_true)

    loss = loss_fn(Prediction_vectors,Y)
    print(f"Loss: {loss.item()}")

    return loss

In [6]:
Model.to("cpu")
for i in range(5):
    Predict_loss(X_test[i],Y_test[i])

Predicted Senctence:
 <Start> his favorite favorite is is the banana january but your least favorite is the lime <Pad> <End> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad>

True Senctence:
 <Start> her least favorite fruit is the banana , but your least favorite is the lime . <End> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad>
Loss: 0.6782876253128052
Predicted Senctence:
 <Start> the united states is dry may may may , it it is mild warm . . <End> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad>

True Senctence:
 <Start> the united states is nice during may , and it is quiet in august . <End> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad>
Loss: 2.227289915084839
Predicted Senctence:
 <Start> california is sometimes warm during june , but it is cold snowy autumn fall . <End> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad>

True Senctence:
 <Start> california is sometimes hot during june , but it is usually snowy in fall . <End> <Pad> <Pad> <P