In [1]:
# Importing the libraries

import torch as th
import torch as tn
from torchtext import vocab
import pickle as pl
from Models.Transformer import Transformer
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the pre-processed sentence encodings and vocabularies.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('data/English_encodings.pkl', 'rb') as f:
    (english_encodings,
     english_sentences,
     Paddings_en,
     Vocab_en) = pl.load(f)

with open('data/French_encodings.pkl', 'rb') as f:
    (french_encodings,
     french_sentences,
     Paddings_fr,
     Vocab_fr) = pl.load(f)

# Vocabulary sizes and padding-token ids (French is the source, English the target)
src_vocab_size = len(Vocab_fr)
tgt_vocab_size = len(Vocab_en)
src_padding_idx = Vocab_fr['<Pad>']
tgt_padding_idx = Vocab_en['<Pad>']


# Load the train, validation and test splits
with open('data/Train_data.pkl', 'rb') as f:
    (X_train_batches,
     Y_train_batches,
     src_padding_mask,
     tgt_padding_mask) = pl.load(f)

with open('data/Validation_data.pkl', 'rb') as f:
    (X_vali_batches,
     Y_vali_batches,
     src_padding_mask_vali) = pl.load(f)

with open('data/Test_data.pkl', 'rb') as f:
    (X_test_batches,
     Y_test_batches,
     src_padding_mask_test) = pl.load(f)

In [13]:
# Function to train the model
def train(model, src_data, tgt_data, src_padding_mask, tgt_padding_mask, optimizer, loss_fn, epochs, Model_Params=None):
    """Train ``model`` one sample at a time and return the mean loss of each epoch.

    Parameters
    ----------
    model : torch.nn.Module
        Called as ``model(src, tgt, src_padding_mask=..., tgt_padding_mask=...)``
        with a single sample per call.
    src_data, tgt_data : torch.Tensor
        Batched source / target data. ``src_data`` must be 3-D; its first two
        dimensions are read as ``(n_batches, batch_size)``.
    src_padding_mask, tgt_padding_mask : torch.Tensor
        Masks indexed like the data: first by batch, then by sample.
    optimizer : torch.optim.Optimizer
        Optimizer already bound to ``model``'s parameters.
    loss_fn : callable
        ``loss_fn(model_output, target_sample)`` returning a scalar tensor.
    epochs : int
        Number of passes over all batches.
    Model_Params : optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    list of float
        One entry per epoch: the summed batch losses divided by
        ``n_batches * batch_size``.
    """
    # Use CUDA when present, otherwise the CPU (MPS is deliberately not used).
    device = tn.device("cuda" if tn.cuda.is_available() else "cpu")

    # Move the model to the device and make sure dropout etc. are in training mode
    model.to(device)
    model.train()

    # Move the data to the device
    src_data = src_data.to(device)
    tgt_data = tgt_data.to(device)
    src_padding_mask = src_padding_mask.to(device)
    tgt_padding_mask = tgt_padding_mask.to(device)

    n_batches, batch_size, _ = src_data.shape

    loss_train = []

    for epoch in range(epochs):
        epoch_loss = 0

        for src_batch, tgt_batch, src_pad, tgt_pad in zip(src_data, tgt_data, src_padding_mask, tgt_padding_mask):

            # Zero the gradients
            optimizer.zero_grad()

            # The model takes one sentence per call, so the batch loss is
            # accumulated sample by sample before a single backward pass.
            loss = 0
            for i in range(batch_size):
                out = model(
                    src_batch[i],
                    tgt_batch[i],
                    src_padding_mask=src_pad[i],
                    tgt_padding_mask=tgt_pad[i],
                )
                loss += loss_fn(out, tgt_batch[i])

            # Backpropagate the summed loss and update the weights
            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()

        # Average over every sample seen in this epoch
        loss_train.append(epoch_loss / (n_batches * batch_size))

    return loss_train


In [14]:
# Initialize the model with set hyperparameters
T = 27  # matches the last dimension of X_train_batches (presumably the padded sentence length)
d_model = 128 # Dimension of the model (Embedding size)
d_ff = 256 # Dimension of the feedforward network model in transformer
nhead = 3 # Number of heads in the multiheadattention models
dk = 64
dv = 64
num_layers = 3



# Initialize the model
# dk / dv are passed through from the variables above so that changing them
# there actually takes effect (they were previously repeated as literals).
Model = Transformer(
    T = T,
    d_model = d_model,
    nhead = nhead,
    d_ff = d_ff,
    dk = dk,
    dv = dv,
    num_layers = num_layers,
    src_vocab_size = src_vocab_size,
    tgt_vocab_size = tgt_vocab_size,
    src_padding_idx = src_padding_idx,
    tgt_padding_idx = tgt_padding_idx,
    dropout=0.1
)
    

In [15]:
# Objective and optimiser used for training
loss_fn = tn.nn.CrossEntropyLoss()
optimizer = tn.optim.Adam(params=Model.parameters(), lr=1e-3)

# Run the training loop for 10 epochs and keep the per-epoch loss history
loss_train = train(
    model=Model,
    src_data=X_train_batches,
    tgt_data=Y_train_batches,
    src_padding_mask=src_padding_mask,
    tgt_padding_mask=tgt_padding_mask,
    optimizer=optimizer,
    loss_fn=loss_fn,
    epochs=10,
)



UnboundLocalError: local variable 'loss_epoch' referenced before assignment

In [12]:
# Inspect the dimensions of the training source tensor; train() unpacks the
# first two as (n_batches, batch_size).
X_train_batches.shape

torch.Size([200, 50, 27])

In [35]:
# NOTE(review): `trainmodel` is not defined in any visible cell of this notebook
# (the training function defined here is `train`). This cell presumably relies on
# a definition from a deleted or out-of-view cell -- confirm before re-running.
trainmodel(100)

# Save the model's parameters (state_dict) to "Transformer.pt".
tn.save(Model.state_dict(), "Transformer.pt")

Epoch: 0 Loss: 0.6889562835693359
Epoch: 1 Loss: 0.6403594646453857
Epoch: 2 Loss: 0.6005053577423096
Epoch: 3 Loss: 0.5656527500152588
Epoch: 4 Loss: 0.5337777290344238
Epoch: 5 Loss: 0.5060778198242187
Epoch: 6 Loss: 0.48012523460388185
Epoch: 7 Loss: 0.45571697425842284
Epoch: 8 Loss: 0.43140914344787595
Epoch: 9 Loss: 0.40972098922729494
Epoch: 10 Loss: 0.3919607830047607
Epoch: 11 Loss: 0.37960798835754395
Epoch: 12 Loss: 0.3641638145446777
Epoch: 13 Loss: 0.3367669448852539
Epoch: 14 Loss: 0.3162376413345337
Epoch: 15 Loss: 0.3018227119445801
Epoch: 16 Loss: 0.2860628709793091
Epoch: 17 Loss: 0.26723638820648193
Epoch: 18 Loss: 0.2516777753829956
Epoch: 19 Loss: 0.24089940643310548
Epoch: 20 Loss: 0.2339546661376953
Epoch: 21 Loss: 0.2102954912185669
Epoch: 22 Loss: 0.18968856048583985
Epoch: 23 Loss: 0.1738504629135132
Epoch: 24 Loss: 0.16169569826126098
Epoch: 25 Loss: 0.15234423685073853
Epoch: 26 Loss: 0.1456462516784668
Epoch: 27 Loss: 0.13956336879730225
Epoch: 28 Loss: 0.1

In [36]:
# Causal mask: 0 on and below the diagonal, -inf strictly above it
tgt_mask = th.triu(th.full((T, T), float('-inf')), diagonal=1)

# Forward pass on the first training pair, then take the highest-scoring token
# at every position.
# NOTE(review): X_train, Y_train, src_key_masks and tgt_key_masks are not defined
# in any visible cell -- confirm where they come from.
out = Model(
    X_train[0],
    Y_train[0],
    src_padding_mask=src_key_masks[0][0],
    tgt_padding_mask=tgt_key_masks[0][0],
    tgt_mask=tgt_mask,
)
out.argmax(dim=1)

tensor([206,  18,  24,   0,   9,  68,   5,  40,   1,   8,   4,   0,  56,   3,
         45,   2, 204, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205])

In [37]:
# Show the target token ids of the first training sample, for comparison with
# the argmax of the model output in the previous cell.
# NOTE(review): `Y_train` is not defined in any visible cell -- confirm its origin.
Y_train[0]

tensor([206,  18,  24,   0,   9,  68,   5,  40,   1,   8,   4,   0,  56,   3,
         45,   2, 204, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205])

In [38]:
# Decoder input buffer: every position starts as <Pad>, position 0 is <Start>
Prediction = th.full((27,), Vocab_en['<Pad>'], dtype=th.int32)
Prediction[0] = Vocab_en['<Start>']

# Target padding mask: only the row of the <Start> position is switched on
Prediction_mask = th.zeros((27, 128))
Prediction_mask[0] = 1


In [39]:
# Build the source padding mask for the first test sentence: rows of ones for
# real tokens, rows of zeros for the trailing padding positions.
# The padding count is taken from X_test[0] itself (same approach as
# Predict_loss) so the mask always matches the sentence being translated.
# Assumes pad tokens only occur as trailing padding.
n_src_pad = int((X_test[0] == src_padding_idx).sum())

src_mask = tn.ones((27,128))
# Guard the zero case: `src_mask[-0:]` would select (and zero) every row.
if n_src_pad > 0:
    src_mask[-n_src_pad:] = 0
src_mask

tensor([[1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        [1., 1., 1.,  ..., 1., 1., 1.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [40]:
# Start from an all-<Pad> target whose first token is <Start>
Prediction = th.full((27,), Vocab_en['<Pad>'], dtype=th.int32)
Prediction[0] = Vocab_en['<Start>']

# Only the <Start> position is unmasked at the beginning
Prediction_mask = th.zeros((27, 128))
Prediction_mask[0] = 1

# Greedy decoding: fill one position per forward pass with the arg-max token,
# then unmask that position for the next pass
for i in range(1, 27):
    out = Model(
        X_test[0],
        Prediction,
        tgt_mask=tgt_mask,
        src_padding_mask=src_mask,
        tgt_padding_mask=Prediction_mask,
    )
    Prediction[i] = out.argmax(dim=1)[i]
    Prediction_mask[i] = 1

Prediction

tensor([206,  32,  12,  12,  14,   0,   6,  14,   1,   7,   0,  13,  12,   0,
          6,  84,   2, 204, 205, 205, 205, 205, 205, 205, 205, 205, 205],
       dtype=torch.int32)

In [80]:
def _greedy_decode(X):
    """Greedy-decode the source sentence ``X`` with the global ``Model``.

    Returns ``(Prediction, Prediction_vectors)``: the predicted target token ids
    (int32, length ``T``) and the model output row kept for every position
    (shape ``(T, len(Vocab_en))``). Row 0 of ``Prediction_vectors`` is a one-hot
    on ``<Start>`` because that position is given, not predicted.
    """
    start_idx = Vocab_en['<Start>']

    # Source padding mask derived from X itself. Assumes pad tokens only occur
    # as trailing padding. The zero case is guarded because `mask[-0:]` would
    # select (and zero) every row.
    src_mask = tn.ones((T, d_model))
    n_src_pad = int((X == Vocab_fr['<Pad>']).sum())
    if n_src_pad > 0:
        src_mask[-n_src_pad:] = 0

    # Causal mask: 0 on/below the diagonal, -inf strictly above it
    causal_mask = th.full((T, T), float('-inf')).triu(diagonal=1)

    # Target buffer: all <Pad> except the leading <Start>; only that row is unmasked
    Prediction = th.full((T,), Vocab_en['<Pad>'], dtype=th.int32)
    Prediction[0] = start_idx
    Prediction_mask = th.zeros((T, d_model))
    Prediction_mask[0] = 1

    Prediction_vectors = th.zeros((T, len(Vocab_en)))
    Prediction_vectors[0][start_idx] = 1

    # Inference must not use dropout; switch to eval mode and restore the
    # previous mode afterwards so a later training run is unaffected.
    was_training = Model.training
    Model.eval()
    try:
        with th.no_grad():
            for i in range(1, T):
                out = Model(
                    X,
                    Prediction,
                    tgt_mask=causal_mask,
                    src_padding_mask=src_mask,
                    tgt_padding_mask=Prediction_mask,
                )
                Prediction[i] = out.argmax(1)[i]
                Prediction_vectors[i] = out[i]
                Prediction_mask[i] = 1
    finally:
        Model.train(was_training)

    return Prediction, Prediction_vectors


def _tokens_to_sentence(token_ids):
    """Turn a 1-D tensor of English token ids into a string, each word preceded by a space."""
    return "".join(" " + word for word in Vocab_en.lookup_tokens(token_ids.tolist()))


def Predict(X,print_sentence = True):
    """Translate the source sentence ``X`` by greedy decoding.

    Parameters
    ----------
    X : torch.Tensor
        Source token ids of one sentence, padded to length ``T``.
    print_sentence : bool
        When true, print the predicted sentence as text.

    Returns
    -------
    torch.Tensor
        Predicted target token ids (int32, length ``T``).
    """
    Prediction, _ = _greedy_decode(X)

    if print_sentence:
        print(_tokens_to_sentence(Prediction))

    return Prediction

def Predict_loss(X,Y,print_sentence = True):
    """Greedy-decode ``X`` and score the result against the reference ``Y``.

    Parameters
    ----------
    X : torch.Tensor
        Source token ids of one sentence, padded to length ``T``.
    Y : torch.Tensor
        Reference target token ids, in a dtype accepted by the global ``loss_fn``.
    print_sentence : bool
        When true, print the predicted and the reference sentence as text.

    Returns
    -------
    torch.Tensor
        Scalar ``loss_fn(Prediction_vectors, Y)``. It is computed without
        gradient tracking, so it is meant for evaluation only. The loss value
        is always printed.
    """
    Prediction, Prediction_vectors = _greedy_decode(X)

    if print_sentence:
        print("Predicted Senctence:")
        print(_tokens_to_sentence(Prediction))
        print("")
        print("True Senctence:")
        print(_tokens_to_sentence(Y))

    loss = loss_fn(Prediction_vectors,Y)
    print(f"Loss: {loss.item()}")

    return loss

In [82]:
# Decode the test sample at index 2 and compute its loss against the matching
# reference sentence.
# NOTE(review): `X_test` / `Y_test` are not defined in any visible cell (only
# X_test_batches / Y_test_batches are loaded) -- confirm where they come from.
l = Predict_loss(X_test[2],Y_test[2])

Predicted Senctence:
 <Start> california is sometimes warm during fall , and it is usually busy in july . <End> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad>

True Senctence:
 <Start> california is sometimes hot during june , but it is usually snowy in fall . <End> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad> <Pad>
