In [1]:
# Importing the libraries

import torch as th
import torch as tn
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
import torchtext

from torchtext import vocab
import pickle as pl
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load English data and French data.
# Each pickle holds a 4-tuple: (encodings, sentences, paddings, vocabulary).
# NOTE(security): pickle.load can execute arbitrary code while unpickling —
# only load files that were produced by a trusted preprocessing step.
with open('data/English_encodings.pkl', 'rb') as f:
    english_encodings, english_sentences, Paddings_en, Vocab_en = pl.load(f)
with open('data/French_encodings.pkl', 'rb') as f:
    french_encodings, french_sentences, Paddings_fr, Vocab_fr = pl.load(f)

# Get the vocabulary sizes (French is the source language, English the target)
src_vocab_size = len(Vocab_fr)
tgt_vocab_size = len(Vocab_en)


In [3]:
d_model = 128
# Define Train/test split and Masking
X = tn.tensor(french_encodings)   # source (French) token ids, one row per sentence
Y = tn.tensor(english_encodings)  # target (English) token ids, one row per sentence

# Padded sentence lengths are read from the data instead of being hard-coded.
src_len = X.shape[1]
tgt_len = Y.shape[1]

# Python slices are half-open, so consecutive splits must share their boundary
# index; otherwise the boundary sample is silently dropped from every split.
n_train = 1000
vali_start = 10000
test_start = 12000

X_train = X[:n_train]
Y_train = Y[:n_train]
X_vali = X[vali_start:test_start]
Y_vali = Y[vali_start:test_start]
X_test = X[test_start:]
Y_test = Y[test_start:]

# Create batches
batch_size = 50
n_train_batches = n_train // batch_size
n_batched_samples = n_train_batches * batch_size

# Shape: (number of batches, batch size, sequence length). `clone()` keeps the
# batch tensors independent of X / Y, as the original zero-initialised copies were.
X_train_batches = (
    X_train[:n_batched_samples]
    .to(tn.int64)
    .reshape(n_train_batches, batch_size, src_len)
    .clone()
)
Y_train_batches = (
    Y_train[:n_batched_samples]
    .to(tn.int64)
    .reshape(n_train_batches, batch_size, tgt_len)
    .clone()
)

# Key masks: 1 for real tokens, 0 for trailing padding positions.
src_key_masks = tn.ones((n_train_batches, batch_size, src_len, d_model))
tgt_key_masks = tn.ones((n_train_batches, batch_size, tgt_len, d_model))

for idx_sample in range(n_batched_samples):
    batch, sample = divmod(idx_sample, batch_size)

    pad_fr = int(Paddings_fr[idx_sample])
    pad_en = int(Paddings_en[idx_sample])

    # Use an explicit start index: a negative-slice `[-pad:]` degenerates to
    # `[0:]` when pad == 0 and would zero the mask of an unpadded sentence.
    src_key_masks[batch, sample, max(src_len - pad_fr, 0):] = 0
    tgt_key_masks[batch, sample, max(tgt_len - pad_en, 0):] = 0



In [17]:
class LSTMNet(nn.Module):
    """Embeds source token ids, runs a stacked LSTM and scores every time step.

    Args:
        dim_input: Embedding width, which is also the LSTM input size.
        dim_recurrent: Hidden size of every LSTM layer.
        num_layers: Number of stacked LSTM layers.
        dim_output: Number of output scores per time step.
        src_padding_idx: Token id whose embedding is the padding embedding.
        vocab_size: Number of rows of the embedding table. When omitted, the
            notebook-level ``src_vocab_size`` is used.
    """

    def __init__(self, dim_input, dim_recurrent, num_layers, dim_output, src_padding_idx,
                 vocab_size=None):
        super().__init__()

        # Keep the original behaviour (global vocabulary size) as the default.
        if vocab_size is None:
            vocab_size = src_vocab_size

        # The embedding feeds the LSTM directly, so its width must be the LSTM
        # input size rather than an unrelated global.
        self.Embedding_src = nn.Embedding(vocab_size, dim_input, padding_idx=src_padding_idx)

        self.lstm = nn.LSTM(input_size = dim_input,
                            hidden_size = dim_recurrent,
                            num_layers = num_layers,
                            batch_first = True)
        self.fc_o2y = nn.Linear(dim_recurrent, dim_output)

    def forward(self, input):
        """Return output scores for every time step of ``input``.

        Args:
            input: Integer token ids; the LSTM is built with ``batch_first=True``.

        Returns:
            The linear projection of the ReLU-activated top-layer LSTM outputs,
            one score vector per time step.
        """
        embedded = self.Embedding_src(input)

        # `output` holds the top layer's activations for ALL time steps; the
        # final hidden/cell states are not needed here.
        output, _ = self.lstm(embedded)
        return self.fc_o2y(F.relu(output))

class GRUNet(nn.Module):
    """Embeds source token ids, runs a stacked GRU and scores every time step.

    Args:
        dim_input: Embedding width, which is also the GRU input size.
        dim_recurrent: Hidden size of every GRU layer.
        num_layers: Number of stacked GRU layers.
        dim_output: Number of output scores per time step.
        src_padding_idx: Token id whose embedding is the padding embedding.
        vocab_size: Number of rows of the embedding table. When omitted, the
            notebook-level ``src_vocab_size`` is used.
    """

    def __init__(self, dim_input, dim_recurrent, num_layers, dim_output, src_padding_idx,
                 vocab_size=None):
        super().__init__()

        # Keep the original behaviour (global vocabulary size) as the default.
        if vocab_size is None:
            vocab_size = src_vocab_size

        # The embedding feeds the GRU directly, so its width must be the GRU
        # input size rather than an unrelated global.
        self.Embedding_src = nn.Embedding(vocab_size, dim_input, padding_idx=src_padding_idx)

        self.gru = nn.GRU(input_size = dim_input,
                          hidden_size = dim_recurrent,
                          num_layers = num_layers,
                          batch_first = True)
        self.fc_y = nn.Linear(dim_recurrent, dim_output)

    def forward(self, input):
        """Return output scores for every time step of ``input``.

        Args:
            input: Integer token ids; the GRU is built with ``batch_first=True``.

        Returns:
            The linear projection of the ReLU-activated top-layer GRU outputs,
            one score vector per time step.
        """
        embedded = self.Embedding_src(input)

        # `output` holds the top layer's activations for ALL time steps; the
        # final hidden state is not needed here.
        output, _ = self.gru(embedded)
        return self.fc_y(F.relu(output))

In [18]:
# Model hyper-parameters
d_model = 128
dim_input = d_model
dim_recurrent = d_model
# NOTE(review): 27 stacked recurrent layers is unusually deep and happens to
# equal the padded sequence length used when batching — confirm this was not
# meant to be the sequence length.
num_layers = 27
dim_output = tgt_vocab_size
# The models embed the SOURCE (French) tokens, so the padding index has to be
# looked up in the French vocabulary, not in the English one.
src_padding_idx = Vocab_fr['<Pad>']

# Both networks share the same hyper-parameters so they can be compared directly.
LSTM = LSTMNet(
    dim_input = dim_input,
    dim_recurrent = dim_recurrent,
    num_layers = num_layers,
    dim_output = dim_output,
    src_padding_idx = src_padding_idx
)

GRU = GRUNet(
    dim_input = dim_input,
    dim_recurrent = dim_recurrent,
    num_layers = num_layers,
    dim_output = dim_output,
    src_padding_idx = src_padding_idx
)

In [19]:
# Define Training Function
def train(model, optimizer, loss_fn, n_epochs, n_batches, X_train_batches, Y_train_batches, src_padding_idx):
    """Train ``model`` and print the average per-sample loss after each epoch.

    Args:
        model: Callable module mapping one input batch to per-sample predictions.
        optimizer: Optimizer that updates the parameters of ``model``.
        loss_fn: Callable ``loss_fn(prediction, target)`` applied to one sample
            at a time; the per-sample losses of a batch are summed.
        n_epochs: Number of passes over the batches.
        n_batches: Number of leading batches to use from the batch tensors.
        X_train_batches: Inputs, indexed by batch along the first dimension.
        Y_train_batches: Targets, indexed by batch along the first dimension.
        src_padding_idx: Unused; kept so existing calls keep working.

    Returns:
        None. Progress is reported on stdout only.
    """
    # Make sure training-only behaviour is active on the model.
    model.train()

    for epoch in range(n_epochs):
        loss_epoch = 0.0
        n_samples = 0
        for batch in range(n_batches):
            # Get the data
            X_batch = X_train_batches[batch]
            Y_batch = Y_train_batches[batch]

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            y_pred = model(X_batch)

            # Sum the per-sample losses. Iterating over the batch itself removes
            # the dependency on a global batch size.
            loss = sum(loss_fn(pred, target) for pred, target in zip(y_pred, Y_batch))

            # Backward pass
            loss.backward()

            # Update the parameters
            optimizer.step()

            loss_epoch += loss.item()
            n_samples += len(X_batch)

        # Normalise by the samples actually seen in this epoch, and report the
        # last batch index without relying on the inner loop variable.
        print('Epoch: %d, Batch: %d, Loss: %f' % (epoch, n_batches - 1, loss_epoch / max(n_samples, 1)))

In [20]:
# Define the loss function
# NOTE(review): padding positions of the target are included in the loss;
# consider CrossEntropyLoss(ignore_index=<target pad id>) if that is unintended.
loss_fn = nn.CrossEntropyLoss()

# Define the optimizer LSTM
optimizer_LSTM = optim.Adam(LSTM.parameters(), lr=0.001)

# Define the optimizer GRU
optimizer_GRU = optim.Adam(GRU.parameters(), lr=0.001)

# Define the number of epochs
n_epochs = 10

# Define the number of batches: taken from the batch tensor so it cannot drift
# out of sync with the batching parameters.
n_batches = X_train_batches.shape[0]

#train(LSTM, optimizer_LSTM, loss_fn, n_epochs, n_batches, X_train_batches, Y_train_batches, src_padding_idx)

In [21]:
# Train the GRU model on the prepared training batches; one loss line is printed per epoch.
train(GRU, optimizer_GRU, loss_fn, n_epochs, n_batches, X_train_batches, Y_train_batches, src_padding_idx)

Epoch: 0, Batch: 19, Loss: 5.322143
Epoch: 1, Batch: 19, Loss: 5.298316
Epoch: 2, Batch: 19, Loss: 5.273137
Epoch: 3, Batch: 19, Loss: 5.247820
Epoch: 4, Batch: 19, Loss: 5.228071
Epoch: 5, Batch: 19, Loss: 5.212155


In [12]:
# Run the LSTM on the first sample of the first training batch (one unbatched sequence).
# NOTE(review): the LSTM training call is commented out above and this cell's
# execution count predates the model-definition cells — confirm which weights
# produced the outputs shown below.
out = LSTM(X_train_batches[0][0])

In [16]:
# Display the raw output scores computed for that sample.
out

tensor([[ 1.3011e+00, -1.2285e+00, -1.4818e+00,  ..., -1.5636e+00,
          2.4369e-03,  5.2926e+00],
        [ 2.0830e+00, -1.7962e+00, -2.8730e+00,  ..., -3.2828e+00,
         -2.2303e+00,  2.9376e+00],
        [ 3.3724e+00, -9.3915e-01, -2.7735e+00,  ..., -3.4465e+00,
         -2.8060e+00, -6.5243e-02],
        ...,
        [ 6.2368e-02, -4.7177e-01,  2.2333e+00,  ...,  2.9843e+00,
          7.3382e+00, -1.8834e+00],
        [ 6.2177e-02, -4.7159e-01,  2.2331e+00,  ...,  2.9842e+00,
          7.3389e+00, -1.8836e+00],
        [ 6.2074e-02, -4.7150e-01,  2.2331e+00,  ...,  2.9842e+00,
          7.3392e+00, -1.8838e+00]], grad_fn=<AddmmBackward0>)

In [14]:
# Pick the highest-scoring output index at every position of the sequence.
out.argmax(dim=1)

tensor([206, 206,   0,   0,   5,   5,   1,   1,   1,   4,   0, 205, 205, 205,
        205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205])

In [15]:
# Target (English) token ids of the same sample, for comparison with the predicted indices.
Y_train_batches[0][0]

tensor([206,  18,  24,   0,   9,  68,   5,  40,   1,   8,   4,   0,  56,   3,
         45,   2, 204, 205, 205, 205, 205, 205, 205, 205, 205, 205, 205])