In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import nltk
import re
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, TensorDataset

from torch.utils.data import DataLoader

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata

In [3]:
import collections
from collections import Counter

In [4]:
import random

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [6]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jesan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jesan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jesan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Jesan\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [7]:
pad = 0
sos = 1
eos = 2

In [8]:
lines = open('eng-fra.txt', encoding='utf-8').read().strip().split('\n')
print(len(lines))

135842


In [9]:
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [10]:
x = ([])
y = ([])

for l in lines:
    temp1, temp2 = [], []
    s1, s2 = l.split('\t')
    temp1.append(normalizeString(s1))
    temp2.append(normalizeString(s2))
    x.append(temp1)
    y.append(temp2)

print(len(x))
print(len(y))
#for i in range(0, 5):
    #print(x[i])
    #print(y[i])

135842
135842


In [11]:
def onehotdic(data, language):
    vocab = []
    stop_words = set(stopwords.words(language))
    for i in range(0, len(data)):
        sent = data[i]
        temp = ''
        temp = ''.join(sent)
        for word in temp.split():
            if word not in stop_words:
                vocab.append(word)
    corpus = Counter(vocab)
    corpus_ = sorted(corpus,key=corpus.get,reverse=True)

    onehot_dic = {'PAD': 0, 'SOS': 1, 'EOS': 2}

    for i,w in enumerate(corpus_):
        onehot_dic[w] = i

    return onehot_dic

In [12]:
engonehot_dic = onehotdic(x, 'english')
fronehot_dic = onehotdic(y, 'french')
#print(engonehot_dic)
#print(fronehot_dic)

In [13]:
def tokenToindex(data, onehot_dict):
    final = []
    for i in range(0, len(data)):
        sent = data[i]
        temp = ''
        temp = ''.join(sent)
        final.append([onehot_dict[word] for word in temp.split()
                             if word in onehot_dict.keys()])

    return final


In [14]:
x_final = tokenToindex(x, engonehot_dic)
y_final = tokenToindex(y, fronehot_dic)
for i in range(0, 3):
    print(x_final[i])

[5]
[280, 33]
[280, 33]


In [15]:
seqLen = 200

In [16]:
def padding(data, seqLen):
    features = np.zeros((len(data), seqLen),dtype=int)
    for i, rev in enumerate(data):
        if len(rev) != 0:
            features[i, 0] = 1
            features[i, 1:len(rev)+1] = np.array(rev)
            features[i, len(rev)+1] = 2
    return features

In [17]:
x_final_numpy = np.array(x_final, dtype = object)
y_final_numpy = np.array(y_final, dtype = object)

print(x_final_numpy[-1])
x_final_pad = padding(x_final_numpy, seqLen)
y_final_pad = padding(y_final_numpy, seqLen)
print(x_final_pad[-1])

[66, 746, 8, 628, 2406, 296, 6945, 1144, 1625, 154, 12882, 1528, 2118, 5914, 1427, 3449, 2431, 945, 470, 1984, 945, 943, 171, 229, 12883, 2889]
[    1    66   746     8   628  2406   296  6945  1144  1625   154 12882
  1528  2118  5914  1427  3449  2431   945   470  1984   945   943   171
   229 12883  2889     2     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x_final_pad, y_final_pad, test_size=0.2, random_state=42)
print(len(x_train))
print(len(y_train))

108673
108673


In [19]:
x_train_tensor = torch.from_numpy(x_train).to(device)
y_train_tensor = torch.from_numpy(y_train).to(device)

x_test_tensor = torch.from_numpy(x_test).to(device)
y_test_tensor = torch.from_numpy(y_test).to(device)

In [20]:
print(x_train_tensor.device)

cuda:0


In [21]:
train_data = TensorDataset(x_train_tensor, y_train_tensor)
test_data = TensorDataset(x_test_tensor, y_test_tensor)

In [22]:
batchSize = 64

In [23]:
train_loader = DataLoader(train_data, shuffle=True, batch_size=batchSize)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batchSize)

In [24]:
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print('Sample output: \n', sample_y)

Sample input size:  torch.Size([64, 200])
Sample input: 
 tensor([[  1,   7,   0,  ...,   0,   0,   0],
        [  1, 306, 406,  ...,   0,   0,   0],
        [  1, 191,  88,  ...,   0,   0,   0],
        ...,
        [  1, 159,  19,  ...,   0,   0,   0],
        [  1,  44, 634,  ...,   0,   0,   0],
        [  1,  52,  22,  ...,   0,   0,   0]], device='cuda:0',
       dtype=torch.int32)
Sample output: 
 tensor([[   1, 2299,   74,  ...,    0,    0,    0],
        [   1,  161,    8,  ...,    0,    0,    0],
        [   1, 3300, 5735,  ...,    0,    0,    0],
        ...,
        [   1,   12,  352,  ...,    0,    0,    0],
        [   1,   64,   36,  ...,    0,    0,    0],
        [   1,   67,  176,  ...,    0,    0,    0]], device='cuda:0',
       dtype=torch.int32)


In [25]:
import torchtext
glove = torchtext.vocab.GloVe(name='6B', dim=300)

In [26]:
inputDimension = 300
hiddenSize = 256
numLayer = 2
batchFirst = True
biDirectional = False
direction = 1
encoderDropout = 0.3
decoderDropout = 0.3

In [27]:
class Encoder(torch.nn.Module):
    def __init__(self, inputDimension, hiddenSize, numLayer, batchFirst, p):
        super(Encoder, self).__init__()

        self.input_size = inputDimension
        self.hidden_size = hiddenSize
        self.num_layers = numLayer
        self.batch_first = batchFirst
        self.embedding = nn.Embedding.from_pretrained(glove.vectors)
        self.dropout = nn.Dropout(p)

        self.lstm = torch.nn.LSTM(self.input_size, self.hidden_size, self.num_layers, bidirectional=False,
                          batch_first=self.batch_first, dropout=p)

    def forward(self, input):
        #print(f'input shape: {input.shape}')
        self.batch_size = input.size(0)
        hidden = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(device)
        cell = torch.zeros(self.num_layers, self.batch_size, self.hidden_size).to(device)
        #print(f'hidden shape: {hidden.shape}')
        embd = self.dropout(self.embedding(input))
        #print(f'embedding shape: {embd.shape}')
        output, (hidden, cell) = self.lstm(embd, (hidden, cell))
        return hidden, cell


In [28]:
from torchtext.vocab import FastText

In [29]:
language = 'fr'
# the dimensionality of the embeddings is determined by the pre-trained FastText model itself.
#The dimensionality is typically set during the training of the FastText model and is not a configurable parameter
#when loading the embeddings.
ft_vectors = FastText(language=language, cache='cache.txt')

In [30]:
pretrained_embeddings = torch.tensor(ft_vectors.vectors)

  pretrained_embeddings = torch.tensor(ft_vectors.vectors)


In [31]:
class Decoder(torch.nn.Module):
    def __init__(self, inputDimension, hiddenSize, numLayer, batchFirst, p, outputSize):
        super(Decoder, self).__init__()
        self.input_size = inputDimension
        self.hidden_size = hiddenSize
        self.num_layers = numLayer
        self.batch_first = batchFirst
        self.output_size = outputSize

        self.embedding = nn.Embedding.from_pretrained(torch.tensor(ft_vectors.vectors))
        self.dropout = nn.Dropout(p)

        self.lstm = torch.nn.LSTM(self.input_size, self.hidden_size, self.num_layers, bidirectional=False,
                          batch_first=self.batch_first, dropout=p)

        self.fc = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, cell):
        # shape of input: (batchSize) but we want it as (batchsize, seqLen) -> (batchsize, 1)
        input = input.unsqueeze(1)
        #print(f'shape of input: {input.shape}')
        embd = self.dropout(self.embedding(input))
        #print(f'embedding shape: {embd.shape}')
        output, (hidden, cell) = self.lstm(embd, (hidden, cell))
        # shape of output -> (batchSize, seqLen = 1, hiddenSize)
        prediction = self.fc(output)
        #print(f'prediction shape: {prediction.shape}')
        prediction = prediction.squeeze(1)

        return prediction, hidden, cell

In [32]:
inputDimensionFr = pretrained_embeddings.shape[1] # embedding dimension for french words
outputSize = len(fronehot_dic)

In [33]:
class Seq2Seq(torch.nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
    def forward(self, sourceLang, targetLang, seqLen, outputSize):
        #print(sourceLang.shape)
        batch_size = sourceLang.shape[0]
        target_len = seqLen

        outputs = torch.zeros(target_len, batch_size, outputSize).to(device)

        hidden, cell = self.encoder(sourceLang)

        x = targetLang[:, 0]

        for t in range(1, 100):
            output, hidden, cell = self.decoder(x, hidden, cell)
            #output = F.softmax(output, dim=1)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = targetLang[:, t] if random.random() < 0.5 else best_guess
            #print(f'shape of best guess: {best_guess.shape}')
        return outputs


In [34]:
test = torch.randint(2, 10, size = (5, 7))
print(test)
first = test[:, 0]
print(first)
print(first.shape)

first = first.unsqueeze(1)

print(first)
print(first.shape)

tensor([[8, 9, 4, 4, 6, 2, 9],
        [4, 6, 6, 9, 6, 6, 3],
        [6, 7, 7, 6, 6, 2, 9],
        [6, 9, 6, 4, 9, 5, 7],
        [9, 4, 4, 4, 5, 6, 9]])
tensor([8, 4, 6, 6, 9])
torch.Size([5])
tensor([[8],
        [4],
        [6],
        [6],
        [9]])
torch.Size([5, 1])


In [35]:
encoderNet = Encoder(inputDimension, hiddenSize, numLayer, batchFirst, encoderDropout).to(device)
decoderNet = Decoder(inputDimensionFr, hiddenSize, numLayer, batchFirst, decoderDropout, outputSize).to(device)
modelNet = Seq2Seq(encoderNet, decoderNet).to(device)

  self.embedding = nn.Embedding.from_pretrained(torch.tensor(ft_vectors.vectors))


In [36]:
total_params = sum(p.numel() for p in modelNet.parameters())
total_params /= 1000000
print(f"Total number of parameters: {total_params} million")

Total number of parameters: 473.378042 million


In [37]:
learningRate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(modelNet.parameters(), lr=learningRate)

In [38]:
import os

In [40]:
clip = 1
epochs = 2

valid_loss_min = np.Inf
epoch_tr_loss, epoch_vl_loss = [], []
epoch_tr_acc, epoch_vl_acc = [], []

checkpoint_dir = 'checkpoints/'
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

checkpoint_path = os.path.join(checkpoint_dir, 'model_checkpoint.pth')

for epoch in range(epochs):
    print(f'Number of epoch: {epoch}')
    train_loss = 0.0
    train_correct = 0
    train_total = 0
    modelNet.train()

    for source, target in train_loader:
        optimizer.zero_grad()

        outputs = modelNet(source, target, seqLen, outputSize).to(device)
        # outputs shape -> seqLen, batchsize, output size but want it 2 dimensional
        #print(outputs.shape)
        #print(target.shape)

        outputs = outputs[1:].reshape(-1, outputs.shape[2])
        target = target[:, 1:].reshape(-1)

        #print(outputs.shape)
        #print(target.shape)

        target = target.long()

        loss = criterion(outputs, target)
        loss.backward()
        nn.utils.clip_grad_norm_(modelNet.parameters(), clip)
        optimizer.step()

        train_loss += loss.item()
        print(loss.item())

    epoch_tr_loss.append(train_loss / len(train_loader))
    print(f'Training Loss: {epoch_tr_loss[-1]:.4f}')
    '''
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': modelNet.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': train_loss,
        # Add other relevant information you want to save
    }
    torch.save(checkpoint, checkpoint_path)
    print(f"Saved checkpoint at epoch {epoch}")

    # Optionally, load a checkpoint later for resuming training or inference
    # Example of loading a checkpoint:
    # checkpoint = torch.load(checkpoint_path)
    # model.load_state_dict(checkpoint['model_state_dict'])
    # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # start_epoch = checkpoint['epoch'] + 1
    # train_loss = checkpoint['loss']
    '''

Number of epoch: 0
6.00710391998291
5.681663513183594
5.45096492767334
5.3096232414245605


KeyboardInterrupt: 

In [41]:
print(outputs.device)

cuda:0
