<a href="https://colab.research.google.com/github/Jerryson520/NLP-Projects/blob/main/Seq2Seq_Model_Machine_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

# Select the device used for all tensors and models below: CUDA when available, otherwise CPU.
# Change to MPS logic if on mac
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Put the data in your Google Drive
# You can get the data here: https://www.kaggle.com/competitions/titanic/data
# NOTE(review): the link above points to the Kaggle Titanic competition, while this
# notebook reads an eng-fra.txt translation file from Drive - confirm the data source.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


For this notebook, use the hints to fill in the missing code.

In [None]:
# Reserved vocabulary ids for the start-of-sentence and end-of-sentence markers.
# Lang.index2word pre-registers these two ids as "SOS" and "EOS".
SOS_token = 0
EOS_token = 1

class Lang:
    """Vocabulary of a single language.

    Attributes:
        name: label given to the language.
        word2index: maps each seen word to its integer id.
        word2count: maps each seen word to how many times it was added.
        index2word: inverse of word2index; ids 0 and 1 are pre-filled with
            "SOS" and "EOS".
        n_words: next free id, i.e. the vocabulary size including SOS/EOS.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # The two special markers occupy the first two ids.
        self.index2word = dict(enumerate(("SOS", "EOS")))
        self.n_words = len(self.index2word)

    def addSentence(self, sentence):
        """Register every token of `sentence`, splitting on single spaces."""
        tokens = sentence.split(' ')
        for token in tokens:
            self.addWord(token)

    def addWord(self, word):
        """Count `word`, assigning it the next free id on first sight."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_id = self.n_words
        self.word2index[word] = new_id
        self.word2count[word] = 1
        self.index2word[new_id] = word
        self.n_words = new_id + 1

In [None]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip combining marks (accents) from `s`.

    The string is NFD-decomposed so that accents become separate code points,
    then every character in Unicode category 'Mn' is dropped. Characters
    without a decomposition are left untouched.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters

def normalizeString(s):
    """Lowercase, trim, and reduce `s` to letters and sentence punctuation.

    Steps:
      1. lowercase, strip surrounding whitespace and remove accents;
      2. insert a space after each of ``. ! ?`` so the next word is separated;
      3. replace every run of characters other than letters and ``. ! ?``
         with a single space;
      4. strip again, because steps 2 and 3 can leave a leading or trailing
         space. Without this, splitting on ' ' downstream yields an
         empty-string token that ends up in the vocabulary.
    """
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r"\1 ", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s.strip()

In [None]:
# s = "Hello!How are you?I am fine."
# normalizeString(s)

In [None]:
! ls drive/MyDrive/IEOR4573_Deep_Learning_for_NLP/HW7

eng-fra.txt  NLP_HW_7_Seq2Seq_MT.ipynb


In [None]:
def readLangs(lang1, lang2, reverse=False,
              data_dir="/content/drive/MyDrive/IEOR4573_Deep_Learning_for_NLP/HW7"):
    """Read the tab-separated sentence file and create empty vocabularies.

    Args:
        lang1: name of the language in the first column (e.g. 'eng').
        lang2: name of the language in the second column (e.g. 'fra').
        reverse: when True, each pair is flipped and the input/output
            languages are swapped (translate lang2 -> lang1).
        data_dir: directory holding ``<lang1>-<lang2>.txt``; defaults to the
            Drive folder used by this notebook.

    Returns:
        (input_lang, output_lang, pairs) where the two Lang objects are still
        empty and `pairs` is a list of normalized [input, output] sentences.
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager closes the handle
    path = "%s/%s-%s.txt" % (data_dir, lang1, lang2)
    with open(path, encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into its tab-separated fields and normalize each one
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
# Length limit, in whitespace-separated tokens: filterPair keeps a sentence only
# when it has strictly fewer than MAX_LENGTH tokens.
MAX_LENGTH = 10

# We'll just read in data with these prefixes so we have easier data to deal with
# These are "target" prefixes
# They are compared against normalized text, where an apostrophe has already been
# replaced by a space (hence "i m " for "i'm").
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

# Only use pairs where the english data (pair[1]) has the prefix above
# Also, only consider data where pair[0] and pair[1] have length less than MAX_LENGTH
# Split on space first here also as MAX_LENGTH means the number of tokens (words), not number of chars
def filterPair(p):
    """Return True when the pair `p` should be kept for training.

    A pair is kept when both p[0] and p[1] have fewer than MAX_LENGTH
    whitespace-separated tokens and p[1] starts with one of eng_prefixes.
    The checks run in that order and stop at the first failure.
    """
    if len(p[0].split()) >= MAX_LENGTH:
        return False
    if len(p[1].split()) >= MAX_LENGTH:
        return False
    return p[1].startswith(eng_prefixes)

def filterPairs(pairs):
    """Return a new list with only the pairs accepted by filterPair, in order."""
    kept = []
    for candidate in pairs:
        if filterPair(candidate):
            kept.append(candidate)
    return kept

In [None]:
# Scratch example showing the [source, target] layout of a pair list.
# NOTE(review): this binds the global name `pairs`; it is rebound by the
# prepareData(...) call in a later cell.
pairs = [['dsjdjsjdf', 'i am get hi my'], ['dsdsjiigdf', 'she is a cat']]
pairs[:1]

[['dsjdjsjdf', 'i am get hi my']]

In [None]:
def prepareData(lang1, lang2, reverse=False):
    """Load, filter and index the sentence pairs.

    Reads the raw pairs with readLangs, keeps those accepted by filterPairs,
    and feeds the surviving sentences into the input and output vocabularies.
    Progress and the resulting vocabulary sizes are printed along the way.

    Returns:
        (input_lang, output_lang, pairs) with populated vocabularies and the
        filtered pair list.
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Counting words...")
    for sentence_pair in kept_pairs:
        # Element 0 belongs to the input language, element 1 to the output one
        input_lang.addSentence(sentence_pair[0])
        output_lang.addSentence(sentence_pair[1])

    print("Counted words:")
    for vocabulary in (input_lang, output_lang):
        print(vocabulary.name, vocabulary.n_words)

    return input_lang, output_lang, kept_pairs


# Build the vocabularies and the filtered pair list. reverse=True flips every pair,
# so pair[0] is the French sentence and pair[1] the English one.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True) # Our goal is to translate French to English
# Print one random pair as a quick sanity check
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 11449 sentence pairs
Counting words...
Counted words:
fra 5714
eng 3803
['elles sont parties. ', 'they re gone. ']


In [None]:
class EncoderRNN(nn.Module):
    """Single-layer LSTM encoder that consumes one token id per call.

    Args:
        input_size: size of the source vocabulary (number of embedding rows).
        hidden_size: width of both the embedding and the LSTM state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        # The embedding width equals hidden_size, so it feeds the LSTM directly
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, batch_first=True)

    def forward(self, input, hidden_cell):
        """Encode one token.

        Args:
            input: tensor holding a single token id.
            hidden_cell: (hidden_state, cell_state) tuple from the previous step.

        Returns:
            (output, (hidden_state, cell_state)) as produced by the LSTM.
        """
        # Batch size 1 and sequence length 1: reshape to (1, 1, hidden_size)
        step_input = self.embedding(input).view(1, 1, -1)
        return self.lstm(step_input, hidden_cell)

    def initHidden(self):
        """Return a zero tensor of shape (1, 1, hidden_size) on `device`.

        nn.LSTM expects its initial states as (num_layers * num_directions,
        batch, hidden_size); with one layer, one direction and batch size 1
        that is (1, 1, hidden_size). The same helper is used for the cell state.
        """
        return torch.zeros((1, 1, self.hidden_size), device=device)

In [None]:
class DecoderRNN(nn.Module):
    """Single-layer LSTM decoder that emits vocabulary logits for one step.

    Args:
        embed_size: width of the target-token embedding (LSTM input size).
        hidden_size: width of the LSTM hidden and cell states.
        output_size: size of the target vocabulary.
    """

    def __init__(self, embed_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True)
        # Projects the LSTM output to one score per target word. No LogSoftmax
        # is applied, so the result is meant for a logits-based loss such as
        # nn.CrossEntropyLoss.
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden_cell):
        """Decode one step.

        Args:
            input: tensor holding a single token id (the previous target word).
            hidden_cell: (hidden_state, cell_state) tuple from the previous step.

        Returns:
            (logits, (hidden_state, cell_state)) where logits has shape
            (1, 1, output_size).
        """
        embedded = self.embedding(input).view(1, 1, -1)
        activated = F.relu(embedded)
        lstm_out, hidden_cell = self.lstm(activated, hidden_cell)
        logits = self.out(lstm_out)
        return logits, hidden_cell

    def initHidden(self):
        """Return a zero tensor of shape (1, 1, hidden_size) on `device`.

        Usable as either the initial hidden state or the initial cell state.
        """
        return torch.zeros((1, 1, self.hidden_size), device=device)

In [None]:
# Split a sentence by ' ' and return a list of the tokens (int ids) for each word
# Use word2index
def indexesFromSentence(lang, sentence):
    """Map each space-separated word of `sentence` to its id in lang.word2index.

    Raises KeyError for a word that is not in the vocabulary.
    """
    token_ids = []
    for token in sentence.split(' '):
        token_ids.append(lang.word2index[token])
    return token_ids

# Call the above on a sentence
# After calling, add the EOS_token (int id) to the gotten list
# Return a tensor, but reshape it so it's dimensions (-1, 1)
def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a column tensor of token ids terminated by EOS.

    Returns:
        A torch.long tensor on `device` with shape (number_of_tokens + 1, 1);
        the last entry is EOS_token.
    """
    token_ids = indexesFromSentence(lang, sentence) + [EOS_token]
    flat = torch.tensor(token_ids, dtype=torch.long, device=device)
    return flat.view(-1, 1)

# For a source, target pair, call the above. Return a tuple of 2 tensors, one input_tensor and another an output_tensor
def tensorsFromPair(pair):
    """Convert a [source, target] sentence pair into (input_tensor, target_tensor).

    pair[0] is encoded with the global input_lang and pair[1] with the global
    output_lang; each tensor has one row per token plus a final EOS row.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(input_lang, source_sentence),
        tensorFromSentence(output_lang, target_sentence),
    )

In [None]:
# For 50% of pairs, use teacher forcing so that we predict (y_1, y_2, ..., y_{T}) from (y_0, y_1, ..., y_{T-1}) on the decoder side
# Without teacher forcing, we start with y_0 = SOS_token and then use \hat{y}_1, the prediction at time step 0, as the input to time step 1 on the decoder side
# For this case, we'll predict (\hat{y}_1, \hat{y}_2, ..., \hat{y}_{T}) from (y_0, \hat{y}_1, ..., \hat{y}_{T-1}) on the decoder side
# The crucial thing to realize here is that \hat{y}_t is stochastic, and dependent on what the model predicts - mistakes propagate!
# Seed Python's `random` module so the random draws made below (teacher-forcing choice, pair sampling) are repeatable
random.seed(42)
# Probability that a given training pair is decoded with teacher forcing
teacher_forcing_ratio = 0.5

def train(
    input_tensor,
    target_tensor,
    encoder,
    decoder,
    encoder_optimizer,
    decoder_optimizer,
    criterion,
    max_length=MAX_LENGTH
):
    """Run one optimization step on a single (source, target) pair.

    Args:
        input_tensor: source token ids ending in EOS, one id per row
            (as built by tensorFromSentence).
        target_tensor: target token ids ending in EOS, one id per row.
        encoder: encoder module called as encoder(token, (hidden, cell)).
        decoder: decoder module called as decoder(token, (hidden, cell)); its
            hidden size must match the encoder's because it is started from
            the encoder's final state.
        encoder_optimizer: optimizer over the encoder parameters.
        decoder_optimizer: optimizer over the decoder parameters.
        criterion: loss taking (logits of shape (1, vocab), target of shape (1,)).
        max_length: kept for interface compatibility; not used by this function.

    Returns:
        The summed loss divided by the number of decoder steps actually taken.
    """
    # Start the encoder from zero hidden and cell states
    encoder_hidden = encoder.initHidden()
    encoder_cell = encoder.initHidden()

    # Reset the optimizer gradients to 0
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    loss = 0

    # If we want to predict [x, y, z] from [a, b, c], we feed in [c, b, a] on the
    # encoder side so that a is as close to x as possible
    input_tensor = torch.flip(input_tensor, dims=(0,))

    # Only the final (hidden, cell) state is needed; per-step outputs are discarded
    for it in range(input_length):
        _, (encoder_hidden, encoder_cell) = encoder(
            input_tensor[it], (encoder_hidden, encoder_cell)
        )

    # The decoder always starts from the SOS token
    decoder_input = torch.tensor([[SOS_token]], device=device)

    # Initialize the decoder with the final encoder state. Starting from zeros
    # instead would make the decoder ignore the source sentence and would leave
    # the encoder without any gradient.
    decoder_hidden = encoder_hidden
    decoder_cell = encoder_cell

    # For this pair, use teacher forcing with probability teacher_forcing_ratio
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    target_length_used = 0

    if use_teacher_forcing:
        # Teacher forcing: feed the ground-truth token as the next input
        target_length_used = target_length

        for jt in range(target_length):
            decoder_output, (decoder_hidden, decoder_cell) = decoder(
                decoder_input, (decoder_hidden, decoder_cell)
            )
            # decoder_output is (1, 1, vocab); drop the leading dim for the loss
            loss += criterion(decoder_output.squeeze(0), target_tensor[jt])
            decoder_input = target_tensor[jt]

    else:
        # Without teacher forcing: feed the model's own prediction back in,
        # for at most target_length steps, stopping early on a predicted EOS
        for jt in range(target_length):
            decoder_output, (decoder_hidden, decoder_cell) = decoder(
                decoder_input, (decoder_hidden, decoder_cell)
            )

            _, topi = decoder_output.topk(1)
            decoder_input = topi[0].detach()  # Detach from history as input

            loss += criterion(decoder_output.squeeze(0), target_tensor[jt])
            target_length_used += 1

            # If the EOS_token was generated, exit
            if decoder_input.item() == EOS_token:
                break

    # Backpropagate and update both networks
    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()

    # Normalize by the steps actually taken, which can be fewer than
    # target_length when decoding stopped early
    return loss.item() / target_length_used

In [None]:
""

''

In [None]:
# Helper functions used to time the optimizations
import time
import math

def asMinutes(s):
    """Format a duration of `s` seconds as '<minutes>m <seconds>s'.

    Both parts are rendered with %d, so fractional seconds are truncated.
    """
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for a job started at `since`.

    Args:
        since: start timestamp as returned by time.time().
        percent: fraction of the work completed so far (must be non-zero);
            the total time is extrapolated as elapsed / percent.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

trainIters below does SGD (batch 1) on the data.

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    """Plot `points` as a line on a new figure with y ticks every 0.2.

    plt.subplots() already creates the figure, so no separate plt.figure()
    call is made; the extra call opened a second, empty figure each time.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

def trainIters(
    encoder,
    decoder,
    n_iters,
    print_every=1000,
    plot_every=100,
    learning_rate=0.01
):
    """Train the encoder/decoder with SGD on `n_iters` randomly drawn pairs.

    Each iteration trains on one pair (batch size 1). The average loss is
    printed every `print_every` iterations and recorded every `plot_every`
    iterations; the recorded averages are plotted once, after training.

    Args:
        encoder: encoder module to optimize.
        decoder: decoder module to optimize.
        n_iters: number of single-pair training steps.
        print_every: iterations between progress printouts.
        plot_every: iterations between recorded plot points.
        learning_rate: learning rate for both SGD optimizers.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    # One SGD optimizer per network, sharing the same learning rate
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

    # Draw the n_iters training pairs up front (sampling with replacement)
    training_pairs = [tensorsFromPair(random.choice(pairs)) for _ in range(n_iters)]

    # CrossEntropyLoss works on raw logits
    criterion = nn.CrossEntropyLoss()

    for it in range(1, n_iters + 1):
        input_tensor, target_tensor = training_pairs[it - 1]

        # Train on the input, target pair
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

        # We can plot and print at different granularities
        print_loss_total += loss
        plot_loss_total += loss

        if it % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(
                '%s (%d %d%%) %.4f' % (
                    timeSince(start, it / n_iters),
                    it, it / n_iters * 100, print_loss_avg)
            )

        if it % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    # Plot once at the end. Plotting inside the loop opened new figures at
    # every plot_every step without ever closing them.
    if plot_losses:
        showPlot(plot_losses)

Fill in the evaluation function using the hints below.

In [None]:
# For a certain input, get the predicted output sentence
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily translate `sentence` and return the predicted words.

    Args:
        encoder: trained encoder module.
        decoder: trained decoder module.
        sentence: normalized source sentence whose words are all in input_lang.
        max_length: maximum number of decoding steps.

    Returns:
        List of predicted words; '<EOS>' is appended as the last element when
        the decoder predicts EOS within max_length steps.
    """
    with torch.no_grad():
        # Transform the input sentence into a tensor
        input_tensor = tensorFromSentence(input_lang, sentence)
        # Reverse the source exactly as train() does, so the encoder sees the
        # same token order at inference time as during training
        input_tensor = torch.flip(input_tensor, dims=(0,))
        input_length = input_tensor.size()[0]

        # Initialize the hidden and cell states of the LSTM
        encoder_hidden = encoder.initHidden()
        encoder_cell = encoder.initHidden()

        # Run the data through the LSTM word by word; only the final
        # (hidden, cell) state is needed, so per-step outputs are discarded
        for it in range(input_length):
            _, (encoder_hidden, encoder_cell) = encoder(
                input_tensor[it], (encoder_hidden, encoder_cell)
            )

        # Initialize the decoder input with the SOS token; this is y_0
        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        # Initialize the decoder state with the final encoder state
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        decoded_words = []

        for jt in range(max_length):
            # Feed \hat{y}_{t-1} and the current state through the decoder
            decoder_output, (decoder_hidden, decoder_cell) = decoder(
                decoder_input, (decoder_hidden, decoder_cell)
            )
            # Greedy choice: the highest-scoring word becomes \hat{y}_t
            _, topi = decoder_output.data.topk(1)

            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words

In [None]:
# Model dimensions. The decoder embedding width is tied to hidden_size.
hidden_size = 256
# NOTE(review): cell_size is not referenced by any other cell shown in this notebook
cell_size = 256
embed_size = hidden_size
# Initialize the encoder and decoder and run them through the trainIters function
# The encoder is sized by the input vocabulary, the decoder by the output vocabulary
encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, output_lang.n_words).to(device)

# 75000 single-pair training steps, reporting the average loss every 5000 steps
trainIters(encoder, decoder, 75000, print_every=5000)

  plt.figure()


0m 55s (- 12m 51s) (5000 6%) 3.8022
1m 45s (- 11m 25s) (10000 13%) 3.5051
2m 34s (- 10m 19s) (15000 20%) 3.3614
3m 25s (- 9m 24s) (20000 26%) 3.2810
4m 15s (- 8m 30s) (25000 33%) 3.2376
5m 5s (- 7m 38s) (30000 40%) 3.2033
5m 56s (- 6m 47s) (35000 46%) 3.1911
6m 46s (- 5m 55s) (40000 53%) 3.1335
7m 36s (- 5m 4s) (45000 60%) 3.1483
8m 26s (- 4m 13s) (50000 66%) 3.0758
9m 17s (- 3m 22s) (55000 73%) 3.0748
10m 7s (- 2m 31s) (60000 80%) 3.0640
10m 57s (- 1m 41s) (65000 86%) 3.0434
11m 48s (- 0m 50s) (70000 93%) 2.9983
12m 39s (- 0m 0s) (75000 100%) 3.0087


In [None]:
# max_index = max([output_lang.word2index[word] for sentence in pairs for word in sentence[1].split()])
# print("Maximum index in target data:", max_index)

Here we will do an evaluation.
Gather up n=7500 random pairs and for each pair get the BLEU score.
For this exercise, don't use BLEU based on 4-grams; use just 2-grams (you need to figure out how to specify this in `sentence_bleu` below).
Investigate how this can be done with `sentence_bleu` in nltk - you can also find other implementations.
Print the average BLEU score after you've randomly drawn the sentences. You should get a BLEU of about 10 or slightly more.
Note that this is training BLEU, which is all I'd like you to get.

In [None]:
from nltk.translate.bleu_score import sentence_bleu

def evaluateRandomly(encoder, decoder, n=7500, debug=False):
    """Translate `n` randomly drawn training pairs and report the mean BLEU.

    BLEU is computed per sentence with 50% weight on unigrams and 50% on
    bigrams, using pair[1] as the only reference.

    Args:
        encoder: trained encoder module.
        decoder: trained decoder module.
        n: number of pairs to sample (with replacement) from the global `pairs`.
        debug: when True, print the source, the reference, the candidate and
            the score for every sampled pair.

    Returns:
        The mean BLEU score, which is also printed; 0.0 when no pair was
        evaluated (n <= 0), so the average never divides by zero.
    """
    bleu_scores = []
    for i in range(n):
        # Randomly choose a pair of sentences
        pair = random.choice(pairs)
        if debug:
            print('French Original: ', pair[0])
            print('English Reference: ', pair[1])

        # Run the source French sentence through the encoder-decoder
        output_words = evaluate(encoder, decoder, pair[0])

        # Leave out the trailing <EOS> symbol, if the decoder produced one
        if output_words and output_words[-1] == '<EOS>':
            output_words = output_words[:-1]

        output_sentence = ' '.join(output_words)
        # BLEU on 1-grams and 2-grams with equal weight; pair[1] is the reference
        score = sentence_bleu([pair[1].split()], output_words, weights=(0.5, 0.5))
        bleu_scores.append(score)
        if debug:
            print('Candidate Translation: ', output_sentence)
            print('BLEU: ', score)
            print('')

    mean_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
    print('The mean BLEU score is: ', mean_bleu)
    return mean_bleu

In [None]:
# BLEU > 10 % is expected here
# The mean is printed as a fraction, so 10 % shows up as roughly 0.10
evaluateRandomly(encoder, decoder)

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


The mean BLEU score is:  0.10399325960656876
