In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import unicodedata
import string
import re
import random
import time
import datetime
import math
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import torchtext

import spacy
import numpy as np


In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Data


In [None]:

# import spacy.cli

# spacy.cli.download("en_core_web_sm")
# spacy.cli.download("fr_core_news_sm")

import fr_core_news_sm
import en_core_web_sm
from torchtext.data.utils import get_tokenizer

# Full spaCy pipelines for French and English.
# NOTE(review): these two objects are not referenced by the cells visible
# here; confirm whether anything else still needs them.
spacy_fr = fr_core_news_sm.load()
spacy_en = en_core_web_sm.load()

# torchtext tokenizer callables backed by the same spaCy model names. These are
# the tokenizers handed to build_vocab and used inside data_process.
spacy_en_tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
spacy_fr_tokenizer = get_tokenizer("spacy", language="fr_core_news_sm")

In [None]:
from collections import OrderedDict, Counter
from torchtext.vocab import vocab
import io

# Location of the tab-separated English-French splits. `path` is combined with
# a file name by plain string concatenation (path + file name), so it has to
# end with a slash.
path = './data/eng-fre/'
train_fn = 'train_eng_fre.tsv'
valid_fn = 'val_eng_fre.tsv'
test_fn = 'test_eng_fre.tsv'


def build_vocab(filepath, src_tokenizer, trg_tokenizer):
  """Generate vocabulary objects for source and target languages.

  The file is read as UTF-8, tab-separated text. The first line is treated as
  a header and skipped; every other non-blank line must hold exactly two
  columns: the target sentence first, then the source sentence. Sentences are
  lower-cased before tokenization.

  Args:
    filepath: path of the TSV file to read.
    src_tokenizer: callable turning a source string into a list of tokens.
    trg_tokenizer: callable turning a target string into a list of tokens.

  Returns:
    A ``(src_vocab, trg_vocab)`` tuple of torchtext vocab objects. Tokens seen
    fewer than two times are dropped, the specials '<unk>', '<pad>', '<bos>'
    and '<eos>' are added, and the default index of each vocabulary is set to
    its '<unk>' index so unknown tokens can be looked up without an error
    (also after the vocabulary has been pickled and reloaded).

  Raises:
    ValueError: if a non-blank data line does not split into two columns.
  """
  specials = ('<unk>', '<pad>', '<bos>', '<eos>')
  src_counter, trg_counter = Counter(), Counter()

  with open(filepath, encoding="utf-8") as f:
    next(f, None)  # skip header
    # Iterate the file lazily instead of loading every line with readlines().
    for line in f:
      if not line.strip():
        # A blank (e.g. trailing) line carries no sentence pair.
        continue
      # split line and tokenize accordingly
      trg_line, src_line = line.strip("\n").split("\t")
      src_counter.update(src_tokenizer(src_line.lower()))
      trg_counter.update(trg_tokenizer(trg_line.lower()))

  def _to_vocab(counter):
    # Most frequent tokens first; the stable sort keeps first-seen order for
    # ties, so token ids stay the same from run to run.
    ordered = OrderedDict(sorted(counter.items(), key=lambda kv: kv[1], reverse=True))
    vocab_obj = vocab(ordered, min_freq=2, specials=specials)
    # Map out-of-vocabulary tokens to '<unk>' instead of raising.
    vocab_obj.set_default_index(vocab_obj['<unk>'])
    return vocab_obj

  return _to_vocab(src_counter), _to_vocab(trg_counter)

# French is the source language and English the target, so the French
# tokenizer is the source tokenizer. The vocabularies are built from the
# training split only.
src_vocab, trg_vocab = build_vocab(
  filepath=path + train_fn,
  src_tokenizer=spacy_fr_tokenizer,
  trg_tokenizer=spacy_en_tokenizer,
)

In [None]:
import os
import pickle

# Persist the vocabularies so later cells (e.g. the BLEU evaluation) can reuse
# exactly the same token ids.
#
# To reuse previously saved vocabularies instead of the freshly built ones:
# with open("./ckpt/src_vocab","rb") as f:
#      src_vocab = pickle.load(f)
# with open("./ckpt/trg_vocab","rb") as f:
#      trg_vocab = pickle.load(f)

# The checkpoint directory may be missing on a fresh checkout; create it so the
# writes below do not fail with FileNotFoundError.
os.makedirs("./ckpt", exist_ok=True)

with open("./ckpt/src_vocab", "wb") as f:
  pickle.dump(src_vocab, f)

with open("./ckpt/trg_vocab", "wb") as f:
  pickle.dump(trg_vocab, f)

In [None]:
import io
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Mini-batch size used for each data split.
BATCH_SIZE = dict(train=16, val=256, test=256)

# Indices of the padding and sentence-boundary tokens, looked up in the target
# vocabulary. generate_batch applies these same indices to source sequences.
PAD_IDX, BOS_IDX, EOS_IDX = (trg_vocab[tok] for tok in ('<pad>', '<bos>', '<eos>'))

# Tokens missing from a vocabulary resolve to that vocabulary's '<unk>' index.
unk_token = '<unk>'
src_vocab.set_default_index(src_vocab[unk_token])
trg_vocab.set_default_index(trg_vocab[unk_token])


def data_process(path, split):
  """Convert raw source and target sentences into tensors.

  Reads the UTF-8 TSV file ``path + split``, skips its first (header) line and
  any blank line, and converts each remaining line into token-id tensors. Each
  line must hold two tab-separated columns: target sentence, then source
  sentence. Sentences are lower-cased, tokenized with the French (source) and
  English (target) tokenizers and mapped through ``src_vocab`` / ``trg_vocab``.

  Args:
    path: directory prefix; concatenated directly with ``split``.
    split: file name of the split to read.

  Returns:
    A list of ``(src_tensor, trg_tensor)`` pairs of 1-D ``torch.long`` tensors.

  Raises:
    ValueError: if a non-blank data line does not split into two columns.
  """
  data = []
  # The context manager closes the file even when a malformed line raises.
  with io.open(path + split, encoding="utf-8") as raw_file:
    next(raw_file, None)  # skip the header row
    for item in raw_file:
      if not item.strip():
        # A blank (e.g. trailing) line carries no sentence pair.
        continue
      trg_raw, src_raw = item.strip("\n").split("\t")
      src_tensor = torch.tensor(
          [src_vocab[token] for token in spacy_fr_tokenizer(src_raw.lower())],
          dtype=torch.long
        )
      trg_tensor = torch.tensor(
          [trg_vocab[token] for token in spacy_en_tokenizer(trg_raw.lower())],
          dtype=torch.long
        )
      data.append((src_tensor, trg_tensor))

  return data

def generate_batch(data_batch):
  """Collate (source, target) tensor pairs into two padded batch tensors.

  Every sequence is wrapped as BOS_IDX + tokens + EOS_IDX. The source
  sequences and the target sequences are then each padded with PAD_IDX by
  pad_sequence (called with its default layout) and returned as a
  ``(src_batch, trg_batch)`` tuple.
  """
  def wrap(sequence):
    # Surround one token-id tensor with the begin/end-of-sentence markers.
    return torch.cat([torch.tensor([BOS_IDX]), sequence, torch.tensor([EOS_IDX])], dim=0)

  wrapped_pairs = [(wrap(src_item), wrap(trg_item)) for (src_item, trg_item) in data_batch]
  src_batch = pad_sequence([pair[0] for pair in wrapped_pairs], padding_value=PAD_IDX)
  trg_batch = pad_sequence([pair[1] for pair in wrapped_pairs], padding_value=PAD_IDX)

  return src_batch, trg_batch

# Convert every split to tensors up front; each call reads and tokenizes one
# whole file.
train_data = data_process(path, train_fn)
val_data = data_process(path, valid_fn)
test_data = data_process(path, test_fn)

# Only the training loader shuffles. Evaluation loaders keep the file order so
# the batch composition (and therefore the batch-averaged validation loss) does
# not depend on the random number generator state.
train_iter = DataLoader(
    train_data,
    batch_size=BATCH_SIZE["train"],
    shuffle=True,
    collate_fn=generate_batch
)

valid_iter = DataLoader(
    val_data,
    batch_size=BATCH_SIZE["val"],
    shuffle=False,
    collate_fn=generate_batch
)

test_iter = DataLoader(
    test_data,
    batch_size=BATCH_SIZE["test"],
    shuffle=False,
    collate_fn=generate_batch
)


# Persist the three DataLoader objects with pickle so a later session can
# reload them instead of re-tokenizing the data.
# NOTE(review): presumably each pickle contains the loader's whole dataset and
# can only be loaded where generate_batch is defined -- confirm before relying
# on these files outside this notebook.
with open("./ckpt/train_iter", "wb") as f:
     pickle.dump(train_iter, f)

with open("./ckpt/valid_iter", "wb") as f:
     pickle.dump(valid_iter, f)

with open("./ckpt/test_iter", "wb") as f:
     pickle.dump(test_iter, f)

## load iter

In [None]:

# with open("./drive/My Drive/COLX_531_lab2_jxkuang/ckpt/train_iter", "rb") as f:
#     train_iter = pickle.load(f)

# with open("./drive/My Drive/COLX_531_lab2_jxkuang/ckpt/valid_iter", "rb") as f:
#     valid_iter = pickle.load(f)

# with open("./drive/My Drive/COLX_531_lab2_jxkuang/ckpt/test_iter", "rb") as f:
#     test_iter = pickle.load(f)

# Instantiation

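## Notes: the model classes used below (`Attention`, `Encoder`, `Decoder`, `Seq2Seq`) come from Seq2Seq_models.ipynb and must be defined before running this section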

In [None]:
# Vocabulary sizes: passed as the first argument of Encoder and Decoder below.
INPUT_DIM = len(src_vocab)
OUTPUT_DIM = len(trg_vocab)

# hyperparameters
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512  # NOTE: enc_hid_dim and dec_hid_dim should be equal
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
N_LAYERS = 1
LEARNING_RT = 0.001  # learning rate handed to the Adam optimizer
BIDIRECTIONAL = True  # passed to Encoder as its last argument

# model components
# NOTE(review): Attention, Encoder, Decoder and Seq2Seq are not defined in the
# cells shown here; presumably they come from Seq2Seq_models.ipynb -- confirm
# they are in scope before running this cell.
#device = "cpu"  # for more precise debugging
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT, BIDIRECTIONAL)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attn)
# Assemble the full model and move it to the selected device.
model_n = Seq2Seq(enc, dec, device).to(device)



# # model
# enc = Encoder1(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
# dec = Decoder1(OUTPUT_DIM, DEC_EMB_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT)
# model1 = Seq2Seq1(enc, dec, device).to(device)

# # model2 components
# attn = Attention2(ENC_HID_DIM, DEC_HID_DIM)
# enc = Encoder2(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT)
# dec = Decoder2(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attn)
# model2 = Seq2Seq2(enc, dec, device).to(device)

# Train

In [None]:
# Model trained by the cells below; point this at another instance to train a
# different variant.
model = model_n

def init_weights(m):
    """Re-initialise every parameter of module ``m`` in place.

    Parameters whose name contains 'weight' are drawn from a normal
    distribution (mean 0, std 0.01); every other parameter is set to 0.
    """
    for name, param in m.named_parameters():
        if 'weight' not in name:
            # Biases and any other non-weight parameter start at zero.
            nn.init.constant_(param.data, 0)
            continue
        nn.init.normal_(param.data, mean=0, std=0.01)
            
# Seed PyTorch before drawing the initial weights. The training cell calls
# torch.manual_seed(531) only after this point, so without this line the
# starting weights would differ from run to run.
torch.manual_seed(531)

# init_weights already walks every parameter of the module it is given, so one
# call on the root model initialises each parameter exactly once
# (model.apply(init_weights) would repeat the work for every sub-module).
init_weights(model)

optimizer = optim.Adam(model.parameters(), lr = LEARNING_RT)
# Positions holding the padding index are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)


In [None]:
# Seed every random number generator imported by this notebook so that runs
# are repeatable. BE SURE TO USE THE SAME SEED EACH TIME YOU RUN!
# NOTE(review): the model code is not visible here; it may draw from Python's
# `random` module, which is why it is seeded alongside torch.
manual_seed = 531
random.seed(manual_seed)
np.random.seed(manual_seed)
torch.manual_seed(manual_seed)

def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: model called as ``model(src, trg)``; its result is unpacked as
            a pair whose first element holds the output scores. For a model
            that returns only the output, drop the tuple unpacking below.
        iterator: iterable of ``(src, trg)`` batches that supports ``len()``.
        optimizer: optimizer updating the model's parameters.
        criterion: loss called as ``criterion(scores, targets)``.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        The sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0
    for step, (src, trg) in enumerate(iterator):
        if step % 500 == 0:
            # Coarse progress indicator: prints the batch index.
            print(step)
        src, trg = src.to(device), trg.to(device)
        optimizer.zero_grad()
        output, _ = model(src, trg)
        output_dim = output.shape[-1]
        # Drop the first time step of both tensors and flatten the remaining
        # steps so each row of `output` lines up with one entry of `trg`.
        output = output[1:].view(-1, output_dim)
        trg = trg[1:].view(-1)
        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss of ``model`` over ``iterator``.

    The model is put in eval mode and gradients are disabled. It is called as
    ``model(src, trg, 0)`` and its result is unpacked as a pair whose first
    element holds the output scores (presumably the 0 switches teacher forcing
    off -- confirm against the model code). For a model that returns only the
    output, drop the tuple unpacking.
    """
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for src, trg in iterator:
            src = src.to(device)
            trg = trg.to(device)

            scores, _ = model(src, trg, 0)

            # Skip the first time step and flatten so rows of the scores line
            # up with entries of the targets.
            flat_scores = scores[1:].view(-1, scores.shape[-1])
            flat_targets = trg[1:].view(-1)

            total_loss += criterion(flat_scores, flat_targets).item()

    return total_loss / len(iterator)

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to integers.
    """
    elapsed = end_time - start_time
    whole_minutes = int(elapsed / 60)
    return whole_minutes, int(elapsed - (whole_minutes * 60))

N_EPOCHS = 10
CLIP = 1  # maximum gradient norm passed to train()

# Lowest validation loss seen so far and the (1-based) epoch that produced it,
# so the best checkpoint can be identified once training finishes.
best_valid_loss = float('inf')
best_epoch = None


for epoch in range(N_EPOCHS):

    start_time = time.time()

    train_loss = train(model, train_iter, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iter, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Create checkpoint at end of each epoch (file names are 1-based, the
    # stored 'epoch' value is 0-based).
    state_dict_model = model.state_dict()
    state = {
        'epoch': epoch,
        'state_dict': state_dict_model,
        'optimizer': optimizer.state_dict()
        }
    torch.save(state, "./ckpt/seq2seq_"+str(epoch+1)+".pt")

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        best_epoch = epoch + 1

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

# Report which checkpoint had the lowest validation loss.
print(f'Best Val. Loss: {best_valid_loss:.3f} at epoch {best_epoch} (./ckpt/seq2seq_{best_epoch}.pt)')


# Evaluation of the model using BLEU

In [None]:
import pickle

# Reload the vocabularies written by the earlier pickling cell.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook produced itself.
with open("./ckpt/src_vocab","rb") as f:
     src_saved = pickle.load(f)

with open("./ckpt/trg_vocab","rb") as f:
     trg_saved = pickle.load(f)

# with open("./ckpt/test_iter", "rb") as f:
#     test_iter = pickle.load(f)

In [None]:
# Size the model from the vocabularies that were just loaded from disk -- the
# same objects used for decoding during evaluation -- so this section does not
# depend on the in-memory vocabularies built at the top of the notebook.
INPUT_DIM = len(src_saved)
OUTPUT_DIM = len(trg_saved)


# hyperparameters (must match the values used when the checkpoint was trained)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
ENC_HID_DIM = 512
DEC_HID_DIM = 512
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5
N_LAYERS = 1
LEARNING_RT = 0.001
BIDIRECTIONAL = True

# model components
# NOTE(review): Attention, Encoder, Decoder and Seq2Seq are not defined in the
# cells shown here; presumably they come from Seq2Seq_models.ipynb -- confirm
# they are in scope before running this cell.
attn = Attention(ENC_HID_DIM, DEC_HID_DIM)
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, ENC_HID_DIM, N_LAYERS, ENC_DROPOUT, BIDIRECTIONAL)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, ENC_HID_DIM, DEC_HID_DIM, N_LAYERS, DEC_DROPOUT, attn)
# Weights are loaded from a checkpoint in the next cell.
model_best = Seq2Seq(enc, dec, device)

In [None]:
# Restore the weights of the chosen epoch. map_location=device lets a
# checkpoint that was saved on a GPU be loaded on a CPU-only machine as well.
checkpoint = torch.load('./ckpt/seq2seq_5.pt', map_location=device)
model_best.load_state_dict(checkpoint['state_dict']) # choose the best
model_best = model_best.to(device)

In [None]:
def new_inference(model, trg_vocab, test_iter, attention=False, max_trg_len=64):
    '''
    Function for translation inference: translate every batch of test_iter
    by picking the top-scoring token at each step, then score all
    translations against the gold targets with corpus BLEU.

    Input:
    model: translation model; called as model(src, trg_placeholder, 0).
    trg_vocab: Target torchtext Vocab.
    test_iter: iterator object with test data, yielding (src, trg) batches.
    attention: True when the model returns an (output, attention) pair,
               False when it returns only the output.
    max_trg_len: the maximal length of translation text (optional), default = 64

    Output:
    Corpus BLEU score.
    '''
    from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

    # initializes smoothing function
    chencherry = SmoothingFunction()

    # Build the lookups once: get_itos() materialises the whole vocabulary, so
    # calling it for every token would dominate the run time.
    itos = trg_vocab.get_itos()
    eos_idx = trg_vocab['<eos>']
    pad_idx = trg_vocab['<pad>']

    # convert a list of token indices to tokens, stopping at the first <eos>
    def convert_itos(token_ids):
        list_string = []
        for idx in token_ids:
            if idx == eos_idx:
                break
            list_string.append(itos[idx])
        return list_string

    model.eval()
    all_gold_text = []
    all_translated_text = []

    with torch.no_grad():

        for src, trg in test_iter:

            src, trg = src.to(device), trg.to(device)

            batch_size = trg.shape[1]

            # placeholder for the target language with shape
            # [max_trg_len, batch_size], filled with the index of <pad>
            trg_placeholder = torch.full(
                (max_trg_len, batch_size), pad_idx, dtype=torch.long, device=device
            )
            # the third argument 0 turns off teacher forcing
            if attention:
                output, _ = model(src, trg_placeholder, 0)
            else:
                output = model(src, trg_placeholder, 0)

            # ignore the first position (the <bos> slot) in both the
            # translation and the gold target sentences
            gold_ids = trg[1:].cpu()
            # choose the top 1 word from the decoder's output at every step;
            # topk returns (values, indices) and only the indices are needed
            translated_ids = output[1:].topk(1)[1].squeeze(2).cpu()

            # one column per sentence; convert to plain ints for the lookups
            for j in range(batch_size):
                all_gold_text.append(convert_itos(gold_ids[:, j].tolist()))
                all_translated_text.append(convert_itos(translated_ids[:, j].tolist()))

    # corpus_bleu expects a list of references for every hypothesis
    corpus_all_gold_text = [[item] for item in all_gold_text]
    # compute bleu with smoothing function (chencherry method 0)
    # see: https://www.nltk.org/api/nltk.translate.bleu_score.html#nltk.translate.bleu_score.SmoothingFunction
    corpus_bleu_score = corpus_bleu(corpus_all_gold_text, all_translated_text, smoothing_function=chencherry.method0)
    return corpus_bleu_score

# NOTE: Don't forget to run the below line to get the results you need, passing in 'model' with the proper weight initiatlization
# print(new_inference(model, trg_vocab, test_iter, attention=False, max_trg_len=64))

In [None]:
# Corpus BLEU of the restored checkpoint on the test split. attention=True
# makes new_inference unpack the model result as a pair, the same way the
# training and evaluation functions unpack it.
print(new_inference(model_best, trg_saved, test_iter, attention=True, max_trg_len=64))