In [None]:
%pip install datasets transformers
%pip install bert_score
%pip install evaluate
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

In [None]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from typing import Iterable, List
from datasets import load_dataset
from torchtext.datasets import multi30k, Multi30k
from torch import Tensor
import torch
import evaluate as E
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.nn import Transformer
import math
from torch.nn.utils.rnn import pad_sequence
from timeit import default_timer as timer
from torch.utils.data import DataLoader
import numpy as np
import math
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import T5Tokenizer, T5Model
import pandas as pd
from transformers import pipeline
from transformers import BertTokenizer, BertModel, BertForSequenceClassification, BertConfig
from sentence_transformers import SentenceTransformer


# Loading the Task 1 dataset

In [None]:
# Directory holding the Task 1 tab-separated files.
path = '/content/drive/MyDrive/A3_task1_data_files'

# Training split: discard rows with missing values, then renumber the index.
task1_train_data = (
    pd.read_csv(path + '/train.csv', sep="\t")
    .dropna()
    .reset_index(drop=True)
)

# Validation split: the 'setence1' column is renamed to 'sentence1' before
# the same cleanup is applied.
task1_val_data = (
    pd.read_csv(path + '/dev.csv', sep="\t")
    .rename(columns={'setence1': 'sentence1'})
    .dropna()
    .reset_index(drop=True)
)

# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)


## 1B

In [None]:
def metrics_1(data, model):
    """Print the Pearson correlation between model similarities and gold scores.

    Each sentence pair is embedded with ``model.encode``, compared by cosine
    similarity, multiplied by 5, and the resulting values are correlated with
    ``data['score']``.

    Args:
        data: DataFrame with ``sentence1``, ``sentence2`` and ``score`` columns.
        model: Object whose ``encode`` method accepts a list of sentences and
            returns one embedding per sentence (e.g. a SentenceTransformer).

    Returns:
        None. The correlation is printed.
    """
    # Encode each column with one batched call instead of one call per row.
    embeds1 = np.asarray(model.encode(data['sentence1'].tolist()))
    embeds2 = np.asarray(model.encode(data['sentence2'].tolist()))

    # Row-wise cosine similarity between the paired embeddings.
    dots = np.sum(embeds1 * embeds2, axis=1)
    norms = np.linalg.norm(embeds1, axis=1) * np.linalg.norm(embeds2, axis=1)
    values = (dots / norms) * 5

    pearson = np.corrcoef(values, data['score'])
    print("Pearson Correlation: ", pearson[0][1])

In [None]:
# Load the named sentence encoder, move it to DEVICE and score it on the
# validation split.
model_1B = SentenceTransformer('distilbert-base-nli-mean-tokens').to(DEVICE)
print("Val Metrics")
metrics_1(task1_val_data, model_1B)
# Recorded output:
# Pearson Correlation:  0.7923100957816998

In [None]:
# Prefix for the saved model directory; empty means the working directory.
# Alternative location: path = "/content/drive/MyDrive/"
path = ""
model_1C = SentenceTransformer(path + 'model_1C').to(DEVICE)
print("Val Metrics")
metrics_1(task1_val_data, model_1C)
# Recorded output:
# Pearson Correlation:  0.8572838200887052


# Loading the Task 2 dataset

In [None]:
# WMT16 German-English: the first 50,000 training pairs plus the full
# validation and test splits.
task2_train_data, task2_val_data, task2_test_data = (
    load_dataset("wmt16", "de-en", split=split_name)
    for split_name in ("train[:50000]", "validation", "test")
)


In [None]:
# Evaluation metrics shared by the 2A and 2B evaluation functions.
bleu, meteor_metric, bertscore_metric = (
    E.load(metric_name) for metric_name in ("bleu", "meteor", "bertscore")
)

## 2A

In [None]:
# Translation direction for the from-scratch transformer: German -> English.
SRC_LANGUAGE, TGT_LANGUAGE = 'de', 'en'

# One spaCy-backed tokenizer per language, keyed by language code.
token_transform = {
    SRC_LANGUAGE: get_tokenizer('spacy', language='de_core_news_sm'),
    TGT_LANGUAGE: get_tokenizer('spacy', language='en_core_web_sm'),
}

# Populated further down with one vocabulary per language.
vocab_transform = {}


def yield_tokens(data_iter: Iterable, language: str) -> Iterable[List[str]]:
    """Yield the tokenized ``language`` text of every sample in ``data_iter``.

    Args:
        data_iter: Iterable of samples; each sample is indexed as
            ``sample['translation'][language]``.
        language: Key into ``token_transform`` and into each sample's
            ``'translation'`` mapping.

    Yields:
        The output of the language's tokenizer for each sample, in order.
    """
    # The tokenizer does not change between samples, so look it up once.
    tokenize = token_transform[language]
    for data_sample in data_iter:
        yield tokenize(data_sample['translation'][language])


# Indices of the reserved symbols; they follow the order of special_symbols,
# which is passed to the vocabulary builder with special_first=True.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

# Build one vocabulary per language from the training pairs and make lookups
# of unseen tokens fall back to UNK_IDX.
train_iter = task2_train_data
for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(train_iter, ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True,
    )
    vocab_transform[ln].set_default_index(UNK_IDX)


# Same device selection as earlier in the notebook.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class PositionalEncoding(nn.Module):
    """Add a fixed sinusoidal position signal to token embeddings, then apply dropout.

    The table is precomputed for ``maxlen`` positions and stored as the
    non-trainable buffer ``pos_embedding`` of shape ``(maxlen, 1, emb_size)``.
    """

    def __init__(self,
                 emb_size: int,
                 dropout: float,
                 maxlen: int = 5000):
        """
        Args:
            emb_size: Width of the embeddings (even or odd).
            dropout: Dropout probability applied after adding the positions.
            maxlen: Number of positions to precompute.
        """
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den
        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_size there is one fewer odd column than even columns,
        # so only the first emb_size // 2 frequencies get a cosine column.
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis so the table broadcasts over the middle
        # dimension of the input in forward().
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding: Tensor):
        """Return ``dropout(token_embedding + positions)``.

        The table is sliced by ``token_embedding.size(0)``, i.e. the first
        dimension of the input is treated as the position axis.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class TokenEmbedding(nn.Module):
    """Embedding lookup whose output is multiplied by ``sqrt(emb_size)``."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.emb_size = emb_size
        self.embedding = nn.Embedding(vocab_size, emb_size)

    def forward(self, tokens: Tensor):
        """Look up ``tokens`` (cast to int64) and scale the vectors."""
        scale = math.sqrt(self.emb_size)
        vectors = self.embedding(tokens.long())
        return vectors * scale

class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder model built on ``torch.nn.Transformer``.

    Source and target tokens have separate embedding tables, share one
    positional encoding module, and decoder outputs are projected to
    ``tgt_vocab_size`` scores by ``generator``.
    """

    def __init__(self,
                 num_encoder_layers: int,
                 num_decoder_layers: int,
                 emb_size: int,
                 nhead: int,
                 src_vocab_size: int,
                 tgt_vocab_size: int,
                 dim_feedforward: int = 512,
                 dropout: float = 0.1):
        super().__init__()
        # Submodules are created in this order and under these attribute
        # names; both are kept stable for parameter initialisation and for
        # state_dict keys.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def forward(self,
                src: Tensor,
                trg: Tensor,
                src_mask: Tensor,
                tgt_mask: Tensor,
                src_padding_mask: Tensor,
                tgt_padding_mask: Tensor,
                memory_key_padding_mask: Tensor):
        """Run the full encoder-decoder pass and return vocabulary scores.

        No memory mask is supplied to the transformer (it is passed as None).
        """
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(trg))
        outs = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        return self.generator(outs)

    def encode(self, src: Tensor, src_mask: Tensor):
        """Embed ``src`` and run only the encoder stack."""
        src_emb = self.positional_encoding(self.src_tok_emb(src))
        return self.transformer.encoder(src_emb, mask=src_mask)

    def decode(self, tgt: Tensor, memory: Tensor, tgt_mask: Tensor):
        """Embed ``tgt`` and run only the decoder stack against ``memory``."""
        tgt_emb = self.positional_encoding(self.tgt_tok_emb(tgt))
        return self.transformer.decoder(tgt_emb, memory, tgt_mask=tgt_mask)

def generate_square_subsequent_mask(sz):
    """Return an ``(sz, sz)`` float mask on DEVICE for causal attention.

    Entries on and below the diagonal are 0.0; entries above it are -inf.
    """
    # True on and below the diagonal.
    allowed = torch.tril(torch.ones((sz, sz), device=DEVICE)) == 1
    mask = allowed.float()
    mask = mask.masked_fill(~allowed, float('-inf'))
    mask = mask.masked_fill(allowed, float(0.0))
    return mask


def create_mask(src, tgt):
    """Build the attention and padding masks for a source/target batch.

    The first dimension of ``src`` and ``tgt`` is used as the sequence length.

    Returns:
        ``(src_mask, tgt_mask, src_padding_mask, tgt_padding_mask)`` where
        ``src_mask`` is an all-False square mask, ``tgt_mask`` comes from
        ``generate_square_subsequent_mask``, and the padding masks are the
        transposed ``== PAD_IDX`` comparisons.
    """
    src_len, tgt_len = src.shape[0], tgt.shape[0]

    # The source side masks nothing.
    src_mask = torch.zeros((src_len, src_len), device=DEVICE).type(torch.bool)
    tgt_mask = generate_square_subsequent_mask(tgt_len)

    src_padding_mask = (src == PAD_IDX).transpose(0, 1)
    tgt_padding_mask = (tgt == PAD_IDX).transpose(0, 1)
    return src_mask, tgt_mask, src_padding_mask, tgt_padding_mask

def sequential_transforms(*transforms):
    """Compose ``transforms`` into one callable applied left to right.

    With no transforms the returned callable hands back its input unchanged.
    """
    def func(txt_input):
        result = txt_input
        for step in transforms:
            result = step(result)
        return result
    return func

def tensor_transform(token_ids: List[int]):
    """Wrap ``token_ids`` with BOS_IDX and EOS_IDX and return a 1-D int64 tensor.

    The tensor is built in one step with an explicit dtype. ``torch.tensor([])``
    defaults to a floating-point dtype, so building the pieces separately could
    let an empty ``token_ids`` affect the dtype of the concatenated result.
    """
    return torch.tensor([BOS_IDX, *token_ids, EOS_IDX], dtype=torch.long)

# Per-language pipeline: raw string -> tokens -> vocabulary ids -> id tensor
# framed by BOS/EOS.
text_transform = {
    lang: sequential_transforms(token_transform[lang],
                                vocab_transform[lang],
                                tensor_transform)
    for lang in (SRC_LANGUAGE, TGT_LANGUAGE)
}

def collate_fn(batch):
    """Turn a list of translation pairs into padded source/target id tensors.

    Each pair is read as ``pair['translation'][SRC_LANGUAGE]`` and
    ``pair['translation'][TGT_LANGUAGE]``, so the text and the transform
    applied to it are always selected by the same language constant.

    Returns:
        ``(src_batch, tgt_batch)`` as produced by ``pad_sequence`` with
        ``PAD_IDX`` as the padding value.
    """
    src_batch, tgt_batch = [], []

    for pair in batch:
        src_sample = pair['translation'][SRC_LANGUAGE]
        tgt_sample = pair['translation'][TGT_LANGUAGE]
        # Trailing newlines are stripped before tokenization.
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample.rstrip("\n")))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample.rstrip("\n")))

    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch, tgt_batch

def decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode a target sequence for one source sequence.

    Starting from ``start_symbol``, the highest-scoring next token is appended
    at each step until EOS_IDX is produced or the sequence holds ``max_len``
    tokens.

    Args:
        model: Object exposing ``encode``, ``decode`` and ``generator``.
        src: Source token ids; moved to DEVICE.
        src_mask: Source attention mask; moved to DEVICE.
        max_len: Maximum length of the returned sequence, start symbol included.
        start_symbol: Token id used to seed the output.

    Returns:
        Tensor of generated token ids with one row per token, including the
        start symbol and, if it was generated, EOS_IDX.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Inference only: gradients are not needed, so do not record them.
    with torch.no_grad():
        # The encoder output is the same for every step; compute and place it once.
        memory = model.encode(src, src_mask).to(DEVICE)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
        for _step in range(max_len - 1):
            tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                        .type(torch.bool)).to(DEVICE)
            out = model.decode(ys, memory, tgt_mask)
            out = out.transpose(0, 1)
            # Score only the most recent position.
            prob = model.generator(out[:, -1])
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            ys = torch.cat([ys,
                            torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
            if next_word == EOS_IDX:
                break
    return ys


def translate_2a(model: torch.nn.Module, src_sentence: str):
    """Translate one source sentence with greedy decoding.

    Args:
        model: Model passed on to ``decode``; it is switched to eval mode.
        src_sentence: Raw source-language text.

    Returns:
        The generated target tokens joined by single spaces. BOS and EOS
        tokens are dropped by id, so the string has no stray leading or
        trailing spaces.
    """
    model.eval()
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    num_tokens = src.shape[0]
    src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
    # Allow the output to be at most 5 tokens longer than the input.
    tgt_tokens = decode(
        model, src, src_mask, max_len=num_tokens + 5, start_symbol=BOS_IDX).flatten()
    kept_ids = [idx for idx in tgt_tokens.cpu().tolist()
                if idx not in (BOS_IDX, EOS_IDX)]
    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(kept_ids))

def test_metrics_2a(model, data):
    """Translate every German sentence in ``data`` and print evaluation scores.

    Predictions from ``translate_2a`` are scored against the English
    references with BLEU, METEOR and BERTScore; the BERTScore precision,
    recall and F1 lists are averaged before printing.
    """
    model.eval()
    predictions, references = [], []
    for pair in data:
        translation = pair['translation']
        predictions.append(translate_2a(model, translation['de']))
        # One reference per prediction, wrapped in a list.
        references.append([translation['en']])

    bleu_score = bleu.compute(predictions=predictions, references=references)
    meteor_score = meteor_metric.compute(predictions=predictions, references=references)
    bertscore_score = bertscore_metric.compute(predictions=predictions, references=references, lang="en")

    bs_pre, bs_recall, bs_f1 = (
        np.mean(np.array(bertscore_score[key]))
        for key in ('precision', 'recall', 'f1')
    )

    print(f'BLEU Score: {bleu_score}')
    print(f'METEOR Score: {meteor_score}')
    print(f'BERTScore-precision: {bs_pre}')
    print(f'BERTScore-recall: {bs_recall}')
    print(f'BERTScore-f1: {bs_f1}')

In [None]:
torch.manual_seed(0)

# Model hyperparameters; presumably these must match the values used when
# model_2A.pt was trained — confirm against the training notebook.
SRC_VOCAB_SIZE = len(vocab_transform[SRC_LANGUAGE])
TGT_VOCAB_SIZE = len(vocab_transform[TGT_LANGUAGE])
EMB_SIZE = 256
NHEAD = 8
FFN_HID_DIM = 512
BATCH_SIZE = 64
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

model_2A = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE,
                                 NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved on
# a GPU can still be loaded when DEVICE is the CPU.
model_2A.load_state_dict(torch.load('model_2A.pt', map_location=DEVICE))
model_2A = model_2A.to(DEVICE)

print("Val Metrics")
test_metrics_2a(model_2A, task2_val_data)
print("Test Metrics")
test_metrics_2a(model_2A, task2_test_data)

## 2B

In [None]:
# Pretrained T5 checkpoint used without any further training in this notebook.
model_name = "t5-small"
tokenizer_2b = T5Tokenizer.from_pretrained(model_name)
model_2b = T5ForConditionalGeneration.from_pretrained(model_name).to(DEVICE)

In [None]:
def translate_2b(sentence):
    """Translate one English sentence to German with ``model_2b``.

    The task prefix is prepended to ``sentence``, the text is encoded with
    ``tokenizer_2b``, and the output is generated with beam search
    (4 beams, at most 100 tokens). Special tokens are removed when decoding.
    """
    prompt = "translate English to German: " + sentence
    encoded = tokenizer_2b.encode(prompt, return_tensors="pt").to(DEVICE)
    generated = model_2b.generate(encoded, max_length=100, num_beams=4, early_stopping=True)
    # generate() returns a batch; there is a single input, so take row 0.
    return tokenizer_2b.decode(generated[0], skip_special_tokens=True)
    
def test_metrics_2b(data):
    """Translate every English sentence in ``data`` and print evaluation scores.

    Predictions from ``translate_2b`` are scored against the German
    references with BLEU, METEOR and BERTScore; the BERTScore precision,
    recall and F1 lists are averaged before printing.
    """
    predictions, references = [], []
    for pair in data:
        translation = pair['translation']
        predictions.append(translate_2b(translation['en']))
        # One reference per prediction, wrapped in a list.
        references.append([translation['de']])

    bleu_score = bleu.compute(predictions=predictions, references=references)
    meteor_score = meteor_metric.compute(predictions=predictions, references=references)
    bertscore_score = bertscore_metric.compute(predictions=predictions, references=references, lang="de")

    bs_pre, bs_recall, bs_f1 = (
        np.mean(np.array(bertscore_score[key]))
        for key in ('precision', 'recall', 'f1')
    )

    print(f'BLEU Score: {bleu_score}')
    print(f'METEOR Score: {meteor_score}')
    print(f'BERTScore-precision: {bs_pre}')
    print(f'BERTScore-recall: {bs_recall}')
    print(f'BERTScore-f1: {bs_f1}')

In [None]:
# Put the model in eval mode, then report metrics on the validation and test
# splits in turn.
model_2b.eval()
for split_title, split_data in (("2B-Val Metrics", task2_val_data),
                                ("2B-Test Metrics", task2_test_data)):
    print(split_title)
    test_metrics_2b(split_data)