In [1]:
import unicodedata as ucd
import re
import random
import pandas as pd
import collections
import torch
import torch.nn as nn
from torch import optim
from torch.utils import data

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
def train_test_split(lang1, lang2, ratio=0.8):
    """Randomly split two parallel corpora into training and validation parts.

    Args:
        lang1: Indexable sequence of source-side items (list, NumPy array, ...).
        lang2: Indexable sequence aligned with ``lang1`` (same positions).
        ratio: Fraction of the items assigned to the training part.

    Returns:
        ``(train_1, train_2, val_1, val_2)`` as four lists. Items keep their
        original relative order and the pairing between both corpora.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``ratio`` leads to
            a sample size outside ``[0, len(lang1)]``.
    """
    instances = len(lang1)
    train_size = int(instances * ratio)
    # A set makes each membership test O(1); scanning the sampled list for
    # every index made the whole split quadratic in the corpus size.
    train_indices = set(random.sample(range(instances), train_size))
    train_hin, train_eng = [], []
    val_hin, val_eng = [], []
    for index in range(instances):
        if index in train_indices:
            train_hin.append(lang1[index])
            train_eng.append(lang2[index])
        else:
            val_hin.append(lang1[index])
            val_eng.append(lang2[index])
    return train_hin, train_eng, val_hin, val_eng

In [4]:
def tokenize(s, lang):
    """Clean a sentence and split it on single spaces.

    For ``lang == 'hin'`` every run of characters outside whitespace and the
    ranges U+0900-U+0963 / U+0970-U+097F is replaced by one space. For
    ``lang == 'eng'`` the same is done for everything outside whitespace and
    ASCII letters, and the text is lower-cased. Runs of spaces are then
    collapsed. Any other ``lang`` leaves the text untouched.

    Returns:
        The result of ``str.split(' ')``, so empty strings can appear (for
        example at the edges); callers are expected to filter them.
    """
    disallowed = {
        'hin': r'[^\s\u0900-\u0963\u0970-\u097f]+',
        'eng': r'[^\s\u0041-\u005a\u0061-\u007a]+',
    }
    pattern = disallowed.get(lang)
    if pattern is not None:
        s = re.sub(pattern, ' ', s)
        s = re.sub(r' +', ' ', s)
        if lang == 'eng':
            s = s.lower()
    return s.split(' ')

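**Reference:** The vocabulary and sequence-to-sequence helper code below is adapted from *Dive into Deep Learning* (https://d2l.ai/).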

In [5]:
class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Index 0 is always ``'<unk>'``. The reserved tokens follow in the order
    given, then the corpus tokens whose frequency is at least ``min_freq``,
    ordered by descending frequency.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: Flat list of tokens or list of token lists (counted with
                ``count_corpus``). ``None`` is treated as an empty corpus.
            min_freq: Minimum corpus frequency for a token to get an index.
            reserved_tokens: List of special tokens placed right after
                ``'<unk>'``. ``None`` means no reserved tokens.
        """
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)
        # (token, frequency) pairs, most frequent first.
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        self.unk, uniq_tokens = 0, ['<unk>'] + reserved_tokens
        # Counter keys are already unique, so a corpus token can only clash
        # with a special token. Checking against a set keeps construction
        # linear instead of scanning the growing list for every token.
        special_tokens = set(uniq_tokens)
        uniq_tokens += [
            token for token, freq in self.token_freqs
            if freq >= min_freq and token not in special_tokens]
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of indexed tokens (special tokens included)."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a list/tuple of tokens, to indices.

        Tokens without an index map to ``self.unk``. A list/tuple input
        yields a list (nested inputs are handled recursively).
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a list/tuple of indices, back to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]

def count_corpus(tokens):
    """Count token occurrences.

    ``tokens`` may be a flat list of tokens or a list of token lists; the
    nested form (detected from the first element, or an empty input) is
    flattened before counting.

    Returns:
        A ``collections.Counter`` mapping each token to its frequency.
    """
    needs_flattening = len(tokens) == 0 or isinstance(tokens[0], list)
    if needs_flattening:
        flat = []
        for line in tokens:
            flat.extend(line)
        tokens = flat
    return collections.Counter(tokens)

In [6]:
def truncate_pad(line, num_steps, padding_token):
    """Return ``line`` cut or right-padded to exactly ``num_steps`` items.

    Longer lists are truncated; shorter ones are extended with
    ``padding_token``. A new list is returned in both cases.
    """
    missing = num_steps - len(line)
    if missing < 0:
        return line[:num_steps]
    return line + [padding_token] * missing

In [7]:
def build_array(lines, vocab, num_steps):
    """Turn tokenised sentences into a fixed-width index tensor.

    Each sentence is mapped through ``vocab``, gets ``'<eos>'`` appended and
    is then truncated or padded with ``'<pad>'`` to ``num_steps`` entries.

    Returns:
        ``(array, valid_len)``: the index tensor with one row per sentence
        and, per row, the int32 count of entries that are not ``'<pad>'``.
    """
    eos_id = vocab['<eos>']
    pad_id = vocab['<pad>']
    rows = []
    for sentence in lines:
        ids = vocab[sentence] + [eos_id]
        rows.append(truncate_pad(ids, num_steps, pad_id))
    array = torch.tensor(rows)
    valid_len = (array != pad_id).type(torch.int32).sum(1)
    return array, valid_len

In [8]:
def load_array(data_arrays, batch_size, is_train=True):
    """Wrap aligned tensors in a ``DataLoader``.

    Args:
        data_arrays: Iterable of tensors sharing the first dimension.
        batch_size: Number of rows per batch.
        is_train: When true the loader shuffles the rows.
    """
    dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(dataset, batch_size=batch_size, shuffle=is_train)
    return loader

In [9]:
class EpochLoss:
    """Fixed-size set of running float totals, addressed by position."""

    def __init__(self, n):
        """Create ``n`` totals, all starting at 0.0."""
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each argument (converted with ``float``) to the total at the same position."""
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def reset(self):
        """Set every total back to 0.0."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the total stored at position ``idx``."""
        return self.data[idx]

In [10]:
class Encoder(nn.Module):
    """Embedding + multi-layer LSTM encoder."""

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, **kwargs):
        """Create the layers.

        Args:
            input_dim: Number of rows in the embedding table.
            emb_dim: Embedding width (LSTM input size).
            hid_dim: LSTM hidden size.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability used on the embeddings and between
                LSTM layers.
            **kwargs: Accepted and ignored.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim,
                           num_layers=n_layers, dropout=dropout)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, *args):
        """Encode a batch of index sequences.

        ``input`` is embedded, its first two axes are swapped (the LSTM is
        not batch-first) and the result is run through the LSTM. Extra
        positional arguments are ignored.

        Returns:
            ``(outputs, (hidden, cell))`` exactly as produced by the LSTM.
        """
        token_vectors = self.embedding(input)
        time_major = self.dropout(token_vectors).permute(1, 0, 2)
        outputs, final_state = self.rnn(time_major)
        hidden, cell = final_state
        return outputs, (hidden, cell)

In [11]:
def masked_softmax(X, valid_lens):
    """Softmax over the last axis, ignoring positions beyond ``valid_lens``.

    With ``valid_lens`` set to ``None`` this is a plain softmax. Otherwise a
    1-D ``valid_lens`` is repeated ``X.shape[1]`` times per entry and any
    other shape is flattened, so that there is one length per row of ``X``
    viewed as 2-D. Positions at or beyond the length are overwritten with
    -1e6 by ``sequence_mask`` before the softmax.

    Note: ``sequence_mask`` writes in place, and ``reshape`` may return a
    view, so the caller's ``X`` can be modified.
    """
    if valid_lens is None:
        return nn.functional.softmax(X, dim=-1)

    original_shape = X.shape
    if valid_lens.dim() == 1:
        row_lens = torch.repeat_interleave(valid_lens, original_shape[1])
    else:
        row_lens = valid_lens.reshape(-1)
    flat_scores = X.reshape(-1, original_shape[-1])
    masked_scores = sequence_mask(flat_scores, row_lens, value=-1e6)
    return nn.functional.softmax(masked_scores.reshape(original_shape), dim=-1)

In [12]:
class AdditiveAttention(nn.Module):
    """Additive attention: scores are ``w_v(tanh(W_q q + W_k k))``."""

    def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs):
        """Create the bias-free projections and the dropout layer.

        Args:
            key_size: Feature size of the keys.
            query_size: Feature size of the queries.
            num_hiddens: Width of the shared projection space.
            dropout: Dropout probability applied to the attention weights.
            **kwargs: Forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.W_k = nn.Linear(key_size, num_hiddens, bias=False)
        self.W_q = nn.Linear(query_size, num_hiddens, bias=False)
        self.w_v = nn.Linear(num_hiddens, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, valid_lens):
        """Return the attention-weighted combination of ``values``.

        The weights from the masked softmax are stored on
        ``self.attention_weights`` before dropout is applied.
        """
        projected_queries = self.W_q(queries)
        projected_keys = self.W_k(keys)
        # Broadcast so every query is summed with every key.
        pairwise = projected_queries.unsqueeze(2) + projected_keys.unsqueeze(1)
        scores = self.w_v(torch.tanh(pairwise)).squeeze(-1)
        self.attention_weights = masked_softmax(scores, valid_lens)
        dropped_weights = self.dropout(self.attention_weights)
        return torch.bmm(dropped_weights, values)

In [13]:
class AttentionDecoder(nn.Module):
    """LSTM decoder that attends over the encoder outputs at every step."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout=0, **kwargs):
        """Create the attention, embedding, LSTM, output and dropout layers.

        Args:
            output_dim: Target vocabulary size (embedding rows and logits).
            emb_dim: Target embedding width.
            hid_dim: LSTM hidden size; also used for all attention sizes.
            n_layers: Number of stacked LSTM layers.
            dropout: Dropout probability shared by all dropout sites.
            **kwargs: Forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.attention = AdditiveAttention(hid_dim, hid_dim, hid_dim, dropout)
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim + hid_dim, hid_dim, n_layers, dropout=dropout)
        self.dense = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def init_state(self, enc_outputs, enc_valid_lens, *args):
        """Build the decoder state from the encoder result.

        Returns:
            ``[outputs with its first two axes swapped, (hidden, cell),
            enc_valid_lens]``.
        """
        outputs, (hidden, cell) = enc_outputs
        batch_major_outputs = outputs.permute(1, 0, 2)
        return [batch_major_outputs, (hidden, cell), enc_valid_lens]

    def forward(self, X, state):
        """Decode ``X`` one step at a time.

        At each step the top-layer hidden state queries the encoder outputs,
        and the resulting context is concatenated with the step embedding
        before it enters the LSTM. The per-step attention weights are kept
        in ``self._attention_weights``.

        Returns:
            ``(logits, state)`` with the state in the ``init_state`` layout
            and carrying the updated ``(hidden, cell)``.
        """
        enc_outputs, (hidden, cell), enc_valid_lens = state
        embedded_steps = self.dropout(self.embedding(X)).permute(1, 0, 2)
        step_outputs = []
        self._attention_weights = []
        for step_embedding in embedded_steps:
            query = hidden[-1].unsqueeze(1)
            context = self.attention(query, enc_outputs, enc_outputs, enc_valid_lens)
            rnn_input = torch.cat((context, step_embedding.unsqueeze(1)), dim=-1)
            out, (hidden, cell) = self.rnn(rnn_input.permute(1, 0, 2), (hidden, cell))
            step_outputs.append(out)
            self._attention_weights.append(self.attention.attention_weights)
        logits = self.dense(torch.cat(step_outputs, dim=0))
        new_state = [enc_outputs, (hidden, cell), enc_valid_lens]
        return logits.permute(1, 0, 2), new_state

    @property
    def attention_weights(self):
        """Attention weights recorded during the most recent ``forward`` call."""
        return self._attention_weights

In [14]:
class Seq2Seq(nn.Module):
    """Thin wrapper chaining an encoder and a decoder."""

    def __init__(self, encoder, decoder, device, **kwargs):
        """Store the two sub-networks and the device handle.

        ``**kwargs`` is forwarded to ``nn.Module.__init__``.
        """
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, *args):
        """Encode ``src``, initialise the decoder state and decode ``trg``.

        Extra positional arguments are passed to both the encoder and
        ``decoder.init_state``. Returns whatever the decoder returns.
        """
        encoded = self.encoder(src, *args)
        decoder_state = self.decoder.init_state(encoded, *args)
        return self.decoder(trg, decoder_state)

In [15]:
def train(model, data_iter, lr, num_epochs, eng_vocab, device):
    """Train ``model`` with teacher forcing and print the loss per epoch.

    The module-level ``optimizer`` and ``criterion`` objects are used for the
    update step and the loss; they must exist before this is called.

    Args:
        model: Network called as ``model(X, dec_input, X_valid_len)`` and
            returning ``(Y_hat, state)``.
        data_iter: Iterable of ``(X, X_valid_len, Y, Y_valid_len)`` batches.
        lr: Not used inside this function (the learning rate lives in the
            module-level ``optimizer``); kept for interface compatibility.
        num_epochs: Number of passes over ``data_iter``.
        eng_vocab: Target vocabulary; supplies the ``'<sos>'`` index.
        device: Device the batches are moved to.
    """
    model.train()
    sos_id = eng_vocab['<sos>']
    epoch_loss = None
    for epoch in range(num_epochs):
        metric = EpochLoss(2)  # [summed loss, number of target tokens]
        for batch in data_iter:
            # Clear gradients left over from the previous batch; without
            # this every backward() adds onto the old gradients.
            optimizer.zero_grad()
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]
            sos = torch.tensor([sos_id] * Y.shape[0],
                               device=device).reshape(-1, 1)
            # Teacher forcing: the decoder sees <sos> plus the target
            # shifted right by one position.
            dec_input = torch.cat([sos, Y[:, :-1]], 1)
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = criterion(Y_hat, Y, Y_valid_len)
            l.sum().backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                metric.add(l.sum(), num_tokens)
        epoch_loss = metric[0] / metric[1]
        print(epoch + 1, ' Loss: ', epoch_loss)
    # With zero epochs there is no loss to report.
    if epoch_loss is not None:
        print(f'Final Loss: {epoch_loss:.4f}')

In [16]:
def sequence_mask(X, valid_len, value=0):
    """Overwrite, in place, the entries of each row beyond its valid length.

    For row ``i`` every column index ``>= valid_len[i]`` is set to ``value``.
    ``X`` itself is modified and returned.
    """
    num_columns = X.shape[1]
    positions = torch.arange(num_columns, dtype=torch.float32, device=X.device)
    keep = positions.unsqueeze(0) < valid_len.unsqueeze(1)
    X[torch.logical_not(keep)] = value
    return X

In [17]:
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    """Cross-entropy loss that zeroes the positions past each valid length."""

    def forward(self, pred, label, valid_len):
        """Return one loss value per sequence.

        The unreduced cross-entropy is multiplied by a 0/1 mask built with
        ``sequence_mask`` and then averaged over axis 1 (masked positions
        count towards the denominator).
        """
        self.reduction = 'none'
        mask = sequence_mask(torch.ones_like(label), valid_len)
        # CrossEntropyLoss expects the class axis second.
        per_token_loss = super().forward(pred.permute(0, 2, 1), label)
        return (per_token_loss * mask).mean(dim=1)

In [None]:
# Disabled code kept as a bare string literal (nothing here is executed): it
# reads train.csv, takes columns 1 and 2, and splits them with train_test_split.
# The following cell loads already-split text files instead.
# NOTE(review): presumably those files were produced by this split — confirm.
'''df = pd.read_csv('/content/drive/MyDrive/AssignmentNLP/train/train.csv')
hindi = df.iloc[:, 1].values
english = df.iloc[:, 2].values
train_hin, train_eng, val_hin, val_eng = train_test_split(hindi, english)'''

In [19]:
def _read_lines(path):
    """Return the lines of a text file without their line terminators.

    The file is decoded as UTF-8 explicitly so the Devanagari text does not
    depend on the platform's default encoding.
    NOTE(review): assumes the corpus files are UTF-8 encoded — confirm.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return f.read().splitlines()


# Parallel corpora: line i of a *_hin file pairs with line i of the *_eng file.
train_hin = _read_lines('/content/drive/MyDrive/AssignmentNLP/Trial1/train_hin.txt')
train_eng = _read_lines('/content/drive/MyDrive/AssignmentNLP/Trial1/train_eng.txt')
val_hin = _read_lines('/content/drive/MyDrive/AssignmentNLP/Trial1/val_hin.txt')
val_eng = _read_lines('/content/drive/MyDrive/AssignmentNLP/Trial1/val_eng.txt')

In [20]:
# Tokenise the training pairs, dropping the empty-string tokens that
# tokenize() can emit. A pair is skipped when its Hindi side has no tokens
# left; the English side is not checked.
hin = []
eng = []
for hin_sentence, eng_sentence in zip(train_hin, train_eng):
    hin_tokens = [tok for tok in tokenize(hin_sentence, 'hin') if tok != '']
    if not hin_tokens:
        continue
    eng_tokens = [tok for tok in tokenize(eng_sentence, 'eng') if tok != '']
    hin.append(hin_tokens)
    eng.append(eng_tokens)

In [21]:
# One vocabulary per language, built with identical settings: tokens seen
# fewer than twice are left out, and the same three special tokens are reserved.
hin_vocab, eng_vocab = (
    Vocab(corpus, min_freq=2, reserved_tokens=['<pad>', '<sos>', '<eos>'])
    for corpus in (hin, eng)
)

In [22]:
# Model and training configuration used by the cells below.
INPUT_DIM = len(hin_vocab)  # source (Hindi) vocabulary size -> Encoder input_dim
OUTPUT_DIM = len(eng_vocab)  # target (English) vocabulary size -> AttentionDecoder output_dim
ENC_EMB_DIM = 50  # encoder embedding width
DEC_EMB_DIM = 50  # decoder embedding width
HID_DIM = 128  # LSTM hidden size for both encoder and decoder
N_LAYERS = 2  # stacked LSTM layers for both encoder and decoder
ENC_DROPOUT = 0.25  # dropout probability passed to the Encoder
DEC_DROPOUT = 0.25  # dropout probability passed to the AttentionDecoder
LR = 0.001  # learning rate given to the Adam optimizer
NUM_EPOCHS = 300  # number of epochs passed to train()
NUM_STEPS = 15  # fixed sequence length for build_array and the decoding step limit
BATCH_SIZE = 64  # rows per DataLoader batch

In [23]:
# Index tensors plus per-row valid lengths for both languages, bundled into
# a shuffling DataLoader for training.
hin_array, hin_valid_len = build_array(hin, hin_vocab, num_steps=NUM_STEPS)
eng_array, eng_valid_len = build_array(eng, eng_vocab, num_steps=NUM_STEPS)
data_arrays = (hin_array, hin_valid_len, eng_array, eng_valid_len)
data_iter = load_array(data_arrays, batch_size=BATCH_SIZE, is_train=True)

In [24]:
# Assemble the encoder, the attention decoder and the wrapping Seq2Seq
# model, then move the whole model to the selected device.
enc = Encoder(
    input_dim=INPUT_DIM,
    emb_dim=ENC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=ENC_DROPOUT,
)
dec = AttentionDecoder(
    output_dim=OUTPUT_DIM,
    emb_dim=DEC_EMB_DIM,
    hid_dim=HID_DIM,
    n_layers=N_LAYERS,
    dropout=DEC_DROPOUT,
)
model = Seq2Seq(encoder=enc, decoder=dec, device=device)
model = model.to(device)

In [None]:
def init_weights(m):
    """Re-draw every parameter reachable from ``m`` from U(-0.08, 0.08).

    The iteration covers the parameters of ``m`` and of all its submodules.
    """
    low, high = -0.08, 0.08
    for param in m.parameters():
        nn.init.uniform_(param.data, low, high)

# Run init_weights over the model through nn.Module.apply, then call
# .to(device) on the result. As the last expression of the cell, its value
# is what the notebook displays.
model.apply(init_weights).to(device)

In [26]:
# Adam over all model parameters with learning rate LR. train() reads this
# module-level name directly, so it must be defined before training starts.
optimizer = optim.Adam(model.parameters(), lr=LR)

In [27]:
# Length-masked cross-entropy loss. train() reads this module-level name
# directly, so it must be defined before training starts.
criterion = MaskedSoftmaxCELoss()

In [None]:
# Train for NUM_EPOCHS epochs; train() prints the loss after every epoch.
train(model, data_iter, LR, NUM_EPOCHS, eng_vocab, device)

In [None]:
# Persist the trained weights (state_dict only, not the full module object).
# NOTE(review): the corpora are read from .../Trial1/ while this writes to
# .../Trial12/ — confirm the output directory is intended and exists.
torch.save(model.state_dict(), '/content/drive/MyDrive/AssignmentNLP/Trial12/train_model.pt')

In [None]:
def predict_seq2seq(model, src_sentence, src_vocab, tgt_vocab, num_steps, device, save_attention_weights=False):
    """Translate one Hindi sentence with greedy decoding.

    Args:
        model: Trained model exposing ``encoder`` and ``decoder``.
        src_sentence: Raw source sentence (string).
        src_vocab: Source vocabulary providing ``'<eos>'`` and ``'<pad>'``.
        tgt_vocab: Target vocabulary providing ``'<sos>'`` and ``'<eos>'``.
        num_steps: Source length after truncation/padding and the maximum
            number of decoding steps.
        device: Device the input tensors are created on.
        save_attention_weights: When true, collect the decoder's attention
            weights after every step.

    Returns:
        ``(translation, attention_weight_seq)``: the predicted tokens joined
        by single spaces, and the collected attention weights (empty unless
        ``save_attention_weights`` is true).
    """
    model.eval()
    # Same preprocessing as the training data: tokenise, drop empty tokens.
    src_words = [tok for tok in tokenize(src_sentence, 'hin') if tok != '']
    src_tokens = src_vocab[src_words] + [src_vocab['<eos>']]
    # Clamp to num_steps so the length matches what survives truncation.
    enc_valid_len = torch.tensor([min(len(src_tokens), num_steps)], device=device)
    src_tokens = truncate_pad(src_tokens, num_steps, src_vocab['<pad>'])
    enc_X = torch.unsqueeze(torch.tensor(src_tokens, dtype=torch.long, device=device), dim=0)
    output_seq, attention_weight_seq = [], []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        enc_outputs = model.encoder(enc_X, enc_valid_len)
        state = model.decoder.init_state(enc_outputs, enc_valid_len)
        # Start from '<sos>', the start token reserved in the vocabularies
        # and fed to the decoder during training. '<bos>' is not in the
        # vocabulary and would silently resolve to '<unk>'.
        dec_X = torch.unsqueeze(torch.tensor([tgt_vocab['<sos>']], dtype=torch.long, device=device), dim=0)
        for _ in range(num_steps):
            Y, state = model.decoder(dec_X, state)
            # Greedy choice; the prediction becomes the next decoder input.
            dec_X = Y.argmax(dim=2)
            pred = dec_X.squeeze(dim=0).type(torch.int32).item()
            if save_attention_weights:
                attention_weight_seq.append(model.decoder.attention_weights)
            if pred == tgt_vocab['<eos>']:
                break
            output_seq.append(pred)
    return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq

In [None]:
# Translate every validation sentence, keeping only the text part of each
# (translation, attention_weights) result.
val_ans = [
    predict_seq2seq(model, sentence, hin_vocab, eng_vocab, NUM_STEPS, device)[0]
    for sentence in val_hin
]

In [None]:
# One translation per line, without a trailing newline. Joining also copes
# with an empty list, where indexing val_ans[-1] raised IndexError.
with open('/content/drive/MyDrive/AssignmentNLP/Trial12/val_ans.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(val_ans))

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -U nltk

In [None]:
import nltk
import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

# Average sentence-level BLEU and METEOR of the validation translations
# against the reference English sentences.
references = val_eng
hypotheses = val_ans

total_num = len(references)
total_bleu_scores = 0
total_meteor_scores = 0
for i in range(total_num):
    total_bleu_scores += sentence_bleu([references[i].split(" ")], hypotheses[i].split(" "))
    # Current NLTK releases (installed by the upgrade above) require
    # pre-tokenized input for METEOR and reject raw strings.
    total_meteor_scores += single_meteor_score(references[i].split(), hypotheses[i].split())

# Guard the averages so an empty validation set reports 0.0 instead of
# raising ZeroDivisionError.
bleu_result = total_bleu_scores / total_num if total_num else 0.0
meteor_result = total_meteor_scores / total_num if total_num else 0.0

print("bleu score: ",bleu_result)
print("meteor score: ",meteor_result)

In [None]:
import pandas as pd

# Read the statements to translate; the sentences are taken from the third
# column (position 2) of the CSV.
statements_path = '/content/drive/MyDrive/AssignmentNLP/Week2/hindistatements.csv'
data_frame = pd.read_csv(statements_path)
statement_column = data_frame.iloc[:, 2]
hindi_st = list(statement_column.values)

In [None]:
# Translate every test statement, keeping only the text part of each
# (translation, attention_weights) result.
ans = [
    predict_seq2seq(model, sentence, hin_vocab, eng_vocab, NUM_STEPS, device)[0]
    for sentence in hindi_st
]

In [None]:
# One translation per line, without a trailing newline. Joining also copes
# with an empty list, where indexing ans[-1] raised IndexError.
with open('/content/drive/MyDrive/AssignmentNLP/Trial12/answer.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(ans))