In [None]:
import re
import csv
import torch
import random
import collections
import torch.nn as nn
from torch import optim
from torch.utils import data

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# **Required Functions**

In [None]:
def train_test_split(lang1, lang2, ratio=0.8):
    '''Split two parallel lists into train and validation parts.

    lang1, lang2: parallel sequences (same length, item i of one pairs with
        item i of the other).
    ratio: fraction of pairs that goes to the training set. With ratio == 1
        everything is used for training and the validation lists are empty.

    Returns (train_lang1, train_lang2, val_lang1, val_lang2). Within each
    part the original order of the pairs is preserved.
    '''
    if ratio == 1:
        return lang1[:], lang2[:], [], []

    instances = len(lang1)
    train_size = int(instances * ratio)

    # Randomly pick the training indices. A set gives O(1) membership tests,
    # so the partition below is linear rather than quadratic in corpus size.
    train_indices = set(random.sample(range(instances), train_size))

    train_hin, train_eng, val_hin, val_eng = [], [], [], []
    for index in range(instances):
        if index in train_indices:
            train_hin.append(lang1[index])
            train_eng.append(lang2[index])
        else:
            val_hin.append(lang1[index])
            val_eng.append(lang2[index])
    return train_hin, train_eng, val_hin, val_eng

In [None]:
def tokenize(s, lang):
    '''Clean a sentence and split it into word tokens.

    s: the raw sentence.
    lang: 'hin' keeps only Devanagari letters/signs, 'eng' keeps only ASCII
        letters and lowercases the result. Any other value skips the
        cleaning step and only splits.

    Returns a list of non-empty tokens. Splitting is done on any run of
    whitespace (spaces, tabs, newlines), so whitespace never ends up inside
    a token and no empty strings are produced.
    '''
    if lang == 'hin':
        # replace everything except hindi alphabets and whitespace
        s = re.sub(r'[^\s\u0900-\u0963\u0970-\u097f]+', ' ', s)
    elif lang == 'eng':
        # replace everything except english alphabets and whitespace,
        # then convert to lowercase
        s = re.sub(r'[^\s\u0041-\u005a\u0061-\u007a]+', ' ', s).lower()

    # str.split() with no argument splits on runs of any whitespace and
    # drops leading/trailing empties.
    return s.split()

**Reference:** https://d2l.ai/

In [None]:
class Vocab:
    '''Bidirectional mapping between tokens and integer ids.

    Id 0 is always '<unk>', followed by the reserved tokens in the order
    given, followed by corpus tokens sorted by descending frequency.
    '''

    def __init__(self, tokens, min_freq, reserved_tokens):
        # (token, count) pairs, most frequent first
        frequencies = count_corpus(tokens)
        self.token_freqs = sorted(
            frequencies.items(), key=lambda pair: pair[1], reverse=True)

        self.unk = 0
        specials = ['<unk>'] + reserved_tokens

        # keep corpus tokens whose frequency is >= min_freq
        frequent = [
            tok for tok, count in self.token_freqs
            if count >= min_freq and tok not in specials]

        # position in the list is the token id
        self.idx_to_token = specials + frequent
        self.token_to_idx = {
            tok: position for position, tok in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        '''Map a token (or a nested list/tuple of tokens) to id(s); unknown tokens map to self.unk.'''
        if isinstance(tokens, (list, tuple)):
            return [self.__getitem__(tok) for tok in tokens]
        return self.token_to_idx.get(tokens, self.unk)

    def to_tokens(self, indices):
        '''Map an id (or a list/tuple of ids) back to token(s).'''
        if isinstance(indices, (list, tuple)):
            return [self.idx_to_token[position] for position in indices]
        return self.idx_to_token[indices]

def count_corpus(tokens):
    '''Return a Counter of token frequencies.

    Accepts either a flat list of tokens or a list of token lists; an empty
    input is treated as the nested form and yields an empty Counter.
    '''
    already_flat = len(tokens) > 0 and not isinstance(tokens[0], list)
    if already_flat:
        return collections.Counter(tokens)

    # flatten the list of lines into one list of tokens
    flattened = []
    for line in tokens:
        flattened.extend(line)
    return collections.Counter(flattened)

In [None]:
def truncate_pad(line, num_steps, padding_token):
    '''Return `line` cut or right-padded so that its length is exactly num_steps.'''
    shortfall = num_steps - len(line)
    if shortfall < 0:
        # too long: keep the first num_steps items
        return line[:num_steps]
    # short (or exact): append as many padding tokens as are missing
    return line + [padding_token] * shortfall

In [None]:
def build_array(lines, vocab, num_steps):
    '''Turn tokenised sentences into a fixed-width id tensor.

    Each sentence is mapped to ids, gets <eos> appended, and is then
    truncated or padded with <pad> to num_steps.

    Returns (array, valid_len) where valid_len counts the non-<pad>
    entries of each row.
    '''
    eos_id = vocab['<eos>']
    pad_id = vocab['<pad>']
    rows = [
        truncate_pad(vocab[sentence] + [eos_id], num_steps, pad_id)
        for sentence in lines]
    array = torch.tensor(rows)
    not_padding = (array != pad_id).type(torch.int32)
    valid_len = not_padding.sum(1)
    return array, valid_len

In [None]:
def load_array(data_arrays, batch_size, is_train=True):
    '''Wrap aligned tensors in a DataLoader; batches are shuffled only when is_train is true.'''
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(
        tensor_dataset, batch_size=batch_size, shuffle=is_train)
    return loader

In [None]:
class EpochLoss:
    '''Running sums of n quantities (here: epoch loss and number of tokens).'''

    def __init__(self, n):
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        # add each argument, converted to float, to the matching running sum
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def __getitem__(self, idx):
        return self.data[idx]

In [None]:
def sequence_mask(X, valid_len, value=0):
    '''Overwrite, in place, every position of X beyond each row's valid length.

    X: tensor whose dim 1 is the sequence axis; it is modified and returned.
    valid_len: one length per row of X.
    value: what to write into the masked-out positions.
    '''
    positions = torch.arange(X.size(1), dtype=torch.float32, device=X.device)
    # keep[i, j] is True while position j lies inside row i's valid length
    keep = positions.unsqueeze(0) < valid_len.unsqueeze(1)
    X[~keep] = value
    return X

In [None]:
class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    '''Cross-entropy over sequences in which padded positions contribute zero loss.'''

    def forward(self, pred, label, valid_len):
        # 'pred': ('batch_size', 'num_steps', 'vocab_size')
        # 'label': ('batch_size', 'num_steps')
        # 'valid_len': ('batch_size', )
        self.reduction = 'none'  # the per-token losses are needed for weighting
        # 1 for real tokens, 0 for padding
        token_weights = sequence_mask(torch.ones_like(label), valid_len)
        # CrossEntropyLoss wants the class axis in dim 1
        per_token_loss = super().forward(pred.permute(0, 2, 1), label)
        # average over the step axis -> one value per sequence
        return (per_token_loss * token_weights).mean(dim=1)

In [None]:
def masked_softmax(X, valid_len):
    '''Softmax over the last axis with positions beyond valid_len masked out.'''
    # 'X': ('batch_size', 1, 'num_steps')
    # 'valid_len': ('batch_size', )
    original_shape = X.shape
    # one length per flattened row (each batch entry repeated for every query)
    row_lengths = torch.repeat_interleave(valid_len, original_shape[1])
    flat_scores = X.reshape(-1, original_shape[-1])
    # a large negative score becomes ~0 after the softmax
    flat_scores = sequence_mask(flat_scores, row_lengths, value=-1e6)
    return nn.functional.softmax(flat_scores.reshape(original_shape), dim=-1)

### **Bidirectional GRU Encoder**
Check the inline comments for the tensor shapes at each step.

In [None]:
class Encoder(nn.Module):
    '''Bidirectional multi-layer GRU encoder.

    The final forward and backward hidden states of every layer are
    concatenated and projected back to hid_dim, so the result can seed a
    unidirectional decoder with the same number of layers.
    '''

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout, **kwargs):
        super().__init__()
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        # token id -> dense vector
        self.embedding = nn.Embedding(input_dim, emb_dim)
        # bidirectional GRU (time-major input)
        self.rnn = nn.GRU(emb_dim, hid_dim, n_layers,
                          dropout=dropout, bidirectional=True)
        # merges the two directions back into hid_dim
        self.fc = nn.Linear(hid_dim * 2, hid_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, *args):
        # 'input': ('batch_size', 'num_steps')
        # 'token_vectors': ('batch_size', 'num_steps', 'emb_dim')
        token_vectors = self.dropout(self.embedding(input))
        # the GRU expects 'num_steps' as the first dim
        output, hidden = self.rnn(token_vectors.permute(1, 0, 2))
        # hidden interleaves directions: even rows forward, odd rows backward
        forward_final = hidden[0::2]
        backward_final = hidden[1::2]
        both_directions = torch.cat((forward_final, backward_final), dim=2)
        hidden = torch.tanh(self.fc(both_directions))
        # 'output': ('num_steps', 'batch_size', 2*'hid_dim')
        # 'hidden': ('n_layers', 'batch_size', 'hid_dim')
        return output, hidden

### **GRU Decoder with Bahdanau Attention**

In [None]:
class BahdanauAttention(nn.Module):
    '''Additive (Bahdanau) attention.

    Keys are expected to have 2 * key_size features (the concatenated
    outputs of a bidirectional encoder).
    '''

    def __init__(self, key_size, query_size, num_hiddens, dropout, **kwargs):
        super().__init__()
        self.W_k = nn.Linear(2 * key_size, num_hiddens, bias=False)
        self.W_q = nn.Linear(query_size, num_hiddens, bias=False)
        self.w_v = nn.Linear(num_hiddens, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, queries, keys, values, valid_lens):
        projected_queries = self.W_q(queries)
        projected_keys = self.W_k(keys)
        # broadcast-add so every query is combined with every key
        features = torch.tanh(
            projected_queries.unsqueeze(2) + projected_keys.unsqueeze(1))
        # collapse the hidden axis to a single score per (query, key) pair
        scores = self.w_v(features).squeeze(-1)
        # the weights are kept on the module so they can be inspected afterwards
        self.attention_weights = masked_softmax(scores, valid_lens)
        return torch.bmm(self.dropout(self.attention_weights), values)

In [None]:
class AttentionDecoder(nn.Module):
    '''GRU decoder that attends over the encoder outputs at every step.'''

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout=0, **kwargs):
        super().__init__(**kwargs)
        # Bahdanau attention over the encoder outputs
        self.attention = BahdanauAttention(hid_dim, hid_dim, hid_dim, dropout)
        # target token id -> dense vector
        self.embedding = nn.Embedding(output_dim, emb_dim)
        # each step consumes [context ; embedded token]
        self.rnn = nn.GRU(emb_dim + 2 * hid_dim, hid_dim, n_layers, dropout=dropout)
        # hidden state -> vocabulary scores
        self.dense = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def init_state(self, enc_output, enc_valid_len, *args):
        # 'enc_steps': ('num_steps', 'batch_size', 2*'hid_dim')
        # 'enc_hidden': ('n_layers', 'batch_size', 'hid_dim')
        enc_steps, enc_hidden = enc_output
        # batch-first encoder outputs are what the attention consumes
        return [enc_steps.permute(1, 0, 2), enc_hidden, enc_valid_len]

    def forward(self, output, state):
        # 'enc_outputs': ('batch_size', 'num_steps', 2*'hid_dim')
        # 'hidden': ('n_layers', 'batch_size', 'hid_dim')
        # 'enc_valid_lens': ('batch_size', )
        enc_outputs, hidden, enc_valid_lens = state
        # 'embedded': ('num_steps', 'batch_size', 'emb_dim')
        embedded = self.dropout(self.embedding(output)).permute(1, 0, 2)
        step_outputs = []
        for step_embedding in embedded:
            # query with the top layer's hidden state: ('batch_size', 1, 'hid_dim')
            query = hidden[-1].unsqueeze(1)
            # 'context': ('batch_size', 1, 2*'hid_dim')
            context = self.attention(query, enc_outputs, enc_outputs, enc_valid_lens)
            # join context and token, then reshape to
            # (1, 'batch_size', 'emb_dim' + 2*'hid_dim') for the GRU
            rnn_input = torch.cat(
                (context, step_embedding.unsqueeze(1)), dim=-1).permute(1, 0, 2)
            out, hidden = self.rnn(rnn_input, hidden)
            step_outputs.append(out)
        logits = self.dense(torch.cat(step_outputs, dim=0))
        # returned scores: ('batch_size', 'num_steps', 'vocab_size')
        return logits.permute(1, 0, 2), [enc_outputs, hidden, enc_valid_lens]

### **Seq2Seq Model**

In [None]:
class Seq2Seq(nn.Module):
    '''Encoder-decoder wrapper: encode the source, initialise the decoder state, decode the target.'''

    def __init__(self, encoder, decoder, device, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, *args):
        # extra positional arguments (e.g. source valid lengths) are handed
        # to both the encoder and the decoder's init_state
        encoded = self.encoder(src, *args)
        decoder_state = self.decoder.init_state(encoded, *args)
        return self.decoder(trg, decoder_state)

### Train

In [None]:
def train(model, data_iter, lr, num_epochs, tgt_vocab, device):
    '''Train a seq2seq model with teacher forcing.

    model: called as model(X, dec_input, X_valid_len) and must return
        (Y_hat, state).
    data_iter: iterable of (X, X_valid_len, Y, Y_valid_len) batches.
    lr: accepted for signature compatibility; it is not read here because
        the step is taken by the module-level `optimizer`.
    num_epochs: number of passes over data_iter.
    tgt_vocab: target vocabulary; only tgt_vocab['<sos>'] is used.
    device: device the batches are moved to.

    Relies on two module-level names defined elsewhere in the notebook:
    `optimizer` (performs the parameter update) and `loss` (called as
    loss(Y_hat, Y, Y_valid_len)). Prints the loss per target token after
    every epoch and once more at the end.
    '''
    model.train()
    eloss = None
    for epoch in range(num_epochs):
        # sum the training loss and the number of target tokens
        eloss = EpochLoss(2)
        for batch in data_iter:
            X, X_valid_len, Y, Y_valid_len = [x.to(device) for x in batch]

            # Clear gradients left over from the previous batch; without
            # this, backward() keeps accumulating into .grad and every
            # step uses the sum of all gradients seen so far.
            optimizer.zero_grad()

            # add <sos> to the start of output string
            sos = torch.tensor([tgt_vocab['<sos>']] * Y.shape[0],
                               device=device).reshape(-1, 1)
            # teacher forcing: the decoder sees the gold target shifted right
            dec_input = torch.cat([sos, Y[:, :-1]], 1)
            Y_hat, _ = model(X, dec_input, X_valid_len)
            l = loss(Y_hat, Y, Y_valid_len)
            batch_loss = l.sum()
            # propagate loss backward
            batch_loss.backward()
            # clip the gradient to avoid exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            num_tokens = Y_valid_len.sum()
            optimizer.step()
            with torch.no_grad():
                eloss.add(batch_loss, num_tokens)
        print(epoch + 1,' Loss: ', eloss[0] / eloss[1])
    # nothing to report when no epoch ran (num_epochs == 0)
    if eloss is not None:
        print(f'Final Loss: {eloss[0] / eloss[1]:.4f}')

### Prediction

In [None]:
def predict_seq2seq(model, input, hin_vocab, eng_vocab, num_steps, device):
    '''Translate one Hindi sentence with greedy decoding.

    model: object exposing eval(), encoder(enc_X, enc_valid_len) and a
        decoder with init_state(enc_outputs, enc_valid_len) and
        __call__(dec_X, state) -> (scores, state).
    input: the raw Hindi sentence.
    hin_vocab, eng_vocab: source / target vocabularies.
    num_steps: the source is truncated/padded to this many tokens, and at
        most this many target tokens are generated.
    device: device on which the tensors are created.

    Returns the predicted English tokens joined by single spaces (an empty
    string when <eos> is predicted first). The model is left in eval mode.
    '''
    model.eval()

    words = [tok for tok in tokenize(input, 'hin') if tok != '']
    src_ids = hin_vocab[words] + [hin_vocab['<eos>']]
    # the valid length cannot exceed what survives truncation below
    enc_valid_len = torch.tensor([min(len(src_ids), num_steps)], device=device)
    src_ids = truncate_pad(src_ids, num_steps, hin_vocab['<pad>'])

    eos_id = eng_vocab['<eos>']
    output_seq = []
    # inference only: no autograd graph needs to be recorded
    with torch.no_grad():
        # add the batch axis
        enc_X = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)
        enc_outputs = model.encoder(enc_X, enc_valid_len)
        state = model.decoder.init_state(enc_outputs, enc_valid_len)

        # decoding starts from <sos>, shaped (1, 1)
        dec_X = torch.tensor([[eng_vocab['<sos>']]], dtype=torch.long, device=device)

        for _ in range(num_steps):
            Y, state = model.decoder(dec_X, state)
            # greedy decoding: the most likely token is fed back as the
            # decoder input of the next time step
            dec_X = Y.argmax(dim=2)
            pred = int(dec_X.item())
            # stop when the <eos> token is predicted
            if pred == eos_id:
                break
            output_seq.append(pred)
    return ' '.join(eng_vocab.to_tokens(output_seq))

# **Main Code**

Read train dataset

In [None]:
hindi, english = [], []
# Read the training pairs: column 1 holds the Hindi sentence, column 2 the
# English one. The file is opened explicitly as UTF-8 so the Devanagari text
# does not depend on the platform's default encoding, and with newline='' as
# the csv module requires for correct handling of quoted line breaks.
with open('/content/drive/MyDrive/AssignmentNLP/train/train.csv', 'r',
          encoding='utf-8', newline='') as file:
    lines = csv.reader(file)
    # skip the header row (no error if the file is empty)
    next(lines, None)
    for line in lines:
        hindi.append(line[1])
        english.append(line[2])

Train Test Split

In [None]:
# ratio=1 puts every sentence pair in the training set; val_hin and val_eng
# come back as empty lists.
train_hin, train_eng, val_hin, val_eng = train_test_split(hindi, english, ratio=1)

Tokenize dataset

In [None]:
# Tokenise every training pair, dropping empty tokens. A pair is skipped
# entirely when its Hindi side has no tokens left after cleaning.
hin_tokens, eng_tokens = [], []
for hin_sentence, eng_sentence in zip(train_hin, train_eng):
    hin_words = [tok for tok in tokenize(hin_sentence, 'hin') if tok != '']
    if not hin_words:
        continue
    eng_words = [tok for tok in tokenize(eng_sentence, 'eng') if tok != '']
    hin_tokens.append(hin_words)
    eng_tokens.append(eng_words)

Create Vocabulary

In [None]:
# One vocabulary per language. Tokens that occur fewer than 2 times are left
# out and therefore map to <unk> (id 0); <pad>, <sos>, <eos> take ids 1-3.
hin_vocab = Vocab(hin_tokens, min_freq=2, reserved_tokens=['<pad>', '<sos>', '<eos>'])
eng_vocab = Vocab(eng_tokens, min_freq=2, reserved_tokens=['<pad>', '<sos>', '<eos>'])

Parameters

In [None]:
# Vocabulary sizes: they size the embedding tables and the decoder's output layer.
INPUT_DIM = len(hin_vocab)
OUTPUT_DIM = len(eng_vocab)
# Embedding widths for source (encoder) and target (decoder) tokens.
ENC_EMB_DIM = 64
DEC_EMB_DIM = 64 
# GRU hidden size and number of stacked GRU layers, shared by encoder and decoder.
HID_DIM = 128
N_LAYERS = 2
# Dropout probabilities used inside the encoder and the decoder.
ENC_DROPOUT = 0.25
DEC_DROPOUT = 0.25
# Learning rate handed to Adam, and number of passes over the training data.
LR = 0.001
NUM_EPOCHS = 10
# Every sentence is truncated or padded to this many tokens; it is also the
# maximum number of tokens generated at prediction time.
num_steps = 20 
# Number of sentence pairs per training batch.
batch_size = 64

Create Tensors and Data iterators

In [None]:
# Convert the token lists to fixed-width id tensors plus per-sentence valid lengths.
hin_array, hin_valid_len = build_array(hin_tokens, hin_vocab, num_steps)
eng_array, eng_valid_len = build_array(eng_tokens, eng_vocab, num_steps)
# The tuple order here is the order train() unpacks each batch in.
data_arrays = (hin_array, hin_valid_len, eng_array, eng_valid_len)
# is_train defaults to True, so the batches are shuffled.
data_iter = load_array(data_arrays, batch_size)

Define Model

In [None]:
# Bidirectional GRU encoder over Hindi ids, attention decoder over English ids.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = AttentionDecoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
# Wrap both in a single module and move its parameters to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

Initialize weights

In [None]:
def init_weights(m):
    '''Initialise the parameters owned directly by module `m` from U(-0.08, 0.08).

    Intended for use with `Module.apply`, which already visits every
    submodule; only the module's own parameters are touched here
    (recurse=False), so each parameter is initialised exactly once instead
    of once per ancestor module.
    '''
    for param in m.parameters(recurse=False):
        nn.init.uniform_(param, -0.08, 0.08)

# apply() runs init_weights on the model and on each of its submodules and
# returns the model; as the last expression of the cell, its repr is displayed.
model.apply(init_weights).to(device)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(20453, 64)
    (rnn): GRU(64, 128, num_layers=2, dropout=0.25, bidirectional=True)
    (fc): Linear(in_features=256, out_features=128, bias=True)
    (dropout): Dropout(p=0.25, inplace=False)
  )
  (decoder): AttentionDecoder(
    (attention): BahdanauAttention(
      (W_k): Linear(in_features=256, out_features=128, bias=False)
      (W_q): Linear(in_features=128, out_features=128, bias=False)
      (w_v): Linear(in_features=128, out_features=1, bias=False)
      (dropout): Dropout(p=0.25, inplace=False)
    )
    (embedding): Embedding(18317, 64)
    (rnn): GRU(320, 128, num_layers=2, dropout=0.25)
    (dense): Linear(in_features=128, out_features=18317, bias=True)
    (dropout): Dropout(p=0.25, inplace=False)
  )
)

Define optimizer

In [None]:
# Adam over all model parameters. train() uses this module-level `optimizer`
# by name, so this cell must run before training.
optimizer = optim.Adam(model.parameters(), lr=LR)

Define loss function

In [None]:
# Padding-aware cross-entropy. train() uses this module-level `loss` by name,
# so this cell must run before training.
loss = MaskedSoftmaxCELoss()

Training phase

In [None]:
# Run training. LR is passed to satisfy the signature; the update itself is
# performed by the module-level `optimizer` created above.
train(model, data_iter, LR, NUM_EPOCHS, eng_vocab, device)

1  Loss:  0.2801650961754046
2  Loss:  0.24657784568008312
3  Loss:  0.23073072298620254
4  Loss:  0.21871594098685376


KeyboardInterrupt: ignored

Save model

In [None]:
# Uncomment to save the trained weights; the final `load_state_dict` cell reads this file.
#torch.save(model.state_dict(), '/content/drive/MyDrive/AssignmentNLP/TrialFinal/train_model.pt')

Predict validation set answers

In [None]:
# Disabled validation-set prediction, kept as a bare string so it never runs.
# NOTE(review): the call below passes a seventh argument (True) and indexes
# the result with [0], but predict_seq2seq takes six parameters and returns a
# string; update the call before re-enabling. With ratio=1 in the split cell,
# val_hin is also empty.
'''answers = []
for l in val_hin:
    answers.append(predict_seq2seq(model, l, hin_vocab, eng_vocab, num_steps, device, True)[0])'''

In [None]:
# Disabled: would write the validation answers one per line, with no trailing
# newline after the last one. Kept as a bare string so it never runs.
'''with open('/content/drive/MyDrive/AssignmentNLP/TrialFinal/val_ans.txt', 'w') as f:
    for l in answers[:-1]:
        f.write(l +'\n')
    f.write(answers[-1])'''

Load testset

In [None]:
hindi_st = []
# Read the Hindi test sentences from column 2. The file is opened explicitly
# as UTF-8 so the Devanagari text does not depend on the platform's default
# encoding, and with newline='' as the csv module requires.
with open('/content/drive/MyDrive/AssignmentNLP/Testset/testhindistatements.csv', 'r',
          encoding='utf-8', newline='') as file:
    lines = csv.reader(file)
    # skip the header row (no error if the file is empty)
    next(lines, None)
    for line in lines:
        hindi_st.append(line[2])

Predict testset answers

In [None]:
# A single Hindi sentence used to try the model out before the full test set.
hind = 'आप का शुक्रिया'

In [None]:
# Translate the sample sentence; the returned string is shown as the cell output.
predict_seq2seq(model, hind, hin_vocab, eng_vocab, num_steps, device)

'thank you for you'

In [None]:
# Translate every test sentence, keeping the input order.
ans = [
    predict_seq2seq(model, sentence, hin_vocab, eng_vocab, num_steps, device)
    for sentence in hindi_st
]

In [None]:
# Write one translation per line with no trailing newline after the last one.
# Joining handles an empty `ans` (an empty file is written) instead of
# failing on ans[-1].
with open('/content/drive/MyDrive/AssignmentNLP/TrialFinal/answer.txt', 'w',
          encoding='utf-8') as f:
    f.write('\n'.join(ans))

In [None]:
# Restore previously saved weights. map_location remaps the stored tensors to
# the current device, so a checkpoint saved on a GPU also loads on a CPU-only
# runtime.
model.load_state_dict(torch.load('/content/drive/MyDrive/AssignmentNLP/TrialFinal/train_model.pt', map_location=device))

<All keys matched successfully>