### Connect to Google Drive and set up the training files:

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# zipped datasets stored there can be extracted by the next cell.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Getting the training dataset file from gdrive
# (-o overwrites existing files without prompting; unzip's output is discarded).
!unzip -o ./drive/MyDrive/train.zip >> /dev/null
!unzip -o ./drive/MyDrive/hindistatements.zip >> /dev/null

### Importing the libraries:

In [None]:
# One-off environment setup. Set `installed` to True to skip the cloning and
# pip installs when this cell is executed again in the same runtime.
installed = False

if not installed:
    # Remove any previous checkouts so the clones below start from scratch.
    !rm -rf indic_nlp_library indic_nlp_resources >> /dev/null
    !git clone "https://github.com/anoopkunchukuttan/indic_nlp_resources.git" --quiet
    !git clone "https://github.com/anoopkunchukuttan/indic_nlp_library" --quiet
    # Install the library's requirements, the library itself and Morfessor;
    # standard output is discarded, so only errors show up in the cell output.
    !pip install -r "./indic_nlp_library/requirements.txt" >> /dev/null
    !pip install indic-nlp-library >> /dev/null
    !pip install Morfessor >> /dev/null

[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m


In [None]:
from indicnlp.tokenize import indic_tokenize
import torch.nn.functional as F
from indicnlp import loader
from indicnlp import common
import torch.nn as nn
import numpy as np
import indicnlp
import random
import torch
import nltk
import time
import sys
import csv

In [None]:
# Fetch the NLTK 'punkt' tokenizer data without progress output; presumably
# required by nltk.word_tokenize, which is applied to the English sentences
# further down — TODO confirm.
nltk.download('punkt', quiet=True)

True

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Local checkout directories of the Indic NLP library and its resource files
# (the directories cloned by the setup cell).
INDIC_NLP_LIB_HOME   =  "./indic_nlp_library"
INDIC_NLP_RESOURCES  =  "./indic_nlp_resources"

# Add indicnlp to system path:
# NOTE(review): `indicnlp` is already imported in an earlier cell, so that
# import presumably resolved through the pip-installed package rather than
# through this path entry — confirm.
sys.path.append(INDIC_NLP_LIB_HOME)

# Point the indicnlp resources:
common.set_resources_path(INDIC_NLP_RESOURCES)

In [None]:
# Word <-> id lookup tables for the Hindi (hi) and English (en) vocabularies.
hi_word2id = {}
hi_id2word = {}
en_word2id = {}
en_id2word = {}

# Word-frequency tables.
# NOTE(review): never populated in the cells visible here.
hi_word2freq = {}
en_word2freq = {}

### Adding the Start, Stop, Unknown and Padding Tokens:

In [None]:
# Reserve ids 0-3 in both vocabularies for the special tokens, in this order:
# sequence start, sequence stop, out-of-vocabulary word, padding.
_special_tokens = (
    '__<<init>>__',
    '__<<stop>>__',
    '__<<unknown>>__',
    '__<<padding>>__',
)

# The Hindi and English tables receive exactly the same four entries.
for _word2id, _id2word in ((hi_word2id, hi_id2word), (en_word2id, en_id2word)):
    for _token_id, _token in enumerate(_special_tokens):
        _word2id[_token] = _token_id
        _id2word[_token_id] = _token

### Loading the dataset:

In [None]:
# Load the parallel corpus as an array of [column 1, column 2] pairs. The cells
# below tokenize the first element as Hindi and the second as English.
dataset = []
# The encoding is stated explicitly because the file contains Devanagari text
# and the platform default is not guaranteed to be UTF-8; newline='' is the
# mode the csv module documents for files handed to csv.reader.
with open('./train.csv', 'r', encoding='utf-8', newline='') as file:
    # [1::] drops the first row (presumably the header line).
    dataset = np.array([[r[1], r[2]] for r in csv.reader(file)])[1::]

In [None]:
# Tokenize every sentence pair and grow both vocabularies in first-seen order.
hi_word_seq = []
en_word_seq = []

# Ids 0-3 are already used by the special tokens, so real words start at 4.
hi_counter = 4
en_counter = 4
for hi_sentence, en_sentence in dataset:

    # Hindi side: Indic NLP trivial tokenizer.
    hi_tokens = indic_tokenize.trivial_tokenize(hi_sentence)
    hi_word_seq.append(hi_tokens)

    for token in hi_tokens:
        if token in hi_word2id:
            continue
        hi_word2id[token] = hi_counter
        hi_id2word[hi_counter] = token
        hi_counter += 1

    # English side: NLTK word tokenizer.
    en_tokens = nltk.word_tokenize(en_sentence)
    en_word_seq.append(en_tokens)

    for token in en_tokens:
        if token in en_word2id:
            continue
        en_word2id[token] = en_counter
        en_id2word[en_counter] = token
        en_counter += 1

In [None]:
# Length (in tokens) of the longest tokenized sentence on each side.
hi_max = max(map(len, hi_word_seq))
en_max = max(map(len, en_word_seq))

In [None]:
# Report the vocabulary sizes; the counts include the four special tokens.
print('Hi-Vocabulary Size:', len(hi_id2word))
print('En-Vocabulary Size:', len(en_id2word))

Hi-Vocabulary Size: 46384
En-Vocabulary Size: 40802


In [None]:
def get_indices_seq(seq, vocab):
    """Encode a token sequence as a 1-D tensor of vocabulary ids.

    Args:
        seq: iterable of word tokens.
        vocab: word -> id mapping that contains the '__<<init>>__' and
            '__<<stop>>__' entries (and '__<<unknown>>__' whenever `seq`
            holds a word missing from the mapping).

    Returns:
        torch.Tensor of ids laid out as [init, *word ids, stop]; words that
        are not in `vocab` are encoded with the '__<<unknown>>__' id.
    """
    word_ids = [
        vocab[token] if token in vocab else vocab['__<<unknown>>__']
        for token in seq
    ]
    framed = [vocab['__<<init>>__'], *word_ids, vocab['__<<stop>>__']]
    return torch.tensor(framed)

In [None]:
def get_word_seq(seq, vocab):
    """Decode a sequence of ids back into words.

    Args:
        seq: iterable of ids (for example a 1-D tensor); each element is
            converted with int() before the lookup.
        vocab: id -> word mapping.

    Returns:
        list with one word per element of `seq`, in the same order.
    """
    return [vocab[int(index)] for index in seq]

In [None]:
# One [hindi_ids, english_ids] tensor pair per training sentence, already
# moved to the selected device.
train_seq_pairs = [
    [
        get_indices_seq(hi_tokens, hi_word2id).to(device),
        get_indices_seq(en_word_seq[sent_id], en_word2id).to(device),
    ]
    for sent_id, hi_tokens in enumerate(hi_word_seq)
]

In [None]:
# Order the pairs by combined (Hindi + English) length, so the consecutive
# slices taken when batching hold sequences of similar size.
train_seq_pairs_sorted = sorted(train_seq_pairs, key=lambda x: len(x[0]) + len(x[1]))

In [None]:
# Number of sentence pairs per training batch.
BATCH_SIZE = 64

In [None]:
input_batches = []
output_batches = []

# Only full batches are built: the trailing len % BATCH_SIZE pairs are left out.
usable_pairs = len(train_seq_pairs_sorted) - (len(train_seq_pairs_sorted) % BATCH_SIZE)

for start in range(0, usable_pairs, BATCH_SIZE):
    batch_pairs = train_seq_pairs_sorted[start:start + BATCH_SIZE]

    hi_indices_sequences = [hi_ids for hi_ids, _ in batch_pairs]
    en_indices_sequences = [en_ids for _, en_ids in batch_pairs]

    # Pad with id 3 ('__<<padding>>__') up to the longest sequence in the
    # batch; batch_first=False yields tensors shaped (seq_length, batch).
    input_batches.append(
        torch.nn.utils.rnn.pad_sequence(
            hi_indices_sequences,
            batch_first=False,
            padding_value=3,
        )
    )
    output_batches.append(
        torch.nn.utils.rnn.pad_sequence(
            en_indices_sequences,
            batch_first=False,
            padding_value=3,
        )
    )

In [None]:
# Print the NVIDIA driver / GPU status of the current runtime.
!nvidia-smi

Sat May  8 07:55:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.19.01    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    40W / 300W |   1455MiB / 16160MiB |     13%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

### Encoder Model:

In [None]:
class EncoderRNN(nn.Module):
    """Single-layer bidirectional LSTM encoder.

    Args:
        input_size: size of the source vocabulary (rows of the embedding).
        embedding_size: dimensionality of the word embeddings.
        hidden_size: LSTM hidden size per direction.
        p: dropout probability applied to the embeddings.
    """

    def __init__(self, input_size, embedding_size, hidden_size, p):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, 1, bidirectional=True)

        # Project the concatenated forward/backward final states down to
        # hidden_size so they can initialise a unidirectional decoder.
        self.fc_hidden = nn.Linear(2 * hidden_size, hidden_size)
        self.fc_cell = nn.Linear(2 * hidden_size, hidden_size)
        self.dropout = nn.Dropout(p)

    def forward(self, x):
        """Encode a batch of source sequences.

        Args:
            x: id tensor of shape (seq_length, N), N being the batch size.

        Returns:
            Tuple (encoder_states, hidden, cell) where encoder_states is
            (seq_length, N, hidden_size * 2) and hidden / cell are
            (1, N, hidden_size).
        """
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.dropout(self.embedding(x))

        # h_n / c_n: (2, N, hidden_size), one slice per direction.
        encoder_states, (h_n, c_n) = self.rnn(embedded)

        # Range slices (not plain indexing) keep the leading dimension, so the
        # concatenation below has shape (1, N, hidden_size * 2).
        both_hidden = torch.cat((h_n[:1], h_n[1:2]), dim=2)
        both_cell = torch.cat((c_n[:1], c_n[1:2]), dim=2)

        hidden = self.fc_hidden(both_hidden)
        cell = self.fc_cell(both_cell)
        return encoder_states, hidden, cell

In [None]:
class DecoderRNN(nn.Module):
    """Single-step LSTM decoder with additive attention over encoder states.

    Args:
        input_size: size of the target vocabulary (rows of the embedding).
        embedding_size: dimensionality of the word embeddings.
        hidden_size: LSTM hidden size; encoder states are expected to be
            twice this wide (bidirectional encoder).
        output_size: number of scores produced per step.
        p: dropout probability applied to the embeddings.
    """

    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, p
    ):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, embedding_size)
        # The LSTM consumes the attention context (hidden_size * 2) and the
        # token embedding, concatenated.
        self.rnn = nn.LSTM(2 * hidden_size + embedding_size, hidden_size, 1)

        # Scores one (decoder hidden, encoder state) pair: hidden_size * 3 -> 1.
        self.energy = nn.Linear(3 * hidden_size, 1)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(p)
        # Normalises over dim 0, i.e. over the source positions.
        self.softmax = nn.Softmax(dim=0)
        self.relu = nn.ReLU()

    def forward(self, x, encoder_states, hidden, cell):
        """Decode one time step.

        Args:
            x: (N,) ids of the previous target token.
            encoder_states: (seq_length, N, hidden_size * 2).
            hidden, cell: (1, N, hidden_size) decoder LSTM state.

        Returns:
            Tuple (predictions, hidden, cell) with predictions of shape
            (N, output_size) and the updated LSTM state.
        """
        # (N,) -> (1, N) -> (1, N, embedding_size)
        token_embedding = self.dropout(self.embedding(x.unsqueeze(0)))

        # Copy the decoder state once per source position:
        # (1, N, hidden_size) -> (seq_length, N, hidden_size)
        source_len = encoder_states.shape[0]
        tiled_hidden = hidden.repeat(source_len, 1, 1)

        # scores / attention: (seq_length, N, 1)
        scores = self.relu(
            self.energy(torch.cat((tiled_hidden, encoder_states), dim=2))
        )
        attention = self.softmax(scores)

        # Attention-weighted sum of the encoder states:
        # (seq_length, N, 1) x (seq_length, N, hidden_size * 2)
        #   -> (1, N, hidden_size * 2)
        context_vector = torch.einsum("snk,snl->knl", attention, encoder_states)

        # (1, N, hidden_size * 2 + embedding_size)
        rnn_input = torch.cat((context_vector, token_embedding), dim=2)

        # step_output: (1, N, hidden_size)
        step_output, (hidden, cell) = self.rnn(rnn_input, (hidden, cell))

        # (1, N, output_size) -> (N, output_size)
        predictions = self.fc(step_output).squeeze(0)
        return predictions, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper used for training (`forward`) and for greedy
    decoding (`predict`).

    The width of the score tensor is read from `decoder.fc.out_features` and
    tensors are allocated on the device of `source`, so the class does not
    depend on notebook-level globals.

    Args:
        encoder: module returning (encoder_states, hidden, cell) for a source
            batch of shape (source_len, N).
        decoder: module mapping (x, encoder_states, hidden, cell) to
            (scores, hidden, cell); must expose its output layer as `fc`.
    """

    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def _zero_scores(self, steps, batch_size, reference):
        """Allocate a (steps, batch_size, vocab) zero tensor on `reference`'s device."""
        return torch.zeros(
            steps, batch_size, self.decoder.fc.out_features, device=reference.device
        )

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Decode along `target`, optionally feeding back the ground truth.

        Args:
            source: (source_len, N) source ids.
            target: (target_len, N) target ids; row 0 holds the start token.
            teacher_force_ratio: probability, drawn once per time step, of
                feeding the ground-truth token instead of the model's own
                prediction as the next decoder input.

        Returns:
            (target_len, N, vocab) scores; row 0 is left as zeros because no
            prediction is made for the start token.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]

        outputs = self._zero_scores(target_len, batch_size, source)
        encoder_states, hidden, cell = self.encoder(source)

        # First decoder input is the start token row of the target.
        x = target[0]

        for t in range(1, target_len):
            # Every step attends over all encoder states and updates the state.
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            outputs[t] = output

            # Highest-scoring word for each batch element.
            best_guess = output.argmax(1)

            # Mixing ground-truth and predicted inputs exposes the model during
            # training to the kind of inputs it will see at inference time.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

    def predict(self, source, sos_idx=0, max_len_ratio=1.3):
        """Greedy decoding without a reference translation.

        Args:
            source: (source_len, N) source ids.
            sos_idx: id of the start token in the TARGET vocabulary. The
                default 0 matches '__<<init>>__' in this notebook's
                vocabularies. Previously the source's own first row was fed
                to the decoder, which is only correct while both
                vocabularies share the same start id.
            max_len_ratio: output length as a multiple of the source length.

        Returns:
            (int(max_len_ratio * source_len), N, vocab) scores; row 0 is left
            as zeros. Decoding always runs for the full length, so callers
            must cut the result at the stop token themselves.
        """
        batch_size = source.shape[1]
        max_len = int(max_len_ratio * source.shape[0])

        outputs = self._zero_scores(max_len, batch_size, source)
        encoder_states, hidden, cell = self.encoder(source)

        # Start every sequence in the batch from the target start token.
        x = torch.full(
            (batch_size,), sos_idx, dtype=torch.long, device=source.device
        )

        for t in range(1, max_len):
            output, hidden, cell = self.decoder(x, encoder_states, hidden, cell)
            outputs[t] = output

            # No teacher forcing here: the model's own best guess is fed back.
            x = output.argmax(1)

        return outputs

In [None]:
# Checkpoint flags.
# NOTE(review): neither flag is read in the cells visible here; the save and
# load cells further down run unconditionally.
load_model = False
save_model = True

# Training hyperparameters
num_epochs = 2
learning_rate = 1e-3
batch_size = BATCH_SIZE
# Running batch counter, incremented once per batch by the training loop.
step = 0

In [None]:
# Model hyperparameters
input_size_encoder = len(hi_word2id)
input_size_decoder = len(en_word2id)
output_size = len(en_word2id)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 256
enc_dropout = 0.5
dec_dropout = 0.5

In [None]:
# Build the encoder over the Hindi vocabulary and move it to the selected device.
encoder_net = EncoderRNN(
    input_size_encoder, encoder_embedding_size, hidden_size, enc_dropout
).to(device)

In [None]:
# Build the attention decoder over the English vocabulary and move it to the
# selected device.
decoder_net = DecoderRNN(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    dec_dropout,
).to(device)

In [None]:
# Combine encoder and decoder into one model and optimise all of its
# parameters with Adam.
model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# 3 is the id of '__<<padding>>__' in both vocabularies and the value used to
# pad the batches; target positions holding it are ignored by the loss.
pad_idx = 3
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

In [None]:
# Training loop: each epoch visits every pre-built batch once, in random order.
model.train(True)

# `ctr` counts batches across all epochs and drives the teacher-forcing decay.
ctr = 0
for epoch in range(num_epochs):

    # Fresh random visiting order for the batches of this epoch.
    indices = list(range(len(input_batches)))
    random.shuffle(indices)

    # Batches processed in the current epoch (used only for logging).
    counter = 0

    for i in indices:
        minibatch_X = input_batches[i]
        minibatch_Y = output_batches[i]
        # The batch tensors were already moved to `device` when they were built.
        inp_data = minibatch_X
        target = minibatch_Y

        # Forward prop. The teacher-forcing ratio starts at 0.5 and decays as
        # 0.5 / (1 + 0.1 * ctr) with the number of batches seen so far.
        output = model(inp_data, target, 0.5 / (1 + 0.1 * ctr))

        # `output` is (trg_len, batch_size, vocab) but CrossEntropyLoss expects
        # (N, vocab) scores and (N,) targets, so both are flattened over time
        # and batch. Row 0 (the start token position) is dropped from both.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip the gradient norm to 1 to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        ctr += 1
        step += 1
        counter += 1

        if counter % 50 == 0:
            # .item() logs the plain scalar instead of the tensor repr
            # (device / grad_fn noise).
            print('(EPOCH, BATCH, LOSS) = ({}, {}, {})'.format(epoch, counter, loss.item()))

### Link to weights: [Here](https://drive.google.com/drive/folders/1BirGILg_Y68CaSp6-7jhMHfCMemUjz9U?usp=sharing)

In [None]:
# Persist the model weights and the optimizer state to Google Drive.
# NOTE(review): runs unconditionally; the `save_model` flag is not consulted.
torch.save(model.state_dict(), 'drive/MyDrive/gt_50_bi_dir_model')
torch.save(optimizer.state_dict(), 'drive/MyDrive/gt_50_bi_dir_optimizer')

In [None]:
# Restore the model weights and the optimizer state from Google Drive.
# map_location remaps the stored tensors onto the current `device`, so a
# checkpoint written on a GPU runtime can also be loaded on a CPU-only one.
model.load_state_dict(
    torch.load('drive/MyDrive/gt_50_bi_dir_model', map_location=device)
)
optimizer.load_state_dict(
    torch.load('drive/MyDrive/gt_50_bi_dir_optimizer', map_location=device)
)

In [None]:
# Switch to evaluation mode (dropout layers become inactive); the trailing ';'
# suppresses the module repr that would otherwise be shown as the cell output.
model.train(False);

In [None]:
!unzip './drive/MyDrive/hindistatements.zip'

In [None]:
# Translate every test sentence with greedy decoding. Each entry of `sents`
# is one newline-terminated English sentence.
sents = []
with torch.no_grad():

    test_data = []
    # Explicit UTF-8 because the file holds Devanagari text; newline='' is the
    # mode the csv module documents for files handed to csv.reader.
    with open('./testhindistatements.csv', 'r', encoding='utf-8', newline='') as file:
        # Column 2 is the Hindi statement; [1::] drops the first row
        # (presumably the header line).
        test_data = np.array([r[2] for r in csv.reader(file)])[1::]

    for sent in test_data:
        sent = indic_tokenize.trivial_tokenize(sent)
        # Shape (seq_length, 1): a batch holding a single sentence.
        query = get_indices_seq(sent, hi_word2id).reshape(-1, 1)
        response = model.predict(query.to(device))

        # Most likely word at each output position of the single batch item.
        r = get_word_seq(torch.argmax(response[:, 0, :], dim=1), en_id2word)

        # r[0] comes from the all-zero first output row and is always skipped.
        # predict() keeps generating after the stop token, so the sentence is
        # cut at the FIRST stop token wherever it occurs. Checking only the
        # last position (as before) left the stop token and everything after
        # it in the output whenever decoding did not happen to end on it.
        if '__<<stop>>__' in r:
            sents += [' '.join(r[1:r.index('__<<stop>>__')]) + '\n']
        else:
            sents += [' '.join(r[1::]) + '\n']

In [None]:
# Show every Hindi test statement next to the translation produced for it.
in_data = []
with open('./testhindistatements.csv', 'r') as file:
    reader = csv.reader(file)
    # Column 2 of every row, without the first row.
    in_data = np.array([record[2] for record in reader])[1::]

for i, hindi_statement in enumerate(in_data):
    print(hindi_statement)
    print(sents[i])
    print('===================================================================')

In [None]:
# Write the translations to answer.txt; every entry of `sents` already ends
# with a newline, so this yields one translation per line.
with open('answer.txt', 'w') as file:
    file.writelines(sents)

In [None]:
# Compress answer.txt into the archive GT_20111407.zip.
!zip GT_20111407.zip answer.txt

  adding: answer.txt (deflated 69%)
