In [72]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import tensorflow as tf
import keras
from keras.layers import LSTM, Input, Dense, Masking
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from transformers import TFAutoModel
from transformers import AutoTokenizer
import re
from nltk.translate.bleu_score import sentence_bleu
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# Task 5 : Machine translation with encoder and decoder, each built using a single hidden layer LSTM network.

## Data Reading

In [91]:
train5 = pd.read_csv('/content/drive/MyDrive/team2_te/team2_te_train.csv')    # Reading data
val5 = pd.read_csv('/content/drive/MyDrive/team2_te/team2_te_valid.csv')
test5 = pd.read_csv('/content/drive/MyDrive/team2_te/team2_te_test.csv')

In [3]:
train5.head()

Unnamed: 0,source,target
0,The coalition government he confected under At...,అటల్‌ బిహారీ వాజ్‌పేయి నాయకత్వంలో ఆడ్వాణీ రూపు...
1,India on Wednesday extended the ban on flights...,"తన ఉత్తర్వులను మరోసారి సవరిస్తూ, బ్రిటన్ కు వి..."
2,Coronavirus is growing rapidly in India.,భారతదేశంలో కరోనా వైరస్‌ సైలెంట్‌గా విజృంభిస్తో...
3,"The film also starred Saif Ali Khan, Riteish D...","సైఫ్ ఆలీఖాన్ , రితేష్ దేశ్ ముఖ్ , బిపాసా బసు ,..."
4,Oh my world!,ఓ విశ్వపతీ !


In [92]:
train5 = train5[:10000]
val5 = val5[:2000]
test5 = test5[:1000]

In [93]:
train5['source'] = train5['source'].str.lower()   # Converting to lowercase
val5['source'] = val5['source'].str.lower()
test5['source'] = test5['source'].str.lower()

In [94]:
max_len_eng = 0
min_len_eng = 10
eng_lengths=[]
for string in train5['source']:
  max_len_eng = np.max((len(string.split()), max_len_eng))
  min_len_eng = np.min((len(string.split()), min_len_eng))
  eng_lengths.append(len(string.split()))
print('Max length of sentence in Eng : ',max_len_eng)
print('Min length of sentence in Eng : ',min_len_eng)
print('Avg length of sentence in Ind : ',np.mean(eng_lengths))

Max length of sentence in Eng :  170
Min length of sentence in Eng :  1
Avg length of sentence in Ind :  9.5639


In [95]:
max_len_ind = 0
min_len_ind = 10
ind_lengths = []
for string in train5['target']:
  max_len_ind = np.max((len(string.split()), max_len_ind))
  min_len_ind = np.min((len(string.split()), min_len_ind))
  ind_lengths.append(len(string.split()))
print('Max length of sentence in Ind : ',max_len_ind)
print('Min length of sentence in Ind : ',min_len_ind)
print('Avg length of sentence in Ind : ',np.mean(ind_lengths))

Max length of sentence in Ind :  85
Min length of sentence in Ind :  1
Avg length of sentence in Ind :  7.349


In [96]:
max_len_eng = 10
max_len_ind = 10

### Embedding English sentences



In [97]:
eng_embedding_dim = 50

In [98]:
def read_glove_vector(glove_vec):
  '''
  Function to read GloVe embeddings
  '''
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)
  return word_to_vec_map

In [99]:
word_to_vec_map = read_glove_vector('/content/drive/My Drive/glove.6B.50d.txt')

In [100]:
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'"')
tokenizer.fit_on_texts(train5['source'])
punctuations = ["'",'"',',','!', '#', '$', '%','&','(',')','*','+','-','.','/',':',';','<','=','>','?','@','[','\\',']','^','_','`','{','}','|','~','\t','\n']
for punctuation in punctuations:
    tokenizer.word_index[punctuation] = len(tokenizer.word_index) + 1   # Tokeinzing the words and puctuations
index_word = {index: word for word, index in tokenizer.word_index.items()}

In [101]:
def pad_and_seq(data):
    '''
    Function to pad and tokenize the data
    '''
    data_padded = np.zeros((len(data), max_len_eng))
    for i in range(len(data)):
      for j in range(max_len_eng):
          tokens = re.split(r'(\w+|[^\w\s])', data[i])    # Converting the tokens to indices
          tokens = [token for token in tokens if token.strip()]
          if j < len(tokens):
              if tokens[j] in tokenizer.word_index.keys():
                  data_padded[i][j] = tokenizer.word_index[tokens[j]]
    return data_padded

index_word[0] = 'thisiszerovector'

def vectorize_seq(data):
    '''
    Function to embed/vectorize data using GloVe
    '''
    data_new = np.zeros((data.shape[0], data.shape[1], eng_embedding_dim))
    for i in range(data_new.shape[0]):
      for j in range(data_new.shape[1]):
          word = index_word[int(data[i,j])]
          if word in word_to_vec_map.keys():
            data_new[i,j,:] = word_to_vec_map[word]
    return data_new

In [102]:
train_english_padded = pad_and_seq(train5['source'])
val_english_padded = pad_and_seq(val5['source'])
test_english_padded = pad_and_seq(test5['source'])

In [103]:
# Vectorize sentences
train_english = vectorize_seq(train_english_padded)
val_english = vectorize_seq(val_english_padded)
test_english = vectorize_seq(test_english_padded)

In [104]:
print('Size of vocabulary for English sentences (training): ', len(tokenizer.word_index))

Size of vocabulary for English sentences (training):  13202


In [105]:
train_english.shape, val_english.shape, test_english.shape

((10000, 10, 50), (2000, 10, 50), (1000, 10, 50))

We have now successfully embedded the given english data using GloVe. The first dim is number of training samples, second dim is the maximum length of the sentence, and the third dim is the dim of the emdedding vector, which is 50.

### Embedding Indic sentences

In [106]:
tokenizer_ind = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
model_indbert = TFAutoModel.from_pretrained('ai4bharat/indic-bert', from_pt=True).get_input_embeddings()

config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFAlbertModel: ['predictions.dense.bias', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'sop_classifier.classifier.bias', 'predictions.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.dense.weight', 'sop_classifier.classifier.weight']
- This IS expected if you are initializing TFAlbertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFAlbertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFAlbertModel

In [107]:
train_indic_text = train5['target'].tolist()
val_indic_text = val5['target'].tolist()
test_indic_text = test5['target'].tolist()

train_indic_text_tokens = tokenizer_ind(train_indic_text, padding='max_length', truncation=True, max_length=max_len_ind, return_tensors='tf')
val_indic_text_tokens = tokenizer_ind(val_indic_text, padding='max_length', truncation=True, max_length=max_len_ind, return_tensors='tf')
test_indic_text_tokens = tokenizer_ind(test_indic_text, padding='max_length', truncation=True, max_length=max_len_ind, return_tensors='tf')

In [108]:
vocab_size_ind_train = np.unique(train_indic_text_tokens['input_ids']).shape[0]
vocab_size_ind_val = np.unique(val_indic_text_tokens['input_ids']).shape[0]
vocab_size_ind_test = np.unique(test_indic_text_tokens['input_ids']).shape[0]

print('Vocabulary size of Indic language : ',vocab_size_ind_train)

Vocabulary size of Indic language :  1111


In [109]:
ind_sequence_train = range(vocab_size_ind_train)
ind_sequence_val = range(vocab_size_ind_val)
ind_sequence_test = range(vocab_size_ind_test)

sequence_dict_train = {}
sequence_dict_val = {}
sequence_dict_test = {}

for i,j in zip(np.unique(train_indic_text_tokens['input_ids']), ind_sequence_train):
    sequence_dict_train[i] = j

for i,j in zip(np.unique(val_indic_text_tokens['input_ids']), ind_sequence_val):
    sequence_dict_val[i] = j

for i,j in zip(np.unique(test_indic_text_tokens['input_ids']), ind_sequence_test):
    sequence_dict_test[i] = j

mapped_function_train = np.vectorize(lambda x: sequence_dict_train.get(x, x))
mapped_function_val = np.vectorize(lambda x: sequence_dict_val.get(x, x))
mapped_function_test = np.vectorize(lambda x: sequence_dict_test.get(x, x))

train_indic_text_tokens['input_ids'] = mapped_function_train(train_indic_text_tokens['input_ids'].numpy())
val_indic_text_tokens['input_ids'] = mapped_function_val(val_indic_text_tokens['input_ids'].numpy())
test_indic_text_tokens['input_ids'] = mapped_function_test(test_indic_text_tokens['input_ids'].numpy())

train_indic_text_tokens['input_ids'] = tf.constant(train_indic_text_tokens['input_ids'])
val_indic_text_tokens['input_ids'] = tf.constant(val_indic_text_tokens['input_ids'])
test_indic_text_tokens['input_ids'] = tf.constant(test_indic_text_tokens['input_ids'])

In [110]:
train_indic = model_indbert(train_indic_text_tokens['input_ids'], train_indic_text_tokens['attention_mask']).numpy()
val_indic = model_indbert(val_indic_text_tokens['input_ids'], val_indic_text_tokens['attention_mask']).numpy()
test_indic = model_indbert(test_indic_text_tokens['input_ids'], test_indic_text_tokens['attention_mask']).numpy()

In [111]:
print(train_indic.shape, val_indic.shape, test_indic.shape)

(10000, 10, 128) (2000, 10, 128) (1000, 10, 128)


In [112]:
print(train5['target'][2])
print(train_indic_text_tokens['input_ids'][2])
print(train_indic[2])

భారతదేశంలో కరోనా వైరస్‌ సైలెంట్‌గా విజృంభిస్తోంది.
tf.Tensor([  2 984 654  72 451  81   6 810 970   3], shape=(10,), dtype=int64)
[[-0.602987    0.25132552 -0.02279774 ...  0.07270921 -0.07406937
   0.08301478]
 [-1.0659171   0.13505325 -0.27841157 ... -0.10180465 -0.21177185
   0.5742666 ]
 [-0.01316994 -0.17785558  0.4233597  ...  0.15304899 -0.05257851
  -0.76144445]
 ...
 [ 0.19513088  0.38181195 -0.34263095 ... -0.43292102  0.02467063
  -0.33424437]
 [-1.0138708  -0.14222197  0.11988396 ...  0.22667918 -0.5726075
   1.1208166 ]
 [-0.34749624  0.25073248 -0.62306756 ...  0.2737821   0.7896164
   0.31719106]]


We have now also successfully embedded Indic sentences using IndicBERT, and can be clearly seen from the above values. The first dim is number of samples, the second is the maximum indic sentence length and the third dim is the size of the embedding vector, which is 128 in this case.

### Model building

In [113]:
hidden_units = 200
ind_embedding_dim = 128
eng_embedding_dim = 50

In [114]:
decoder_targets_one_hot = np.zeros((train_english.shape[0], max_len_ind, vocab_size_ind_train), dtype='int8')

In [115]:
for i, d in enumerate(train_indic_text_tokens['input_ids']):
    for t, word in enumerate(d):
        decoder_targets_one_hot[i, t, word] = 1

In [121]:
#encoder
encoder_inputs = Masking(mask_value=0)(Input(shape=(max_len_eng, eng_embedding_dim)))
encoder = LSTM(hidden_units, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
encoder_states = [state_h, state_c]

In [122]:
#Decoder
decoder_inputs = Masking(mask_value=0)(Input(shape=(max_len_ind, ind_embedding_dim)))
decoder = LSTM(hidden_units, return_sequences=True, return_state=True)
decoder_outputs,_,_ = decoder(decoder_inputs, initial_state = encoder_states)
decoder_dense = Dense(vocab_size_ind_train, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [126]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [127]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [(None, 10, 50)]             0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, 10, 128)]            0         []                            
                                                                                                  
 lstm_2 (LSTM)               [(None, 200),                200800    ['input_7[0][0]']             
                              (None, 200),                                                        
                              (None, 200)]                                                        
                                                                                            

In [128]:
history = model.fit([train_english, train_indic],
                        decoder_targets_one_hot, batch_size=200, epochs=20,
                        validation_split=0.1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [129]:
model.save('my_model_lstm6.keras')

In [132]:
model = tf.keras.models.load_model('my_model_lstm6.keras')

## Making predictions

In [133]:
encoder_inputs = model.input[0]
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]

In [134]:
encoder_model = Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]

decoder_state_input_h = tf.keras.layers.Input(shape=(hidden_units,), name='anotherInput1')
decoder_state_input_c = tf.keras.layers.Input(shape=(hidden_units,), name='anotherInput2')
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder = model.layers[3]
decoder_dense = model.layers[4]

decoder_outputs, state_h, state_c = decoder(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [135]:
def embed_word(word):
    tokenized_word = tokenizer_ind(word, padding='max_length', max_length = max_len_ind, return_tensors='tf')
    tokenized_word['input_ids'] = mapped_function_train(tokenized_word['input_ids'].numpy())
    tokenized_word['input_ids'] = tf.constant(tokenized_word['input_ids'], dtype=tf.int32)
    target_word = model_indbert(tokenized_word['input_ids'], tokenized_word['attention_mask']).numpy()
    return target_word

In [136]:
rev_seq_dict = {value: key for key, value in sequence_dict_train.items()}

In [141]:
def decode_sequence(input_seq):
    enc_out = encoder_model.predict(input_seq, verbose=0)
                              # BOS Token index is 2 in IndicBERT
    eos = 3                   # EOS Token index is 3 in IndicBERT
    s = np.zeros((1, hidden_units))
    c = np.zeros((1, hidden_units))

    target_word = np.zeros((1,10,128))
    output_sentence = []
    for x in range(max_len_ind):
      o, s, c = decoder_model.predict([target_word]+enc_out, verbose=0)
      # Get next word
      idx = np.argmax(o[0,0,:])
      idx = rev_seq_dict[idx]
      # End sentence of EOS
      if eos == idx:
        break
      word = ''
      if idx > 0:
        word = tokenizer_ind.decode(idx)
        output_sentence.append(word)

      # Update the decoder input
      # which is just the word just generated
      target_word = np.expand_dims(embed_word(word)[0][0], axis=0)
      target_word = np.expand_dims(target_word, axis=0)
      enc_out = [s,c]

    return ' '.join(output_sentence)

### Predictions and Calculating BLEU Scores

In [None]:
pred_train = []
for i in range(train5.shape[0]):
    input_seq = train_english[i:i+1]
    translation = decode_sequence(input_seq)
    pred_train.append(translation)

actual_train = train5['target'].tolist()
np.save('pred_train.npy', np.array(pred_train))

In [None]:
pred_test = []
for i in range(test5.shape[0]):
    input_seq = test_english[i:i+1]
    translation = decode_sequence(input_seq)
    pred_test.append(translation)

actual_test = test5['target'].tolist()
np.save('pred_test.npy', np.array(pred_test))

In [81]:
pred_train = np.load('pred_train.npy')
pred_test = np.load('pred_test.npy')

def calculate_bleu_score(predicted_sentences, actual_sentences, k=1):
  total_bleu_score = 0
  for predicted_sentence, actual_sentence in zip(predicted_sentences, actual_sentences):
    reference = [actual_sentence.split()]
    candidate = predicted_sentence.split()
    weights = [1 / k for _ in range(k)]
    bleu_score = sentence_bleu(reference, candidate, weights = tuple(weights))
    total_bleu_score += bleu_score
  average_bleu_score = total_bleu_score / len(predicted_sentences)
  return average_bleu_score

In [90]:
for i in np.arange(6,9):
  print('Input sentence : ', train5['source'].tolist()[i])
  print('Actual output : ', actual_train[i])
  print('Predicted output : ', pred_train[i])

Input sentence :  Later the mosque was destroyed.
Actual output :  ఆ తరవాత ప్రాంగణంలో ఉన్న మసీదును కూల్చివేశారు.
Predicted output :  ఆ కూల్చివేశారు.
Input sentence :  The total value of the contract is Rs
Actual output :  మొత్తం డీల్‌ విలువ రూ
Predicted output :  
Input sentence :  Hyderabad: Andhra Pradesh Chief Minister Chandrababu Naidu is going to shift to his own farmhouse in Madinaguda in Hyderabad on Sunday
Actual output :  ఆంధ్రప్రదేశ్ ముఖ్యమంత్రి చంద్రబాబు ఆదివారం విశాఖ ఉత్సవాల్లో పాల్గొనున్నారు
Predicted output :  


As we can see, it predicts some sentences well (at least the starting/ending), but it seems to predicting blanks or padding for many sentences. This might be because the model is learning the padding vectors since there are many padded values in spite of using a Masking layer.

In [82]:
print('Training Dataset')
for i in range(4):
  print('BLEU-',i+1,'score : ',calculate_bleu_score(pred_train, actual_train, i+1))
print('------------------------------------------')
print('Testing Dataset')
for i in range(4):
  print('BLEU-',i+1,'score : ',calculate_bleu_score(pred_test, actual_test, i+1))

Training Dataset
BLEU- 1 score :  0.20209220408368953
BLEU- 2 score :  0.06210313934670278
BLEU- 3 score :  0.00957548453267699
BLEU- 4 score :  1.1694909541403812e-79
------------------------------------------
Testing Dataset
BLEU- 1 score :  0.1201556595530594
BLEU- 2 score :  0.030899881546674505
BLEU- 3 score :  8.691037524673306e-105
BLEU- 4 score :  4.6092369025765457e-156
