#### Name: MansiMrugen Shah
#### NetID: ws2865

### Build a character-level Seq2Seq language translation model for a language pair of your choice.

In English, every word is built from just 26 characters (52 if upper and lower case are counted separately, and even more if special characters are included). With character embeddings, a vector can be formed for every single word, even for words that are not in the vocabulary. Word embeddings, on the other hand, can only handle words that have been seen during training.
Character embedding fits better for misspelt words and new words.

Character embedding also handles infrequent words better than word embedding, because the latter gets too few training examples for those rare words.
Another benefit is that, because the character vector is smaller than a word-embedding vector, it reduces model complexity and improves performance (in terms of speed).

### English to Marathi language translation

In [0]:
# Import Libraries
from __future__ import print_function
import string
import matplotlib.pyplot as plt
from string import digits
%matplotlib inline
import re
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

### Load the dataset and set other parameters

In [0]:
# --- Data source ------------------------------------------------------------
data_path = '/content/mar.txt'  # tab-separated English / Marathi sentence pairs
num_samples = 38000             # upper bound on the number of pairs read

# --- Model and training hyperparameters -------------------------------------
latent_dim = 256                # size passed to both LSTM layers
batch_size = 64                 # samples per gradient update
epochs = 120                    # passes over the training data

### Clean the data by removing punctuation and digits and converting to lower case

In [0]:
# ASCII punctuation and ASCII digits are stripped from every sentence.
exclude = set(string.punctuation)
remove_digits = str.maketrans('', '', digits)
_strip_table = str.maketrans('', '', string.punctuation + digits)


def _clean_text(text):
    """Lower-case one sentence and strip apostrophes, punctuation and digits.

    Commas are first rewritten to the literal marker ' COMMA' so that they
    survive the punctuation strip.
    NOTE(review): in a character-level model this marker is spelled out as
    the individual letters C-O-M-M-A; confirm that this is intended.
    """
    text = text.lower().replace("'", '').replace(",", ' COMMA')
    return text.translate(_strip_table)


Eng_texts = []
Mar_texts = []
Eng_characters = set()
Mar_characters = set()

with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

# The final element of `lines` is excluded (len(lines) - 1), matching a file
# that ends with a trailing newline.
for line in lines[: min(num_samples, len(lines) - 1)]:
    fields = line.split('\t')
    if len(fields) < 2:
        # Blank or malformed line: there is no sentence pair to extract.
        continue
    # Only the first two columns are used; any further columns are ignored.
    Eng_text = _clean_text(fields[0])
    # '\t' marks the start and '\n' the end of every target sentence.
    Mar_text = '\t' + _clean_text(fields[1]) + '\n'
    Eng_texts.append(Eng_text)
    Mar_texts.append(Mar_text)
    Eng_characters.update(Eng_text)
    Mar_characters.update(Mar_text)

In [20]:
# Freeze the character sets into sorted lists so that every character has a
# stable position, then derive the sizes that shape the one-hot tensors.
Eng_characters = sorted(Eng_characters)
Mar_characters = sorted(Mar_characters)
num_encoder_tokens, num_decoder_tokens = len(Eng_characters), len(Mar_characters)
max_encoder_seq_length = max(len(txt) for txt in Eng_texts)
max_decoder_seq_length = max(len(txt) for txt in Mar_texts)

# Summary of the prepared data set.
for label, value in (
        ('Number of samples:', len(Eng_texts)),
        ('Number of unique input tokens:', num_encoder_tokens),
        ('Number of unique output tokens:', num_decoder_tokens),
        ('Max sequence length for inputs:', max_encoder_seq_length),
        ('Max sequence length for outputs:', max_decoder_seq_length),
):
    print(label, value)

Number of samples: 38000
Number of unique input tokens: 37
Number of unique output tokens: 98
Max sequence length for inputs: 61
Max sequence length for outputs: 85


### Create a dictionary for English and Marathi characters

In [0]:

# Map every character to its column in the one-hot encoding.
input_token_index = {char: idx for idx, char in enumerate(Eng_characters)}
target_token_index = {char: idx for idx, char in enumerate(Mar_characters)}

# Pre-allocate zero-filled (sample, timestep, token) tensors; the next cell
# sets the one-hot entries.
n_pairs = len(Eng_texts)
decoder_shape = (n_pairs, max_decoder_seq_length, num_decoder_tokens)

encoder_input_data = np.zeros(
    (n_pairs, max_encoder_seq_length, num_encoder_tokens), dtype='float32')
decoder_input_data = np.zeros(decoder_shape, dtype='float32')
decoder_target_data = np.zeros(decoder_shape, dtype='float32')

### One-hot encoding

In [0]:

# Every timestep past the end of a sentence is filled with the space
# character, which serves as padding.
enc_pad = input_token_index[' ']
dec_pad = target_token_index[' ']

for i, (Eng_text, Mar_text) in enumerate(zip(Eng_texts, Mar_texts)):
    for t, char in enumerate(Eng_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.
    # Pad from len(Eng_text) rather than from the loop variable `t`: when the
    # sentence is empty the loop body never runs and `t` would still hold a
    # value from an earlier loop (or be undefined on a fresh kernel).
    encoder_input_data[i, len(Eng_text):, enc_pad] = 1.

    for t, char in enumerate(Mar_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.
        if t > 0:
            # decoder_target_data is decoder_input_data shifted one timestep
            # to the left, so it does not contain the first character.
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.
    decoder_input_data[i, len(Mar_text):, dec_pad] = 1.
    # The target holds len(Mar_text) - 1 characters, so its padding starts
    # one step earlier; max() keeps the start index non-negative.
    decoder_target_data[i, max(len(Mar_text) - 1, 0):, dec_pad] = 1.

### Create the model

In [0]:


# Encoder: consumes the one-hot source sequence. Only its two returned LSTM
# states are passed on to the decoder.
encoder_inputs = Input(shape=(None, num_encoder_tokens))
encoder = LSTM(latent_dim, dropout=0.3, return_state=True)
encoder_outputs, *encoder_states = encoder(encoder_inputs)
state_h, state_c = encoder_states

# Decoder: starts from the encoder states and returns an output for every
# timestep of the target sequence it is fed.
decoder_inputs = Input(shape=(None, num_decoder_tokens))
decoder_lstm = LSTM(
    latent_dim,
    dropout=0.3,
    return_sequences=True,
    return_state=True,
)
# The decoder's own returned states are not needed while training.
decoder_hidden_seq = decoder_lstm(
    decoder_inputs, initial_state=encoder_states)[0]

# Softmax over the target characters at every timestep.
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_hidden_seq)

### Compile and fit model

In [24]:

# Training model: (source sequence, decoder input sequence) -> per-timestep
# softmax over the target characters.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# NOTE(review): Keras documents validation_split as taking the *last*
# fraction of the samples, before any shuffling. If the data file is ordered
# (e.g. by sentence length), the validation set is not a random sample.
# Confirm whether that is acceptable here.
training_inputs = [encoder_input_data, decoder_input_data]
history = model.fit(
    training_inputs,
    decoder_target_data,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)


Epoch 1/120
Epoch 2/120
Epoch 3/120
Epoch 4/120
Epoch 5/120
Epoch 6/120
Epoch 7/120
Epoch 8/120
Epoch 9/120
Epoch 10/120
Epoch 11/120
Epoch 12/120
Epoch 13/120
Epoch 14/120
Epoch 15/120
Epoch 16/120
Epoch 17/120
Epoch 18/120
Epoch 19/120
Epoch 20/120
Epoch 21/120
Epoch 22/120
Epoch 23/120
Epoch 24/120
Epoch 25/120
Epoch 26/120
Epoch 27/120
Epoch 28/120
Epoch 29/120
Epoch 30/120
Epoch 31/120
Epoch 32/120
Epoch 33/120
Epoch 34/120
Epoch 35/120
Epoch 36/120
Epoch 37/120
Epoch 38/120
Epoch 39/120
Epoch 40/120
Epoch 41/120
Epoch 42/120
Epoch 43/120
Epoch 44/120
Epoch 45/120
Epoch 46/120
Epoch 47/120
Epoch 48/120
Epoch 49/120
Epoch 50/120
Epoch 51/120
Epoch 52/120
Epoch 53/120
Epoch 54/120
Epoch 55/120
Epoch 56/120
Epoch 57/120
Epoch 58/120
Epoch 59/120
Epoch 60/120
Epoch 61/120
Epoch 62/120
Epoch 63/120
Epoch 64/120
Epoch 65/120
Epoch 66/120
Epoch 67/120
Epoch 68/120
Epoch 69/120
Epoch 70/120
Epoch 71/120
Epoch 72/120
Epoch 73/120
Epoch 74/120
Epoch 75/120
Epoch 76/120
Epoch 77/120
Epoch 78

### Maximum validation accuracy

In [25]:
# Highest validation accuracy recorded in any epoch of the training run.
best_val_accuracy = max(history.history['val_accuracy'])
print("Maximum accuracy of validation: ", best_val_accuracy)

Maximum accuracy of validation:  0.8262848258018494


In [0]:

# Inference-time models. They are built from the same layer objects as the
# training model above.

# Encoder model: source sequence -> [state_h, state_c].
encoder_model = Model(encoder_inputs, encoder_states)

# Decoder model: (decoder input, incoming states) -> (character
# probabilities, outgoing states), so the states can be fed back in on the
# next call.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_states_inputs],
    outputs=[decoder_outputs, *decoder_states])

# Index -> character tables, used to turn predicted indices back into text.
reverse_input_char_index = {
    idx: char for char, idx in input_token_index.items()}
reverse_target_char_index = {
    idx: char for char, idx in target_token_index.items()}


### Predictions

In [0]:


def decode_sequence(input_seq):
    """Greedily decode one encoded source sentence, one character at a time.

    Args:
        input_seq: Encoded source sentence, passed unchanged to
            ``encoder_model.predict``. The notebook passes a one-sample
            slice of ``encoder_input_data``.

    Returns:
        The decoded target string. Decoding stops once the newline
        end-of-sequence character has been produced (it is included in the
        result) or once the result is longer than
        ``max_decoder_seq_length`` characters.
    """
    # verbose=0: predict runs once per generated character, so per-call
    # progress output would flood the cell output.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Start decoding from the start-of-sequence character '\t'.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, target_token_index['\t']] = 1.

    decoded_chars = []
    while True:
        # Unpacking with * accepts the states as either a list or a tuple.
        output_tokens, h, c = decoder_model.predict(
            [target_seq, *states_value], verbose=0)

        # Greedy choice: take the most probable character at the last step.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_chars.append(sampled_char)

        if (sampled_char == '\n' or
                len(decoded_chars) > max_decoder_seq_length):
            break

        # Feed the chosen character and the updated states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return ''.join(decoded_chars)



### Inference Samples

In [28]:

# Decode a fixed range of the encoded sentences and print each prediction
# next to its source sentence and reference translation.
first_sample, stop_sample = 53, 100

for seq_index in range(first_sample, stop_sample):
    # Slice instead of indexing so the leading sample axis (size 1) is kept.
    input_seq = encoder_input_data[seq_index:seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)

    print('-')
    print('Input sentence:', Eng_texts[seq_index])
    print('Actual sentence:', Mar_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)




-
Input sentence: we won
Actual sentence: 	आम्ही जिंकलो

Decoded sentence: आपण हसलो

-
Input sentence: we won
Actual sentence: 	आपण जिंकलो

Decoded sentence: आपण हसलो

-
Input sentence: why me
Actual sentence: 	मीच का

Decoded sentence: मी का आले

-
Input sentence: why me
Actual sentence: 	मी का

Decoded sentence: मी का आले

-
Input sentence: ask tom
Actual sentence: 	टॉमला विचार

Decoded sentence: टॉमला बोलवा

-
Input sentence: ask tom
Actual sentence: 	टॉमला विचारा

Decoded sentence: टॉमला बोलवा

-
Input sentence: call me
Actual sentence: 	मला बोलव

Decoded sentence: मला बोलवा

-
Input sentence: call me
Actual sentence: 	मला बोलवा

Decoded sentence: मला बोलवा

-
Input sentence: call me
Actual sentence: 	मला फोन करा

Decoded sentence: मला बोलवा

-
Input sentence: call me
Actual sentence: 	मला फोन कर

Decoded sentence: मला बोलवा

-
Input sentence: call us
Actual sentence: 	आम्हाला फोन कर

Decoded sentence: आम्हाला बोलवा

-
Input sentence: call us
Actual sentence: 	आम्हाला फोन करा

Deco