#Seq2Seq Machine Translation

In [24]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from sklearn.model_selection import train_test_split
from tensorflow import keras
import numpy as np
from tensorflow.keras.models import load_model
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Function to convert source tokens into lists of complete sentences  

In [25]:
def conv_source_sentences(data):
  sentence = []
  sentences=[]
  for line in data:
    token = line.rstrip("\n")
    if token == '<s>':
      sentence = []
      sen=''
    elif token=='</s>':
      sen+=' '.join(sentence)
      sentences.append(sen)
    else:
      sentence.append(token)
  return sentences

Function to convert target tokens into list of complete sentences 

In [26]:
def conv_target_sentences(data):
  sentence = []
  sentences=[]
  for line in data:
    token = line.rstrip("\n")
    if token == '<s>':
      sentence = []
      sen='\t'
    elif token=='</s>':
      sen+=' '.join(sentence)
      sen+='\n'
      sentences.append(sen)
    else:
      sentence.append(token)
  return sentences

List of Source/Target Characters Function

In [27]:
def inp_out_characters(source_sentences,target_sentences):
  input_characters = set()
  target_characters = set()
  for sentence in source_sentences:
      for char in sentence:
          if char not in input_characters:
              input_characters.add(char)

  for sentence in target_sentences:
      for char in sentence:
          if char not in target_characters:
              target_characters.add(char)
  input_characters = sorted(list(input_characters))
  target_characters = sorted(list(target_characters))
  return input_characters,target_characters

Dataset Stats Function

In [28]:
def data_stats(input_characters,target_characters,source_sentences,target_sentences): 
  num_encoder_tokens = len(input_characters) 
  num_decoder_tokens = len(target_characters)
  max_encoder_seq_length = max([len(txt) for txt in source_sentences])
  max_decoder_seq_length = max([len(txt) for txt in target_sentences])
  return num_encoder_tokens,num_decoder_tokens,max_encoder_seq_length,max_decoder_seq_length


Read Data

In [29]:
train_source=open('/content/drive/MyDrive/Colab Notebooks/train-source.txt','r',encoding = "UTF-8").readlines()
test_source=open('/content/drive/MyDrive/Colab Notebooks/train-target.txt','r',encoding = "UTF-8").readlines()

In [30]:
source_sentences=conv_source_sentences(train_source)
target_sentences=conv_target_sentences(test_source)
print("Source Sentences: \n",source_sentences[:1])
print("Target Sentences: \n",target_sentences[:1])

Source Sentences: 
 ['Cinnte go leór , thiocfadh dóbhtha bás a fhagháil ar imeall an phuill udaí .']
Target Sentences: 
 ['\tCinnte go leor , thiocfadh dóibh bás a fháil ar imeall an phoill úd .\n']


In [31]:
input_characters,target_characters=inp_out_characters(source_sentences,target_sentences)
input_token_index = dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index = dict([(char, i) for i, char in enumerate(target_characters)])

In [32]:
num_encoder_tokens,num_decoder_tokens,max_encoder_seq_length,max_decoder_seq_length=data_stats(input_characters,target_characters,source_sentences,target_sentences)
print("Number of samples:", len(source_sentences))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

Number of samples: 45171
Number of unique input tokens: 107
Number of unique output tokens: 96
Max sequence length for inputs: 1190
Max sequence length for outputs: 1115


Train/Validation Split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(source_sentences, target_sentences, test_size=0.1, random_state=42)

In [34]:
def generate_batch(X = X_train, y = y_train, batch_size = 128) :
  #' Generate a batch of data 
  while True:
    for j in range(0, len(X), batch_size):
      encoder_input_data = np.zeros((batch_size, max_encoder_seq_length), dtype='float32')
      decoder_input_data = np.zeros((batch_size, max_decoder_seq_length) , dtype= 'float32')
      decoder_target_data = np.zeros((batch_size, max_decoder_seq_length, num_decoder_tokens), dtype='float32')
      for i,(input_text, target_text) in enumerate(zip (X[j:j+batch_size], y[j:j+batch_size])):
        #print(input_text)
        for t, word in enumerate(input_text):
          #print(word)
          encoder_input_data[i, t] = input_token_index[word] # encoder input seg
        for t, word in enumerate(target_text):
          if t<len(target_text)-1:
            decoder_input_data[i, t] = target_token_index[word] # decoder input seq
          if t>0:
            # decoder target sequence (one hot encoded)
            # does not include the sTART token
            # Offset by one timestep
            decoder_target_data[i, t - 1, target_token_index [word]] = 1.
      yield([encoder_input_data, decoder_input_data], decoder_target_data)

Encoder Architecture

In [35]:
# Define an input sequence and process it.
latent_dim=256
encoder_inputs = keras.Input(shape=(None,))
enc_emb=Embedding(num_encoder_tokens,latent_dim,mask_zero=True)(encoder_inputs)
encoder_lstm = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]



Decoder Architecture

In [36]:

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None,))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
dec_emb_layer=Embedding(num_decoder_tokens,latent_dim,mask_zero=True)
dec_emb=dec_emb_layer(decoder_inputs)
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)


In [23]:
train_samples = len(X_train)
val_samples = len(X_test)
batch_size = 400
epochs = 20
#print(val_samples//batch_size)

Generator function to iteratively pull data and fit the model

In [24]:
model.fit_generator(generator = generate_batch(X_train, y_train, batch_size = batch_size),
                    steps_per_epoch = train_samples//batch_size,
                    epochs=epochs,
                    validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
                    validation_steps = val_samples//batch_size)

  """


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f1dca7f8050>

In [25]:
model.save_weights('/content/drive/MyDrive/Colab Notebooks/machine_translation_model.h5')

In [37]:
model.load_weights('/content/drive/MyDrive/Colab Notebooks/machine_translation_model.h5')

In [38]:
encoder_model = keras.Model(encoder_inputs, encoder_states)

#decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2=dec_emb_layer(decoder_inputs)
#decoder_lstm = model.layers[3]

decoder_outputs2, state_h_dec, state_c_dec = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h_dec, state_c_dec]
#decoder_dense = model.layers[4]
decoder_outputs2 = decoder_dense(decoder_outputs2)

decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2
)



In [39]:
# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())

Decode Sequence Function

In [40]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    #print("state value predictions: ",states_value)
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0]= target_token_index['\t']

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True
            print("Inside break cond")

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] =sampled_token_index

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [41]:
train_gen=generate_batch(X_train,y_train,batch_size=1)
k=-1

In [42]:
X_train[0]

'Cé go rabh siad ag bogadaigh go scaolmhar níor theich siad go rabh mé fá leath-duisín slat dóbhtha .'

In [43]:
y_train[0]

'\tCé go raibh siad ag bogadaigh go scaollmhar níor theith siad go raibh mé fá leathdhoisín slat dóibh .\n'

In [47]:
k+=1
print(k)
(input_seq,actual_output),_=next(train_gen)
decoded_sentence=decode_sequence(input_seq)
print('Input Source sentence:', X_train[k:k+1])
print('Actual Target Translation:', y_train[k:k+1])
print('Predicted Target Translation:', decoded_sentence)

2
Inside break cond
Input Source sentence: ['Ag Droichead an Sceárdáin sceinn sí trasna an bhealaigh mhóir comh gasta le cairr-fhiadh .']
Actual Target Translation: ['\tAg Droichead an Scairdeáin scinn sí trasna an bhealaigh mhóir chomh gasta le carria .\n']
Predicted Target Translation: Is iomaí an t-am sin agus an t-am sin agus an t-am sin agus an t-am sin agus an t-am .

