# Connecting to Google Drive

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# files opened later in this notebook are read from under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Packages & dependencies

In [None]:
from collections import Counter
import string
import tensorflow as tf
from tensorflow import keras
from keras.layers import TextVectorization , Embedding , Dense , TimeDistributed , LSTM , GRU , RNN , Bidirectional
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.losses import sparse_categorical_crossentropy
import numpy as np

# Importing Data files

In [None]:
# Read the English and French corpora as raw text.
# The encoding is passed explicitly so the accented French characters decode
# the same way whatever the runtime's locale default is
# (the files are presumably UTF-8 — TODO confirm).
DATA_DIR = "/content/drive/MyDrive/Datasets/language_translation"

with open(DATA_DIR + "/small_vocab_en", encoding="utf-8") as f :
  data_eng = f.read()

with open(DATA_DIR + "/small_vocab_fr", encoding="utf-8") as f :
  data_fr = f.read()

# Exploring Data

In [None]:
def print_stats(text,language) :
  """Print basic size statistics for a corpus.

  Args:
    text: the whole corpus as a single string.
    language: label inserted into each printed message.

  Prints three lines: the number of newline-separated lines, the number of
  whitespace-separated words, and the number of distinct words.
  """
  lines = text.split("\n")
  words = text.split()
  distinct_words = set(words)
  print("Number of lines in the {} dataset = {}".format(language,len(lines)))
  print("Number of words in the {} dataset = {}".format(language,len(words)))
  print("Number of unique words in the {} dataset = {}".format(language,len(distinct_words)))

In [None]:
# Report line / word / unique-word counts for the raw (not yet cleaned) corpora.
print_stats(data_eng,"english")
print_stats(data_fr,"french")

Number of lines in the english dataset = 137861
Number of words in the english dataset = 1823250
Number of unique words in the english dataset = 227
Number of lines in the french dataset = 137861
Number of words in the french dataset = 1961295
Number of unique words in the french dataset = 355


# Preprocessing

## 1-Cleaning texts


*   Lowercasing the text
*   Punctuation Removal





In [None]:
def clean_text(text) :
  """Return ``text`` lowercased with every ``string.punctuation`` character removed.

  Punctuation is deleted, not replaced by a space, so the words on either side
  of an apostrophe or hyphen are joined together.
  """
  strip_punctuation = str.maketrans('', '', string.punctuation)
  return text.lower().translate(strip_punctuation)

In [None]:
# Clean both corpora, then split them into lists with one sentence per element.
# NOTE(review): data_eng / data_fr are rebound from str to list here, so this
# cell cannot be re-run by itself (clean_text calls .lower(), which a list does
# not have) — re-run the data-loading cell first.
data_eng = clean_text(data_eng).split("\n")
data_fr = clean_text(data_fr).split("\n")

## 2-Tokenization

In [None]:
def tokenize(text) :
  """Fit a new Keras ``Tokenizer`` on ``text`` and encode ``text`` with it.

  Args:
    text: the sentences to fit on and to encode.

  Returns:
    A ``(sequences, tokenizer)`` pair: the encoded sentences as returned by
    ``texts_to_sequences`` and the fitted tokenizer.
  """
  fitted_tokenizer = Tokenizer()
  fitted_tokenizer.fit_on_texts(text)
  encoded = fitted_tokenizer.texts_to_sequences(text)
  return encoded , fitted_tokenizer

## 3-Padding

In [None]:
def pad(sequence , max_length = None ) :
  """Pad ``sequence`` at the end ("post") using Keras ``pad_sequences``.

  Args:
    sequence: the integer sequences to pad.
    max_length: forwarded as ``maxlen``; ``None`` leaves the target length to
      ``pad_sequences``.

  Returns:
    Whatever ``pad_sequences`` returns for these arguments.
  """
  padded = pad_sequences(sequence , padding = "post" , maxlen = max_length )
  return padded

# Preprocess Pipeline

## 1-Using the predefined functions

In [None]:
def preprocess(x, y ) :
  """Tokenize and pad the source sentences ``x`` and target sentences ``y``.

  Each side gets its own tokenizer and is padded independently. The padded
  targets also receive a trailing axis of size 1.

  Returns:
    ``(padded_x, padded_y_with_trailing_axis, x_tokenizer, y_tokenizer)``.
  """
  x_ids , x_tk = tokenize(x)
  y_ids , y_tk = tokenize(y)
  x_padded = pad(x_ids)
  y_padded = pad(y_ids)
  # Append a size-1 axis to the targets: (rows, steps) -> (rows, steps, 1).
  y_padded = y_padded.reshape(y_padded.shape + (1,))
  return x_padded , y_padded , x_tk , y_tk

In [None]:
# Run the Tokenizer-based pipeline: English is the model input, French the target.
preproc_english_sentences, preproc_french_sentences , english_tokenizer, french_tokenizer = preprocess(data_eng,data_fr)

In [None]:
# Padded sequence lengths (axis 1 of the padded arrays).
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab = english_tokenizer.word_index
french_vocab = french_tokenizer.word_index
# Tokenizer word ids start at 1 and id 0 is the padding value, so the largest
# id equals len(word_index). Layers sized by "vocabulary size" (Embedding
# input_dim, the softmax output width) need one more slot than the word count,
# otherwise the highest word id falls outside the layer.
english_vocab_size = len(english_vocab) + 1
french_vocab_size = len(french_vocab) + 1

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
# The printed figures are the number of distinct words (padding id excluded).
print("English vocabulary size:", len(english_vocab))
print("French vocabulary size:", len(french_vocab))

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 344


## 2- Using tokenization through "TextVectorization"

In [None]:
def text_vectorization(text_data,max_length) :
  """Encode ``text_data`` as integer ids with a ``TextVectorization`` layer.

  Args:
    text_data: the sentences used both to adapt the layer and to be encoded.
    max_length: passed as ``output_sequence_length`` of the layer.

  Returns:
    ``(encoded, vocabulary)``: the output of running the adapted layer over
    ``text_data`` and the layer's vocabulary from ``get_vocabulary()``.
  """
  vectorizer = tf.keras.layers.TextVectorization(output_sequence_length=max_length, output_mode='int')
  vectorizer.adapt(text_data)
  # The layer is wrapped in a one-layer Sequential so predict() runs it over the data.
  wrapper = tf.keras.models.Sequential()
  wrapper.add(vectorizer)
  encoded = wrapper.predict(text_data)
  vocabulary = vectorizer.get_vocabulary()
  return encoded , vocabulary

In [None]:
def get_max_padding_length(text) :
  """Return the word count of the longest sentence in ``text``.

  Args:
    text: iterable of sentence strings; words are whitespace-separated.

  Returns:
    The largest ``len(sentence.split())`` over all sentences, or 0 when
    ``text`` is empty.
  """
  # Uses the builtin max (the previous version shadowed it with a local);
  # default=0 keeps the empty-input result at 0.
  return max((len(item.split()) for item in text), default=0)

In [None]:
# Alternative pipeline: derive the padding lengths from the cleaned sentences
# and encode both corpora with TextVectorization.
# NOTE(review): this cell rebinds max_english_sequence_length,
# max_french_sequence_length, english_vocab_size and french_vocab_size, which
# the model-building cells below read. Whether this cell has been run therefore
# changes the layer sizes those models get — confirm that is intended.
max_english_sequence_length = get_max_padding_length(data_eng)
max_french_sequence_length = get_max_padding_length(data_fr)
preproc_english_sentences_2 , english_vocab_2 = text_vectorization(data_eng,max_english_sequence_length)
preproc_french_sentences_2 , french_vocab_2 = text_vectorization(data_fr,max_french_sequence_length)
english_vocab_size = len(english_vocab_2)
french_vocab_size = len(french_vocab_2)

In [None]:
# Give the targets a trailing axis of size 1, as preprocess() does for the
# Tokenizer-based targets. The previous call passed only the existing shape
# (the trailing `1` was missing), so it changed nothing. Slicing the shape to
# its first two axes keeps this cell safe to re-run.
preproc_french_sentences_2 = preproc_french_sentences_2.reshape(*preproc_french_sentences_2.shape[:2], 1)

In [None]:
# Report the sequence lengths and vocabulary sizes currently bound to these
# names (set by the TextVectorization cell above when run in order).
print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 201
French vocabulary size: 346


In [None]:
# Shape of the TextVectorization-encoded English data (displayed as cell output).
preproc_english_sentences_2.shape

(137861, 15)

In [None]:
# Shape of the TextVectorization-encoded French data (displayed as cell output).
preproc_french_sentences_2.shape

(137861, 21)

# Building Models

# Ids Back to Text

In [None]:
def logits_to_text(logits, tokenizer):
  """Decode per-step scores into a space-separated string of words.

  Args:
    logits: 2-D array; the argmax is taken along axis 1 to choose one id per row.
    tokenizer: object exposing ``word_index`` (word -> id mapping).

  Returns:
    The chosen words joined by single spaces; id 0 is rendered as ``<PAD>``.
  """
  id_to_word = {}
  for word , index in tokenizer.word_index.items():
    id_to_word[index] = word
  # Assigned last so id 0 always decodes to the padding marker.
  id_to_word[0] = "<PAD>"
  best_ids = np.argmax(logits, axis=1)
  words = [id_to_word[best_id] for best_id in best_ids]
  return ' '.join(words)

## Model 1: RNN

In [None]:
def simple_model(input_shape, french_vocab_size):
  """Build and compile the baseline GRU sequence model.

  Args:
    input_shape: full shape of the input array; the leading (batch) entry is
      dropped and the rest is used as the GRU's ``input_shape``.
    french_vocab_size: width of the final softmax layer.

  Returns:
    A compiled Sequential model: GRU(256, return_sequences=True) ->
    TimeDistributed Dense(1024, relu) -> Dropout(0.5) -> TimeDistributed
    Dense(french_vocab_size, softmax), trained with sparse categorical
    cross-entropy and Adam.
  """
  # Hyperparameter
  step_size = .005

  # Layers, in the order they are stacked
  stack = [
      GRU(256, return_sequences=True, input_shape=input_shape[1:]),
      TimeDistributed(Dense(1024, activation="relu")),
      keras.layers.Dropout(.5),
      TimeDistributed(Dense(french_vocab_size, activation="softmax")),
  ]
  network = keras.models.Sequential(stack)
  network.compile(optimizer=keras.optimizers.Adam(step_size),
                  loss=sparse_categorical_crossentropy,
                  metrics=["accuracy"])
  return network


# Pad the English ids to the French sequence length, then add a trailing
# feature axis of size 1 so the array is 3-D as the GRU input requires.
tmp_x = pad(preproc_english_sentences , max_french_sequence_length )
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2], 1))

# Build the simple model (training happens in the next cell)
simple_rnn_model  = simple_model(tmp_x.shape, french_vocab_size)

# summary() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
simple_rnn_model.summary()


Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 21, 256)           198912    
                                                                 
 time_distributed (TimeDistr  (None, 21, 1024)         263168    
 ibuted)                                                         
                                                                 
 dropout (Dropout)           (None, 21, 1024)          0         
                                                                 
 time_distributed_1 (TimeDis  (None, 21, 346)          354650    
 tributed)                                                       
                                                                 
Total params: 816,730
Trainable params: 816,730
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Train the simple model, with 20% of the samples set aside for validation,
# then save it.
# NOTE(review): the save path is spelled "sinple_rnn_model" — probably a typo
# for "simple_rnn_model"; left unchanged in case something loads it by this name.
simple_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=10, validation_split=0.2)
simple_rnn_model.save("sinple_rnn_model")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f639f530590>

In [None]:
# Decode the simple model's prediction for the first training sentence.
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

new jersey est parfois chaud en mois de il et il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>


In [None]:
# Show the simple model's prediction for the first sentence next to the
# reference French translation and the English source.
print("Prediction:")
print(logits_to_text(simple_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(data_fr[:1])

print("\nOriginal text:")
print(data_eng[:1])

Prediction:
new jersey est parfois chaud en mois de il et il est en en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
['new jersey est parfois calme pendant l automne  et il est neigeux en avril ']

Original text:
['new jersey is sometimes quiet during autumn  and it is snowy in april ']


## Model 2: Embedding (IMPLEMENTATION)

In [None]:
def embed_model(input_shape,english_vocab_size,max_french_sequence_length,french_vocab_size) :
  """Build and compile the GRU model with a learned embedding on the input.

  Args:
    input_shape: accepted for signature parity with the other builders; this
      function does not read it.
    english_vocab_size: ``input_dim`` of the Embedding layer.
    max_french_sequence_length: ``input_length`` of the Embedding layer.
    french_vocab_size: width of the final softmax layer.

  Returns:
    A compiled Sequential model: Embedding(300) -> GRU(256,
    return_sequences=True) -> TimeDistributed Dense(1024, relu) ->
    Dropout(0.5) -> TimeDistributed Dense(french_vocab_size, softmax).
  """
  # Hyperparameter
  step_size = .005

  # Layers, in the order they are stacked
  stack = [
      Embedding(english_vocab_size, 300, input_length=max_french_sequence_length),
      GRU(256, return_sequences=True),
      TimeDistributed(Dense(1024, activation="relu")),
      keras.layers.Dropout(.5),
      TimeDistributed(Dense(french_vocab_size, activation="softmax")),
  ]
  network = keras.models.Sequential(stack)
  network.compile(optimizer=keras.optimizers.Adam(step_size),
                  loss=sparse_categorical_crossentropy,
                  metrics=["accuracy"])
  return network

In [None]:
# Pad the English ids to the French sequence length. The embedding model takes
# 2-D integer input, so no trailing feature axis is added here.
tmp_x = pad(preproc_english_sentences , max_french_sequence_length)
tmp_x = tmp_x.reshape((-1,preproc_french_sentences.shape[-2]))

# Pass the array's shape, as the other model-building cells do, not the array
# itself. embed_model does not read this argument, so behaviour is unchanged.
embed_rnn_model = embed_model(tmp_x.shape,english_vocab_size,max_french_sequence_length,french_vocab_size)

In [None]:
# Train the embedding model, with 20% of the samples set aside for validation,
# then save it.
embed_rnn_model.fit(tmp_x,preproc_french_sentences , epochs = 30 , batch_size = 512 , validation_split= .2 )
embed_rnn_model.save("embed_rnn_model")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f63a1724bd0>

In [None]:
# Decode the embedding model's prediction for the first training sentence
# (the bare expression is displayed as the cell output).
logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0],french_tokenizer)

'new jersey est parfois calme en l automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [None]:
# Show the embedding model's prediction for the first sentence next to the
# reference French translation and the English source.
print("Prediction:")
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(data_fr[:1])

print("\nOriginal text:")
print(data_eng[:1])

Prediction:
new jersey est parfois calme en l automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
['new jersey est parfois calme pendant l automne  et il est neigeux en avril ']

Original text:
['new jersey is sometimes quiet during autumn  and it is snowy in april ']


## Model 3: Bidirectional RNNs

In [None]:
def bd_model(input_shape, french_vocab_size):
  """Build and compile the bidirectional GRU sequence model.

  Args:
    input_shape: full shape of the input array; the leading (batch) entry is
      dropped and the rest is used as the first layer's ``input_shape``.
    french_vocab_size: width of the final softmax layer.

  Returns:
    A compiled Sequential model: Bidirectional(GRU(256,
    return_sequences=True)) -> TimeDistributed Dense(1024, relu) ->
    Dropout(0.5) -> TimeDistributed Dense(french_vocab_size, softmax).
  """
  # Hyperparameter
  step_size = .003

  # Layers, in the order they are stacked
  stack = [
      Bidirectional(GRU(256, return_sequences=True), input_shape=input_shape[1:]),
      TimeDistributed(Dense(1024, activation="relu")),
      keras.layers.Dropout(.5),
      TimeDistributed(Dense(french_vocab_size, activation="softmax")),
  ]
  network = keras.models.Sequential(stack)
  network.compile(optimizer=keras.optimizers.Adam(step_size),
                  loss=sparse_categorical_crossentropy,
                  metrics=["accuracy"])
  return network


# Pad the English ids to the French sequence length and add a trailing feature
# axis of size 1, then build the bidirectional model from that shape.
tmp_x = pad(preproc_english_sentences , max_french_sequence_length )
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2],1))


bidirectional_model = bd_model(tmp_x.shape, french_vocab_size)

In [None]:
# Train the bidirectional model, with 20% of the samples set aside for
# validation, then save it.
bidirectional_model.fit(tmp_x, preproc_french_sentences, batch_size=512, epochs=30, validation_split=0.2)
bidirectional_model.save("bidirectional_model")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f63a0a830d0>

In [None]:
# Decode the bidirectional model's prediction for the first training sentence
# (the bare expression is displayed as the cell output).
logits_to_text(bidirectional_model.predict(tmp_x[:1])[0],french_tokenizer)

'new jersey est parfois calme pendant lautomne de l automne il il en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [None]:
# Show the bidirectional model's prediction for the first sentence next to the
# reference French translation and the English source.
print("Prediction:")
print(logits_to_text(bidirectional_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(data_fr[:1])

print("\nOriginal text:")
print(data_eng[:1])

Prediction:
new jersey est parfois calme pendant lautomne de l automne il il en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
['new jersey est parfois calme pendant l automne  et il est neigeux en avril ']

Original text:
['new jersey is sometimes quiet during autumn  and it is snowy in april ']


## Model 4: Encoder-Decoder

In [None]:
def encdec_model(input_shape, french_vocab_size , output_sequence_length):
  """Build and compile the GRU encoder-decoder model.

  Args:
    input_shape: full shape of the input array; the leading (batch) entry is
      dropped and the rest is used as the encoder's ``input_shape``.
    french_vocab_size: width of the final softmax layer.
    output_sequence_length: number of times the encoder output is repeated,
      i.e. the number of decoder steps.

  Returns:
    A compiled Sequential model: GRU(256, go_backwards=True) ->
    RepeatVector(output_sequence_length) -> GRU(256, return_sequences=True) ->
    TimeDistributed Dense(1024, relu) -> Dropout(0.5) -> TimeDistributed
    Dense(french_vocab_size, softmax).
  """
  # Hyperparameter
  step_size = .003

  # Encoder: a single vector per sentence, repeated once per output step
  encoder = [
      GRU(256, go_backwards=True, input_shape=input_shape[1:]),
      keras.layers.RepeatVector(output_sequence_length),
  ]
  # Decoder: one softmax over the French vocabulary per output step
  decoder = [
      GRU(256, return_sequences=True),
      TimeDistributed(Dense(1024, activation="relu")),
      keras.layers.Dropout(.5),
      TimeDistributed(Dense(french_vocab_size, activation="softmax")),
  ]
  network = keras.models.Sequential(encoder + decoder)

  # Compile model
  network.compile(optimizer=keras.optimizers.Adam(step_size),
                  loss=sparse_categorical_crossentropy,
                  metrics=["accuracy"])
  return network


# Pad the English ids to the French sequence length and add a trailing feature
# axis of size 1, then build the encoder-decoder model and print its summary.
tmp_x = pad(preproc_english_sentences , max_french_sequence_length )
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2],1))


encdec_rnn_model  = encdec_model(tmp_x.shape, french_vocab_size , max_french_sequence_length )
encdec_rnn_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 256)               198912    
                                                                 
 repeat_vector (RepeatVector  (None, 21, 256)          0         
 )                                                               
                                                                 
 gru_1 (GRU)                 (None, 21, 256)           394752    
                                                                 
 time_distributed (TimeDistr  (None, 21, 1024)         263168    
 ibuted)                                                         
                                                                 
 dropout (Dropout)           (None, 21, 1024)          0         
                                                                 
 time_distributed_1 (TimeDis  (None, 21, 344)          3

In [None]:
# Train the encoder-decoder model, with 20% of the samples set aside for
# validation, then save it.
encdec_rnn_model.fit(tmp_x, preproc_french_sentences, batch_size=1024, epochs=30, validation_split=0.2)
encdec_rnn_model.save("encdec_rnn_model")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7fef287b7f50>

In [None]:
# Decode the encoder-decoder model's prediction for the first training sentence
# (the bare expression is displayed as the cell output).
logits_to_text(encdec_rnn_model.predict(tmp_x[:1])[0],french_tokenizer)

'new jersey est parfois calme au cours et il est il est avril en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [None]:
# Show the encoder-decoder model's prediction for the first sentence next to
# the reference French translation and the English source.
print("Prediction:")
print(logits_to_text(encdec_rnn_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(data_fr[:1])

print("\nOriginal text:")
print(data_eng[:1])

Prediction:
new jersey est parfois calme au cours et il est il est avril en <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
['new jersey est parfois calme pendant l automne  et il est neigeux en avril ']

Original text:
['new jersey is sometimes quiet during autumn  and it is snowy in april ']


## Model 5: Custom ( Using all of the above models )

In [None]:
def model_final(input_shape, english_vocab_size , french_vocab_size , output_sequence_length):
  """Build and compile the combined embedding + bidirectional encoder-decoder model.

  Args:
    input_shape: full shape of the 2-D input array; entry 1 is used as the
      Embedding ``input_length`` and everything after the batch entry as its
      ``input_shape``.
    english_vocab_size: ``input_dim`` of the Embedding layer.
    french_vocab_size: width of the final softmax layer.
    output_sequence_length: number of times the encoder output is repeated,
      i.e. the number of decoder steps.

  Returns:
    A compiled Sequential model: Embedding(300) -> Bidirectional(GRU(256,
    go_backwards=True)) -> RepeatVector(output_sequence_length) ->
    Bidirectional(GRU(256, return_sequences=True)) -> TimeDistributed
    Dense(1024, relu) -> Dropout(0.5) -> TimeDistributed
    Dense(french_vocab_size, softmax).
  """
  # Hyperparameter
  step_size = .003

  # Encoder: embed the ids, summarise the sentence, repeat once per output step
  encoder = [
      keras.layers.Embedding(english_vocab_size, 300, input_length=input_shape[1], input_shape=input_shape[1:]),
      Bidirectional(GRU(256, go_backwards=True)),
      keras.layers.RepeatVector(output_sequence_length),
  ]
  # Decoder: one softmax over the French vocabulary per output step
  decoder = [
      Bidirectional(GRU(256, return_sequences=True)),
      TimeDistributed(Dense(1024, activation="relu")),
      keras.layers.Dropout(.5),
      TimeDistributed(Dense(french_vocab_size, activation="softmax")),
  ]
  network = keras.models.Sequential(encoder + decoder)

  # Compile model
  network.compile(optimizer=keras.optimizers.Adam(step_size),
                  loss=sparse_categorical_crossentropy,
                  metrics=["accuracy"])
  return network

In [None]:
# Pad the English ids to the French sequence length (kept 2-D because the
# model starts with an Embedding layer), then build the final model.
tmp_x = pad(preproc_english_sentences , max_french_sequence_length )
tmp_x = tmp_x.reshape((-1,preproc_french_sentences.shape[1]))

final_model = model_final( tmp_x.shape ,english_vocab_size , french_vocab_size , max_french_sequence_length )

In [None]:
# Train the final model, with 20% of the samples set aside for validation.
final_model.fit(tmp_x , preproc_french_sentences , validation_split= .2 , epochs = 20 , batch_size = 512 )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fef289d9510>

In [None]:
# Save the trained final model under the path "final_model".
final_model.save("final_model") 



In [None]:
# Decode the final model's prediction for the first training sentence
# (the bare expression is displayed as the cell output).
logits_to_text(final_model.predict(tmp_x[:1])[0],french_tokenizer)

'new jersey est parfois calme pendant l automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>'

In [None]:
# Show the final model's prediction for the first sentence next to the
# reference French translation and the English source.
print("Prediction:")
print(logits_to_text(final_model.predict(tmp_x[:1])[0], french_tokenizer))

print("\nCorrect Translation:")
print(data_fr[:1])

print("\nOriginal text:")
print(data_eng[:1])

Prediction:
new jersey est parfois calme pendant l automne et il est neigeux en avril <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
['new jersey est parfois calme pendant l automne  et il est neigeux en avril ']

Original text:
['new jersey is sometimes quiet during autumn  and it is snowy in april ']


# Making Predictions with the final model

In [None]:
def translate(text) :
  """Translate an English sentence with the final model and print the result.

  Args:
    text: English sentence; it is cleaned with ``clean_text`` (lowercased,
      punctuation removed) in the same way as the training data.

  Prints the decoded French words, with ``<PAD>`` for padding positions.
  Returns nothing.
  """
  # texts_to_sequences encodes with the fitted vocabulary and skips words that
  # are not in it; indexing word_index directly raised KeyError on such words.
  sequence = english_tokenizer.texts_to_sequences([clean_text(text)])
  # Pad to the sequence length the model was built with, not a literal 21.
  input_length = final_model.input_shape[1]
  padded_sequence =  pad_sequences(sequence , maxlen = input_length , padding = "post")
  # Predict with final_model: this section is about the final model, and its
  # Embedding input matches the 2-D integer ids built here.
  print(logits_to_text(final_model.predict(padded_sequence)[0], french_tokenizer))

In [None]:
# Try the translator on a sentence that is not taken from the dataset lines shown above.
translate("he saw a old yellow truck")

il a pas un camion voiture <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
