In [0]:
!pip install tensorflow==2.0.0



In [0]:
import tensorflow as tf
print(tf.__version__)

2.0.0


In [0]:
import pandas as pd
import re
from string import digits
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
import os
import time

In [0]:
# Read the data
df = pd.read_table("/content/drive/My Drive/french_to_english.txt",names=['source', 'target', 'comments'])
df.head()

Unnamed: 0,source,target,comments
0,Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Salut !,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Hi.,Salut.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
3,Run!,Cours !,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
4,Run!,Courez !,CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [0]:
df = df.iloc[0:50000,:]

Data Cleaning

In [0]:
num_digits= str.maketrans('','', digits)
df.replace(to_replace=" +", value= " ",  inplace=True, regex=True)
df.replace(to_replace="([?.!,¿])", value="", inplace=True, regex=True )
df['source'] = df.apply(lambda x: x['source'].translate(num_digits) ,axis = 1)
df['source'] = df.apply(lambda x: x['source'].rstrip().strip() ,axis = 1)
df['target'] = df.apply(lambda x: x['target'].translate(num_digits) ,axis = 1)
df['target'] = df.apply(lambda x: x['target'].rstrip().strip() ,axis = 1)
df['source'] = 'start ' + df['source'].astype(str) + ' end'
df['target'] = 'start ' + df['target'].astype(str) + ' end'

In [0]:
# Cleaned data
df.head()

Unnamed: 0,source,target,comments
0,start Go end,start Va end,CC-BY 20 (France) Attribution: tatoebaorg #287...
1,start Hi end,start Salut end,CC-BY 20 (France) Attribution: tatoebaorg #538...
2,start Hi end,start Salut end,CC-BY 20 (France) Attribution: tatoebaorg #538...
3,start Run end,start Cours end,CC-BY 20 (France) Attribution: tatoebaorg #906...
4,start Run end,start Courez end,CC-BY 20 (France) Attribution: tatoebaorg #906...


In [0]:
df.drop(['comments'], axis=1, inplace=True)

In [0]:
# Finding maximum length of source text and target text for padding
source = list(df['source'].values)
target = list(df['target'].values)

In [0]:
length = list(map(lambda x: len(x.split()),source))
source_max_length = max(length)

In [0]:
length = list(map(lambda x: len(x.split()),target))
target_max_length = max(length)

In [0]:
# Converting input to sequence
source_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
source_tokenizer.fit_on_texts(source)
source_seq = source_tokenizer.texts_to_sequences(source)
source_seq = tf.keras.preprocessing.sequence.pad_sequences(source_seq,padding='post')

In [0]:
target_tokenizer= tf.keras.preprocessing.text.Tokenizer(filters='')
target_tokenizer.fit_on_texts(target)
target_seq = target_tokenizer.texts_to_sequences(target)
target_seq = tf.keras.preprocessing.sequence.pad_sequences(target_seq,padding='post' )

Creating training and testing data

In [0]:
source_train, source_test, target_train, target_test = train_test_split(source_seq, target_seq,test_size=0.2)

In [0]:
source_val_train, source_val_test, target_val_train, target_val_test = train_test_split(source_seq, target_seq, test_size=0.2)

To give a visual of how each word is mapped to a number

In [0]:
def Index_to_Word_mapping(lang, tensor):
  for t in tensor:
    if t!=0:
      print ("%d ---->%s" % (t, lang.index_word[t]))

In [0]:
print ("Source Language: index to word mapping")
Index_to_Word_mapping(source_tokenizer, source_train[0])
print ()
print ("Target Language: index to word mapping")
Index_to_Word_mapping( target_tokenizer, target_train[0])

Source Language: index to word mapping
1 ---->start
113 ---->leave
14 ---->this
9 ---->to
13 ---->me
2 ---->end

Target Language: index to word mapping
1 ---->start
603 ---->laissez
2 ---->end


Building tensorflow Encoder-Decoder Attention Model

In [0]:
# Creating variables needed for the tf model
buffer_size = len(source_train) # to create shuffled dataset with the corpus
batch_size = 64
steps_per_epoch = len(source_train)
embedding_vec_dim = 256
units = 1024
source_vocab_size = len(source_tokenizer.word_index)+1
target_vocab_size = len(target_tokenizer.word_index)+1

In [0]:
tf_df = tf.data.Dataset.from_tensor_slices((source_train, target_train)).shuffle(buffer_size)
tf_df = tf_df.batch(batch_size, drop_remainder=True)

**Encoder**

Input->Embedding_layer->GRU->hidden_layer

In [0]:
class Encoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

  def call(self, x, hidden):
    x = self.embedding(x)
    output, state = self.gru(x, initial_state = hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_sz, self.enc_units))

In [0]:
sample_source, sample_target = next(iter(tf_df))

In [0]:
encoder = Encoder(source_vocab_size, embedding_vec_dim, units, batch_size)

In [0]:
sample_hidden = encoder.initialize_hidden_state()

In [0]:
sample_output, sample_hidden = encoder(sample_source, sample_hidden)

Building Attention part 

In [0]:
class Attention(tf.keras.layers.Layer):
  def __init__(self, units):
    super(Attention, self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.V = tf.keras.layers.Dense(1)

  def call(self, query, values):
    hidden_with_time_axis = tf.expand_dims(query, 1)

    score = self.V(tf.nn.tanh(
        self.W1(values) + self.W2(hidden_with_time_axis)))

    attention_weights = tf.nn.softmax(score, axis=1)

    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector, axis=1)

    return context_vector, attention_weights

In [0]:
attention_layer = Attention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

Building Decoder Part

In [0]:
class Decoder(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(self.dec_units, return_sequences=True, return_state=True, recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)

    self.attention = Attention(self.dec_units)

  def call(self, x, hidden, enc_output):

    context_vector, attention_weights = self.attention(hidden, enc_output)
    x = self.embedding(x)
    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
    output, state = self.gru(x)
    output = tf.reshape(output, (-1, output.shape[2]))
    x = self.fc(output)

    return x, state, attention_weights

In [0]:
decoder = Decoder(target_vocab_size, embedding_vec_dim, units, batch_size)

sample_decoder_output, _, _ = decoder(tf.random.uniform((batch_size, 1)), sample_hidden, sample_output)

Defining Optimizers and Loss function

In [0]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

Checkpoints

While Training, if any error occurs at some iteration, then all the weights calculated till that iteration will be lost.

If we create checkpoints, we can store the weights by defining after how many epochs I need to save my model

In [0]:
checkpoint_dir = 'training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

Training

In [0]:
@tf.function # to avoid errors
def train_step(inp, targ, enc_hidden):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp, enc_hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([target_tokenizer.word_index['start']] * batch_size, 1)

    for t in range(1, targ.shape[1]):
      predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [32]:
import time
EPOCHS = 10

for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(tf_df.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_hidden)
    total_loss += batch_loss

  print('Epoch =', epoch + 1,' Batch =',batch, 'loss =',batch_loss.numpy())
   
      
  # saving (checkpoint) the model every 2 epochs
  # if (epoch + 1) % 2 == 0:
  checkpoint.save(file_prefix = checkpoint_prefix)

  print('Epoch', epoch + 1, 'Loss', (total_loss / steps_per_epoch))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch = 1  Batch = 624 loss = 1.1143413
Epoch 1 Loss tf.Tensor(0.02353998, shape=(), dtype=float32)
Time taken for 1 epoch 2792.7101757526398 sec

Epoch = 2  Batch = 624 loss = 0.7901312
Epoch 2 Loss tf.Tensor(0.014027792, shape=(), dtype=float32)
Time taken for 1 epoch 2670.784355163574 sec

Epoch = 3  Batch = 624 loss = 0.4438149
Epoch 3 Loss tf.Tensor(0.009164154, shape=(), dtype=float32)
Time taken for 1 epoch 2674.553356409073 sec

Epoch = 4  Batch = 624 loss = 0.4219109
Epoch 4 Loss tf.Tensor(0.0064113326, shape=(), dtype=float32)
Time taken for 1 epoch 2668.898288965225 sec

Epoch = 5  Batch = 624 loss = 0.30115694
Epoch 5 Loss tf.Tensor(0.004773234, shape=(), dtype=float32)
Time taken for 1 epoch 2648.8732085227966 sec

Epoch = 6  Batch = 624 loss = 0.23352095
Epoch 6 Loss tf.Tensor(0.0037679768, shape=(), dtype=float32)
Time taken for 1 epoch 2647.2204077243805 sec

Epoch = 7  Batch = 624 loss = 0.23210058
Epoch 7 Loss tf.Tensor(0.0031233924, shape=(), dtype=float32)
Time take

In [0]:
def preprocess_sentence(sentence):
    
    num_digits= str.maketrans('','', digits)
    
    sentence= sentence.lower()
    sentence= re.sub(" +", " ", sentence)
    sentence= re.sub("'", '', sentence)
    sentence= sentence.translate(num_digits)
    sentence= re.sub(r"([?.!,¿])", r" \1 ", sentence)
    sentence = sentence.rstrip().strip()
    sentence=  'start ' + sentence + ' end'
    
    return sentence

Testing

In [0]:
def Translate(sentence):

  sentence = preprocess_sentence(sentence)

  inputs = [source_tokenizer.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=source_max_length, padding='post')
  inputs = tf.convert_to_tensor(inputs)

  result = ''

  hidden = [tf.zeros((1, units))]
  enc_out, enc_hidden = encoder(inputs, hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([target_tokenizer.word_index['start']], 0)

  for t in range(target_max_length):
    predictions, dec_hidden, attention_weights = decoder(dec_input, dec_hidden, enc_out)

    predicted_id = tf.argmax(predictions[0]).numpy()

    result += target_tokenizer.index_word[predicted_id] + ' '

    if target_tokenizer.index_word[predicted_id] == 'end':
      return result, sentence

    # the predicted ID is fed back into the model
    dec_input = tf.expand_dims([predicted_id], 0)

  return result, sentence

In [58]:
result, sentence = Translate("I am going to sleep")
print(result)
print(sentence)

je vais dormir end 
start i am going to sleep end


Converting text to audio

In [52]:
!pip install gTTS

Collecting gTTS
  Downloading https://files.pythonhosted.org/packages/a1/0c/4ca77eca3b739a4a08360930643f58d714e302fee0d2f8c654e67d9af8e7/gTTS-2.1.1-py3-none-any.whl
Collecting gtts-token>=1.1.3
  Downloading https://files.pythonhosted.org/packages/e7/25/ca6e9cd3275bfc3097fe6b06cc31db6d3dfaf32e032e0f73fead9c9a03ce/gTTS-token-1.1.3.tar.gz
Building wheels for collected packages: gtts-token
  Building wheel for gtts-token (setup.py) ... [?25l[?25hdone
  Created wheel for gtts-token: filename=gTTS_token-1.1.3-cp36-none-any.whl size=4097 sha256=52e7a89b7acff648c2d4e64b63302fff693f753264ee777d64fe778382901fe2
  Stored in directory: /root/.cache/pip/wheels/dd/11/61/33f7e51bf545e910552b2255eead2a7cd8ef54064b46dceb34
Successfully built gtts-token
Installing collected packages: gtts-token, gTTS
Successfully installed gTTS-2.1.1 gtts-token-1.1.3


In [0]:
from gtts import gTTS 
import os

In [0]:
language = 'fr-FR'

In [0]:
speech = gTTS(result, lang = language, slow = False)

In [0]:
speech.save("text.mp3")

<a href="text.mp3"> Download File </a>