<a href="https://colab.research.google.com/github/Georgemburu/MACHINE-LEARNING/blob/master/(SPANISH_to_ENGLISH_)translation_revision_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# MACHINE LEARNING TRANSLATION WITH ATTENTION

In [1]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
from __future__ import absolute_import, print_function, division,unicode_literals
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import unicodedata
import re
import time

In [3]:
path_to_zip = tf.keras.utils.get_file(
    'spa-eng.zip',
    origin = 'http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip',
    extract=True
)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [4]:
os.listdir(os.path.dirname(path_to_zip)+'/spa-eng')

['spa.txt', '_about.txt']

In [0]:
path_to_file = os.path.join(os.path.dirname(path_to_zip),'spa-eng/spa.txt')

In [6]:
pd.read_csv(path_to_file,delimiter='\t')

Unnamed: 0,Go.,Ve.
0,Go.,Vete.
1,Go.,Vaya.
2,Go.,Váyase.
3,Hi.,Hola.
4,Run!,¡Corre!
...,...,...
118958,There are four main causes of alcohol-related ...,Hay cuatro causas principales de muertes relac...
118959,There are mothers and fathers who will lie awa...,Hay madres y padres que se quedan despiertos d...
118960,A carbon footprint is the amount of carbon dio...,Una huella de carbono es la cantidad de contam...
118961,Since there are usually multiple websites on a...,Como suele haber varias páginas web sobre cual...


In [0]:
def unicode_to_ascii(s):
  return ''.join(c for c in unicodedata.normalize('NFD',s)
    if unicodedata.category(c)!='Mn')

In [0]:
def preprocess_sentence(sentence):
  s = unicode_to_ascii(sentence.lower().strip())
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)

  # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
  s = re.sub(r"[^a-zA-Z?.!,¿]+", " ", s)

  s = s.rstrip().strip()

  # adding a start and an end token to the sentence
  # so that the model know when to start and stop predicting.
  s = '<start> ' + s + ' <end>'
  return s


In [0]:
def load_dataset(filepath,num_examples=None):
  lines = open(filepath,'r',encoding ='utf-8').read().strip().split('\n')
  # data = []
  inp = []
  targ = []
  for line in lines:
    inp_targ_arr =  line.split('\t')
    inp.append(preprocess_sentence(inp_targ_arr[1]))
    targ.append(preprocess_sentence(inp_targ_arr[0]))
    inp_targ_arr = None
  return inp,targ




In [10]:
inp,targ = load_dataset(path_to_file,10)
print(inp[-1])
print(targ[-1])

<start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>
<start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>


In [0]:
# Tokenize
def tokenize(lang):
  lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)
  tensors = lang_tokenizer.texts_to_sequences(lang)
  # Padding
  tensors = tf.keras.preprocessing.sequence.pad_sequences(tensors,padding='post')
  return tensors, lang_tokenizer
  


In [12]:
inp_tensor, inp_word_index = tokenize(inp)
inp_tensor[0]

array([  1, 364,   3,   2,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0], dtype=int32)

In [0]:
def max_len(tensor):
  return max(len(t) for t in tensor)

In [14]:
input_tensor_max_len = max_len(inp_tensor)
input_tensor_max_len

53

In [15]:
targ_tensor, targ_word_index = tokenize(targ)
targ_tensor[0]

array([ 1, 49,  3,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
      dtype=int32)

In [16]:
targ_tensor_max_len = max_len(targ_tensor)
targ_tensor_max_len

51

In [0]:
# Create dataset
BUFFER_SIZE = 3000
BATCH_SIZE = 64
dataset = tf.data.Dataset.from_tensor_slices((inp_tensor,targ_tensor)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(64, drop_remainder=True)

In [18]:
for sample_inp, sample_targ in dataset.take(1):
  print(sample_inp, sample_targ)

tf.Tensor(
[[    1  7609   633 ...     0     0     0]
 [    1    12     4 ...     0     0     0]
 [    1    40  4580 ...     0     0     0]
 ...
 [    1    38    98 ...     0     0     0]
 [    1 14770    57 ...     0     0     0]
 [    1  3675    20 ...     0     0     0]], shape=(64, 53), dtype=int32) tf.Tensor(
[[  1 678  46 ...   0   0   0]
 [  1  29  15 ...   0   0   0]
 [  1   4  40 ...   0   0   0]
 ...
 [  1   4 101 ...   0   0   0]
 [  1 222  23 ...   0   0   0]
 [  1 355  14 ...   0   0   0]], shape=(64, 51), dtype=int32)


In [0]:
# Create the encoder
class Encoder(tf.keras.Model):
  def __init__(self,vocab_size,embedding_dim,enc_units,batch_size):
    super(Encoder,self).__init__()
    self.enc_units = enc_units
    self.batch_size = batch_size
    self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dim)
    self.gru = tf.keras.layers.GRU(enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')
    
  def call(self,x,hidden):
    x = self.embedding(x)
    output,state = self.gru(x, initial_state=hidden)
    return output, state

  def initialize_hidden_state(self):
    return tf.zeros((self.batch_size,self.enc_units))

In [0]:
# Attention layer
class AttentionLayer(tf.keras.layers.Layer):
  def __init__(self,units):
    super(AttentionLayer,self).__init__()
    self.W1 = tf.keras.layers.Dense(units)
    self.W2 = tf.keras.layers.Dense(units)
    self.FC = tf.keras.layers.Dense(1)
    
  def call(self, query, values):
    # hidden_with_time_axis
    hidden_with_time_axis = tf.expand_dims(query,1)
    # score
    score = self.FC(
        tf.tanh(self.W1(values)+ self.W2(hidden_with_time_axis))
    )
    # attention weights
    attention_weights = tf.nn.softmax(score,axis=1)

    # context vector
    # attention_weights * values
    context_vector = attention_weights * values
    context_vector = tf.reduce_sum(context_vector,axis=1)
    
    # return
    return context_vector, attention_weights



In [0]:
# Decoder
class Decoder(tf.keras.Model):
  def __init__(self,vocab_size,embedding_dims,dec_units,batch_size):
    super(Decoder,self).__init__()
    self.dec_units = dec_units
    self.batch_size = batch_size
    self.embedding = tf.keras.layers.Embedding(vocab_size,embedding_dims)
    self.gru = tf.keras.layers.GRU(dec_units,
                                   return_state=True,
                                   return_sequences = True,
                                   recurrent_initializer='glorot_uniform')
    self.fc = tf.keras.layers.Dense(vocab_size)
    # Attention
    self.attention = AttentionLayer(self.dec_units)

  def call(self,x,hidden,enc_output):
    # pass through attention
    context_vector, attention_weights = self.attention(hidden,enc_output)

    # pass x through embedding
    x = self.embedding(x)

    # concat context_vector with x
    x = tf.concat([tf.expand_dims(context_vector,axis=1),x],axis=-1)
    # Passing concatenated x to gru
    output, state = self.gru(x)

    # reshape output
    output = tf.reshape(output, (-1, output.shape[2]))
    # output shape == (batch_sze, vocab)

    x = self.fc(output)

    return x, state, attention_weights

    

In [0]:
# Optimizer and Loss function
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction='none'
)
def loss_function(real,pred):
  mask = tf.math.logical_not(tf.math.equal(real,0))

  loss_ = loss_object(real,pred)

  mask = tf.cast(mask,dtype=loss_.dtype)

  loss_ *= mask
  return tf.reduce_mean(loss_)

In [0]:
####################
# Split data
val_data_size = -20
inp_tensor_train = inp_tensor[:val_data_size]
targ_tensor_train = targ_tensor[:val_data_size]
# VARIABLES
##################
BUFFER_SIZE = len(inp_tensor_train)
BATCH_SIZE = 64
steps_per_epochs = BUFFER_SIZE//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(inp_word_index.word_index)+1
vocab_tar_size = len(targ_word_index.word_index)+1


# Create dataset
dataset = tf.data.Dataset.from_tensor_slices((inp_tensor_train,targ_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE,drop_remainder=True)



In [24]:
for sample_inp, sample_targ in dataset.take(1):
  print(sample_inp, sample_targ)
# Initialize the decoder encoder
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(sample_inp, sample_hidden)

attention_layer = AttentionLayer(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)


tf.Tensor(
[[   1    8   17 ...    0    0    0]
 [   1   12  454 ...    0    0    0]
 [   1   12  196 ...    0    0    0]
 ...
 [   1    9  822 ...    0    0    0]
 [   1    6  144 ...    0    0    0]
 [   1   47 9148 ...    0    0    0]], shape=(64, 53), dtype=int32) tf.Tensor(
[[  1   4  30 ...   0   0   0]
 [  1  55 123 ...   0   0   0]
 [  1  21   7 ...   0   0   0]
 ...
 [  1   8 709 ...   0   0   0]
 [  1   5 152 ...   0   0   0]
 [  1   4  39 ...   0   0   0]], shape=(64, 51), dtype=int32)


In [0]:
# Checkpoints
checkpoints_dir = './training_checkpoints'
checkpoints_prefix = os.path.join(checkpoints_dir, 'ckpts')
checkpoint = tf.train.Checkpoint(
    optimizer=optimizer,
    encoder=encoder,
    decoder=decoder
)

In [0]:
###############################
# TRAIN
@tf.function
def train_step(inp,targ,enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_output, enc_hidden = encoder(inp,enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([targ_word_index.word_index['<start>']]*BATCH_SIZE, axis=1)

    # Use Teacher Forcing
    for t in range(1,targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_hidden,_ = decoder(dec_input,dec_hidden,enc_output)
      loss += loss_function(targ[:,t], predictions)
      # using teacher forcing
      dec_input = tf.expand_dims(targ[:,t],axis=1)

    batch_loss = (loss/int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))
    return batch_loss
    

In [27]:
EPOCHS = 10
for epoch in range(EPOCHS):
  start = time.time()
  enc_hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for(batch,(inp,targ)) in enumerate(dataset.take(steps_per_epochs)):
    batch_loss = train_step(inp,targ,enc_hidden)
    total_loss += batch_loss

    if(batch % 100 == 0):
      print('Epoch {} Batch {} Loss {:.4f}'.format(
          epoch +1,
          batch,
          batch_loss.numpy()
      ))
      # saving (checkpoint) the model every 2 epochs
      if(epoch+1)%2 == 0:
        checkpoint.save(file_prefix=checkpoints_prefix)
      
      print('Epoch {} Loss {:.4f}'.format(epoch+1,
                                          total_loss / steps_per_epochs))
      print('Time taken for 1 epoch {} sec\n'.format(
          time.time() - start
      ))
      

Epoch 1 Batch 0 Loss 1.6650
Epoch 1 Loss 0.0009
Time taken for 1 epoch 46.687591552734375 sec

Epoch 1 Batch 100 Loss 0.8423
Epoch 1 Loss 0.0524
Time taken for 1 epoch 89.83947205543518 sec

Epoch 1 Batch 200 Loss 0.7311
Epoch 1 Loss 0.0971
Time taken for 1 epoch 133.11894607543945 sec

Epoch 1 Batch 300 Loss 0.7558
Epoch 1 Loss 0.1387
Time taken for 1 epoch 176.212660074234 sec

Epoch 1 Batch 400 Loss 0.7372
Epoch 1 Loss 0.1778
Time taken for 1 epoch 219.3749725818634 sec

Epoch 1 Batch 500 Loss 0.6097
Epoch 1 Loss 0.2146
Time taken for 1 epoch 262.59108424186707 sec

Epoch 1 Batch 600 Loss 0.6466
Epoch 1 Loss 0.2488
Time taken for 1 epoch 305.8367009162903 sec

Epoch 1 Batch 700 Loss 0.5608
Epoch 1 Loss 0.2813
Time taken for 1 epoch 348.96155834198 sec

Epoch 1 Batch 800 Loss 0.5991
Epoch 1 Loss 0.3117
Time taken for 1 epoch 392.02862524986267 sec

Epoch 1 Batch 900 Loss 0.5693
Epoch 1 Loss 0.3405
Time taken for 1 epoch 435.3520772457123 sec

Epoch 1 Batch 1000 Loss 0.4739
Epoch 1 Lo

In [33]:
# RESTORE THE LAST CHECKPOINT
os.listdir(checkpoints_dir)
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir=checkpoints_dir))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f8fb5279e80>

In [0]:
# PREDICTION
def evaluate(sentence):
  sentence = preprocess_sentence(sentence)
  inputs = [inp_word_index.word_index[i] for i in sentence.split(' ')]
  inputs = tf.keras.preprocessing.sequence.pad_sequences(
      [inputs],
      padding='post',
      maxlen=input_tensor_max_len)
  inputs = tf.convert_to_tensor(inputs)
  
  # results
  result = ''

  hidden = [tf.zeros((1,units))]
  enc_out, enc_hidden = encoder(inputs,hidden)

  dec_hidden = enc_hidden
  dec_input = tf.expand_dims([targ_word_index.word_index['<start>']],0)

  for t in range(targ_tensor_max_len):
    predictions,dec_hidden, attention_weights = decoder(
        dec_input,
        dec_hidden,
        enc_out
    )
    predicted_id = tf.argmax(predictions[0]).numpy()
    print('Predicted ID:',predicted_id)
    result += targ_word_index.index_word[predicted_id]+' '
    if(targ_word_index.index_word[predicted_id]=='<end>'):
      return result, sentence
    
    # THe predicted ID is fed back into the modedl
    dec_input = tf.expand_dims([predicted_id],0)
  return result, sentence

In [0]:
def translate(sentence):
  result, sentence = evaluate(sentence)

  print('Input: ',sentence)
  print('Predicted Translation: ', result)
  

In [61]:
# TRANSLATE
translate(u'hace mucho frio aqui.')


Predicted ID: 14
Predicted ID: 15
Predicted ID: 59
Predicted ID: 273
Predicted ID: 62
Predicted ID: 3
Predicted ID: 2
Input:  <start> hace mucho frio aqui . <end>
Predicted Translation:  it s very cold here . <end> 


In [62]:
translate(u'esta es mi vida.')


Predicted ID: 23
Predicted ID: 11
Predicted ID: 25
Predicted ID: 203
Predicted ID: 3
Predicted ID: 2
Input:  <start> esta es mi vida . <end>
Predicted Translation:  this is my life . <end> 


In [63]:
translate(u'¿todavia estan en casa?')


Predicted ID: 31
Predicted ID: 7
Predicted ID: 160
Predicted ID: 44
Predicted ID: 115
Predicted ID: 10
Predicted ID: 2
Input:  <start> ¿ todavia estan en casa ? <end>
Predicted Translation:  are you still at home ? <end> 


In [64]:
translate('¿todavia estan en casa?')


Predicted ID: 31
Predicted ID: 7
Predicted ID: 160
Predicted ID: 44
Predicted ID: 115
Predicted ID: 10
Predicted ID: 2
Input:  <start> ¿ todavia estan en casa ? <end>
Predicted Translation:  are you still at home ? <end> 


In [77]:
translate(u'mi amigo')


Predicted ID: 25
Predicted ID: 229
Predicted ID: 3
Predicted ID: 2
Input:  <start> mi amigo <end>
Predicted Translation:  my friend . <end> 


In [58]:
targ_word_index.index_word[14]

'it'