In [1]:
!pip install kaggle
from google.colab import files
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [2]:
# Copy the uploaded kaggle.json into ~/.kaggle (created if missing).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle
# Mode 600: only the owner may read or write the credentials file.
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# Download the English-Portuguese sentence-pair archive with the Kaggle CLI.
# The disabled command below points at a different dataset and is not used
# anywhere else in this notebook.
#!kaggle datasets download -d fedesoriano/wind-speed-prediction-dataset
!kaggle datasets download -d nageshsingh/englishportuguese-translation

Downloading englishportuguese-translation.zip to /content
  0% 0.00/5.41M [00:00<?, ?B/s]
100% 5.41M/5.41M [00:00<00:00, 109MB/s]


In [4]:
!unzip englishportuguese-translation.zip
!ls

Archive:  englishportuguese-translation.zip
  inflating: por.txt                 
englishportuguese-translation.zip  kaggle.json	por.txt  sample_data


**Attention**
> - Attention is an interface between the encoder and the decoder that gives the decoder access to information from every encoder hidden state.

> - With this setup, the model can selectively focus on the useful parts of the input sequence and thereby learn the alignment between input and output.

> - This should help the model cope with long input sentences.

In [5]:
import pandas as pd
import tensorflow as tf

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import time
import string

import os

In [6]:
# Location of the extracted sentence-pair file (the archive was downloaded to
# and unzipped in /content).
file_path = '/content/por.txt'

In [7]:
# Read the whole corpus and split it into one record per line. The context
# manager closes the file handle, which a bare open(...).read() leaves open.
with open(file_path, encoding='UTF-8') as corpus_file:
  lines = corpus_file.read().strip().split('\n')
lines[5000:5010]

['Will it rain?\tSerá que chove?\tCC-BY 2.0 (France) Attribution: tatoeba.org #8918600 (CK) & #8930552 (JGEN)',
 'Wish me luck.\tDeseje-me sorte.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2254917 (CK) & #872788 (alexmarcelo)',
 "Won't you go?\tVocê não vai?\tCC-BY 2.0 (France) Attribution: tatoeba.org #241051 (CK) & #6212788 (bill)",
 'Write in ink.\tEscreva à tinta.\tCC-BY 2.0 (France) Attribution: tatoeba.org #3258764 (CM) & #7351595 (alexmarcelo)',
 'Write in ink.\tEscreva a tinta.\tCC-BY 2.0 (France) Attribution: tatoeba.org #3258764 (CM) & #7351606 (alexmarcelo)',
 'Write to Tom.\tEscreva para o Tom.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2240357 (CK) & #5985551 (Ricardo14)',
 'Years passed.\tPassaram os anos.\tCC-BY 2.0 (France) Attribution: tatoeba.org #282197 (CK) & #977841 (alexmarcelo)',
 'Years passed.\tAnos se passaram.\tCC-BY 2.0 (France) Attribution: tatoeba.org #282197 (CK) & #2324530 (Matheus)',
 'You amuse me.\tVocê me diverte.\tCC-BY 2.0 (France) Attributio

In [8]:
# Report how many records (one per line of the file) were read.
print("total number of records: ", len(lines))

total number of records:  168903


In [9]:
# Characters the sentence cleaners drop: every ASCII punctuation mark.
exclude = {symbol for symbol in string.punctuation}
# Table for str.translate that deletes every decimal digit (code point -> None).
remove_digits = dict.fromkeys(map(ord, string.digits))

# Function to preprocess English sentence

In [10]:
def preprocess_eng_sentence(sent):
  '''
  Clean an English sentence and wrap it in <start>/<end> markers.

  The text is lower-cased, apostrophes and every character in the
  module-level `exclude` set are dropped, digits are deleted through the
  module-level `remove_digits` table, the ends are trimmed and runs of
  spaces are collapsed to one.

  Args:
    sent: raw English sentence (str).

  Returns:
    '<start> ' + cleaned text + ' <end>', or just '<start> <end>' when
    nothing is left after cleaning.
  '''
  sent = sent.lower()

  # Drop apostrophes so a contraction stays one token ("won't" -> "wont").
  sent = sent.replace("'", '')

  sent = ''.join(ch for ch in sent if ch not in exclude)
  sent = sent.translate(remove_digits)

  # Trim both ends, then squeeze repeated spaces.
  sent = re.sub(" +", " ", sent.strip())

  # A sentence made only of punctuation/digits cleans down to ''. Gluing the
  # markers around it would leave a double space, which split(' ') later
  # turns into an empty-string vocabulary token.
  if not sent:
    return '<start> <end>'

  return '<start> ' + sent + ' <end>'


# Function to preprocess Portuguese sentence

In [11]:
def preprocess_port_sentence(sent):
  '''
  Clean a Portuguese sentence and wrap it in <start>/<end> markers.

  Apostrophes and every character in the module-level `exclude` set are
  dropped, the ends are trimmed and runs of spaces are collapsed to one.
  Unlike the English cleaner, this one keeps the original casing and keeps
  digits.

  Args:
    sent: raw Portuguese sentence (str).

  Returns:
    '<start> ' + cleaned text + ' <end>', or just '<start> <end>' when
    nothing is left after cleaning.
  '''
  # Drop apostrophes before the general punctuation filter.
  sent = sent.replace("'", '')

  # Note: removing '-' joins hyphenated forms ("Deseje-me" -> "Desejeme").
  sent = ''.join(ch for ch in sent if ch not in exclude)

  # Trim both ends, then squeeze repeated spaces.
  sent = re.sub(" +", " ", sent.strip())

  # A sentence made only of punctuation cleans down to ''. Gluing the markers
  # around it would leave a double space, which split(' ') later turns into
  # an empty-string vocabulary token.
  if not sent:
    return '<start> <end>'

  return '<start> ' + sent + ' <end>'

## Generate pairs of cleaned English and Portuguese

In [12]:
# Build [english, portuguese] pairs of cleaned sentences. Every record is
# tab-separated: field 0 is the English text, field 1 the Portuguese text;
# any further fields are ignored.
sent_pairs = []
for line in lines:
  columns = line.rstrip().split('\t')
  eng, port = columns[0], columns[1]
  sent_pairs.append([preprocess_eng_sentence(eng),
                     preprocess_port_sentence(port)])
sent_pairs[5000:5010]

[['<start> will it rain <end>', '<start> Será que chove <end>'],
 ['<start> wish me luck <end>', '<start> Desejeme sorte <end>'],
 ['<start> wont you go <end>', '<start> Você não vai <end>'],
 ['<start> write in ink <end>', '<start> Escreva à tinta <end>'],
 ['<start> write in ink <end>', '<start> Escreva a tinta <end>'],
 ['<start> write to tom <end>', '<start> Escreva para o Tom <end>'],
 ['<start> years passed <end>', '<start> Passaram os anos <end>'],
 ['<start> years passed <end>', '<start> Anos se passaram <end>'],
 ['<start> you amuse me <end>', '<start> Você me diverte <end>'],
 ['<start> you are late <end>', '<start> Você está atrasado <end>']]

# Create a class to map every word to an index and vice versa for any given vocabulary

In [13]:
class LanguageIndex():
  '''
  Two-way mapping between the words of a corpus and integer ids.

  Attributes:
    lang: the iterable of phrases given to the constructor.
    vocab: sorted list of the distinct space-separated tokens.
    word2idx: token -> id; '<pad>' is 0, corpus tokens start at 1.
    idx2word: id -> token, the inverse of word2idx.
  '''
  def __init__(self, lang):
    self.lang = lang
    self.word2idx = {}
    self.idx2word = {}
    self.vocab = set()

    self.create_index()

  def create_index(self):
    '''Fill vocab, word2idx and idx2word from the phrases in self.lang.'''
    # Collect every distinct token; phrases are split on single spaces.
    for sentence in self.lang:
      for token in sentence.split(' '):
        self.vocab.add(token)

    self.vocab = sorted(self.vocab)

    # Id 0 is reserved for padding, so corpus tokens are numbered from 1.
    self.word2idx['<pad>'] = 0
    self.word2idx.update(
        (token, position) for position, token in enumerate(self.vocab, start=1))

    self.idx2word.update(
        {position: token for token, position in self.word2idx.items()})

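> **LanguageIndex Class**
> - lang : the iterable of phrases passed to the constructor
> - vocab : the distinct words found in the phrases of `self.lang` (collected in a set, then stored as a sorted list)
> - word2idx : dictionary mapping each word to its index (`<pad>` is index 0)
> - idx2word : the reverse of word2idx, mapping each index back to its word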


In [14]:
def max_length(tensor):
  '''Return the length of the longest element of `tensor`.'''
  return max(map(len, tensor))

- The parameter is named `tensor`, but the function works on any collection of sized items (here, a list of lists of word indices).

# Tokenization and Padding

In [15]:
def load_dataset(pairs, num_examples):
  '''
  Turn cleaned sentence pairs into padded integer arrays plus vocabularies.

  Args:
    pairs: iterable of [english, portuguese] cleaned sentences.
    num_examples: use only the first `num_examples` pairs; None uses all.

  Returns:
    (input_tensor, target_tensor, inp_lang, targ_lang,
     max_length_inp, max_length_tar) where the tensors are the id sequences
    padded with 0 at the end up to the longest sentence on each side, and
    inp_lang / targ_lang are the LanguageIndex objects used for the lookup.
  '''
  # Honour the requested size; this argument used to be accepted but ignored.
  # list() also lets `pairs` be any iterable, since it is traversed four times.
  pairs = list(pairs)
  if num_examples is not None:
    pairs = pairs[:num_examples]

  inp_lang = LanguageIndex(en for en, pt in pairs)
  targ_lang = LanguageIndex(pt for en, pt in pairs)

  # Replace every space-separated token with its vocabulary id.
  input_tensor = [[inp_lang.word2idx[s] for s in en.split(' ')] for en, pt in pairs]
  target_tensor = [[targ_lang.word2idx[s] for s in pt.split(' ')] for en, pt in pairs]

  # Longest sentence on each side decides the padded width.
  max_length_inp, max_length_tar = max_length(input_tensor), max_length(target_tensor)

  # padding='post' appends the zeros after the real tokens.
  input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor,
                                                               maxlen=max_length_inp,
                                                               padding='post')
  target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor,
                                                                maxlen=max_length_tar,
                                                                padding='post')

  return input_tensor, target_tensor, inp_lang, targ_lang, max_length_inp, max_length_tar

In [16]:
# Encode every cleaned pair; len(lines) is passed as the requested example count.
input_tensor, target_tensor, inp_lang, targ_lang,max_length_inp,max_length_targ = load_dataset(sent_pairs,len(lines))

In [17]:
# Hold out 10% of the pairs for validation; random_state=101 makes the split repeatable.
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.1, random_state=101)

# Sizes of the four resulting arrays.
len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val)

(152012, 152012, 16891, 16891)

In [18]:
# Shuffle buffer as large as the training set.
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE=64
# Number of full batches in one pass over the training set.
N_BATCH = BUFFER_SIZE//BATCH_SIZE


# Width of the word-embedding vectors in both Encoder and Decoder.
embedding_dim = 256

# Number of GRU units, passed to both Encoder and Decoder.
units=1024
# Vocabulary sizes; each word2idx includes the '<pad>' entry at index 0.
vocab_inp_size = len(inp_lang.word2idx)
vocab_tar_size = len(targ_lang.word2idx)


# Training pipeline: shuffle the (input, target) rows, then group them into
# batches of BATCH_SIZE, dropping the final incomplete batch.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train,target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

## Create GRU units

In [19]:
def gru(units):
  '''Build a Keras GRU layer with `units` units that returns both the output sequence and the final state.'''
  layer_options = dict(
      return_sequences=True,
      return_state=True,
      recurrent_activation='sigmoid',
      recurrent_initializer='glorot_uniform',
  )
  return tf.keras.layers.GRU(units, **layer_options)

In [20]:
class Encoder(tf.keras.Model):
  '''
  Embeds source token ids and runs them through a GRU.

  Args:
    vocab_size: number of entries in the embedding table.
    embedding_dim: width of each embedding vector.
    enc_units: number of GRU units.
    batch_sz: batch size used by initialize_hidden_state.
  '''
  def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
    super().__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = gru(enc_units)

  def call(self, x, hidden):
    '''Return (output sequence, final state) of the GRU started from `hidden`.'''
    embedded = self.embedding(x)
    sequence_output, final_state = self.gru(embedded, initial_state=hidden)
    return sequence_output, final_state

  def initialize_hidden_state(self):
    '''Return an all-zero state of shape (batch_sz, enc_units).'''
    state_shape = (self.batch_sz, self.enc_units)
    return tf.zeros(state_shape)

In [32]:
class Decoder(tf.keras.Model):
  '''
  Single-step GRU decoder with additive attention over the encoder outputs.

  Args:
    vocab_size: target vocabulary size (embedding rows and output logits).
    embedding_dim: width of each embedding vector.
    dec_units: number of GRU units and width of the attention projections.
    batch_sz: batch size used by initialize_hidden_state.
  '''
  def __init__(self, vocab_size, embedding_dim, dec_units,batch_sz):
    super(Decoder, self).__init__()
    self.batch_sz = batch_sz
    self.dec_units = dec_units
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = gru(self.dec_units)
    self.fc = tf.keras.layers.Dense(vocab_size)

    # Attention layers: W1 projects every encoder output and W2 projects the
    # current decoder state into the same dec_units-wide space.
    self.W1 = tf.keras.layers.Dense(self.dec_units)
    self.W2 = tf.keras.layers.Dense(self.dec_units)

    # V reduces tanh(W1(...) + W2(...)) to a single score per source position.
    self.V = tf.keras.layers.Dense(1)

  def call(self, x, hidden, enc_output):
    '''
    Decode one target step.

    Args:
      x: ids of the previous target token, one per batch row
         (expected shape (batch, 1) - as built by the callers in this notebook).
      hidden: previous decoder state (expected shape (batch, dec_units)).
      enc_output: encoder output sequence
         (expected shape (batch, source_len, enc_units)).

    Returns:
      (logits over the target vocabulary, new decoder state,
       attention weights normalised over axis 1 of enc_output).
    '''
    # Add a length-1 axis so the state broadcasts against every source position.
    hidden_with_time_axis = tf.expand_dims(hidden,1)

    score = self.V(tf.nn.tanh(self.W1(enc_output)+self.W2(hidden_with_time_axis)))
    attention_weights = tf.nn.softmax(score,axis=1)

    # Weighted sum of the encoder outputs along the source axis.
    context_vector = tf.reduce_sum(attention_weights*enc_output,axis=1)

    x=self.embedding(x)

    # Feed the GRU the context vector concatenated with the token embedding.
    x=tf.concat([tf.expand_dims(context_vector,1),x],axis=-1)

    # Start the GRU from the previous decoder state. Without initial_state the
    # layer starts from zeros on every call, so the state returned here would
    # never depend on the state handed in by the caller.
    output,state=self.gru(x, initial_state=hidden)

    # Collapse the length-1 time axis: (batch, 1, units) -> (batch, units).
    output=tf.reshape(output,(-1,output.shape[2]))

    # Project to one logit per vocabulary entry.
    x=self.fc(output)

    return x, state, attention_weights

  def initialize_hidden_state(self):
    '''Return an all-zero state of shape (batch_sz, dec_units).'''
    return tf.zeros((self.batch_sz, self.dec_units))


In [33]:
# One encoder over the input vocabulary and one decoder over the target
# vocabulary, sharing the same embedding width, unit count and batch size.
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

# Define the optimizer and the loss function

In [34]:
optimizer = tf.optimizers.Adam()

def loss_function(real, pred):
  '''
  Cross-entropy for one decoding step with padded positions zeroed out.

  Args:
    real: true token ids for this step, one per batch row (0 means padding).
    pred: logits returned by the decoder for this step.

  Returns:
    Scalar mean of the per-row losses. Padded rows contribute 0 but are still
    counted in the denominator.
  '''
  per_row = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred)
  # Build the mask with TensorFlow ops and an explicit cast rather than NumPy,
  # so it does not depend on eager execution or implicit dtype conversion.
  mask = tf.cast(tf.not_equal(real, 0), per_row.dtype)
  return tf.reduce_mean(per_row * mask)

In [35]:
# Checkpoints are written as ./training_checkpoints/ckpt-* and track the
# optimizer together with both models.
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir,"ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

# Train the Model

In [None]:
EPOCHS=10

for epoch in range(EPOCHS):
  start = time.time()

  # The encoder starts every batch from the same all-zero state.
  hidden = encoder.initialize_hidden_state()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(dataset):
    loss = 0
    with tf.GradientTape() as tape:
      enc_output, enc_hidden = encoder(inp, hidden)
      # The decoder starts from the encoder's final state and a column of <start> ids.
      dec_hidden = enc_hidden
      dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * BATCH_SIZE, 1)

      # Teacher forcing: predict position t, then feed the true token at t
      # (not the prediction) as the next decoder input.
      for t in range(1, targ.shape[1]):
        predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
        loss += loss_function(targ[:, t], predictions)

        dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported loss is averaged over the target length; gradients use the summed loss.
    batch_loss = loss / int(targ.shape[1])
    total_loss += batch_loss

    # Only trainable weights receive gradients.
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))

  checkpoint.save(file_prefix=checkpoint_prefix)
  # The format string has two fields; passing only `epoch` raised IndexError
  # at the end of every epoch. Report the 1-based epoch and the mean batch loss.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1, float(total_loss / N_BATCH)))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 1.9582
Epoch 1 Batch 100 Loss 1.1484
Epoch 1 Batch 200 Loss 1.1518
Epoch 1 Batch 300 Loss 1.0234
Epoch 1 Batch 400 Loss 1.0192
Epoch 1 Batch 500 Loss 0.9424
Epoch 1 Batch 600 Loss 0.8888
Epoch 1 Batch 700 Loss 0.8389
Epoch 1 Batch 800 Loss 0.7891


# Restoring the latest checkpoint


In [None]:
# Restore the optimizer, encoder and decoder from the newest checkpoint in checkpoint_dir.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

# Inference Setup and testing

In [None]:
def evaluate(inputs, encoder, decoder, inp_lang, targ_lang, max_length_inp,max_length_targ):
  '''
  Greedily translate one encoded sentence and record the attention weights.

  Args:
    inputs: array holding one padded row of source token ids (shape (1, n)).
    encoder, decoder: the models to run.
    inp_lang, targ_lang: LanguageIndex objects for source and target.
    max_length_inp: number of columns of the attention matrix; the row of
      attention weights produced per step must have this length.
    max_length_targ: maximum number of decoding steps.

  Returns:
    (result, sentence, attention_plot): the predicted words and the source
    words, each as a string with a trailing space after every word, and a
    (max_length_targ, max_length_inp) array whose row t holds the attention
    weights of step t (rows after the last step stay zero).
  '''
  attention_plot=np.zeros((max_length_targ,max_length_inp))

  # Rebuild the source text from its ids, stopping at the first padding id.
  sentence=''
  for i in inputs[0]:
    if i==0:
      break
    sentence=sentence+inp_lang.idx2word[i] + ' '

  # Decoding runs once, after the source text is complete (it used to be
  # nested inside the loop above and returned during its first iteration).
  inputs = tf.convert_to_tensor(inputs)
  result=''
  # Zero initial state for a batch of one, sized from the encoder itself.
  hidden=[tf.zeros((1,encoder.enc_units))]
  enc_out, enc_hidden = encoder(inputs,hidden)

  dec_hidden=enc_hidden
  dec_input=tf.expand_dims([targ_lang.word2idx['<start>']],0)

  for t in range(max_length_targ):
    predictions, dec_hidden, attention_weights = decoder(dec_input,dec_hidden,enc_out)

    # Store this step's attention weights for plotting later.
    attention_weights = tf.reshape(attention_weights,(-1,))
    attention_plot[t]=attention_weights.numpy()

    # Greedy choice: take the highest-scoring word.
    predicted_id=tf.argmax(predictions[0]).numpy()

    result+=targ_lang.idx2word[predicted_id] + ' '

    if targ_lang.idx2word[predicted_id] == '<end>':
      return result, sentence, attention_plot

    # The predicted id is fed back as the next decoder input.
    dec_input=tf.expand_dims([predicted_id],0)
  return result, sentence, attention_plot
    

# Function to predict

In [None]:
def predict_random_val_sentence():
  '''
  Translate one random validation sentence, print it next to the reference
  translation and draw the attention weights as a heat map.

  Reads the module-level validation arrays, models, vocabularies and maximum
  lengths. Returns nothing.
  '''
  actual_sent= ''
  k = np.random.randint(len(input_tensor_val))
  random_input = input_tensor_val[k]
  random_output = target_tensor_val[k]
  # evaluate() expects a batch of one row.
  random_input = np.expand_dims(random_input,0)
  result, sentence, attention_plot = evaluate(random_input, encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)
  # Slice off '<start> ' (8 characters) and '<end> ' (6 characters).
  print('Input: {}'.format(sentence[8:-6]))
  print('Predicted translation: {}'.format(result[:-6]))

  # Rebuild the reference translation from its ids, stopping at padding.
  for i in random_output:
    if i==0:
      break
    actual_sent = actual_sent + targ_lang.idx2word[i] + ' '
  actual_sent = actual_sent[8:-7]
  print('Actual translation: {}'.format(actual_sent))

  # Keep only the rows/columns that match the labels built below: the rows
  # drop the final '<end>' step, the columns drop the leading '<start>'.
  attention_plot = attention_plot[:len(result.split(' '))-2, 1:len(sentence.split(' '))-1]
  sentence,result = sentence.split(' '), result.split(' ')
  sentence = sentence[1:-1]
  result = result[:-2]

  # Heat map drawn with matplotlib, which the notebook already imports as
  # plt; the plotly names used before (go, iplot) were never imported.
  if attention_plot.size:
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.matshow(attention_plot, cmap='Greens')
    ax.set_xticks(range(len(sentence)))
    ax.set_xticklabels(sentence, rotation=90)
    ax.set_yticks(range(len(result)))
    ax.set_yticklabels(result)
    ax.set_xlabel('Input')
    ax.set_ylabel('Predicted translation')
    plt.show()


In [None]:
# Translate one randomly chosen validation sentence and show the result.
predict_random_val_sentence()