# Import APIs

In [1]:
import collections
import logging
import os
import pathlib
import re
import string
import sys
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

# Preparing Datasets

In [2]:
# Fetch the TED Portuguese-to-English corpus; as_supervised=True makes each
# element a (portuguese, english) pair, and with_info=True also returns the
# dataset metadata.
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)

# Divide into the training and validation splits
train_examples = examples['train']
val_examples = examples['validation']



# Check Datasets

In [3]:
# Preview a single batch of three sentence pairs.
# NOTE: pt_examples / en_examples stay bound after the loop finishes, and
# en_examples is reused further down as the tokenizer's sample input.
for pt_examples, en_examples in train_examples.batch(3).take(1):
    # Portuguese sentences: each element is a UTF-8 byte string
    for pt in pt_examples.numpy():
        print(str(pt, encoding='utf-8'))

    # Blank line between the two languages
    print()

    # Matching English sentences
    for en in en_examples.numpy():
        print(str(en, encoding='utf-8'))

os astrónomos acreditam que cada estrela da galáxia tem um planeta , e especulam que até um quinto deles tem um planeta do tipo da terra que poderá ter vida , mas ainda não vimos nenhum deles .
o problema é que nunca vivi lá um único dia .
agora aqui temos imagens sendo extraídas em tempo real diretamente do feed ,

astronomers now believe that every star in the galaxy has a planet , and they speculate that up to one fifth of them have an earth-like planet that might be able to harbor life , but we have n't seen any of them .
except , i 've never lived one day of my life there .
now here are live images being pulled straight from the feed .


# Text Tokenization & Detokenization

In [4]:
# Download the pretrained tokenizer archive into the current directory and
# unpack it there (cache_dir='.' with an empty cache_subdir).
model_name = "ted_hrlr_translate_pt_en_converter"

# Kept as the last expression so the cell displays the downloaded file path.
tf.keras.utils.get_file(
    fname=f"{model_name}.zip",
    origin=f"https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip",
    extract=True,
    cache_dir='.',
    cache_subdir='',
)

'./ted_hrlr_translate_pt_en_converter.zip'

In [5]:
# Restore the SavedModel from the directory named after the model; the
# result exposes one tokenizer per language (e.g. `tokenizers.en`).
tokenizers = tf.saved_model.load(export_dir=model_name)

# Check the Tokenizer's Public Methods and Attributes

In [6]:
# List the English tokenizer's attributes, skipping underscore-prefixed names
list(filter(lambda name: not name.startswith('_'), dir(tokenizers.en)))

['detokenize',
 'get_reserved_tokens',
 'get_vocab_path',
 'get_vocab_size',
 'lookup',
 'tokenize',
 'tokenizer',
 'vocab']

# Tokenize English Example

In [7]:
# Convert the English sample batch into token ids.
# NOTE(review): `en_examples` is the variable left bound by the preview loop
# earlier in the notebook, so that cell must have been run first.
encoded = tokenizers.en.tokenize(en_examples)





In [8]:
# Print one list of token ids per sentence; rows differ in length because
# each sentence yields a different number of tokens.
for row in encoded.to_list():
    print(row)

[2, 3946, 110, 321, 75, 198, 1452, 77, 71, 2662, 144, 37, 580, 13, 72, 83, 5848, 5939, 1970, 75, 130, 73, 103, 3339, 74, 124, 89, 111, 462, 14, 106, 580, 75, 242, 97, 264, 73, 3487, 183, 13, 87, 78, 89, 50, 9, 56, 464, 225, 74, 124, 15, 3]
[2, 1533, 13, 45, 9, 142, 243, 752, 103, 204, 74, 99, 183, 96, 15, 3]
[2, 110, 137, 86, 301, 722, 222, 2404, 1473, 109, 71, 1559, 15, 3]


# Detokenize English Example

In [9]:
# Map the token ids back to text to check the tokenize -> detokenize round trip.
round_trip = tokenizers.en.detokenize(encoded)





In [10]:
# Show each reconstructed sentence; the elements are UTF-8 byte strings
for line in round_trip.numpy():
    print(str(line, encoding='utf-8'))

astronomers now believe that every star in the galaxy has a planet , and they speculate that up to one fifth of them have an earth - like planet that might be able to harbor life , but we have n ' t seen any of them .
except , i ' ve never lived one day of my life there .
now here are live images being pulled straight from the feed .


**Look Up Token Text from Token IDs**

In [11]:
# Translate each token id into its vocabulary string, so the individual
# (sub)word pieces of every sentence can be inspected.
tokens = tokenizers.en.lookup(encoded)





In [12]:
# Display the looked-up token strings (bare expression -> rich cell output).
tokens

<tf.RaggedTensor [[b'[START]', b'astronomers', b'now', b'believe', b'that', b'every', b'star', b'in', b'the', b'galaxy', b'has', b'a', b'planet', b',', b'and', b'they', b'sp', b'##ec', b'##ulate', b'that', b'up', b'to', b'one', b'fifth', b'of', b'them', b'have', b'an', b'earth', b'-', b'like', b'planet', b'that', b'might', b'be', b'able', b'to', b'harbor', b'life', b',', b'but', b'we', b'have', b'n', b"'", b't', b'seen', b'any', b'of', b'them', b'.', b'[END]'], [b'[START]', b'except', b',', b'i', b"'", b've', b'never', b'lived', b'one', b'day', b'of', b'my', b'life', b'there', b'.', b'[END]'], [b'[START]', b'now', b'here', b'are', b'live', b'images', b'being', b'pulled', b'straight', b'from', b'the', b'feed', b'.', b'[END]']]>