In [8]:
import re

In [1]:
with open('input_txt.txt', 'r') as input:
    input_content = input.read()
    
print(input_content)

The TensorFlow tutorials are written as Jupyter notebooks and run directly in Google Colabâ€”a hosted notebook environment that requires no setup. At the top of each tutorial, you'll see a Run in Google Colab button. Click the button to open the notebook and run the code yourself.

For beginners
The best place to start is with the user-friendly Keras sequential API. Build models by plugging together building blocks. After these tutorials, read the Keras guide.
Beginner quickstart
This "Hello, World!" notebook shows the Keras Sequential API and model.fit.
Keras basics
This notebook collection demonstrates basic machine learning tasks using Keras.
Load data
These tutorials use tf.data to load various data formats and build input pipelines.
For experts
The Keras functional and subclassing APIs provide a define-by-run interface for customization and advanced research. Build your model, then write the forward and backward pass. Create custom layers, activations, and training loops.
Advanced

In [22]:
# create tokens
tokens = re.split(r'([,.:;?_!"()\']|--|\s)', input_content)
# removing all leading and trailing white spaces
tokens = [item.strip() for item in tokens if item.strip()]

print(len(tokens), tokens)

247 ['The', 'TensorFlow', 'tutorials', 'are', 'written', 'as', 'Jupyter', 'notebooks', 'and', 'run', 'directly', 'in', 'Google', 'Colabâ€”a', 'hosted', 'notebook', 'environment', 'that', 'requires', 'no', 'setup', '.', 'At', 'the', 'top', 'of', 'each', 'tutorial', ',', 'you', "'", 'll', 'see', 'a', 'Run', 'in', 'Google', 'Colab', 'button', '.', 'Click', 'the', 'button', 'to', 'open', 'the', 'notebook', 'and', 'run', 'the', 'code', 'yourself', '.', 'For', 'beginners', 'The', 'best', 'place', 'to', 'start', 'is', 'with', 'the', 'user-friendly', 'Keras', 'sequential', 'API', '.', 'Build', 'models', 'by', 'plugging', 'together', 'building', 'blocks', '.', 'After', 'these', 'tutorials', ',', 'read', 'the', 'Keras', 'guide', '.', 'Beginner', 'quickstart', 'This', '"', 'Hello', ',', 'World', '!', '"', 'notebook', 'shows', 'the', 'Keras', 'Sequential', 'API', 'and', 'model', '.', 'fit', '.', 'Keras', 'basics', 'This', 'notebook', 'collection', 'demonstrates', 'basic', 'machine', 'learning', 't

In [27]:
# creating token IDs

# 1. sort the tokens in alphabetical order without duplicates
tokens = sorted(set(tokens))
print(len(tokens), tokens)

# 2. add IDs to tokens
vocab = {token:id for id, token in enumerate(tokens)}
print(len(vocab), vocab)

140 ['!', '"', "'", ',', '.', 'API', 'APIs', 'Advanced', 'After', 'At', 'Beginner', 'Build', 'Click', 'Colab', 'Colabâ€”a', 'Create', 'Customization', 'CycleGAN', 'Distribute', 'Distributed', 'For', 'GPUs', 'Google', 'Hello', 'Jupyter', 'Keras', 'Load', 'Neural', 'Run', 'Sequential', 'TPUs', 'TensorFlow', 'The', 'These', 'This', 'Transformers', 'World', 'a', 'across', 'activations', 'advanced', 'and', 'are', 'as', 'backward', 'basic', 'basics', 'beginners', 'best', 'blocks', 'build', 'building', 'button', 'by', 'code', 'collection', 'custom', 'customization', 'data', 'define-by-run', 'demonstrates', 'directly', 'each', 'environment', 'examples', 'experts', 'fit', 'for', 'formats', 'forward', 'functional', 'guide', 'has', 'hosted', 'how', 'in', 'including', 'input', 'instructive', 'interface', 'is', 'layers', 'learning', 'll', 'load', 'loop', 'loops', 'machine', 'machines', 'many', 'model', 'models', 'multiple', 'no', 'notebook', 'notebooks', 'of', 'open', 'or', 'pass', 'pipelines', 'pl

In [46]:
# # We can modify the tokenizer to use an <|unk|> token if it
# # encounters a word that is not part of the vocabulary. 

# # Furthermore, we add a token between
# # unrelated texts. 

# # For example, when training GPT-like LLMs on multiple independent
# # documents or books, it is common to insert a token before each document or book that
# # follows a previous text source

# Let's now modify the vocabulary to include these two special tokens, <unk> and
# <|endoftext|>, by adding these to the list of all unique words that we created in the
# previous section:

tokens.extend(['<|endoftext|>', '<|unknown|>'])
print(tokens)

all_vocab = {token:id for id, token in enumerate(tokens)}


for i, item in enumerate(list(all_vocab.items())[-5:]):
    print(item)

['!', '"', "'", ',', '.', 'API', 'APIs', 'Advanced', 'After', 'At', 'Beginner', 'Build', 'Click', 'Colab', 'Colabâ€”a', 'Create', 'Customization', 'CycleGAN', 'Distribute', 'Distributed', 'For', 'GPUs', 'Google', 'Hello', 'Jupyter', 'Keras', 'Load', 'Neural', 'Run', 'Sequential', 'TPUs', 'TensorFlow', 'The', 'These', 'This', 'Transformers', 'World', 'a', 'across', 'activations', 'advanced', 'and', 'are', 'as', 'backward', 'basic', 'basics', 'beginners', 'best', 'blocks', 'build', 'building', 'button', 'by', 'code', 'collection', 'custom', 'customization', 'data', 'define-by-run', 'demonstrates', 'directly', 'each', 'environment', 'examples', 'experts', 'fit', 'for', 'formats', 'forward', 'functional', 'guide', 'has', 'hosted', 'how', 'in', 'including', 'input', 'instructive', 'interface', 'is', 'layers', 'learning', 'll', 'load', 'loop', 'loops', 'machine', 'machines', 'many', 'model', 'models', 'multiple', 'no', 'notebook', 'notebooks', 'of', 'open', 'or', 'pass', 'pipelines', 'place'

In [52]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s,i in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unknown|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
        
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [None]:
tokenizer = SimpleTokenizerV2(all_vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
 
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [54]:
tokenizer.encode(text)

[23, 3, 151, 137, 151, 151, 151, 150, 151, 119, 151, 151, 96, 119, 151, 4]

In [55]:
tokenizer.decode(tokenizer.encode(text))

'Hello, <|unknown|> you <|unknown|> <|unknown|> <|unknown|> <|endoftext|> <|unknown|> the <|unknown|> <|unknown|> of the <|unknown|>.'

In [56]:
tokenizer.decode(tokenizer.encode(input_content))

'The TensorFlow tutorials are written as Jupyter notebooks and run directly in Google Colabâ€”a hosted notebook environment that requires no setup. At the top of each tutorial, you\' ll see a Run in Google Colab button. Click the button to open the notebook and run the code yourself. For beginners The best place to start is with the user-friendly Keras sequential API. Build models by plugging together building blocks. After these tutorials, read the Keras guide. Beginner quickstart This" Hello, World!" notebook shows the Keras Sequential API and model. fit. Keras basics This notebook collection demonstrates basic machine learning tasks using Keras. Load data These tutorials use tf. data to load various data formats and build input pipelines. For experts The Keras functional and subclassing APIs provide a define-by-run interface for customization and advanced research. Build your model, then write the forward and backward pass. Create custom layers, activations, and training loops. Adva