In [1]:
%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (4.5 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.7.0-py3-none-any.whl.metadata (1.5 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Downloading opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf>=5.28.0 (from tensorflow)
  Downlo

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus: three short lyric lines, one document per list item.
data = [
    'Yeah, you can be the greatest,',
    'you can be the best.',
    'You can be the King Kong bangin’ on your chest.',
]

# Build the tokenizer: lowercase the text, split on single spaces, and strip
# the characters listed in `filters`. Matrix rows produced later are
# `num_words` (25) columns wide.
# Note: the filter lists the ASCII apostrophe ('), not the typographic one,
# so the ’ in "bangin’" is not stripped and stays part of that token.
tokenizer = Tokenizer(
    num_words=25,
    filters="!\"#$%&()*+,-./:;<=>?[\\]^_'{|}~\t\n",
    lower=True,
    split=' ',
)

# fit_on_texts() works in place: it builds the vocabulary (word_index) and the
# frequency tables on the tokenizer object and returns None. The return value
# is captured and printed only to demonstrate that there is nothing to use.
fit_on_text = tokenizer.fit_on_texts(data)
print("fit_on_text : ", fit_on_text)

# word -> integer id. Ids start at 1, and the most frequent words in this
# corpus ("you", "can", "be", "the") receive the lowest ids.
word_index = tokenizer.word_index
print("word_index : ", word_index)

# Each text encoded as the list of ids of its words, in order.
text_to_seq = tokenizer.texts_to_sequences(data)
print("text_to_seq : ", text_to_seq)
# Decode the id sequences back to text. Reuse `text_to_seq` instead of
# re-encoding `data`; the result is the lowercased, filtered form of each text.
seq_to_text = tokenizer.sequences_to_texts(text_to_seq)
print("seq_to_text : ", seq_to_text)

# Number of occurrences of each word across the whole corpus.
word_counts = tokenizer.word_counts
print("word_counts : ", word_counts)

# Number of texts each word appears in (see the Keras Tokenizer docs). No word
# repeats within a single text here, so the numbers match word_counts.
word_docs = tokenizer.word_docs
print("word_docs : ", word_docs)

# texts_to_matrix(mode='binary'): one row per text, num_words (25) columns.
# Column i is 1.0 when the word with id i occurs in the text and 0.0 otherwise,
# so each row is a multi-hot (bag-of-words) vector rather than a one-hot one:
# word order and repetition are lost. Column 0 is never set because ids start
# at 1.
# Example: ids [5, 1, 2, 3, 4, 6] set columns 1 through 6 to 1.0.
binary_text_tokenizer = tokenizer.texts_to_matrix(data, mode='binary')
print("binary_text_tokenizer : ", binary_text_tokenizer)

# sequences_to_matrix(): the same encoding, but starting from id sequences.
# Useful when the sequences already exist and the text need not be tokenized
# again, which is why `text_to_seq` is reused here.
binary_tokenizer = tokenizer.sequences_to_matrix(text_to_seq, mode='binary')
print("binary_tokenizer : ", binary_tokenizer)

# Verify, rather than only state, that both routes give the same matrix.
assert (binary_text_tokenizer == binary_tokenizer).all(), \
    "texts_to_matrix and sequences_to_matrix disagree"


fit_on_text :  None
word_index :  {'you': 1, 'can': 2, 'be': 3, 'the': 4, 'yeah': 5, 'greatest': 6, 'best': 7, 'king': 8, 'kong': 9, 'bangin’': 10, 'on': 11, 'your': 12, 'chest': 13}
text_to_seq :  [[5, 1, 2, 3, 4, 6], [1, 2, 3, 4, 7], [1, 2, 3, 4, 8, 9, 10, 11, 12, 13]]
seq_to_text :  ['yeah you can be the greatest', 'you can be the best', 'you can be the king kong bangin’ on your chest']
word_counts :  OrderedDict({'yeah': 1, 'you': 3, 'can': 3, 'be': 3, 'the': 3, 'greatest': 1, 'best': 1, 'king': 1, 'kong': 1, 'bangin’': 1, 'on': 1, 'your': 1, 'chest': 1})
word_docs :  defaultdict(<class 'int'>, {'you': 3, 'can': 3, 'the': 3, 'greatest': 1, 'be': 3, 'yeah': 1, 'best': 1, 'on': 1, 'bangin’': 1, 'your': 1, 'kong': 1, 'chest': 1, 'king': 1})
binary_text_tokenizer :  [[0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 1. 1. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0.]
 [0. 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0