In [1]:
import numpy as np
# Toy corpus: nine short documents used to demonstrate tokenization.
docs = [
    'Hello world',
    'Nepal Nepal',
    'hip hip hurray',
    'great to see you',
    'k xa khaber',
    'kohli kohli',
    'got it',
    'hello hello',
    'okay talk you later',
]

In [4]:
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [5]:
# Build the tokenizer with an explicit out-of-vocabulary (OOV) placeholder.
# Any word that was not seen while fitting is mapped to this placeholder's
# index when text is later converted to integer sequences.
tokenizer = Tokenizer(
    oov_token='Inez',
)

In [6]:
# Learn the vocabulary from the corpus; this populates word_index,
# word_counts and document_count on the tokenizer.
tokenizer.fit_on_texts(texts=docs)

In [7]:
# Inspect the word -> integer index mapping built by fit_on_texts.
# As the output below shows: words are lower-cased, index 1 is taken by the
# OOV token, more frequent words receive smaller indices, and index 0 is
# never assigned to a word (it is the value used for padding later on).
tokenizer.word_index

{'Inez': 1,
 'hello': 2,
 'nepal': 3,
 'hip': 4,
 'you': 5,
 'kohli': 6,
 'world': 7,
 'hurray': 8,
 'great': 9,
 'to': 10,
 'see': 11,
 'k': 12,
 'xa': 13,
 'khaber': 14,
 'got': 15,
 'it': 16,
 'okay': 17,
 'talk': 18,
 'later': 19}

In [8]:
# Number of occurrences of each word across all documents, listed in order
# of first appearance. The OOV token is not part of these counts.
tokenizer.word_counts

OrderedDict([('hello', 3),
             ('world', 1),
             ('nepal', 2),
             ('hip', 2),
             ('hurray', 1),
             ('great', 1),
             ('to', 1),
             ('see', 1),
             ('you', 2),
             ('k', 1),
             ('xa', 1),
             ('khaber', 1),
             ('kohli', 2),
             ('got', 1),
             ('it', 1),
             ('okay', 1),
             ('talk', 1),
             ('later', 1)])

In [9]:
# Number of documents (not words) the tokenizer was fitted on;
# here it equals len(docs).
tokenizer.document_count

9

In [10]:
# Encode every document as a list of integer word ids looked up in
# tokenizer.word_index. A word missing from that index would be encoded
# with the OOV token's id instead.
sequences = tokenizer.texts_to_sequences(texts=docs)

# Show the encoded documents.
sequences

[[2, 7],
 [3, 3],
 [4, 4, 8],
 [9, 10, 11, 5],
 [12, 13, 14],
 [6, 6],
 [15, 16],
 [2, 2],
 [17, 18, 5, 19]]

In [11]:
from keras.utils import pad_sequences

In [12]:
# Zero-pad at the end of each row ('post') so that every sequence has the
# length of the longest one, producing a rectangular integer array.
sequences = pad_sequences(
    sequences,
    padding='post',
)
sequences

array([[ 2,  7,  0,  0],
       [ 3,  3,  0,  0],
       [ 4,  4,  8,  0],
       [ 9, 10, 11,  5],
       [12, 13, 14,  0],
       [ 6,  6,  0,  0],
       [15, 16,  0,  0],
       [ 2,  2,  0,  0],
       [17, 18,  5, 19]], dtype=int32)