In [1]:
# Toy training corpus: ten short English sentences that the tokenizer is fitted on
# and that are later converted to integer sequences.
texts = [
    "The cat sat on the mat.",
    "The dog barked loudly at night.",
    "I love eating apples and bananas.",
    "She is reading a book in the library.",
    "The quick brown fox jumps over the lazy dog.",
    "Machine learning is a branch of artificial intelligence.",
    "Deep learning models need a lot of data.",
    "The weather today is sunny and bright.",
    "Natural language processing helps computers understand text.",
    "He bought fresh vegetables from the market."
]


# What Tokenizer Does

1. Builds a Vocabulary

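    - It scans all the training texts, lowercases them and strips punctuation, then assigns each unique word an integer index starting at 1. More frequent words get smaller indices; index 0 is never assigned to a word, so it can be used for padding.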

    INPUT ---> ` texts = ["I love cats", "I love dogs"] `
    
    OUTPUT ---> ` {'i': 1, 'love': 2, 'cats': 3, 'dogs': 4} `

2. Converts Text → Sequences of Integers

    - After fitting, you can transform sentences into lists of integers.

    INPUT ---> ` tokenizer.texts_to_sequences(["I love dogs"]) `
    
    OUTPUT ---> ` [[1, 2, 4]] `


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

### Problem

When you fit a Tokenizer on your training text, it builds a vocabulary (word → integer mapping).

But during inference (testing or production), you might encounter a new word that was not in the training vocabulary.

### Solution: oov_token

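If you set an OOV (out-of-vocabulary) token, every unknown word is mapped to that special token (here `<nothing>`, which receives index 1) instead of being silently dropped. For example, with the vocabulary built below, `tokenizer.texts_to_sequences(["The cat flew"])` returns `[[2, 9, 1]]`: "flew" was never seen during fitting, so it becomes 1.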

In [5]:
# Create the tokenizer and reserve '<nothing>' as the placeholder for words
# that are not in the fitted vocabulary (it shows up as index 1 in word_index).
tokenizer = Tokenizer(oov_token='<nothing>')

In [6]:
# Build the vocabulary (word -> integer mapping) from the training sentences.
# This updates `tokenizer` in place; nothing is returned.
tokenizer.fit_on_texts(texts)

In [7]:
# Inspect the learned word -> index mapping: the OOV token comes first,
# then words ordered from most to least frequent.
tokenizer.word_index

{'<nothing>': 1,
 'the': 2,
 'is': 3,
 'a': 4,
 'dog': 5,
 'and': 6,
 'learning': 7,
 'of': 8,
 'cat': 9,
 'sat': 10,
 'on': 11,
 'mat': 12,
 'barked': 13,
 'loudly': 14,
 'at': 15,
 'night': 16,
 'i': 17,
 'love': 18,
 'eating': 19,
 'apples': 20,
 'bananas': 21,
 'she': 22,
 'reading': 23,
 'book': 24,
 'in': 25,
 'library': 26,
 'quick': 27,
 'brown': 28,
 'fox': 29,
 'jumps': 30,
 'over': 31,
 'lazy': 32,
 'machine': 33,
 'branch': 34,
 'artificial': 35,
 'intelligence': 36,
 'deep': 37,
 'models': 38,
 'need': 39,
 'lot': 40,
 'data': 41,
 'weather': 42,
 'today': 43,
 'sunny': 44,
 'bright': 45,
 'natural': 46,
 'language': 47,
 'processing': 48,
 'helps': 49,
 'computers': 50,
 'understand': 51,
 'text': 52,
 'he': 53,
 'bought': 54,
 'fresh': 55,
 'vegetables': 56,
 'from': 57,
 'market': 58}

In [8]:
# Inspect how many times each word occurred in the training texts
# (listed in the order the words were first seen).
tokenizer.word_counts

OrderedDict([('the', 8),
             ('cat', 1),
             ('sat', 1),
             ('on', 1),
             ('mat', 1),
             ('dog', 2),
             ('barked', 1),
             ('loudly', 1),
             ('at', 1),
             ('night', 1),
             ('i', 1),
             ('love', 1),
             ('eating', 1),
             ('apples', 1),
             ('and', 2),
             ('bananas', 1),
             ('she', 1),
             ('is', 3),
             ('reading', 1),
             ('a', 3),
             ('book', 1),
             ('in', 1),
             ('library', 1),
             ('quick', 1),
             ('brown', 1),
             ('fox', 1),
             ('jumps', 1),
             ('over', 1),
             ('lazy', 1),
             ('machine', 1),
             ('learning', 2),
             ('branch', 1),
             ('of', 2),
             ('artificial', 1),
             ('intelligence', 1),
             ('deep', 1),
             ('models', 1),
             ('n

In [9]:
# Encode every sentence as a list of vocabulary indices (one list per sentence;
# the lists have different lengths because the sentences do).
sequences = tokenizer.texts_to_sequences(texts)

In [10]:
# Show the integer-encoded sentences; note the differing row lengths.
sequences

[[2, 9, 10, 11, 2, 12],
 [2, 5, 13, 14, 15, 16],
 [17, 18, 19, 20, 6, 21],
 [22, 3, 23, 4, 24, 25, 2, 26],
 [2, 27, 28, 29, 30, 31, 2, 32, 5],
 [33, 7, 3, 4, 34, 8, 35, 36],
 [37, 7, 38, 39, 4, 40, 8, 41],
 [2, 42, 43, 3, 44, 6, 45],
 [46, 47, 48, 49, 50, 51, 52],
 [53, 54, 55, 56, 57, 2, 58]]

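### Add Padding

The encoded sentences have different lengths, but a batch must be a rectangular array. `pad_sequences` fills the shorter sequences with 0 up to the length of the longest one; `padding='post'` appends the zeros at the end instead of the beginning.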

In [11]:
from keras.utils import pad_sequences

In [12]:
# Pad every sequence with trailing zeros (padding='post') so all rows share one length.
# NOTE(review): this rebinds `sequences` from a list of lists to a 2-D integer array;
# re-run the texts_to_sequences cell if the unpadded lists are needed again.
sequences = pad_sequences(sequences, padding='post')

In [13]:
# Show the padded result: one row per sentence, zeros filling the tail of shorter rows.
sequences

array([[ 2,  9, 10, 11,  2, 12,  0,  0,  0],
       [ 2,  5, 13, 14, 15, 16,  0,  0,  0],
       [17, 18, 19, 20,  6, 21,  0,  0,  0],
       [22,  3, 23,  4, 24, 25,  2, 26,  0],
       [ 2, 27, 28, 29, 30, 31,  2, 32,  5],
       [33,  7,  3,  4, 34,  8, 35, 36,  0],
       [37,  7, 38, 39,  4, 40,  8, 41,  0],
       [ 2, 42, 43,  3, 44,  6, 45,  0,  0],
       [46, 47, 48, 49, 50, 51, 52,  0,  0],
       [53, 54, 55, 56, 57,  2, 58,  0,  0]], dtype=int32)