In [1]:
import numpy as np
import pandas as pd

In [3]:
# Toy corpus: ten short English sentences, one string per document.
text_dataset = [
    "The quick brown fox jumps over the lazy dog",
    "Tomorrow is going to be a sunny day",
    "She opened the door and found a surprise waiting for her",
    "I enjoy reading books in my free time",
    "The concert was amazing, and the crowd cheered loudly",
    "He took a deep breath and stepped onto the stage",
    "The smell of freshly baked cookies filled the kitchen",
    "The little girl giggled as the puppy licked her face",
    "After a long day at work, he decided to go for a walk in the park",
    "The old house creaked in the wind, creating an eerie atmosphere",
]

Tokenization

In [4]:
from keras.preprocessing.text import Tokenizer

# `oov_token` sets the out-of-vocabulary (OOV) placeholder: any word that is
# not in the vocabulary learned by the Tokenizer is represented by this token
# ("<nothing>" here) when text is tokenized.
# NOTE(review): this legacy Tokenizer API is marked deprecated in newer
# Keras/TensorFlow releases in favour of the TextVectorization layer —
# confirm against the installed version.
tokenizer = Tokenizer(oov_token="<nothing>")

In [5]:
# Learn the vocabulary from the corpus. This populates `word_index`,
# `word_counts` and `document_count`, which the following cells inspect.
tokenizer.fit_on_texts(text_dataset)

In [6]:
# Word -> integer index mapping. The OOV token takes index 1; the remaining
# words follow, most frequent first (lower-cased, punctuation removed).
tokenizer.word_index

{'<nothing>': 1,
 'the': 2,
 'a': 3,
 'and': 4,
 'in': 5,
 'to': 6,
 'day': 7,
 'for': 8,
 'her': 9,
 'he': 10,
 'quick': 11,
 'brown': 12,
 'fox': 13,
 'jumps': 14,
 'over': 15,
 'lazy': 16,
 'dog': 17,
 'tomorrow': 18,
 'is': 19,
 'going': 20,
 'be': 21,
 'sunny': 22,
 'she': 23,
 'opened': 24,
 'door': 25,
 'found': 26,
 'surprise': 27,
 'waiting': 28,
 'i': 29,
 'enjoy': 30,
 'reading': 31,
 'books': 32,
 'my': 33,
 'free': 34,
 'time': 35,
 'concert': 36,
 'was': 37,
 'amazing': 38,
 'crowd': 39,
 'cheered': 40,
 'loudly': 41,
 'took': 42,
 'deep': 43,
 'breath': 44,
 'stepped': 45,
 'onto': 46,
 'stage': 47,
 'smell': 48,
 'of': 49,
 'freshly': 50,
 'baked': 51,
 'cookies': 52,
 'filled': 53,
 'kitchen': 54,
 'little': 55,
 'girl': 56,
 'giggled': 57,
 'as': 58,
 'puppy': 59,
 'licked': 60,
 'face': 61,
 'after': 62,
 'long': 63,
 'at': 64,
 'work': 65,
 'decided': 66,
 'go': 67,
 'walk': 68,
 'park': 69,
 'old': 70,
 'house': 71,
 'creaked': 72,
 'wind': 73,
 'creating': 74,
 'an': 75,
 'eerie': 76,
 'atmosphere': 77}

In [7]:
# Number of occurrences of each word across the whole corpus, listed in the
# order the words were first encountered.
tokenizer.word_counts

OrderedDict([('the', 13),
             ('quick', 1),
             ('brown', 1),
             ('fox', 1),
             ('jumps', 1),
             ('over', 1),
             ('lazy', 1),
             ('dog', 1),
             ('tomorrow', 1),
             ('is', 1),
             ('going', 1),
             ('to', 2),
             ('be', 1),
             ('a', 5),
             ('sunny', 1),
             ('day', 2),
             ('she', 1),
             ('opened', 1),
             ('door', 1),
             ('and', 3),
             ('found', 1),
             ('surprise', 1),
             ('waiting', 1),
             ('for', 2),
             ('her', 2),
             ('i', 1),
             ('enjoy', 1),
             ('reading', 1),
             ('books', 1),
             ('in', 3),
             ('my', 1),
             ('free', 1),
             ('time', 1),
             ('concert', 1),
             ('was', 1),
             ('amazing', 1),
             ('crowd', 1),
             ('cheered', 1),
  

In [8]:
# Number of texts the tokenizer was fitted on (one per sentence in text_dataset).
tokenizer.document_count

10

Converting text to sequences

In [9]:
# Encode each sentence as a list of integers by replacing every word with its
# index from `tokenizer.word_index`; the lists have different lengths.
sequences = tokenizer.texts_to_sequences(text_dataset)
sequences

[[2, 11, 12, 13, 14, 15, 2, 16, 17],
 [18, 19, 20, 6, 21, 3, 22, 7],
 [23, 24, 2, 25, 4, 26, 3, 27, 28, 8, 9],
 [29, 30, 31, 32, 5, 33, 34, 35],
 [2, 36, 37, 38, 4, 2, 39, 40, 41],
 [10, 42, 3, 43, 44, 4, 45, 46, 2, 47],
 [2, 48, 49, 50, 51, 52, 53, 2, 54],
 [2, 55, 56, 57, 58, 2, 59, 60, 9, 61],
 [62, 3, 63, 7, 64, 65, 10, 66, 6, 67, 8, 3, 68, 5, 2, 69],
 [2, 70, 71, 72, 5, 2, 73, 74, 75, 76, 77]]

Padding

In [10]:
from keras.utils import pad_sequences

# Append zeros to the end of each sequence ("post" padding) so that every row
# is as long as the longest sentence. Note that `sequences` is rebound here:
# the ragged list of lists becomes a rectangular integer array.
sequences = pad_sequences(sequences, padding="post")
sequences

array([[ 2, 11, 12, 13, 14, 15,  2, 16, 17,  0,  0,  0,  0,  0,  0,  0],
       [18, 19, 20,  6, 21,  3, 22,  7,  0,  0,  0,  0,  0,  0,  0,  0],
       [23, 24,  2, 25,  4, 26,  3, 27, 28,  8,  9,  0,  0,  0,  0,  0],
       [29, 30, 31, 32,  5, 33, 34, 35,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 2, 36, 37, 38,  4,  2, 39, 40, 41,  0,  0,  0,  0,  0,  0,  0],
       [10, 42,  3, 43, 44,  4, 45, 46,  2, 47,  0,  0,  0,  0,  0,  0],
       [ 2, 48, 49, 50, 51, 52, 53,  2, 54,  0,  0,  0,  0,  0,  0,  0],
       [ 2, 55, 56, 57, 58,  2, 59, 60,  9, 61,  0,  0,  0,  0,  0,  0],
       [62,  3, 63,  7, 64, 65, 10, 66,  6, 67,  8,  3, 68,  5,  2, 69],
       [ 2, 70, 71, 72,  5,  2, 73, 74, 75, 76, 77,  0,  0,  0,  0,  0]],
      dtype=int32)