- Tokenization
- Sequencing
- Padding
- Stemming
- Lemmatization

In [4]:
import warnings
# Silence every warning for the rest of the session so the cell outputs stay uncluttered.
# NOTE(review): a blanket 'ignore' also hides any deprecation notices raised by the
# libraries imported later in this notebook — consider narrowing the filter.
warnings.filterwarnings('ignore')

---
## Tokenization

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
# Fit a tokenizer on a one-sentence corpus; every distinct word is assigned an integer id.
sentence = [
    'We love machine learning',
]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence)

In [8]:
# Display the word -> integer id mapping built by fit_on_texts.
tokenizer.word_index

{'we': 1, 'love': 2, 'machine': 3, 'learning': 4}

In [9]:
# Repeated words: 'learning' occurs twice, so it receives the lowest id
# (more frequent words get smaller ids; ties keep their order of first appearance).
sentence = [
    'We love machine learning and deep learning',
]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence)
tokenizer.word_index

{'learning': 1, 'we': 2, 'love': 3, 'machine': 4, 'and': 5, 'deep': 6}

In [10]:
# The tokenizer lower-cases its input, so 'LEARNING' and 'learning' count as the same word.
# Special characters and punctuation ('@', '.', ',', '!') are dropped and never reach the vocabulary.
sentence = [
    '@ We love machine LEARNING...,!! and deep learning',
]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence)
tokenizer.word_index

{'learning': 1, 'we': 2, 'love': 3, 'machine': 4, 'and': 5, 'deep': 6}

In [13]:
# A multi-sentence corpus: all sentences contribute to one shared vocabulary.
# 'We' and 'we' collapse into a single entry, while 'learning' and 'learned' remain separate words.
sentence = [
    'We are learning natural language processing',
    'We have learned computer vision',
    'we are learning text preprocessing',
    'we are learning from a good trainer',
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentence)
tokenizer.word_index

{'we': 1,
 'are': 2,
 'learning': 3,
 'natural': 4,
 'language': 5,
 'processing': 6,
 'have': 7,
 'learned': 8,
 'computer': 9,
 'vision': 10,
 'text': 11,
 'preprocessing': 12,
 'from': 13,
 'a': 14,
 'good': 15,
 'trainer': 16}

---
## Sequencing

In [14]:
# Corpus for the sequencing examples; the tokenizer fitted here is reused by the following cells.
sentences = [
    'We are learning text preprocessing',
    'Tokenization refers to representing each word as a token',
    'Sequencing refers to representing sentences as sequence of tokens',
    'Padding refers to adding zeros to sequences to make them all of same length',
]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
tokenizer.word_index

{'to': 1,
 'refers': 2,
 'representing': 3,
 'as': 4,
 'of': 5,
 'we': 6,
 'are': 7,
 'learning': 8,
 'text': 9,
 'preprocessing': 10,
 'tokenization': 11,
 'each': 12,
 'word': 13,
 'a': 14,
 'token': 15,
 'sequencing': 16,
 'sentences': 17,
 'sequence': 18,
 'tokens': 19,
 'padding': 20,
 'adding': 21,
 'zeros': 22,
 'sequences': 23,
 'make': 24,
 'them': 25,
 'all': 26,
 'same': 27,
 'length': 28}

In [15]:
# Sequencing: replace every word of each sentence with its id from word_index.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[6, 7, 8, 9, 10],
 [11, 2, 1, 3, 12, 13, 4, 14, 15],
 [16, 2, 1, 3, 17, 4, 18, 5, 19],
 [20, 2, 1, 21, 22, 1, 23, 1, 24, 25, 26, 5, 27, 28]]

In [19]:
# Encode a sentence containing unseen words: 'involves' and 'and' are not in the vocabulary and,
# because this tokenizer has no OOV token, they are dropped from the result without any warning.
tokenizer.texts_to_sequences(['Text preprocessing involves tokenization, sequencing and padding'])

[[9, 10, 11, 16, 20]]

In [20]:
# 'does', 'not', 'involve' and 'and' are all unknown and therefore dropped, so this negated sentence
# yields exactly the same ids as 'Text preprocessing involves tokenization, sequencing and padding'.
tokenizer.texts_to_sequences(['Text preprocessing does not involve tokenization, sequencing and padding'])

[[9, 10, 11, 16, 20]]

In [21]:
# Out-of-vocabulary handling: refit on the same corpus, this time reserving an '#OOV' entry.
# The OOV token takes id 1, which shifts every real word up by one.
sentences = [
    'We are learning text preprocessing',
    'Tokenization refers to representing each word as a token',
    'Sequencing refers to representing sentences as sequence of tokens',
    'Padding refers to adding zeros to sequences to make them all of same length',
]

tokenizer = Tokenizer(oov_token='#OOV')
tokenizer.fit_on_texts(sentences)
tokenizer.word_index

{'#OOV': 1,
 'to': 2,
 'refers': 3,
 'representing': 4,
 'as': 5,
 'of': 6,
 'we': 7,
 'are': 8,
 'learning': 9,
 'text': 10,
 'preprocessing': 11,
 'tokenization': 12,
 'each': 13,
 'word': 14,
 'a': 15,
 'token': 16,
 'sequencing': 17,
 'sentences': 18,
 'sequence': 19,
 'tokens': 20,
 'padding': 21,
 'adding': 22,
 'zeros': 23,
 'sequences': 24,
 'make': 25,
 'them': 26,
 'all': 27,
 'same': 28,
 'length': 29}

In [22]:
# Re-encode the training sentences with the OOV-aware tokenizer; every word is known, so id 1 never appears.
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[7, 8, 9, 10, 11],
 [12, 3, 2, 4, 13, 14, 5, 15, 16],
 [17, 3, 2, 4, 18, 5, 19, 6, 20],
 [21, 3, 2, 22, 23, 2, 24, 2, 25, 26, 27, 6, 28, 29]]

In [23]:
# Unknown words ('involves', 'and') are now kept as the OOV id 1 instead of being dropped,
# so the encoded sequence has one id per input word.
tokenizer.texts_to_sequences(['Text preprocessing involves tokenization, sequencing and padding'])

[[10, 11, 1, 12, 17, 1, 21]]

In [24]:
# 'does', 'not', 'involve' and 'and' each become the OOV id 1, so this negated sentence
# no longer encodes to the same ids as its affirmative counterpart.
tokenizer.texts_to_sequences(['Text preprocessing does not involve tokenization, sequencing and padding'])

[[10, 11, 1, 1, 1, 12, 17, 1, 21]]

---
## Padding

In [26]:
# Corpus with sentences of different lengths (4 and 7 words) to motivate padding.
sentences = [
    'We love machine learning',
    'We are learning tokenization',
    'We are learning sequencing',
    'We are learning the technique of padding',
    'machine learning and deep learning are fun',
]

tokenizer = Tokenizer(oov_token='#OOV')
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[3, 6, 5, 2],
 [3, 4, 2, 7],
 [3, 4, 2, 8],
 [3, 4, 2, 9, 10, 11, 12],
 [5, 2, 13, 14, 2, 4, 15]]

In [27]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# With default arguments every sequence is padded with zeros at the front
# up to the length of the longest sequence (7 ids here).
padded_sequences = pad_sequences(sequences)
padded_sequences

array([[ 0,  0,  0,  3,  6,  5,  2],
       [ 0,  0,  0,  3,  4,  2,  7],
       [ 0,  0,  0,  3,  4,  2,  8],
       [ 3,  4,  2,  9, 10, 11, 12],
       [ 5,  2, 13, 14,  2,  4, 15]])

In [28]:
# Spell out the default explicitly: padding zeros go in front of each sequence.
padded_sequences = pad_sequences(sequences, padding='pre')
padded_sequences

array([[ 0,  0,  0,  3,  6,  5,  2],
       [ 0,  0,  0,  3,  4,  2,  7],
       [ 0,  0,  0,  3,  4,  2,  8],
       [ 3,  4,  2,  9, 10, 11, 12],
       [ 5,  2, 13, 14,  2,  4, 15]])

In [29]:
# 'post' padding appends the zeros after the real ids instead of in front of them.
padded_sequences = pad_sequences(sequences, padding='post')
padded_sequences

array([[ 3,  6,  5,  2,  0,  0,  0],
       [ 3,  4,  2,  7,  0,  0,  0],
       [ 3,  4,  2,  8,  0,  0,  0],
       [ 3,  4,  2,  9, 10, 11, 12],
       [ 5,  2, 13, 14,  2,  4, 15]])

In [30]:
# Same corpus plus one much longer sentence (15 words) to show how a single outlier drives the padded length.
sentences = [
    'We love machine learning',
    'We are learning tokenization',
    'We are learning sequencing',
    'We are learning the technique of padding',
    'Machine learning and deep learning are fun',
    'The main goal behind text preprocessing is to give numerical representation to the text data',
]

tokenizer = Tokenizer(oov_token='#OOV')
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
sequences

[[3, 9, 6, 2],
 [3, 4, 2, 10],
 [3, 4, 2, 11],
 [3, 4, 2, 5, 12, 13, 14],
 [6, 2, 15, 16, 2, 4, 17],
 [5, 18, 19, 20, 7, 21, 22, 8, 23, 24, 25, 8, 5, 7, 26]]

In [31]:
# Without maxlen, all rows grow to the longest sequence (15 ids), so the short sentences become mostly zeros.
padded_sequences = pad_sequences(sequences, padding='pre')
padded_sequences

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  9,  6,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  3,  4,  2,  5, 12, 13, 14],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  6,  2, 15, 16,  2,  4, 17],
       [ 5, 18, 19, 20,  7, 21, 22,  8, 23, 24, 25,  8,  5,  7, 26]])

In [32]:
# Cap the row length at 8: shorter sequences are still front-padded, and the 15-id
# sequence is truncated — by default from the front, keeping its last 8 ids.
padded_sequences = pad_sequences(
    sequences,
    padding='pre',
    maxlen=8,
)
padded_sequences

array([[ 0,  0,  0,  0,  3,  9,  6,  2],
       [ 0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  3,  4,  2,  5, 12, 13, 14],
       [ 0,  6,  2, 15, 16,  2,  4, 17],
       [ 8, 23, 24, 25,  8,  5,  7, 26]])

In [33]:
# truncating='pre' states the default explicitly: ids are removed from the start of
# an over-long sequence, so its final 8 ids survive.
padded_sequences = pad_sequences(
    sequences,
    padding='pre',
    maxlen=8,
    truncating='pre',
)
padded_sequences

array([[ 0,  0,  0,  0,  3,  9,  6,  2],
       [ 0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  3,  4,  2,  5, 12, 13, 14],
       [ 0,  6,  2, 15, 16,  2,  4, 17],
       [ 8, 23, 24, 25,  8,  5,  7, 26]])

In [34]:
# truncating='post' removes ids from the end of an over-long sequence instead,
# so its first 8 ids survive.
padded_sequences = pad_sequences(
    sequences,
    padding='pre',
    maxlen=8,
    truncating='post',
)
padded_sequences

array([[ 0,  0,  0,  0,  3,  9,  6,  2],
       [ 0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  3,  4,  2,  5, 12, 13, 14],
       [ 0,  6,  2, 15, 16,  2,  4, 17],
       [ 5, 18, 19, 20,  7, 21, 22,  8]])

---
---


In [35]:
# Complete process: tokenization -> sequencing -> padding
sentences = [
    'We love machine learning',
    'We are learning tokenization',
    'We are learning sequencing',
    'We are learning the technique of padding',
    'Machine learning and deep learning are fun',
    'The main goal behind text preprocessing is to give numerical representation to the text data',
]

# Create a tokenizer object that maps unseen words to the '#OOV' entry.
tokenizer = Tokenizer(oov_token='#OOV')
# Tokenization: learn the vocabulary from the corpus.
tokenizer.fit_on_texts(sentences)
# Sequencing: turn each sentence into a list of word ids.
sequences = tokenizer.texts_to_sequences(sentences)
# Padding: front-pad short sequences to 10 ids; the 15-id sentence is cut down to its last 10 ids.
padded_sequences = pad_sequences(sequences, maxlen=10)
padded_sequences

array([[ 0,  0,  0,  0,  0,  0,  3,  9,  6,  2],
       [ 0,  0,  0,  0,  0,  0,  3,  4,  2, 10],
       [ 0,  0,  0,  0,  0,  0,  3,  4,  2, 11],
       [ 0,  0,  0,  3,  4,  2,  5, 12, 13, 14],
       [ 0,  0,  0,  6,  2, 15, 16,  2,  4, 17],
       [21, 22,  8, 23, 24, 25,  8,  5,  7, 26]])

# =================================================

---
## Stemming

In [36]:
from nltk.stem import PorterStemmer
# One stemmer instance is created here and reused by the next cell.
stemmer = PorterStemmer()
# Stemming strips the '-ing' suffix: 'breaking' -> 'break'.
stemmer.stem('breaking')

'break'

In [37]:
# Stem a mix of regular and irregular word forms, printing one stem per line.
# The stems are not always dictionary words ('chang', 'troubl', 'knive', 'leav'),
# and irregular forms such as 'broke' and 'ran' come back unchanged.
sample_words = [
    'breaks', 'breaking', 'broke', 'broken',
    'changes', 'changed', 'changing',
    'writes', 'writing',
    'running',
    'trouble', 'troubling', 'troubled',
    'ran',
    'cats', 'knives', 'leaves',
]
for sample_word in sample_words:
    print(stemmer.stem(sample_word))

break
break
broke
broken
chang
chang
chang
write
write
run
troubl
troubl
troubl
ran
cat
knive
leav


---
## Lemmatization

In [41]:
import nltk
# Fetch the WordNet data (and the Open Multilingual WordNet package) into the local nltk_data folder;
# presumably both are required by the WordNetLemmatizer used in the cells below.
# NOTE(review): this cell's execution count (41) is higher than that of the lemmatizer cells
# that follow it (38, 39), i.e. it was run after them. On a fresh kernel it must run first.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...


True

In [38]:
from nltk.stem import WordNetLemmatizer
# One lemmatizer instance is created here and reused by the next cell.
lemmatizer = WordNetLemmatizer()
# No pos argument is given, so the lemmatizer's default part of speech is used: 'breaks' -> 'break'.
lemmatizer.lemmatize('breaks')

'break'

In [39]:
# Lemmatize the sample words, printing one lemma per line. In contrast to stemming, the results
# are dictionary words and irregular forms are resolved ('broke' -> 'break', 'ran' -> 'run',
# 'knives' -> 'knife').
# Each word is paired with its part of speech, because the lemma depends on it:
# 'v' for the verb forms, 'n' for the plural nouns.
words_with_pos = [
    ('breaks', 'v'),
    ('breaking', 'v'),
    ('broke', 'v'),
    ('broken', 'v'),
    ('changes', 'v'),
    ('changed', 'v'),
    ('changing', 'v'),
    ('writes', 'v'),
    ('writing', 'v'),
    ('running', 'v'),
    ('trouble', 'v'),
    ('troubling', 'v'),
    ('troubled', 'v'),
    ('ran', 'v'),
    ('cats', 'n'),    # plural noun, so it is looked up as a noun rather than as a verb
    ('knives', 'n'),
    ('leaves', 'n'),  # as a noun this gives 'leaf'
]
for word, pos in words_with_pos:
    print(lemmatizer.lemmatize(word, pos=pos))

break
break
break
break
change
change
change
write
write
run
trouble
trouble
trouble
run
cat
knife
leaf
