<a href="https://colab.research.google.com/github/Mubassir1820/NLP-basics/blob/main/Word_Encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries and APIs

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

Define Training Sentences

In [None]:
# Sentences to tokenize
train_sentences = [
    'It is a sunny day',
    # Add a new sentence here. Keep a trailing comma after every entry:
    # without it, Python silently joins adjacent string literals into one.
]

Setup the Tokenizer

In [None]:
# Instantiate the tokenizer; num_words caps how many of the most frequent
# words are used when texts are later converted to sequences.
tokenizer = Tokenizer(num_words=100)

# Train: build the vocabulary from the training sentences.
tokenizer.fit_on_texts(train_sentences)

# Store the learned word -> integer index mapping.
word_index = tokenizer.word_index

In [None]:
# Show the learned vocabulary: a dict mapping each lowercased word to its index.
print(word_index)

{'it': 1, 'is': 2, 'a': 3, 'sunny': 4, 'day': 5}


Creating sequences of tokens

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

Define Training sentences in a list

In [None]:
# Training sentences; the tokenizer vocabulary is built from these.
train_sentences = [
    "It is a sunny day",
    "It is a cloudy day",
    "will it rain today?",
]

Setup the Tokenizer

In [None]:
# Instantiate a fresh tokenizer for the three-sentence corpus.
tokenizer = Tokenizer(num_words=100)

# Train: learn the vocabulary from the training sentences.
tokenizer.fit_on_texts(train_sentences)

# Store the word -> index dictionary for later inspection.
word_index = tokenizer.word_index

In [None]:
# Show the vocabulary; words are lowercased and punctuation such as '?' is stripped.
print(word_index)

{'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9}


Create Sequences

In [None]:
# Create sequences using the tokenizer: each sentence becomes a list of
# the integer indexes of its words, in order.
sequences = tokenizer.texts_to_sequences(train_sentences)

In [None]:
# Print the word dictionary and the encoded sequences, one per line.
print(
    f'Word indexes >>>>{word_index}',
    f'Sequences of words >>>>{sequences}',
    sep='\n',
)

Word indexes >>>>{'it': 1, 'is': 2, 'a': 3, 'day': 4, 'sunny': 5, 'cloudy': 6, 'will': 7, 'rain': 8, 'today': 9}
Sequences of words >>>>[[1, 2, 3, 5, 4], [1, 2, 3, 6, 4], [7, 1, 8, 9]]


In [None]:
# Compare the first training sentence with its encoded form, one per line.
print(train_sentences[0], sequences[0], sep='\n')

It is a sunny day
[1, 2, 3, 5, 4]


Tokenizing new data using the same tokenizer

In [None]:
# Unseen sentences containing words the tokenizer was not trained on.
new_sentences = [
    "Will it be raining today?",
    "It is a pleasant day.",
]

In [None]:
# Encode the new sentences with the already-trained tokenizer. Without an
# OOV token, words absent from the vocabulary are silently dropped.
new_sequences = tokenizer.texts_to_sequences(new_sentences)

In [None]:
# Show the raw sentences followed by their encodings, one per line.
print(new_sentences, new_sequences, sep='\n')

['Will it be raining today?', 'It is a pleasant day.']
[[7, 1, 9], [1, 2, 3, 4]]


Replacing newly encountered words with special values

In [None]:
# Instantiate with an out-of-vocabulary token so unseen words are encoded
# as '<oov>' rather than being dropped.
tokenizer = Tokenizer(num_words=100, oov_token='<oov>')

# Train on the same training sentences as before.
tokenizer.fit_on_texts(train_sentences)

# Store the word -> index dictionary (it now also contains '<oov>').
word_index = tokenizer.word_index

In [None]:
# Encode the new sentences with the OOV-aware tokenizer.
new_sequences = tokenizer.texts_to_sequences(new_sentences)

# Show the vocabulary, the raw sentences and their encodings, one per line.
print(word_index, new_sentences, new_sequences, sep='\n')

{'<oov>': 1, 'it': 2, 'is': 3, 'a': 4, 'day': 5, 'sunny': 6, 'cloudy': 7, 'will': 8, 'rain': 9, 'today': 10}
['Will it be raining today?', 'It is a pleasant day.']
[[8, 2, 1, 1, 10], [2, 3, 4, 1, 5]]


Padding the sequences (so that all sequences have the same length)

In [1]:
#import required APIs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Define the training sentences

In [2]:
# Training sentences of different lengths, used to demonstrate padding.
train_sentences = [
    "It will rain",
    "The weather is cloudy!",
    "Will it be raining today?",
    "It is a super hot day!",
]

Train the tokenizer

In [3]:
# Instantiate the tokenizer with an out-of-vocabulary placeholder.
tokenizer = Tokenizer(num_words=100, oov_token='<oov>')

# Train: learn the vocabulary from the training sentences.
tokenizer.fit_on_texts(train_sentences)

# Store the resulting word -> index dictionary.
word_index = tokenizer.word_index

In [7]:
# Create sequences of the training sentences (one list of word indexes per
# sentence); these variable-length lists are what gets padded next.
sequences = tokenizer.texts_to_sequences(train_sentences)

Pad Sequences

In [8]:
# Pad sequences with the default settings: shorter sequences get zeros
# added at the front until every row is as long as the longest sequence.
padded_seqs = pad_sequences(sequences)

In [9]:
# Show the vocabulary, the raw sentences, their encodings and the padded
# matrix, one per line.
print(word_index, train_sentences, sequences, padded_seqs, sep='\n')

{'<oov>': 1, 'it': 2, 'will': 3, 'is': 4, 'rain': 5, 'the': 6, 'weather': 7, 'cloudy': 8, 'be': 9, 'raining': 10, 'today': 11, 'a': 12, 'super': 13, 'hot': 14, 'day': 15}
['It will rain', 'The weather is cloudy!', 'Will it be raining today?', 'It is a super hot day!']
[[2, 3, 5], [6, 7, 4, 8], [3, 2, 9, 10, 11], [2, 4, 12, 13, 14, 15]]
[[ 0  0  0  2  3  5]
 [ 0  0  6  7  4  8]
 [ 0  3  2  9 10 11]
 [ 2  4 12 13 14 15]]


Customizing the padded sequences with parameters

In [10]:
# Force every sequence to exactly 5 tokens, working at the end of each one:
# zeros are appended to short sequences and extra tokens are cut from the tail.
padded_seqs = pad_sequences(
    sequences,
    maxlen=5,
    padding='post',
    truncating='post',
)

In [11]:
# Show the customized result: every row is 5 tokens long, with zeros at the end.
print(padded_seqs)

[[ 2  3  5  0  0]
 [ 6  7  4  8  0]
 [ 3  2  9 10 11]
 [ 2  4 12 13 14]]
