<a href="https://colab.research.google.com/github/shreyassks/Learning-Content/blob/master/NLP_Deeplearning.ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [0]:
# Toy corpus: three short sentences that the tokenizer is fitted on.
sentence = [
    'PM Modi and President Trump talked on Kashmir issue in G20 summit',
    'Trump said that Kashmir issue is Bilateral!!',
    'My yoga classes were started again after a long break',
]

In [3]:
# Create the tokenizer. num_words caps how many of the most frequent words are
# used when texts are later converted to sequences (per the Keras docs the cap
# does not shrink word_index itself).
token = Tokenizer(num_words=50)

# Learn the vocabulary from the corpus.
token.fit_on_texts(sentence)

# word_index is a dict mapping each unique word (key) to its integer index (value).
word_dict = token.word_index

# As the printed dictionary shows: the most repeated word gets index 1,
# punctuation is stripped, and upper case is folded to lower case.
print(word_dict)


{'trump': 1, 'kashmir': 2, 'issue': 3, 'pm': 4, 'modi': 5, 'and': 6, 'president': 7, 'talked': 8, 'on': 9, 'in': 10, 'g20': 11, 'summit': 12, 'said': 13, 'that': 14, 'is': 15, 'bilateral': 16, 'my': 17, 'yoga': 18, 'classes': 19, 'were': 20, 'started': 21, 'again': 22, 'after': 23, 'a': 24, 'long': 25, 'break': 26}


In [4]:
# Replace every word of each sentence with the index it received in the
# fitted vocabulary, giving one list of integers per sentence.
word_seq = token.texts_to_sequences(sentence)
print(word_seq)

[[4, 5, 6, 7, 1, 8, 9, 2, 3, 10, 11, 12], [1, 13, 14, 2, 3, 15, 16], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26]]


In [5]:
# A sentence the tokenizer was not fitted on; most of its words are new.
test = [
    'Trump and Modi didnot talk about Pakistan or Imran Khan',
]

# Without an OOV token, words missing from the fitted vocabulary are silently
# dropped, so only the previously indexed words show up in the result.
test_seq = token.texts_to_sequences(test)
print(test_seq)

[[1, 6, 5]]


In [6]:
# To keep a placeholder for unseen words instead of dropping them, pass the
# oov_token keyword when constructing the Tokenizer (OOV = Out Of Vocabulary).
token = Tokenizer(num_words=50, oov_token='<OOV>')

# Re-learn the vocabulary from the same corpus with the new tokenizer.
token.fit_on_texts(sentence)

# Word -> index mapping; in the printed result '<OOV>' takes index 1 and every
# corpus word moves up by one compared with the tokenizer built without it.
word_dict = token.word_index

# Words not seen during fitting now map to the OOV index rather than vanishing.
test_seq = token.texts_to_sequences(test)

print(test_seq)
print(word_dict)

[[2, 7, 6, 1, 1, 1, 1, 1, 1, 1]]
{'<OOV>': 1, 'trump': 2, 'kashmir': 3, 'issue': 4, 'pm': 5, 'modi': 6, 'and': 7, 'president': 8, 'talked': 9, 'on': 10, 'in': 11, 'g20': 12, 'summit': 13, 'said': 14, 'that': 15, 'is': 16, 'bilateral': 17, 'my': 18, 'yoga': 19, 'classes': 20, 'were': 21, 'started': 22, 'again': 23, 'after': 24, 'a': 25, 'long': 26, 'break': 27}


In [7]:
# Padding lets us feed sentences of different lengths as one uniform matrix.
# Every sequence is padded with zeros up to the length of the longest sentence.
# The default is padding='pre' (zeros in front); padding='post' puts the zeros
# after the sentence instead.
# To cap sentences at k words pass maxlen=k; the `truncating` argument
# ('pre' or 'post') only selects which end is cut when a sequence is longer.
#
# NOTE: word_seq was built earlier, before `token` was recreated with an OOV
# token, so these indices follow the first printed vocabulary rather than the
# OOV-aware word_dict.
padding = pad_sequences(word_seq, padding='post')
print(padding)

[[ 4  5  6  7  1  8  9  2  3 10 11 12]
 [ 1 13 14  2  3 15 16  0  0  0  0  0]
 [17 18 19 20 21 22 23 24 25 26  0  0]]
