# Natural Language Processing in TensorFlow by deeplearning.ai
https://www.coursera.org/learn/natural-language-processing-tensorflow/

## Week 1 - Sentiment in text

### Word Encoding

**Tokenizer**

In [0]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [0]:
sentences = ['I love my dog','I love my cat','You Love my dog!','Do you think my dog is amazing']

In [13]:
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
print(word_index)
print(sequences)

{'my': 1, 'love': 2, 'dog': 3, 'i': 4, 'you': 5, 'cat': 6, 'do': 7, 'think': 8, 'is': 9, 'amazing': 10}
[[4, 2, 1, 3], [4, 2, 1, 6], [5, 2, 1, 3], [7, 5, 8, 1, 3, 9, 10]]


Tokenizer creates a corpus of dictionary of {word : token} format

**word_index**
  

>  num_words parameter takes n( in this case n=100) most common words from the text you pass in fit_on_texts()

>  `{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}`

>  *(Auto lowercases and excludes puntuations i.e dog and dog! are same)*




**texts_to_sequences**

> [[3, 1, 2, 4], [3, 1, 2, 5], [6, 1, 2, 4]]



### Padding - Size Uniformity

By default, Padding as well as losing data is 'pre' (can be put to 'post' by padding = 'post' in pad_sequences())

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded = pad_sequences(sequences)
print(padded)

[[ 0  0  0  4  2  1  3]
 [ 0  0  0  4  2  1  6]
 [ 0  0  0  5  2  1  3]
 [ 7  5  8  1  3  9 10]]


In [0]:
test_data = ['I really love my dog','My dog loves my food']

In [19]:
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[4, 2, 1, 3], [1, 3, 1]]


In [20]:
padded_1 = pad_sequences(test_seq)
print(padded_1)

[[4 2 1 3]
 [0 1 3 1]]


oov_token

In [21]:
tokenizer = Tokenizer(num_words=100,oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
oov_sequences = tokenizer.texts_to_sequences(sentences)
print(word_index)
print(oov_sequences)

{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]


In [23]:
test_seq = tokenizer.texts_to_sequences(test_data)
print(test_seq)

[[5, 1, 3, 2, 4], [2, 4, 1, 2, 1]]


In [22]:
padded_2 = pad_sequences(oov_sequences)
print(padded_2)

[[ 0  0  0  5  3  2  4]
 [ 0  0  0  5  3  2  7]
 [ 0  0  0  6  3  2  4]
 [ 8  6  9  2  4 10 11]]


In [24]:
padded_3 = pad_sequences(test_seq)
print(padded_3)

[[5 1 3 2 4]
 [2 4 1 2 1]]


In [25]:
padded_4 = pad_sequences(test_seq,maxlen=10)
print(padded_4)

[[0 0 0 0 0 5 1 3 2 4]
 [0 0 0 0 0 2 4 1 2 1]]


In [26]:
padded_4 = pad_sequences(test_seq,maxlen=10,padding='post')
print(padded_4)

[[5 1 3 2 4 0 0 0 0 0]
 [2 4 1 2 1 0 0 0 0 0]]


In [27]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

sentences = ['I love my dog','I love my cat','You Love my dog!','Do you think my dog is amazing']

tokenizer = Tokenizer(num_words=100,oov_token='<OOV>')
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

padded = pad_sequences(sequences,padding='post',maxlen=10)#default padding = pre and maxlen = longest sequence length
print(padded)

test_data = ['I really love my dog','My dog loves my food'] # <OOV> comes into picture when missing data
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)


{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
[[ 5  3  2  4  0  0  0  0  0  0]
 [ 5  3  2  7  0  0  0  0  0  0]
 [ 6  3  2  4  0  0  0  0  0  0]
 [ 8  6  9  2  4 10 11  0  0  0]]
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
