# Sentiment analysis 

## Basics

In [0]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [0]:
# Two-sentence toy corpus used to fit the first tokenizer.
test = [
    "Ivan's studying",
    "He's hungry",
]

In [3]:
# Learn a vocabulary from the toy corpus (vocabulary size capped via num_words).
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(test)

# Encode every sentence as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(test)

# word_index is the learned word -> id mapping; words are lower-cased and
# keep their apostrophes, as the printed dictionary shows.
widx = tokenizer.word_index
print(widx)
print(sequences)

{"ivan's": 1, 'studying': 2, "he's": 3, 'hungry': 4}
[[1, 2], [3, 4]]


## Problem: new words outside the training sample

In [0]:
# Same corpus plus one sentence containing a word ("tired") that the
# tokenizer fitted above has never seen.
fail = [
    "Ivan's studying",
    "He's hungry",
    "He's tired",
]

In [5]:
# Encode with the tokenizer fitted earlier. "tired" is not in its vocabulary,
# so it is silently dropped and the last sentence comes back one token short.
sequences = tokenizer.texts_to_sequences(fail)
print(sequences)

[[1, 2], [3, 4], [3]]


In [6]:
# To avoid this problem, reserve an explicit out-of-vocabulary token so that
# unknown words are mapped to its index instead of being dropped.
# NOTE(review): this tokenizer is fitted on `fail` itself, so every word is
# in-vocabulary and the '<OOV>' index (1) never appears in the sequences
# printed below. Fitting on the original two training sentences instead would
# show the OOV index in action.
tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
tokenizer.fit_on_texts(fail)

sequences = tokenizer.texts_to_sequences(fail)

widx = tokenizer.word_index
print(widx)
print(sequences)

{'<OOV>': 1, "he's": 2, "ivan's": 3, 'studying': 4, 'hungry': 5, 'tired': 6}
[[3, 4], [2, 5], [2, 6]]


## Padding

In [17]:
# pad_sequences is imported in the notebook's import cell at the top.
# Force every sequence to length 2. The result is a 2-D NumPy array rather
# than a list of lists, which the second print confirms.
padded = pad_sequences(sequences, maxlen=2)
print(padded)
print('\n' + str(type(padded)))

[[3 4]
 [2 5]
 [2 6]]

<class 'numpy.ndarray'>


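Note: if `maxlen` is smaller than the longest sequence (2 here), tokens are lost. By default `pad_sequences` truncates from the front (`truncating='pre'`), so only the last token of each sentence survives in the example below.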


In [14]:
# Stored under its own name so the `test` list of training sentences defined
# earlier is not overwritten; re-running the first tokenizer cell would
# otherwise try to fit on this integer array instead of on text.
truncated = pad_sequences(sequences, maxlen=1)
print(truncated)

[[4]
 [5]
 [6]]


In [8]:
# Pad to length 3, appending the zeros after each sequence (padding='post').
padded = pad_sequences(sequences, maxlen=3, padding='post')
print(padded)

[[3 4 0]
 [2 5 0]
 [2 6 0]]


In [11]:
# Recap: vocabulary, raw integer sequences, then the post-padded array.
print(widx, sequences, sep='\n')
print('\nPadded Test Seuquence: ', padded, sep='\n')

{'<OOV>': 1, "he's": 2, "ivan's": 3, 'studying': 4, 'hungry': 5, 'tired': 6}
[[3, 4], [2, 5], [2, 6]]

Padded Test Seuquence: 
[[3 4 0]
 [2 5 0]
 [2 6 0]]
