## Lesson 1 - Tokeniser

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus: the three sentences differ in capitalisation and punctuation on
# purpose, so the printed index shows how the tokeniser normalises them.
sentences = ['i love my dog', 'I, love my cat', 'You love my dog!']

# num_words=100 is far above this toy vocabulary (six words in the printed index).
tokeniser = Tokenizer(num_words=100)
tokeniser.fit_on_texts(sentences)

# word_index maps each word to its integer id.
print(tokeniser.word_index)

{'love': 1, 'my': 2, 'i': 3, 'dog': 4, 'cat': 5, 'you': 6}


## Lesson 2 - OOV Token and Padding

In [2]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Rebuild the tokeniser with an out-of-vocabulary placeholder. In the recorded
# output '<OOV>' takes index 1, so every real word sits one higher than it did
# in Lesson 1.
tokeniser = Tokenizer(num_words=100, oov_token='<OOV>')
tokeniser.fit_on_texts(sentences)
# Keep the mapping under its own name for inspection.
word_index = tokeniser.word_index
print('Word index:', word_index)

Word index: {'<OOV>': 1, 'love': 2, 'my': 3, 'i': 4, 'dog': 5, 'cat': 6, 'you': 7}


In [4]:
# Encode each sentence as a list of integer ids looked up in the fitted index.
sequences = tokeniser.texts_to_sequences(sentences)
print(sequences)

[[4, 2, 3, 5], [4, 2, 3, 6], [7, 2, 3, 5]]


In [6]:
# Pad every sequence to length 5. No `padding` argument is passed, and the
# recorded output shows the zeros being added at the front of each row.
padded = pad_sequences(sequences, maxlen=5)
print(padded)

[[0 4 2 3 5]
 [0 4 2 3 6]
 [0 7 2 3 5]]


In [9]:
# Sentences containing words ('really', 'loves', 'manatee') that the tokeniser
# was never fitted on.
test_data = ['i really love my dog', 'my dog loves my manatee']

# Unseen words are encoded with the '<OOV>' id (1 in the recorded output).
test_seq = tokeniser.texts_to_sequences(test_data)
print('Test sequence:', test_seq)

# Pad to a fixed width of 10; the recorded output shows zeros added at the front.
test_pad = pad_sequences(test_seq, maxlen=10)
print('Test pad:\n', test_pad)

Test sequence: [[4, 1, 2, 3, 5], [3, 5, 1, 3, 1]]
Test pad:
 [[0 0 0 0 0 4 1 2 3 5]
 [0 0 0 0 0 3 5 1 3 1]]


## Lesson 3 - Sarcasm

In [12]:
import json

# Load the sarcasm-headlines dataset from disk.
# The encoding is stated explicitly: without it, open() falls back to the
# platform's locale encoding, which is not UTF-8 on every system and can
# mis-decode or reject non-ASCII characters in the headlines.
with open('tmp/sarcasm.json', 'r', encoding='utf-8') as f:
    datastore = json.load(f)

# Sanity check: number of records, then the first record to show its fields.
print(len(datastore))
print(datastore[0])

26709
{'article_link': 'https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5', 'headline': "former versace store clerk sues over secret 'black code' for minority shoppers", 'is_sarcastic': 0}


In [14]:
# Split the records into two parallel lists: the headline text and its
# 0/1 'is_sarcastic' label, in the same order as they appear in datastore.
sentences = [record['headline'] for record in datastore]
labels = [record['is_sarcastic'] for record in datastore]

# Both lists should have one entry per record.
print('Sentences:', len(sentences))
print('Labels:', len(labels))

Sentences: 26709
Labels: 26709


In [17]:
# Fit a fresh tokeniser on the headlines. Unlike the earlier lessons, no
# num_words argument is passed here; the '<OOV>' placeholder is kept.
tokeniser = Tokenizer(oov_token='<OOV>')
tokeniser.fit_on_texts(sentences)

In [28]:
word_index = tokeniser.word_index

# Vocabulary size, then the first 20 words in the dict's own order
# (iterating a dict yields its keys).
print(len(word_index))
print(list(word_index)[:20])

29657
['<OOV>', 'to', 'of', 'the', 'in', 'for', 'a', 'on', 'and', 'with', 'is', 'new', 'trump', 'man', 'from', 'at', 'about', 'you', 'this', 'by']


In [29]:
# Encode every headline as a list of word ids.
sequences = tokeniser.texts_to_sequences(sentences)
# padding='post' puts the zeros after the tokens (see padded[0] in the
# recorded output).
# NOTE(review): no maxlen is passed; per the Keras docs rows are then padded to
# the longest sequence, which matches the width of 40 in the recorded shape.
padded = pad_sequences(sequences, padding='post')
# Show one encoded headline and the overall (rows, width) of the matrix.
print(padded[0])
print(padded.shape)

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)
