In [1]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [21]:
# Models only consume numbers, so text has to be encoded before training.
# Giving every distinct word a unique integer id is one simple encoding.

sentences = [
    'I love my dog',
    'I love my cat',
    'You love my dog!',
    'Do you think my dog is amazing?'
]

print("----------  words codes ----------")
# oov_token is the placeholder used for words that were not seen while fitting.
tokenizer = Tokenizer(num_words = 100, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

word_index = tokenizer.word_index
print(word_index)

print("----------  sentences after encoding ----------")
# Keep the encoded form under its own name: the raw text stays available in
# `sentences`, and the padding step below no longer depends on a `sequences`
# variable left behind in the kernel by a different cell.
sequences = tokenizer.texts_to_sequences(sentences)
print(sequences)

# Single source of truth for the padded length, so the printed label cannot
# drift away from the value actually passed to pad_sequences.
max_len = 10
print(f"-------- sequences after padding with maxlen = {max_len} --------")
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post')
print(padded_sequences)

print("--- padding with unseen words ------")
# Neither word was in the fitted vocabulary, so both encode to the OOV id.
print(pad_sequences(tokenizer.texts_to_sequences(["hello", "suffi"]), maxlen=4, padding='post'))


----------  words codes ----------
{'<OOV>': 1, 'my': 2, 'love': 3, 'dog': 4, 'i': 5, 'you': 6, 'cat': 7, 'do': 8, 'think': 9, 'is': 10, 'amazing': 11}
----------  sentences after encoding ----------
[[5, 3, 2, 4], [5, 3, 2, 7], [6, 3, 2, 4], [8, 6, 9, 2, 4, 10, 11]]
-------- sequences after padding with maxlen = 5 --------
[[ 5  3  2  4  0  0  0  0  0  0]
 [ 5  3  2  7  0  0  0  0  0  0]
 [ 6  3  2  4  0  0  0  0  0  0]
 [ 8  6  9  2  4 10 11  0  0  0]]
--- padding with unseen words ------
[[1 0 0 0]
 [1 0 0 0]]


### Sarcasm Data

In [22]:
import json

data_path = "data/sarcasm.json"
# JSON text is UTF-8; naming the encoding keeps the load independent of the
# platform's default locale encoding.
with open(data_path, 'r', encoding='utf-8') as f:
    datastore = json.load(f)

# Split the records into three parallel lists (same index = same record).
sentences = [item['headline'] for item in datastore]
labels = [item['is_sarcastic'] for item in datastore]
urls = [item['article_link'] for item in datastore]

# Show the first record as a quick sanity check of the parsed fields.
print("Item 0")
print(f"headline: {sentences[0]}")
print(f"is_sarcastic: {labels[0]}")
print(f"article_link: {urls[0]}")

Item 0
headline: former versace store clerk sues over secret 'black code' for minority shoppers
is_sarcastic: 0
article_link: https://www.huffingtonpost.com/entry/versace-black-code_us_5861fbefe4b0de3a08f600d5


In [36]:
# Learn the vocabulary from the sarcasm headlines; "<OOV>" stands in for
# any word that was not seen during fitting.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# Encode every headline as a list of word ids, then pad at the end ('post')
# so that all rows of the resulting array have the same length.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# First encoded headline, followed by the shape of the padded array.
print(padded[0], padded.shape, sep="\n")

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


### BBC Text archive

In [32]:
import csv
data_path = "data/bbc-text.csv"
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

sentences = []
labels = []
# newline='' is how the csv module expects its input file to be opened, so
# that the reader itself handles line endings (including any inside quotes).
with open(data_path, 'r', newline='') as f:
    csv_reader = csv.reader(f, delimiter=',')
    next(csv_reader, None)  # skip the header row; harmless if the file is empty
    for row in csv_reader:
        # Column 0 is stored as the label, column 1 as the text.
        labels.append(row[0])
        sentence = row[1]

        # Drop each stopword that appears surrounded by single spaces, then
        # collapse any double space the removal leaves behind.
        # NOTE(review): because a space is required on both sides, a stopword
        # at the very start or end of the text is not removed. Kept as-is so
        # the vocabulary and padded shapes computed in later cells stay the same.
        for w in stopwords:
            sentence = sentence.replace(' ' + w + ' ', ' ').replace('  ', ' ')
        sentences.append(sentence)

# Both counts should match: one label per text.
print(len(sentences))
print(len(labels))

2225
2225


In [33]:
# Fit a fresh tokenizer on the stopword-filtered BBC texts and report how
# many entries the resulting word index holds.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
vocab_size = len(word_index)
print(vocab_size)



29714


In [34]:
# Turn every BBC text into a list of word ids, then pad at the end ('post')
# so all rows share one length.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')

# Print the first padded row and then the shape of the whole array.
for shown in (padded[0], padded.shape):
    print(shown)

[  96  176 1158 ...    0    0    0]
(2225, 2442)


In [39]:
# The category names are text as well, so they get their own tokenizer
# (no OOV token is configured here).
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(labels)
label_word_index = label_tokenizer.word_index

# Each label is encoded as a one-element list holding its category id.
label_seq = label_tokenizer.texts_to_sequences(labels)

# First ten encoded labels, then the category -> id mapping.
print(label_seq[:10], label_word_index, sep="\n")

[[4], [2], [1], [1], [5], [3], [3], [1], [1], [5]]
{'sport': 1, 'business': 2, 'politics': 3, 'tech': 4, 'entertainment': 5}
