In [1]:
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
import numpy as np

In [2]:
vocab_size = 10000
embedding_dim = 16
max_length = 100

In [3]:
with open('data/sarcasm.json') as f:
    data = json.load(f)

In [4]:
sentences = []
labels = []
urls = []

for d in data:
    sentences.append(d['headline'])
    labels.append(d['is_sarcastic'])
    urls.append(d['article_link'])

In [5]:
train_size = int(len(sentences) * 0.8)

train_sentences = sentences[:train_size]
test_sentences = sentences[train_size:]
train_labels = labels[:train_size]
test_labels = labels[train_size:]

# convert labels to numpy arrays
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

In [6]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

In [7]:
word_index

{'<OOV>': 1,
 'to': 2,
 'of': 3,
 'the': 4,
 'in': 5,
 'for': 6,
 'a': 7,
 'on': 8,
 'and': 9,
 'with': 10,
 'is': 11,
 'new': 12,
 'trump': 13,
 'man': 14,
 'from': 15,
 'at': 16,
 'about': 17,
 'you': 18,
 'by': 19,
 'this': 20,
 'after': 21,
 'up': 22,
 'out': 23,
 'be': 24,
 'how': 25,
 'that': 26,
 'it': 27,
 'as': 28,
 'not': 29,
 'are': 30,
 'your': 31,
 'what': 32,
 'his': 33,
 'all': 34,
 'he': 35,
 'who': 36,
 'just': 37,
 'has': 38,
 'will': 39,
 'more': 40,
 'into': 41,
 'one': 42,
 'year': 43,
 'report': 44,
 'have': 45,
 'over': 46,
 'area': 47,
 'why': 48,
 'donald': 49,
 'u': 50,
 'day': 51,
 'can': 52,
 'says': 53,
 's': 54,
 'first': 55,
 'woman': 56,
 'time': 57,
 'like': 58,
 'get': 59,
 'her': 60,
 'old': 61,
 "trump's": 62,
 'no': 63,
 'now': 64,
 'off': 65,
 'an': 66,
 'life': 67,
 'obama': 68,
 'people': 69,
 'than': 70,
 'women': 71,
 "'": 72,
 'house': 73,
 'back': 74,
 'was': 75,
 'still': 76,
 'white': 77,
 'make': 78,
 'down': 79,
 'clinton': 80,
 'my': 81,

In [8]:
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')

In [9]:
len(train_padded[0])

100

In [10]:
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

In [11]:
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    GlobalAveragePooling1D(),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [12]:
history = model.fit(train_padded, train_labels, epochs=30, validation_data=(test_padded, test_labels), verbose=2)

Epoch 1/30
668/668 - 2s - loss: 0.6742 - accuracy: 0.5715 - val_loss: 0.6124 - val_accuracy: 0.7741
Epoch 2/30
668/668 - 1s - loss: 0.4528 - accuracy: 0.8221 - val_loss: 0.3866 - val_accuracy: 0.8383
Epoch 3/30
668/668 - 1s - loss: 0.3178 - accuracy: 0.8714 - val_loss: 0.3585 - val_accuracy: 0.8416
Epoch 4/30
668/668 - 2s - loss: 0.2665 - accuracy: 0.8953 - val_loss: 0.3424 - val_accuracy: 0.8553
Epoch 5/30
668/668 - 2s - loss: 0.2308 - accuracy: 0.9100 - val_loss: 0.3433 - val_accuracy: 0.8516
Epoch 6/30
668/668 - 2s - loss: 0.2033 - accuracy: 0.9223 - val_loss: 0.3499 - val_accuracy: 0.8525
Epoch 7/30
668/668 - 2s - loss: 0.1814 - accuracy: 0.9325 - val_loss: 0.3608 - val_accuracy: 0.8525
Epoch 8/30
668/668 - 3s - loss: 0.1623 - accuracy: 0.9390 - val_loss: 0.3758 - val_accuracy: 0.8501
Epoch 9/30
668/668 - 2s - loss: 0.1474 - accuracy: 0.9466 - val_loss: 0.3932 - val_accuracy: 0.8517
Epoch 10/30
668/668 - 1s - loss: 0.1348 - accuracy: 0.9504 - val_loss: 0.4132 - val_accuracy: 0.8456

In [16]:
sentence = [
    'granny starting to fear spiders in the garden might be real',
    'the weather today is bright and sunny'
]

sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

In [17]:
model.predict(padded)

array([[9.8498428e-01],
       [2.3248792e-04]], dtype=float32)