### Import necessary packages

In [105]:
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

### Read JSON File

In [106]:
# The file is read one line at a time and every non-empty line is parsed as
# its own JSON object; three fields of each record are collected into
# parallel lists.
sentences = []   # data['headline'] of each record
labels = []      # data['is_sarcastic'] of each record
url = []         # data['article_link'] of each record

with open("headline_sarcasm/Sarcasm_Headlines_Dataset.json", "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at the end of the
        # file) instead of failing with a JSONDecodeError.
        if not line.strip():
            continue
        # Parsing the line directly yields the same dict as the previous
        # dumps/loads round-trip, without the redundant re-encoding.
        data = json.loads(line)
        url.append(data['article_link'])
        sentences.append(data['headline'])
        labels.append(data['is_sarcastic'])

### Train/Test Split

In [109]:
# The first 80% of the examples (in file order) form the training split and
# the remaining 20% the test split; no shuffling is applied.
train_size = int(len(labels) * 0.8)

# Headlines stay as plain Python lists of strings.
train_sentences, test_sentences = sentences[:train_size], sentences[train_size:]

# Labels are converted to NumPy arrays.
train_labels, test_labels = (
    np.array(labels[:train_size]),
    np.array(labels[train_size:]),
)

# Spot-check a single training headline.
print(train_sentences[1])

the 'roseanne' revival catches up to our thorny political mood, for better and worse


### Tokenization

In [117]:
# Upper bound on the number of distinct word ids the tokenizer emits when
# converting text to sequences; rarer words are replaced by the '<OOV>' token.
vocab_size = 30000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
# The vocabulary is built from the training headlines only.
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Show a summary rather than printing the whole dictionary, which would flood
# the cell output with every word seen in the training headlines.
print(f"{len(word_index)} distinct tokens in word_index")
print(list(word_index.items())[:10])



### Sequences

In [118]:
# Every headline is converted to a list of word ids and then padded or
# truncated (both at the end) to exactly `maxLength` ids.
maxLength = 20
trunc_type = 'post'

# Create a sequence
train_sequence = tokenizer.texts_to_sequences(train_sentences)
# Padding -- pad_sequences already returns a NumPy array, so no extra
# np.array(...) wrapper is needed.
train_padded = pad_sequences(train_sequence, padding='post',
                             maxlen=maxLength, truncating=trunc_type)

# The test split goes through the same tokenizer and the same length limit.
# This call previously passed `maxlen=max_len`, a name that is never defined
# in this notebook and therefore raises NameError on a fresh kernel.
test_sequence = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequence, padding='post',
                            maxlen=maxLength, truncating=trunc_type)

# Compare one raw sequence with its padded counterpart.
print(test_sequence[32])
print(test_padded[32])

[3461, 1439, 2, 17881, 10828, 496, 20, 225]
[ 3461  1439     2 17881 10828   496    20   225     0     0     0     0
     0     0     0     0     0     0     0     0]


### Embedding

#### Build a deep learning network

In [121]:
# Size of the vector learned for each word id.
embedding_dim = 2

# Layers are stacked in order: embedding lookup, average over the sequence
# positions, one hidden ReLU layer, and a single sigmoid output unit.
model = tf.keras.Sequential()
model.add(
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=maxLength)
)
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dense(24, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is
# reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [122]:
# Training schedule: number of passes over the training data and the number
# of examples per gradient update.
numEpochs = 30
batches = 100

# The test split is passed as validation data, so its loss and accuracy are
# reported after every epoch; verbose=2 prints one summary line per epoch.
history = model.fit(
    train_padded,
    train_labels,
    batch_size=batches,
    epochs=numEpochs,
    validation_data=(test_padded, test_labels),
    verbose=2,
)

Epoch 1/30
214/214 - 1s - loss: 0.6770 - accuracy: 0.5627 - val_loss: 0.6525 - val_accuracy: 0.6134 - 1s/epoch - 6ms/step
Epoch 2/30
214/214 - 0s - loss: 0.5773 - accuracy: 0.7301 - val_loss: 0.5003 - val_accuracy: 0.8019 - 472ms/epoch - 2ms/step
Epoch 3/30
214/214 - 0s - loss: 0.4004 - accuracy: 0.8717 - val_loss: 0.3900 - val_accuracy: 0.8416 - 473ms/epoch - 2ms/step
Epoch 4/30
214/214 - 0s - loss: 0.2966 - accuracy: 0.9018 - val_loss: 0.3509 - val_accuracy: 0.8553 - 473ms/epoch - 2ms/step
Epoch 5/30
214/214 - 0s - loss: 0.2368 - accuracy: 0.9211 - val_loss: 0.3369 - val_accuracy: 0.8604 - 478ms/epoch - 2ms/step
Epoch 6/30
214/214 - 0s - loss: 0.1944 - accuracy: 0.9348 - val_loss: 0.3330 - val_accuracy: 0.8572 - 468ms/epoch - 2ms/step
Epoch 7/30
214/214 - 0s - loss: 0.1609 - accuracy: 0.9485 - val_loss: 0.3360 - val_accuracy: 0.8598 - 473ms/epoch - 2ms/step
Epoch 8/30
214/214 - 0s - loss: 0.1339 - accuracy: 0.9578 - val_loss: 0.3432 - val_accuracy: 0.8587 - 466ms/epoch - 2ms/step
Epo

### Sample classification

In [123]:
# Hand-written headlines to classify. They are kept under their own name:
# binding them to `sentences` would overwrite the full headline list loaded
# from the dataset, so re-running the train/test split cell afterwards would
# silently split these two strings instead.
sample_sentences = [
    "granny starting to fear spiders in the garden might be real",
    "the weather today is bright and sunny"
]

# Apply the same tokenizer, length limit, padding and truncation settings
# that were used for the training data.
sequence = tokenizer.texts_to_sequences(sample_sentences)
padded = pad_sequences(sequence, maxlen=maxLength, padding='post',
                       truncating=trunc_type)
print(padded)

[[    1   924     2   865 20970     5     4  2511   377    24   178     0
      0     0     0     0     0     0     0     0]
 [    4  1829   642    11  4277     9  7915     0     0     0     0     0
      0     0     0     0     0     0     0     0]]


In [124]:
# One row per row of `padded`; each value is the output of the model's final
# sigmoid unit, so values nearer 1 lean towards label 1 of `is_sarcastic`
# and values nearer 0 towards label 0.
model.predict(padded)



array([[7.6077062e-01],
       [2.8430883e-05]], dtype=float32)