### Import necessary packages

In [78]:
import json
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

### Read JSON File

In [79]:
with open("headline_sarcasm/Sarcasm_Headlines_Dataset.json", "r") as f:
    sentences = []
    labels = []
    url = []
    
    for line in f:
        data = json.loads(json.loads(json.dumps([line]))[0])
        url.append(data['article_link'])
        sentences.append(data['headline'])
        labels.append(data['is_sarcastic'])

###  Train_Test

In [85]:
train_size = int(len(labels) * 0.7)
# split sentences
train_sentences = sentences[:train_size]
test_sentences = sentences[train_size:]
# split labels
train_labels = np.array(labels[:train_size])
test_labels = np.array(labels[train_size:])

print(train_sentences[1])

the 'roseanne' revival catches up to our thorny political mood, for better and worse


### Tokenization

In [86]:
vocab_size = 30000
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(train_sentences)
word_index=tokenizer.word_index
print(word_index)



### Sequences

In [87]:
maxLength = 20
trunc_type= 'post'
# Create a sequence
train_sequence = tokenizer.texts_to_sequences(train_sentences)
# Padding
train_padded = pad_sequences(train_sequence, padding='post',
                             maxlen = maxLength, truncating = trunc_type)
train_padded = np.array(train_padded)

test_sequence=tokenizer.texts_to_sequences(test_sentences)
test_padded = np.array(pad_sequences(test_sequence,padding='post',
                            maxlen=max_len, truncating = trunc_type))
print(test_sequence[32])
print(test_padded[32])

[17169, 4811, 18628, 3471, 1542, 12, 128]
[17169  4811 18628  3471  1542    12   128     0     0     0     0     0
     0     0     0     0     0     0     0     0]


### Embedding

#### Build a deep learning network

In [88]:
embedding_dim = 2
model=tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, 
                             input_length=maxLength),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation = 'relu'),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [None]:
numEpochs = 30
batches=100000

history = model.fit(train_padded, train_labels, epochs = numEpochs, batch_size=batches,
                   validation_data=(test_padded, test_labels),verbose = 2)

Epoch 1/30
1/1 - 0s - loss: 6.1153e-07 - accuracy: 1.0000 - val_loss: 2.0140 - val_accuracy: 0.8137 - 103ms/epoch - 103ms/step
Epoch 2/30
1/1 - 0s - loss: 6.1123e-07 - accuracy: 1.0000 - val_loss: 2.0140 - val_accuracy: 0.8137 - 66ms/epoch - 66ms/step
Epoch 3/30
1/1 - 0s - loss: 6.1093e-07 - accuracy: 1.0000 - val_loss: 2.0141 - val_accuracy: 0.8137 - 57ms/epoch - 57ms/step
Epoch 4/30
1/1 - 0s - loss: 6.1063e-07 - accuracy: 1.0000 - val_loss: 2.0142 - val_accuracy: 0.8137 - 65ms/epoch - 65ms/step
Epoch 5/30
1/1 - 0s - loss: 6.1032e-07 - accuracy: 1.0000 - val_loss: 2.0143 - val_accuracy: 0.8137 - 57ms/epoch - 57ms/step
Epoch 6/30
1/1 - 0s - loss: 6.1002e-07 - accuracy: 1.0000 - val_loss: 2.0143 - val_accuracy: 0.8137 - 58ms/epoch - 58ms/step
Epoch 7/30
1/1 - 0s - loss: 6.0972e-07 - accuracy: 1.0000 - val_loss: 2.0144 - val_accuracy: 0.8137 - 64ms/epoch - 64ms/step
Epoch 8/30
1/1 - 0s - loss: 6.0942e-07 - accuracy: 1.0000 - val_loss: 2.0145 - val_accuracy: 0.8137 - 57ms/epoch - 57ms/ste

### Sample classification

In [75]:
sentences=[
    "granny starting to fear spiders in the garden might be real",
    "the weather today is bright and sunny"
]
sequence = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequence, maxlen=maxLength,padding = 'post', 
                       truncating = trunc_type)
print(padded)

[[    1   847     2   890 20551     5     4  2407   414    22   183     0
      0     0     0     0     0     0     0     0]
 [    4  1703   643    11  5930     9 14190     0     0     0     0     0
      0     0     0     0     0     0     0     0]]


In [76]:
model.predict(padded)



array([[0.5002825 ],
       [0.50056684]], dtype=float32)