In [87]:
import json

# Parallel lists: index i of each list describes the same headline record.
sentences = []
labels = []
urls = []

# The file is read as JSON Lines (one JSON object per line); each object is
# expected to provide 'headline', 'is_sarcastic' and 'article_link'.
# The encoding is given explicitly so decoding does not depend on the
# platform's locale default.
with open("Sarcasm_Headlines_Dataset.json", 'r', encoding='utf-8') as f:
    for line in f:
        if not line.strip():
            # Skip blank lines rather than letting json.loads fail on them.
            continue
        item = json.loads(line)  # Parse each line as a JSON object
        sentences.append(item['headline'])
        labels.append(item['is_sarcastic'])
        urls.append(item['article_link'])

In [88]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Exploratory pass: build a vocabulary over every headline to inspect the
# encoded form and the padded matrix dimensions.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# Encode each headline as integer ids, then pad at the end of each row
# ('post'); no maxlen is given here.
sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences, padding='post')
word_index = tokenizer.word_index

# Show the first encoded headline followed by the matrix shape.
print(padded[0], padded.shape, sep="\n")

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [89]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# 80/20 split of the raw headlines and labels with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

# Report how many headlines landed in each split.
for caption, subset in (("Training data shape:", X_train), ("Testing data shape:", X_test)):
    print(caption, len(subset))

Training data shape: 21367
Testing data shape: 5342


In [90]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hold out 20% of the headlines before any vocabulary is built.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.2,
    random_state=42,
)

# Preprocessing settings
oov_token = "<OOV>"
max_length = 40  # You can adjust this
padding_type = 'post'
trunc_type = 'post'

# The tokenizer is fitted on the training split only; the test split is
# encoded with that same vocabulary.
tokenizer = Tokenizer(num_words=100000, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

# Integer-encode both splits.
training_sequences = tokenizer.texts_to_sequences(X_train)
testing_sequences = tokenizer.texts_to_sequences(X_test)

# Pad/truncate both splits with identical options so every row has max_length entries.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)
training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

print("Training data shape:", training_padded.shape)
print("Testing data shape:", testing_padded.shape)

Training data shape: (21367, 40)
Testing data shape: (5342, 40)


In [91]:
import tensorflow as tf
from tensorflow.keras import regularizers

# Model hyperparameters
max_length = 40
embedding_dim = 32
vocab_size = 1 + len(tokenizer.word_index)

# Embedding -> average over the sequence axis -> small L2-regularised dense
# layer -> dropout -> single sigmoid unit for the binary label.
layer_stack = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(
        16,
        activation='relu',
        kernel_regularizer=regularizers.l2(0.001),
    ),
    tf.keras.layers.Dropout(0.7),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model = tf.keras.Sequential(layer_stack)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [92]:
import numpy as np

# Number of training epochs; 30 matches the "Epoch 1/30" shown in the
# recorded output. It was previously never defined in the notebook.
num_epochs = 30

# Convert the label lists produced by train_test_split into NumPy arrays.
# They were previously referenced as training_labels/testing_labels without
# ever being created, so the cell failed on a fresh kernel.
training_labels = np.array(y_train)
testing_labels = np.array(y_test)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Exponential decay: starts at 1e-3 and shrinks tenfold every 10 epochs.
lr_schedule = tf.keras.callbacks.LearningRateScheduler(lambda epoch: 1e-3 * 10 ** (-(epoch / 10)))

# NOTE(review): the test split doubles as validation data and drives early
# stopping, so the reported val_* metrics are not an unbiased test estimate.
# Consider carving a separate validation split from the training data.
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(
        testing_padded,
        testing_labels
    ),
    verbose=2,
    callbacks=[
        early_stopping,
        lr_schedule,
    ])

Epoch 1/30
668/668 - 3s - loss: 0.6614 - accuracy: 0.5909 - val_loss: 0.5678 - val_accuracy: 0.7106 - lr: 0.0010 - 3s/epoch - 4ms/step
Epoch 2/30
668/668 - 3s - loss: 0.5215 - accuracy: 0.8003 - val_loss: 0.4682 - val_accuracy: 0.8347 - lr: 7.9433e-04 - 3s/epoch - 4ms/step
Epoch 3/30
668/668 - 3s - loss: 0.4471 - accuracy: 0.8643 - val_loss: 0.4293 - val_accuracy: 0.8557 - lr: 6.3096e-04 - 3s/epoch - 4ms/step
Epoch 4/30
668/668 - 3s - loss: 0.4025 - accuracy: 0.8867 - val_loss: 0.4098 - val_accuracy: 0.8574 - lr: 5.0119e-04 - 3s/epoch - 4ms/step
Epoch 5/30
668/668 - 3s - loss: 0.3730 - accuracy: 0.9018 - val_loss: 0.3971 - val_accuracy: 0.8613 - lr: 3.9811e-04 - 3s/epoch - 4ms/step
Epoch 6/30
668/668 - 3s - loss: 0.3491 - accuracy: 0.9086 - val_loss: 0.3928 - val_accuracy: 0.8611 - lr: 3.1623e-04 - 3s/epoch - 4ms/step
Epoch 7/30
668/668 - 3s - loss: 0.3291 - accuracy: 0.9183 - val_loss: 0.3895 - val_accuracy: 0.8624 - lr: 2.5119e-04 - 3s/epoch - 4ms/step
Epoch 8/30
668/668 - 3s - loss:

In [94]:
# Unseen headlines to classify.
sentence = [
    "Bruce Springsteen Songs Ranked by the Degree of Flagrancy with Which He Uses the Word “Daddy”",
    "Harris tries to turn the tables on Trump by calling him 'unhinged'",
]

# Encode with the fitted tokenizer and pad with the same settings used for training.
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

# Model outputs, one row per headline.
predictions = model.predict(padded)
print(predictions)

# Anything not strictly above the threshold is reported as "Not Sarcastic".
threshold = 0.5
for i, probability in enumerate(predictions):
    if not probability > threshold:
        print(f"Sentence: '{sentence[i]}' is Not Sarcastic")
        continue
    print(f"Sentence: '{sentence[i]}' is Sarcastic")

[[0.64829034]
 [0.08248807]]
Sentence: 'Bruce Springsteen Songs Ranked by the Degree of Flagrancy with Which He Uses the Word “Daddy”' is Sarcastic
Sentence: 'Harris tries to turn the tables on Trump by calling him 'unhinged'' is Not Sarcastic
