In [3]:
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
import nltk
# Load the dataset
def load_data(file_path):
    """Read a JSON Lines file of headlines into a DataFrame.

    Every non-blank line of the file is parsed as one JSON object that
    must provide a ``headline`` key and an ``is_sarcastic`` key.

    Args:
        file_path: Path to the JSON Lines file.

    Returns:
        A DataFrame with a ``headline`` column (values taken as-is from
        each record) and a ``label`` column (1 where ``is_sarcastic``
        equals 1, otherwise 0).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``headline`` or ``is_sarcastic``.
    """
    headlines = []
    labels = []
    # Explicit UTF-8: the platform default encoding is locale-dependent and
    # can mis-decode non-ASCII characters in the headlines.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would otherwise reject.
            if not line.strip():
                continue
            # Load each line as a separate JSON object
            data = json.loads(line)
            headlines.append(data['headline'])
            labels.append(1 if data['is_sarcastic'] == 1 else 0)
    return pd.DataFrame({'headline': headlines, 'label': labels})

# Load the dataset into a DataFrame
df = load_data('../datasets/Sarcasm_Headlines_Dataset.json')

# Split the dataset into training and testing sets
# (80/20 split; random_state is fixed so the split is repeatable)
X = df['headline']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize the text data
max_words = 5000  # Vocabulary size
max_sequence_length = 100  # Length of input sequences

# The vocabulary is fitted on the training split only, so words that occur
# solely in the test split do not influence the word index.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Convert text to sequences of integers
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform input size
X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)

# Build the LSTM model
model = Sequential()
# `input_length` is intentionally not passed: the argument is deprecated in
# Keras 3 (it only triggers a warning there) and the sequence length is
# already fixed by the padded input arrays.
model.add(Embedding(max_words, 128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
# Single sigmoid unit: the output is read as the probability of label 1.
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split doubles as validation data, so the per-epoch
# val_* metrics and the final evaluation below are computed on the same
# samples. Nothing here selects a model based on them, but a separate
# validation split would be needed before adding early stopping or tuning.
history = model.fit(X_train_pad, y_train, epochs=5, batch_size=64, validation_data=(X_test_pad, y_test))

# Evaluate the model
# evaluate() returns the loss first, then the metrics given to compile().
score, accuracy = model.evaluate(X_test_pad, y_test, verbose=2)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Function to make predictions
def predict_sarcasm(headline):
    """Classify a single headline with the trained model.

    The headline is converted with the module-level ``tokenizer``, padded to
    ``max_sequence_length`` and passed to the module-level ``model``.

    Args:
        headline: The headline text to classify.

    Returns:
        ``"Sarcastic"`` if the predicted probability is greater than 0.5,
        otherwise ``"Not Sarcastic"``.
    """
    sequence = tokenizer.texts_to_sequences([headline])
    padded = pad_sequences(sequence, maxlen=max_sequence_length)
    prediction = model.predict(padded)
    # predict() returns one row per input and one column per output unit;
    # pull out the single probability explicitly instead of relying on the
    # truth value of a 2-D array.
    probability = float(prediction[0][0])
    return "Sarcastic" if probability > 0.5 else "Not Sarcastic"

# Example usage: classify a couple of hand-written headlines and print the
# predicted label for each, in order.
example_headlines = (
    "I love waiting in long lines!",  # Sarcastic example
    "I can't wait to go to the beach!",  # Non-sarcastic example
)
for example in example_headlines:
    print(predict_sarcasm(example))

Epoch 1/5




[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 202ms/step - accuracy: 0.7123 - loss: 0.5293 - val_accuracy: 0.8497 - val_loss: 0.3428
Epoch 2/5
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 185ms/step - accuracy: 0.8876 - loss: 0.2689 - val_accuracy: 0.8486 - val_loss: 0.3397
Epoch 3/5
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 215ms/step - accuracy: 0.9149 - loss: 0.2151 - val_accuracy: 0.8512 - val_loss: 0.3567
Epoch 4/5
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 210ms/step - accuracy: 0.9310 - loss: 0.1786 - val_accuracy: 0.8517 - val_loss: 0.3757
Epoch 5/5
[1m334/334[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 207ms/step - accuracy: 0.9452 - loss: 0.1434 - val_accuracy: 0.8484 - val_loss: 0.4248
167/167 - 4s - 23ms/step - accuracy: 0.8484 - loss: 0.4248
Test Accuracy: 84.84%
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 577ms/step
Not Sarcastic
[1m1/1[0m [32m━━━━━