In [3]:
# Dataset upload (Colab only).
# The interactive picker is skipped when the dataset is already on disk, so a
# "Restart & Run All" does not stop and wait for a manual upload every time.
from pathlib import Path

from google.colab import files

if Path("Sarcasm_Headlines_Dataset.json").exists():
    # Nothing was uploaded during this run; keep `uploaded` defined with the
    # same mapping type that files.upload() returns.
    uploaded = {}
else:
    uploaded = files.upload()  # Then choose your file from system


Saving Sarcasm_Headlines_Dataset.json to Sarcasm_Headlines_Dataset.json


In [10]:
import tensorflow as tf
from transformers import TFBertForSequenceClassification, BertTokenizer, create_optimizer
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# ✅ Load data
# The file is read as JSON Lines (one record per line), hence lines=True.
# Column selection and the string cast are done in a single chain that builds
# a fresh frame. Assigning a column onto a column-subset selection, as the
# previous version did, can raise pandas' SettingWithCopyWarning.
df = (
    pd.read_json("Sarcasm_Headlines_Dataset.json", lines=True)
    .loc[:, ['headline', 'is_sarcastic']]
    .astype({'headline': str})
)

# ✅ Tokenize
# Every headline is padded or truncated to exactly `max_len` token ids so the
# encoded batch comes back as rectangular NumPy arrays.
max_len = 50
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

headlines = df['headline'].tolist()
tokens = tokenizer(
    headlines,
    max_length=max_len,
    truncation=True,
    padding='max_length',
    return_tensors='np',
)

# Model inputs and the target vector, row-aligned with `df`.
input_ids, attention_mask = tokens['input_ids'], tokens['attention_mask']
labels = df['is_sarcastic'].to_numpy(copy=True)

# ✅ Train-test split
# The three arrays are split in one call so ids, masks and labels stay
# row-aligned. The 20% hold-out is used both as validation data during
# training and for the final evaluation.
X_train_ids, X_test_ids, X_train_mask, X_test_mask, y_train, y_test = train_test_split(
    input_ids, attention_mask, labels, test_size=0.2, random_state=42
)


def make_dataset(ids, mask, targets, batch_size=32, shuffle=False, seed=42):
    """Build a batched, prefetched ``tf.data.Dataset`` of ``(features, label)`` pairs.

    Args:
        ids: Array of token ids, one row per example.
        mask: Attention-mask array, row-aligned with ``ids``.
        targets: Label array, row-aligned with ``ids``.
        batch_size: Number of examples per batch.
        shuffle: When True, examples are shuffled before batching and
            reshuffled on every pass over the dataset.
        seed: Seed for the shuffle order (only used when ``shuffle`` is True).

    Returns:
        A dataset yielding ``({"input_ids": ..., "attention_mask": ...}, label)``
        batches.
    """
    dataset = tf.data.Dataset.from_tensor_slices(
        ({"input_ids": ids, "attention_mask": mask}, targets)
    )
    if shuffle:
        # Buffer covers the whole split, so the shuffle is a full permutation.
        dataset = dataset.shuffle(
            buffer_size=len(targets), seed=seed, reshuffle_each_iteration=True
        )
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


# Only the training split is shuffled: without it every epoch would see the
# exact same batches in the exact same order. Evaluation order is irrelevant.
train_dataset = make_dataset(X_train_ids, X_train_mask, y_train, shuffle=True)
val_dataset = make_dataset(X_test_ids, X_test_mask, y_test)

# ✅ Reproducibility
# The classification head is randomly initialised and dropout is stochastic,
# so seed Python, NumPy and TensorFlow before the model is built.
# NOTE: some GPU kernels may still introduce small run-to-run differences.
tf.keras.utils.set_random_seed(42)

# ✅ Load model
# Pre-trained BERT encoder with a fresh 2-class classification head.
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# ✅ Optimizer
# EPOCHS is the single source of truth: it sizes the learning-rate schedule
# and is passed to fit(), so the two can never drift apart.
EPOCHS = 5
steps_per_epoch = len(train_dataset)  # number of batches in one pass
num_train_steps = steps_per_epoch * EPOCHS
optimizer, lr_schedule = create_optimizer(
    init_lr=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=0,
)

# ✅ Compile
# The model outputs raw logits, hence from_logits=True; labels are integer
# class ids, hence the sparse loss.
model.compile(
    optimizer=optimizer,
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)

# ✅ Train (no callbacks to avoid errors)
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=EPOCHS
)

# ✅ Evaluate
# Score the trained model on the held-out split; evaluate() returns the loss
# followed by the compiled metric (accuracy).
eval_results = model.evaluate(val_dataset)
loss, acc = eval_results
print(f"\n✅ Final Test Accuracy: {acc * 100:.2f}%")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

✅ Final Test Accuracy: 93.20%
