# Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import Dataset

!pip install -q transformers==4.28.0
!pip install -q evaluate
from transformers import BertTokenizer, AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import evaluate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m110.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m72.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

# Read data into a dataframe
Before you continue, download Sarcasm_Headlines_Dataset_v2.json from https://www.kaggle.com/datasets/rmisra/news-headlines-dataset-for-sarcasm-detection and upload it to the Google Colaboratory runtime.

In [2]:
df = pd.read_json("/content/Sarcasm_Headlines_Dataset_v2.json", lines=True) # Read line separated json
df.drop(columns=['article_link'], inplace=True) # Drop irrelevant columns
df.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


# Create training, evaluation, and testing splits

In [3]:
fractions = np.array([0.6, 0.2, 0.2]) # 60% training, 20% evaluation, 20% testing
df = df.sample(frac=1) # Shuffle the dataset
train, val, test = np.array_split(df, (fractions[:-1].cumsum() * len(df)).astype(int))

# Tokenize the data

In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased") # Download the tokenizer

train_ds = Dataset.from_pandas(train.reset_index(drop=True), split="train") # Reset index after split
train_ds = train_ds.rename_column('is_sarcastic', 'labels') # Rename the target variable as specified by the Trainer class
train_ds = train_ds.map(lambda x: tokenizer(x['headline'], truncation=True, padding=True), batched=True) # Tokenize in batches

val_ds = Dataset.from_pandas(val.reset_index(drop=True), split="val")
val_ds = val_ds.rename_column('is_sarcastic', 'labels')
val_ds = val_ds.map(lambda x: tokenizer(x['headline'], truncation=True, padding=True), batched=True)

test_ds = Dataset.from_pandas(test.reset_index(drop=True), split="test")
test_ds = test_ds.rename_column('is_sarcastic', 'labels')
test_ds = test_ds.map(lambda x: tokenizer(x['headline'], truncation=True, padding=True), batched=True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/17171 [00:00<?, ? examples/s]

Map:   0%|          | 0/5724 [00:00<?, ? examples/s]

Map:   0%|          | 0/5724 [00:00<?, ? examples/s]

# Load the model

In [5]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2) # Download the model

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

# Set the hyperparameters

In [6]:
args = TrainingArguments(
    output_dir = 'sarcasm_detection',   # output directory for runs
    overwrite_output_dir = True,
    evaluation_strategy = 'steps',      # evaluate after every eval_steps
    per_device_train_batch_size = 8,    # batch size for training
    per_device_eval_batch_size = 8,     # batch size for evaluation
    learning_rate = 5e-5,               # learning rate for AdamW
    num_train_epochs = 3,               # training epochs
    eval_steps = 500,                   # steps between evaluation
    load_best_model_at_end = True,
    metric_for_best_model = 'accuracy'  # preserve the most accurate model
)

# Define evaluation metrics

In [7]:
accuracy_metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

# Train the model

In [8]:
trainer = Trainer(
    model = model,                  # model to be trained
    args = args,                    # training args
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,            # for padding batched data
    compute_metrics=compute_metrics # for custom metrics
)

In [9]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,0.4615,0.37072,0.865304
1000,0.377,0.295893,0.870545
1500,0.3324,0.319449,0.892558
2000,0.355,0.375431,0.895003
2500,0.271,0.311805,0.908281
3000,0.2214,0.450224,0.890287
3500,0.2281,0.329012,0.90898
4000,0.227,0.298892,0.915269
4500,0.1723,0.383261,0.91457
5000,0.1093,0.427687,0.91754


TrainOutput(global_step=6441, training_loss=0.2373632665858323, metrics={'train_runtime': 1036.3614, 'train_samples_per_second': 49.706, 'train_steps_per_second': 6.215, 'total_flos': 1684368629206980.0, 'train_loss': 0.2373632665858323, 'epoch': 3.0})

# Evaluate on test data

In [10]:
trainer.predict(test_dataset=test_ds)

PredictionOutput(predictions=array([[-4.5848746,  4.1110706],
       [-3.7449162,  2.9885755],
       [ 3.6147652, -2.738117 ],
       ...,
       [-3.7190018,  2.9672015],
       [-2.7274573,  1.7568054],
       [ 4.8386326, -3.9789166]], dtype=float32), label_ids=array([1, 1, 0, ..., 1, 0, 0]), metrics={'test_loss': 0.4046051502227783, 'test_accuracy': 0.9198113207547169, 'test_runtime': 24.4849, 'test_samples_per_second': 233.777, 'test_steps_per_second': 29.243})