In [1]:
import torch


print("Torch version:", torch.__version__)



Torch version: 2.2.1+cu121


In [2]:
import torch

if torch.cuda.is_available():
    print("GPU is available.")
    print("Using device:", torch.cuda.get_device_name(0))
else:
    print("GPU is NOT available. Using CPU.")


GPU is NOT available. Using CPU.


  return torch._C._cuda_getDeviceCount() > 0


In [3]:
from torchtext import data
from torchtext import datasets

In [4]:
import torch
from torch import nn, optim

In [5]:
import json

with open("Sarcasm_Headlines_Dataset_v2.json", "r") as f1, open("Sarcasm_Headlines_Dataset.json", "r") as f2:
    data1 = [json.loads(line) for line in f1]
    data2 = [json.loads(line) for line in f2]

merged_data = data1 + data2

with open("merged_dataset.json", "w") as f:
    for item in merged_data:
        f.write(json.dumps(item) + "\n")

print(" Merged dataset size:", len(merged_data))


 Merged dataset size: 55328


In [8]:
import spacy


In [9]:
# Load JSON Lines file
with open("merged_dataset.json", "r") as f:
    raw_data = [json.loads(line) for line in f]

# Use spaCy tokenizer
spacy_en = spacy.load("en_core_web_sm")

def tokenize(text):
    return [tok.text.lower() for tok in spacy_en.tokenizer(text)]


In [15]:
from torch.utils.data import Dataset, DataLoader  
from torch.nn.utils.rnn import pad_sequence
from torchtext.vocab import build_vocab_from_iterator


In [16]:
class SarcasmDataset(Dataset):
    def __init__(self, data, vocab=None):
        self.texts = [tokenize(item["headline"]) for item in data]
        self.labels = [item["is_sarcastic"] for item in data]

        if vocab is None:
            self.vocab = build_vocab_from_iterator(self.texts, specials=["<pad>", "<unk>"])
            self.vocab.set_default_index(self.vocab["<unk>"])
        else:
            self.vocab = vocab

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        tokens = self.vocab(self.texts[idx])
        return torch.tensor(tokens), torch.tensor(self.labels[idx])


In [17]:
def collate_fn(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(x) for x in texts])
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    labels = torch.tensor(labels)
    return padded_texts, lengths, labels


In [18]:
dataset = SarcasmDataset(raw_data)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)


In [19]:
print("Full dataset size:", len(raw_data))

Full dataset size: 55328


In [20]:
from sklearn.model_selection import train_test_split

# Assuming `merged_data` is already loaded as a list of dicts
train_data, test_data = train_test_split(merged_data, test_size=0.2, random_state=42)

print("Train size:", len(train_data))
print("Test size :", len(test_data))


Train size: 44262
Test size : 11066


In [21]:
train_dataset = SarcasmDataset(train_data)
test_dataset = SarcasmDataset(test_data, vocab=train_dataset.vocab)


In [22]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)


In [23]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [24]:
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, text, lengths):
        embedded = self.embedding(text)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, hidden = self.gru(packed)
        return self.fc(self.dropout(hidden[-1]))


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = GRUClassifier(
    vocab_size=len(train_dataset.vocab),
    embedding_dim=100,
    hidden_dim=64,
    output_dim=2,
    padding_idx=train_dataset.vocab['<pad>']
).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)


In [26]:
def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    for texts, lengths, labels in dataloader:
        texts, lengths, labels = texts.to(device), lengths.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(texts, lengths)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        all_preds.extend(torch.argmax(outputs, dim=1).cpu().tolist())
        all_labels.extend(labels.cpu().tolist())

    acc = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    return total_loss / len(dataloader), acc, precision, recall, f1


In [27]:
def evaluate(model, dataloader, criterion):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for texts, lengths, labels in dataloader:
            texts, lengths, labels = texts.to(device), lengths.to(device), labels.to(device)

            outputs = model(texts, lengths)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            all_preds.extend(torch.argmax(outputs, dim=1).cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    acc = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    return total_loss / len(dataloader), acc, precision, recall, f1


In [30]:
train_metrics = []
test_metrics = []
Epochs=50
for epoch in range(Epochs):
    train_loss, train_acc, train_prec, train_rec, train_f1 = train_epoch(model, train_loader, optimizer, criterion)
    test_loss, test_acc, test_prec, test_rec, test_f1 = evaluate(model, test_loader, criterion)

    train_metrics.append((train_loss, train_acc, train_prec, train_rec, train_f1))
    test_metrics.append((test_loss, test_acc, test_prec, test_rec, test_f1))

    print(f"\nEpoch {epoch+1}/{Epochs}")
    print(f"Train  | Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | Prec: {train_prec:.4f} | Rec: {train_rec:.4f} | F1: {train_f1:.4f}")
    print(f"Test   | Loss: {test_loss:.4f}  | Acc: {test_acc:.4f}  | Prec: {test_prec:.4f}  | Rec: {test_rec:.4f}  | F1: {test_f1:.4f}")

#  Average metrics
import numpy as np
train_metrics = np.array(train_metrics)
test_metrics = np.array(test_metrics)

avg_train = train_metrics.mean(axis=0)
avg_test = test_metrics.mean(axis=0)

print("\nAverage Train Metrics:")
print(f"Loss: {avg_train[0]:.4f} | Acc: {avg_train[1]:.4f} | Prec: {avg_train[2]:.4f} | Rec: {avg_train[3]:.4f} | F1: {avg_train[4]:.4f}")

print("\nAverage Test Metrics:")
print(f"Loss: {avg_test[0]:.4f} | Acc: {avg_test[1]:.4f} | Prec: {avg_test[2]:.4f} | Rec: {avg_test[3]:.4f} | F1: {avg_test[4]:.4f}")



Epoch 1/50
Train  | Loss: 0.0406 | Acc: 0.9873 | Prec: 0.9869 | Rec: 0.9852 | F1: 0.9861
Test   | Loss: 0.2146  | Acc: 0.9500  | Prec: 0.9575  | Rec: 0.9329  | F1: 0.9450

Epoch 2/50
Train  | Loss: 0.0223 | Acc: 0.9930 | Prec: 0.9926 | Rec: 0.9920 | F1: 0.9923
Test   | Loss: 0.2353  | Acc: 0.9546  | Prec: 0.9605  | Rec: 0.9401  | F1: 0.9502

Epoch 3/50
Train  | Loss: 0.0140 | Acc: 0.9954 | Prec: 0.9952 | Rec: 0.9948 | F1: 0.9950
Test   | Loss: 0.2590  | Acc: 0.9586  | Prec: 0.9679  | Rec: 0.9413  | F1: 0.9544

Epoch 4/50
Train  | Loss: 0.0114 | Acc: 0.9965 | Prec: 0.9961 | Rec: 0.9962 | F1: 0.9962
Test   | Loss: 0.2795  | Acc: 0.9549  | Prec: 0.9690  | Rec: 0.9319  | F1: 0.9501

Epoch 5/50
Train  | Loss: 0.0067 | Acc: 0.9983 | Prec: 0.9981 | Rec: 0.9981 | F1: 0.9981
Test   | Loss: 0.2835  | Acc: 0.9580  | Prec: 0.9740  | Rec: 0.9337  | F1: 0.9534

Epoch 6/50
Train  | Loss: 0.0084 | Acc: 0.9976 | Prec: 0.9972 | Rec: 0.9975 | F1: 0.9974
Test   | Loss: 0.2746  | Acc: 0.9588  | Prec: 0.96