In [1]:
# %% [code]
# Basic setup
import os, re, html, json, random
from collections import Counter

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Set seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [3]:
# %% [code]
# Load IMDB Sentiment Dataset
imdb_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
df = pd.read_csv(imdb_path).sample(frac=1.0, random_state=SEED).reset_index(drop=True)
train_df, test_df = torch.utils.data.random_split(df, [40000, 10000])
print("IMDB Sentiment shape:", df.shape)

# Load Focused Reddit Sarcasm Dataset
reddit_path = "/kaggle/input/sarcasm/train-balanced-sarcasm.csv"
sarcasm_df = pd.read_csv(reddit_path)
sarcasm_df.rename(columns={"comment": "text", "label": "label"}, inplace=True)
sarcasm_df.dropna(subset=['text', 'label'], inplace=True)
sarcasm_df['text'] = sarcasm_df['text'].astype(str)
sarcasm_df['label'] = sarcasm_df['label'].astype(int)
sarcasm_df = sarcasm_df.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

# Split the sarcasm data
from sklearn.model_selection import train_test_split
train_sarc, test_sarc = train_test_split(
    sarcasm_df, test_size=0.2, random_state=SEED, stratify=sarcasm_df["label"]
)
print("Focused Sarcasm shape:", sarcasm_df.shape)

IMDB Sentiment shape: (50000, 2)
Focused Sarcasm shape: (1010771, 10)


In [4]:
# %% [code]
def clean_text(t: str) -> str:
    t = html.unescape(str(t))
    t = re.sub(r"<[^>]+>", " ", t)                  # remove HTML
    t = re.sub(r"[^a-zA-Z\\s']", " ", t)             # keep letters & apostrophes
    return re.sub(r"\\s+", " ", t).strip().lower()

_tok_pat = re.compile(r"[a-z']{2,}")
def tokenize(text):
    return _tok_pat.findall(clean_text(text))

# IMDB tokens (1 = positive)
train_tokens = [tokenize(t) for t in train_df.dataset['review'].iloc[train_df.indices]]
test_tokens  = [tokenize(t) for t in test_df.dataset['review'].iloc[test_df.indices]]
train_labels = (train_df.dataset['sentiment'].iloc[train_df.indices].values == "positive").astype(np.float32)
test_labels  = (test_df.dataset['sentiment'].iloc[test_df.indices].values == "positive").astype(np.float32)

# Sarcasm tokens (1 = sarcastic)
train_sarc_tokens = [tokenize(t) for t in train_sarc["text"].tolist()]
test_sarc_tokens  = [tokenize(t) for t in test_sarc["text"].tolist()]
train_sarc_labels = train_sarc["label"].astype(np.float32).values
test_sarc_labels  = test_sarc["label"].astype(np.float32).values

In [5]:
# %% [code]
PAD, UNK = "<pad>", "<unk>"
min_freq = 5

# Build vocab from both training sets
all_tokens = train_tokens + train_sarc_tokens
ctr = Counter(w for toks in all_tokens for w in toks)
vocab_words = [w for w,c in ctr.items() if c >= min_freq]

word_to_idx = {PAD:0, UNK:1}
for w in vocab_words:
    word_to_idx[w] = len(word_to_idx)

idx_to_word = {i:w for w,i in word_to_idx.items()}
pad_idx, unk_idx = word_to_idx[PAD], word_to_idx[UNK]
vocab_size = len(word_to_idx)
print("Combined Vocab size:", vocab_size)

# Numericalize all datasets
def numericalize(toks):
    return torch.tensor([word_to_idx.get(w, unk_idx) for w in toks], dtype=torch.long)

train_ids = [numericalize(t) for t in train_tokens]
test_ids  = [numericalize(t) for t in test_tokens]
train_sarc_ids = [numericalize(t) for t in train_sarc_tokens]
test_sarc_ids  = [numericalize(t) for t in test_sarc_tokens]

Combined Vocab size: 56617


In [6]:
# %% [code]
class TextDS(Dataset):
    def __init__(self, seqs, labels):
        self.seqs = seqs
        self.labels = torch.tensor(labels, dtype=torch.float32)
    def __len__(self): return len(self.labels)
    def __getitem__(self, i): return self.seqs[i], self.labels[i]

def collate(batch):
    xs, ys = zip(*batch)
    xs_pad = pad_sequence(xs, batch_first=True, padding_value=pad_idx)
    mask = xs_pad.ne(pad_idx)
    y = torch.stack(ys)
    return xs_pad, mask, y

BATCH = 64
# We set num_workers=0 to avoid harmless multiprocessing errors in notebooks
train_loader = DataLoader(TextDS(train_ids, train_labels), batch_size=BATCH, shuffle=True, collate_fn=collate, num_workers=0)
test_loader  = DataLoader(TextDS(test_ids,  test_labels),  batch_size=BATCH, shuffle=False, collate_fn=collate, num_workers=0)
train_sarc_loader = DataLoader(TextDS(train_sarc_ids, train_sarc_labels), batch_size=BATCH, shuffle=True, collate_fn=collate, num_workers=0)
test_sarc_loader  = DataLoader(TextDS(test_sarc_ids,  test_sarc_labels),  batch_size=BATCH, shuffle=False, collate_fn=collate, num_workers=0)

In [11]:
# %% [code]
import glob # <-- ADD THIS LINE

def find_glove():
    # Adjusted pattern to specifically find the 50d version you're using
    for pat in ["**/glove.6B.50d.txt", "**/glove.6B.*.txt", "**/glove.*.txt"]:
        for p in glob.glob(os.path.join("/kaggle/input", pat), recursive=True):
            return p
    return None

def load_glove_as_embedding(word_to_idx, pad_idx, embedding_dim=None):
    glove_path = find_glove()
    if glove_path is None:
        print("No GloVe found; using random embeddings.")
        # Default to a reasonable dimension if none is found or provided
        dim = embedding_dim or 100 
        return nn.Embedding(len(word_to_idx), dim, padding_idx=pad_idx)
    
    print(f"Using GloVe: {glove_path}")
    # Automatically determine embedding_dim from the filename
    if embedding_dim is None:
        m = re.search(r"\\.(\\d+)d\\.txt$", glove_path)
        embedding_dim = int(m.group(1)) if m else 300
    
    mat = np.random.randn(len(word_to_idx), embedding_dim).astype(np.float32) * 0.5 / embedding_dim
    mat[pad_idx] = 0.0
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) != embedding_dim + 1: continue
            word, vec = parts[0], parts[1:]
            if word in word_to_idx:
                mat[word_to_idx[word]] = np.asarray(vec, dtype=np.float32)
                
    return nn.Embedding.from_pretrained(torch.tensor(mat), freeze=False, padding_idx=pad_idx)

# Let the function determine the embedding dimension automatically from the file
embedding = load_glove_as_embedding(word_to_idx, pad_idx).to(device)
print("Embedding dim:", embedding.embedding_dim)

Using GloVe: /kaggle/input/glove6b50dtxt/glove.6B.50d.txt
Embedding dim: 300


In [12]:
# %% [code]
class AttentionPool(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.proj = nn.Linear(in_dim, in_dim)
        self.v = nn.Linear(in_dim, 1, bias=False)
    def forward(self, H, mask=None):
        S = torch.tanh(self.proj(H))
        logits = self.v(S).squeeze(-1)
        if mask is not None: logits = logits.masked_fill(~mask, -1e9)
        w = torch.softmax(logits, dim=-1)
        ctx = (H * w.unsqueeze(-1)).sum(1)
        return ctx, w

class BiLSTMAttnMulti(nn.Module):
    def __init__(self, embedding, hidden=256, p=0.4): # Tuned dropout
        super().__init__()
        self.embedding = embedding
        self.dropout = nn.Dropout(p)
        self.lstm = nn.LSTM(embedding.embedding_dim, hidden, batch_first=True, bidirectional=True)
        self.attn = AttentionPool(2*hidden)
        self.fc_sent = nn.Linear(2*hidden, 1)
        self.fc_sarc = nn.Linear(2*hidden, 1)

    def encode(self, x, mask):
        E = self.dropout(self.embedding(x))
        H, _ = self.lstm(E)
        H = self.dropout(H)
        ctx, w = self.attn(H, mask)
        ctx = self.dropout(ctx)
        return ctx, w

    def forward(self, x, mask, task="sentiment", return_attn=False):
        ctx, w = self.encode(x, mask)
        logits = self.fc_sent(ctx) if task == "sentiment" else self.fc_sarc(ctx)
        if return_attn:
            return logits.squeeze(-1), w
        return logits.squeeze(-1)

HIDDEN = 256
model = BiLSTMAttnMulti(embedding=embedding, hidden=HIDDEN).to(device)
print("Model OK. Encoder out dim:", 2*HIDDEN)

Model OK. Encoder out dim: 512


In [13]:
# %% [code]
# Tuned learning rate and increased epochs
EPOCHS = 8
criterion = nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

@torch.no_grad()
def acc_from_logits(logits, y):
    p = torch.sigmoid(logits)
    return ((p >= 0.5).float() == y).float().mean()

def run_epoch_multitask(epoch, sar_loader):
    model.train()
    sar_iter = iter(sar_loader)
    for xb, mb, yb in train_loader:
        # Sentiment step
        xb, mb, yb = xb.to(device), mb.to(device), yb.to(device)
        optimizer.zero_grad()
        s_logits = model(xb, mb, task="sentiment")
        s_loss = criterion(s_logits, yb)
        s_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Sarcasm step
        try: xs, ms, ys = next(sar_iter)
        except StopIteration: sar_iter = iter(sar_loader); xs, ms, ys = next(sar_iter)
        xs, ms, ys = xs.to(device), ms.to(device), ys.to(device)
        optimizer.zero_grad()
        z_logits = model(xs, ms, task="sarcasm")
        z_loss = criterion(z_logits, ys)
        z_loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        s_loss, s_acc, s_n = 0., 0., 0
        for xb, mb, yb in test_loader:
            xb, mb, yb = xb.to(device), mb.to(device), yb.to(device)
            logits = model(xb, mb, task="sentiment")
            s_loss += criterion(logits, yb).item() * yb.size(0)
            s_acc  += acc_from_logits(logits, yb).item() * yb.size(0)
            s_n += yb.size(0)
        
        t_loss, t_acc, t_n = 0., 0., 0
        for xb, mb, yb in test_sarc_loader:
            xb, mb, yb = xb.to(device), mb.to(device), yb.to(device)
            logits = model(xb, mb, task="sarcasm")
            t_loss += criterion(logits, yb).item() * yb.size(0)
            t_acc  += acc_from_logits(logits, yb).item() * yb.size(0)
            t_n += yb.size(0)
            
    print(f"Epoch {epoch:02d} | Sent val loss {s_loss/s_n:.4f} acc {s_acc/s_n:.4f} | "
          f"Sarc val loss {t_loss/t_n:.4f} acc {t_acc/t_n:.4f}")

# Run training
for e in range(1, EPOCHS+1):
    run_epoch_multitask(e, train_sarc_loader)

Epoch 01 | Sent val loss 0.2585 acc 0.8961 | Sarc val loss 0.6031 acc 0.6707
Epoch 02 | Sent val loss 0.2722 acc 0.8939 | Sarc val loss 0.5934 acc 0.6815
Epoch 03 | Sent val loss 0.2746 acc 0.8940 | Sarc val loss 0.5854 acc 0.6895
Epoch 04 | Sent val loss 0.3611 acc 0.8858 | Sarc val loss 0.5814 acc 0.6926
Epoch 05 | Sent val loss 0.3899 acc 0.8804 | Sarc val loss 0.5726 acc 0.7002
Epoch 06 | Sent val loss 0.4025 acc 0.8827 | Sarc val loss 0.5706 acc 0.7013
Epoch 07 | Sent val loss 0.5072 acc 0.8799 | Sarc val loss 0.5697 acc 0.7034
Epoch 08 | Sent val loss 0.5215 acc 0.8784 | Sarc val loss 0.5669 acc 0.7052


In [14]:
# %% [code]
def predict_texts_with_sarcasm(texts, mc_passes=30, sarcasm_threshold=0.5, uncertainty_tau=0.15):
    # --- Get deterministic predictions ---
    model.eval()
    with torch.no_grad():
        ids = [numericalize(tokenize(t)) for t in texts]
        x = pad_sequence(ids, batch_first=True, padding_value=pad_idx).to(device)
        mask = x.ne(pad_idx).to(device)
        s_logits = model(x, mask, task="sentiment")
        sarc_logits = model(x, mask, task="sarcasm")
        s_probs = torch.sigmoid(s_logits).cpu().numpy()
        sarc_probs = torch.sigmoid(sarc_logits).cpu().numpy()

    # --- Get MC dropout uncertainty ---
    model.train()
    with torch.no_grad():
        mc_probs = [torch.sigmoid(model(x, mask, task="sentiment")) for _ in range(mc_passes)]
        mc_probs = torch.stack(mc_probs)
        mc_mean = mc_probs.mean(dim=0).cpu().numpy()
        mc_std  = mc_probs.std(dim=0).cpu().numpy()
    model.eval()

    # --- Compile results ---
    results = []
    for txt, sp, sc, mm, ss in zip(texts, s_probs, sarc_probs, mc_mean, mc_std):
        if ss > uncertainty_tau:
            final_label = "UNCERTAIN"
        elif sp >= 0.5 and sc >= sarcasm_threshold:
            final_label = "NEGATIVE (sarcastic flip)"
        elif sp >= 0.5:
            final_label = "POSITIVE"
        else:
            final_label = "NEGATIVE"

        results.append({
            "text": txt,
            "sentiment_prob": float(sp),
            "sarcasm_prob": float(sc),
            "mc_mean": float(mm),
            "mc_std": float(ss),
            "final_label": final_label
        })
    return results

@torch.no_grad()
def show_attention(text, task="sentiment", topk=10):
    model.eval()
    toks = tokenize(text)
    ids = numericalize(toks).unsqueeze(0).to(device)
    mask = ids.ne(pad_idx)
    logits, w = model(ids, mask, task=task, return_attn=True)
    
    print(f"\nText: {text}")
    print(f"Task: {task} | Probability: {torch.sigmoid(logits).item():.4f}")
    
    pairs = sorted(zip(toks, w[0, :len(toks)].cpu().numpy()), key=lambda x: x[1], reverse=True)
    for tok, wt in pairs[:topk]:
        print(f"{wt:.3f} {tok}")

In [17]:
# %% [code]
samples = [
    "This was one of the best movies I have ever seen, truly a masterpiece.",
    "Well, that was two hours of my life I’ll never get back — but at least the popcorn was good.",
    "This movie was absolutely fantastic... if you enjoy watching paint dry.",
    "I just love waiting in line for hours, it’s the highlight of my day.",
    "The acting was phenomenal and the storyline was captivating from start to finish.",
    "The staff was rude and completely unhelpful when I tried to ask for assistance."
]

# Get final predictions
predictions = predict_texts_with_sarcasm(samples)
for r in predictions:
    print(
        f"\\nText: {r['text']}\\n"
        f"  Sentiment Prob: {r['sentiment_prob']:.4f} | Sarcasm Prob: {r['sarcasm_prob']:.4f}\\n"
        f"  MC Mean/Std: {r['mc_mean']:.4f} / {r['mc_std']:.4f} -> Final Label: {r['final_label']}"
    )

# Visualize attention for a tricky sentence
show_attention(samples[1], task="sentiment")
show_attention(samples[1], task="sarcasm")

\nText: This was one of the best movies I have ever seen, truly a masterpiece.\n  Sentiment Prob: 0.9986 | Sarcasm Prob: 0.3369\n  MC Mean/Std: 0.9984 / 0.0012 -> Final Label: POSITIVE
\nText: Well, that was two hours of my life I’ll never get back — but at least the popcorn was good.\n  Sentiment Prob: 0.5626 | Sarcasm Prob: 0.3505\n  MC Mean/Std: 0.5456 / 0.1549 -> Final Label: UNCERTAIN
\nText: This movie was absolutely fantastic... if you enjoy watching paint dry.\n  Sentiment Prob: 0.0004 | Sarcasm Prob: 0.5080\n  MC Mean/Std: 0.0008 / 0.0009 -> Final Label: NEGATIVE
\nText: I just love waiting in line for hours, it’s the highlight of my day.\n  Sentiment Prob: 0.3084 | Sarcasm Prob: 0.4922\n  MC Mean/Std: 0.3215 / 0.0623 -> Final Label: NEGATIVE
\nText: The acting was phenomenal and the storyline was captivating from start to finish.\n  Sentiment Prob: 0.8708 | Sarcasm Prob: 0.1957\n  MC Mean/Std: 0.8499 / 0.0883 -> Final Label: POSITIVE
\nText: The staff was rude and completely 