# AITA Moral Judgement - NLP Exam

Notebook Structure
0) Setup & Reproducibility
1) Load and preprocess training data
2) Train/validation split
3) TF-IDF + Logistic Regression (baseline)
4) DistilBERT 
5) DistilBERT fine-tuning (1 epoch)
6) DistilBERT fine-tuning (2 epochs)
7) DistilBERT (2 epochs) with class-weighted loss
8) External dataset evaluation
9) Token-level interpretability (Grad × Input)


## 0) Setup Chunck

In [1]:
%pip install pandas matplotlib seaborn scikit-learn transformers torch datasets evaluate 

%pip install 'accelerate>=0.26.0'


Collecting matplotlib
  Downloading matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl.metadata (52 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl.metadata (114 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.9-cp310-cp310-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pyparsing>=3 (from matplotlib)
  Downloading pyparsing-3.3.1-py3-none-any.whl.metadata (5.6 kB)
Downloading matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl (8.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
#Core stuff
import re
import random
from pathlib import Path
import numpy as np
import os
import pandas as pd

# plit + classical ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight

#Transformers 
import torch
import torch.nn as nn
from datasets import Dataset
import evaluate 
import accelerate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    DataCollatorWithPadding
)

Matplotlib is building the font cache; this may take a moment.


In [None]:
#Repo root
REPO_ROOT = Path.cwd()

DATA_DIR = REPO_ROOT / "data"   
OUT_REPORTS = REPO_ROOT / "classification_reports"
OUT_PLOTS   = REPO_ROOT / "plots_results"
OUT_TABLES  = REPO_ROOT / "tables_results"
OUT_TOKENS  = REPO_ROOT / "bert_token"

for d in [OUT_REPORTS, OUT_PLOTS, OUT_TABLES, OUT_TOKENS]:
    d.mkdir(parents=True, exist_ok=True)

print("Repo root:", REPO_ROOT)


Repo root: /Users/hannahmaihojgaard/Documents/GitHub/NLP_AITA_F25


In [23]:
# Output subfolders
OUT_REPORTS_BERT  = OUT_REPORTS / "bert"
OUT_REPORTS_TFIDF = OUT_REPORTS / "tfidf"

OUT_PLOTS_BERT  = OUT_PLOTS / "bert"
OUT_PLOTS_TFIDF = OUT_PLOTS / "tfidf"

for d in [OUT_REPORTS_BERT, OUT_REPORTS_TFIDF, OUT_PLOTS_BERT, OUT_PLOTS_TFIDF]:
    d.mkdir(parents=True, exist_ok=True)

In [5]:
#Seed
SEED = 200
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

## 1) Load in Data + Preprocessing of the primary dataset

In [None]:
#Load training/validation dataset
path = DATA_DIR / "data_train_val.csv"
raw_df = pd.read_csv(path)

#raw_df.head()

## 2) Preprocessing

In [None]:
#df info
print("Shape:", raw_df.shape)
print("Columns:", list(raw_df.columns))

Shape: (97628, 9)
Columns: ['id', 'timestamp', 'title', 'body', 'edited', 'verdict', 'score', 'num_comments', 'is_asshole']


#### Preprocessing Pipeline 
 1) Drop empty/NaN body text (moral judgment needs narrative context)
 2) Remove is_asshole if present (binary label conflicts with 3-class task)
 3) Combine title + body into one text field
 4) Map verdict strings to numeric labels: 0=YTA, 1=NTA, 2=ES


In [16]:
#Making a copy of raw_df
df = raw_df.copy()

#Drop empty bodies
df["body"] = df["body"].fillna("")
df = df[df["body"].str.strip().astype(bool)].copy()

#Drop binary variable 
df = df.drop(columns=["is_asshole"])

In [17]:
#Defining clean_text function
def clean_text(t: str) -> str:
    t = str(t)
    t = re.sub(r'^\s*\[?\s*aita\s*\]?\s*', '', t, flags=re.IGNORECASE)  # remove "[AITA]"
    t = re.sub(r"http\S+", "", t)                                       # remove URLs
    t = re.sub(r"\s+", " ", t).strip()                                  # normalize spaces
    return t

In [18]:
#Building text
df["title"] = df["title"].fillna("")                                   #Combining title + post
df["text"] = (df["title"] + " " + df["body"]).apply(clean_text)        #Applying clean_text 

In [19]:
#Map verdict -> labels
label_map = {
    "asshole": 0,           # YTA
    "not the asshole": 1,   # NTA
    "everyone sucks": 2     # ES
}

In [20]:
df["verdict_norm"] = df["verdict"].astype(str).str.strip().str.lower()
df["labels"] = df["verdict_norm"].map(label_map)

In [22]:
#Sanity check after cleaning
before = len(df)
df = df[df["labels"].notna()].copy()
after = len(df)
print(f"Dropped {before - after} rows with verdicts outside YTA/NTA/ES.")

df["labels"] = df["labels"].astype(int)

# Keep only what we need for modeling
df = df[["text", "labels"]].reset_index(drop=True)

print("Final shape:", df.shape)
print("Label distribution:\n", df["labels"].value_counts().sort_index())
df.head()

Dropped 0 rows with verdicts outside YTA/NTA/ES.
Final shape: (85539, 2)
Label distribution:
 labels
0    20921
1    59068
2     5550
Name: count, dtype: int64


Unnamed: 0,text,labels
0,I wrote an explanation in TIL and came off as ...,0
1,Threw my parent's donuts away My parents are d...,0
2,I told a goth girl she looked like a clown. I ...,1
3,: Argument I had with another redditor in r/HIMYM,2
4,Had a disagreement about Les Miserables with a...,0


### 3) Train/Validation split

In [24]:
train_df, val_df = train_test_split(
    df,
    test_size=0.20,          # 80% train, 20% validation
    random_state=SEED,
    stratify=df["labels"]    # preserve label distribution
)

print("Train size:", len(train_df), " Val size:", len(val_df))
print("\nTrain label distribution:\n", train_df["labels"].value_counts(normalize=True))
print("\nVal label distribution:\n", val_df["labels"].value_counts(normalize=True))

Train size: 68431  Val size: 17108

Train label distribution:
 labels
1    0.690535
0    0.244582
2    0.064883
Name: proportion, dtype: float64

Val label distribution:
 labels
1    0.690554
0    0.244564
2    0.064882
Name: proportion, dtype: float64


### 4) Baseline TF-IDF + Logistic Regression

In [None]:
#Helper functions for TF-IDF vectorization, model fitting, and evaluation

def vectorize_tfidf(X_train, X_val, max_features=10000):
    vectorizer = TfidfVectorizer(
        lowercase=True,
        max_features=max_features,
        ngram_range=(1, 2)
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    X_val_vec = vectorizer.transform(X_val)
    return X_train_vec, X_val_vec, vectorizer


def clf_fit(X_train_vec, y_train, random_state=42):
    clf = LogisticRegression(
        random_state=random_state,
        solver="lbfgs",
        max_iter=1000,
        n_jobs=-1
    )
    clf.fit(X_train_vec, y_train)
    return clf


def clf_evaluate(clf, X_val_vec, y_val, label_names=("YTA", "NTA", "ES"),
                 save_path=None, title="TF-IDF + Logistic Regression"):
    y_pred = clf.predict(X_val_vec)

    print(classification_report(y_val, y_pred, target_names=list(label_names)))

    cm = confusion_matrix(y_val, y_pred, labels=[0, 1, 2])
    disp = ConfusionMatrixDisplay(cm, display_labels=label_names)
    disp.plot(values_format="d")
    plt.title(title)

    # Save plot (same plot, just saved to the right folder)
    if save_path is not None:
        plt.savefig(save_path, dpi=300, bbox_inches="tight")

    plt.show()
    return y_pred

In [None]:
#Run TF-IDF baseline
X_train_vec, X_val_vec, tfidf_vectorizer = vectorize_tfidf(train_df["text"], val_df["text"])
clf = clf_fit(X_train_vec, train_df["labels"])

tfidf_cm_path = OUT_PLOTS_TFIDF / "TF-IDF+logisticregression_cm.png"
y_pred_baseline = clf_evaluate(
    clf,
    X_val_vec,
    val_df["labels"],
    save_path=tfidf_cm_path
)

#Save classification report
tfidf_report = classification_report(
    val_df["labels"],
    y_pred_baseline,
    target_names=["YTA", "NTA", "ES"],
    output_dict=True
)

tfidf_df = pd.DataFrame(tfidf_report).transpose()
tfidf_df.to_csv(OUT_REPORTS_TFIDF / "tfidf_classification_report.csv")

#View
tfidf_df

### 5) DistilBERT fine-tuning (1 epoch)

In [27]:
#Ensure labels are integers
train_df["labels"] = train_df["labels"].astype(int)
val_df["labels"]   = val_df["labels"].astype(int)

# Convert to Hugging Face Datasets
train_hf = Dataset.from_pandas(train_df[["text", "labels"]].reset_index(drop=True))
val_hf   = Dataset.from_pandas(val_df[["text", "labels"]].reset_index(drop=True))

In [28]:
#Load in model and tokenizer
model_id = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
#Tokenization function applied to all texts (truncation enabled for long posts)
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = train_hf.map(preprocess_function, batched=True)
tokenized_val   = val_hf.map(preprocess_function, batched=True)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

id2label = {0: "YTA", 1: "NTA", 2: "ES"}
label2id = {"YTA": 0, "NTA": 1, "ES": 2}

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"f1_macro": f1_metric.compute(predictions=preds, references=labels, average="macro")["f1"]}


In [None]:
#Defining training arguments and trainer
training_args = TrainingArguments(
    output_dir="./training/distilbert_aita",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

#Training model
trainer.train()
trainer.evaluate()

In [None]:
#Confusion matrix + classification report 
preds_output = trainer.predict(tokenized_val)
y_pred_bert = np.argmax(preds_output.predictions, axis=-1)
y_true = val_df["labels"].to_numpy()

print(classification_report(y_true, y_pred_bert, target_names=["YTA", "NTA", "ES"]))

cm_bert = confusion_matrix(y_true, y_pred_bert, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(cm_bert, display_labels=["YTA", "NTA", "ES"])
disp.plot(values_format="d")
plt.title("DistilBERT Fine-tune (1 epoch)")

plt.savefig(OUT_PLOTS_BERT / "bert_1epoch_cm.png", dpi=300, bbox_inches="tight")
plt.show()

report_dict = classification_report(
    y_true,
    y_pred_bert,
    target_names=["YTA", "NTA", "ES"],
    output_dict=True
)

report_df = pd.DataFrame(report_dict).transpose()
report_df.to_csv(OUT_REPORTS_BERT / "bert_epoch1_classification_report.csv")  
report_df


### 6) DistilBERT fine-tuning (2 epochs)

In [None]:
#Training arguments + trainer
training_args = TrainingArguments(
    output_dir="./training/distilbert_aita",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

#training model + evaulate
trainer.train()
trainer.evaluate()

In [None]:
#Getting predictions and making report + cm
preds_output = trainer.predict(tokenized_val)
y_pred_bert = np.argmax(preds_output.predictions, axis=-1)
y_true = val_df["labels"].to_numpy()

report_dict = classification_report(
    y_true,
    y_pred_bert,
    target_names=["YTA", "NTA", "ES"],
    output_dict=True
)

report_df = pd.DataFrame(report_dict).transpose()
report_df.to_csv(OUT_REPORTS_BERT / "bert_epoch2_classification_report.csv") 
report_df

cm_bert = confusion_matrix(y_true, y_pred_bert, labels=[0, 1, 2])
disp = ConfusionMatrixDisplay(cm_bert, display_labels=["YTA", "NTA", "ES"])
disp.plot(values_format="d")
plt.title("DistilBERT Fine-tuned (2 epochs)")

plt.savefig(OUT_PLOTS_BERT / "bert_2epochs_cm.png", dpi=300, bbox_inches="tight")
plt.show()

### 7) DistilBERT (2 epochs) with class-weighted loss

In [None]:
#Compute class weights based on training label distribution
classes = np.array([0, 1, 2])
weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=train_df["labels"].to_numpy()
)
class_weights = torch.tensor(weights, dtype=torch.float)
print("Class weights:", class_weights)

In [None]:
#Custom Trainer overriding loss computation to apply class-weighted cross-entropy
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
training_args = TrainingArguments(
    output_dir="./training/distilbert_aita",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="no",
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

#Training model + evaulate
trainer.train()
trainer.evaluate()

In [None]:
#Getting preds and making report 
preds_output_w2 = trainer.predict(tokenized_val)
y_pred_bert = np.argmax(preds_output_w2.predictions, axis=-1)
y_true = val_df["labels"].to_numpy()

report_dict_w2 = classification_report(
    y_true,
    y_pred_bert,
    target_names=["YTA", "NTA", "ES"],
    output_dict=True
)

report_df_w2 = pd.DataFrame(report_dict_w2).transpose()
report_df_w2.to_csv(OUT_REPORTS_BERT / "bert_w_epoch2_classification_report.csv")

report_df_w2

### 8) External Dataset Evaluation
Tests generalization to a different AITA dataset with a different annotation scheme

In [30]:
#Load in data
path_ext = DATA_DIR / "data_test.csv"
test_df = pd.read_csv(path_ext)
print("External dataset shape:", test_df.shape)
print("External dataset columns:", list(test_df.columns))
print("\nRaw external verdict distribution:\n", test_df["verdict"].astype(str).value_counts().head(10))

External dataset shape: (11670, 5)
External dataset columns: ['pid', 'title', 'post', 'full post', 'verdict']

Raw external verdict distribution:
 verdict
user_ok          6000
user_is_fault    5670
Name: count, dtype: int64


In [None]:
#Combine title + post into text field + applying clean_text (same style as primary dataset)
test_df["text"] = test_df["title"].fillna("") + " " + test_df["post"].fillna("") 
test_df["text"] = test_df["text"].astype(str).apply(clean_text)

#Normalize verdict labels
test_df["verdict_norm"] = test_df["verdict"].astype(str).str.strip().str.lower()

#Map external labels: binary scheme
label_map_ext = {
    "user_is_fault": 0,  # YTA
    "user_ok": 1         # NTA
}

test_df["labels"] = test_df["verdict_norm"].map(label_map_ext)

print("\nMapped label distribution (including NaN):\n", test_df["labels"].value_counts(dropna=False))

#Drop rows with unmapped verdicts
before = len(test_df)
test_df = test_df[test_df["labels"].notna()].reset_index(drop=True)
after = len(test_df)

print(f"\nDropped {before - after} rows due to unmapped verdicts.")
print("Final external dataset shape:", test_df.shape)
print("Final external label distribution:\n", test_df["labels"].value_counts().sort_index())



Mapped label distribution (including NaN):
 labels
1    6000
0    5670
Name: count, dtype: int64

Dropped 0 rows due to unmapped verdicts.
Final external dataset shape: (11670, 8)
Final external label distribution:
 labels
0    5670
1    6000
Name: count, dtype: int64


#### 8.1) DistilBERT evaluation on external dataset

In [None]:
#Convert to Hugging Face Dataset
test_hf = Dataset.from_pandas(test_df[["text", "labels"]].reset_index(drop=True))

def tokenize_ext(batch):
    return tokenizer(batch["text"], truncation=True)

tokenized_test = test_hf.map(tokenize_ext, batched=True)

#Predict using the existing trainer/model (no training)
preds = trainer.predict(tokenized_test)
y_pred = np.argmax(preds.predictions, axis=1)
y_true = test_df["labels"].to_numpy()

print(
    classification_report(
        y_true,
        y_pred,
        labels=[0, 1],
        target_names=["YTA", "NTA"]
    )
)

#Save report
bert_ext_report = classification_report(
    y_true,
    y_pred,
    labels=[0, 1],
    target_names=["YTA", "NTA"],
    output_dict=True
)

bert_ext_df = pd.DataFrame(bert_ext_report).transpose()
bert_ext_df.to_csv(
    OUT_REPORTS / "bert" / "bert_external_test_classification_report.csv"
)

print("\nSaved: classification_reports/bert/bert_external_test_classification_report.csv")
bert_ext_df.head()


#### 8.2) TF-IDF baseline evaluation on external dataset (binary)

In [None]:
#Transform external texts using the *already fitted* TF-IDF vectorizer
X_test_vec = tfidf_vectorizer.transform(test_df["text"])
y_test = test_df["labels"].to_numpy()

#Predict using the *already fitted* logistic regression classifier
y_pred_tfidf_ext = clf.predict(X_test_vec)

print(
    classification_report(
        y_test,
        y_pred_tfidf_ext,
        labels=[0, 1],
        target_names=["YTA", "NTA"]
    )
)

#Save report to correct folder
tfidf_ext_report = classification_report(
    y_test,
    y_pred_tfidf_ext,
    labels=[0, 1],
    target_names=["YTA", "NTA"],
    output_dict=True
)

tfidf_ext_df = pd.DataFrame(tfidf_ext_report).transpose()
tfidf_ext_df.to_csv(
    OUT_REPORTS / "tfidf" / "tfidf_external_test_classification_report.csv"
)

print("\nSaved: classification_reports/tfidf/tfidf_external_test_classification_report.csv")
tfidf_ext_df.head()


### 9) Token-level Interpretability (Grad x Input)
This section analyzes which tokens contribute most strongly to YTA vs. NTA predictions using gradient × input attribution.

In [None]:
#Gradient x input attribution
def token_importance_gradxinput(
    text,
    model,
    tokenizer,
    target_class=None,
    max_length=256
):
    """
    Compute token-level importance using gradient × input.

    If target_class is None:
        explains the model's predicted class
    If target_class is provided (0=YTA, 1=NTA):
        forces explanation toward that class
    """

    model.eval()
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        return_attention_mask=True
    )

    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)

    embedding_layer = model.get_input_embeddings()
    embeds = embedding_layer(input_ids)
    embeds.requires_grad_(True)

    outputs = model(inputs_embeds=embeds, attention_mask=attention_mask)
    logits = outputs.logits

    pred_class = torch.argmax(logits, dim=-1).item()
    if target_class is None:
        target_class = pred_class

    model.zero_grad()
    logits[0, target_class].backward()

    grads = embeds.grad
    token_scores = (grads * embeds).sum(dim=-1).squeeze(0)

    tokens = tokenizer.convert_ids_to_tokens(input_ids.squeeze(0))
    scores = token_scores.detach().cpu().numpy()

    rows = []
    for tok, score in zip(tokens, scores):
        if tok in [tokenizer.cls_token, tokenizer.sep_token, tokenizer.pad_token]:
            continue
        rows.append((tok, abs(score)))

    df = pd.DataFrame(rows, columns=["token", "importance"])
    if df.empty:
        return df, pred_class, target_class

    df["importance_norm"] = df["importance"] / df["importance"].max()

    # Merge wordpieces (##)
    merged_tokens = []
    merged_scores = []

    for tok, score in zip(df["token"], df["importance_norm"]):
        if tok.startswith("##") and merged_tokens:
            merged_tokens[-1] += tok[2:]
            merged_scores[-1] = max(merged_scores[-1], score)
        else:
            merged_tokens.append(tok)
            merged_scores.append(score)

    df_merged = pd.DataFrame({
        "token": merged_tokens,
        "importance_norm": merged_scores
    }).sort_values("importance_norm", ascending=False)

    return df_merged.reset_index(drop=True), pred_class, target_class


#### 9.1) Single-example attribution

In [None]:
i = 0  # index of example to inspect
text = test_df.loc[i, "text"]

# Explain predicted class
df_tok, pred_class, _ = token_importance_gradxinput(
    text=text,
    model=model,
    tokenizer=tokenizer
)

print("Predicted class:", pred_class)
print(df_tok.head(30))

# Force explanation toward YTA
df_tok_yta, _, _ = token_importance_gradxinput(
    text=text,
    model=model,
    tokenizer=tokenizer,
    target_class=0
)

print(df_tok_yta.head(20))

In [None]:
#Contrastice attribution: YTA vs. NTA (single post)
df_yta, _, _ = token_importance_gradxinput(text, model, tokenizer, target_class=0)
df_nta, _, _ = token_importance_gradxinput(text, model, tokenizer, target_class=1)

df_compare = (
    df_yta.merge(df_nta, on="token", how="outer", suffixes=("_yta", "_nta"))
    .fillna(0)
)

df_compare["diff"] = (
    df_compare["importance_norm_yta"] - df_compare["importance_norm_nta"]
)

df_compare = df_compare.sort_values(
    "diff", key=lambda x: x.abs(), ascending=False
)

# Save tables
df_compare.to_csv(
    OUT_TOKENS / "bert_token_importance_yta_vs_nta.csv",
    index=False
)

df_compare.head(50).to_csv(
    OUT_TOKENS / "bert_token_importance_yta_vs_nta_top50.csv",
    index=False
)

print(df_compare.head(20))


#### 9.2) Aggregated contrastive attribution across multiple posts

In [None]:
def aggregate_contrastive_tokens(
    df_source,
    model,
    tokenizer,
    n_samples=50,
    max_length=256
):
    rng = np.random.default_rng(SEED)
    indices = rng.choice(
        df_source.index.to_numpy(),
        size=min(n_samples, len(df_source)),
        replace=False
    )

    rows = []

    for idx in indices:
        text = df_source.loc[idx, "text"]

        df_yta, _, _ = token_importance_gradxinput(
            text, model, tokenizer, target_class=0, max_length=max_length
        )
        df_nta, _, _ = token_importance_gradxinput(
            text, model, tokenizer, target_class=1, max_length=max_length
        )

        df_merge = (
            df_yta.merge(df_nta, on="token", how="outer", suffixes=("_yta", "_nta"))
            .fillna(0)
        )

        df_merge["diff"] = (
            df_merge["importance_norm_yta"] - df_merge["importance_norm_nta"]
        )
        rows.append(df_merge)

    df_all = pd.concat(rows)
    df_agg = (
        df_all.groupby("token")[["importance_norm_yta", "importance_norm_nta", "diff"]]
        .mean()
        .reset_index()
    )

    df_agg["abs_diff"] = df_agg["diff"].abs()
    return df_agg.sort_values("abs_diff", ascending=False)


In [None]:
df_compare_agg = aggregate_contrastive_tokens(
    test_df,
    model,
    tokenizer
)

df_compare_agg.to_csv(
    OUT_TOKENS / "bert_token_contrastive_aggregated.csv",
    index=False
)

# Plot aggregated contrastive tokens
top_n = 25
plot_df = df_compare_agg.head(top_n).sort_values("diff")

plt.figure(figsize=(10, 8))
plt.barh(plot_df["token"], plot_df["diff"])
plt.axvline(0, linewidth=1)
plt.title("Aggregated token attribution: YTA vs NTA")
plt.xlabel("Mean importance difference (YTA − NTA)")
plt.ylabel("Token")
plt.tight_layout()

plt.savefig(
    OUT_TOKENS / "bert_token_contrastive_top_tokens.png",
    dpi=300,
    bbox_inches="tight"
)
plt.show()