# 03 Transformer Models (Planned)

In [1]:
import os, json, re, random
import numpy as np
import pandas as pd

import transformers
import datasets
from datasets import Dataset

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

SEED = 42

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TRAIN_PATH = "../data/raw/train.tsv"
VALID_PATH = "../data/raw/valid.tsv"
TEST_PATH  = "../data/raw/test.tsv"

for p in [TRAIN_PATH, VALID_PATH, TEST_PATH]:
    print(p, "exists:", os.path.exists(p))

# Raw peek (first 2 lines)
for name, p in [("train", TRAIN_PATH), ("valid", VALID_PATH), ("test", TEST_PATH)]:
    print("\n---", name, "raw head ---")
    with open(p, "r", encoding="utf-8", errors="replace") as f:
        for _ in range(2):
            print(f.readline().rstrip("\n"))


../data/raw/train.tsv exists: True
../data/raw/valid.tsv exists: True
../data/raw/test.tsv exists: True

--- train raw head ---
2635.json	false	Says the Annies List political group supports third-trimester abortions on demand.	abortion	dwayne-bohac	State representative	Texas	republican	0	1	0	0	0	a mailer
10540.json	half-true	When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.	energy,history,job-accomplishments	scott-surovell	State delegate	Virginia	democrat	0	0	1	1	0	a floor speech.

--- valid raw head ---
12134.json	barely-true	We have less Americans working now than in the 70s.	economy,jobs	vicky-hartzler	U.S. Representative	Missouri	republican	1	0	1	0	0	an interview with ABC17 News
238.json	pants-fire	When Obama was sworn into office, he DID NOT use the Holy Bible, but instead the Kuran (Their equivalency to our Bible, but very different beliefs).	obama-birth-certificate,religion	chain-email			

In [3]:
def load_liar_basic(path: str) -> pd.DataFrame:
    df = pd.read_csv(
        path,
        sep="\t",
        header=None,          # LIAR has no header
        dtype=str,
        keep_default_na=False # keep empty strings as empty strings
    )

    # LIAR: col 1 = label, col 2 = statement
    df = df[[1, 2]].rename(columns={1: "label", 2: "statement"})
    return df

train_df = load_liar_basic(TRAIN_PATH)
valid_df = load_liar_basic(VALID_PATH)
test_df  = load_liar_basic(TEST_PATH)

print("Loaded shapes:", train_df.shape, valid_df.shape, test_df.shape)
train_df.head()


Loaded shapes: (10240, 2) (1284, 2) (1267, 2)


Unnamed: 0,label,statement
0,false,Says the Annies List political group supports ...
1,half-true,When did the decline of coal start? It started...
2,mostly-true,"Hillary Clinton agrees with John McCain ""by vo..."
3,false,Health care reform legislation is likely to ma...
4,half-true,The economic turnaround started at the end of ...


In [4]:
LABELS = ["pants-fire", "false", "barely-true", "half-true", "mostly-true", "true"]
LABEL2ID = {l: i for i, l in enumerate(LABELS)}
ID2LABEL = {i: l for l, i in LABEL2ID.items()}

def normalize_label(x: str) -> str:
    x = (x or "").strip().lower()
    x = x.replace("_", "-").replace(" ", "-")
    x = re.sub(r"-+", "-", x)
    return x

def prepare_df(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    df["statement"] = df["statement"].astype(str).fillna("").str.strip()
    df["label"] = df["label"].astype(str).map(normalize_label)

    print("Label counts (before filter):", df["label"].value_counts().head(10).to_dict())

    df = df[df["label"].isin(LABEL2ID)].copy()
    df["labels"] = df["label"].map(LABEL2ID).astype(int)

    # Keep only what the model needs
    df = df[["statement", "labels"]].reset_index(drop=True)

    # Safety check
    assert len(df) > 0, "DataFrame became empty after filtering. Check label mapping / TSV parsing."
    return df

train_df = prepare_df(train_df)
valid_df = prepare_df(valid_df)
test_df  = prepare_df(test_df)

print("Final shapes:", train_df.shape, valid_df.shape, test_df.shape)
train_df.head()


Label counts (before filter): {'half-true': 2114, 'false': 1995, 'mostly-true': 1962, 'true': 1676, 'barely-true': 1654, 'pants-fire': 839}
Label counts (before filter): {'false': 263, 'mostly-true': 251, 'half-true': 248, 'barely-true': 237, 'true': 169, 'pants-fire': 116}
Label counts (before filter): {'half-true': 265, 'false': 249, 'mostly-true': 241, 'barely-true': 212, 'true': 208, 'pants-fire': 92}
Final shapes: (10240, 2) (1284, 2) (1267, 2)


Unnamed: 0,statement,labels
0,Says the Annies List political group supports ...,1
1,When did the decline of coal start? It started...,3
2,"Hillary Clinton agrees with John McCain ""by vo...",4
3,Health care reform legislation is likely to ma...,1
4,The economic turnaround started at the end of ...,3


In [5]:
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
valid_ds = Dataset.from_pandas(valid_df, preserve_index=False)
test_ds  = Dataset.from_pandas(test_df,  preserve_index=False)

print("Dataset lengths:", len(train_ds), len(valid_ds), len(test_ds))
train_ds[0]


Dataset lengths: 10240 1284 1267


{'statement': 'Says the Annies List political group supports third-trimester abortions on demand.',
 'labels': 1}

In [6]:
MODEL_NAME = "distilbert-base-uncased"  # fast baseline

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABEL2ID),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 263.09it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [7]:
MAX_LEN = 64

def tokenize_batch(batch):
    return tokenizer(
        batch["statement"],
        truncation=True,
        max_length=MAX_LEN
    )

train_tok = train_ds.map(tokenize_batch, batched=True, remove_columns=["statement"])
valid_tok = valid_ds.map(tokenize_batch, batched=True, remove_columns=["statement"])
test_tok  = test_ds.map(tokenize_batch,  batched=True, remove_columns=["statement"])

# Make sure Trainer sees torch tensors for the numeric fields only
train_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
valid_tok.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_tok.set_format(type="torch",  columns=["input_ids", "attention_mask", "labels"])

print("Tokenized columns:", train_tok.column_names)
print("Example:", train_tok[0])


Map: 100%|██████████| 10240/10240 [00:00<00:00, 11514.19 examples/s]
Map: 100%|██████████| 1284/1284 [00:00<00:00, 13171.63 examples/s]
Map: 100%|██████████| 1267/1267 [00:00<00:00, 11416.59 examples/s]

Tokenized columns: ['labels', 'input_ids', 'token_type_ids', 'attention_mask']
Example: {'labels': tensor(1), 'input_ids': tensor([  101,  2758,  1996,  8194,  2015,  2862,  2576,  2177,  6753,  2353,
         1011, 12241, 20367, 11324,  2015,  2006,  5157,  1012,   102]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}





In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    pr, rc, f1, _ = precision_recall_fscore_support(
        labels, preds, average="macro", zero_division=0
    )
    return {
        "accuracy": acc,
        "precision_macro": pr,
        "recall_macro": rc,
        "f1_macro": f1
    }


In [9]:
import os

RUN_DIR = "../results/transformer_runs"
RESULTS_DIR = "../results"
os.makedirs(RUN_DIR, exist_ok=True)
os.makedirs(RESULTS_DIR, exist_ok=True)

common_args = dict(
    output_dir=RUN_DIR,

    # ↓ safer memory settings (avoid kernel crash)
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,

    num_train_epochs=1,
    max_steps=300,  # ✅ Option A: stop early (quick run)
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,

    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    report_to="none",
    seed=SEED,

    # ✅ stability in notebooks / prevents dataloader worker crashes
    dataloader_num_workers=0,
    dataloader_pin_memory=False,

    # ✅ avoids some Trainer/dataset column issues
    remove_unused_columns=False,
)

try:
    training_args = TrainingArguments(
        **common_args,
        evaluation_strategy="epoch",
        save_strategy="epoch",
    )
except TypeError:
    training_args = TrainingArguments(
        **common_args,
        eval_strategy="epoch",
        save_strategy="epoch",
    )

training_args


TrainingArguments(
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=True,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=False,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
enable_jit_checkpoint=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=Fal

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=valid_tok,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

try:
    trainer = Trainer(**trainer_kwargs, tokenizer=tokenizer)
except TypeError:
    trainer = Trainer(**trainer_kwargs)

trainer


<transformers.trainer.Trainer at 0x799d3ba16d80>

In [11]:
train_result = trainer.train()
train_result


Epoch,Training Loss,Validation Loss,Accuracy,Precision Macro,Recall Macro,F1 Macro
0,1.730175,1.736202,0.235202,0.141734,0.195922,0.136485


Writing model shards: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].


TrainOutput(global_step=300, training_loss=1.7452124786376952, metrics={'train_runtime': 298.5285, 'train_samples_per_second': 8.039, 'train_steps_per_second': 1.005, 'total_flos': 24566175524736.0, 'train_loss': 1.7452124786376952, 'epoch': 0.234375})

In [12]:
valid_metrics = trainer.evaluate(eval_dataset=valid_tok)
test_metrics  = trainer.evaluate(eval_dataset=test_tok)

print("VALID:", valid_metrics)
print("TEST :", test_metrics)


VALID: {'eval_loss': 1.7362022399902344, 'eval_accuracy': 0.235202492211838, 'eval_precision_macro': 0.14173356543079718, 'eval_recall_macro': 0.19592159743145446, 'eval_f1_macro': 0.13648470159419065, 'eval_runtime': 37.5168, 'eval_samples_per_second': 34.225, 'eval_steps_per_second': 2.159, 'epoch': 0.234375}
TEST : {'eval_loss': 1.716438889503479, 'eval_accuracy': 0.23283346487766376, 'eval_precision_macro': 0.11467515772245412, 'eval_recall_macro': 0.1983185017006359, 'eval_f1_macro': 0.13390705378410298, 'eval_runtime': 36.6039, 'eval_samples_per_second': 34.614, 'eval_steps_per_second': 2.186, 'epoch': 0.234375}


In [13]:
# Save metrics
with open(os.path.join(RESULTS_DIR, "transformer_valid_metrics.json"), "w") as f:
    json.dump(valid_metrics, f, indent=2)

with open(os.path.join(RESULTS_DIR, "transformer_test_metrics.json"), "w") as f:
    json.dump(test_metrics, f, indent=2)

# Predict on test
pred_out = trainer.predict(test_tok)
test_logits = pred_out.predictions
test_pred_ids = np.argmax(test_logits, axis=-1)

pred_df = test_df.copy()
pred_df["pred_id"] = test_pred_ids
pred_df["pred_label"] = pred_df["pred_id"].map(ID2LABEL)

pred_path = os.path.join(RESULTS_DIR, "transformer_test_predictions.csv")
pred_df.to_csv(pred_path, index=False)

print("Saved:")
print("-", os.path.join(RESULTS_DIR, "transformer_valid_metrics.json"))
print("-", os.path.join(RESULTS_DIR, "transformer_test_metrics.json"))
print("-", pred_path)
pred_df.head()


Saved:
- ../results/transformer_valid_metrics.json
- ../results/transformer_test_metrics.json
- ../results/transformer_test_predictions.csv


Unnamed: 0,statement,labels,pred_id,pred_label
0,Building a wall on the U.S.-Mexico border will...,5,1,false
1,Wisconsin is on pace to double the number of l...,1,4,mostly-true
2,Says John McCain has done nothing to help the ...,1,1,false
3,Suzanne Bonamici supports a plan that will cut...,3,1,false
4,When asked by a reporter whether hes at the ce...,0,1,false


In [14]:
MODEL_OUT = os.path.join(RUN_DIR, "final_model")
trainer.save_model(MODEL_OUT)
tokenizer.save_pretrained(MODEL_OUT)
print("Saved model to:", MODEL_OUT)


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]

Saved model to: ../results/transformer_runs/final_model





In [15]:
import json, pandas as pd

with open("../results/baseline_metrics.json") as f:
    base = json.load(f)

with open("../results/transformer_valid_metrics.json") as f:
    tval = json.load(f)

with open("../results/transformer_test_metrics.json") as f:
    ttest = json.load(f)

conf = pd.read_csv("../results/top_confusions.csv")

print("BASELINE (valid): acc", base["accuracy"], "macro_f1", base["macro_f1"])
print("TRANSFORMER (valid): acc", tval.get("eval_accuracy"), "macro_f1", tval.get("eval_f1_macro"))
print("TRANSFORMER (test): acc", ttest.get("eval_accuracy"), "macro_f1", ttest.get("eval_f1_macro"))
print("\nTop confusions:\n", conf.head(10))


BASELINE (valid): acc 0.21495327102803738 macro_f1 0.19583751645282768
TRANSFORMER (valid): acc 0.235202492211838 macro_f1 0.13648470159419065
TRANSFORMER (test): acc 0.23283346487766376 macro_f1 0.13390705378410298

Top confusions:
     true_label   pred_label  count
0  barely-true    half-true     71
1  mostly-true    half-true     71
2    half-true        false     69
3  barely-true        false     61
4        false    half-true     55
5    half-true  mostly-true     54
6  mostly-true         true     46
7        false  mostly-true     44
8         true    half-true     43
9  mostly-true        false     43
