In [1]:
# (optional) force single GPU to avoid NCCL issues; set *before* importing torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

!pip install -q --upgrade accelerate transformers datasets scikit-learn pandas matplotlib evaluate


[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python -m pip install --upgrade pip


In [2]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split

SEED = 42

def load_ade_clean(seed=SEED):
    """Load the ADE classification corpus, de-duplicate it, and split it 80/10/10.

    Every split returned by the Hub loader is stacked into one frame, rows with
    a missing text or label are dropped, and identical texts are collapsed into
    a single row. The label of a collapsed row is the rounded mean of its
    labels (``round`` is half-to-even, so an exact 0.5 tie resolves to 0).

    Args:
        seed: ``random_state`` used for both stratified splits.

    Returns:
        DatasetDict with "train", "validation" and "test" splits, each holding
        "text" and "label" columns.
    """
    raw = load_dataset("ade_corpus_v2", "Ade_corpus_v2_classification")

    # Stack all available splits into a single text/label frame.
    frames = [raw[split_name].to_pandas()[["text", "label"]] for split_name in raw.keys()]
    merged = pd.concat(frames, ignore_index=True)
    merged = merged.dropna(subset=["text", "label"]).reset_index(drop=True)

    # One row per distinct text, labelled by the rounded mean of its labels.
    per_text = merged.groupby("text")["label"]
    deduped = per_text.agg(lambda grp: int(round(grp.mean()))).reset_index()

    texts = deduped["text"].tolist()
    labels = deduped["label"].tolist()

    # First carve off 20% as a holdout, then halve the holdout into val/test.
    train_x, hold_x, train_y, hold_y = train_test_split(
        texts, labels, test_size=0.20, random_state=seed, stratify=labels
    )
    val_x, test_x, val_y, test_y = train_test_split(
        hold_x, hold_y, test_size=0.50, random_state=seed, stratify=hold_y
    )

    parts = {
        "train": (train_x, train_y),
        "validation": (val_x, val_y),
        "test": (test_x, test_y),
    }
    return DatasetDict({
        split_name: Dataset.from_dict({"text": xs, "label": ys})
        for split_name, (xs, ys) in parts.items()
    })

# Build the de-duplicated splits once; later cells read this module-level `dataset`.
dataset = load_ade_clean()
# Cell output: number of examples in each split.
{k: len(dataset[k]) for k in dataset}


{'train': 16716, 'validation': 2090, 'test': 2090}

In [3]:
import hashlib
def _H(texts):
    """Return the set of MD5 hex digests of the given texts.

    Each text is UTF-8 encoded with unencodable characters dropped before
    hashing; duplicate texts collapse to a single digest.
    """
    digests = set()
    for text in texts:
        payload = text.encode("utf-8", "ignore")
        digests.add(hashlib.md5(payload).hexdigest())
    return digests

# Hash the texts of each split, then print how many hashes each pair of splits shares.
# A non-zero count means the same text appears in two splits (leakage).
htr, hva, hte = _H(dataset["train"]["text"]), _H(dataset["validation"]["text"]), _H(dataset["test"]["text"])
print("train ∩ val :", len(htr & hva))
print("train ∩ test:", len(htr & hte))
print("val   ∩ test:", len(hva & hte))


train ∩ val : 0
train ∩ test: 0
val   ∩ test: 0


In [20]:
import numpy as np, json
from typing import Dict, Any
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix, f1_score

# ---------- TF-IDF + LinearSVM (with calibration) ----------
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline

def run_tfidf_baseline(dataset, name="TF-IDF + LinearSVM"):
    """Fit a TF-IDF + calibrated LinearSVC baseline and report its test metrics.

    The pipeline is fitted on ``dataset["train"]``. The decision threshold is
    then picked on ``dataset["validation"]`` as the candidate in [0.20, 0.80]
    (61 evenly spaced values) with the highest F1; on ties the lowest such
    threshold wins. That threshold is applied to ``dataset["test"]``.

    Args:
        dataset: Mapping with "train", "validation" and "test" splits, each
            exposing "text" and "label" columns.
        name: Value stored under "Model" in the returned row.

    Returns:
        dict with keys Model, ThresholdUsed (rounded to 2 decimals),
        Test_Accuracy, Test_Precision, Test_Recall, Test_F1, Test_AUC and
        Checkpoint (the fixed placeholder "(sklearn pipeline)").
    """
    tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=50_000, min_df=2)
    base  = LinearSVC()
    # LinearSVC has no predict_proba; calibration supplies probabilities for
    # AUC and threshold tuning.
    clf   = CalibratedClassifierCV(base, method="sigmoid", cv=3)

    pipe = Pipeline([("tfidf", tfidf), ("clf", clf)])
    Xtr, ytr = dataset["train"]["text"],      np.array(dataset["train"]["label"])
    Xva, yva = dataset["validation"]["text"], np.array(dataset["validation"]["label"])
    Xte, yte = dataset["test"]["text"],       np.array(dataset["test"]["label"])

    pipe.fit(Xtr, ytr)

    # Tune the threshold on validation for best F1.
    p_va = pipe.predict_proba(Xva)[:,1]
    ths  = np.linspace(0.2, 0.8, 61)
    best_t, _ = max(((t, f1_score(yva, (p_va>=t).astype(int))) for t in ths), key=lambda x: x[1])

    # Test metrics at that threshold (AUC is threshold-free and uses raw probabilities).
    p_te = pipe.predict_proba(Xte)[:,1]
    yhat = (p_te >= best_t).astype(int)

    acc = accuracy_score(yte, yhat)
    pr, rc, f1, _ = precision_recall_fscore_support(yte, yhat, average="binary", zero_division=0)
    auc = roc_auc_score(yte, p_te)

    return {"Model": name, "ThresholdUsed": round(float(best_t),2),
            "Test_Accuracy": float(acc), "Test_Precision": float(pr),
            "Test_Recall": float(rc), "Test_F1": float(f1), "Test_AUC": float(auc),
            "Checkpoint": "(sklearn pipeline)"}

# ---------- Transformer trainer with tuned recipe ----------
import torch
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          Trainer, TrainingArguments, EarlyStoppingCallback)

def run_transformer(model_id: str, friendly_name: str,
                    batch=12, epochs=5, lr=1e-5, sched="linear", warmup_ratio=0.06,
                    max_len=512, patience=3, data=None) -> Dict[str,Any]:
    """Fine-tune a 2-class sequence classifier, tune its threshold, and score it on test.

    The model is trained on the "train" split with evaluation every 200 steps,
    early stopping on validation F1, and the best checkpoint reloaded at the
    end. The decision threshold is then picked on the "validation" split as the
    candidate in [0.20, 0.80] (61 evenly spaced values) with the highest F1 and
    applied to the "test" split. The final model and tokenizer are saved under
    the ``/models/`` sibling of the training output directory.

    Args:
        model_id: Hub id or path passed to ``from_pretrained``.
        friendly_name: Display name; also used (spaces -> underscores) as the
            output sub-directory name.
        batch: Per-device train and eval batch size.
        epochs: Maximum number of training epochs.
        lr: Learning rate.
        sched: Learning-rate scheduler type.
        warmup_ratio: Warm-up ratio passed to ``TrainingArguments``.
        max_len: Maximum token length; longer inputs are truncated.
        patience: Early-stopping patience, in evaluations.
        data: Optional mapping with "train", "validation" and "test" splits
            (each with "text" and "label"). When omitted, the notebook-level
            ``dataset`` is used.

    Returns:
        dict with keys Model, ThresholdUsed (rounded to 2 decimals),
        Test_Accuracy, Test_Precision, Test_Recall, Test_F1, Test_AUC and
        Checkpoint (directory the model was saved to).
    """
    # Fall back to the notebook-level `dataset` so existing two-argument calls still work.
    splits = dataset if data is None else data

    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)

    def tok_fn(ex):
        return tok(ex["text"], truncation=True, max_length=max_len)

    dtr = splits["train"].map(tok_fn, batched=True)
    dva = splits["validation"].map(tok_fn, batched=True)
    dte = splits["test"].map(tok_fn, batched=True)

    model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=2)

    args = TrainingArguments(
        output_dir=f"/workspace/ade-project/outputs/{friendly_name.replace(' ','_')}",
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=batch,
        gradient_accumulation_steps=1,
        learning_rate=lr, lr_scheduler_type=sched, warmup_ratio=warmup_ratio,
        num_train_epochs=epochs,
        eval_strategy="steps", eval_steps=200,
        logging_steps=50, save_steps=200, save_total_limit=2,
        load_best_model_at_end=True, metric_for_best_model="f1",
        report_to="none"
    )

    def pos_prob(logits):
        # For two classes, softmax P(class 1) equals sigmoid(logit_1 - logit_0).
        return 1/(1+np.exp(-(logits[:,1]-logits[:,0])))

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        preds = logits.argmax(axis=1)
        probs = pos_prob(logits)
        pr, rc, f1, _ = precision_recall_fscore_support(labels, preds, average="binary", zero_division=0)
        try:
            auc = roc_auc_score(labels, probs)
        except ValueError:
            # AUC is undefined when the labels contain a single class; report NaN
            # for that case only, and let any other error surface.
            auc = float("nan")
        acc = (preds==labels).mean()
        return {"accuracy": acc, "precision": pr, "recall": rc, "f1": f1, "auc": auc}

    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    tr = Trainer(model=model, args=args, train_dataset=dtr, eval_dataset=dva,
                 processing_class=tok, compute_metrics=compute_metrics,
                 callbacks=[EarlyStoppingCallback(early_stopping_patience=patience)])
    tr.train()

    # Threshold tuned on validation for best F1.
    pred_va = tr.predict(dva)
    p_va = pos_prob(pred_va.predictions)
    y_va = np.array(pred_va.label_ids)
    ths  = np.linspace(0.2, 0.8, 61)
    best_t, _ = max(((t, f1_score(y_va, (p_va>=t).astype(int))) for t in ths), key=lambda x: x[1])

    # Test metrics at that threshold.
    pred_te = tr.predict(dte)
    p_te = pos_prob(pred_te.predictions)
    y_te = np.array(pred_te.label_ids)
    yhat = (p_te >= best_t).astype(int)

    acc = accuracy_score(y_te, yhat)
    pr, rc, f1, _ = precision_recall_fscore_support(y_te, yhat, average="binary", zero_division=0)
    auc = roc_auc_score(y_te, p_te)

    # Final model goes to the /models/ sibling of the training output directory.
    ckpt_path = args.output_dir.replace("/outputs/","/models/")
    model.save_pretrained(ckpt_path)
    tok.save_pretrained(ckpt_path)

    return {"Model": f"{friendly_name} (tuned recipe)", "ThresholdUsed": round(float(best_t),2),
            "Test_Accuracy": float(acc), "Test_Precision": float(pr),
            "Test_Recall": float(rc), "Test_F1": float(f1), "Test_AUC": float(auc),
            "Checkpoint": ckpt_path}


In [17]:
import pandas as pd

# Fine-tune each transformer in turn (hub id, display name), then add the
# TF-IDF baseline as the last row.
rows = [
    run_transformer(hub_id, label)
    for hub_id, label in (
        ("dmis-lab/biobert-base-cased-v1.2", "BioBERT"),
        ("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract", "PubMedBERT"),
        ("bert-base-uncased", "BERT-base"),
    )
]
rows.append(run_tfidf_baseline(dataset))

df = pd.DataFrame(rows)
display(df)

# Persist the comparison table as both CSV and JSON.
OUT = "/workspace/ade-project/outputs"
os.makedirs(OUT, exist_ok=True)
csv_path = f"{OUT}/comparison_tuned_recipe.csv"
json_path = f"{OUT}/comparison_tuned_recipe.json"
df.to_csv(csv_path, index=False)
with open(json_path, "w") as f:
    json.dump(rows, f, indent=2)

print("Saved:", csv_path)
print("Saved:", json_path)


Map:   0%|          | 0/16716 [00:00<?, ? examples/s]

Map:   0%|          | 0/2090 [00:00<?, ? examples/s]

Map:   0%|          | 0/2090 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tr = Trainer(model=model, args=args, train_dataset=dtr, eval_dataset=dva,


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
200,0.4649,0.418735,0.795694,0.0,0.0,0.0,0.826919
400,0.2614,0.203061,0.925359,0.82494,0.805621,0.815166,0.961939
600,0.2243,0.166956,0.938756,0.821505,0.894614,0.856502,0.973449
800,0.2113,0.174231,0.948804,0.892157,0.852459,0.871856,0.978883
1000,0.2501,0.18541,0.932057,0.933131,0.71897,0.812169,0.979962
1200,0.1427,0.196622,0.944498,0.827368,0.920375,0.871397,0.981059
1400,0.1911,0.150899,0.952632,0.871041,0.901639,0.886076,0.983535
1600,0.1467,0.182148,0.953589,0.854077,0.932084,0.891377,0.982664
1800,0.1651,0.186311,0.947847,0.832636,0.932084,0.879558,0.984042
2000,0.0887,0.178465,0.954067,0.855914,0.932084,0.892377,0.984313


Map:   0%|          | 0/16716 [00:00<?, ? examples/s]

Map:   0%|          | 0/2090 [00:00<?, ? examples/s]

Map:   0%|          | 0/2090 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tr = Trainer(model=model, args=args, train_dataset=dtr, eval_dataset=dva,


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
200,0.4237,0.377722,0.84689,0.824242,0.318501,0.459459,0.863566
400,0.2149,0.248077,0.90622,0.700173,0.946136,0.804781,0.969541
600,0.2739,0.184762,0.939234,0.842466,0.864169,0.853179,0.976595
800,0.2367,0.15827,0.949761,0.859375,0.901639,0.88,0.982449
1000,0.2169,0.150996,0.951675,0.867117,0.901639,0.884041,0.983685
1200,0.1774,0.186293,0.947847,0.841202,0.918033,0.87794,0.986089
1400,0.232,0.145617,0.952153,0.867416,0.903981,0.885321,0.986439
1600,0.12,0.197398,0.95311,0.871332,0.903981,0.887356,0.985695
1800,0.156,0.155964,0.958373,0.895349,0.901639,0.898483,0.986942
2000,0.1023,0.170629,0.958373,0.902844,0.892272,0.897527,0.987122


Map:   0%|          | 0/16716 [00:00<?, ? examples/s]

Map:   0%|          | 0/2090 [00:00<?, ? examples/s]

Map:   0%|          | 0/2090 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  tr = Trainer(model=model, args=args, train_dataset=dtr, eval_dataset=dva,


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
200,0.5218,0.481337,0.795694,0.0,0.0,0.0,0.74376
400,0.3172,0.262418,0.902392,0.826979,0.660422,0.734375,0.939355
600,0.236,0.226111,0.902871,0.712121,0.880562,0.787435,0.95575
800,0.2567,0.17994,0.931579,0.828704,0.838407,0.833527,0.96611
1000,0.2754,0.216529,0.920574,0.882698,0.704918,0.783854,0.967707
1200,0.1948,0.244195,0.909569,0.705882,0.955504,0.81194,0.974005
1400,0.2221,0.204033,0.931579,0.90113,0.747073,0.816901,0.9738


Unnamed: 0,Model,ThresholdUsed,Test_Accuracy,Test_Precision,Test_Recall,Test_F1,Test_AUC,Checkpoint
0,BioBERT (tuned recipe),0.32,0.953589,0.873303,0.903981,0.888377,0.983821,/workspace/ade-project/models/BioBERT
1,PubMedBERT (tuned recipe),0.68,0.95311,0.896386,0.871194,0.88361,0.985381,/workspace/ade-project/models/PubMedBERT
2,BERT-base (tuned recipe),0.4,0.928708,0.798283,0.871194,0.833147,0.967685,/workspace/ade-project/models/BERT-base
3,TF-IDF + LinearSVM,0.36,0.886124,0.710468,0.747073,0.728311,0.929341,(sklearn pipeline)


Saved: /workspace/ade-project/outputs/comparison_tuned_recipe.csv
Saved: /workspace/ade-project/outputs/comparison_tuned_recipe.json


In [27]:
from datasets import load_dataset
# Load the SetFit-hosted ADE classification dataset from the Hub.
ds = load_dataset("SetFit/ade_corpus_v2_classification")  # train/test ready


README.md:   0%|          | 0.00/331 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


train.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/17637 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5879 [00:00<?, ? examples/s]

In [28]:
# ===== External evaluation on SetFit ADE (test split) =====
import os, json, numpy as np, pandas as pd, torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, confusion_matrix

# 1) Load SetFit ADE external test set
# NOTE(review): this dataset appears to be a re-split of the same ADE Corpus v2
# that the models were trained on, so its test split likely overlaps with the
# training texts. Verify (e.g. with the MD5 overlap check used on the internal
# splits) before treating these scores as an external evaluation.
ext = load_dataset("SetFit/ade_corpus_v2_classification")
external_test = ext["test"]  # keep as-is; it already has columns: 'text', 'label'
print("External test size:", len(external_test))

# 2) Helper to evaluate a saved HF checkpoint at a fixed probability threshold
def eval_ckpt(ckpt_path: str, threshold: float, dataset, max_len=512):
    """Score a saved sequence-classification checkpoint at a fixed threshold.

    Args:
        ckpt_path: Directory (or hub id) holding the saved model and tokenizer.
        threshold: Probability cut-off; an example is predicted positive when
            P(class=1) >= threshold.
        dataset: Dataset with "text" and "label" columns to evaluate on.
        max_len: Maximum token length; longer inputs are truncated.

    Returns:
        dict with keys Accuracy, Precision, Recall, F1, AUC (NaN when the
        labels contain a single class) and CM (confusion matrix as nested lists).
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tok = AutoTokenizer.from_pretrained(ckpt_path, use_fast=True, local_files_only=False)
    mdl = AutoModelForSequenceClassification.from_pretrained(ckpt_path, local_files_only=False).to(device)

    def tok_fn(ex):
        return tok(ex["text"], truncation=True, max_length=max_len)
    dset_tok = dataset.map(tok_fn, batched=True)

    # `processing_class` replaces the deprecated `tokenizer` argument of Trainer.
    tr = Trainer(model=mdl, processing_class=tok)
    pred = tr.predict(dset_tok)
    logits = pred.predictions
    y_true = np.array(pred.label_ids)

    # P(class=1): for two classes, softmax equals sigmoid of the logit difference.
    prob1 = 1/(1+np.exp(-(logits[:,1]-logits[:,0])))
    y_hat = (prob1 >= threshold).astype(int)

    acc = accuracy_score(y_true, y_hat)
    pr, rc, f1, _ = precision_recall_fscore_support(y_true, y_hat, average="binary", zero_division=0)
    try:
        auc = roc_auc_score(y_true, prob1)
    except ValueError:
        # AUC is undefined when y_true holds a single class; report NaN for
        # that case only, and let any other error surface.
        auc = float("nan")
    cm  = confusion_matrix(y_true, y_hat).tolist()
    return {"Accuracy":acc, "Precision":pr, "Recall":rc, "F1":f1, "AUC":auc, "CM":cm}

# 3) Load your tuned models + thresholds from the JSON you saved earlier
CHOICES_JSON = "/workspace/ade-project/outputs/comparison_tuned_recipe.json"
# Read inside a `with` block so the file handle is closed deterministically.
with open(CHOICES_JSON) as fh:
    rows = json.load(fh)

# keep only transformer rows (skip TF-IDF because there’s no HF checkpoint path)
models = [r for r in rows if "Checkpoint" in r and r["Checkpoint"] != "(sklearn pipeline)"]
assert len(models) > 0, "No transformer checkpoints found in your comparison JSON."

# 4) Evaluate all models on SetFit test split, each at the threshold tuned earlier
ext_rows = []
for m in models:
    name = m["Model"]
    thr  = float(m["ThresholdUsed"])
    ckpt = m["Checkpoint"]
    print(f"Scoring external SetFit — {name} @ thr={thr} …")
    met = eval_ckpt(ckpt, thr, external_test)
    ext_rows.append({
        "Model": name,
        "ThresholdUsed": thr,
        "Ext_Accuracy": met["Accuracy"],
        "Ext_Precision": met["Precision"],
        "Ext_Recall": met["Recall"],
        "Ext_F1": met["F1"],
        "Ext_AUC": met["AUC"],
        "Checkpoint": ckpt
    })

# Displayed table is sorted by F1; the JSON below keeps evaluation order.
df_ext = pd.DataFrame(ext_rows).sort_values("Ext_F1", ascending=False).reset_index(drop=True)
display(df_ext)

# 5) Save
OUT = "/workspace/ade-project/outputs"
os.makedirs(OUT, exist_ok=True)
CSV = f"{OUT}/external_setfit_eval.csv"
JSON = f"{OUT}/external_setfit_eval.json"
df_ext.to_csv(CSV, index=False)
# Write inside a `with` block so the data is flushed and the handle closed.
with open(JSON, "w") as fh:
    json.dump(ext_rows, fh, indent=2)
print("Saved:", CSV)
print("Saved:", JSON)


Repo card metadata block was not found. Setting CardData to empty.


External test size: 5879
Scoring external SetFit — BioBERT (tuned recipe) @ thr=0.32 …


Map:   0%|          | 0/5879 [00:00<?, ? examples/s]

  tr = Trainer(model=mdl, tokenizer=tok)


Scoring external SetFit — PubMedBERT (tuned recipe) @ thr=0.68 …


Map:   0%|          | 0/5879 [00:00<?, ? examples/s]

  tr = Trainer(model=mdl, tokenizer=tok)


Scoring external SetFit — BERT-base (tuned recipe) @ thr=0.4 …


Map:   0%|          | 0/5879 [00:00<?, ? examples/s]

  tr = Trainer(model=mdl, tokenizer=tok)


Unnamed: 0,Model,ThresholdUsed,Ext_Accuracy,Ext_Precision,Ext_Recall,Ext_F1,Ext_AUC,Checkpoint
0,BioBERT (tuned recipe),0.32,0.9818,0.961742,0.97494,0.968296,0.996484,/workspace/ade-project/models/BioBERT
1,PubMedBERT (tuned recipe),0.68,0.967171,0.948036,0.936158,0.942059,0.992427,/workspace/ade-project/models/PubMedBERT
2,BERT-base (tuned recipe),0.4,0.92941,0.865932,0.890215,0.877905,0.973231,/workspace/ade-project/models/BERT-base


Saved: /workspace/ade-project/outputs/external_setfit_eval.csv
Saved: /workspace/ade-project/outputs/external_setfit_eval.json


In [29]:
from datasets import load_dataset
import random

# Load
# Load the SetFit-hosted ADE dataset; printing it lists each split with its features and row count.
ds = load_dataset("SetFit/ade_corpus_v2_classification")
print(ds)  # shows splits and sizes

# Label names (0 = no ADE, 1 = ADE); read by show() and the label-count loop below.
label_names = {0: "no_ADE", 1: "ADE"}

# Helper to pretty-print an example
def show(ex):
    """Print one example as ``[<label name>] <text>`` followed by a blank line."""
    tag = label_names[int(ex["label"])]
    body = ex["text"]
    print(f"[{tag}] {body}\n")

# Look at 3 random test examples
print("=== Random test examples ===")
# Three test examples drawn with a fixed shuffle seed.
sampled = ds["test"].shuffle(seed=42).select(range(3))
for ex in sampled:
    show(ex)

# First ADE and first no-ADE example encountered in the test split.
pos = next(filter(lambda row: row["label"] == 1, ds["test"]))
neg = next(filter(lambda row: row["label"] == 0, ds["test"]))

print("=== One ADE example ===")
show(pos)
print("=== One no-ADE example ===")
show(neg)

# Per-split label tallies; both classes start at zero so each always appears.
for split in ("train", "test"):
    counts = dict.fromkeys((0, 1), 0)
    for y in ds[split]["label"]:
        counts[int(y)] += 1
    named_counts = {label_names[k]: v for k, v in counts.items()}
    print(f"{split} label counts:", named_counts)


Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 17637
    })
    test: Dataset({
        features: ['text', 'label', 'label_text'],
        num_rows: 5879
    })
})
=== Random test examples ===
[ADE] CONCLUSIONS: Clinicians should be aware of a risk of serotonin syndrome with serious extrapyramidal reactions in patients receiving sertraline or venlafaxine when metoclopramide is coadministered even in a single, conventional dose.

[no_ADE] He was referred after a percutaneous liver biopsy which revealed a moderately differentiated HCC.

[no_ADE] Measurements of the peripheral arterial circulation were made using the Doppler ultrasonic velocity detector.

=== One ADE example ===
[ADE] The patient was given methimazole instead of propylthiouracil but, 10 weeks later, agranulocytosis again occurred.

=== One no-ADE example ===
[no_ADE] The use of somatostatin analog in gastroenteropancreatic tumors other than carcinoid.

train label c