# **BioBERT**

In [None]:
!pip install transformers torch



In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForQuestionAnswering, AutoModelForMaskedLM, pipeline
import torch
from torch.nn.functional import cosine_similarity

# -----------------------
# Load BioBERT pretrained
# -----------------------
# Hub id shared by every tokenizer/model loaded in this cell.
model_name = "dmis-lab/biobert-base-cased-v1.1"
# The checkpoint is cased, but with default settings the tokenizer lower-cases
# its input (the recorded token-inspection output shows "BRCA1" split into
# 'br', '##ca', '##1'). Disable lower-casing so text matches the cased vocab.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)

# The load log reports qa_outputs.* as newly initialized, i.e. the span head is
# untrained; answers from this pipeline are only a pre-fine-tuning baseline.
qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=tokenizer)

# Bare encoder (used by embed) and a masked-LM model wrapped in a fill-mask pipeline.
encoder = AutoModel.from_pretrained(model_name)
mlm_model = AutoModelForMaskedLM.from_pretrained(model_name)
mlm = pipeline("fill-mask", model=mlm_model, tokenizer=tokenizer)


# Helper function
def embed(text):
    """Return the encoder's hidden state at position 0 ([CLS]) for ``text``.

    Args:
        text: A string (or batch of strings accepted by the tokenizer).

    Returns:
        A tensor holding the last-layer hidden state of the first token of
        each input, computed without gradient tracking.
    """
    # Keep the inputs on the same device as the encoder weights.
    inputs = tokenizer(text, return_tensors="pt").to(encoder.device)
    # Inference only: callers do not need an autograd graph.
    with torch.no_grad():
        return encoder(**inputs).last_hidden_state[:, 0, :]


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0
Device set to use cuda:0


## Zero-Shot

In [None]:
# Ten quick probes of the pretrained (not fine-tuned) BioBERT checkpoint.
# The QA probes go through qa_pipeline, whose span head was reported as newly
# initialized at load time, so their answers/scores are a baseline only.

# 1. Zero-shot QA
print("\n1. Zero-shot QA:")
context = "Aspirin is used to reduce fever, pain, and inflammation. It prevents blood clots."
question = "What does aspirin reduce?"
print("Input Q:", question)
print("Expected: fever, pain, inflammation")
print("Predicted:", qa_pipeline(question=question, context=context))


# 2. Text Similarity
print("\n2. Text Similarity:")
s1, s2 = "Aspirin reduces inflammation", "Aspirin lowers swelling"
with torch.no_grad():
    sim = cosine_similarity(embed(s1), embed(s2)).item()
print(f"Texts: '{s1}' vs '{s2}'")
print("Expected: High similarity (~0.8–1.0)")
print(f"Predicted similarity: {sim:.3f}")


# 3. Masked Language Modeling
print("\n3. Masked Language Modeling:")
# Keep the first five candidates returned by the fill-mask pipeline.
mlm_results = mlm("Insulin is produced by the [MASK].")[:5]
print("Input: 'Insulin is produced by the [MASK].'")
print("Expected: pancreas, beta cells, islets, body organs...")
print("Predicted:", [r['token_str'] for r in mlm_results])


# 4. Token Embedding Inspection
print("\n4. Token Embedding Inspection:")
sentence = "BRCA1 is a gene linked to breast cancer."
tokens = tokenizer(sentence, return_tensors="pt")
print("Sentence:", sentence)
print("Expected: Tokens like BRCA1, gene, cancer get distinct embeddings")
# Only the word-piece segmentation is printed here, not the embeddings.
print("Predicted tokens:", tokenizer.convert_ids_to_tokens(tokens["input_ids"][0]))


# 5. Sentence Classification Hack
print("\n5. Sentence Classification Hack:")
claim = "Aspirin reduces fever"
# Inference only: skip autograd bookkeeping, as in section 2.
with torch.no_grad():
    sim_true = cosine_similarity(embed(claim), embed("This is true")).item()
    sim_false = cosine_similarity(embed(claim), embed("This is false")).item()
print("Claim:", claim)
print("Expected: Similarity to 'True' > 'False'")
print(f"Predicted: True={sim_true:.3f}, False={sim_false:.3f}")


# 6. Domain Mismatch QA
print("\n6. Domain Mismatch QA:")
context = "Paris is the capital of France."
question = "What is the capital of France?"
print("Input Q:", question)
print("Expected: Paris")
print("Predicted:", qa_pipeline(question=question, context=context))


# 7. Word Similarity
print("\n7. Word Similarity:")
w1, w2 = "myocardial infarction", "heart attack"
# Inference only: skip autograd bookkeeping, as in section 2.
with torch.no_grad():
    sim = cosine_similarity(embed(w1), embed(w2)).item()
print(f"Words: '{w1}' vs '{w2}'")
print("Expected: High similarity (>0.8)")
print(f"Predicted similarity: {sim:.3f}")


# 8. Long Context QA
print("\n8. Long Context QA:")
context = """Alzheimer's disease is a progressive neurodegenerative disorder.
It is strongly associated with the accumulation of beta-amyloid plaques in the brain."""
question = "Which protein is implicated in Alzheimer's disease?"
print("Input Q:", question)
print("Expected: beta-amyloid")
print("Predicted:", qa_pipeline(question=question, context=context))


# 9. Contradiction Probe
print("\n9. Contradiction Probe:")
context = "Aspirin reduces inflammation."
print("Q1: Does aspirin reduce inflammation?")
print("Expected: Yes (reduce)")
print("Predicted:", qa_pipeline(question="Does aspirin reduce inflammation?", context=context))
print("Q2: Does aspirin increase inflammation?")
print("Expected: No (should not match)")
print("Predicted:", qa_pipeline(question="Does aspirin increase inflammation?", context=context))


# 10. Random Robustness
print("\n10. Random Robustness QA:")
context = "Banana walks on Mars with a scalpel."
question = "What walks on Mars?"
print("Input Q:", question)
print("Expected: (nonsense / meaningless)")
print("Predicted:", qa_pipeline(question=question, context=context))


1. Zero-shot QA:
Input Q: What does aspirin reduce?
Expected: fever, pain, inflammation
Predicted: {'score': 0.010497326031327248, 'start': 0, 'end': 38, 'answer': 'Aspirin is used to reduce fever, pain,'}

2. Text Similarity:
Texts: 'Aspirin reduces inflammation' vs 'Aspirin lowers swelling'
Expected: High similarity (~0.8–1.0)
Predicted similarity: 0.985

3. Masked Language Modeling:
Input: 'Insulin is produced by the [MASK].'
Expected: pancreas, beta cells, islets, body organs...
Predicted: ['##eim', '##ivers', '##ob', '##ively', '##encing']

4. Token Embedding Inspection:
Sentence: BRCA1 is a gene linked to breast cancer.
Expected: Tokens like BRCA1, gene, cancer get distinct embeddings
Predicted tokens: ['[CLS]', 'br', '##ca', '##1', 'is', 'a', 'gene', 'linked', 'to', 'breast', 'cancer', '.', '[SEP]']

5. Sentence Classification Hack:
Claim: Aspirin reduces fever
Expected: Similarity to 'True' > 'False'
Predicted: True=0.892, False=0.905

6. Domain Mismatch QA:
Input Q: What is t

## Few-Shot

## Fine-tuning

In [None]:
# Clone the dmis-lab/biobert repository, change into it, and try to install its requirements.
# NOTE: in the recorded run the pip step fails because tensorflow-gpu==1.15.2
# cannot be resolved in this environment (see the error output of this cell).
!git clone https://github.com/dmis-lab/biobert.git
%cd biobert
!pip install -r requirements.txt


Cloning into 'biobert'...
remote: Enumerating objects: 359, done.[K
remote: Counting objects: 100% (81/81), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 359 (delta 46), reused 51 (delta 23), pack-reused 278 (from 1)[K
Receiving objects: 100% (359/359), 516.14 KiB | 19.85 MiB/s, done.
Resolving deltas: 100% (208/208), done.
/content/biobert
[31mERROR: Could not find a version that satisfies the requirement tensorflow-gpu==1.15.2 (from versions: 2.12.0)[0m[31m
[0m[31mERROR: No matching distribution found for tensorflow-gpu==1.15.2[0m[31m
[0m

In [None]:
# Upload the dataset manually since it is not being installed
# Got the data from http://nlp.dmis.korea.edu/projects/biobert-2020-checkpoints/datasets.tar.gz

from google.colab import files
import os

# Make sure the destination directory exists (no error if it already does)
os.makedirs("./datasets/QA/BioASQ", exist_ok=True)

# Ask for the files through google.colab's files.upload(); the result is keyed by file name
uploaded = files.upload()

# Relocate every uploaded file into the BioASQ directory
for filename in uploaded:
    os.rename(filename, "./datasets/QA/BioASQ/{}".format(filename))

print("Files inside BioASQ:", os.listdir("./datasets/QA/BioASQ"))


Saving BioASQ-train-factoid-4b.json to BioASQ-train-factoid-4b.json
Saving BioASQ-test-factoid-4b-1.json to BioASQ-test-factoid-4b-1.json
Saving BioASQ-test-factoid-4b-3.json to BioASQ-test-factoid-4b-3.json
Saving BioASQ-test-factoid-4b-4.json to BioASQ-test-factoid-4b-4.json
Files inside BioASQ: ['BioASQ-test-factoid-4b-4.json', 'BioASQ-test-factoid-4b-3.json', 'BioASQ-test-factoid-4b-1.json', 'BioASQ-train-factoid-4b.json']


In [None]:
import os
# Put the model manually into the created folder biobert_v1.1_pubmed
# Paths used by the fine-tuning section: model checkpoint, BioASQ data, QA outputs.
BIOBERT_DIR, QA_DIR, OUTPUT_DIR = (
    "./biobert_v1.1_pubmed",
    "./datasets/QA/BioASQ",
    "./qa_outputs",
)

# Mirror the same three values into the process environment under identical names.
os.environ.update(
    BIOBERT_DIR=BIOBERT_DIR,
    QA_DIR=QA_DIR,
    OUTPUT_DIR=OUTPUT_DIR,
)


In [None]:
# --- installs ---
!pip install -q transformers==2.19.0 datasets==2.* numpy
# Hard-disable W&B (even if installed somewhere in the env)
import os
os.environ["WANDB_DISABLED"] = "true"

from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset
import torch


[31mERROR: Ignored the following yanked versions: 4.14.0, 4.25.0, 4.46.0, 4.52.0[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement transformers==2.19.0 (from versions: 0.1, 2.0.0, 2.1.0, 2.1.1, 2.2.0, 2.2.1, 2.2.2, 2.3.0, 2.4.0, 2.4.1, 2.5.0, 2.5.1, 2.6.0, 2.7.0, 2.8.0, 2.9.0, 2.9.1, 2.10.0, 2.11.0, 3.0.0, 3.0.1, 3.0.2, 3.1.0, 3.2.0, 3.3.0, 3.3.1, 3.4.0, 3.5.0, 3.5.1, 4.0.0rc1, 4.0.0, 4.0.1, 4.1.0, 4.1.1, 4.2.0, 4.2.1, 4.2.2, 4.3.0rc1, 4.3.0, 4.3.1, 4.3.2, 4.3.3, 4.4.0, 4.4.1, 4.4.2, 4.5.0, 4.5.1, 4.6.0, 4.6.1, 4.7.0, 4.8.0, 4.8.1, 4.8.2, 4.9.0, 4.9.1, 4.9.2, 4.10.0, 4.10.1, 4.10.2, 4.10.3, 4.11.0, 4.11.1, 4.11.2, 4.11.3, 4.12.0, 4.12.1, 4.12.2, 4.12.3, 4.12.4, 4.12.5, 4.13.0, 4.14.1, 4.15.0, 4.16.0, 4.16.1, 4.16.2, 4.17.0, 4.18.0, 4.19.0, 4.19.1, 4.19.2, 4.19.3, 4.19.4, 4.20.0, 4.20.1, 4.21.0, 4.21.1, 4.21.2, 4.21.3, 4.22.0, 4.22.1, 4.22.2, 4.23.0, 4.23.1, 4.24.0, 4.25.1, 4.26.0, 4.26.1, 4.27.0, 4.27.1, 4.27.2, 4.27.3, 4.27.4, 4.28.0, 4.28.1, 4.29.0, 4.

In [None]:

import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

# ===== 1) Load BioBERT =====
# Hub id from which both the tokenizer and the QA model are loaded.
model_name = "dmis-lab/biobert-base-cased-v1.1"
# NOTE(review): the recorded token-inspection output in the zero-shot section
# shows this checkpoint's tokenizer lower-casing text ("BRCA1" -> 'br', '##ca',
# '##1') although the checkpoint is cased. Consider do_lower_case=False, but
# only if the evaluation tokenizers are changed in the same way — TODO confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
# The load log reports qa_outputs.* as newly initialized: the span head starts
# untrained and is learned by the fine-tuning below.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# ===== 2) Flatten BioASQ (train) =====
def squad_to_rows(path):
    """Flatten a SQuAD-layout JSON file into one row per question.

    Args:
        path: Path to a JSON file shaped as ``data -> paragraphs -> qas``.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``context``,
        ``question`` and ``answers``; ``title`` falls back to ``""`` when the
        article has none, and ``answers`` is passed through unchanged.

    Raises:
        KeyError: If a required key is missing from the file's structure.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, encoding="utf-8") as f:
        squad_dict = json.load(f)
    rows = []
    for article in squad_dict["data"]:
        title = article.get("title", "")
        for p in article["paragraphs"]:
            context = p["context"]
            for qa in p["qas"]:
                rows.append({
                    "id": qa["id"],
                    "title": title,
                    "context": context,
                    "question": qa["question"],
                    "answers": qa["answers"],  # list of dicts
                })
    return rows

# Flatten the BioASQ training file into rows and wrap them in a datasets.Dataset.
train_file = "./datasets/QA/BioASQ/BioASQ-train-factoid-4b.json"
train_rows = squad_to_rows(train_file)

full_dataset = Dataset.from_list(train_rows)
print("Total examples:", len(full_dataset))

# ===== 3) Split train → train/validation (90/10) =====
# The held-out part is stored under the "test" key and used as validation below.
# NOTE(review): the evaluation cells strip an "_"-separated suffix from ids to
# recover the question id, which suggests several rows can share one question.
# A row-level random split could then put the same question in both parts —
# TODO confirm whether the training ids are suffixed as well.
raw = full_dataset.train_test_split(test_size=0.1, seed=42)
print("Train size:", len(raw["train"]))
print("Validation size:", len(raw["test"]))

# ===== 4) Preprocess features =====
# Read as globals by prepare_features: tokenizer max_length and stride.
max_len = 384
doc_stride = 128

def prepare_features(examples):
    """Tokenize a batch of QA examples and label each feature with an answer span.

    Reads the module-level ``tokenizer``, ``max_len`` and ``doc_stride``.

    Args:
        examples: A batch with parallel ``question``, ``context`` and
            ``answers`` columns; each ``answers`` entry is a list of dicts
            with ``answer_start`` and ``text``. Only the first answer is used.

    Returns:
        The tokenizer output (without ``overflow_to_sample_mapping`` and
        ``offset_mapping``) extended with ``start_positions`` and
        ``end_positions``, one entry per produced feature. Features with no
        answer, or whose window does not fully contain the answer, point both
        positions at the [CLS] token.
    """
    # Question first, context second; only the context is truncated, and long
    # contexts overflow into additional features padded to max_len.
    tokenized = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_len,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )
    # Feature index -> index of the originating example in this batch.
    sample_mapping = tokenized.pop("overflow_to_sample_mapping")
    # Per-feature list of (char_start, char_end) offsets for every token.
    offset_mapping = tokenized.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized["input_ids"][i]
        # Position of [CLS], used as the label for "no answer in this feature".
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sample_idx = sample_mapping[i]
        answers = examples["answers"][sample_idx]

        if len(answers) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            # Character span of the first answer within the context.
            start_char = answers[0]["answer_start"]
            end_char = start_char + len(answers[0]["text"])

            # Sequence id 1 marks tokens of the second input (the context);
            # find the first and last such token in this feature.
            sequence_ids = tokenized.sequence_ids(i)
            context_start = sequence_ids.index(1)
            context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

            # Answer not fully inside this window's context span -> label as [CLS].
            if not (offsets[context_start][0] <= start_char and offsets[context_end][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                # Walk forward past every token that starts at or before the
                # answer start; the token just before the stop is the first answer token.
                idx = context_start
                while idx <= context_end and offsets[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                # Walk backward past every token that ends at or after the
                # answer end; the token just after the stop is the last answer token.
                idx = context_end
                while idx >= context_start and offsets[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

    tokenized["start_positions"] = start_positions
    tokenized["end_positions"] = end_positions
    return tokenized

def _tokenize_split(split):
    """Run prepare_features over one split in batches, dropping its original columns."""
    return split.map(
        prepare_features,
        batched=True,
        remove_columns=split.column_names,
    )


# Training split first, then the held-out split used for validation.
train_ds = _tokenize_split(raw["train"])
val_ds = _tokenize_split(raw["test"])

# Columns exposed as torch tensors, restricted to those each dataset actually has.
cols = ["input_ids", "attention_mask", "token_type_ids", "start_positions", "end_positions"]
for ds in (train_ds, val_ds):
    ds.set_format(
        type="torch",
        columns=[name for name in cols if name in ds.column_names],
    )

# ===== 5) Training arguments =====
# One epoch with batch size 8 per device; progress is logged every 100 steps
# and at most one checkpoint is kept under ./results.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=100,
    save_total_limit=1,
    seed=42,
    # The WANDB_DISABLED environment variable is reported as deprecated in the
    # training log; report_to is the supported way to switch off logging integrations.
    report_to="none",
)


# ===== 6) Trainer =====
# No data collator or metric function is passed, so evaluate() reports the
# loss on the validation features.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
)

trainer.train()
print(trainer.evaluate())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total examples: 3266
Train size: 2939
Validation size: 327


Map:   0%|          | 0/2939 [00:00<?, ? examples/s]

Map:   0%|          | 0/327 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
100,3.1147
200,1.5042
300,1.4618
400,1.2224
500,1.1966
600,1.1529


{'eval_loss': 1.040913701057434, 'eval_runtime': 13.0138, 'eval_samples_per_second': 40.957, 'eval_steps_per_second': 5.148, 'epoch': 1.0}


In [None]:
import os, json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import evaluate

# ===== 1) Find the latest checkpoint in ./results =====
base_dir = "./results"
subdirs = [d for d in os.listdir(base_dir) if d.startswith("checkpoint")]
if subdirs:
    # Checkpoint directories end in "-<step>"; pick the highest step.
    latest = max(subdirs, key=lambda x: int(x.split("-")[-1]))
    model_dir = os.path.join(base_dir, latest)
else:
    # No checkpoint sub-directory: fall back to the output directory itself.
    model_dir = base_dir
print(f"Loading fine-tuned model from: {model_dir}")

# ===== 2) Load tokenizer + fine-tuned model =====
# The tokenizer comes from the Hub id; the weights come from the local checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1", use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(model_dir)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# ===== 3) Load BioASQ test file =====
test_file = "./datasets/QA/BioASQ/BioASQ-test-factoid-4b-1.json"
# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(test_file, encoding="utf-8") as f:
    test_data = json.load(f)

# Flatten into (id, question, context)
test_examples = []
for article in test_data["data"]:
    for p in article["paragraphs"]:
        context = p["context"]
        for qa in p["qas"]:
            test_examples.append({
                "id": qa["id"],
                "question": qa["question"],
                "context": context
            })

print(f"Loaded {len(test_examples)} test questions")

# ===== 4) Run inference with fine-tuned model =====
preds = []
for ex in test_examples:
    result = qa_pipeline(question=ex["question"], context=ex["context"])
    base_id = ex["id"].split("_")[0]  # strip suffix to match gold IDs
    preds.append({
        "id": base_id,
        "question": ex["question"],
        "predicted_answer": result["answer"],
        "score": result["score"]
    })

# Save predictions
out_file = "bioasq_predictions_finetuned.json"
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(preds, f, indent=2)
print(f"Saved fine-tuned predictions to {out_file}")

# ===== 5) Load golden answers =====
gold_file = "./datasets/QA/BioASQ/4B1_golden.json"
with open(gold_file, encoding="utf-8") as f:
    gold_data = json.load(f)

def extract_texts(ans):
    """Collect every non-blank string from an arbitrarily nested list.

    Args:
        ans: A string, a (possibly nested) list of strings, or anything else.

    Returns:
        A flat list of the stripped strings found in ``ans``, in order.
        Empty and whitespace-only strings are dropped; values that are
        neither ``str`` nor ``list`` contribute nothing.
    """
    if isinstance(ans, str):
        # Strip before testing, so a whitespace-only answer is not kept as "".
        stripped = ans.strip()
        return [stripped] if stripped else []
    if isinstance(ans, list):
        return [text for item in ans for text in extract_texts(item)]
    return []

# Gold answers keyed by question id; questions without a usable exact_answer are skipped.
gold_refs = {}
for q in gold_data["questions"]:
    if "exact_answer" in q and q["exact_answer"]:
        texts = extract_texts(q["exact_answer"])
        if texts:
            # answer_start is a placeholder: only the answer texts are known here.
            gold_refs[q["id"]] = {"text": texts, "answer_start": [0] * len(texts)}

# ===== 6) Align predictions with gold =====
# After the id suffix is stripped, several predictions can share one question
# id. The squad metric looks predictions up by id, so duplicates would all be
# scored against a single prediction; keep the highest-scoring one per question.
best_by_qid = {}
for p in preds:
    qid = p["id"]
    if qid not in gold_refs:
        continue
    if qid not in best_by_qid or p["score"] > best_by_qid[qid]["score"]:
        best_by_qid[qid] = p

predictions = [
    {"id": qid, "prediction_text": best["predicted_answer"]}
    for qid, best in best_by_qid.items()
]
references = [{"id": qid, "answers": gold_refs[qid]} for qid in best_by_qid]

print(f"Evaluating {len(predictions)} questions with gold answers (fine-tuned BioBERT)")

# ===== 7) Compute metrics =====
if predictions:
    metric = evaluate.load("squad")
    results = metric.compute(predictions=predictions, references=references)
    print("\n📊 Fine-tuned BioBERT Performance:")
    print("Exact Match:", results["exact_match"])
    print("F1:", results["f1"])
else:
    print("⚠️ No evaluable questions found")


Loading fine-tuned model from: ./results/checkpoint-622


Device set to use cuda:0


Loaded 167 test questions
Saved fine-tuned predictions to bioasq_predictions_finetuned.json
Evaluating 167 questions with gold answers (fine-tuned BioBERT)

📊 Fine-tuned BioBERT Performance:
Exact Match: 17.964071856287426
F1: 28.222658692639783


In [None]:
import json
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import evaluate

# ===== 1) Load raw BioBERT (no fine-tuning) =====
# The load log reports qa_outputs.* as newly initialized, so this pipeline
# serves as an untrained-head baseline for the fine-tuned model.
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# ===== 2) Load BioASQ test file =====
test_file = "./datasets/QA/BioASQ/BioASQ-test-factoid-4b-1.json"
# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(test_file, encoding="utf-8") as f:
    test_data = json.load(f)

# Flatten into (id, question, context)
test_examples = []
for article in test_data["data"]:
    for p in article["paragraphs"]:
        context = p["context"]
        for qa in p["qas"]:
            test_examples.append({
                "id": qa["id"],
                "question": qa["question"],
                "context": context
            })

print(f"Loaded {len(test_examples)} test questions")

# ===== 3) Run inference with raw BioBERT =====
preds = []
for ex in test_examples:
    result = qa_pipeline(question=ex["question"], context=ex["context"])
    base_id = ex["id"].split("_")[0]  # strip suffix to match gold
    preds.append({
        "id": base_id,
        "question": ex["question"],
        "predicted_answer": result["answer"],
        "score": result["score"]
    })

# Save predictions
out_file = "bioasq_predictions_biobert_raw.json"
with open(out_file, "w", encoding="utf-8") as f:
    json.dump(preds, f, indent=2)
print(f"Saved raw BioBERT predictions to {out_file}")

# ===== 4) Load golden answers =====
gold_file = "./datasets/QA/BioASQ/4B1_golden.json"
with open(gold_file, encoding="utf-8") as f:
    gold_data = json.load(f)

# Recursive extractor for gold answers
def extract_texts(ans):
    """Collect every non-blank string from an arbitrarily nested list.

    Args:
        ans: A string, a (possibly nested) list of strings, or anything else.

    Returns:
        A flat list of the stripped strings found in ``ans``, in order.
        Empty and whitespace-only strings are dropped; values that are
        neither ``str`` nor ``list`` contribute nothing.
    """
    if isinstance(ans, str):
        # Strip before testing, so a whitespace-only answer is not kept as "".
        stripped = ans.strip()
        return [stripped] if stripped else []
    if isinstance(ans, list):
        return [text for item in ans for text in extract_texts(item)]
    return []

# Gold answers keyed by question id; questions without a usable exact_answer are skipped.
gold_refs = {}
for q in gold_data["questions"]:
    if "exact_answer" in q and q["exact_answer"]:
        texts = extract_texts(q["exact_answer"])
        if texts:
            # answer_start is a placeholder: only the answer texts are known here.
            gold_refs[q["id"]] = {"text": texts, "answer_start": [0] * len(texts)}

# ===== 5) Align predictions with gold =====
# After the id suffix is stripped, several predictions can share one question
# id. The squad metric looks predictions up by id, so duplicates would all be
# scored against a single prediction; keep the highest-scoring one per question.
best_by_qid = {}
for p in preds:
    qid = p["id"]
    if qid not in gold_refs:
        continue
    if qid not in best_by_qid or p["score"] > best_by_qid[qid]["score"]:
        best_by_qid[qid] = p

predictions = [
    {"id": qid, "prediction_text": best["predicted_answer"]}
    for qid, best in best_by_qid.items()
]
references = [{"id": qid, "answers": gold_refs[qid]} for qid in best_by_qid]

print(f"Evaluating {len(predictions)} questions with gold answers (raw BioBERT)")

# ===== 6) Compute metrics =====
if predictions:
    metric = evaluate.load("squad")
    results = metric.compute(predictions=predictions, references=references)
    print("\n📊 Raw BioBERT Performance:")
    print("Exact Match:", results["exact_match"])
    print("F1:", results["f1"])
else:
    print("⚠️ No evaluable questions found")


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Loaded 167 test questions
Saved raw BioBERT predictions to bioasq_predictions_biobert_raw.json
Evaluating 167 questions with gold answers (raw BioBERT)

📊 Raw BioBERT Performance:
Exact Match: 0.0
F1: 2.90015334749396
