In [None]:
!pip install -U transformers datasets seqeval evaluate


In [None]:
from datasets import load_dataset, DatasetDict

# Load NCBI Disease dataset
dataset = load_dataset('ncbi_disease')

# Hold out 10% of the training split for validation. The fixed seed makes the
# shuffle-and-split reproducible, so metrics are comparable across re-runs.
split_dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)

# NOTE(review): `dataset` is rebuilt from the training split only; any other
# splits returned by load_dataset above are discarded from here on.
dataset = DatasetDict({
    'train': split_dataset['train'],
    'validation': split_dataset['test']
})

# Quick sanity check: split sizes/features and one example from each split
print(dataset)
print(dataset['train'][0])
print(dataset['validation'][0])


In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Tag names come straight from the dataset so the classification head size and
# the id <-> label maps stored in the model config always match the data.
label_names = dataset['train'].features['ner_tags'].feature.names
id2label = dict(enumerate(label_names))
label2id = {name: idx for idx, name in id2label.items()}

# Load BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# Load BioBERT model with a token-classification head. Passing id2label/label2id
# makes the saved config carry the real tag names instead of generic
# placeholders, which is what downstream inference reports.
model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)


## Tokenize Dataset and Align Labels

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word-level tags to tokens.

    Args:
        examples: batch mapping with "tokens" (one list of words per sentence)
            and "ner_tags" (one list of tag ids per sentence, one id per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry: one
        list per sentence holding the word's tag on the first token of each
        word and -100 everywhere else (special tokens and continuation pieces).
        -100 is the marker that compute_metrics filters out again.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        word_ids = encoded.word_ids(batch_index=batch_idx)
        # Pair every token position with the word id of the position before it
        # (None before the first token) to detect where a new word starts.
        preceding = [None] + word_ids[:-1]
        aligned.append([
            -100 if (current is None or current == before) else word_tags[current]
            for current, before in zip(word_ids, preceding)
        ])

    encoded["labels"] = aligned
    return encoded


In [None]:
# Apply the tokenize-and-align step to every split, one batch of examples per call
tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
)

In [None]:
import numpy as np
import evaluate

# seqeval metric object; compute_metrics below calls its .compute()
metric = evaluate.load("seqeval")

# Tag names indexed by tag id, read from the dataset's `ner_tags` feature
label_list = dataset["train"].features["ner_tags"].feature.names

def compute_metrics(eval_preds):
    """Turn raw evaluation output into overall precision/recall/F1/accuracy.

    Args:
        eval_preds: pair ``(logits, labels)``; the predicted tag id of each
            position is the argmax over the last axis of ``logits``.

    Returns:
        dict with keys "precision", "recall", "f1" and "accuracy", taken from
        the ``overall_*`` entries of the metric's result.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, labels):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            # Positions labelled -100 are excluded from scoring
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary


## Set Training Arguments


In [None]:
from transformers import TrainingArguments

# Hyperparameters for the fine-tuning run. Any TrainingArguments option that is
# not listed here keeps its library default.
training_args = TrainingArguments(
    output_dir="./biobert_disease_ner",  # directory the Trainer writes its outputs to
    learning_rate=3e-5,
    per_device_train_batch_size=8,  # batch size per device during training
    per_device_eval_batch_size=8,  # batch size per device during evaluation
    num_train_epochs=5,
    weight_decay=0.01,
    report_to="none"  # no reporting integration is configured
)


## Fine-Tune BioBERT Model


In [None]:
# Define Trainer
import inspect

from transformers import Trainer, DataCollatorForTokenClassification

# Collator built from the tokenizer; it assembles the per-example features
# (including "labels") into batches for the Trainer.
data_collator = DataCollatorForTokenClassification(tokenizer)

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the old `tokenizer` keyword. Pick whichever name the installed
# Trainer accepts so this cell runs on both old and new versions.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)

# Train
trainer.train()


In [None]:
# Run evaluation on the validation split and report every float-valued metric
metrics = trainer.evaluate()
print("✅ Final Evaluation Metrics:")
for name, score in metrics.items():
    # Non-float entries are left out of the report
    if not isinstance(score, float):
        continue
    print(f"{name}: {score:.4f}")


## Plot Training Loss Curve


In [None]:
import matplotlib.pyplot as plt

# Everything the Trainer logged during the run
training_loss = trainer.state.log_history

# Keep only the entries that carry a "loss" value, paired with their step
loss_points = [(log["step"], log["loss"]) for log in training_loss if "loss" in log]
steps = [step for step, _ in loss_points]
losses = [loss for _, loss in loss_points]

# Draw the curve on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(steps, losses, marker='o')
ax.set_title('Training Loss vs Steps')
ax.set_xlabel('Steps')
ax.set_ylabel('Training Loss')
ax.grid(True)
plt.show()


## Run Inference with the Model Uploaded to Hugging Face

In [None]:
from transformers import pipeline

# NER pipeline backed by the published `Ishan0612/biobert-ner-disease-ncbi`
# checkpoint; the same repository supplies both the model and the tokenizer.
nlp = pipeline(
    task="ner",
    model="Ishan0612/biobert-ner-disease-ncbi",
    tokenizer="Ishan0612/biobert-ner-disease-ncbi",
    aggregation_strategy="simple",
)

# Example sentence to tag
text = "The patient has signs of diabetes mellitus and chronic obstructive pulmonary disease."

results = nlp(text)

# One line per detected entity: surface text, entity group and confidence
for entity in results:
    span = entity['word']
    group = entity['entity_group']
    confidence = entity['score']
    print(f"{span} ({group}) - Confidence: {confidence:.2f}")
