In [None]:
from datasets import load_dataset
from datasets import load_dataset, Dataset, concatenate_datasets
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
import random

In [None]:
# Fetch the NCBI Disease dataset and pick the train and validation splits.
# The dataset ships a dedicated "validation" split, so it is used for
# validation here; the "test" split stays untouched for a final evaluation.
dataset = load_dataset("ncbi_disease")
train_data = dataset['train']
val_data = dataset['validation']



Downloading data: 1.14MB [00:00, 54.0MB/s]                  
Downloading data: 200kB [00:00, 50.7MB/s]                    
Downloading data: 206kB [00:00, 213MB/s]                     
Generating train split: 100%|██████████| 5433/5433 [00:00<00:00, 30240.00 examples/s]
Generating validation split: 100%|██████████| 924/924 [00:00<00:00, 30673.27 examples/s]
Generating test split: 100%|██████████| 941/941 [00:00<00:00, 29823.94 examples/s]


In [None]:
# Mock synonym table for augmentation (many more entries could be added).
# Keys are lowercase because lookups are done on the lowercased token.
synonyms = {
    "glioblastoma": [
        "GBM",
        "glioblastoma multiforme",
    ],
    "tp53": [
        "tumor protein p53",
        "p53 gene",
    ],
}

def augment_sentence(tokens, replace_prob=0.3, synonym_map=None, rng=None):
    """Return a copy of ``tokens`` with some tokens swapped for a synonym.

    Each token is lowercased and looked up in ``synonym_map``; on a hit it is
    replaced, with probability ``replace_prob``, by one randomly chosen
    synonym. Exactly one output item is produced per input token (a
    multi-word synonym is kept as a single item), so the result has the same
    length as the input.

    Args:
        tokens: Iterable of token strings.
        replace_prob: Chance of replacing a token that has synonyms.
            Defaults to 0.3.
        synonym_map: Mapping of lowercase token -> list of replacements.
            Defaults to the module-level ``synonyms`` table.
        rng: Object exposing ``random()`` and ``choice()`` (for example a
            ``random.Random`` instance) for reproducible runs. Defaults to the
            global ``random`` module.

    Returns:
        A new list of tokens; the input is not modified.
    """
    if synonym_map is None:
        synonym_map = synonyms
    if rng is None:
        rng = random

    augmented = []
    for token in tokens:
        candidates = synonym_map.get(token.lower())
        # An empty candidate list is treated as "no synonym" instead of
        # letting choice() raise IndexError.
        if candidates and rng.random() < replace_prob:
            augmented.append(rng.choice(candidates))
        else:
            augmented.append(token)
    return augmented



In [None]:



# Tag vocabulary (adapt to the dataset): each tag name gets its position in
# the tuple as integer id; append further tag names to the tuple if needed.
label2id = {
    tag_name: tag_id
    for tag_id, tag_name in enumerate(("O", "B-DISEASE", "I-DISEASE"))
}
# Reverse lookup: integer id -> tag name.
id2label = {tag_id: tag_name for tag_name, tag_id in label2id.items()}

# Load dataset from HF hub (NCBI Disease).
# Validation uses the dataset's own "validation" split: it drives per-epoch
# evaluation and best-checkpoint selection, so using "test" here would leak
# test data into model selection.
raw_train = load_dataset("ncbi_disease", split="train")
raw_val = load_dataset("ncbi_disease", split="validation")

# Small synonym dictionary used by the augmentation step.
# Keys are lowercase; each value lists the possible replacement strings.
synonyms = {
    "glioblastoma": [
        "GBM",
        "glioblastoma multiforme",
    ],
    "tp53": [
        "tumor protein p53",
        "p53 gene",
    ],
}

def augment_example(example, replace_prob=0.3, synonym_map=None, rng=None):
    """Randomly swap tokens of one dataset example for a synonym.

    Each token in ``example["tokens"]`` is lowercased and looked up in
    ``synonym_map``; on a hit it is replaced, with probability
    ``replace_prob``, by one randomly chosen synonym. Exactly one output token
    is written per input token (a multi-word synonym stays a single token), so
    the token list keeps its length and any per-token fields of the example
    remain aligned with it.

    Args:
        example: Mapping with a ``"tokens"`` list of strings. It is updated in
            place.
        replace_prob: Chance of replacing a token that has synonyms.
            Defaults to 0.3.
        synonym_map: Mapping of lowercase token -> list of replacements.
            Defaults to the module-level ``synonyms`` table.
        rng: Object exposing ``random()`` and ``choice()`` (for example a
            ``random.Random`` instance) for reproducible runs. Defaults to the
            global ``random`` module.

    Returns:
        The same ``example`` object, with ``"tokens"`` replaced by the
        augmented list.
    """
    if synonym_map is None:
        synonym_map = synonyms
    if rng is None:
        rng = random

    augmented_tokens = []
    for token in example["tokens"]:
        candidates = synonym_map.get(token.lower())
        # An empty candidate list is treated as "no synonym" instead of
        # letting choice() raise IndexError.
        if candidates and rng.random() < replace_prob:
            augmented_tokens.append(rng.choice(candidates))
        else:
            augmented_tokens.append(token)
    example["tokens"] = augmented_tokens
    return example

# Augment training data. The augmentation draws from the global `random`
# module, so seed it first to get the same synonym swaps on every
# Restart & Run All.
random.seed(42)
augmented_train = raw_train.map(augment_example)

# Combine original + augmented (doubles the number of training examples)
train_dataset = concatenate_datasets([raw_train, augmented_train])

# Load tokenizer and model
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
    # The checkpoint's classifier head was trained for a different label set
    # (84 outputs per the load warning), so let it be re-initialized to match
    # our label count instead of failing on the shape mismatch.
    ignore_mismatched_sizes=True,
)

# Tokenize pre-split words and align the word-level NER tags to sub-word tokens
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach per-token labels.

    Args:
        examples: Batch mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of integer tag lists, one tag per word).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one label list per sentence, aligned to the produced tokens:
        positions without a source word get -100, the first piece of a word
        gets the word's tag, and later pieces of the same word get the word's
        tag, bumped by one when that tag is odd.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    def _aligned_labels(batch_index, word_tags):
        # Walk the word index of every produced token for this sentence.
        aligned = []
        last_word = None
        for word in encoded.word_ids(batch_index=batch_index):
            if word is None:
                # No source word for this position (word id is None).
                aligned.append(-100)
            else:
                tag = word_tags[word]
                # Continuation piece of the same word: an odd tag id is
                # shifted to the next id (with label2id, B-DISEASE=1 becomes
                # I-DISEASE=2); even ids are kept.
                if word == last_word and tag % 2 == 1:
                    tag += 1
                aligned.append(tag)
            last_word = word
        return aligned

    encoded["labels"] = [
        _aligned_labels(batch_index, word_tags)
        for batch_index, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded

# Tokenize both splits and attach token-level labels
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
val_dataset = raw_val.map(tokenize_and_align_labels, batched=True)

# Data collator
data_collator = DataCollatorForTokenClassification(tokenizer)

# Training arguments
training_args = TrainingArguments(
    output_dir="./ner_finetuned_rare_disease",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    # Evaluate and checkpoint on the same schedule so the best checkpoint can
    # be reloaded when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated for Trainer.__init__; `processing_class` is
    # its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# evaluate() is not the last expression of the cell, so its result would not be
# displayed on its own; keep it and print it explicitly.
eval_metrics = trainer.evaluate()
print(eval_metrics)

trainer.save_model("./ner_finetuned_rare_disease_model")


Map: 100%|██████████| 5433/5433 [00:00<00:00, 31774.54 examples/s]
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at d4data/biomedical-ner-all and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([84]) in the checkpoint and torch.Size([3]) in the model instantiated
- classifier.weight: found shape torch.Size([84, 768]) in the checkpoint and torch.Size([3, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 10866/10866 [00:01<00:00, 6304.94 examples/s]
Map: 100%|██████████| 941/941 [00:00<00:00, 5988.98 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
