## Training

In [3]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)
from datasets import Dataset
from seqeval.metrics import classification_report


In [4]:
label_list = [
    "O",
    "B-SYMPTOM",
    "I-SYMPTOM",
    "B-TREATMENT",
    "I-TREATMENT"
]

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}


In [5]:
model_name = "dmis-lab/biobert-base-cased-v1.1"

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def read_conll_file(path):
    sentences = []
    labels = []

    words = []
    tags = []

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if words:
                    sentences.append(words)
                    labels.append(tags)
                    words, tags = [], []
            else:
                token, tag = line.split()
                words.append(token)
                tags.append(tag)

    return {"tokens": sentences, "ner_tags": labels}

In [None]:
train_data = read_conll_file("../Data/i2b2_Transformed/train.txt")
dev_data = read_conll_file("../Data/i2b2_Transformed/dev.txt")

train_dataset = Dataset.from_dict(train_data)
dev_dataset = Dataset.from_dict(dev_data)


In [8]:
train_dataset[98]

{'tokens': ['1.', 'Amiodarone', '200', 'mg', 'q.d.'],
 'ner_tags': ['O', 'B-TREATMENT', 'O', 'O', 'O']}

In [9]:
def tokenize_and_align_labels(examples):
    tokenized = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True
    )

    aligned_labels = []

    for i, labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized.word_ids(batch_index=i)
        previous_word_id = None
        label_ids = []

        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)  # ignored
            elif word_id != previous_word_id:
                label_ids.append(label2id[labels[word_id]])
            else:
                label_ids.append(label2id[labels[word_id]])
            previous_word_id = word_id

        aligned_labels.append(label_ids)

    tokenized["labels"] = aligned_labels
    return tokenized


In [10]:
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
dev_dataset = dev_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/27625 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/2447 [00:00<?, ? examples/s]

In [11]:
train_dataset[98]

{'tokens': ['1.', 'Amiodarone', '200', 'mg', 'q.d.'],
 'ner_tags': ['O', 'B-TREATMENT', 'O', 'O', 'O'],
 'input_ids': [101,
  122,
  119,
  1821,
  2660,
  7858,
  4798,
  2363,
  17713,
  186,
  119,
  173,
  119,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [-100, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, -100]}

In [15]:
# training_args = TrainingArguments(
#     output_dir="./biobert-ner",
#     eval_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     save_strategy="epoch",
#     logging_steps=50,
#     report_to="none"
# )
training_args = TrainingArguments(
    output_dir="./biobert_ner_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    greater_is_better=True,
    save_total_limit=1,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none"
)


In [16]:
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)


  trainer = Trainer(


In [17]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.0411,0.143732
2,0.0385,0.15818
3,0.0197,0.173221


TrainOutput(global_step=5181, training_loss=0.0381317838603662, metrics={'train_runtime': 910.5521, 'train_samples_per_second': 91.016, 'train_steps_per_second': 5.69, 'total_flos': 2136249767306460.0, 'train_loss': 0.0381317838603662, 'epoch': 3.0})

In [None]:
from datasets import Dataset
test_data = read_conll_file("../Data/i2b2_Transformed/test.txt")
test_dataset = Dataset.from_dict(test_data)
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [19]:

predictions, labels, _ = trainer.predict(test_dataset)


In [20]:
import numpy as np

predictions = np.argmax(predictions, axis=2)

true_labels = []
true_predictions = []

for pred, label in zip(predictions, labels):
    curr_true = []
    curr_pred = []

    for p, l in zip(pred, label):
        if l == -100:
            continue
        curr_true.append(id2label[l])
        curr_pred.append(id2label[p])

    true_labels.append(curr_true)
    true_predictions.append(curr_pred)


In [21]:
from seqeval.metrics import classification_report, f1_score

print(classification_report(true_labels, true_predictions))

              precision    recall  f1-score   support

     SYMPTOM       0.83      0.90      0.86       100
   TREATMENT       1.00      1.00      1.00         2

   micro avg       0.83      0.90      0.86       102
   macro avg       0.91      0.95      0.93       102
weighted avg       0.83      0.90      0.86       102



## Prediction
Loading saved model after training

In [8]:
import torch

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

model_path = "../Models/biobert_ner_model/checkpoint-5181"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

model.eval()

  from .autonotebook import tqdm as notebook_tqdm


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [10]:
label_list = [
    "O",
    "B-SYMPTOM",
    "I-SYMPTOM",
    "B-TREATMENT",
    "I-TREATMENT"
]

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

In [11]:
def run_ner(text):
    tokens = text.split()
    inputs = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt"
    )

    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    labels = [id2label[p] for p in predictions]

    return list(zip(tokens, labels))



In [12]:
def extract_entities(tokens, labels):
    entities = {"Symptoms": [], "Treatment": []}
    curr, curr_type = [], None

    for t, l in zip(tokens, labels):
        if l.startswith("B-"):
            if curr:
                entities[curr_type].append(" ".join(curr))
            curr = [t]
            curr_type = "Symptoms" if "SYMPTOM" in l else "Treatment"

        elif l.startswith("I-") and curr:
            curr.append(t)
        else:
            if curr:
                entities[curr_type].append(" ".join(curr))
                curr, curr_type = [], None

    if curr:
        entities[curr_type].append(" ".join(curr))

    return entities

In [13]:
def extract_diagnosis(text):
    keywords = ["diagnosed with", "diagnosis", "impression"]
    for line in text.split("."):
        if any(k in line.lower() for k in keywords):
            return line.strip()
    return "Not mentioned"

In [14]:
def extract_prognosis(text):
    prognosis_terms = ["expected", "recovery", "improving", "stable"]
    for line in text.split("."):
        if any(p in line.lower() for p in prognosis_terms):
            return line.strip()
    return "Not mentioned"

In [15]:
def build_final_json(text, patient_name="Janet Jones"):
    ner_out = run_ner(text)
    tokens, labels = zip(*ner_out)
    ents = extract_entities(tokens, labels)

    return {
        "Patient_Name": patient_name,
        "Symptoms": ents["Symptoms"],
        "Diagnosis": extract_diagnosis(text),
        "Treatment": ents["Treatment"],
        "Current_Status": ents["Symptoms"][-1] if ents["Symptoms"] else "Not mentioned",
        "Prognosis": extract_prognosis(text)
    }

In [26]:
build_final_json("Did you receive treatment? Patient: Yes, I had ten physiotherapy sessions, and now I only have occasional back pain.", "Nitin")

{'Patient_Name': 'Nitin',
 'Symptoms': [],
 'Diagnosis': 'Not mentioned',
 'Treatment': ['Patient:', 'I', 'only have occasional back pain.'],
 'Current_Status': 'Not mentioned',
 'Prognosis': 'Not mentioned'}