In [26]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
%autoreload
import os
import sys
from typing import Any

from tqdm.notebook import tqdm

curr_dir = os.getcwd()
while os.path.basename(curr_dir) != "candidate_ranking":
    curr_dir = os.path.dirname(curr_dir)
sys.path.append(curr_dir)

In [28]:
from src.codiesp import CODIESP

In [29]:
PATH_DATASET_SYNTHETIC = os.path.join(curr_dir, "data", "codiesp.synthetic.csv")

In [30]:
CODIESP.head()

Unnamed: 0,text_path,text_id,sentence
0,S1130-05582007000500007-1.txt,0,The patient is a 38-year-old male who attended...
1,S1130-05582007000500007-1.txt,0,1.
2,S1130-05582007000500007-1.txt,0,The pathological antecedents include being exa...
3,S1130-05582007000500007-1.txt,0,The examination revealed a right mandibular tu...
4,S1130-05582007000500007-1.txt,0,The orthopantomography showed a mixed lesion w...


# Task 1: Injecting Random Transcription Errors

We’ll write a function to:
Randomly introduce errors (typos, swaps, deletions, insertions).
Prioritize errors in medical entities if specified.
Ensure a user-defined ratio of altered words.

In [31]:
%autoreload

import random
import re
import pandas as pd

def introduce_typo(word):
    """Introduce a random character-level error in a word."""
    if len(word) < 3:
        return word  # Avoid breaking very short words
    typo_type = random.choice(["swap", "delete", "insert", "substitute"])

    if typo_type == "swap" and len(word) > 1:
        idx = random.randint(0, len(word) - 2)
        return word[:idx] + word[idx + 1] + word[idx] + word[idx + 2:]
    elif typo_type == "delete":
        idx = random.randint(0, len(word) - 1)
        return word[:idx] + word[idx + 1:]
    elif typo_type == "insert":
        idx = random.randint(0, len(word))
        char = random.choice("abcdefghijklmnopqrstuvwxyz")
        return word[:idx] + char + word[idx:]
    elif typo_type == "substitute":
        idx = random.randint(0, len(word) - 1)
        char = random.choice("abcdefghijklmnopqrstuvwxyz")
        return word[:idx] + char + word[idx + 1:]
    return word

def corrupt_text(text, medical_entities, entity_error_prob=0.5, other_word_prob=0.1):
    """
    Introduces errors in text based on probabilities:
    - entity_error_prob: probability of corrupting a medical entity
    - other_word_prob: probability of corrupting any other word
    """
    words = text.split()
    corrupted_words = []
    
    for word in words:
        if word in medical_entities and random.random() < entity_error_prob:
            corrupted_words.append(introduce_typo(word))
        elif word not in medical_entities and random.random() < other_word_prob:
            corrupted_words.append(introduce_typo(word))
        else:
            corrupted_words.append(word)
    
    return " ".join(corrupted_words)

In [32]:
CODIESP

Unnamed: 0,text_path,text_id,sentence
0,S1130-05582007000500007-1.txt,0,The patient is a 38-year-old male who attended...
1,S1130-05582007000500007-1.txt,0,1.
2,S1130-05582007000500007-1.txt,0,The pathological antecedents include being exa...
3,S1130-05582007000500007-1.txt,0,The examination revealed a right mandibular tu...
4,S1130-05582007000500007-1.txt,0,The orthopantomography showed a mixed lesion w...
...,...,...,...
8027,S0365-66912007001200011-1.txt,499,"In the denuded area, an amniotic membrane graf..."
8028,S0365-66912007001200011-1.txt,499,During follow-up there was a progressive reepi...
8029,S0365-66912007001200011-1.txt,499,Three weeks after surgery there was a regular ...
8030,S0365-66912007001200011-1.txt,499,The VA of the left eye improved to 4/10.


In [33]:
dataset = CODIESP.copy()
corrupted_dataset = dataset.copy()

def positive_case(x: Any) -> int:
    return 1

def negative_case(x: Any) -> int:
    return 0

In [34]:
dataset["label"] = dataset["sentence"].apply(negative_case)

In [35]:
# medical_entities = set(dataset["entities"].explode().dropna())  # Extract medical entities from dataset

corrupted_dataset["noisy_sentence"] = corrupted_dataset["sentence"].apply(
    lambda x: corrupt_text(x, [], entity_error_prob=0.7, other_word_prob=0.3)
)

corrupted_dataset = (
    corrupted_dataset
    .drop("sentence", axis=1)
    .rename(columns={"noisy_sentence": "sentence"})
)
corrupted_dataset["label"] = corrupted_dataset.sentence.apply(positive_case)

# Merge and shuffle dataset
final_dataset = pd.concat([dataset, corrupted_dataset])
final_dataset.to_csv(PATH_DATASET_SYNTHETIC, index=False)

Explanation:
1. Medical terms from CANTEMIST are identified and corrupted preferentially based on entity_error_prob.
2. Non-medical words are corrupted with a lower probability other_word_prob.
3. The output dataset (corrupted_CANTEMIST.csv) contains:
   1. Original sentences (label = 0)
   2. Corrupted sentences (label = 1)

# Task 2: Training ClinicalBERT to Detect Errors

Now, we’ll train ClinicalBERT on the synthetic dataset to detect transcription errors.

In [36]:
%autoreload
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Load ClinicalBERT
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Load dataset
dataset = load_dataset("csv", data_files={"train": PATH_DATASET_SYNTHETIC})
dataset = dataset["train"].train_test_split(test_size=0.2)

print(dataset)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text_path', 'text_id', 'sentence', 'label'],
        num_rows: 12851
    })
    test: Dataset({
        features: ['text_path', 'text_id', 'sentence', 'label'],
        num_rows: 3213
    })
})


In [37]:
# Tokenize function
def tokenize_function(examples):
    return tokenizer(
        examples["sentence"],
        padding="max_length",
        truncation=True, 
        max_length=128
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Training settings
training_args = TrainingArguments(
    output_dir="./error_detection",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train model
trainer.train()

Map:   0%|          | 0/12851 [00:00<?, ? examples/s]

Map:   0%|          | 0/3213 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.257,0.197244
2,0.1887,0.271014
3,0.1422,0.26781


TrainOutput(global_step=4821, training_loss=0.20143407708108488, metrics={'train_runtime': 1437.5051, 'train_samples_per_second': 26.819, 'train_steps_per_second': 3.354, 'total_flos': 2535930129323520.0, 'train_loss': 0.20143407708108488, 'epoch': 3.0})

# 3. Error Detection on New Transcriptions

Once trained, ClinicalBERT can now classify new transcriptions for errors.

In [38]:
from transformers import pipeline

error_detector = pipeline("text-classification", model=model, tokenizer=tokenizer)

test_text = "The patient has dyspnaea and needs immdeiate attention."
score = error_detector(test_text)
print(score)  # [{'label': '1' (error), 'score': 0.95}]


Device set to use mps:0


[{'label': 'LABEL_1', 'score': 0.9998667240142822}]


In [40]:
test_text = "The patient has dyspnea and needs immediate attention."
score = error_detector(test_text)
score

[{'label': 'LABEL_0', 'score': 0.9850120544433594}]

- A high probability means the model detects errors.
- A low probability means the transcription is likely correct.

# Next Steps

1. Evaluate performance: Compute F1-score, precision, recall to ensure good error detection.
2. Refine error generation: Introduce more realistic mistakes (homophones, phonetic errors).
3. Fine-tune for specific error types: Add subcategories like misrecognized words vs. omitted words.