In [1]:
%load_ext autoreload

In [2]:
%autoreload
import os
import sys
from typing import Any

import pandas as pd
from tqdm.notebook import tqdm
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset


curr_dir = os.getcwd()
while os.path.basename(curr_dir) != "candidate_ranking":
    curr_dir = os.path.dirname(curr_dir)
sys.path.append(curr_dir)

In [3]:
from src.codiesp import CODIESP
from src.error_injector import corrupt_text
from src.language_model import BioClinicalBert

/Users/jordi/Documents/Profesional/Búsqueda de Trabajo/Barcelona 2024/DocPlanner/candidate_ranking


100%|████████████████████████████████████████████████████████████████████████████████████| 500/500 [00:00<00:00, 3950.29it/s]


In [4]:
BIOBERT = BioClinicalBert()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
PATH_DATASET_SYNTHETIC = os.path.join(curr_dir, "data", "codiesp.synthetic.csv")

In [6]:
CODIESP.head()

Unnamed: 0,text_path,text_id,sentence
0,S1130-05582007000500007-1.txt,0,The patient is a 38-year-old male who attended...
1,S1130-05582007000500007-1.txt,0,1.
2,S1130-05582007000500007-1.txt,0,The pathological antecedents include being exa...
3,S1130-05582007000500007-1.txt,0,The examination revealed a right mandibular tu...
4,S1130-05582007000500007-1.txt,0,The orthopantomography showed a mixed lesion w...


# Task 1: Injecting Random Transcription Errors

We’ll write a function to:
Randomly introduce errors (typos, swaps, deletions, insertions).
Prioritize errors in medical entities if specified.
Ensure a user-defined ratio of altered words.

In [7]:
dataset = CODIESP.copy()
corrupted_dataset = dataset.copy()

def positive_case(x: Any) -> int:
    return 1

def negative_case(x: Any) -> int:
    return 0

In [8]:
dataset["label"] = dataset["sentence"].apply(negative_case)

In [9]:
error_rows = list(corrupted_dataset["sentence"].apply(
    lambda x: corrupt_text(x, word_error_probability=0.2, character_error_probability=0.2)
))

In [10]:
error_columns_df = pd.DataFrame(error_rows, columns=["sentence", "num_character_errors", "num_word_errors"])

corrupted_dataset = pd.concat(
    [corrupted_dataset.drop("sentence", axis=1, inplace=True), error_columns_df],
    axis=1
)

error_columns_df

Unnamed: 0,sentence,num_character_errors,num_word_errors
0,The patient is a 38-year-old male who mttended...,7,1
1,.,1,0
2,The pathological antecedents include being exa...,2,0
3,Th examination revealed a rightz mandibuylar t...,7,0
4,The orthopantomography showed av mixed lesion ...,3,1
...,...,...,...
8027,"tIn the denudeds area, an amniotic membrane gr...",4,0
8028,Durin follow-up there was ra progressive reepi...,7,0
8029,Three weeks after surgery theae was a regular ...,5,0
8030,The V of the left eye improved to 4/10.,1,0


In [11]:
corrupted_dataset["label"] = corrupted_dataset.sentence.apply(positive_case)

# Merge and shuffle dataset
final_dataset = pd.concat([dataset, corrupted_dataset])
final_dataset.to_csv(PATH_DATASET_SYNTHETIC, index=False)

In [12]:
corrupted_dataset

Unnamed: 0,sentence,num_character_errors,num_word_errors,label
0,The patient is a 38-year-old male who mttended...,7,1,1
1,.,1,0,1
2,The pathological antecedents include being exa...,2,0,1
3,Th examination revealed a rightz mandibuylar t...,7,0,1
4,The orthopantomography showed av mixed lesion ...,3,1,1
...,...,...,...,...
8027,"tIn the denudeds area, an amniotic membrane gr...",4,0,1
8028,Durin follow-up there was ra progressive reepi...,7,0,1
8029,Three weeks after surgery theae was a regular ...,5,0,1
8030,The V of the left eye improved to 4/10.,1,0,1


Explanation:
1. Medical terms from CANTEMIST are identified and corrupted preferentially based on entity_error_prob.
2. Non-medical words are corrupted with a lower probability other_word_prob.
3. The output dataset (corrupted_CANTEMIST.csv) contains:
   1. Original sentences (label = 0)
   2. Corrupted sentences (label = 1)

# Task 2: Training ClinicalBERT to Detect Errors

Now, we’ll train ClinicalBERT on the synthetic dataset to detect transcription errors.

In [13]:
# Load ClinicalBERT


# Load dataset
dataset = load_dataset("csv", data_files={"train": PATH_DATASET_SYNTHETIC})
dataset = dataset["train"].train_test_split(test_size=0.2)

print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text_path', 'text_id', 'sentence', 'label', 'num_character_errors', 'num_word_errors'],
        num_rows: 12851
    })
    test: Dataset({
        features: ['text_path', 'text_id', 'sentence', 'label', 'num_character_errors', 'num_word_errors'],
        num_rows: 3213
    })
})


In [None]:
tokenized_datasets = dataset.map(BIOBERT.tokenize, batched=True)

# Training settings
training_args = TrainingArguments(
    output_dir="./error_detection",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
)

trainer = Trainer(
    model=BIOBERT.model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

# Train model
trainer.train()

Epoch,Training Loss,Validation Loss


# 3. Error Detection on New Transcriptions

Once trained, ClinicalBERT can now classify new transcriptions for errors.

In [None]:
from transformers import pipeline

error_detector = pipeline("text-classification", model=model, tokenizer=tokenizer)

test_text = "The patient has dyspnaea and needs immdeiate attention."
score = error_detector(test_text)
print(score)  # [{'label': '1' (error), 'score': 0.95}]


In [None]:
test_text = "The patient has dyspnea and needs immediate attention."
score = error_detector(test_text)
score

- A high probability means the model detects errors.
- A low probability means the transcription is likely correct.

# Next Steps

1. Evaluate performance: Compute F1-score, precision, recall to ensure good error detection.
2. Refine error generation: Introduce more realistic mistakes (homophones, phonetic errors).
3. Fine-tune for specific error types: Add subcategories like misrecognized words vs. omitted words.