In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
from transformers import DataCollatorForTokenClassification
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
import tensorflow as tf
import evaluate
import numpy as np
from seqeval.metrics import classification_report as seqeval_classification_report
import pandas as pd
from collections import Counter
import random
import os
import re

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)



In [3]:
import re

def parse_annotations(file1_path, file2_path, file3_path):
    entities = []

    # Parse the first file
    with open(file1_path, "r") as f1:
        for line in f1:
            if line.startswith("T"):  # Entity line
                parts = line.strip().split("\t")
                tag_number, entity_info, text = parts
                entity_type, *offsets = entity_info.split(" ")

                # Handle multiple offset ranges
                offset_ranges = " ".join(offsets).split(";")
                for offset_range in offset_ranges:
                    try:
                        start_offset, end_offset = map(int, offset_range.split(" "))
                        entities.append([start_offset, end_offset, entity_type])
                    except ValueError:
                        continue

    # Parse the second file and add mappings
    with open(file2_path, "r") as f2:
        for line in f2:
            if line.startswith("TT"):
                parts = re.split(r'[\t;\+" "]', line)
                for entry in parts:
                    if entry.isdigit():
                        entry = int(entry)
                        for count, tag in enumerate(entities):
                            if entry == tag[0]:
                                entities[count].append(parts[1])

    # Parse the third file and add entries
    with open(file3_path, "r") as f3:
        for line in f3:
            if line.startswith("TT"):
                parts = line.strip().split("|")  # Split by tab
                print(parts)
                offset = parts[-1].split("\t")[-2].split(" ")
                
                if len(parts) > 1:
                    for entry in offset:
                        if entry.isdigit():

                            entry = int(entry)
                            for count, tag in enumerate(entities):
                                if entry == tag[0]:
                                    entities[count].append(parts[1])
                            break
    entities = list(map(list, set(map(tuple, entities))))
    return entities

print(parse_annotations("cadecv2/original/ARTHROTEC.105.ann", 
                        "cadecv2/meddra/ARTHROTEC.105.ann", 
                        "cadecv2/sct/ARTHROTEC.105.ann"))



['TT1\t300888008 ', ' Swelling of body region ', ' 55 59;60 68\tbody swelling']
['TT8\t278528006 ', ' Facial swelling ', ' 60 68;70 74\tswelling face']
['TT9\t298941006 ', ' Swelling of wrist joint ', ' 60 68;76 82\tswelling wrists']
['TT10\t60728008 ', ' Abdominal swelling ', ' 60 68;84 91\tswelling abdomen']
['TT11\t449614009 ', ' Swelling of lower limb ', ' 60 68;93 99\tswelling thighs']
['TT5\t193462001 ', ' Insomnia ', ' 121 129\tInsomina']
['TT2\t55533009 ', ' Forgetful ', ' + 40917007', 'Confusion', ' 152 179\tforgetfulnes and confussion']
['TT6\t225014007 ', ' Feeling empty ', ' 223 246\t"empty stomach" feeling']
['TT3\t398032003 ', ' Loose stool ', ' 282 294\tloose stools']
['TT7\tCONCEPT_LESS 365 385\tHA (Hyaluronic Acid)']
['TT4\t3384011000036100 ', ' Arthrotec ', ' 397 406\tArthrotec']
[[397, 406, 'Drug', ' Arthrotec '], [282, 294, 'ADR', '10024840', ' Loose stool '], [70, 74, 'ADR', '10016065'], [60, 68, 'ADR', '10042674', '10016065', '10042707', '10042679', '10011301', ' 

In [4]:
def parse_annotations(file1_path, file2_path, file3_path):
    entities = []
    meddra_entities = []
    sct_entities = []

    # Parse the first file
    with open(file1_path, "r") as f1:
        for line in f1:
            if line.startswith("T"):  # Entity line
                parts = line.strip().split("\t")
                tag_number, entity_info, text = parts
                entity_type, *offsets = entity_info.split(" ")
                offset_ranges = " ".join(offsets).split(";")
                for offset_range in offset_ranges:
                    try:
                        start_offset, end_offset = map(int, offset_range.split(" "))
                        entities.append([start_offset, end_offset, entity_type])
                    except ValueError:
                        continue

    # Parse the second file for MedDRA mappings
    if os.path.exists(file2_path):
        with open(file2_path, "r") as f2:
            for line in f2:
                if line.startswith("TT"):
                    parts = re.split(r'[\t;\+\" "]', line)
                    for entry in parts:
                        if entry.isdigit():
                            entry = int(entry)
                            for tag in entities:
                                if entry == tag[0]:
                                    meddra_entities.append(tag + [parts[1]])

    # Parse the third file for SCT mappings
    if os.path.exists(file3_path):
        with open(file3_path, "r") as f3:
            for line in f3:
                if line.startswith("TT"):
                    parts = line.strip().split("|")
                    try:
                        offset = parts[-1].split("\t")[-2].split(" ")
                    except Exception:
                        continue
                    if len(parts) > 1:
                        for entry in offset:
                            if entry.isdigit():
                                entry = int(entry)
                                for tag in entities:
                                    if entry == tag[0]:
                                        sct_entities.append(tag + [parts[1]])

    entities = list(map(list, set(map(tuple, entities))))
    meddra_entities = list(map(list, set(map(tuple, meddra_entities))))
    sct_entities = list(map(list, set(map(tuple, sct_entities))))

    return entities, meddra_entities, sct_entities

def create_iob_labels(text, entities):
    labels = ["O"] * len(text)
    label_details = [None] * len(text)

    for start, end, entity_type, *potential_labels in entities:
        primary_label = entity_type
        potential_ids = potential_labels if potential_labels else []
        labels[start] = f"B-{primary_label}"
        label_details[start] = potential_ids

        for i in range(start + 1, end):
            labels[i] = f"I-{primary_label}"
            label_details[i] = potential_ids

    return labels, label_details

def tokenize_and_label(text, labels, label_details):
    token_labels = []
    text_index = 0

    while text_index < len(text):
        if text[text_index].isspace():
            text_index += 1
            continue

        if re.match(r"\W", text[text_index]):
            label = labels[text_index]
            token_detail = label_details[text_index]
            detail = f"\t{' '.join(token_detail)}" if token_detail else ""
            token_labels.append((text[text_index], label, detail))
            text_index += 1
        else:
            end_index = text_index
            while end_index < len(text) and re.match(r"\w", text[end_index]):
                end_index += 1

            token = text[text_index:end_index]
            token_label = labels[text_index:end_index]
            label = token_label[0] if token_label else "O"
            token_detail = label_details[text_index]
            detail = f"\t{' '.join(token_detail)}" if token_detail else ""
            token_labels.append((token, label, detail))
            text_index = end_index

    return token_labels

def write_to_file(output_path, token_labels):
    with open(output_path, "w") as f:
        for token, label, detail in token_labels:
            # Write formatted token, label, and detail
            detail_str = f"\t{detail}" if detail else ""
            f.write(f"{token}\t{label}{detail_str}\n")
            if token == ".":
                f.write("\n")

# Define paths
base_path = "cadecv2"
original_path = os.path.join(base_path, "original")
text_path = os.path.join(base_path, "text")
meddra_path = os.path.join(base_path, "meddra")
sct_path = os.path.join(base_path, "sct")
output_org_path = os.path.join("OutputFolder", "trainOrg.txt")
output_meddra_path = os.path.join("OutputFolder", "trainMeddra.txt")
output_sct_path = os.path.join("OutputFolder", "trainSct.txt")
output_all_labels_path = os.path.join("OutputFolder", "trainAll.txt")

output_org_lines = []
output_meddra_lines = []
output_sct_lines = []
output_all_labels_lines = []

for text_file in os.listdir(text_path):
    text_file_path = os.path.join(text_path, text_file)
    annotation_file_path = os.path.join(original_path, text_file.replace(".txt", ".ann"))
    meddra_mapping_path = os.path.join(meddra_path, text_file.replace(".txt", ".ann"))
    sct_mapping_path = os.path.join(sct_path, text_file.replace(".txt", ".ann"))

    if os.path.exists(annotation_file_path):
        with open(text_file_path, "r") as f:
            text = f.read()

        entities, meddra_entities, sct_entities = parse_annotations(annotation_file_path, meddra_mapping_path, sct_mapping_path)

        # Process Org labels
        labels, label_details = create_iob_labels(text, entities)
        token_labels_original = tokenize_and_label(text, labels, label_details)
        output_org_lines.extend(token_labels_original)

        # Process MedDRA labels
        labels, label_details = create_iob_labels(text, meddra_entities)
        token_labels_meddra = tokenize_and_label(text, labels, label_details)
        output_meddra_lines.extend(token_labels_meddra)

        # Process SCT labels
        labels, label_details = create_iob_labels(text, sct_entities)
        token_labels_sct = tokenize_and_label(text, labels, label_details)
        output_sct_lines.extend(token_labels_sct)

# Write trainOrg.txt
write_to_file(output_org_path, output_org_lines)
write_to_file(output_meddra_path, output_meddra_lines)
write_to_file(output_sct_path, output_sct_lines)

# Make trainAll.txt identical to trainMeddra.txt
import shutil
shutil.copy(output_meddra_path, output_all_labels_path)


'OutputFolder\\trainAll.txt'

In [5]:
# Open the source and target files
with open('OutputFolder/trainSct.txt', 'r') as source_file, open('OutputFolder/trainAll.txt', 'r+') as target_file:
    # Read all lines from trainAll.txt into a list
    all_lines = target_file.readlines()
    
    # Loop through each line in the source file
    for count, line in enumerate(source_file):
        # Remove trailing whitespace and split the line into entries
        entries = line.strip().split()
        labels = []
        
        # Collect labels starting from the 3rd entry
        for entry in entries[2:]:
            labels.append(entry)
        
        # Ensure we're within bounds for trainAll.txt
        if count < len(all_lines):
            # Add a tab and then the labels to the corresponding line in trainAll.txt
            all_lines[count] = all_lines[count].strip() + '\t' + ' '.join(labels) + '\n'
        else:
            print(f"Warning: Line {count + 1} in trainMeddra.txt exceeds trainAll.txt lines.")
    
    # Move to the beginning of the file and write the updated lines
    target_file.seek(0)
    target_file.writelines(all_lines)
    target_file.truncate()

In [6]:
from collections import Counter

# Define the input file path
input_file_path = "OutputFolder/trainOrg.txt"  # Replace with your file path

# Initialize a counter for labels
label_counts = Counter()

# Processing the file to count labels
with open(input_file_path, "r") as file:
    for line in file:
        line = line.strip()
        if "\t" in line:  # Process only lines with a tab (word-label pairs)
            _, label = line.split("\t")
            label_counts[label] += 1

# Display the counts for each label
print("Label counts:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

Label counts:
O: 102670
B-ADR: 6777
I-ADR: 9143
B-Drug: 1802
B-Disease: 285
B-Symptom: 289
I-Symptom: 277
I-Disease: 206
I-Drug: 237
B-Finding: 452
I-Finding: 414


In [7]:
import re

# Normalize file by standardizing line endings and cleaning whitespace
def normalize_file(input_path, output_path, encoding='ISO-8859-1'):
    """
    Normalize a text file by standardizing line endings and removing extra whitespace.
    """
    with open(input_path, 'r', encoding=encoding) as infile:
        content = infile.read()
    
    # Normalize line endings and remove trailing/leading spaces
    content = content.replace('\r\n', '\n').replace('\r', '\n')
    content = re.sub(r'[ \t]+$', '', content, flags=re.MULTILINE)  # Remove trailing spaces
    content = re.sub(r'\n{3,}', '\n\n', content)  # Replace multiple empty lines with a double newline
    content = content.strip()
    
    with open(output_path, 'w', encoding=encoding) as outfile:
        outfile.write(content + '\n')

# Normalize both files
reference_normalized = 'OutputFolder/trainOrg_normalized.txt'
target_normalized = 'OutputFolder/trainAll_normalized.txt'

normalize_file('Outputfolder/trainOrg.txt', reference_normalized)
normalize_file('OutputFolder/trainAll.txt', target_normalized)

# Check sentence counts again
def count_sentences(file_path, encoding='ISO-8859-1'):
    with open(file_path, 'r', encoding=encoding) as file:
        content = file.read()
    return len(content.strip().split('\n\n'))

org_sentence_count = count_sentences(reference_normalized)
all_sentence_count = count_sentences(target_normalized)

print(f"Reference file sentences: {org_sentence_count}")
print(f"Target file sentences: {all_sentence_count}")

# Align normalized files
def align_sentences_token_by_token(reference_path, target_path, output_path, encoding='ISO-8859-1'):
    """
    Align sentences at a token level to ensure both line and sentence consistency.
    """
    with open(reference_path, 'r', encoding=encoding) as ref_file, open(target_path, 'r', encoding=encoding) as tgt_file:
        ref_sentences = ref_file.read().strip().split('\n\n')
        tgt_sentences = tgt_file.read().strip().split('\n\n')

    if len(ref_sentences) != len(tgt_sentences):
        raise ValueError("Sentence counts still do not match after normalization. Please inspect the files.")

    aligned_sentences = []

    for i, (ref_sentence, tgt_sentence) in enumerate(zip(ref_sentences, tgt_sentences)):
        ref_tokens = [line.strip() for line in ref_sentence.split('\n') if line.strip()]
        tgt_tokens = [line.strip() for line in tgt_sentence.split('\n') if line.strip()]

        if len(ref_tokens) != len(tgt_tokens):
            print(f"❗ Token count mismatch in sentence {i+1}: Expected {len(ref_tokens)} tokens, got {len(tgt_tokens)} tokens")
            # Fallback alignment
            aligned_sentence = '\n'.join(tgt_tokens[:len(ref_tokens)])
        else:
            aligned_sentence = '\n'.join(tgt_tokens)
        
        aligned_sentences.append(aligned_sentence)

    with open(output_path, 'w', encoding=encoding) as out_file:
        out_file.write('\n\n'.join(aligned_sentences) + '\n')

    print(f"Aligned file saved to {output_path}")


# Final Alignment
output_file = 'OutputFolder/aligned_trainAll_normalized.txt'
align_sentences_token_by_token(reference_normalized, target_normalized, output_file)


Reference file sentences: 7520
Target file sentences: 7520
Aligned file saved to OutputFolder/aligned_trainAll_normalized.txt


In [8]:
import random
import os

# Create output folder if it doesn't exist
output_folder = 'Train-test-split'
os.makedirs(output_folder, exist_ok=True)

# Define file paths
files = {
    "Org": "OutputFolder/trainOrg.txt",
    "Meddra": "OutputFolder/trainMeddra.txt",
    "Sct": "OutputFolder/trainSct.txt",
    "All": "OutputFolder/aligned_trainAll_normalized.txt"
}

# Define output files in Train-test-split folder
output_files = {
    "train": {key: os.path.join(output_folder, f"train_{key}.txt") for key in files},
    "test": os.path.join(output_folder, "test_Org.txt"),
    "validation": os.path.join(output_folder, "validation_Org.txt")
}

# Helper function to read sentences from a file
def read_sentences(filepath):
    with open(filepath, 'r') as file:
        content = file.read().strip()
    # Normalize line endings and split sentences by double newlines
    content = content.replace('\r\n', '\n').strip()
    sentences = [sentence.strip() for sentence in content.split('\n\n') if sentence.strip()]
    return sentences

# Count sentences in all files
for key, path in files.items():
    sentences = read_sentences(path)
    print(f"{key}: {len(sentences)} sentences")

# Helper function to write sentences to a file
def write_sentences(filepath, sentences):
    with open(filepath, 'w') as file:
        file.write('\n\n'.join(sentences) + '\n')

# Step 1: Read sentences from all files
sentences = {key: read_sentences(files[key]) for key in files}

# Check consistency
sentence_count = len(sentences['Org'])
assert all(len(sentences[key]) == sentence_count for key in sentences), "Sentence counts are inconsistent across files!"

# Step 2: Randomly split indices into train, test, and validation
indices = list(range(sentence_count))
random.shuffle(indices)

train_ratio, test_ratio, val_ratio = 0.8, 0.1, 0.1
train_end = int(train_ratio * sentence_count)
test_end = train_end + int(test_ratio * sentence_count)

train_idx = indices[:train_end]
test_idx = indices[train_end:test_end]
val_idx = indices[test_end:]

# Step 3: Write to output files

# Write train splits (same sentences for all four train files)
for key in files:
    train_sentences = [sentences[key][i] for i in train_idx]
    write_sentences(output_files['train'][key], train_sentences)

# Write test and validation splits (only for Org)
test_sentences = [sentences['Org'][i] for i in test_idx]
validation_sentences = [sentences['Org'][i] for i in val_idx]

write_sentences(output_files['test'], test_sentences)
write_sentences(output_files['validation'], validation_sentences)

print(f"✅ Train/Test/Validation split completed successfully! Files saved in '{output_folder}'")


Org: 7520 sentences
Meddra: 7520 sentences
Sct: 7520 sentences
All: 7520 sentences
✅ Train/Test/Validation split completed successfully! Files saved in 'Train-test-split'


Test for training with different dimensionality
Org:

In [9]:
from datasets import Dataset, DatasetDict, Sequence, ClassLabel

# Define the new labels
label_names = [
    "O", "B-ADR", "I-ADR", "B-Drug", "I-Drug",
    "B-Disease", "I-Disease", "B-Symptom", "I-Symptom",
    "B-Finding", "I-Finding"
]

# Create a mapping from label to integer ID
label_mapping = {label: idx for idx, label in enumerate(label_names)}

# Function to map labels to integers
def label_to_id(label):
    return label_mapping.get(label, -100)  # Return -100 for unknown labels

# Function to read the BIO file
def read_bio_file(filepath):
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": []}

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            orgline = line
            line = line.strip()
            if line == "":  # Sentence boundary
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "ner_tags": []}
            else:
                # Split the line into token and label
                parts = line.split()
                if len(parts) == 2:  # Only process lines with exactly two parts
                    token, label = parts
                    current_sentence["tokens"].append(token)
                    current_sentence["ner_tags"].append(label_to_id(label))
                else:
                    # TODO check how to add these
                    #print(f"Skipping malformed line: {orgline}")
                    continue

        # Add the last sentence if the file doesn't end with a blank line
        if current_sentence["tokens"]:
            sentences.append(current_sentence)

    return sentences

# Read the training dataset
train_data = read_bio_file("Train-test-split/train_Org.txt")
val_data = read_bio_file("Train-test-split/validation_Org.txt")
test_data = read_bio_file("Train-test-split/test_Org.txt")

# Load data into the HuggingFace dataset structure
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "tokens": [d["tokens"] for d in train_data],
        "ner_tags": [d["ner_tags"] for d in train_data]
    }),
    "test": Dataset.from_dict({
        "tokens": [d["tokens"] for d in test_data],
        "ner_tags": [d["ner_tags"] for d in test_data]
    }),
    "validation": Dataset.from_dict({
        "tokens": [d["tokens"] for d in val_data],
        "ner_tags": [d["ner_tags"] for d in val_data]
    })
})

# Define the ClassLabel feature for NER tags
ner_feature = ClassLabel(names=label_names)

# Cast the ner_tags column to use the ClassLabel feature
dataset = dataset.cast_column("ner_tags", Sequence(ner_feature))

# Display the dataset structure
print(dataset)

Casting the dataset:   0%|          | 0/6016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 6016
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 752
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 752
    })
})


In [10]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Pre-processing the data and tokenize

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/6016 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

In [11]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="tf"
)

In [12]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [13]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [14]:
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")
# TODO change:
num_epochs = 1
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [16]:
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

 14/376 [>.............................] - ETA: 27:07 - loss: 1.5826

KeyboardInterrupt: 

Meddra test:

In [4]:
from datasets import Dataset, DatasetDict, Sequence, ClassLabel, Value

# Define the new labels
label_names = [
    "O", "B-ADR", "I-ADR", "B-Drug", "I-Drug",
    "B-Disease", "I-Disease", "B-Symptom", "I-Symptom",
    "B-Finding", "I-Finding"
]

# Create a mapping from label to integer ID
label_mapping = {label: idx for idx, label in enumerate(label_names)}

# Function to map labels to integers
def label_to_id(label):
    return label_mapping.get(label, -100)  # Return -100 for unknown labels

# Function to read the BIO file with multi-annotations
def read_bio_file(filepath):
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            orgline = line
            line = line.strip()
            if line == "":  # Sentence boundary
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}
            else:
                # Split the line into token, label, and optional code
                parts = line.split()
                if len(parts) == 3:  # Token, label, and ADR code
                    token, label, code = parts
                    current_sentence["tokens"].append(token)
                    current_sentence["ner_tags"].append(label_to_id(label))
                    current_sentence["adr_codes"].append(code)
                elif len(parts) == 2:  # Token and label only
                    token, label = parts
                    current_sentence["tokens"].append(token)
                    current_sentence["ner_tags"].append(label_to_id(label))
                    current_sentence["adr_codes"].append(None)  # No ADR code
                else:
                    # Malformed line
                    print(f"Skipping malformed line: {orgline}")
                    continue

        # Add the last sentence if the file doesn't end with a blank line
        if current_sentence["tokens"]:
            sentences.append(current_sentence)

    return sentences

# Read the datasets
train_data = read_bio_file("Train-test-split/train_Meddra.txt")
val_data = read_bio_file("Train-test-split/validation_Org.txt")
test_data = read_bio_file("Train-test-split/test_Org.txt")

# Load data into the HuggingFace dataset structure
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "tokens": [d["tokens"] for d in train_data],
        "ner_tags": [d["ner_tags"] for d in train_data],
        "adr_codes": [d["adr_codes"] for d in train_data]
    }),
    "test": Dataset.from_dict({
        "tokens": [d["tokens"] for d in test_data],
        "ner_tags": [d["ner_tags"] for d in test_data],
        "adr_codes": [d["adr_codes"] for d in test_data]
    }),
    "validation": Dataset.from_dict({
        "tokens": [d["tokens"] for d in val_data],
        "ner_tags": [d["ner_tags"] for d in val_data],
        "adr_codes": [d["adr_codes"] for d in val_data]
    })
})

# Define features for the dataset

ner_feature = datasets.ClassLabel(names=label_names)

dataset = dataset.cast_column("ner_tags", datasets.Sequence(ner_feature))

# Display the dataset structure
print(dataset)


Casting the dataset:   0%|          | 0/6016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'adr_codes'],
        num_rows: 6016
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'adr_codes'],
        num_rows: 752
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'adr_codes'],
        num_rows: 752
    })
})


In [4]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Pre-processing the data and tokenize

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/6016 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

In [5]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="tf"
)

In [6]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [7]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [8]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

num_epochs = 1
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [10]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

  9/376 [..............................] - ETA: 33:59 - loss: 1.7298

KeyboardInterrupt: 

SCT

In [3]:
import chardet  # Library for detecting file encoding

from datasets import Dataset, DatasetDict, Sequence, ClassLabel, Value

# Define the new labels
label_names = [
    "O", "B-ADR", "I-ADR", "B-Drug", "I-Drug",
    "B-Disease", "I-Disease", "B-Symptom", "I-Symptom",
    "B-Finding", "I-Finding"
]

# Create a mapping from label to integer ID
label_mapping = {label: idx for idx, label in enumerate(label_names)}

# Function to map labels to integers
def label_to_id(label):
    return label_mapping.get(label, -100)  # Return -100 for unknown labels



# Detect file encoding
def detect_encoding(filepath):
    with open(filepath, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
    return result['encoding']

# Function to read the BIO file with multi-annotations
def read_bio_file(filepath):
    encoding = detect_encoding(filepath)  # Auto-detect encoding
    print(f"Detected encoding for {filepath}: {encoding}")
    
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}

    try:
        with open(filepath, "r", encoding=encoding) as f:
            for line in f:
                orgline = line
                line = line.strip()
                if line == "":  # Sentence boundary
                    if current_sentence["tokens"]:
                        sentences.append(current_sentence)
                        current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}
                else:
                    # Split line with a maximum of 2 splits
                    parts = line.split(maxsplit=2)
                    if len(parts) == 3:  # Token, label, and ADR code
                        token, label, code = parts
                        current_sentence["tokens"].append(token)
                        current_sentence["ner_tags"].append(label_to_id(label))
                        current_sentence["adr_codes"].append(code)
                    elif len(parts) == 2:  # Token and label only
                        token, label = parts
                        current_sentence["tokens"].append(token)
                        current_sentence["ner_tags"].append(label_to_id(label))
                        current_sentence["adr_codes"].append(None)  # No ADR code
                    else:
                        # Malformed line
                        print(f"Skipping malformed line: {orgline}")
                        continue

            # Add the last sentence if the file doesn't end with a blank line
            if current_sentence["tokens"]:
                sentences.append(current_sentence)
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}. Retrying with ISO-8859-1 encoding.")
        with open(filepath, "r", encoding="ISO-8859-1") as f:
            for line in f:
                orgline = line
                line = line.strip()
                if line == "":  # Sentence boundary
                    if current_sentence["tokens"]:
                        sentences.append(current_sentence)
                        current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}
                else:
                    parts = line.split(maxsplit=2)
                    if len(parts) == 3:
                        token, label, code = parts
                        current_sentence["tokens"].append(token)
                        current_sentence["ner_tags"].append(label_to_id(label))
                        current_sentence["adr_codes"].append(code)
                    elif len(parts) == 2:
                        token, label = parts
                        current_sentence["tokens"].append(token)
                        current_sentence["ner_tags"].append(label_to_id(label))
                        current_sentence["adr_codes"].append(None)
                    else:
                        print(f"Skipping malformed line: {orgline}")
                        continue

            if current_sentence["tokens"]:
                sentences.append(current_sentence)

    return sentences


# Read the datasets
train_data = read_bio_file("Train-test-split/train_Org.txt")
val_data = read_bio_file("Train-test-split/validation_Org.txt")
test_data = read_bio_file("Train-test-split/test_Org.txt")

# Load data into the HuggingFace dataset structure
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "tokens": [d["tokens"] for d in train_data],
        "ner_tags": [d["ner_tags"] for d in train_data],
        "adr_codes": [d["adr_codes"] for d in train_data]
    }),
    "test": Dataset.from_dict({
        "tokens": [d["tokens"] for d in test_data],
        "ner_tags": [d["ner_tags"] for d in test_data],
        "adr_codes": [d["adr_codes"] for d in test_data]
    }),
    "validation": Dataset.from_dict({
        "tokens": [d["tokens"] for d in val_data],
        "ner_tags": [d["ner_tags"] for d in val_data],
        "adr_codes": [d["adr_codes"] for d in val_data]
    })
})

# Define features for the dataset
ner_feature = Sequence(ClassLabel(names=label_names))
adr_feature = Sequence(Value("string"))

# Cast columns
dataset = dataset.cast_column("ner_tags", ner_feature)
dataset = dataset.cast_column("adr_codes", adr_feature)

# Display the dataset structure
print(dataset)



Detected encoding for Train-test-split/train_Org.txt: ascii
Detected encoding for Train-test-split/validation_Org.txt: ascii
Detected encoding for Train-test-split/test_Org.txt: ascii


Casting the dataset:   0%|          | 0/6016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags', 'adr_codes'],
        num_rows: 6016
    })
    test: Dataset({
        features: ['tokens', 'ner_tags', 'adr_codes'],
        num_rows: 752
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags', 'adr_codes'],
        num_rows: 752
    })
})


In [6]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Pre-processing the data and tokenize

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/6016 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

In [7]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="tf"
)

In [8]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [9]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [10]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

num_epochs = 1
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [12]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

  1/376 [..............................] - ETA: 2:53:58 - loss: 2.3073

KeyboardInterrupt: 

### Final code functions

In [None]:
import chardet  # Library for detecting file encoding

from datasets import Dataset, DatasetDict, Sequence, ClassLabel, Value

# Define the new labels
label_names = [
    "O", "B-ADR", "I-ADR", "B-Drug", "I-Drug",
    "B-Disease", "I-Disease", "B-Symptom", "I-Symptom",
    "B-Finding", "I-Finding"
]

# Create a mapping from label to integer ID
label_mapping = {label: idx for idx, label in enumerate(label_names)}

# Function to map labels to integers
def label_to_id(label):
    return label_mapping.get(label, -100)  # Return -100 for unknown labels



# Detect file encoding
def detect_encoding(filepath):
    with open(filepath, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
    return result['encoding']

# Function to read the BIO file with multi-annotations
def read_bio_file(filepath):
    encoding = detect_encoding(filepath)  # Auto-detect encoding
    print(f"Detected encoding for {filepath}: {encoding}")
    
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}

    try:
        with open(filepath, "r", encoding=encoding) as f:
            for line in f:
                orgline = line
                line = line.strip()
                if line == "":  # Sentence boundary
                    if current_sentence["tokens"]:
                        sentences.append(current_sentence)
                        current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}
                else:
                    # Split line with a maximum of 2 splits
                    parts = line.split(maxsplit=2)
                    if len(parts) == 3:  # Token, label, and ADR code
                        token, label, code = parts
                        current_sentence["tokens"].append(token)
                        current_sentence["ner_tags"].append(label_to_id(label))
                        current_sentence["adr_codes"].append(code)
                    elif len(parts) == 2:  # Token and label only
                        token, label = parts
                        current_sentence["tokens"].append(token)
                        current_sentence["ner_tags"].append(label_to_id(label))
                        current_sentence["adr_codes"].append(None)  # No ADR code
                    else:
                        # Malformed line
                        print(f"Skipping malformed line: {orgline}")
                        continue

            # Add the last sentence if the file doesn't end with a blank line
            if current_sentence["tokens"]:
                sentences.append(current_sentence)
    except UnicodeDecodeError as e:
        print(f"UnicodeDecodeError: {e}. Retrying with ISO-8859-1 encoding.")
        with open(filepath, "r", encoding="ISO-8859-1") as f:
            for line in f:
                orgline = line
                line = line.strip()
                if line == "":  # Sentence boundary
                    if current_sentence["tokens"]:
                        sentences.append(current_sentence)
                        current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}
                else:
                    parts = line.split(maxsplit=2)
                    if len(parts) == 3:
                        token, label, code = parts
                        current_sentence["tokens"].append(token)
                        current_sentence["ner_tags"].append(label_to_id(label))
                        current_sentence["adr_codes"].append(code)
                    elif len(parts) == 2:
                        token, label = parts
                        current_sentence["tokens"].append(token)
                        current_sentence["ner_tags"].append(label_to_id(label))
                        current_sentence["adr_codes"].append(None)
                    else:
                        print(f"Skipping malformed line: {orgline}")
                        continue

            if current_sentence["tokens"]:
                sentences.append(current_sentence)

    return sentences


# Read the datasets
train_data = read_bio_file("Train-test-split/train_Meddra.txt")
val_data = read_bio_file("Train-test-split/validation_Org.txt")
test_data = read_bio_file("Train-test-split/test_Org.txt")

# Load data into the HuggingFace dataset structure
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "tokens": [d["tokens"] for d in train_data],
        "ner_tags": [d["ner_tags"] for d in train_data],
        "adr_codes": [d["adr_codes"] for d in train_data]
    }),
    "test": Dataset.from_dict({
        "tokens": [d["tokens"] for d in test_data],
        "ner_tags": [d["ner_tags"] for d in test_data],
        "adr_codes": [d["adr_codes"] for d in test_data]
    }),
    "validation": Dataset.from_dict({
        "tokens": [d["tokens"] for d in val_data],
        "ner_tags": [d["ner_tags"] for d in val_data],
        "adr_codes": [d["adr_codes"] for d in val_data]
    })
})

# Define features for the dataset
ner_feature = Sequence(ClassLabel(names=label_names))
adr_feature = Sequence(Value("string"))

# Cast columns
dataset = dataset.cast_column("ner_tags", ner_feature)
dataset = dataset.cast_column("adr_codes", adr_feature)

# Display the dataset structure
print(dataset)


# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Pre-processing the data and tokenize

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="tf"
)

# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)

# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

num_epochs = 1
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

In [4]:
import chardet
from datasets import Dataset, DatasetDict, Sequence, ClassLabel, Value
from transformers import AutoTokenizer, DataCollatorForTokenClassification, TFAutoModelForTokenClassification, create_optimizer
import tensorflow as tf

# Define label names and mappings
label_names = [
    "O", "B-ADR", "I-ADR", "B-Drug", "I-Drug",
    "B-Disease", "I-Disease", "B-Symptom", "I-Symptom",
    "B-Finding", "I-Finding"
]
label_mapping = {label: idx for idx, label in enumerate(label_names)}


def label_to_id(label):
    """Map a label to its corresponding ID."""
    return label_mapping.get(label, -100)


def detect_encoding(filepath):
    """Detect file encoding using chardet."""
    with open(filepath, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
    return result['encoding']


def read_bio_file(filepath):
    """Read a BIO file and extract tokens, NER tags, and ADR codes."""
    encoding = detect_encoding(filepath)
    print(f"Detected encoding for {filepath}: {encoding}")
    
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}

    with open(filepath, "r", encoding=encoding) as f:
        for line in f:
            line = line.strip()
            if line == "":
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}
            else:
                parts = line.split(maxsplit=2)
                if len(parts) == 3:
                    token, label, code = parts
                    current_sentence["tokens"].append(token)
                    current_sentence["ner_tags"].append(label_to_id(label))
                    current_sentence["adr_codes"].append(code)
                elif len(parts) == 2:
                    token, label = parts
                    current_sentence["tokens"].append(token)
                    current_sentence["ner_tags"].append(label_to_id(label))
                    current_sentence["adr_codes"].append(None)
                else:
                    print(f"Skipping malformed line: {line}")
    
    if current_sentence["tokens"]:
        sentences.append(current_sentence)
    
    return sentences


def create_dataset(train_file, val_file, test_file):
    """Create a DatasetDict from BIO files."""
    train_data = read_bio_file(train_file)
    val_data = read_bio_file(val_file)
    test_data = read_bio_file(test_file)
    
    dataset = DatasetDict({
        "train": Dataset.from_dict({
            "tokens": [d["tokens"] for d in train_data],
            "ner_tags": [d["ner_tags"] for d in train_data],
            "adr_codes": [d["adr_codes"] for d in train_data]
        }),
        "validation": Dataset.from_dict({
            "tokens": [d["tokens"] for d in val_data],
            "ner_tags": [d["ner_tags"] for d in val_data],
            "adr_codes": [d["adr_codes"] for d in val_data]
        }),
        "test": Dataset.from_dict({
            "tokens": [d["tokens"] for d in test_data],
            "ner_tags": [d["ner_tags"] for d in test_data],
            "adr_codes": [d["adr_codes"] for d in test_data]
        })
    })

    ner_feature = Sequence(ClassLabel(names=label_names))
    adr_feature = Sequence(Value("string"))

    dataset = dataset.cast_column("ner_tags", ner_feature)
    dataset = dataset.cast_column("adr_codes", adr_feature)
    
    return dataset

In [5]:
def align_labels_with_tokens(labels, word_ids):
    """Align labels with tokens after tokenization."""
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels


def tokenize_and_align_labels(dataset, tokenizer):
    """Tokenize dataset and align labels."""
    def tokenize_fn(examples):
        tokenized_inputs = tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        all_labels = examples["ner_tags"]
        new_labels = []
        for i, labels in enumerate(all_labels):
            word_ids = tokenized_inputs.word_ids(i)
            new_labels.append(align_labels_with_tokens(labels, word_ids))
        tokenized_inputs["labels"] = new_labels
        return tokenized_inputs
    
    return dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

In [6]:
def prepare_tf_datasets(tokenized_datasets, tokenizer):
    """Convert tokenized datasets to TensorFlow datasets."""
    data_collator = DataCollatorForTokenClassification(
        tokenizer=tokenizer,
        return_tensors="tf"
    )
    
    tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
        collate_fn=data_collator,
        shuffle=True,
        batch_size=16,
    )
    
    tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
        collate_fn=data_collator,
        shuffle=False,
        batch_size=16,
    )
    
    tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
        collate_fn=data_collator,
        shuffle=False,
        batch_size=16,
    )
    
    return tf_train_dataset, tf_eval_dataset, tf_test_dataset

In [7]:
def initialize_model(model_checkpoint):
    """Initialize the Token Classification model."""
    id2label = {i: label for i, label in enumerate(label_names)}
    label2id = {v: k for k, v in id2label.items()}
    
    model = TFAutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        id2label=id2label,
        label2id=label2id,
    )
    return model

In [8]:
def compile_model(model, train_dataset, num_epochs=1):
    """Compile the model with optimizer and learning rate schedule."""
    num_train_steps = len(train_dataset) * num_epochs
    
    optimizer, schedule = create_optimizer(
        init_lr=2e-5,
        num_warmup_steps=0,
        num_train_steps=num_train_steps,
        weight_decay_rate=0.01,
    )
    
    model.compile(optimizer=optimizer)
    return model

In [9]:
# Load and preprocess dataset
dataset = create_dataset("Train-test-split/train_All.txt", "Train-test-split/validation_Org.txt", "Train-test-split/test_Org.txt")
tokenized_datasets = tokenize_and_align_labels(dataset, tokenizer)

# Prepare TensorFlow datasets
tf_train, tf_val, tf_test = prepare_tf_datasets(tokenized_datasets, tokenizer)

# Initialize and compile model
model = initialize_model('bert-base-cased')
model = compile_model(model, tf_train)

# Training (fit separately)
model.fit(tf_train, validation_data=tf_val, epochs=1)

Detected encoding for Train-test-split/train_All.txt: ISO-8859-1
Detected encoding for Train-test-split/validation_Org.txt: ascii
Detected encoding for Train-test-split/test_Org.txt: ascii


Casting the dataset:   0%|          | 0/6016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Map:   0%|          | 0/6016 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  3/376 [..............................] - ETA: 26:22 - loss: 2.4600

KeyboardInterrupt: 