In [32]:
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
from transformers import DataCollatorForTokenClassification
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
import tensorflow as tf
import evaluate
import numpy as np
from seqeval.metrics import classification_report as seqeval_classification_report
import pandas as pd
from collections import Counter
import random
import os
import re

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [41]:
# Define paths
base_path = "cadecv2"
original_path = os.path.join(base_path, "original")
text_path = os.path.join(base_path, "text")
output_path = os.path.join(base_path, "train.txt")

# Function to parse annotations with semicolon-sliced offsets
def parse_annotations(file_path):
    entities = []
    with open(file_path, "r") as f:
        for line in f:
            if line.startswith("T"):  # Entity line
                parts = line.strip().split("\t")
                tag_number, entity_info, text = parts
                entity_type, *offsets = entity_info.split(" ")
                offset_ranges = " ".join(offsets).split(";")  # Handle semicolon-separated offsets
                for offset_range in offset_ranges:
                    start_offset, end_offset = map(int, offset_range.split(" "))
                    entities.append((start_offset, end_offset, entity_type))
    return entities

# Function to create IOB labeling
def create_iob_labels(text, entities):
    labels = ["O"] * len(text)  # Initialize all tokens with "O"
    for start, end, entity_type in entities:
        labels[start] = f"B-{entity_type}"
        for i in range(start + 1, end):
            labels[i] = f"I-{entity_type}"
    return labels

# Function to tokenize text and align labels
def tokenize_and_label(text, labels):
    tokens = text.split()
    token_labels = []
    text_index = 0
    for token in tokens:
        token_length = len(token)
        if any(char.isalnum() for char in token):  # Skip punctuation
            token_label = labels[text_index : text_index + token_length]
            label = token_label[0] if token_label else "O"
            token_labels.append((token, label))
        else:
            token_labels.append((token, "O"))
        text_index += token_length + 1  # Move index past the token and space
    return token_labels

# Process files
output_lines = []
for text_file in os.listdir(text_path):
    text_file_path = os.path.join(text_path, text_file)
    annotation_file_path = os.path.join(original_path, text_file.replace(".txt", ".ann"))
    
    if os.path.exists(annotation_file_path):
        # Read text
        with open(text_file_path, "r") as f:
            text = f.read()
        
        # Parse annotations and create labels
        entities = parse_annotations(annotation_file_path)
        labels = create_iob_labels(text, entities)
        token_labels = tokenize_and_label(text, labels)
        
        # Write to output
        for token, label in token_labels:
            output_lines.append(f"{token}\t{label}")
            if token.endswith("."):  # Add a blank line after sentences
                output_lines.append("\n")

# Write the output to train.txt
with open(output_path, "w") as f:
    f.write("\n".join(output_lines))

In [42]:
# List of special characters to remove
special_characters = [".", ","]

# Cleaning process
with open("cadecv2/train.txt", "r") as file:
    lines = file.readlines()

cleaned_lines = []
for line in lines:
    # Remove special characters
    for char in special_characters:
        line = line.replace(char, "")
    cleaned_lines.append(line)

# Writing the cleaned data to a new file
with open("train2.txt", "w") as file:
    file.writelines(cleaned_lines)


In [43]:
# Processing the file
with open("train2.txt", "r") as file:
    lines = file.readlines()

processed_lines = []
for line in lines:
    line = line.strip()
    if not line:  # Skip empty lines
        processed_lines.append("\n")
        continue

    if "\t" in line:  # Process only lines with a tab (word-label pairs)
        word, label = line.split("\t")
        if "'" in word:  # Check if the word contains an apostrophe
            base, suffix = word.split("'", 1)  # Split the word at the apostrophe
            processed_lines.append(f"{base}\t{label}\n")  # Add the base part
            processed_lines.append(f"'{suffix}\t{label}\n")  # Add the suffix with the same label
        else:
            processed_lines.append(line + "\n")  # Add the original line
    else:
        processed_lines.append(line + "\n")  # Add lines without tabs as is

# Writing the processed data to a new file
with open("train3.txt", "w") as file:
    file.writelines(processed_lines)

In [44]:
from collections import Counter

# Define the input file path
input_file_path = "train3.txt"  # Replace with your file path

# Initialize a counter for labels
label_counts = Counter()

# Processing the file to count labels
with open(input_file_path, "r") as file:
    for line in file:
        line = line.strip()
        if "\t" in line:  # Process only lines with a tab (word-label pairs)
            _, label = line.split("\t")
            label_counts[label] += 1

# Display the counts for each label
print("Label counts:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

Label counts:
O: 85867
B-ADR: 6469
I-ADR: 8702
B-Drug: 1761
B-Disease: 288
B-Symptom: 285
I-Symptom: 266
I-Disease: 171
I-Drug: 176
B-Finding: 450
I-Finding: 392


In [50]:
import random

# File paths
input_file_path = "train3.txt"  # Replace with your file path
train_file_path = "trainFinal.txt"
test_file_path = "test.txt"
validation_file_path = "validation.txt"

# Percentages for splitting
test_split = 0.10
validation_split = 0.10

# Read the input file
with open(input_file_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Separate sentences by blank lines
sentences = []
current_sentence = []
for line in lines:
    if line.strip():  # If the line is not empty
        current_sentence.append(line)
    else:  # If a blank line is encountered, store the sentence
        if current_sentence:
            sentences.append(current_sentence)
            current_sentence = []

# Add the last sentence if file doesn't end with a blank line
if current_sentence:
    sentences.append(current_sentence)

# Shuffle sentences
random.shuffle(sentences)

# Split sentences into train, test, and validation sets
test_size = int(len(sentences) * test_split)
validation_size = int(len(sentences) * validation_split)

test_sentences = sentences[:test_size]
validation_sentences = sentences[test_size:test_size + validation_size]
train_sentences = sentences[test_size + validation_size:]

# Function to write sentences to a file
def write_sentences_to_file(sentences, file_path):
    with open(file_path, "w", encoding="utf-8") as file:
        for sentence in sentences:
            for line in sentence:
                file.write(line)
            file.write("\n")  # Add a blank line between sentences

# Write the splits to respective files
write_sentences_to_file(train_sentences, train_file_path)
write_sentences_to_file(test_sentences, test_file_path)
write_sentences_to_file(validation_sentences, validation_file_path)

In [54]:
from datasets import Dataset, DatasetDict, Sequence, ClassLabel

# Define the new labels
label_names = [
    "O", "B-ADR", "I-ADR", "B-Drug", "I-Drug",
    "B-Disease", "I-Disease", "B-Symptom", "I-Symptom",
    "B-Finding", "I-Finding"
]

# Create a mapping from label to integer ID
label_mapping = {label: idx for idx, label in enumerate(label_names)}

# Function to map labels to integers
def label_to_id(label):
    return label_mapping.get(label, -100)  # Return -100 for unknown labels

# Function to read the BIO file
def read_bio_file(filepath):
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": []}

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            orgline = line
            line = line.strip()
            if line == "":  # Sentence boundary
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "ner_tags": []}
            else:
                # Split the line into token and label
                parts = line.split()
                if len(parts) == 2:  # Only process lines with exactly two parts
                    token, label = parts
                    current_sentence["tokens"].append(token)
                    current_sentence["ner_tags"].append(label_to_id(label))
                else:
                    # TODO check how to add these
                    #print(f"Skipping malformed line: {orgline}")
                    continue

        # Add the last sentence if the file doesn't end with a blank line
        if current_sentence["tokens"]:
            sentences.append(current_sentence)

    return sentences

# Read the training dataset
train_data = read_bio_file("trainFinal.txt")
val_data = read_bio_file("validation.txt")
test_data = read_bio_file("test.txt")

# Load data into the HuggingFace dataset structure
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "tokens": [d["tokens"] for d in train_data],
        "ner_tags": [d["ner_tags"] for d in train_data]
    }),
    "test": Dataset.from_dict({
        "tokens": [d["tokens"] for d in test_data],
        "ner_tags": [d["ner_tags"] for d in test_data]
    }),
    "validation": Dataset.from_dict({
        "tokens": [d["tokens"] for d in val_data],
        "ner_tags": [d["ner_tags"] for d in val_data]
    })
})

# Define the ClassLabel feature for NER tags
ner_feature = ClassLabel(names=label_names)

# Cast the ner_tags column to use the ClassLabel feature
dataset = dataset.cast_column("ner_tags", Sequence(ner_feature))

# Display the dataset structure
print(dataset)


Casting the dataset: 100%|██████████| 5859/5859 [00:00<00:00, 109086.35 examples/s]
Casting the dataset: 100%|██████████| 732/732 [00:00<00:00, 20763.46 examples/s]
Casting the dataset: 100%|██████████| 730/730 [00:00<00:00, 308529.01 examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 5859
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 732
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 730
    })
})



