In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
from transformers import DataCollatorForTokenClassification
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
import tensorflow as tf
import evaluate
import numpy as np
from seqeval.metrics import classification_report as seqeval_classification_report
import pandas as pd
from collections import Counter
import random
import os
import re

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Define paths
base_path = "cadecv2"
original_path = os.path.join(base_path, "original")
text_path = os.path.join(base_path, "text")
output_path = os.path.join(base_path, "train.txt")

# Function to parse annotations with semicolon-sliced offsets
def parse_annotations(file_path):
    entities = []
    with open(file_path, "r") as f:
        for line in f:
            if line.startswith("T"):  # Entity line
                parts = line.strip().split("\t")
                tag_number, entity_info, text = parts
                entity_type, *offsets = entity_info.split(" ")
                offset_ranges = " ".join(offsets).split(";")  # Handle semicolon-separated offsets
                for offset_range in offset_ranges:
                    start_offset, end_offset = map(int, offset_range.split(" "))
                    entities.append((start_offset, end_offset, entity_type))
    return entities

# Function to create IOB labeling
def create_iob_labels(text, entities):
    labels = ["O"] * len(text)  # Initialize all tokens with "O"
    for start, end, entity_type in entities:
        labels[start] = f"B-{entity_type}"
        for i in range(start + 1, end):
            labels[i] = f"I-{entity_type}"
    return labels

# Function to tokenize text and align labels
def tokenize_and_label(text, labels):
    tokens = text.split()
    token_labels = []
    text_index = 0
    for token in tokens:
        token_length = len(token)
        if any(char.isalnum() for char in token):  # Skip punctuation
            token_label = labels[text_index : text_index + token_length]
            label = token_label[0] if token_label else "O"
            token_labels.append((token, label))
        else:
            token_labels.append((token, "O"))
        text_index += token_length + 1  # Move index past the token and space
    return token_labels

# Process files
output_lines = []
for text_file in os.listdir(text_path):
    text_file_path = os.path.join(text_path, text_file)
    annotation_file_path = os.path.join(original_path, text_file.replace(".txt", ".ann"))
    
    if os.path.exists(annotation_file_path):
        # Read text
        with open(text_file_path, "r") as f:
            text = f.read()
        
        # Parse annotations and create labels
        entities = parse_annotations(annotation_file_path)
        labels = create_iob_labels(text, entities)
        token_labels = tokenize_and_label(text, labels)
        
        # Write to output
        for token, label in token_labels:
            output_lines.append(f"{token}\t{label}")
            if token.endswith("."):  # Add a blank line after sentences
                output_lines.append("\n")

# Write the output to train.txt
with open(output_path, "w") as f:
    f.write("\n".join(output_lines))

In [3]:
# List of special characters to remove
special_characters = [".", ","]

# Cleaning process
with open("cadecv2/train.txt", "r") as file:
    lines = file.readlines()

cleaned_lines = []
for line in lines:
    # Remove special characters
    for char in special_characters:
        line = line.replace(char, "")
    cleaned_lines.append(line)

# Writing the cleaned data to a new file
with open("train2.txt", "w") as file:
    file.writelines(cleaned_lines)


In [4]:
# Processing the file
with open("train2.txt", "r") as file:
    lines = file.readlines()

processed_lines = []
for line in lines:
    line = line.strip()
    if not line:  # Skip empty lines
        processed_lines.append("\n")
        continue

    if "\t" in line:  # Process only lines with a tab (word-label pairs)
        word, label = line.split("\t")
        if "'" in word:  # Check if the word contains an apostrophe
            base, suffix = word.split("'", 1)  # Split the word at the apostrophe
            processed_lines.append(f"{base}\t{label}\n")  # Add the base part
            processed_lines.append(f"'{suffix}\t{label}\n")  # Add the suffix with the same label
        else:
            processed_lines.append(line + "\n")  # Add the original line
    else:
        processed_lines.append(line + "\n")  # Add lines without tabs as is

# Writing the processed data to a new file
with open("train3.txt", "w") as file:
    file.writelines(processed_lines)

In [5]:
from collections import Counter

# Define the input file path
input_file_path = "train3.txt"  # Replace with your file path

# Initialize a counter for labels
label_counts = Counter()

# Processing the file to count labels
with open(input_file_path, "r") as file:
    for line in file:
        line = line.strip()
        if "\t" in line:  # Process only lines with a tab (word-label pairs)
            _, label = line.split("\t")
            label_counts[label] += 1

# Display the counts for each label
print("Label counts:")
for label, count in label_counts.items():
    print(f"{label}: {count}")

Label counts:
O: 85867
B-ADR: 6469
I-ADR: 8702
B-Drug: 1761
B-Disease: 288
B-Symptom: 285
I-Symptom: 266
I-Disease: 171
I-Drug: 176
B-Finding: 450
I-Finding: 392


In [6]:
import random

# File paths
input_file_path = "train3.txt"  # Replace with your file path
train_file_path = "trainFinal.txt"
test_file_path = "test.txt"
validation_file_path = "validation.txt"

# Percentages for splitting
test_split = 0.10
validation_split = 0.10

# Read the input file
with open(input_file_path, "r", encoding="utf-8") as file:
    lines = file.readlines()

# Separate sentences by blank lines
sentences = []
current_sentence = []
for line in lines:
    if line.strip():  # If the line is not empty
        current_sentence.append(line)
    else:  # If a blank line is encountered, store the sentence
        if current_sentence:
            sentences.append(current_sentence)
            current_sentence = []

# Add the last sentence if file doesn't end with a blank line
if current_sentence:
    sentences.append(current_sentence)

# Shuffle sentences
random.shuffle(sentences)

# Split sentences into train, test, and validation sets
test_size = int(len(sentences) * test_split)
validation_size = int(len(sentences) * validation_split)

test_sentences = sentences[:test_size]
validation_sentences = sentences[test_size:test_size + validation_size]
train_sentences = sentences[test_size + validation_size:]

# Function to write sentences to a file
def write_sentences_to_file(sentences, file_path):
    with open(file_path, "w", encoding="utf-8") as file:
        for sentence in sentences:
            for line in sentence:
                file.write(line)
            file.write("\n")  # Add a blank line between sentences

# Write the splits to respective files
write_sentences_to_file(train_sentences, train_file_path)
write_sentences_to_file(test_sentences, test_file_path)
write_sentences_to_file(validation_sentences, validation_file_path)

In [7]:
from datasets import Dataset, DatasetDict, Sequence, ClassLabel

# Define the new labels
label_names = [
    "O", "B-ADR", "I-ADR", "B-Drug", "I-Drug",
    "B-Disease", "I-Disease", "B-Symptom", "I-Symptom",
    "B-Finding", "I-Finding"
]

# Create a mapping from label to integer ID
label_mapping = {label: idx for idx, label in enumerate(label_names)}

# Function to map labels to integers
def label_to_id(label):
    return label_mapping.get(label, -100)  # Return -100 for unknown labels

# Function to read the BIO file
def read_bio_file(filepath):
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": []}

    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            orgline = line
            line = line.strip()
            if line == "":  # Sentence boundary
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "ner_tags": []}
            else:
                # Split the line into token and label
                parts = line.split()
                if len(parts) == 2:  # Only process lines with exactly two parts
                    token, label = parts
                    current_sentence["tokens"].append(token)
                    current_sentence["ner_tags"].append(label_to_id(label))
                else:
                    # TODO check how to add these
                    #print(f"Skipping malformed line: {orgline}")
                    continue

        # Add the last sentence if the file doesn't end with a blank line
        if current_sentence["tokens"]:
            sentences.append(current_sentence)

    return sentences

# Read the training dataset
train_data = read_bio_file("trainFinal.txt")
val_data = read_bio_file("validation.txt")
test_data = read_bio_file("test.txt")

# Load data into the HuggingFace dataset structure
dataset = DatasetDict({
    "train": Dataset.from_dict({
        "tokens": [d["tokens"] for d in train_data],
        "ner_tags": [d["ner_tags"] for d in train_data]
    }),
    "test": Dataset.from_dict({
        "tokens": [d["tokens"] for d in test_data],
        "ner_tags": [d["ner_tags"] for d in test_data]
    }),
    "validation": Dataset.from_dict({
        "tokens": [d["tokens"] for d in val_data],
        "ner_tags": [d["ner_tags"] for d in val_data]
    })
})

# Define the ClassLabel feature for NER tags
ner_feature = ClassLabel(names=label_names)

# Cast the ner_tags column to use the ClassLabel feature
dataset = dataset.cast_column("ner_tags", Sequence(ner_feature))

# Display the dataset structure
print(dataset)


Casting the dataset: 100%|██████████| 5858/5858 [00:00<00:00, 127491.21 examples/s]
Casting the dataset: 100%|██████████| 730/730 [00:00<00:00, 188583.51 examples/s]
Casting the dataset: 100%|██████████| 733/733 [00:00<00:00, 43399.56 examples/s]

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 5858
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 730
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 733
    })
})





In [8]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

# Pre-processing the data and tokenize

def align_labels_with_tokens(labels, word_ids):
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            # Start of a new word!
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            # Special token
            new_labels.append(-100)
        else:
            # Same word as previous token
            label = labels[word_id]
            # If the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    all_labels = examples["ner_tags"]
    new_labels = []
    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(i)
        new_labels.append(align_labels_with_tokens(labels, word_ids))
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

tokenized_datasets = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset["train"].column_names,
)


Map: 100%|██████████| 5858/5858 [00:01<00:00, 4751.53 examples/s]
Map: 100%|██████████| 730/730 [00:00<00:00, 6464.57 examples/s]
Map: 100%|██████████| 733/733 [00:00<00:00, 4611.47 examples/s]


A model is finetuned with the default hyperparameters settings on the train set and evaluate the model on the test set. These are the <b>baseline</b> results.

In [10]:
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    return_tensors="tf"
)

In [11]:
# Code obtained from: https://huggingface.co/learn/nlp-course/chapter7/2?fw=pt

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=16,
)

tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=16,
)

In [12]:
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

In [13]:
model = TFAutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
)




All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Train in mixed-precision float16
tf.keras.mixed_precision.set_global_policy("mixed_float16")

num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=num_train_steps,
    weight_decay_rate=0.01,
)
model.compile(optimizer=optimizer)

The dtype policy mixed_float16 may run slowly because this machine does not have a GPU. Only Nvidia GPUs with compute capability of at least 7.0 run quickly with mixed_float16.


In [None]:
model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

Epoch 1/3

Epoch 2/3
 44/367 [==>...........................] - ETA: 16:49 - loss: 0.2602

Tot hier gebleven

In [None]:
# Load seqeval metric
metric = evaluate.load("seqeval")

all_predictions = []
all_labels = []

# Evaluate on the test set
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    batch_labels = batch["labels"].numpy()  # Avoid overwriting variable 'labels'
    batch_predictions = np.argmax(logits, axis=-1)  # Avoid overwriting variable 'predictions'

    for pred, true_label in zip(batch_predictions, batch_labels):
        pred_sequence = []
        label_sequence = []
        for predicted_idx, label_idx in zip(pred, true_label):
            if label_idx == -100:  # Skip padding
                continue
            pred_label = label_names[predicted_idx]
            true_label_str = label_names[label_idx]
            pred_sequence.append(pred_label)
            label_sequence.append(true_label_str)
        all_predictions.append(pred_sequence)
        all_labels.append(label_sequence)

# Compute overall metrics
results = metric.compute(predictions=all_predictions, references=all_labels)

print("Overall Metrics:")
print(f"Precision: {results['overall_precision']:.4f}")
print(f"Recall: {results['overall_recall']:.4f}")
print(f"F1-Score: {results['overall_f1']:.4f}")
print(f"Accuracy: {results['overall_accuracy']:.4f}")

# Use seqeval's classification_report to get detailed per-entity metrics
print("\nPer-Entity Metrics:")
print(seqeval_classification_report(all_labels, all_predictions))

# Calculate metrics for each label individually
print("\nMetrics Per Label:")
label_metrics = {}
for label in ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ART', 'I-ART', 'I-MAT', 'B-MAT', 'I-CON', 'B-CON', 'I-SPE', 'B-SPE']:
    # Filter predictions and references for the current label
    label_predictions = [
        [tag if tag == label else "O" for tag in pred_seq]
        for pred_seq in all_predictions
    ]
    label_references = [
        [tag if tag == label else "O" for tag in true_seq]
        for true_seq in all_labels
    ]

    # Compute precision, recall, and F1 score for the specific label
    label_result = metric.compute(predictions=label_predictions, references=label_references)
    label_metrics[label] = {
        "precision": label_result['overall_precision'],
        "recall": label_result['overall_recall'],
        "f1": label_result['overall_f1']
    }

# Print metrics for each label
for label, metrics in label_metrics.items():
    print(f"Label: {label}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1-Score: {metrics['f1']:.4f}")

Set up hyperparameter optimization.

In [None]:
params = {
    "learning_rates": [1e-5, 2e-5, 3e-5],
    "batch_sizes": [8, 16, 32],
    "weight_decays": [0.01, 0.001, 0.0001]
}

def random_search(params, results):
    results = pd.DataFrame(columns=["learning_rate", "batch_size", "weight_decay", "val_loss"])
    # Perform a random search on 10 combinations
    for i in range(10):
        lr = random.choice(params["learning_rates"])
        batch_size = random.choice(params["batch_sizes"])
        weight_decay = random.choice(params["weight_decays"])
        print(f"Training with learning rate: {lr}, Batch size: {batch_size}, and weight decay: {weight_decay}")

        # Create tf datasets with the current batch size
        tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
            columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
            collate_fn=data_collator,
            shuffle=True,
            batch_size=batch_size,
        )
        tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
            columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
            collate_fn=data_collator,
            shuffle=False,
            batch_size=batch_size,
        )

        # Set up optimizer with the current learning rate and weight decay
        optimizer, schedule = create_optimizer(
            init_lr=lr,
            num_warmup_steps=0,
            num_train_steps=len(tf_train_dataset) * num_epochs,
            weight_decay_rate=weight_decay,
        )
        model.compile(optimizer=optimizer)

        # Train the model
        history = model.fit(
            tf_train_dataset,
            validation_data=tf_eval_dataset,
            epochs=num_epochs,
        )

        # Get the validation loss for the final epoch
        val_loss = history.history['val_loss'][-1]
        # Append the results as a new DataFrame and concatenate
        new_row = pd.DataFrame({
            "learning_rate": [lr],
            "batch_size": [batch_size],
            "weight_decay": [weight_decay],
            "val_loss": [val_loss]
        })
        results = pd.concat([results, new_row], ignore_index=True)
    return results

results = pd.DataFrame(columns=["learning_rate", "batch_size", "weight_decay", "val_loss"])
results = random_search(params, results)

# Display all results
print(results)
results.to_csv("Results_question_4.csv")

# After optimization, evaluate on test set with best hyperparameters
best_params = results.loc[results['val_loss'].idxmin()]
print("Best hyperparameters:", best_params)

In [None]:
# Re-train with best hyperparameters and evaluate on test set
best_lr = float(best_params["learning_rate"])
best_batch_size = int(best_params["batch_size"])
best_weight_decay = float(best_params["weight_decay"])

print(best_lr, best_batch_size, best_weight_decay)

tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=True,
    batch_size=best_batch_size,
)
tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
    collate_fn=data_collator,
    shuffle=False,
    batch_size=best_batch_size,
)

# Set up optimizer with the current learning rate and weight decay
optimizer, schedule = create_optimizer(
    init_lr=best_lr,
    num_warmup_steps=0,
    num_train_steps=len(tf_train_dataset) * num_epochs,
    weight_decay_rate=best_weight_decay,
)

model.compile(optimizer=optimizer)

history = model.fit(
    tf_train_dataset,
    validation_data=tf_eval_dataset,
    epochs=num_epochs,
)

# Evaluate on test set
all_predictions = []
all_labels = []
for batch in tf_test_dataset:# Load seqeval metric
metric = evaluate.load("seqeval")

all_predictions = []
all_labels = []

# Evaluate on the test set
for batch in tf_test_dataset:
    logits = model.predict_on_batch(batch)["logits"]
    batch_labels = batch["labels"].numpy()  # Avoid overwriting variable 'labels'
    batch_predictions = np.argmax(logits, axis=-1)  # Avoid overwriting variable 'predictions'

    for pred, true_label in zip(batch_predictions, batch_labels):
        pred_sequence = []
        label_sequence = []
        for predicted_idx, label_idx in zip(pred, true_label):
            if label_idx == -100:  # Skip padding
                continue
            pred_label = label_names[predicted_idx]
            true_label_str = label_names[label_idx]
            pred_sequence.append(pred_label)
            label_sequence.append(true_label_str)
        all_predictions.append(pred_sequence)
        all_labels.append(label_sequence)

# Compute overall metrics
results = metric.compute(predictions=all_predictions, references=all_labels)

print("Overall Metrics:")
print(f"Precision: {results['overall_precision']:.4f}")
print(f"Recall: {results['overall_recall']:.4f}")
print(f"F1-Score: {results['overall_f1']:.4f}")
print(f"Accuracy: {results['overall_accuracy']:.4f}")

# Use seqeval's classification_report to get detailed per-entity metrics
print("\nPer-Entity Metrics:")
print(seqeval_classification_report(all_labels, all_predictions))

# Calculate metrics for each label individually
print("\nMetrics Per Label:")
label_metrics = {}
for label in ['O', 'B-PER', 'I-PER', 'B-LOC', 'I-LOC', 'B-ART', 'I-ART', 'I-MAT', 'B-MAT', 'I-CON', 'B-CON', 'I-SPE', 'B-SPE']:
    # Filter predictions and references for the current label
    label_predictions = [
        [tag if tag == label else "O" for tag in pred_seq]
        for pred_seq in all_predictions
    ]
    label_references = [
        [tag if tag == label else "O" for tag in true_seq]
        for true_seq in all_labels
    ]

    # Compute precision, recall, and F1 score for the specific label
    label_result = metric.compute(predictions=label_predictions, references=label_references)
    label_metrics[label] = {
        "precision": label_result['overall_precision'],
        "recall": label_result['overall_recall'],
        "f1": label_result['overall_f1']
    }

# Print metrics for each label
for label, metrics in label_metrics.items():
    print(f"Label: {label}")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1-Score: {metrics['f1']:.4f}")
    logits = model.predict_on_batch(batch)["logits"]
    labels = batch["labels"]
    predictions = np.argmax(logits, axis=-1)
    for prediction, label in zip(predictions, labels):
        for predicted_idx, label_idx in zip(prediction, label):
            if label_idx == -100:
                continue
            all_predictions.append(label_names[predicted_idx])
            all_labels.append(label_names[label_idx])

test_metric = metric.compute(predictions=[all_predictions], references=[all_labels])
print("Test set performance:", test_metric)

In [None]:
# Count label occurrences in the training dataset
label_counter = Counter()
for batch in tf_train_dataset:
    labels = batch["labels"].numpy()
    for label_seq in labels:
        for label in label_seq:
            if label != -100:  # Exclude padding labels
                label_counter[label_names[label]] += 1

print("Label distribution in the training set:")
for label, count in label_counter.items():
    print(f"{label}: {count}")