In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
import chardet  
from datasets import Dataset, DatasetDict, Sequence, ClassLabel, Value
import tensorflow as tf
from transformers import DataCollatorForTokenClassification
from transformers import TFAutoModelForTokenClassification
from transformers import create_optimizer
import tensorflow as tf
import evaluate
import numpy as np
from seqeval.metrics import classification_report as seqeval_classification_report
import pandas as pd
from collections import Counter
import random
import os
import re

model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Define label names and mappings
label_names = [
    "O", "B-ADR", "I-ADR", "B-Drug", "I-Drug",
    "B-Disease", "I-Disease", "B-Symptom", "I-Symptom",
    "B-Finding", "I-Finding"
]
label_mapping = {label: idx for idx, label in enumerate(label_names)}


def label_to_id(label):
    """Map a label to its corresponding ID."""
    return label_mapping.get(label, -100)


def detect_encoding(filepath):
    """Detect file encoding using chardet."""
    with open(filepath, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
    return result['encoding']


def read_bio_file(filepath):
    """Read a BIO file and extract tokens, NER tags, and ADR codes."""
    encoding = detect_encoding(filepath)
    print(f"Detected encoding for {filepath}: {encoding}")
    
    sentences = []
    current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}

    with open(filepath, "r", encoding=encoding) as f:
        for line in f:
            line = line.strip()
            if line == "":
                if current_sentence["tokens"]:
                    sentences.append(current_sentence)
                    current_sentence = {"tokens": [], "ner_tags": [], "adr_codes": []}
            else:
                parts = line.split(maxsplit=2)
                if len(parts) == 3:
                    token, label, code = parts
                    current_sentence["tokens"].append(token)
                    current_sentence["ner_tags"].append(label_to_id(label))
                    current_sentence["adr_codes"].append(code)
                elif len(parts) == 2:
                    token, label = parts
                    current_sentence["tokens"].append(token)
                    current_sentence["ner_tags"].append(label_to_id(label))
                    current_sentence["adr_codes"].append(None)
                else:
                    print(f"Skipping malformed line: {line}")
    
    if current_sentence["tokens"]:
        sentences.append(current_sentence)
    
    return sentences


def create_dataset(train_file, val_file, test_file):
    """Create a DatasetDict from BIO files."""
    train_data = read_bio_file(train_file)
    val_data = read_bio_file(val_file)
    test_data = read_bio_file(test_file)
    
    dataset = DatasetDict({
        "train": Dataset.from_dict({
            "tokens": [d["tokens"] for d in train_data],
            "ner_tags": [d["ner_tags"] for d in train_data],
            "adr_codes": [d["adr_codes"] for d in train_data]
        }),
        "validation": Dataset.from_dict({
            "tokens": [d["tokens"] for d in val_data],
            "ner_tags": [d["ner_tags"] for d in val_data],
            "adr_codes": [d["adr_codes"] for d in val_data]
        }),
        "test": Dataset.from_dict({
            "tokens": [d["tokens"] for d in test_data],
            "ner_tags": [d["ner_tags"] for d in test_data],
            "adr_codes": [d["adr_codes"] for d in test_data]
        })
    })

    ner_feature = Sequence(ClassLabel(names=label_names))
    adr_feature = Sequence(Value("string"))

    dataset = dataset.cast_column("ner_tags", ner_feature)
    dataset = dataset.cast_column("adr_codes", adr_feature)
    
    return dataset

def align_labels_with_tokens(labels, word_ids):
    """Align labels with tokens after tokenization."""
    new_labels = []
    current_word = None
    for word_id in word_ids:
        if word_id != current_word:
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
        elif word_id is None:
            new_labels.append(-100)
        else:
            label = labels[word_id]
            if label % 2 == 1:
                label += 1
            new_labels.append(label)
    return new_labels


def tokenize_and_align_labels(dataset, tokenizer):
    """Tokenize dataset and align labels."""
    def tokenize_fn(examples):
        tokenized_inputs = tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        all_labels = examples["ner_tags"]
        new_labels = []
        for i, labels in enumerate(all_labels):
            word_ids = tokenized_inputs.word_ids(i)
            new_labels.append(align_labels_with_tokens(labels, word_ids))
        tokenized_inputs["labels"] = new_labels
        return tokenized_inputs
    
    return dataset.map(
        tokenize_fn,
        batched=True,
        remove_columns=dataset["train"].column_names,
    )

def prepare_tf_datasets(tokenized_datasets, tokenizer):
    """Convert tokenized datasets to TensorFlow datasets."""
    data_collator = DataCollatorForTokenClassification(
        tokenizer=tokenizer,
        return_tensors="tf"
    )
    
    tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
        collate_fn=data_collator,
        shuffle=True,
        batch_size=16,
    )
    
    tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
        collate_fn=data_collator,
        shuffle=False,
        batch_size=16,
    )
    
    tf_test_dataset = tokenized_datasets["test"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
        collate_fn=data_collator,
        shuffle=False,
        batch_size=16,
    )
    
    return tf_train_dataset, tf_eval_dataset, tf_test_dataset

def initialize_model(model_checkpoint):
    """Initialize the Token Classification model."""
    id2label = {i: label for i, label in enumerate(label_names)}
    label2id = {v: k for k, v in id2label.items()}
    
    model = TFAutoModelForTokenClassification.from_pretrained(
        model_checkpoint,
        id2label=id2label,
        label2id=label2id,
    )
    return model

def compile_model(model, train_dataset, num_epochs=1):
    """Compile the model with optimizer and learning rate schedule."""
    num_train_steps = len(train_dataset) * num_epochs
    
    optimizer, schedule = create_optimizer(
        init_lr=2e-5,
        num_warmup_steps=0,
        num_train_steps=num_train_steps,
        weight_decay_rate=0.01,
    )
    
    model.compile(optimizer=optimizer)
    return model

# Load and preprocess datase
# edit training set here
dataset = create_dataset("Train-test-split/train_Org.txt", "Train-test-split/validation_Org.txt", "Train-test-split/test_Org.txt")
tokenized_datasets = tokenize_and_align_labels(dataset, tokenizer)

# Prepare TensorFlow datasets
tf_train, tf_val, tf_test = prepare_tf_datasets(tokenized_datasets, tokenizer)

# Initialize and compile model
model = initialize_model('bert-base-cased')
model = compile_model(model, tf_train)

# Training (fit separately)
model.fit(tf_train, validation_data=tf_val, epochs=3)

# Save the trained model and tokenizer
# edit name for train set
output_dir = "saved_model/Org/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")


# Load and preprocess datase
# edit training set here
dataset = create_dataset("Train-test-split/train_Meddra.txt", "Train-test-split/validation_Org.txt", "Train-test-split/test_Org.txt")
tokenized_datasets = tokenize_and_align_labels(dataset, tokenizer)

# Prepare TensorFlow datasets
tf_train, tf_val, tf_test = prepare_tf_datasets(tokenized_datasets, tokenizer)

# Initialize and compile model
model = initialize_model('bert-base-cased')
model = compile_model(model, tf_train)

# Training (fit separately)
model.fit(tf_train, validation_data=tf_val, epochs=3)

# Save the trained model and tokenizer
# edit name for train set
output_dir = "saved_model/Meddra/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

# Load and preprocess datase
# edit training set here
dataset = create_dataset("Train-test-split/train_Sct.txt", "Train-test-split/validation_Org.txt", "Train-test-split/test_Org.txt")
tokenized_datasets = tokenize_and_align_labels(dataset, tokenizer)

# Prepare TensorFlow datasets
tf_train, tf_val, tf_test = prepare_tf_datasets(tokenized_datasets, tokenizer)

# Initialize and compile model
model = initialize_model('bert-base-cased')
model = compile_model(model, tf_train)

# Training (fit separately)
model.fit(tf_train, validation_data=tf_val, epochs=3)

# Save the trained model and tokenizer
# edit name for train set
output_dir = "saved_model/Sct/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

# Load and preprocess datase
# edit training set here
dataset = create_dataset("Train-test-split/train_All.txt", "Train-test-split/validation_Org.txt", "Train-test-split/test_Org.txt")
tokenized_datasets = tokenize_and_align_labels(dataset, tokenizer)

# Prepare TensorFlow datasets
tf_train, tf_val, tf_test = prepare_tf_datasets(tokenized_datasets, tokenizer)

# Initialize and compile model
model = initialize_model('bert-base-cased')
model = compile_model(model, tf_train)

# Training (fit separately)
model.fit(tf_train, validation_data=tf_val, epochs=3)

# Save the trained model and tokenizer
# edit name for train set
output_dir = "saved_model/All/"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"Model and tokenizer saved to {output_dir}")

Detected encoding for Train-test-split/train_Org.txt: ascii
Detected encoding for Train-test-split/validation_Org.txt: ascii
Detected encoding for Train-test-split/test_Org.txt: ascii


Casting the dataset:   0%|          | 0/6016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/6016 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/752 [00:00<?, ? examples/s]

Map:   0%|          | 0/6016 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

Map:   0%|          | 0/752 [00:00<?, ? examples/s]

All PyTorch model weights were used when initializing TFBertForTokenClassification.

Some weights or buffers of the TF 2.0 model TFBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3


KeyboardInterrupt: 

In [4]:
def evaluate_model(model, tf_dataset, label_names):
    """
    Evaluate a token classification model using seqeval.

    Parameters:
        model (TFAutoModelForTokenClassification): Trained TensorFlow model.
        tf_dataset (tf.data.Dataset): TensorFlow dataset for evaluation.
        label_names (list): List of label names.

    Returns:
        dict: Overall evaluation metrics and per-label metrics.
    """
    # Load seqeval metric
    metric = evaluate.load("seqeval")

    all_predictions = []
    all_labels = []

    # Iterate over the dataset and predict
    for batch in tf_dataset:
        logits = model.predict_on_batch(batch)["logits"]
        batch_labels = batch["labels"].numpy()  # Avoid overwriting variable 'labels'
        batch_predictions = np.argmax(logits, axis=-1)  # Avoid overwriting variable 'predictions'

        for pred, true_label in zip(batch_predictions, batch_labels):
            pred_sequence = []
            label_sequence = []
            for predicted_idx, label_idx in zip(pred, true_label):
                if label_idx == -100:  # Skip padding
                    continue
                pred_label = label_names[predicted_idx]
                true_label_str = label_names[label_idx]
                pred_sequence.append(pred_label)
                label_sequence.append(true_label_str)
            all_predictions.append(pred_sequence)
            all_labels.append(label_sequence)

    # Compute overall metrics
    results = metric.compute(predictions=all_predictions, references=all_labels)
    print("Overall Metrics:")
    print(f"Precision: {results['overall_precision']:.4f}")
    print(f"Recall: {results['overall_recall']:.4f}")
    print(f"F1-Score: {results['overall_f1']:.4f}")
    print(f"Accuracy: {results['overall_accuracy']:.4f}")

    # Detailed per-entity metrics
    print("\nPer-Entity Metrics:")
    print(seqeval_classification_report(all_labels, all_predictions))

    # Metrics for each label
    print("\nMetrics Per Label:")
    label_metrics = {}
    for label in label_names:
        # Filter predictions and references for the current label
        label_predictions = [
            [tag if tag == label else "O" for tag in pred_seq]
            for pred_seq in all_predictions
        ]
        label_references = [
            [tag if tag == label else "O" for tag in true_seq]
            for true_seq in all_labels
        ]

        # Compute precision, recall, and F1 score for the specific label
        label_result = metric.compute(predictions=label_predictions, references=label_references)
        label_metrics[label] = {
            "precision": label_result['overall_precision'],
            "recall": label_result['overall_recall'],
            "f1": label_result['overall_f1']
        }

    # Print per-label metrics
    for label, metrics in label_metrics.items():
        print(f"Label: {label}")
        print(f"  Precision: {metrics['precision']:.4f}")
        print(f"  Recall: {metrics['recall']:.4f}")
        print(f"  F1-Score: {metrics['f1']:.4f}")
    
    # Return metrics
    return {
        "overall": results,
        "per_label": label_metrics
    }


### Original

In [10]:
# Load the saved model and tokenizer
output_dir = "saved_model/Org/"
loaded_model = TFAutoModelForTokenClassification.from_pretrained(output_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)

evaluation_results_org = evaluate_model(loaded_model, tf_test, label_names)

Some layers from the model checkpoint at saved_model/Org/ were not used when initializing TFBertForTokenClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at saved_model/Org/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.
  _warn_prf(average, modifier, msg_start, len(result))


Overall Metrics:
Precision: 0.5670
Recall: 0.6361
F1-Score: 0.5996
Accuracy: 0.9213

Per-Entity Metrics:
              precision    recall  f1-score   support

         ADR       0.52      0.65      0.58       792
     Disease       0.00      0.00      0.00        26
        Drug       0.88      0.85      0.86       183
     Finding       0.00      0.00      0.00        30
     Symptom       0.00      0.00      0.00        27

   micro avg       0.57      0.64      0.60      1058
   macro avg       0.28      0.30      0.29      1058
weighted avg       0.54      0.64      0.58      1058


Metrics Per Label:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Label: O
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-ADR
  Precision: 0.7459
  Recall: 0.8119
  F1-Score: 0.7775
Label: I-ADR
  Precision: 0.5190
  Recall: 0.6702
  F1-Score: 0.5850
Label: B-Drug
  Precision: 0.9586
  Recall: 0.8852
  F1-Score: 0.9205
Label: I-Drug
  Precision: 0.9249
  Recall: 0.8743
  F1-Score: 0.8989
Label: B-Disease
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: I-Disease
  Precision: 0.1765
  Recall: 0.1429
  F1-Score: 0.1579
Label: B-Symptom
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: I-Symptom
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-Finding
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: I-Finding
  Precision: 0.2000
  Recall: 0.0400
  F1-Score: 0.0667


In [11]:
# Load the saved model and tokenizer
output_dir = "saved_model/Meddra/"
loaded_model = TFAutoModelForTokenClassification.from_pretrained(output_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)

evaluation_results_meddra = evaluate_model(loaded_model, tf_test, label_names)

Some layers from the model checkpoint at saved_model/Meddra/ were not used when initializing TFBertForTokenClassification: ['dropout_75']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at saved_model/Meddra/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.
  _warn_prf(average, modifier, msg_start, len(result))


Overall Metrics:
Precision: 0.5586
Recall: 0.4868
F1-Score: 0.5202
Accuracy: 0.8820

Per-Entity Metrics:
              precision    recall  f1-score   support

         ADR       0.56      0.65      0.60       792
     Disease       0.00      0.00      0.00        26
        Drug       0.00      0.00      0.00       183
     Finding       0.00      0.00      0.00        30
     Symptom       0.00      0.00      0.00        27

   micro avg       0.56      0.49      0.52      1058
   macro avg       0.11      0.13      0.12      1058
weighted avg       0.42      0.49      0.45      1058


Metrics Per Label:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Label: O
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-ADR
  Precision: 0.7891
  Recall: 0.7841
  F1-Score: 0.7866
Label: I-ADR
  Precision: 0.5403
  Recall: 0.6579
  F1-Score: 0.5934
Label: B-Drug
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: I-Drug
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-Disease
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: I-Disease
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-Symptom
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: I-Symptom
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-Finding
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: I-Finding
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000


In [12]:
# Load the saved model and tokenizer
output_dir = "saved_model/Sct/"
loaded_model = TFAutoModelForTokenClassification.from_pretrained(output_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)

evaluation_results_Sct = evaluate_model(loaded_model, tf_test, label_names)

Some layers from the model checkpoint at saved_model/Sct/ were not used when initializing TFBertForTokenClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at saved_model/Sct/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


Overall Metrics:
Precision: 0.5914
Recall: 0.5964
F1-Score: 0.5939
Accuracy: 0.9171

Per-Entity Metrics:
              precision    recall  f1-score   support

         ADR       0.54      0.61      0.57       792
     Disease       0.00      0.00      0.00        26
        Drug       0.88      0.82      0.85       183
     Finding       0.00      0.00      0.00        30
     Symptom       0.00      0.00      0.00        27

   micro avg       0.59      0.60      0.59      1058
   macro avg       0.28      0.29      0.28      1058
weighted avg       0.56      0.60      0.58      1058


Metrics Per Label:
Label: O
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-ADR
  Precision: 0.8025
  Recall: 0.7437
  F1-Score: 0.7720
Label: I-ADR
  Precision: 0.5301
  Recall: 0.6491
  F1-Score: 0.5836
Label: B-Drug
  Precision: 0.9747
  Recall: 0.8415
  F1-Score: 0.9032
Label: I-Drug
  Precision: 0.9107
  Recall: 0.8361
  F1-Score: 0.8718
Label: B-Disease
  Precision: 0.0000
  Reca

In [13]:
# Load the saved model and tokenizer
output_dir = "saved_model/All/"
loaded_model = TFAutoModelForTokenClassification.from_pretrained(output_dir)
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)

evaluation_results_All = evaluate_model(loaded_model, tf_test, label_names)

Some layers from the model checkpoint at saved_model/All/ were not used when initializing TFBertForTokenClassification: ['dropout_151']
- This IS expected if you are initializing TFBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForTokenClassification were initialized from the model checkpoint at saved_model/All/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForTokenClassification for predictions without further training.


Overall Metrics:
Precision: 0.5373
Recall: 0.4972
F1-Score: 0.5164
Accuracy: 0.8843

Per-Entity Metrics:
              precision    recall  f1-score   support

         ADR       0.54      0.66      0.59       792
     Disease       0.00      0.00      0.00        26
        Drug       0.00      0.00      0.00       183
     Finding       0.00      0.00      0.00        30
     Symptom       0.00      0.00      0.00        27

   micro avg       0.54      0.50      0.52      1058
   macro avg       0.11      0.13      0.12      1058
weighted avg       0.40      0.50      0.44      1058


Metrics Per Label:
Label: O
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-ADR
  Precision: 0.7659
  Recall: 0.8056
  F1-Score: 0.7852
Label: I-ADR
  Precision: 0.5129
  Recall: 0.6632
  F1-Score: 0.5784
Label: B-Drug
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: I-Drug
  Precision: 0.0000
  Recall: 0.0000
  F1-Score: 0.0000
Label: B-Disease
  Precision: 0.0000
  Reca