In [104]:
# Load base packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch

# load utalities
from tqdm import tqdm
import re
import random
from collections import defaultdict

# load dataset tools
import datasets
from datasets import load_dataset

# preprocessing tools
from sklearn.preprocessing import OneHotEncoder

# load models
from TorchCRF import CRF


# load eval tools
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# load tokenizing tools
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

In [105]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load data

In [106]:
# Extract handout.txt from each subdirectory of RawData
def read_handout_txt():
    data = []

    for root, dirs, files in os.walk("./data/RawData/"):
        try:
            with open(os.path.join(root, "handout.txt"), "r") as f:
                handout = f.readlines()
        except:
            print(f"{root}/handout.txt Not Found")
            continue

        for i, line in enumerate(handout):
            line = line.strip()

            # number lines
            line_dict = {
                "Drug name": root.split("/")[-1],
                "Line number": i + 1,
                "Line": line,
            }

            data.append(line_dict)

    return data

In [107]:
anno_df = pd.read_csv("data/AnnotatedData/AnnotatedDUGData.tsv", sep="\t")

In [108]:
raw_df = pd.DataFrame(read_handout_txt())

./data/RawData//handout.txt Not Found
./data/RawData/Coreg/handout.txt Not Found


In [109]:
raw_df.head()

Unnamed: 0,Drug name,Line number,Line
0,Abilify,1,Patient Educationaripiprazole intramuscular
1,Abilify,2,IMPORTANT: HOW TO USE THIS INFORMATION: This ...
2,Abilify,3,ARIPIPRAZOLE EXTENDED RELEASE - INJECTION
3,Abilify,4,(AR-i-PIP-ra-zole)
4,Abilify,5,"COMMON BRAND NAME(S): Abilify Maintena, Aristada"


# Filter data

In [110]:
anno_df = anno_df[["Drug name", "Drug number", "Advice Text"]]
anno_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text
0,Abilify,0,To reduce the risk of dizziness and lightheade...
1,Abilify,0,This medication may rarely make your blood sug...
2,Abilify,0,This medication may rarely cause a condition k...
3,Abilify,0,This drug may make you dizzy or drowsy or caus...
4,Abilify,0,Avoid alcoholic beverages.


# Reassign line numbers

The line number present in the data is determined based off of scentence structure and not line number. We will locate the Advice text in the raw text, and assign it a new line number label based on the corresponing line. 

This will help us to assign IOB tags to the data.

In [111]:
def find_line_number(advice, raw_data_df):

    for i, line in raw_data_df.iterrows():
        if advice in line["Line"]:
            return line["Line number"]
    return None

In [112]:
# Find line number for each advice text
anno_df["Line number"] = anno_df["Advice Text"].apply(
    lambda x: find_line_number(x, raw_df)
)
anno_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text,Line number
0,Abilify,0,To reduce the risk of dizziness and lightheade...,17.0
1,Abilify,0,This medication may rarely make your blood sug...,20.0
2,Abilify,0,This medication may rarely cause a condition k...,21.0
3,Abilify,0,This drug may make you dizzy or drowsy or caus...,31.0
4,Abilify,0,Avoid alcoholic beverages.,31.0


# Merge data

In [113]:
# merge dataframes
merged_df = pd.merge(anno_df, raw_df, on=["Drug name", "Line number"])
merged_df.head()

Unnamed: 0,Drug name,Drug number,Advice Text,Line number,Line
0,Abilify,0,To reduce the risk of dizziness and lightheade...,17.0,To reduce the risk of dizziness and lightheade...
1,Abilify,0,This medication may rarely make your blood sug...,20.0,This medication may rarely make your blood sug...
2,Abilify,0,This medication may rarely cause a condition k...,21.0,This medication may rarely cause a condition k...
3,Abilify,0,This drug may make you dizzy or drowsy or caus...,31.0,This drug may make you dizzy or drowsy or caus...
4,Abilify,0,Avoid alcoholic beverages.,31.0,This drug may make you dizzy or drowsy or caus...


# IOB tagging

In [114]:
def generate_bioe_tags(text, advice):
    # basic tokenization
    text_words = text.split()
    advice_words = advice.split()

    start_idx = 0
    end_idx = 0

    advice_len = len(advice_words)

    for i in range(len(text_words)):
        if text_words[i : i + advice_len] == advice_words:
            # print('found')
            start_idx = i
            end_idx = i + advice_len
            break
    # print(start_idx, end_idx)

    # create tags
    tags = ["O"] * len(text_words)
    tags[start_idx] = "B"
    tags[end_idx - 1] = "E"
    for i in range(start_idx + 1, end_idx - 1):
        tags[i] = "I"

    return tags

In [115]:
def encode_tags(tags):
    mapping = {"O": 0, "B": 1, "I": 2, "E": 3}
    return [mapping[tag] for tag in tags]

In [116]:
tagged_df = merged_df.copy()
tagged_df["labels"] = tagged_df.apply(
    lambda x: generate_bioe_tags(x["Line"], x["Advice Text"]), axis=1
)

# reframe
tagged_df = tagged_df[["Advice Text", "Line", "labels"]]

# rename
tagged_df.columns = ["advice", "text", "labels"]

tagged_df.head()

Unnamed: 0,advice,text,labels
0,To reduce the risk of dizziness and lightheade...,To reduce the risk of dizziness and lightheade...,"[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
1,This medication may rarely make your blood sug...,This medication may rarely make your blood sug...,"[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
2,This medication may rarely cause a condition k...,This medication may rarely cause a condition k...,"[B, I, I, I, I, I, I, I, I, I, E, O, O, O, O, ..."
3,This drug may make you dizzy or drowsy or caus...,This drug may make you dizzy or drowsy or caus...,"[B, I, I, I, I, I, I, I, I, I, I, I, I, I, I, ..."
4,Avoid alcoholic beverages.,This drug may make you dizzy or drowsy or caus...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


# Convert tagged data to dataset

## Encode the labels

In [117]:
# encode tags
tagged_df["labels"] = tagged_df["labels"].apply(lambda x: encode_tags(x))
tagged_df.head()

Unnamed: 0,advice,text,labels
0,To reduce the risk of dizziness and lightheade...,To reduce the risk of dizziness and lightheade...,"[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
1,This medication may rarely make your blood sug...,This medication may rarely make your blood sug...,"[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
2,This medication may rarely cause a condition k...,This medication may rarely cause a condition k...,"[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0, 0, 0, 0, ..."
3,This drug may make you dizzy or drowsy or caus...,This drug may make you dizzy or drowsy or caus...,"[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."
4,Avoid alcoholic beverages.,This drug may make you dizzy or drowsy or caus...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Convert to dataset

# Evaluate the baseline

In [118]:
def pred_random_IOB(text):

    # get the length of the text
    length = len(text)

    # make array of zeros
    preds = np.zeros(length)

    # get a random number between 0 and the length of the text
    random_start = np.random.randint(0, length - 1)
    random_stop = np.random.randint(random_start + 1, length)

    # set the random start to 1
    preds[random_start] = 1

    # set the random stop to 3
    preds[random_stop] = 3

    # set the values in between to 2
    preds[random_start + 1 : random_stop] = 2

    return preds

In [119]:
preds = tagged_df["labels"].apply(pred_random_IOB)
preds.head()

0    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 2.0, ...
1    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
2    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 2.0, ...
3    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
4    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
Name: labels, dtype: object

## Evalute the baseline at token level

We will find the accuracy for evaluating the baseline at the token level.

In [120]:
def calculate_accuracy(ground_truth, preds):
    correct = 0
    total = 0
    for true, pred in zip(ground_truth, preds):
        for t, p in zip(true, pred):
            if t == p:
                correct += 1
            total += 1
    return correct / total


# calculate the accuracy of the token level predictions
calculate_accuracy(tagged_df["labels"], preds)

0.5056711012750547

## Evaluate the baseline at span-level

In [121]:
# evaluate f1 at the span level
def calculate_f1_span_level(ground_truth, preds):
    # flatten the list
    ground_truth = [tag for tags in ground_truth for tag in tags]
    preds = [tag for tags in preds for tag in tags]

    # calculate precision, recall, f1
    precision, recall, f1_score, _ = precision_recall_fscore_support(
        ground_truth, preds, average="macro"
    )

    return precision, recall, f1_score


# calculate f1 at the span level
precision, recall, f1_score = calculate_f1_span_level(
    tagged_df["labels"], preds
)

print("Span-level precision:", precision)

print("Span-level recall:", recall)

print("Span-level F1-score:", f1_score)

Span-level precision: 0.28494639680839756
Span-level recall: 0.2832972496230585
Span-level F1-score: 0.2784944941860884


# Transformer Model

In [122]:
data = tagged_df.copy()

In [123]:
model_id = "roberta-base"

In [124]:
train, test = train_test_split(data, test_size=0.2, random_state=42)
train, val = train_test_split(train, test_size=0.3, random_state=42)

In [125]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [126]:
def preprocess_text(example, max_length=512):
    # Because of the way that the BIOE tagging was setup, we need to make sure that the
    # tokenization is aligned with the tagging. This means that we need to tokenize the
    # text and then assign the correct label to each token.

    text = example["text"].split()
    tags = example["labels"]

    token_ids = [tokenizer.cls_token_id]
    label_alignment = [0]
    attention_mask = [1]
    special_label_for_subwords = -100

    for word, label in zip(text, tags):
        subword_tokens = tokenizer.tokenize(word)

        if len(subword_tokens) > 0:
            token_ids.extend(tokenizer.convert_tokens_to_ids(subword_tokens))
            # Assign the correct label to the first subword token
            label_alignment.append(label)
            # Use a special label for subsequent subword tokens
            label_alignment.extend(
                [special_label_for_subwords] * (len(subword_tokens) - 1)
            )
            attention_mask.extend([1] * len(subword_tokens))

    # add [SEP] token
    token_ids.append(tokenizer.sep_token_id)
    label_alignment.append(0)
    attention_mask.append(1)

    # pad to max length
    padding_length = max_length - len(token_ids)
    token_ids.extend([tokenizer.pad_token_id] * padding_length)
    label_alignment.extend([0] * padding_length)
    attention_mask.extend([0] * padding_length)

    # Make sure everything has correct length
    assert len(token_ids) == max_length
    assert len(label_alignment) == max_length
    assert len(attention_mask) == max_length

    return {
        "input_ids": token_ids,
        "labels": label_alignment,
        "attention_mask": attention_mask,
    }

In [127]:
train_dataset = datasets.Dataset.from_pandas(train)
val_dataset = datasets.Dataset.from_pandas(val)
test_dataset = datasets.Dataset.from_pandas(test)

In [128]:
train_dataset = train_dataset.map(preprocess_text)
val_dataset = val_dataset.map(preprocess_text)
test_dataset = test_dataset.map(preprocess_text)

train_dataset.set_format(
    type="torch", columns=["input_ids", "labels", "attention_mask"]
)
val_dataset.set_format(
    type="torch", columns=["input_ids", "labels", "attention_mask"]
)
test_dataset.set_format(
    type="torch", columns=["input_ids", "labels", "attention_mask"]
)

Map:   0%|          | 0/508 [00:00<?, ? examples/s]

Map:   0%|          | 0/219 [00:00<?, ? examples/s]

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

In [129]:
# look at first example input_ids and labels
train_dataset[4]

{'labels': tensor([   0,    1,    2,    2,    2,    2,    2,    2, -100,    2,    2, -100,
            2, -100, -100,    2, -100,    2,    2,    2,    3, -100, -100, -100,
         -100,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    

In [130]:
# define training arguments
from transformers import (
    TrainingArguments,
    Trainer,
    AutoModelForTokenClassification,
)

training_args = TrainingArguments(
    output_dir=f"./models/information_extraction/{model_id}",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=100,
    save_steps=100,
    save_total_limit=2,
)

In [131]:
class AutoModelForTokenClassificationCRF(AutoModelForTokenClassification):
    def __init__(self, config):
        super().__init__(config)
        self.crf = CRF(config.num_labels, batch_first=True)

    def forward(
        self, input_ids, attention_mask=None, labels=None, token_type_ids=None
    ):
        outputs = super().forward(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        sequence_output = outputs[0]

        if labels is not None:
            # During training, we use the CRF layer to calculate the loss
            log_likelihood = self.crf(
                sequence_output,
                labels,
                mask=attention_mask.byte(),
                reduction="mean",
            )
            loss = -log_likelihood  # Negative log-likelihood
            return loss
        else:
            # During prediction, we decode the best label sequence
            prediction = self.crf.decode(
                sequence_output, mask=attention_mask.byte()
            )
            return prediction

In [132]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_id, num_labels=4)
model = AutoModelForTokenClassificationCRF.from_pretrained(
    model_id, config=config
)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [133]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # custom weights to handle class imbalance
        class_weights = torch.tensor([1.0, 4.0, 1.0, 4.0]).to(logits.device)

        loss_fct = torch.nn.CrossEntropyLoss(
            weight=class_weights, ignore_index=-100
        )
        loss = loss_fct(
            logits.view(-1, self.model.config.num_labels), labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

In [134]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [135]:
trainer.train()

  0%|          | 0/381 [00:00<?, ?it/s]

{'loss': 1.3354, 'grad_norm': 11.514975547790527, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.08}
{'loss': 1.2369, 'grad_norm': 12.123732566833496, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.16}
{'loss': 1.0215, 'grad_norm': 12.068023681640625, 'learning_rate': 3e-06, 'epoch': 0.24}
{'loss': 0.5827, 'grad_norm': 8.419600486755371, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.31}
{'loss': 0.2459, 'grad_norm': 1.9941552877426147, 'learning_rate': 5e-06, 'epoch': 0.39}
{'loss': 0.1638, 'grad_norm': 0.9868439435958862, 'learning_rate': 6e-06, 'epoch': 0.47}
{'loss': 0.1592, 'grad_norm': 1.186097502708435, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.55}
{'loss': 0.154, 'grad_norm': 0.9635775685310364, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.63}
{'loss': 0.1364, 'grad_norm': 1.0151338577270508, 'learning_rate': 9e-06, 'epoch': 0.71}
{'loss': 0.1266, 'grad_norm': 2.5791282653808594, 'learning_rate': 1e-05, 'epoch': 0.79}


  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 0.12522900104522705, 'eval_runtime': 4.371, 'eval_samples_per_second': 50.103, 'eval_steps_per_second': 12.583, 'epoch': 0.79}
{'loss': 0.1273, 'grad_norm': 0.737683892250061, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.87}
{'loss': 0.112, 'grad_norm': 2.1842164993286133, 'learning_rate': 1.2e-05, 'epoch': 0.94}
{'loss': 0.0967, 'grad_norm': 1.2556512355804443, 'learning_rate': 1.3000000000000001e-05, 'epoch': 1.02}
{'loss': 0.0839, 'grad_norm': 3.1808149814605713, 'learning_rate': 1.4000000000000001e-05, 'epoch': 1.1}
{'loss': 0.0717, 'grad_norm': 1.1634185314178467, 'learning_rate': 1.5e-05, 'epoch': 1.18}
{'loss': 0.087, 'grad_norm': 0.748416006565094, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.26}
{'loss': 0.0873, 'grad_norm': 1.6133924722671509, 'learning_rate': 1.7000000000000003e-05, 'epoch': 1.34}
{'loss': 0.1094, 'grad_norm': 0.8199810981750488, 'learning_rate': 1.8e-05, 'epoch': 1.42}
{'loss': 0.0698, 'grad_norm': 1.5296849012374878, 'learning_r

  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 0.07550052553415298, 'eval_runtime': 4.407, 'eval_samples_per_second': 49.694, 'eval_steps_per_second': 12.48, 'epoch': 1.57}
{'loss': 0.0727, 'grad_norm': 3.144752264022827, 'learning_rate': 2.1e-05, 'epoch': 1.65}
{'loss': 0.0729, 'grad_norm': 0.6598449349403381, 'learning_rate': 2.2000000000000003e-05, 'epoch': 1.73}
{'loss': 0.0772, 'grad_norm': 4.083242893218994, 'learning_rate': 2.3000000000000003e-05, 'epoch': 1.81}
{'loss': 0.0824, 'grad_norm': 1.417886734008789, 'learning_rate': 2.4e-05, 'epoch': 1.89}
{'loss': 0.0735, 'grad_norm': 0.7137104272842407, 'learning_rate': 2.5e-05, 'epoch': 1.97}
{'loss': 0.0706, 'grad_norm': 1.1941686868667603, 'learning_rate': 2.6000000000000002e-05, 'epoch': 2.05}
{'loss': 0.0775, 'grad_norm': 0.7145252823829651, 'learning_rate': 2.7000000000000002e-05, 'epoch': 2.13}
{'loss': 0.0697, 'grad_norm': 1.034682035446167, 'learning_rate': 2.8000000000000003e-05, 'epoch': 2.2}
{'loss': 0.0752, 'grad_norm': 1.5058140754699707, 'learning_ra

  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 0.08371739089488983, 'eval_runtime': 4.422, 'eval_samples_per_second': 49.525, 'eval_steps_per_second': 12.438, 'epoch': 2.36}
{'loss': 0.0651, 'grad_norm': 1.453368902206421, 'learning_rate': 3.1e-05, 'epoch': 2.44}
{'loss': 0.0729, 'grad_norm': 0.630334734916687, 'learning_rate': 3.2000000000000005e-05, 'epoch': 2.52}
{'loss': 0.0706, 'grad_norm': 0.4381190538406372, 'learning_rate': 3.3e-05, 'epoch': 2.6}
{'loss': 0.0668, 'grad_norm': 1.3177576065063477, 'learning_rate': 3.4000000000000007e-05, 'epoch': 2.68}
{'loss': 0.0845, 'grad_norm': 1.1741180419921875, 'learning_rate': 3.5e-05, 'epoch': 2.76}
{'loss': 0.0717, 'grad_norm': 1.5329148769378662, 'learning_rate': 3.6e-05, 'epoch': 2.83}
{'loss': 0.0717, 'grad_norm': 1.4395291805267334, 'learning_rate': 3.7e-05, 'epoch': 2.91}
{'loss': 0.0762, 'grad_norm': 0.783348798751831, 'learning_rate': 3.8e-05, 'epoch': 2.99}
{'train_runtime': 122.9829, 'train_samples_per_second': 12.392, 'train_steps_per_second': 3.098, 'train_l

TrainOutput(global_step=381, training_loss=0.19466114875290963, metrics={'train_runtime': 122.9829, 'train_samples_per_second': 12.392, 'train_steps_per_second': 3.098, 'train_loss': 0.19466114875290963, 'epoch': 3.0})

# Predictions and Post-Processing

In [136]:
# make prediction with the trainer on the test text
preds = trainer.predict(test_dataset)

# get the predicted labels
pred_labels = np.argmax(preds.predictions, axis=2)
pred_labels[0]

  0%|          | 0/46 [00:00<?, ?it/s]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [137]:
def postprocess_preds(pred_labels):
    # takes an array of preds and postprocesses them
    filled_array = np.copy(pred_labels)
    num_examples, width = filled_array.shape

    for i in range(num_examples):
        first_1_index = np.where(filled_array[i] == 1)[0]
        first_3_index = np.where(filled_array[i] == 3)[0]

        if len(first_1_index) > 0 and len(first_3_index) > 0:
            start_index = first_1_index[0]
            end_index = first_3_index[0]

            filled_array[i, start_index + 1 : end_index] = 2

    return filled_array

In [138]:
# calculate accuracy based on non-padding tokens
def calculate_accuracy_non_padding(predictions, labels, attention_mask):
    correct = 0
    total = 0
    for pred, label, mask in zip(predictions, labels, attention_mask):
        # print(pred, label, mask)
        for p, l, m in zip(pred, label, mask):
            if m == 1:
                if p == l:
                    correct += 1
                total += 1
    return correct / total

In [None]:
attention_mask = test_dataset["attention_mask"]

# convert pred_labels to tensor
pred_labels = torch.tensor(pred_labels).to(device)

calculate_accuracy_non_padding(
    pred_labels, test_dataset["labels"], test_dataset["attention_mask"]
)

In [None]:
# calculate span-level f1
precision, recall, f1_score = calculate_f1_span_level(
    test_dataset["labels"].to("cpu"), pred_labels.to("cpu")
)

print("Span-level precision:", precision)
print("Span-level recall:", recall)
print("Span-level F1-score:", f1_score)

In [139]:
pred_labels = postprocess_preds(pred_labels)
pred_labels[1]

array([0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [140]:
attention_mask = test_dataset["attention_mask"]

# convert pred_labels to tensor
pred_labels = torch.tensor(pred_labels).to(device)

calculate_accuracy_non_padding(
    pred_labels, test_dataset["labels"], test_dataset["attention_mask"]
)

0.3801836024058246

In [141]:
# calculate span-level f1
precision, recall, f1_score = calculate_f1_span_level(
    test_dataset["labels"].to("cpu"), pred_labels.to("cpu")
)

print("Span-level precision:", precision)
print("Span-level recall:", recall)
print("Span-level F1-score:", f1_score)

Span-level precision: 0.4072017447273148
Span-level recall: 0.6767245561594364
Span-level F1-score: 0.4813581391130929


  _warn_prf(average, modifier, msg_start, len(result))


### Extract example advice

# ---------------------