In [1]:
# NOTE(review): the version is not pinned, so a fresh environment may install a
# transformers release different from the one these outputs were produced with.
%pip install transformers

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3 -m pip install --upgrade pip[0m


In [3]:
import pandas as pd
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch

# Load the token-level table; the columns used below are Text_ID, Sent_ID,
# Token and Label.
data_path = "processed_entities_tokenized.csv"
df = pd.read_csv(data_path)
# Prefix the sentence id with the document id (presumably because sentence
# numbering restarts in every document — TODO confirm).
df['Sent_ID'] = df['Text_ID'].astype(str) + '-' + df['Sent_ID'].astype(str)

# Documents that are always dropped before training.
ids_to_remove = [19214295, 20146086, 25139918, 25410883, 25853982, 26469535, 27218632, 27793101, 28120581, 28250304]
df = df[~df['Text_ID'].isin(ids_to_remove)]

# Drop 5 more documents at random. A seeded generator is used so that the same
# documents are selected on every Restart & Run All; with the unseeded global
# RNG the held-out set changed from run to run.
SEED = 42
rng = np.random.default_rng(SEED)
remaining_ids = df['Text_ID'].unique()
random_ids = rng.choice(remaining_ids, size=5, replace=False)
print("Randomly selected IDs for removal:", random_ids)
df = df[~df['Text_ID'].isin(random_ids)]

# Group once and derive both arrays from the same grouping so that
# sentences[i] and labels[i] always refer to the same Sent_ID.
grouped_by_sentence = df.groupby('Sent_ID')
sentences = grouped_by_sentence['Token'].apply(list).values
labels = grouped_by_sentence['Label'].apply(list).values

unique_labels = {label for sublist in labels for label in sublist}
num_unique_labels = len(unique_labels)
print("Количество уникальных значений:", num_unique_labels)

Randomly selected IDs for removal: [28250406 27130218 19009665 27773410 26457578]
Количество уникальных значений: 82


# BERT

In [16]:
# Fit the encoder on every label occurrence (all sentences flattened) and keep
# the encoded flat sequence, then encode each sentence's labels separately.
label_encoder = LabelEncoder()
labels_flatten = label_encoder.fit_transform(
    [tag for sentence_tags in labels for tag in sentence_tags]
)
labels_encoded = [
    label_encoder.transform(sentence_tags).tolist() for sentence_tags in labels
]

# The sentences are already split into words, so the tokenizer is told to keep
# that segmentation; sequences are padded and truncated to at most 128 tokens.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
tokenized_input = tokenizer(
    sentences.tolist(),
    is_split_into_words=True,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
    max_length=128,
)

In [17]:
import joblib

# Write the fitted label encoder to disk with joblib.
label_encoder_path = "label_encoder.joblib"
joblib.dump(label_encoder, label_encoder_path)

['label_encoder.joblib']

In [18]:
def align_labels_with_tokens(labels, tokenized_input):
    """Expand word-level labels to one label per tokenizer position.

    Args:
        labels: one sequence of label ids per sentence, indexed by word.
        tokenized_input: object exposing ``word_ids(batch_index=i)``, which
            yields, for every position of sentence ``i``, the index of the
            word it came from or ``None``.

    Returns:
        A list with one list per sentence. Positions whose word id is ``None``
        get ``-100``; every other position gets the label of its word, so all
        pieces of a split word share that word's label.
    """
    labels_aligned = []
    for sentence_idx, sentence_labels in enumerate(labels):
        aligned_row = []
        for word_id in tokenized_input.word_ids(batch_index=sentence_idx):
            if word_id is None:
                aligned_row.append(-100)
            else:
                aligned_row.append(sentence_labels[word_id])
        labels_aligned.append(aligned_row)
    return labels_aligned

# Expand the encoded word-level labels to one label per tokenizer position.
labels_aligned = align_labels_with_tokens(labels_encoded, tokenized_input)

In [19]:
from torch.utils.data import Dataset, DataLoader
import torch

class CustomDataset(Dataset):
    """Dataset pairing tokenizer output fields with aligned label sequences.

    ``tokenized_inputs`` must support ``.items()`` and map each field name to a
    sequence indexable by sample position; ``labels_aligned`` holds one label
    sequence per sample.
    """

    def __init__(self, tokenized_inputs, labels_aligned):
        self.labels = labels_aligned
        self.tokenized_inputs = tokenized_inputs

    def __len__(self):
        # The number of samples is defined by the label list.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with a ``'labels'`` entry."""
        sample = {}
        for field, values in self.tokenized_inputs.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample


# Wrap the tokenized inputs and the aligned labels in the Dataset defined above.
train_dataset = CustomDataset(tokenized_input, labels_aligned)


In [20]:
# Serve the training samples in shuffled batches of 16.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [21]:
from transformers import BertForTokenClassification, AdamW

# Load pretrained BERT with a token-classification head that has one output
# per class known to the fitted label encoder.
model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_encoder.classes_),
)


# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [22]:
from transformers import get_linear_schedule_with_warmup
from tqdm.auto import tqdm

# torch.optim.AdamW is used instead of transformers.AdamW, which is deprecated
# in favour of the PyTorch implementation. eps and weight_decay are passed
# explicitly with the defaults of the transformers class (1e-6 and 0.0) so the
# optimisation settings stay as close as possible to the original ones.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

epochs = 10
# The scheduler is stepped once per batch, so it needs the total batch count.
total_steps = len(train_loader) * epochs

# Linear decay of the learning rate over all training steps, without warm-up.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: module called as ``model(**batch)`` whose output has ``.loss``.
        data_loader: iterable of dict batches mapping names to tensors.
        optimizer: optimizer stepped after every batch.
        device: device every batch tensor is moved to.
        scheduler: learning-rate scheduler stepped after every batch.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0
    batches = tqdm(data_loader, desc='Batch', leave=False)
    for batch in batches:
        # Dropped before the forward pass (presumably because the model does
        # not accept it as a keyword argument — TODO confirm).
        batch.pop('offset_mapping', None)

        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        loss = model(**batch).loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        batches.set_postfix({'loss': batch_loss})

    return running_loss / len(data_loader)


# Train for the configured number of epochs and print each epoch's mean batch loss.
for epoch in range(epochs):
    loss = train_epoch(model, train_loader, optimizer, device, scheduler)
    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}')

                                                                    

Epoch 1/10, Loss: 1.3862


                                                                    

Epoch 2/10, Loss: 0.7246


                                                                    

Epoch 3/10, Loss: 0.5023


                                                                    

Epoch 4/10, Loss: 0.3437


                                                                    

Epoch 5/10, Loss: 0.2431


                                                                     

Epoch 6/10, Loss: 0.1783


                                                                     

Epoch 7/10, Loss: 0.1294


                                                                     

Epoch 8/10, Loss: 0.1024


                                                                     

Epoch 9/10, Loss: 0.0840


                                                                     

Epoch 10/10, Loss: 0.0736




In [23]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# The training loop calls model.train() and nothing switches it back, so put
# the model in evaluation mode here. torch.no_grad() below only disables
# gradient tracking; it does not turn dropout off.
model.eval()

text = "A 20-year-old woman was diagnosed with breast cancer"

# At most 128 tokens are kept; anything beyond that is truncated.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)

# Only one text was encoded, so take batch row 0 explicitly instead of relying
# on squeeze(), and decode all class ids in a single call.
predicted_label_indices = predictions[0].cpu().numpy()
predicted_labels = label_encoder.inverse_transform(predicted_label_indices).tolist()

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())
for token, label in zip(tokens, predicted_labels):
    print(f"{token}: {label}")

[CLS]: O
a: O
20: B-Age
-: I-Age
year: I-Age
-: I-Age
old: I-Age
woman: B-Sex
was: O
diagnosed: B-Clinical_event
with: O
breast: B-Disease_disorder
cancer: I-Disease_disorder
[SEP]: O


In [24]:
text = "A 28-year-old previously healthy man presented with a 6-week history of palpitations. The symptoms occurred during rest, 2–3 times per week, lasted up to 30 minutes at a time and were associated with dyspnea.Except for a grade 2/6 holosystolic tricuspid regurgitation murmur (best heard at the left sternal border with inspiratory accentuation), physical examination yielded unremarkable findings.An electrocardiogram (ECG) revealed normal sinus rhythm and a Wolff– Parkinson– White pre-excitation pattern (Fig.1: Top), produced by a right-sided accessory pathway.Transthoracic echocardiography demonstrated the presence of Ebstein's anomaly of the tricuspid valve, with apical displacement of the valve and formation of an “atrialized” right ventricle (a functional unit between the right atrium and the inlet [inflow] portion of the right ventricle) (Fig.2).The anterior tricuspid valve leaflet was elongated (Fig.2C, arrow), whereas the septal leaflet was rudimentary (Fig.2C, arrowhead).Contrast echocardiography using saline revealed a patent foramen ovale with right-to-left shunting and bubbles in the left atrium (Fig.2D).The patient underwent an electrophysiologic study with mapping of the accessory pathway, followed by radiofrequency ablation (interruption of the pathway using the heat generated by electromagnetic waves at the tip of an ablation catheter).His post-ablation ECG showed a prolonged PR interval and an odd “second” QRS complex in leads III, aVF and V2–V4 (Fig.1Bottom), a consequence of abnormal impulse conduction in the “atrialized” right ventricle.The patient reported no recurrence of palpitations at follow-up 6 months after the ablation."

In [25]:
# Make sure dropout is disabled for inference: the training loop leaves the
# model in training mode and torch.no_grad() does not change that.
model.eval()

# truncation=True with max_length=128 keeps at most 128 tokens, so for a long
# `text` only its beginning is labelled; the rest is dropped.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)

# Only one text was encoded, so take batch row 0 explicitly instead of relying
# on squeeze(), and decode all class ids in a single call.
predicted_label_indices = predictions[0].cpu().numpy()
predicted_labels = label_encoder.inverse_transform(predicted_label_indices).tolist()

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())
for token, label in zip(tokens, predicted_labels):
    print(f"{token}: {label}")

[CLS]: O
a: O
28: B-Age
-: I-Age
year: I-Age
-: I-Age
old: I-Age
previously: B-History
healthy: I-History
man: B-Sex
presented: B-Clinical_event
with: O
a: O
6: B-Duration
-: I-Duration
week: I-Duration
history: O
of: O
pal: B-Sign_symptom
##pit: B-Sign_symptom
##ations: B-Sign_symptom
.: O
the: O
symptoms: B-Sign_symptom
occurred: O
during: O
rest: B-Clinical_event
,: O
2: B-Frequency
–: I-Frequency
3: I-Frequency
times: I-Frequency
per: I-Frequency
week: I-Frequency
,: O
lasted: O
up: B-Detailed_description
to: I-Detailed_description
30: I-Detailed_description
minutes: I-Detailed_description
at: I-Detailed_description
a: I-Frequency
time: I-Detailed_description
and: O
were: O
associated: O
with: O
d: B-Sign_symptom
##ys: B-Sign_symptom
##p: B-Sign_symptom
##nea: B-Sign_symptom
.: O
except: O
for: O
a: O
grade: B-Lab_value
2: I-Lab_value
/: I-Lab_value
6: I-Lab_value
ho: B-Detailed_description
##los: B-Detailed_description
##yst: B-Detailed_description
##olic: B-Detailed_description
t

In [26]:
# Merge WordPiece pieces (continuations start with "##") back into whole words.
# Each word keeps the label predicted for its first piece.
word_labels = []
current_word = ""
current_label = None

for token, label in zip(tokens, predicted_labels):
    if not token.startswith("##"):
        # A new word begins: store the one collected so far, if any.
        if current_word:
            word_labels.append((current_word, current_label))
        current_word, current_label = token, label
    else:
        # Continuation piece: append it without the "##" marker.
        current_word = current_word + token[2:]

# Store the last word, which no following token has flushed.
if current_word:
    word_labels.append((current_word, current_label))

for word, label in word_labels:
    print(f"{word}: {label}")

[CLS]: O
a: O
28: B-Age
-: I-Age
year: I-Age
-: I-Age
old: I-Age
previously: B-History
healthy: I-History
man: B-Sex
presented: B-Clinical_event
with: O
a: O
6: B-Duration
-: I-Duration
week: I-Duration
history: O
of: O
palpitations: B-Sign_symptom
.: O
the: O
symptoms: B-Sign_symptom
occurred: O
during: O
rest: B-Clinical_event
,: O
2: B-Frequency
–: I-Frequency
3: I-Frequency
times: I-Frequency
per: I-Frequency
week: I-Frequency
,: O
lasted: O
up: B-Detailed_description
to: I-Detailed_description
30: I-Detailed_description
minutes: I-Detailed_description
at: I-Detailed_description
a: I-Frequency
time: I-Detailed_description
and: O
were: O
associated: O
with: O
dyspnea: B-Sign_symptom
.: O
except: O
for: O
a: O
grade: B-Lab_value
2: I-Lab_value
/: I-Lab_value
6: I-Lab_value
holosystolic: B-Detailed_description
tricuspid: B-Biological_structure
regurgitation: B-Sign_symptom
murmur: B-Sign_symptom
(: O
best: O
heard: O
at: O
the: O
left: B-Biological_structure
sternal: I-Biological_stru

In [15]:
# Save the model and the tokenizer into the same directory.
# NOTE(review): this cell's execution count (15) is lower than that of the cells
# above it, so it was last run out of order — confirm which model state was saved.
model_path = "without_anomalies_bert_model"

model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('without_anomalies_bert_model/tokenizer_config.json',
 'without_anomalies_bert_model/special_tokens_map.json',
 'without_anomalies_bert_model/vocab.txt',
 'without_anomalies_bert_model/added_tokens.json',
 'without_anomalies_bert_model/tokenizer.json')

# Общая часть

In [27]:
def prepare_model_and_tokenizer(model_name, num_labels):
    """Load the tokenizer and a token-classification model for ``model_name``.

    Args:
        model_name: identifier passed to both ``from_pretrained`` calls.
        num_labels: number of output classes for the classification head.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # Tuple items are evaluated left to right: tokenizer first, then the model.
    return (
        AutoTokenizer.from_pretrained(model_name),
        AutoModelForTokenClassification.from_pretrained(model_name, num_labels=num_labels),
    )

def load_and_prepare_data(data_path, exclude_text_ids=None):
    """Read the token table and group tokens and labels by sentence.

    The CSV must provide the columns ``Text_ID``, ``Sent_ID``, ``Token`` and
    ``Label``. Sentences are keyed by ``"<Text_ID>-<Sent_ID>"``.

    Args:
        data_path: path of the CSV file to read.
        exclude_text_ids: optional iterable of ``Text_ID`` values whose rows
            are dropped before grouping. ``None`` (the default) keeps every
            document, which is the previous behaviour.

    Returns:
        ``(sentences, labels, num_unique_labels)``: two object arrays holding
        one token list / one label list per sentence in the same order, and
        the number of distinct label values found.
    """
    df = pd.read_csv(data_path)
    if exclude_text_ids is not None:
        df = df[~df['Text_ID'].isin(list(exclude_text_ids))]
    # assign() returns a new frame, so the possibly filtered frame is never
    # modified in place.
    df = df.assign(Sent_ID=df['Text_ID'].astype(str) + '-' + df['Sent_ID'].astype(str))

    # Group once so that sentences[i] and labels[i] come from the same key.
    grouped_by_sentence = df.groupby('Sent_ID')
    sentences = grouped_by_sentence['Token'].apply(list).values
    labels = grouped_by_sentence['Label'].apply(list).values

    num_unique_labels = len({label for sublist in labels for label in sublist})
    print("Количество уникальных меток:", num_unique_labels)
    return sentences, labels, num_unique_labels

def prepare_data(tokenizer, sentences, labels):
    """Tokenize pre-split sentences and build per-token label ids.

    Args:
        tokenizer: callable tokenizer invoked on the list of word lists.
        sentences: array of word lists (``.tolist()`` is called on it).
        labels: one list of label strings per sentence.

    Returns:
        ``(tokenized_input, labels_aligned, label_encoder)``: the tokenizer
        output, the labels aligned to its positions, and the encoder fitted on
        all label strings.
    """
    tokenized_input = tokenizer(
        sentences.tolist(),
        is_split_into_words=True,
        return_offsets_mapping=True,
        padding=True,
        truncation=True,
        max_length=128,
    )

    # Fit on every label occurrence, then encode sentence by sentence.
    label_encoder = LabelEncoder()
    label_encoder.fit([tag for sentence_tags in labels for tag in sentence_tags])
    labels_encoded = [
        label_encoder.transform(sentence_tags).tolist() for sentence_tags in labels
    ]

    return (
        tokenized_input,
        align_labels_with_tokens(labels_encoded, tokenized_input),
        label_encoder,
    )

def align_labels_with_tokens(labels, tokenized_input, label_all_tokens=True):
    """Expand word-level labels to one label per tokenizer position.

    Args:
        labels: one sequence of label ids per sentence, indexed by word.
        tokenized_input: object exposing ``word_ids(batch_index=i)``, which
            yields, for every position of sentence ``i``, the index of the
            word it came from or ``None``.
        label_all_tokens: when ``True`` (the default, and the previous
            behaviour) every piece of a split word receives the word's label,
            so a ``B-`` tag is repeated on each piece. When ``False`` only the
            first piece of each word is labelled and the remaining pieces get
            ``-100``.

    Returns:
        A list with one list of label ids per sentence; positions whose word
        id is ``None`` always get ``-100``.
    """
    labels_aligned = []
    for i, label in enumerate(labels):
        label_aligned = []
        previous_word_id = None
        for word_id in tokenized_input.word_ids(batch_index=i):
            if word_id is None:
                label_aligned.append(-100)
            elif label_all_tokens or word_id != previous_word_id:
                label_aligned.append(label[word_id])
            else:
                # Later piece of a word that has already been labelled.
                label_aligned.append(-100)
            previous_word_id = word_id
        labels_aligned.append(label_aligned)
    return labels_aligned

In [28]:
class CustomDataset(Dataset):
    """Dataset that serves tokenizer output fields together with aligned labels.

    ``tokenized_inputs`` must support ``.items()`` and map each field name to a
    sequence indexable by sample position; ``labels_aligned`` holds one label
    sequence per sample and determines the dataset length.
    """

    def __init__(self, tokenized_inputs, labels_aligned):
        self.tokenized_inputs, self.labels = tokenized_inputs, labels_aligned

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with a ``'labels'`` entry."""
        item = dict(
            (key, torch.tensor(val[idx])) for key, val in self.tokenized_inputs.items()
        )
        item.update(labels=torch.tensor(self.labels[idx]))
        return item

def train_model(model, train_loader, device, epochs=3):
    """Fine-tune ``model`` on ``train_loader`` for ``epochs`` passes.

    A fresh AdamW optimizer (learning rate 5e-5) and a linear learning-rate
    schedule without warm-up are created, the model is moved to ``device`` and
    the mean batch loss is printed after every epoch.

    Args:
        model: the module to train; it is modified in place.
        train_loader: sized iterable of training batches.
        device: device on which training runs.
        epochs: number of passes over ``train_loader``.
    """
    # torch.optim.AdamW is used instead of transformers.AdamW, which is
    # deprecated in favour of the PyTorch implementation. eps and weight_decay
    # are passed explicitly with the defaults of the transformers class
    # (1e-6 and 0.0) so the optimisation settings stay as close as possible.
    optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
    # The scheduler is stepped once per batch, so it needs the total batch count.
    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    model.to(device)
    for epoch in range(epochs):
        loss = train_epoch(model, train_loader, optimizer, device, scheduler)
        print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss:.4f}')

def train_epoch(model, data_loader, optimizer, device, scheduler):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch the ``'offset_mapping'`` entry is left out, the remaining
    tensors are moved to ``device``, and one optimizer step plus one scheduler
    step are taken on the loss returned by ``model(**batch)``.

    Returns:
        The sum of the per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    loss_sum = 0
    progress = tqdm(data_loader, desc='Batch', leave=False)
    for raw_batch in progress:
        # Everything except 'offset_mapping' is forwarded to the model.
        model_inputs = {
            key: value.to(device)
            for key, value in raw_batch.items()
            if key != 'offset_mapping'
        }
        outputs = model(**model_inputs)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        step_loss = loss.item()
        loss_sum += step_loss
        progress.set_postfix({'loss': step_loss})

    return loss_sum / len(data_loader)

In [29]:
import warnings

# NOTE(review): with no category given this hides every warning for the rest of
# the session, including deprecation notices from the libraries used here —
# consider a narrower filter.
warnings.filterwarnings('ignore')

# BioBERT

In [30]:
from transformers import BertTokenizerFast, BertForTokenClassification, AdamW, AutoTokenizer, AutoModelForTokenClassification

# Fine-tune BioBERT for token classification using the shared helpers.
# NOTE(review): load_and_prepare_data(data_path) reads the whole CSV, so the
# Text_ID exclusions applied in the BERT section are not applied here, although
# the model is later saved under a "without_anomalies" name — confirm intent.
data_path = "processed_entities_tokenized.csv"
model_name = 'dmis-lab/biobert-v1.1'
sentences, labels, num_unique_labels = load_and_prepare_data(data_path)
tokenizer, model = prepare_model_and_tokenizer(model_name, num_unique_labels)
# label_encoder is re-fitted here and replaces any encoder defined earlier.
tokenized_input, labels_aligned, label_encoder = prepare_data(tokenizer, sentences, labels)
train_dataset = CustomDataset(tokenized_input, labels_aligned)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_model(model, train_loader, device, epochs=10)

Количество уникальных меток: 82


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                    

Epoch 1/10, Loss: 1.2563


                                                                    

Epoch 2/10, Loss: 0.6610


                                                                    

Epoch 3/10, Loss: 0.4574


                                                                    

Epoch 4/10, Loss: 0.3198


                                                                     

Epoch 5/10, Loss: 0.2212


                                                                     

Epoch 6/10, Loss: 0.1616


                                                                     

Epoch 7/10, Loss: 0.1213


                                                                     

Epoch 8/10, Loss: 0.0925


                                                                     

Epoch 9/10, Loss: 0.0764


                                                                     

Epoch 10/10, Loss: 0.0674




In [31]:
def save_model_and_tokenizer(model, tokenizer, model_path, tokenizer_path):
    """Save ``model`` and ``tokenizer`` and report where each was written.

    Args:
        model: object with ``save_pretrained``; saved to ``model_path``.
        tokenizer: object with ``save_pretrained``; saved to ``tokenizer_path``.
        model_path: destination passed to the model's ``save_pretrained``.
        tokenizer_path: destination passed to the tokenizer's ``save_pretrained``.
    """
    # Model first, then tokenizer; the messages follow once both are saved.
    for artifact, destination in ((model, model_path), (tokenizer, tokenizer_path)):
        artifact.save_pretrained(destination)
    print(f"Модель сохранена в {model_path}")
    print(f"Токенизатор сохранен в {tokenizer_path}")

# Separate output directories for the model and the tokenizer currently bound
# to `model` and `tokenizer`.
model_path = "without_anomalies_BIOBERT_model"
tokenizer_path = "without_anomalies_BIOBERT_tokenizer"

save_model_and_tokenizer(model, tokenizer, model_path, tokenizer_path)

Модель сохранена в without_anomalies_BIOBERT_model
Токенизатор сохранен в without_anomalies_BIOBERT_tokenizer


# PubMedBERT

In [32]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Fine-tune PubMedBERT for token classification using the shared helpers.
# NOTE(review): load_and_prepare_data(data_path) reads the whole CSV, so the
# Text_ID exclusions applied in the BERT section are not applied here, although
# the model is later saved under a "without_anomalies" name — confirm intent.
data_path = "processed_entities_tokenized.csv"
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract" 
sentences, labels, num_unique_labels = load_and_prepare_data(data_path)
tokenizer, model = prepare_model_and_tokenizer(model_name, num_unique_labels)
# label_encoder is re-fitted here and replaces any encoder defined earlier.
tokenized_input, labels_aligned, label_encoder = prepare_data(tokenizer, sentences, labels)
train_dataset = CustomDataset(tokenized_input, labels_aligned)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_model(model, train_loader, device, epochs=10)

Количество уникальных меток: 82


Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                                    

Epoch 1/10, Loss: 1.0562


                                                                    

Epoch 2/10, Loss: 0.5752


                                                                    

Epoch 3/10, Loss: 0.4036


                                                                    

Epoch 4/10, Loss: 0.2794


                                                                     

Epoch 5/10, Loss: 0.1969


                                                                     

Epoch 6/10, Loss: 0.1330


                                                                     

Epoch 7/10, Loss: 0.0989


                                                                     

Epoch 8/10, Loss: 0.0763


                                                                     

Epoch 9/10, Loss: 0.0631


                                                                     

Epoch 10/10, Loss: 0.0539




In [33]:
def save_model_and_tokenizer(model, tokenizer, model_path, tokenizer_path):
    """Write ``model`` and ``tokenizer`` to disk and print both destinations.

    ``model.save_pretrained(model_path)`` runs first, followed by
    ``tokenizer.save_pretrained(tokenizer_path)``; afterwards one confirmation
    line per artifact is printed.
    """
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(tokenizer_path)
    confirmations = (
        f"Модель сохранена в {model_path}",
        f"Токенизатор сохранен в {tokenizer_path}",
    )
    for line in confirmations:
        print(line)

# Separate output directories for the model and the tokenizer currently bound
# to `model` and `tokenizer`.
model_path = "without_anomalies_PUBMEDBERT_model"
tokenizer_path = "without_anomalies_PUBMEDBERT_tokenizer"

save_model_and_tokenizer(model, tokenizer, model_path, tokenizer_path)

Модель сохранена в without_anomalies_PUBMEDBERT_model
Токенизатор сохранен в without_anomalies_PUBMEDBERT_tokenizer


In [23]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# The training loop calls model.train() and nothing switches it back, so put
# the model in evaluation mode here. torch.no_grad() below only disables
# gradient tracking; it does not turn dropout off.
model.eval()

text = "A 20-year-old woman was diagnosed with breast cancer"


# At most 128 tokens are kept; anything beyond that is truncated.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)

# Only one text was encoded, so take batch row 0 explicitly instead of relying
# on squeeze(), and decode all class ids in a single call.
predicted_label_indices = predictions[0].cpu().numpy()
predicted_labels = label_encoder.inverse_transform(predicted_label_indices).tolist()

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())
for token, label in zip(tokens, predicted_labels):
    print(f"{token}: {label}")

[CLS]: O
a: O
20: B-Age
-: I-Age
year: I-Age
-: I-Age
old: I-Age
woman: B-Sex
was: O
diagnosed: B-Clinical_event
with: O
breast: B-Disease_disorder
cancer: I-Disease_disorder
[SEP]: O


In [24]:
text = "A 28-year-old previously healthy man presented with a 6-week history of palpitations. The symptoms occurred during rest, 2–3 times per week, lasted up to 30 minutes at a time and were associated with dyspnea.Except for a grade 2/6 holosystolic tricuspid regurgitation murmur (best heard at the left sternal border with inspiratory accentuation), physical examination yielded unremarkable findings.An electrocardiogram (ECG) revealed normal sinus rhythm and a Wolff– Parkinson– White pre-excitation pattern (Fig.1: Top), produced by a right-sided accessory pathway.Transthoracic echocardiography demonstrated the presence of Ebstein's anomaly of the tricuspid valve, with apical displacement of the valve and formation of an “atrialized” right ventricle (a functional unit between the right atrium and the inlet [inflow] portion of the right ventricle) (Fig.2).The anterior tricuspid valve leaflet was elongated (Fig.2C, arrow), whereas the septal leaflet was rudimentary (Fig.2C, arrowhead).Contrast echocardiography using saline revealed a patent foramen ovale with right-to-left shunting and bubbles in the left atrium (Fig.2D).The patient underwent an electrophysiologic study with mapping of the accessory pathway, followed by radiofrequency ablation (interruption of the pathway using the heat generated by electromagnetic waves at the tip of an ablation catheter).His post-ablation ECG showed a prolonged PR interval and an odd “second” QRS complex in leads III, aVF and V2–V4 (Fig.1Bottom), a consequence of abnormal impulse conduction in the “atrialized” right ventricle.The patient reported no recurrence of palpitations at follow-up 6 months after the ablation."

In [25]:
# Make sure dropout is disabled for inference: the training loop leaves the
# model in training mode and torch.no_grad() does not change that.
model.eval()

# truncation=True with max_length=128 keeps at most 128 tokens, so for a long
# `text` only its beginning is labelled; the rest is dropped.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
inputs = {k: v.to(device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)

logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)

# Only one text was encoded, so take batch row 0 explicitly instead of relying
# on squeeze(), and decode all class ids in a single call.
predicted_label_indices = predictions[0].cpu().numpy()
predicted_labels = label_encoder.inverse_transform(predicted_label_indices).tolist()

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())
for token, label in zip(tokens, predicted_labels):
    print(f"{token}: {label}")

[CLS]: O
a: O
28: B-Age
-: I-Age
year: I-Age
-: I-Age
old: I-Age
previously: B-History
healthy: I-History
man: B-Sex
presented: B-Clinical_event
with: O
a: O
6: B-Duration
-: I-Duration
week: I-Duration
history: O
of: O
palp: B-Sign_symptom
##itations: B-Sign_symptom
.: O
the: O
symptoms: B-Sign_symptom
occurred: O
during: O
rest: B-Activity
,: O
2: B-Frequency
[UNK]: I-Frequency
3: I-Frequency
times: I-Frequency
per: I-Frequency
week: I-Frequency
,: O
lasted: O
up: B-Detailed_description
to: I-Detailed_description
30: I-Detailed_description
minutes: I-Detailed_description
at: I-Detailed_description
a: I-Detailed_description
time: I-Detailed_description
and: O
were: O
associated: O
with: O
dyspnea: B-Sign_symptom
.: O
except: O
for: O
a: O
grade: B-Lab_value
2: I-Lab_value
/: I-Lab_value
6: I-Lab_value
holo: B-Detailed_description
##s: B-Detailed_description
##yst: B-Detailed_description
##olic: B-Detailed_description
tricuspid: B-Biological_structure
regurgitation: B-Sign_symptom
mur:

In [26]:
# Merge WordPiece pieces (continuations start with "##") back into whole words.
# A word keeps the label of its first piece only when that label is a "B-" tag;
# any other label is replaced by "O".
word_labels = []
current_word = ""
current_label = None

for token, label in zip(tokens, predicted_labels):
    if not token.startswith("##"):
        # A new word begins: store the one collected so far, if any.
        if current_word:
            word_labels.append((current_word, current_label))
        current_word = token
        if label.startswith("B-"):
            current_label = label
        else:
            current_label = "O"
    else:
        # Continuation piece: append it without the "##" marker.
        current_word = current_word + token[2:]

# Store the last word, which no following token has flushed.
if current_word:
    word_labels.append((current_word, current_label))

for word, label in word_labels:
    print(f"{word}: {label}")

[CLS]: O
a: O
28: B-Age
-: O
year: O
-: O
old: O
previously: B-History
healthy: O
man: B-Sex
presented: B-Clinical_event
with: O
a: O
6: B-Duration
-: O
week: O
history: O
of: O
palpitations: B-Sign_symptom
.: O
the: O
symptoms: B-Sign_symptom
occurred: O
during: O
rest: B-Activity
,: O
2: B-Frequency
[UNK]: O
3: O
times: O
per: O
week: O
,: O
lasted: O
up: B-Detailed_description
to: O
30: O
minutes: O
at: O
a: O
time: O
and: O
were: O
associated: O
with: O
dyspnea: B-Sign_symptom
.: O
except: O
for: O
a: O
grade: B-Lab_value
2: O
/: O
6: O
holosystolic: B-Detailed_description
tricuspid: B-Biological_structure
regurgitation: B-Sign_symptom
murmur: O
(: O
best: O
heard: O
at: O
the: O
left: B-Biological_structure
sternal: O
border: O
with: O
inspiratory: O
accentuation: O
): O
,: O
physical: B-Diagnostic_procedure
examination: O
yielded: O
unremarkable: B-Lab_value
findings: O
.: O
an: O
electrocardiogram: B-Diagnostic_procedure
(: O
ecg: B-Diagnostic_procedure
): O
revealed: O
normal: 

In [27]:
import pandas as pd

# Rebuild whole words from WordPiece pieces and collect them in a DataFrame.
# Only "B-" labels of a word's first piece are kept; everything else becomes "O".
word_labels = []
current_word = ""
current_label = None

for token, label in zip(tokens, predicted_labels):
    if token.startswith("##"):
        # Continuation piece: extend the current word and move on.
        current_word = current_word + token[2:]
        continue

    # First piece of a new word: flush the previous word before starting over.
    if current_word:
        word_labels.append((current_word, current_label))
    current_word = token
    current_label = "O" if not label.startswith("B-") else label

# The final word is still pending after the loop.
if current_word:
    word_labels.append((current_word, current_label))

df_word_labels = pd.DataFrame(word_labels, columns=["Word", "Label"])

In [28]:
# Display the word/label table.
df_word_labels

Unnamed: 0,Word,Label
0,[CLS],O
1,a,O
2,28,B-Age
3,-,O
4,year,O
...,...,...
113,.,O
114,transthoracic,B-Biological_structure
115,echocardiography,B-Diagnostic_procedure
116,demonstrated,O
