In [1]:
import pandas as pd
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
import joblib



In [2]:
from transformers import BertForTokenClassification, BertTokenizerFast

# Load the fine-tuned artefacts from their local directories: first the
# tokenizer, then the token-classification model (the two loads are independent).
tokenizer = BertTokenizerFast.from_pretrained("PUBMEDBERT_tokenizer")
model = BertForTokenClassification.from_pretrained("PUBMEDBERT_model")

In [3]:
# Read the IOB-annotated corpus and keep only the rows of the single
# document under evaluation (Text_ID 15939911).
data = (
    pd.read_csv('processed_entities_with_sentences_iob_18.csv')
    .loc[lambda frame: frame['Text_ID'] == 15939911]
)

In [4]:
import pandas as pd
from transformers import AutoTokenizer, PreTrainedTokenizerBase

def tokenize_and_preserve_labels(sentence: str, text_labels: list, tokenizer: "PreTrainedTokenizerBase"):
    """Split a sentence into subword tokens and give every subword a label.

    ``sentence`` is split on whitespace and each resulting word is paired
    positionally with the label at the same index of ``text_labels``
    (pairing stops at the shorter of the two sequences). Each word is
    tokenized on its own with ``tokenizer.tokenize``:

    * for a ``B-<type>`` word the first subword keeps ``B-<type>`` and every
      following subword gets ``I-<type>``;
    * for any other label all subwords receive the word's label unchanged;
    * a word for which the tokenizer returns no subwords contributes nothing.

    Args:
        sentence: Whitespace-delimited text, one word per label.
        text_labels: Word-level label strings aligned with ``sentence.split()``.
        tokenizer: Any object exposing ``tokenize(word) -> list[str]``.

    Returns:
        A ``(tokens, labels)`` tuple of two equally long lists.
    """
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence.split(), text_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)
        if n_subwords == 0:
            # Nothing was produced for this word, so there is nothing to label.
            continue

        tokenized_sentence.extend(tokenized_word)

        if label.startswith('B-'):
            # Strip only the 2-character "B-" prefix so that entity types which
            # themselves contain a hyphen are preserved in full.
            entity_type = label[2:]
            labels.append(label)
            labels.extend(['I-' + entity_type] * (n_subwords - 1))
        else:
            labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

# Word-level gold labels, in document order.
text_labels = data['Label'].values.tolist()
# Rebuild the document as one whitespace-joined string of gold tokens.
sentence = " ".join(data['Token'].values)

# Expand the gold annotation to subword granularity and tabulate it.
tokenized_sentence, new_labels = tokenize_and_preserve_labels(
    sentence,
    text_labels,
    tokenizer,
)
tokenized_data_df = pd.DataFrame(
    {
        'Token': tokenized_sentence,
        'Label': new_labels,
    }
)

In [5]:
# Preview the first ten subword/label pairs.
tokenized_data_df.head(10)

Unnamed: 0,Token,Label
0,case,O
1,:,O
2,a,O
3,28,B-Age
4,-,I-Age
5,year,I-Age
6,-,I-Age
7,old,I-Age
8,previously,B-History
9,healthy,I-History


In [6]:
# Glue WordPiece continuations ("##...") back onto the preceding piece so the
# gold annotation is expressed per word; each word keeps the label of its
# first piece.
word_labels_list = []
current_word = ""
current_label = None

for token, label in zip(tokenized_data_df['Token'], tokenized_data_df['Label']):
    if not token.startswith("##"):
        # A new word starts here: flush the one accumulated so far (if any).
        if current_word:
            word_labels_list.append({'Word': current_word, 'Label': current_label})
        current_word, current_label = token, label
        continue
    # Continuation piece: append it without its "##" marker.
    current_word += token[2:]

# Flush the trailing word.
if current_word:
    word_labels_list.append({'Word': current_word, 'Label': current_label})

new_word_labels_df = pd.DataFrame(word_labels_list)

In [7]:
# Display the word-level gold labels rebuilt from the subword table.
new_word_labels_df

Unnamed: 0,Word,Label
0,case,O
1,:,O
2,a,O
3,28,B-Age
4,-,I-Age
...,...,...
325,months,I-Date
326,after,I-Date
327,the,O
328,ablation,O


In [8]:
# Persist the word-level gold labels (row index omitted).
csv_file_path = 'test_label.csv'
new_word_labels_df.to_csv(
    path_or_buf=csv_file_path,
    index=False,
)

In [9]:
# Raw text of the document to run the model on.
# NOTE(review): hardcoded absolute path — it only resolves on the original
# machine; the file stem matches the Text_ID used to filter the CSV above.
file_path = "/home/jupyter/datasphere/project/maccrobat/18/15939911.txt"

# Read the whole file into memory as a single UTF-8 string.
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

In [10]:
# Show the raw document text that was just read.
text

"CASE: A 28-year-old previously healthy man presented with a 6-week history of palpitations.\nThe symptoms occurred during rest, 2–3 times per week, lasted up to 30 minutes at a time and were associated with dyspnea.\nExcept for a grade 2/6 holosystolic tricuspid regurgitation murmur (best heard at the left sternal border with inspiratory accentuation), physical examination yielded unremarkable findings.\nAn electrocardiogram (ECG) revealed normal sinus rhythm and a Wolff– Parkinson– White pre-excitation pattern (Fig.1: Top), produced by a right-sided accessory pathway.\nTransthoracic echocardiography demonstrated the presence of Ebstein's anomaly of the tricuspid valve, with apical displacement of the valve and formation of an “atrialized” right ventricle (a functional unit between the right atrium and the inlet [inflow] portion of the right ventricle) (Fig.2).\nThe anterior tricuspid valve leaflet was elongated (Fig.2C, arrow), whereas the septal leaflet was rudimentary (Fig.2C, arro

In [11]:
# Restore the fitted label encoder used to turn predicted class ids back into
# label strings.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artefacts from a trusted source.
label_encoder_path = "version2_label_encoder.joblib"
label_encoder = joblib.load(label_encoder_path)

In [12]:
# Encode the whole document as one sequence.
# NOTE: truncation=True with max_length=256 silently drops everything past
# the first 256 tokens, so only the beginning of a long document is predicted.
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)

# Inference only: no gradients needed.
with torch.no_grad():
    outputs = model(**inputs)

# Highest-scoring class id along the last axis, i.e. one id per token.
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)

# Only one sequence was encoded, so take the first (and only) row.
predicted_label_indices = predictions[0].numpy()
# Decode all ids in a single vectorized call instead of once per token.
# Presumably a scikit-learn LabelEncoder whose class order matches the ids the
# model was trained with — TODO confirm.
predicted_labels = label_encoder.inverse_transform(predicted_label_indices).tolist()

In [13]:
# Map the input ids back to their WordPiece strings and list every
# token next to its predicted label, one pair per line.
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())
for token, label in zip(tokens, predicted_labels):
    line = f"{token}: {label}"
    print(line)

[CLS]: O
case: O
:: O
a: O
28: B-Age
-: B-Age
year: B-Age
-: B-Age
old: B-Age
previously: B-History
healthy: I-History
man: B-Sex
presented: B-Clinical_event
with: O
a: O
6: B-Duration
-: B-Duration
week: B-Duration
history: I-Duration
of: O
palp: B-Sign_symptom
##itations: B-Sign_symptom
.: O
the: O
symptoms: B-Sign_symptom
occurred: O
during: O
rest: B-Activity
,: O
2: B-Frequency
[UNK]: B-Frequency
3: B-Frequency
times: I-Frequency
per: I-Frequency
week: I-Frequency
,: O
lasted: O
up: B-Detailed_description
to: I-Detailed_description
30: I-Detailed_description
minutes: I-Detailed_description
at: I-Detailed_description
a: I-Detailed_description
time: I-Detailed_description
and: O
were: O
associated: O
with: O
dyspnea: B-Sign_symptom
.: O
except: O
for: O
a: O
grade: B-Lab_value
2: I-Lab_value
/: I-Lab_value
6: I-Lab_value
holo: B-Detailed_description
##s: B-Detailed_description
##yst: B-Detailed_description
##olic: B-Detailed_description
tricuspid: B-Biological_structure
regurgitatio

In [14]:
# Merge predicted WordPiece continuations ("##...") into whole words; each
# word keeps the label predicted for its first piece.
word_labels = []
current_word = ""
current_label = None

for token, label in zip(tokens, predicted_labels):
    if not token.startswith("##"):
        # Start of a new word: emit the previous one, if any.
        if current_word:
            word_labels.append((current_word, current_label))
        current_word, current_label = token, label
        continue
    # Continuation piece: append it without the "##" marker.
    current_word += token[2:]

# Emit the final accumulated word.
if current_word:
    word_labels.append((current_word, current_label))

In [15]:
# Display the merged (word, predicted label) pairs.
word_labels

[('[CLS]', 'O'),
 ('case', 'O'),
 (':', 'O'),
 ('a', 'O'),
 ('28', 'B-Age'),
 ('-', 'B-Age'),
 ('year', 'B-Age'),
 ('-', 'B-Age'),
 ('old', 'B-Age'),
 ('previously', 'B-History'),
 ('healthy', 'I-History'),
 ('man', 'B-Sex'),
 ('presented', 'B-Clinical_event'),
 ('with', 'O'),
 ('a', 'O'),
 ('6', 'B-Duration'),
 ('-', 'B-Duration'),
 ('week', 'B-Duration'),
 ('history', 'I-Duration'),
 ('of', 'O'),
 ('palpitations', 'B-Sign_symptom'),
 ('.', 'O'),
 ('the', 'O'),
 ('symptoms', 'B-Sign_symptom'),
 ('occurred', 'O'),
 ('during', 'O'),
 ('rest', 'B-Activity'),
 (',', 'O'),
 ('2', 'B-Frequency'),
 ('[UNK]', 'B-Frequency'),
 ('3', 'B-Frequency'),
 ('times', 'I-Frequency'),
 ('per', 'I-Frequency'),
 ('week', 'I-Frequency'),
 (',', 'O'),
 ('lasted', 'O'),
 ('up', 'B-Detailed_description'),
 ('to', 'I-Detailed_description'),
 ('30', 'I-Detailed_description'),
 ('minutes', 'I-Detailed_description'),
 ('at', 'I-Detailed_description'),
 ('a', 'I-Detailed_description'),
 ('time', 'I-Detailed_descri

In [16]:
# Tabulate the (word, predicted label) pairs and display the frame.
word_labels_df = pd.DataFrame(
    data=word_labels,
    columns=['Word', 'Label'],
)
word_labels_df

Unnamed: 0,Word,Label
0,[CLS],O
1,case,O
2,:,O
3,a,O
4,28,B-Age
...,...,...
234,study,I-Diagnostic_procedure
235,with,O
236,mapping,B-Diagnostic_procedure
237,of,O


In [17]:
# Drop the tokenizer's structural tokens in a single pass. They carry no gold
# annotation and would shift the positional alignment with the gold labels.
# .copy() makes the result an independent frame, so assigning to its columns
# later cannot raise SettingWithCopyWarning or touch word_labels_df.
filtered_df = word_labels_df[
    ~word_labels_df['Word'].isin(['[PAD]', '[CLS]', '[SEP]'])
].copy()
filtered_df

Unnamed: 0,Word,Label
1,case,O
2,:,O
3,a,O
4,28,B-Age
5,-,B-Age
...,...,...
233,electrophysiologic,B-Diagnostic_procedure
234,study,I-Diagnostic_procedure
235,with,O
236,mapping,B-Diagnostic_procedure


In [18]:
# Preview the first ten predicted word/label rows.
filtered_df.head(10)

Unnamed: 0,Word,Label
1,case,O
2,:,O
3,a,O
4,28,B-Age
5,-,B-Age
6,year,B-Age
7,-,B-Age
8,old,B-Age
9,previously,B-History
10,healthy,I-History


In [28]:
# Persist the word-level predictions (row index omitted).
csv_file_path = 'version2_test_pred.csv'
filtered_df.to_csv(
    path_or_buf=csv_file_path,
    index=False,
)

In [19]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Gold and predicted words are compared position by position, so both frames
# are cut to their common length. The prediction side is shorter whenever the
# tokenizer truncated the document.
n_eval = min(len(new_word_labels_df), len(filtered_df))

# Strip the IOB prefix so scoring is per entity type. .assign() returns new
# frames, so new_word_labels_df and filtered_df are NOT modified. Mutating them
# in place made later cells (e.g. saving filtered_df to CSV) depend on
# whether this cell had already been run.
test_labels_df = new_word_labels_df.iloc[:n_eval].assign(
    Label=lambda frame: frame['Label'].str.replace(r'^(B-|I-)', '', regex=True)
)
test_preds_df = filtered_df.iloc[:n_eval].assign(
    Label=lambda frame: frame['Label'].str.replace(r'^(B-|I-)', '', regex=True)
)

assert len(test_labels_df) == len(test_preds_df), "DataFrames have different lengths"

# Positional alignment is only meaningful if the same word sits at the same
# index on both sides; report any drift instead of scoring it silently.
n_mismatched_words = int(
    (test_labels_df['Word'].values != test_preds_df['Word'].values).sum()
)
if n_mismatched_words:
    print(f"Warning: {n_mismatched_words} of {n_eval} positions hold different words "
          "in the gold and predicted sequences; the metrics may be misaligned.")

y_true = test_labels_df['Label'].values
y_pred = test_preds_df['Label'].values

In [20]:
# Display the gold entity types (IOB prefixes removed).
y_true

array(['O', 'O', 'O', 'Age', 'Age', 'Age', 'Age', 'Age', 'History',
       'History', 'Sex', 'Clinical_event', 'O', 'O', 'Duration',
       'Duration', 'Duration', 'O', 'O', 'Sign_symptom', 'O', 'O',
       'Coreference', 'O', 'O', 'Clinical_event', 'O', 'Frequency',
       'Frequency', 'Frequency', 'Frequency', 'Frequency', 'Frequency',
       'O', 'O', 'Detailed_description', 'Detailed_description',
       'Detailed_description', 'Detailed_description',
       'Detailed_description', 'Detailed_description',
       'Detailed_description', 'O', 'O', 'O', 'O', 'Sign_symptom', 'O',
       'O', 'O', 'O', 'Lab_value', 'Lab_value', 'Lab_value', 'Lab_value',
       'Detailed_description', 'Biological_structure', 'Sign_symptom',
       'Sign_symptom', 'O', 'O', 'O', 'O', 'O', 'Biological_structure',
       'Biological_structure', 'Biological_structure', 'O',
       'Detailed_description', 'Detailed_description', 'O', 'O',
       'Diagnostic_procedure', 'Diagnostic_procedure', 'O', 'Lab_value'

In [21]:
# Display the predicted entity types (IOB prefixes removed).
y_pred

array(['O', 'O', 'O', 'Age', 'Age', 'Age', 'Age', 'Age', 'History',
       'History', 'Sex', 'Clinical_event', 'O', 'O', 'Duration',
       'Duration', 'Duration', 'Duration', 'O', 'Sign_symptom', 'O', 'O',
       'Sign_symptom', 'O', 'O', 'Activity', 'O', 'Frequency',
       'Frequency', 'Frequency', 'Frequency', 'Frequency', 'Frequency',
       'O', 'O', 'Detailed_description', 'Detailed_description',
       'Detailed_description', 'Detailed_description',
       'Detailed_description', 'Detailed_description',
       'Detailed_description', 'O', 'O', 'O', 'O', 'Sign_symptom', 'O',
       'O', 'O', 'O', 'Lab_value', 'Lab_value', 'Lab_value', 'Lab_value',
       'Detailed_description', 'Biological_structure', 'Sign_symptom',
       'Sign_symptom', 'O', 'O', 'O', 'O', 'O', 'Biological_structure',
       'Biological_structure', 'Biological_structure', 'O',
       'Detailed_description', 'Detailed_description', 'O', 'O',
       'Diagnostic_procedure', 'Diagnostic_procedure', 'O', 'Lab_valu

In [22]:
# Per-class precision / recall / F1 over the aligned word sequence.
# zero_division=0 reports 0.0 for classes that never occur in y_true or y_pred
# (the same value as the default) without emitting UndefinedMetricWarning.
print("\nClassification Report:")
print(classification_report(y_true, y_pred, zero_division=0))


Classification Report:
                      precision    recall  f1-score   support

            Activity       0.00      0.00      0.00         0
                 Age       1.00      1.00      1.00         5
Biological_structure       0.96      1.00      0.98        22
      Clinical_event       1.00      0.50      0.67         2
         Coreference       0.00      0.00      0.00         2
Detailed_description       1.00      1.00      1.00        16
Diagnostic_procedure       1.00      1.00      1.00        11
    Disease_disorder       1.00      1.00      1.00        10
            Duration       0.75      1.00      0.86         3
           Frequency       1.00      1.00      1.00         6
             History       1.00      1.00      1.00         2
           Lab_value       1.00      1.00      1.00         6
                   O       1.00      0.99      1.00       127
                 Sex       1.00      1.00      1.00         1
        Sign_symptom       0.96      1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
