In [2]:
import xml.etree.ElementTree as ET
import numpy as np
import pandas as pd

from pathlib import Path

from transformers import BertTokenizer, BertModel, BertForMaskedLM
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased')
# model = BertModel.from_pretrained('bert-base-german-cased')
model = BertForMaskedLM.from_pretrained('bert-base-german-cased')

file_path = Path("..") / "data" / "Cajal.txt" / "inception-document2176491558147248257" / "CURATION_USER.xmi"


tree = ET.parse(file_path)
root = tree.getroot()

# namespace dictionary
namespaces = {
    'xmi': 'http://www.omg.org/XMI',
    'cas': 'http:///uima/cas.ecore',
    'custom': 'http:///webanno/custom.ecore',
    'type5': 'http:///de/tudarmstadt/ukp/dkpro/core/api/segmentation/type.ecore'}

# data extraction
sofa = root.find('.//cas:Sofa', namespaces)        #  './' means "search from the current node".'//' means "search all descendants, not just immediate children". './/cas:Sofa' means "find any descendant element named 'Sofa' in the 'cas' namespace, starting from the current node".
text_content = sofa.get('sofaString')
root = tree.getroot()
sentences = root.findall('.//type5:Sentence', namespaces)
words = root.findall('.//type5:Token', namespaces)
PII_annotations = root.findall('.//custom:PHI', namespaces)

# used for finding a decent context window
sentence_info = []
for sentence in sentences:
    start = int(sentence.get('begin'))
    end = int(sentence.get('end'))
    sentence_text = text_content[start:end]
    # Tokenize 
    tokens = tokenizer(sentence_text, padding=False, truncation=False, return_tensors='pt')
    # Get the number of tokens
    token_count = tokens.input_ids.shape[1]
    
    sentence_info.append((start, end, token_count))

Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Bert can only process 512 tokens so we need to define a context window for the annoations
def find_context_window(word_start, sentence_info, max_tokens=512):
    # Find the sentence containing the word
    containing_sentence = next((s for s in sentence_info if s[0] <= word_start < s[1]), None)

    # Initialize the context with the containing sentence
    context_start = containing_sentence[0]
    context_end = containing_sentence[1]
    token_count = containing_sentence[2]
    
    left_index = right_index = sentence_info.index(containing_sentence)
    
    while token_count < max_tokens:
        # Try to add a sentence to the left
        if left_index > 0:
            left_sentence = sentence_info[left_index - 1]
            if token_count + left_sentence[2] <= max_tokens:
                context_start = left_sentence[0]
                token_count += left_sentence[2]
                left_index -= 1
            else:
                break
        
        # Try to add a sentence to the right
        if right_index < len(sentence_info) - 1:
            right_sentence = sentence_info[right_index + 1]
            if token_count + right_sentence[2] <= max_tokens:
                context_end = right_sentence[1]
                token_count += right_sentence[2]
                right_index += 1
            else:
                break 
        
        # can't add to either side
        if left_index == 0 and right_index == len(sentence_info) - 1:
            break

    return context_start, context_end

In [4]:
# word processing
PII_data = []
for PII_annotation in PII_annotations:
    PII_start = int(PII_annotation.get('begin'))
    PII_end = int(PII_annotation.get('end'))
    context_start, context_end = find_context_window(PII_start,sentence_info)
    PII_data.append({
        'PII_entity': text_content[PII_start:PII_end],
        'PII_category': PII_annotation.get('kind'),
        'PII_start': PII_start,
        'PII_end': PII_end,
        'bert_context_start': int(context_start),
        'bert_context_end': int(context_end)
        
    })
       
PII_df = pd.DataFrame(PII_data)        
PII_df['PII_tokenized'] = [None] * len(PII_df)
PII_df['Token_embeddings'] = [None] * len(PII_df)
PII_df['Token_mean_embedding'] = [None] * len(PII_df)

tokens_list = []
for _ , PII in PII_df.iterrows():
    tokens = tokenizer(PII['PII_entity'], add_special_tokens=False)
    # Convert the tensor to a list
    tokens_list.append(tokens['input_ids'])
    
PII_df['PII_tokenized'] = tokens_list

## Base Bert Model


In [None]:
PII_token_data = []

for _, PII in PII_df.iterrows():
    chunk = text_content[PII['bert_context_start']:PII['bert_context_end']]
    
    # Tokenize the entire chunk
    encoded_chunk = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')
    
    # Get embeddings
    with torch.no_grad():
        outputs = model(**encoded_chunk)
    chunk_embeddings = outputs.last_hidden_state[0]  # Remove batch dimension
    chunk_tokens = encoded_chunk['input_ids'][0].tolist()  # Get integer token IDs
    
    PII_start_index = None
    for i in range(len(chunk_tokens) - len(PII['PII_tokenized']) + 1):
        if chunk_tokens[i:i+len(PII['PII_tokenized'])] == PII['PII_tokenized']:
            PII_start_index = i
            break
    
    if PII_start_index is not None:
        PII_tokenized = chunk_tokens[PII_start_index:PII_start_index+len(PII['PII_tokenized'])]
        PII_embeddings = np.array(chunk_embeddings[PII_start_index:PII_start_index+len(PII['PII_tokenized'])])#.tolist()
        
        PII_token_data.append({
            'PII_entity': PII['PII_entity'],
            'PII_tokenized': PII_tokenized,
            'Token_embeddings': PII_embeddings
        })

# Create the new DataFrame
PII_token_df = pd.DataFrame(PII_token_data)

PII_df['Token_embeddings'] = PII_token_df['Token_embeddings']

PII_df['Token_mean_embedding'] = PII_df['Token_embeddings'].apply(lambda x: np.mean(x, axis=0)) #.apply() is a pandas method that applies a function to each element of the Series
PII_df

## Masked Bert Model

In [5]:
PII_token_data = []

for _, PII in PII_df.iterrows():
    chunk = text_content[PII['bert_context_start']:PII['bert_context_end']]
    
    # Tokenize the entire chunk
    encoded_chunk = tokenizer(chunk, padding=True, truncation=True, return_tensors='pt')
    
    # Get embeddings
    with torch.no_grad():
        outputs = model(**encoded_chunk, output_hidden_states = True)
        
    chunk_embeddings = outputs.hidden_states[-1][0]  # Remove batch dimension
    chunk_tokens = encoded_chunk['input_ids'][0].tolist()  # Get integer token IDs
    
    # # predict top tokens for each token masked or not
    logits = outputs.logits
    # Get predictions for all tokens
    for i, token_id in enumerate(encoded_chunk['input_ids'][0]):
        token_logits = logits[0, i, :]
        top_5_tokens = torch.topk(token_logits, 5, dim=0).indices.tolist()
        print(f"Original token: {tokenizer.decode([token_id])}")
        print("Top 5 predictions:")
        for token in top_5_tokens:
            print(f"- {tokenizer.decode([token])}")
        print()
    
    PII_start_index = None
    for i in range(len(chunk_tokens) - len(PII['PII_tokenized']) + 1):
        if chunk_tokens[i:i+len(PII['PII_tokenized'])] == PII['PII_tokenized']:
            PII_start_index = i
            break
    
    if PII_start_index is not None:
        PII_tokenized = chunk_tokens[PII_start_index:PII_start_index+len(PII['PII_tokenized'])]
        PII_embeddings = np.array(chunk_embeddings[PII_start_index:PII_start_index+len(PII['PII_tokenized'])])#.tolist()
        
        PII_token_data.append({
            'PII_entity': PII['PII_entity'],
            'PII_tokenized': PII_tokenized,
            'Token_embeddings': PII_embeddings
        })

# Create the new DataFrame
PII_token_df = pd.DataFrame(PII_token_data)

PII_df['Token_embeddings'] = PII_token_df['Token_embeddings']

PII_df['Token_mean_embedding'] = PII_df['Token_embeddings'].apply(lambda x: np.mean(x, axis=0)) #.apply() is a pandas method that applies a function to each element of the Series
PII_df

Original token: [CLS]
Top 5 predictions:
- [unused_punctuation1]
- [unused_punctuation7]
- [UNK]
- und
- [unused_punctuation4]

Original token: Di
Top 5 predictions:
- Di
- di
- Hil
- Ka
- ##Di

Original token: ##akon
Top 5 predictions:
- ##akon
- ##af
- ##arm
- ##äd
- ##äst

Original token: ##issen
Top 5 predictions:
- ##issen
- ##ie
- ##ies
- ##ien
- ##ischen

Original token: ##kran
Top 5 predictions:
- ##kran
- ##haus
- ##ken
- Kranken
- ##ke

Original token: ##ken
Top 5 predictions:
- ##ken
- ##ke
- ##k
- ##en
- ##ks

Original token: ##haus
Top 5 predictions:
- ##haus
- ##hauses
- Haus
- ##ie
- ##ke

Original token: Berlin
Top 5 predictions:
- Berlin
- ##n
- Deutschland
- Berliner
- ##s

Original token: [UNK]
Top 5 predictions:
- [UNK]
- [unused_punctuation2]
- das
- [unused_punctuation0]
- [unused_punctuation1]

Original token: Op
Top 5 predictions:
- Op
- In
- Ne
- Hyp
- Di

Original token: ##ht
Top 5 predictions:
- ##ht
- ##ist
- ##ho
- ##ät
- ##hen

Original token: ##hal
Top 5 

Unnamed: 0,PII_entity,PII_category,PII_start,PII_end,bert_context_start,bert_context_end,PII_tokenized,Token_embeddings,Token_mean_embedding
0,Diakonissenkrankenhaus Berlin,LOCATION_HOSPITAL,0,29,0,1459,"[1824, 12810, 756, 11853, 412, 691, 715]","[[0.7418668, 0.4263821, -0.46731064, -1.480767...","[0.57505125, 0.18170688, 0.2810498, -0.8748777..."
1,Diakonissenkrankenhaus Berlin,LOCATION_HOSPITAL,61,90,0,1459,"[1824, 12810, 756, 11853, 412, 691, 715]","[[0.7418668, 0.4263821, -0.46731064, -1.480767...","[0.57505125, 0.18170688, 0.2810498, -0.8748777..."
2,12299,LOCATION_ZIP,91,96,0,1459,"[23331, 1802]","[[0.83484375, 0.48591942, 0.39794385, -0.36615...","[0.19263326, 0.08091557, 0.13412839, 0.3956380..."
3,Berlin,LOCATION_CITY,97,103,0,1459,[715],"[[-0.2692925, 0.041594855, -0.08116214, 0.1158...","[-0.2692925, 0.041594855, -0.08116214, 0.11581..."
4,Dr. med.,NAME_TITLE,113,121,0,1459,"[2682, 26914, 4394, 26914]","[[0.51936144, 0.3410783, 0.2721948, 0.01096862...","[0.67955905, 0.42792553, 0.3219271, 0.24157222..."
5,Mike Marschollek,NAME_DOCTOR,122,138,0,1459,"[14104, 17458, 2809, 26917]","[[0.9626599, 1.0076388, -0.2985245, 0.15143898...","[0.9331897, 0.76209253, 0.24608833, -0.4969524..."
6,Kantstraße. 21 a,LOCATION_STREET,139,155,0,1459,"[3686, 1592, 26914, 2439, 18]","[[0.35805866, 1.3763036, 0.41030696, -0.662212...","[0.7783621, 1.0022788, 0.5350094, 0.7290001, 0..."
7,33455,LOCATION_ZIP,156,161,0,1459,"[3936, 4389, 26957]","[[0.9816525, 1.5218419, 1.4900866, -0.1979038,...","[0.45434082, 0.5332323, 1.0224843, -0.00126563..."
8,Wiesental,LOCATION_CITY,162,171,0,1459,"[13410, 10945]","[[0.41661966, -1.185095, 1.835077, -0.9294547,...","[0.5150715, -0.878658, 1.2125416, -0.67815316,..."
9,Berlin,LOCATION_CITY,181,187,0,1459,[715],"[[-0.2692925, 0.041594855, -0.08116214, 0.1158...","[-0.2692925, 0.041594855, -0.08116214, 0.11581..."


In [6]:
# Access base BERT model and MLM head
#bert_base = model.bert
mlm_head = model.cls


mean_embedding = torch.tensor(PII_df['Token_mean_embedding'])
mean_embedding = mean_embedding.reshape(1, 25, 768) # Batch size,  tokens, hidden size

# Use the MLM head to get predictions
with torch.no_grad():
    logits = mlm_head(mean_embedding)

# logits shape: (batch_size, sequence_length, vocab_size)
# For each position, get the top 5 predicted tokens
for position in range(logits.size(1)):
    
    position_logits = logits[0, position, :]  # Get logits for this position
    
    # get top 5 logits 
    top_5_indices = torch.topk(position_logits, 5, dim=0).indices
    
    print(f"Top 5 predictions for {PII_df['PII_entity'][position]}:")
    for top_5_indice in top_5_indices:
        token = tokenizer.convert_ids_to_tokens([top_5_indice])[0]
        print(f"  {token}")
    print()

Top 5 predictions for Diakonissenkrankenhaus Berlin:
  ##haus
  ##ken
  ##ie
  ##ke
  ##en

Top 5 predictions for Diakonissenkrankenhaus Berlin:
  ##haus
  ##ken
  ##ie
  ##ke
  ##en

Top 5 predictions for 12299:
  ##99
  ##16
  ##19
  122
  99

Top 5 predictions for Berlin:
  Berlin
  ##n
  Deutschland
  Berliner
  ##s

Top 5 predictions for Dr. med.:
  [unused_punctuation0]
  Dr
  med
  [UNK]
  [unused_punctuation2]

Top 5 predictions for Mike Marschollek:
  ##n
  ##r
  ##ick
  ##k
  ##z

Top 5 predictions for Kantstraße. 21 a:
  [unused_punctuation1]
  [unused_punctuation0]
  21
  B
  a

Top 5 predictions for 33455:
  ##5
  ##45
  ##33
  ##35
  ##43

Top 5 predictions for Wiesental:
  Wiesen
  ##tal
  ##grün
  ##weg
  Wiese

Top 5 predictions for Berlin:
  Berlin
  ##n
  Deutschland
  Berliner
  ##s

Top 5 predictions for 22.06.2032:
  [unused_punctuation0]
  20
  22
  30
  21

Top 5 predictions for Marschollek:
  ##n
  ##ick
  ##k
  ##r
  Marsch

Top 5 predictions for Ramón Cajal:


  mean_embedding = torch.tensor(PII_df['Token_mean_embedding'])


In [None]:
# for name, module in model.named_children():
#     print(f"{name}: {type(module)}")
    
for name, module in model.named_modules():
    print(f"{name}: {type(module)}")
    
# for param_name, param in model.state_dict().items():
#     print(f"{param_name}: {param.shape}")
    
print(model.config)
print(model.cls)

# for name, param in model.named_parameters():
#     print(f"{name}: requires_grad = {param.requires_grad}")

BertConfig {
  "_name_or_path": "bert-base-german-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.42.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

BertOnlyMLMHead(
  (predictions): BertLMPredictionHead(
    (transform): BertPredictionHeadTransform(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (transform_act_fn): GELUActivation()
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    )
    (decoder): Linear(in_features=768, out_features=30000, bias=True)
  )
)
