In [344]:
import os
from cassis import *
import pandas as pd

# Folder containing the .xmi files. The original location is kept as the default,
# but it can be overridden through the XMI_FOLDER_PATH environment variable so the
# notebook is not tied to a single user's home directory.
xmi_folder_path = os.environ.get(
    'XMI_FOLDER_PATH',
    '/home/mseiferling/Name_type_identifier/data/all_xmi_files',
)

# The type system file lives in the same folder as the .xmi files.
type_system_file = os.path.join(xmi_folder_path, 'TypeSystem.xml')
if not os.path.isfile(type_system_file):
    # Fail early with a message that says how to fix the configuration.
    raise FileNotFoundError(
        f"Type system not found at '{type_system_file}'. "
        "Set XMI_FOLDER_PATH to the folder that contains the .xmi files and TypeSystem.xml."
    )

# Binary mode: the stream is handed to load_typesystem as opened here.
with open(type_system_file, 'rb') as f:
    type_system = load_typesystem(f)

# Function to load all .xmi files in a folder
def load_xmi_files(folder_path, type_system):
    """Load every ``.xmi`` file found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).
        type_system: Passed to ``load_cas_from_xmi`` as its ``typesystem`` argument.

    Returns:
        list: One loaded CAS per ``.xmi`` file, ordered by file name.
    """
    cas_list = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result
    # (and everything derived from it) reproducible between runs and machines.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.xmi'):
            continue
        with open(os.path.join(folder_path, filename), 'rb') as xmi_file:
            cas_list.append(load_cas_from_xmi(xmi_file, typesystem=type_system))
    return cas_list

# Load all .xmi files
cas_list = load_xmi_files(xmi_folder_path, type_system)

# Collect the covered text of every doctor/patient name annotation.
# dict.fromkeys removes duplicates while keeping first-seen order, so the list is
# the same on every run; list(set(...)) would reorder it whenever the string hash
# seed changes between kernel sessions.
name_list = list(dict.fromkeys(
    phi.get_covered_text()
    for cas in cas_list
    for phi in cas.select('webanno.custom.PHI')
    if phi.kind in ("NAME_DOCTOR", "NAME_PATIENT")
))

FileNotFoundError: [Errno 2] No such file or directory: '/home/mseiferling/Name_type_identifier/data/all_xmi_files/TypeSystem.xml'

In [337]:
import csv

# Relative path to the CSV that the reading loop below opens; each data row is read
# as column 0 = name and column 1 = '|'-separated classes.
file_path = './data/Grascco_names.csv'

# transform classes into list
def transform_classes(class_str):
    """Split a '|'-separated class string into a list of labels.

    Surrounding whitespace is stripped from every label; an empty input
    yields a list containing one empty string.
    """
    raw_labels = class_str.split('|')
    return list(map(str.strip, raw_labels))

# for some reason reading operation is adding escaped newlines
def unescape_newlines(s):
    """Turn every literal backslash-n sequence in ``s`` into a real newline."""
    escaped_newline = '\\n'
    # Splitting on the two-character escape and re-joining with a newline
    # substitutes every occurrence.
    return '\n'.join(s.split(escaped_newline))

# Parsed rows as [name, classes]: column 0 is the name, column 1 the
# '|'-separated class labels.
data = []

# newline='' is what the csv module asks for, so that line breaks inside quoted
# fields are parsed by the reader rather than by the file object.
with open(file_path, 'r', encoding='utf-8', newline='') as file:
    csv_reader = csv.reader(file)
    next(csv_reader, None)  # skip the header row; no error if the file is empty
    for row in csv_reader:
        if not row:
            # csv.reader yields an empty list for blank lines; nothing to parse.
            continue
        name = unescape_newlines(row[0])
        classes = transform_classes(row[1])
        data.append([name, classes])

# Rule-Based Name Classification

In [340]:
# Lower-case articles and prepositions that may appear inside a name.
# Both are plain lists of strings; note that 'des' is present in both.
ARTICLES = (
    'der die das den dem des '
    'ein eine einen einem einer eines '
    'el la los las le les l'
).split()

PREPOSITIONS = (
    'ab an auf aus bei bis durch für gegen '
    'ohne um unter über vor hinter neben zwischen '
    'nach mit von zu gegenüber während trotz wegen '
    'statt gemäß entlang seit laut vom zur zum '
    'beim van des de del dos'
).split()

#Check if a word is a preposition or article
def is_prep_or_article(word):
    """Return True if ``word``, lower-cased, is listed in ARTICLES or PREPOSITIONS."""
    lowered = word.lower()
    return any(lowered in vocabulary for vocabulary in (ARTICLES, PREPOSITIONS))

#Classify each part of a name as First Name (FN) or Last Name (LN)
def classify_name(name):
    """Label each whitespace-separated token of ``name`` as 'FN' or 'LN'.

    Rules, applied in order:
      1. If the name contains a comma ("LAST, First"): every token up to and
         including the first token that contains a comma is 'LN', the rest 'FN'.
      2. A single token is 'LN'; a name with no tokens gives an empty list.
      3. Otherwise the last name starts at the first token recognised by
         ``is_prep_or_article`` or, if there is none, at the final token.
         Tokens before that point are 'FN', tokens from it onwards are 'LN'.

    Returns:
        list[str]: One label per token, in token order.
    """
    tokens = name.split()
    token_count = len(tokens)

    # Rule 1: comma-separated "LAST, First" form.
    if ',' in name:
        surname_len = 1 + next(idx for idx, tok in enumerate(tokens) if ',' in tok)
        return ['LN'] * surname_len + ['FN'] * (token_count - surname_len)

    # Rule 2: zero tokens -> [], one token -> ['LN'].
    if token_count <= 1:
        return ['LN'] * token_count

    # Rule 3: the surname begins at the first preposition/article; with no such
    # token only the final word belongs to it (plain "First [Middle ...] Last").
    surname_start = next(
        (idx for idx, tok in enumerate(tokens) if is_prep_or_article(tok)),
        token_count - 1,
    )
    return ['FN'] * surname_start + ['LN'] * (token_count - surname_start)

# # Test cases
# test_names = [
#     'K. Kummer',                  
#     'Ehrenberger',                
#     'Ramón Cajal',                
#     'Christian Schwach',          
#     'Katharina Fabricius-Schätzle',
#     'Eberhard Ehrenberger',       
#     'CHRIST, Charlotte',          
#     'GERODLSAUER, Gerli',         
#     'van der Waals',              
#     'Ludwig van Beethoven',       
#     'John Middle Name Smith',     
#     'Maria Ana Josefina Rodriguez',
#     'de Beauharnais',
#     'Jürgen\nSchneider',
#     'Maria'
# ]

# # Run test cases
# for name in test_names:
#     classification = classify_name(name)
#     print(f"{name}: {classification}")


# Test Accuracy of the Rule-Based Model

In [335]:
def pred_accuracy(data):
    """Measure how often ``classify_name`` reproduces the true labels.

    Every mismatch is printed together with its true and predicted labels.

    Args:
        data: Iterable of ``(name, y_true)`` pairs, where ``y_true`` is the
            expected label list for ``name``. Any iterable works, including
            generators.

    Returns:
        float: Fraction of pairs whose prediction equals ``y_true`` exactly,
        or 0.0 when ``data`` is empty.
    """
    total = 0
    correct_predictions = 0

    for name, y_true in data:
        total += 1  # counted here so that unsized iterables work too
        y_pred = classify_name(name)

        if y_true == y_pred:
            correct_predictions += 1
        else:
            # Show the misclassified name for manual inspection.
            print(f"Name: {name}, True Label: {y_true}, Predicted Label: {y_pred}")

    # Avoid ZeroDivisionError when there is nothing to evaluate.
    if total == 0:
        return 0.0

    return correct_predictions / total

# Score the rule-based classifier on the labelled names; pred_accuracy also prints
# every name whose prediction differs from its true label.
accuracy = pred_accuracy(data)
# Report the result as a percentage with two decimal places.
print(f"Accuracy: {accuracy:.2%}") 

Name: Maria, True Label: ['FN'], Predicted Label: ['LN']
Name: Haus Horst, True Label: ['LN', 'FN'], Predicted Label: ['FN', 'LN']
Name: Hendlbein H., True Label: ['LN', 'FN'], Predicted Label: ['FN', 'LN']
Name: Huber Karina, True Label: ['LN', 'FN'], Predicted Label: ['FN', 'LN']
Name: Marijas, True Label: ['FN'], Predicted Label: ['LN']
Name: M., True Label: ['FN'], Predicted Label: ['LN']
Name: B., True Label: ['FN'], Predicted Label: ['LN']
Name: Vroni, True Label: ['FN'], Predicted Label: ['LN']
Name: Brunzli B., True Label: ['LN', 'FN'], Predicted Label: ['FN', 'LN']
Name: Marija, True Label: ['FN'], Predicted Label: ['LN']
Name: Baastrup Asger, True Label: ['LN', 'FN'], Predicted Label: ['FN', 'LN']
Accuracy: 95.36%


# Get Context from GraSCCo to Label Names

In [117]:
# Number of characters of surrounding text kept on each side of a name.
context_size = 100
name_context_list = []

for cas in cas_list:
    document_text = cas.sofa_string
    for phi in cas.select('webanno.custom.PHI'):
        # Only doctor and patient names are of interest here.
        if phi.kind not in ("NAME_DOCTOR", "NAME_PATIENT"):
            continue

        # Clamp the context window to the bounds of the document text.
        window_start = max(0, phi.begin - context_size)
        window_end = min(len(document_text), phi.end + context_size)

        # Left context, the name itself, right context; the name is fenced
        # with '####' on both sides so it stands out in the snippet.
        snippet_parts = (
            document_text[window_start:phi.begin],
            phi.get_covered_text(),
            document_text[phi.end:window_end],
        )
        name_context_list.append("####".join(snippet_parts))

name_context_list

['Hals-Nasen-Ohren-Klinik\nKlinikdirektor: Univ. Prof. Dr. mult. ####Pinocchio Cwerg-Nase####\nA-2236 Opfing, Heldenplatz 16, Tel.: 0816/333-13283, Fax: 0816/333-13284\n \n \n\nHerrn\nDr. Pierre JOUB',
 ' Cwerg-Nase\nA-2236 Opfing, Heldenplatz 16, Tel.: 0816/333-13283, Fax: 0816/333-13284\n \n \n\nHerrn\nDr. ####Pierre JOUBERT#### Innsbrucker Landstraße 22a\nA-2236 Opfing\n \n\n\nZur Vorlage bei Ihrer/Ihrem Ärztin/Arzt\n\n\n\nDr. Pierre ',
 'JOUBERT Innsbrucker Landstraße 22a\nA-2236 Opfing\n \n\n\nZur Vorlage bei Ihrer/Ihrem Ärztin/Arzt\n\n\n\nDr. ####Pierre JOUBERT####\nGeboren am: 03.03.1973\nInnsbrucker Landstraße 22a\nA-2236 Opfing\n \nAllgemeine Ambulanz\n\nAmbulant am ',
 'Definitiver Arztbrief\n\n####Gerhard Ypsilanti####, * 3.2.1961, wohnhaft in 09221 Neukirchen\n\n\n2027-08-02\nDiagnosen\n\nMaligner Pleuraerguß links mit Me',
 'is zur BB Kontrolle beim HA, weiteres Procedere durch den\nHA.\nVoltaren 100 0- 0 -1 bei Schmerzen.\n\n\n####Yorgos Kokiniakis#### MD PhD\nStationsarz