In [71]:
import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import joblib
import re

# Liste der Abkürzungen für medizinische Einrichtungen und Geschäftliche Formen
abbreviations = {
    # Medizinische Fachbereiche und Einrichtungen
    "HNO": "Hals-Nasen-Ohren",
    "MKG": "Mund-, Kiefer- und Gesichtschirurgie",
    "FA": "Facharzt",
    "ZA": "Zahnarzt",
    "KH": "Krankenhaus",
    "LKH": "Landeskrankenhaus",
    "MVZ": "Medizinisches Versorgungszentrum",
    "ZMVZ": "Zahnmedizinisches Versorgungszentrum",
    "PHV": "Patientenheimversorgung",
    "ZAR": "Zentrum für ambulante Rehabilitation",
    "KJPP": "Kinder- und Jugendpsychiatrie und Psychotherapie",
    "UK": "Universitätsklinikum",
    "BG": "Berufsgenossenschaftliches Krankenhaus",
    "REHA": "Rehabilitationsklinik",
    "KG": "Krankengymnastik",
    "KHB": "Krankenhausbetriebsgesellschaft",
    "SPZ": "Sozialpädiatrisches Zentrum",
    "EVK": "Evangelisches Krankenhaus",
    "CVK": "Christliches Krankenhaus",
    "DRK": "Deutsches Rotes Kreuz",
    "VKK": "Verbundkrankenhaus",
    "MLK": "Malteser Krankenhaus",
    "KFO": "Kieferorthopädische Fachklinik",
    "ZPM": "Zentrum für Psychische Gesundheit",
    "ZNA": "Zentrale Notaufnahme",
    "KFH": "Kuratorium für Dialyse und Nierentransplantation",
    "PKV": "Privatklinik für Versicherte",
    
    # Geschäftliche Rechtsformen
    "e.V.": "Eingetragener Verein",
    "GmbH": "Gesellschaft mit beschränkter Haftung",
    "KGaA": "Kommanditgesellschaft auf Aktien",
    "GmbH & Co. KG": "Kombination aus GmbH und Kommanditgesellschaft",
    "GbR": "Gesellschaft bürgerlichen Rechts",
    "AG": "Aktiengesellschaft",
    "OHG": "Offene Handelsgesellschaft",
    "SE": "Europäische Aktiengesellschaft",
    "PartG": "Partnerschaftsgesellschaft",
    "PartGmbB": "Partnerschaftsgesellschaft mit beschränkter Berufshaftung",
}

# Funktion zum Erstellen des regulären Ausdrucks und zum Ersetzen der Abkürzungen
def replace_abbreviation(text, abbreviations):
    # Precompiled regular expression pattern to match any of the abbreviations
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in abbreviations.keys()) + r')\b')
    # Replace abbreviations in the text using the dictionary
    return pattern.sub(lambda x: abbreviations[x.group()], text)

def remove_non_alphanumeric(input_string):
    # Use regex to match alphanumeric characters, spaces, points, and commas
    return ''.join(re.findall(r'[\w\s.,äöüß/]', input_string))

# Compute Embeddings
def compute_embeddings(model, sentences, batch_size=512):
    embeddings = []
    for i in tqdm(range(0, len(sentences), batch_size), desc="Computing Embeddings"):
        batch = sentences[i:i+batch_size]
        # Apply abbreviation replacement and remove non-alphanumeric characters
        batch = [remove_non_alphanumeric(replace_abbreviation(name, abbreviations)) for name in batch]
        emb = model.encode(batch, show_progress_bar=False, convert_to_numpy=True)
        embeddings.append(emb)
    embeddings = np.vstack(embeddings)
    return embeddings

# Load hospital Names from a Text File
def load_hospital_names(text_file):
    with open(text_file, 'r', encoding='utf-8') as f:
        hospital_names = [line.strip() for line in f if line.strip()]
    return hospital_names

# Fit and Save NearestNeighbors Model
# Instance-based learning is a category of machine learning that relies on storing the training data and making predictions based on the direct comparison of new instances with stored data
def fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine'):
    nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric=metric) # non-parametric algorithm used for finding the closest points in a dataset.
    nn_model.fit(embeddings) # organizes the embeddings into a structure that allows efficient querying
    return nn_model

  from tqdm.autonotebook import tqdm, trange


In [72]:
import spacy
# Paths
data_path = '/home/mseiferling/vector_search/data/OpenStreetMap_data/Combined_healthcare_facilities.txt' 
embeddings_path = '/home/mseiferling/vector_search/data/hospital_embeddings.npy'
nearest_neighbors_model_path = '/home/mseiferling/vector_search/data/nearest_neighbors_model.joblib'

# model
embedding_model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# Check if embeddings and model already exist
if os.path.exists(embeddings_path) and os.path.exists(nearest_neighbors_model_path):
    embeddings = np.load(embeddings_path)
    nn_model = joblib.load(nearest_neighbors_model_path)
    model = SentenceTransformer(embedding_model_name)
    hospital_names = load_hospital_names(data_path)
    nlp = spacy.load("de_core_news_lg")
else:
    # Load hospital names
    hospital_names = load_hospital_names(data_path)
    
    # Load embedding model
    model = SentenceTransformer(embedding_model_name)
    
    # Compute embeddings
    embeddings = compute_embeddings(model, hospital_names)
    
    # Save embeddings
    np.save(embeddings_path, embeddings)
    
    # Fit NearestNeighbors model
    nn_model = fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine')
    
    # Save NearestNeighbors model
    joblib.dump(nn_model, nearest_neighbors_model_path)
    
    # Load the SpaCy German model for NER and POS tagging
    nlp = spacy.load("de_core_news_lg")

In [None]:
import numpy as np
from Levenshtein import distance as levenshtein_distance
import logging
import re
import unicodedata

# Liste der Abkürzungen für medizinische Einrichtungen und Geschäftliche Formen
abbreviations = {
    # Medizinische Fachbereiche und Einrichtungen
    "HNO": "Hals-Nasen-Ohren",
    "MKG": "Mund-, Kiefer- und Gesichtschirurgie",
    "FA": "Facharzt",
    "ZA": "Zahnarzt",
    "KH": "Krankenhaus",
    "LKH": "Landeskrankenhaus",
    "MVZ": "Medizinisches Versorgungszentrum",
    "ZMVZ": "Zahnmedizinisches Versorgungszentrum",
    "PHV": "Patientenheimversorgung",
    "ZAR": "Zentrum für ambulante Rehabilitation",
    "KJPP": "Kinder- und Jugendpsychiatrie und Psychotherapie",
    "UK": "Universitätsklinikum",
    "BG": "Berufsgenossenschaftliches Krankenhaus",
    "REHA": "Rehabilitationsklinik",
    "KG": "Krankengymnastik",
    "KHB": "Krankenhausbetriebsgesellschaft",
    "SPZ": "Sozialpädiatrisches Zentrum",
    "EVK": "Evangelisches Krankenhaus",
    "CVK": "Christliches Krankenhaus",
    "DRK": "Deutsches Rotes Kreuz",
    "VKK": "Verbundkrankenhaus",
    "MLK": "Malteser Krankenhaus",
    "KFO": "Kieferorthopädische Fachklinik",
    "ZPM": "Zentrum für Psychische Gesundheit",
    "ZNA": "Zentrale Notaufnahme",
    "KFH": "Kuratorium für Dialyse und Nierentransplantation",
    "PKV": "Privatklinik für Versicherte",
    
    # Geschäftliche Rechtsformen
    "e.V.": "Eingetragener Verein",
    "GmbH": "Gesellschaft mit beschränkter Haftung",
    "KGaA": "Kommanditgesellschaft auf Aktien",
    "GmbH & Co. KG": "Kombination aus GmbH und Kommanditgesellschaft",
    "GbR": "Gesellschaft bürgerlichen Rechts",
    "AG": "Aktiengesellschaft",
    "OHG": "Offene Handelsgesellschaft",
    "SE": "Europäische Aktiengesellschaft",
    "PartG": "Partnerschaftsgesellschaft",
    "PartGmbB": "Partnerschaftsgesellschaft mit beschränkter Berufshaftung",
}

# List of substrings to search for
healthcare_keywords = [
    # Allgemeine Begriffe
    "arzt", "ärzt", "chirurg", "gemeinschaft", "klinik", "logie", "ologe", 
    "medizin", "praxis", "sanatorium", "therapie", "ambulanz", 

    # Fachrichtungen und Behandlungen
    "anästhesie", "augen", "cardio", "dental", "derm", "endokrin", "gastro", "gyn", 
    "hämo", "kardio", "neuro", "onko", "optik", "ortho", "osteo", "pathie", 
    "pädie", "pneumo", "psych", "uro", "zahn", "zähne","internist",

    # Verfahren und Diagnostik
    "blut", "ct", "diagnostik", "echo", "labor", "mrt", "radio", "rehabil", "spende",

    # Pflege und Behandlungsarten
    "betreuung", "ernährung", "geriatr", "hospiz", "intensiv", "palliativ", "pflege", 
    "physio", "rehaklinik", "therapeut",

    # Alternative Medizin
    "akupunkt", "heilpraktiker", "homöo", "naturheil",

    # Einrichtungen und Zentren
    "fach", "kranken", "notfall", "reha", "zentrum", "haus", "test",

    # Pädiatrie, Frauen und Spezialversorgung
    "diabetes", "frauen", "kinder", "lungen",

    # Zusätzliche Begriffe
    "apotheke", "behandl", "chirurgi", "gesundheitszentrum", 
    "klinisch", "untersuch",
    
    # titel
    "dr","phil","univ","medic","dres","med","dipl","psych","dent","vet",
    
    #abbreviations
    "hno", "mkg", "fa", "za", "kh", "lkh", "mvz", "zmvz", "phv", "zar", "kjpp", "uk", "bg", "reha", "kg", "khb", "spz", "evk", "cvk", "drk", "vkk", "mlk", "kfo", "zpm", "zna", "kfh", "pkv"
]

# Funktion zum Erstellen des regulären Ausdrucks und zum Ersetzen der Abkürzungen
def replace_abbreviation(text, abbreviations):
    """
    Replace abbreviations in a given text with their full forms based on a provided dictionary.

    This function searches for any abbreviations specified as keys in the ``abbreviations`` dictionary
    within the ``text``. If a match is found, it replaces the abbreviation with its corresponding full
    form from the dictionary. The function uses a regular expression with word boundaries to ensure
    that only exact abbreviations are matched (i.e., whole words).

    Parameters
    ----------
    text : str
        The input string where abbreviations are to be replaced.
    abbreviations : dict
        A dictionary where keys are abbreviations (as strings) and values are their full forms (as strings).

    Returns
    -------
    str
        The modified text with all found abbreviations replaced by their full forms.
    """
    # Precompiled regular expression pattern to match any of the abbreviations
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in abbreviations.keys()) + r')\b')
    
    # Replace abbreviations in the text using the dictionary
    return pattern.sub(lambda x: abbreviations[x.group()], text)

# helper function to extract the main name before '/'
def get_name(facility):
    """
    Extract the main facility name before any '/' characters.

    This function processes a facility string and extracts the main
    facility name that appears before any '/' character.

    Parameters
    ----------
    facility : str
        The full facility string containing the name and additional info.

    Returns
    -------
    str
        The main facility name.
    """
    return facility.split('/')[0].strip()

def remove_non_alphanumeric(input_string):
    """
    Clean the input hospital name string by:
    - Removing Byte Order Mark (BOM) characters
    - Replacing newlines and carriage returns with a space
    - Removing unwanted special characters and control characters
    - Keeping only alphanumerics, spaces, periods, commas, slashes, hyphens, and specific German characters
    - Converting all text to lowercase
    - Normalizing Unicode to NFC form
    - Normalizing multiple spaces to a single space

    Parameters
    ----------
    input_string : str
        The hospital name string to clean.

    Returns
    -------
    str
        The cleaned and normalized hospital name string.
    """
    # 1. Normalize Unicode to NFC form to ensure consistency
    cleaned = unicodedata.normalize('NFC', input_string)
    
    # 2. Remove BOM characters
    # Common BOMs: \ufeff (UTF-8), \uFEFF (UTF-16), \uFFFE (Invalid, but included for robustness)
    cleaned = cleaned.replace('\ufeff', '').replace('\uFEFF', '').replace('\uFFFE', '')
    
    # 3. Replace newlines and carriage returns with a space
    cleaned = cleaned.replace('\n', ' ').replace('\r', ' ')
    
    # 4. Remove other control characters (non-printable characters)
    #    This removes characters in the range U+0000 to U+001F and U+007F
    cleaned = re.sub(r'[\x00-\x1F\x7F]', '', cleaned)
    
    # 5. Define allowed characters:
    #    - Letters (A-Z, a-z) including German umlauts and sharp S (ß)
    #    - Numbers (0-9)
    #    - Spaces
    #    - Periods (.)
    #    - Commas (,)
    #    - Slashes (/)
    #    - Hyphens (-)
    allowed_chars_pattern = re.compile(r'[A-Za-zäöüßÄÖÜẞ0-9\s.,/\-]')
    
    # 6. Keep only allowed characters
    cleaned = ''.join(allowed_chars_pattern.findall(cleaned))
    
    # 7. Convert to lowercase
    cleaned = cleaned.lower()
    
    # 8. Normalize multiple spaces to a single space and strip leading/trailing spaces
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    
    return cleaned


def extract_sensitive_data(text, nlp, healthcare_keywords):
    """
    Extract named entities and proper nouns from text, filtering out healthcare-related terms.

    Parameters
    ----------
    text : str
        Input text to process.
    healthcare_keywords : list
        List of healthcare-related keywords to filter out.

    Returns
    -------
    list
        Filtered list of unique sensitive words.
    """
    doc = nlp(text)
    unique_substrings = set()
    
    # Extract named entities of type PERSON, ORG, LOC and proper nouns
    for ent in doc.ents:
        if ent.label_ in ["PER", "ORG", "LOC"]:
            words = ent.text.split()
            # Only add words that don't contain healthcare keywords
            filtered_words = [
                word for word in words 
                if not any(keyword in word.lower() for keyword in healthcare_keywords)
            ]
            unique_substrings.update(filtered_words)
    
    # Extract proper nouns
    for token in doc:
        if token.pos_ == "PROPN":
            # Only add if it doesn't contain healthcare keywords
            if not any(keyword in token.text.lower() for keyword in healthcare_keywords):
                unique_substrings.add(token.text)
    
    return list(unique_substrings)

def filter_hospitals(hospitals, similarity_scores, sensitive_words):
    """
    Filter hospitals based on similarity scores and sensitive words.

    Parameters
    ----------
    hospitals : list
        List of hospital names to filter.
    similarity_scores : list
        List of similarity scores corresponding to each hospital.
    sensitive_words : list
        List of sensitive words to filter out.

    Returns
    -------
    list
        Filtered list of hospitals excluding exact matches and those containing sensitive words.
    """
    # Combine the filtering conditions into a single list comprehension
    filtered_hospitals = [
        hospital for hospital, score in zip(hospitals, similarity_scores)
        if score != 1 and not any(
            sensitive_word.lower() in hospital.lower() 
            for sensitive_word in sensitive_words
        )
    ]
    
    return filtered_hospitals

def normalize_levenshtein_distance(str1, str2):
    """
    Calculate the normalized Levenshtein distance between two strings.

    The normalized Levenshtein distance is the ratio of the raw Levenshtein distance to the
    length of the longer string, providing a similarity measure between 0 and 1. A result
    closer to 0 indicates higher similarity, while a result closer to 1 indicates more dissimilarity.

    Parameters
    ----------
    str1 : str
        The first string to compare.
    str2 : str
        The second string to compare.

    Returns
    -------
    float
        The normalized Levenshtein distance, ranging from 0.0 to 1.0.
        Returns 0.0 for two empty strings (an edge case).
    """
    lev_distance = levenshtein_distance(str1, str2)
    max_len = max(len(str1), len(str2))
    if max_len == 0:  # Handle edge case with empty strings
        return 0.0
    return lev_distance / max_len  # Normalize by dividing by the max string length


def calculate_average_distance(target_sensitive_data, sampled_sensitive_data):
    """
    Calculate the average normalized Levenshtein distance between target terms and sampled terms.

    For each word in the target terms, find the closest matching word in the sampled terms,
    and compute the normalized Levenshtein distance between them. The average of these
    minimum distances is returned, adjusted to produce a similarity measure from 0 to 1,
    where higher values indicate greater similarity.

    Parameters
    ----------
    target_sensitive_data : list of str
        List of target terms (e.g., words related to healthcare in the target hospital).
    sampled_sensitive_data : list of str
        List of terms from a hospital name to compare against the target terms.

    Returns
    -------
    float
        The average normalized Levenshtein distance, where 1 indicates perfect similarity
        and values closer to 0 indicate dissimilarity.
    """
    total_distance = 0
    num_comparisons = len(target_sensitive_data)
    
    # For each target term, find the closest match in the sampled terms
    for target_substring in target_sensitive_data:
        min_distance = float('inf')  # Initialize with a large value
        
        for sampled_substring in sampled_sensitive_data:
            normalized_distance = normalize_levenshtein_distance(target_substring.lower(), sampled_substring.lower())
            if normalized_distance < min_distance:
                min_distance = normalized_distance

        # Accumulate the smallest distance for this target term
        total_distance += min_distance

    # Calculate the average normalized distance
    if num_comparisons == 0:
        return 0.0
    return 1 - (total_distance / num_comparisons)


def calculate_hospital_probabilities(ranked_hospitals, temperature=0.1):
    """
    Calculate a probability distribution over hospitals based on their distances using a sigmoid function and temperature scaling.

    Parameters
    ----------
    ranked_hospitals : list of tuples
        A list where each element is a tuple of (hospital, distance).
    temperature : float
        A scaling factor to adjust the sharpness of the probability distribution.

    Returns
    -------
    tuple
        A tuple containing:
        - hospitals : tuple
            A tuple of hospital identifiers.
        - probabilities : numpy.ndarray
            An array of probabilities corresponding to each hospital.
    """
    # Split the hospitals and distances
    hospitals, distances = zip(*ranked_hospitals)
    distances = np.array(distances)
    
    # Apply sigmoid function to distances
    sigmoid_distances = 1 / (1 + np.exp(-distances))
    
    # Apply temperature scaling to make the distribution sharper
    scaled_scores = sigmoid_distances ** (1 / temperature)
    
    # Normalize the scaled scores to create a probability distribution
    probabilities = scaled_scores / np.sum(scaled_scores)
    
    return hospitals, probabilities

def rank_hospitals_by_similarity(target_hospital, filtered_hospitals, healthcare_keywords):
    """
    Identify and rank hospitals based on the similarity of healthcare-related terms in the target hospital name.

    Parameters
    ----------
    target_hospital : str
        The name of the target hospital to compare against.
    filtered_hospitals : list of str
        A list of hospital names to be evaluated.
    healthcare_keywords : list of str
        A list of keywords representing healthcare-related terms.

    Returns
    -------
    list of tuples
        A list of tuples where each tuple contains a hospital name and its average normalized
        Levenshtein distance to the healthcare-related terms in the target hospital. Only hospitals
        with an average distance above or equal to 0.5 are included.
    """
    # Extract healthcare-related words from the target hospital name
    healthcare_terms = [word for word in re.split(r'[ \-]', target_hospital) if any(keyword in word.lower() for keyword in healthcare_keywords)]
    print(healthcare_terms)
    ranked_hospitals = []
    
    # Calculate average normalized Levenshtein distance and filter hospitals
    for hospital in filtered_hospitals:
        avg_distance = calculate_average_distance(healthcare_terms, re.split(r'[ \-]', hospital))
        # print(hospital.split(),avg_distance)
        if avg_distance >= 0.5:
            ranked_hospitals.append((hospital, avg_distance))
    
    return ranked_hospitals

def query_similar_hospitals(target_sentence, model, nn_model, hospital_names, top_k=5):
    """
    Query the most similar hospitals based on a target sentence using a pre-trained model 
    and a nearest-neighbor model for similarity search.

    Parameters
    ----------
    target_sentence : str
        The input sentence describing the target hospital or criteria.
    model
        A pre-trained model used to compute embeddings for the target sentence. 
        Typically, this is a sentence transformer or similar NLP model.
    nn_model
        A trained nearest-neighbor model (e.g., sklearn's NearestNeighbors) 
        used for similarity search in the embedding space.
    hospital_names : list
        A list of hospital names corresponding to the entries 
        in the embedding space indexed by `nn_model`.
    top_k : int, optional
        The number of most similar hospitals to return. Default is 5.

    Returns
    -------
    tuple
        A tuple containing:
        - results : list
            A list of the top_k most similar hospital names.
        - similarity : list
            A list of similarity scores (float values between 0 and 1) 
            corresponding to the top_k results.
    """
    # Compute embedding for the target sentence
    target_embedding = model.encode([target_sentence], convert_to_numpy=True)
    
    # Perform similarity search with the specified top_k
    distances, indices = nn_model.kneighbors(target_embedding, n_neighbors=top_k)
    
    # Retrieve the hospital names and their similarity scores
    results = []
    similarity = []
    for idx, distance in zip(indices[0], distances[0]):
        hospital = hospital_names[idx]  # Directly use the list of hospital names
        similarity_score = 1 - distance  # Convert cosine distance to similarity
        results.append(hospital)
        similarity.append(float(similarity_score))
    return results, similarity

def query_similar_hospitals_adaptive(target_hospital, model, nn_model, nlp, hospital_names, initial_k=10, max_k=100, step_size=10, min_matches=3):
    """
    Adaptively query for similar hospitals, expanding the search until enough matches are found.

    Parameters
    ----------
    target_hospital : str
        The hospital to find matches for.
    model
        The embedding model used to compute hospital embeddings.
    nn_model
        The nearest neighbor model used for similarity search.
    hospital_names : list
        List of all hospital names.
    initial_k : int
        Initial number of hospitals to retrieve.
    max_k : int
        Maximum number of hospitals to consider.
    step_size : int
        How much to increase `k` by in each iteration.
    min_matches : int
        Minimum number of matches required.

    Returns
    -------
    tuple
        A tuple containing:
        - filtered_hospitals : list
            List of hospitals that match the criteria.
        - similarity_scores : list
            List of similarity scores corresponding to the filtered hospitals.
        - k_used : int
            The final value of `k` used to satisfy the minimum matches.
    """
    current_k = initial_k
    
    while current_k <= max_k:
        # replace abbreviation with the semantic full form
        target_hospital_extended = replace_abbreviation(target_hospital, abbreviations)
        
        # Get similar hospitals with current k
        similar_hospitals, similarity_scores = query_similar_hospitals(target_hospital_extended, model, nn_model, hospital_names, top_k=current_k)
        
        # Apply the get_name function to each similar hospital to remove the healthcare:specialty infromation
        similar_hospitals = [get_name(hospital) for hospital in similar_hospitals]
        #print(similar_hospitals)
        # Extract sensitive words
        sensitive_words = extract_sensitive_data(target_hospital_extended, nlp, healthcare_keywords)
        #print(sensitive_words)
        # Filter hospitals
        filtered_hospitals = filter_hospitals(similar_hospitals, similarity_scores, sensitive_words)
        # If we have enough matches, break
        if len(filtered_hospitals) >= min_matches:
            return filtered_hospitals, current_k
            
        # Increase k for next iteration
        current_k += step_size
        
        logging.info(f"Insufficient matches found with k={current_k-step_size}, "
                    f"expanding search to k={current_k}")
    
    # If we get here, we couldn't find enough matches even with max_k
    logging.warning(f"Could not find {min_matches} matches even with k={max_k}")
    
    return filtered_hospitals, current_k

def get_hospital_surrogate(target_hospital, model, nn_model, nlp, hospital_names, healthcare_keywords, initial_k = 10, max_k = 100, min_matches = 3):
    """
    Main function to get a surrogate hospital with adaptive search.

    Parameters
    ----------
    target_hospital : str
        The hospital to find matches for.
    model
        The embedding model used to compute hospital embeddings.
    nn_model
        The nearest neighbor model used for similarity search.
    hospital_names : list
        List of all hospital names.
    initial_k : int
        Initial number of hospitals to retrieve.
    max_k : int
        Maximum number of hospitals to consider.
    min_matches : int
        Minimum number of matches required.

    Returns
    -------
    tuple
        A tuple containing:
        - sampled_hospital : str
            The selected surrogate hospital.
        - probabilities : list
            A list of probabilities associated with the selected hospital.
        - hospitals : list
            A list of hospitals considered in the adaptive search.
        - k_used : int
            The final value of `k` used to meet the minimum matches.
    """
    target_hospital = remove_non_alphanumeric(target_hospital)
    # Get similar hospitals with adaptive k search
    similar_hospitals, k_used = query_similar_hospitals_adaptive(target_hospital, model, nn_model, nlp, hospital_names, initial_k=initial_k, max_k=max_k, min_matches=min_matches)
    print(similar_hospitals,k_used)
    # Rank hospitals
    ranked_hospitals = rank_hospitals_by_similarity(target_hospital, similar_hospitals, healthcare_keywords)
    print(ranked_hospitals)
    # Calculate probabilities and sample
    hospitals, probabilities = calculate_hospital_probabilities(ranked_hospitals)
    sampled_hospital = np.random.choice(hospitals, p=probabilities)
    
    return sampled_hospital, probabilities, hospitals, k_used


# example Usage
target_hospital = "\ufeffARCOS-KLINIK FLENSBURG\nAkademisches Lehrkrankenhaus\nder Otto-Waalkes-Universität Borkum"
surrogate_hospital, probabilities, hospitals, k_used = get_hospital_surrogate(target_hospital, model, nn_model, nlp, hospital_names, healthcare_keywords)
    
print(f"\nSearch expanded to k={k_used}")
print(f"\nSampled Hospital: {surrogate_hospital}")
print("Probabilities associated with each hospital:")
for hospital, probability in zip(hospitals, probabilities):
    print(f"{hospital} (Probability: {probability:.4f})")

arcos-klinik flensburg akademisches lehrkrankenhaus der otto-waalkes-universität borkum
['Universitätsklinikum Brandenburg an der Havel', 'Universitätsklinikum des Saarlandes Innere Medizin', 'Universitätsklinikum Hamburg-Eppendorf', 'Facharzt für Allgemeinmedizin Lehrpraxis der Albert-Ludwigs-Universität Freiburg', 'Diakonissenkrankenhaus Augsburg', 'Universitätsklinikum Freiburg', 'Universitätsklinikum Jena', 'Berufsgenossenschaftliches Universitätsklinikum Bergmannsheil', 'Rheuma Liga Nordhessen', 'Ärztehaus Oskar Fromme'] 10
['klinik', 'lehrkrankenhaus', 'universität']
[('Facharzt für Allgemeinmedizin Lehrpraxis der Albert-Ludwigs-Universität Freiburg', 0.5722222222222222)]

Search expanded to k=10

Sampled Hospital: Facharzt für Allgemeinmedizin Lehrpraxis der Albert-Ludwigs-Universität Freiburg
Probabilities associated with each hospital:
Facharzt für Allgemeinmedizin Lehrpraxis der Albert-Ludwigs-Universität Freiburg (Probability: 1.0000)


In [32]:
import re
# Liste der Abkürzungen für medizinische Einrichtungen und Geschäftliche Formen
abbreviations = {
    # Medizinische Fachbereiche und Einrichtungen
    "HNO": "Hals-Nasen-Ohren",
    "MKG": "Mund-, Kiefer- und Gesichtschirurgie",
    "FA": "Facharzt",
    "ZA": "Zahnarzt",
    "KH": "Krankenhaus",
    "LKH": "Landeskrankenhaus",
    "MVZ": "Medizinisches Versorgungszentrum",
    "ZMVZ": "Zahnmedizinisches Versorgungszentrum",
    "PHV": "Patientenheimversorgung",
    "ZAR": "Zentrum für ambulante Rehabilitation",
    "KJPP": "Kinder- und Jugendpsychiatrie und Psychotherapie",
    "UK": "Universitätsklinikum",
    "BG": "Berufsgenossenschaftliches Krankenhaus",
    "REHA": "Rehabilitationsklinik",
    "KG": "Krankengymnastik",
    "KHB": "Krankenhausbetriebsgesellschaft",
    "SPZ": "Sozialpädiatrisches Zentrum",
    "EVK": "Evangelisches Krankenhaus",
    "CVK": "Christliches Krankenhaus",
    "DRK": "Deutsches Rotes Kreuz",
    "VKK": "Verbundkrankenhaus",
    "MLK": "Malteser Krankenhaus",
    "KFO": "Kieferorthopädische Fachklinik",
    "ZPM": "Zentrum für Psychische Gesundheit",
    "ZNA": "Zentrale Notaufnahme",
    "KFH": "Kuratorium für Dialyse und Nierentransplantation",
    "PKV": "Privatklinik für Versicherte",
    
    # Geschäftliche Rechtsformen
    "e.V.": "Eingetragener Verein",
    "GmbH": "Gesellschaft mit beschränkter Haftung",
    "KGaA": "Kommanditgesellschaft auf Aktien",
    "GmbH & Co. KG": "Kombination aus GmbH und Kommanditgesellschaft",
    "GbR": "Gesellschaft bürgerlichen Rechts",
    "AG": "Aktiengesellschaft",
    "OHG": "Offene Handelsgesellschaft",
    "SE": "Europäische Aktiengesellschaft",
    "PartG": "Partnerschaftsgesellschaft",
    "PartGmbB": "Partnerschaftsgesellschaft mit beschränkter Berufshaftung",
}



# Funktion zum Erstellen des regulären Ausdrucks und zum Ersetzen der Abkürzungen
def replace_abbreviation(text, abbreviations):
    # Precompiled regular expression pattern to match any of the abbreviations
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in abbreviations.keys()) + r')\b')
    
    # Replace abbreviations in the text using the dictionary
    return pattern.sub(lambda x: abbreviations[x.group()], text)


# List of strings to process
texts = ["HNO Privat-Praxis Dr. Ingo Reimold", "MKG Spezialist Dr. Fischer", "BG Klinik in der Nähe"]

# Apply the function to each line in texts
result = [replace_abbreviation(text, abbreviations) for text in texts]

# Display results
for line in result:
    print(line)


Hals-Nasen-Ohren Privat-Praxis Dr. Ingo Reimold
Mund-, Kiefer- und Gesichtschirurgie Spezialist Dr. Fischer
Berufsgenossenschaftliches Krankenhaus Klinik in der Nähe


In [66]:
import string
import gender_guesser.detector as gender

TITLES = {"dr","phil","univ","medic","dres","med","dipl","psych","dent","vet","habil","mult","rer","päd","nat"}

names = {'Beate Albers Frau': ['Wir', 'berichten', 'über', 'lhre', 'Patient'], 
         'Albers': ['Ii', '.', 'Handgelenk', 'Bei', 'Frau'], 
         'Siewert': ['behandelnde', 'Psychiaterin', 'Frau', 'Dr', '.'], 
         'Bernwart Schulze': ['freundlichen', 'kollegialen', 'Grüßen', 'Dr.med', '.']}

def is_title(token):
    return any(title in token.lower() for title in TITLES)

def has_female_suffix(token):
    return any(female_suffix in token for female_suffix in {"in","innen"})

def is_salutation(token):
    return token in {"Herr","Frau"}

def is_punctuation(token):
    # includes characters like . , ; ! ? - ( )
    return token in string.punctuation

def detect_gender(name, gender_guesser):
    """
    Detects the gender of a name based on preceding titles, salutations, suffixes, and a fallback gender guessing model.

    Parameters
    ----------
    name : tuple
        A tuple where:
        - The first element (str) is the primary name being analyzed.
        - The second element (list of str) is a list of preceding words providing context (e.g., titles or salutations).
    gender_guesser : object
        An instance of a gender guessing model or utility that provides a `get_gender` method.

    Returns
    -------
    str
        The detected gender, which can be:
        - "male": The default gender, assigned if a male salutation, suffix, or the fallback model identifies the name as male.
        - "female": Assigned if a female salutation, suffix, or the fallback model identifies the name as female.
    """
    for preceding_word in reversed(name[1]):
        # Skip punctuation
        if is_punctuation(preceding_word):
            continue

        # Check for titles
        if is_title(preceding_word):
            continue

        # Check for salutations
        if is_salutation(preceding_word):
            if preceding_word.lower() == "herr":
                return "male"
            elif preceding_word.lower() == "frau":
                return "female"

        # Check for female suffixes
        if has_female_suffix(preceding_word):
            return "female"
        
        # If a non-title, non-salutation, non-suffix word is encountered, stop processing
        break
    first_name, *_ = name[0].split()
    gender = gender_guesser.get_gender(first_name)
    print(gender)
    if gender == "unknown":
        gender = "male"
    return gender

gender_guesser = gender.Detector()    
gender = [detect_gender(name,gender_guesser) for name in names.items()]
print(gender)

female
unknown
['female', 'female', 'female', 'male']


# the following steps ensure no sensitive information is disclosed
step 1: remove 1:1 similarity 

step 2: detect sensitive names using a POS Tagger model from spacy which can detect pROPER NOUNS
step 3: remove query results containing THESE sensitve proper noun names

# the following steps serve two purposes it increases the likelihood of semantic similiar surrogate and it ensures that the similarity in those top names is based on the healthcare facility and not on the sensitve information decreasing likelihood of choosing sensitive surrogate
step 4: detect healthcare names with a list of facility keywords 
step 5: get top levenshtein distance names with the lowest distance from the query 

# picking the surrogate
step 6: sample from the remaining results which exclude results with sensitve proper nouns and prioritize results with low levenshtein distance to the semnatic relevant healthcase facility names

### Comparison Between Semantic Similarity Approach and Random Sampling:

**Objective:**
To demonstrate that using a semantic similarity-based approach for data extraction—specifically for identifying and preserving the meaning of semantic categories like "hospital"—is more effective than random sampling, both in terms of preserving utility and maintaining anonymity.

**Approach:**

1. **Semantic Similarity Approach:**
   - **Keyword-Based Selection:** Using predefined keywords to generate a list of semantically related hospitals.
   - **Annotation and Marking:** Identifying and marking terms that semantically relate to hospitals within the dataset.
   - **Preservation of Meaning:** Ensuring that terms related to the "hospital" category retain their semantic information during data selection or anonymization.

2. **Random Sampling Approach:**
   - **Random Selection:** Extracting a random subset of the dataset without considering semantic similarity.
   - **Loss of Information:** Risk of losing key terms or relationships that are important to the category "hospital," resulting in decreased utility.

---
anonymity and utility evaluation
utility - preserve semantic meaning?
anonymity - if one is sampled how likely is it to sample the original one again? 
-------
Könnten Stations namen in Hospital Location Annotation vorkommen? 
Falls ja wäre das nicht vom jetztigen datensatz abgedeckt. 

HNO - Hals-Nasen-Ohren-Heilkunde
MKG - Mund-, Kiefer- und Gesichtschirurgie
FA – Facharzt
ZA - Zahnarzt
KH – Krankenhaus
LKH - Landeskrankenhaus
MVZ – Medizinisches Versorgungszentrum
ZMVZ - Zahnmedizinisches Versorgungszentrum 
PHV - patientenheimversorgung
ZAR - Zentrum für ambulante Rehabilitation
KJPP - Kinder- und Jugendpsychiatrie und Psychotherapie
UK – Universitätsklinikum
BG – Berufsgenossenschaftliches Krankenhaus
REHA – Rehabilitationsklinik
KG - Krankengymnastik
KHB – Krankenhausbetriebsgesellschaft
SPZ – Sozialpädiatrisches Zentrum
EVK – Evangelisches Krankenhaus
CVK – Christliches Krankenhaus
DRK – Deutsches Rotes Kreuz
VKK – Verbundkrankenhaus
MLK – Malteser Krankenhaus
KFO – Kieferorthopädische Fachklinik
ZPM – Zentrum für Psychische Gesundheit
ZNA – Zentrale Notaufnahme
KFH – Kuratorium für Dialyse und Nierentransplantation
PKV – Privatklinik für Versicherte

e.V. – Eingetragener Verein
GmbH – Gesellschaft mit beschränkter Haftung
GbR – Gesellschaft bürgerlichen Rechts
AG – Aktiengesellschaft
OHG – Offene Handelsgesellschaft

Preprocessing of Query and Hospital Data
Add healthcare:specialty information to hospital data embedding.


In [1]:
import overpy

# Initialize the Overpass API
api = overpy.Overpass() # Read Only connection to OpenStreetMap

# Overpass QL query to get all relevant healthcare facilities in Germany
overpass_query = """
[out:json][timeout:180];
area["ISO3166-1"="DE"][admin_level=2];
(
  // Healthcare facilities
  node["healthcare"](area);
  way["healthcare"](area);
  relation["healthcare"](area);
);
out body;
"""

# Execute the Overpass query
result = api.query(overpass_query)

In [3]:
for node in result.nodes:
    print(node.tags)


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



{'addr:city': 'Rehburg-Loccum', 'addr:country': 'DE', 'addr:housenumber': '4', 'addr:postcode': '31547', 'addr:street': 'Allee', 'amenity': 'doctors', 'healthcare': 'doctor', 'healthcare:speciality': 'general', 'name': 'Werner Dralle', 'opening_hours': 'Mo,Tu,Th,Fr 09:00-12:00, Mo,Th 16:30-18:00'}
{'addr:city': 'Rehburg-Loccum', 'addr:country': 'DE', 'addr:housenumber': '2', 'addr:postcode': '31547', 'addr:street': 'Allee', 'healthcare': 'midwife', 'operator': 'Maren Drewes', 'phone': '+49 5037 3255'}
{'addr:city': 'Ebensfeld', 'addr:housenumber': '25', 'addr:postcode': '96250', 'addr:street': 'Kellerstraße', 'amenity': 'dentist', 'healthcare': 'dentist', 'healthcare:speciality': 'stomatology', 'name': 'Volker Sommer', 'operator': 'Volker Sommer'}
{'addr:city': 'Ebensfeld', 'addr:housenumber': '4', 'addr:postcode': '96250', 'addr:street': 'Kirchgasse', 'amenity': 'doctors', 'healthcare': 'doctor', 'healthcare:speciality': 'general;internal;emergency', 'name': 'Dr. med. Severin Huf', 'o

In [8]:
import spacy

# Load the SpaCy German model for NER
nlp = spacy.load("de_core_news_lg") # python -m spacy download de_dep_news_trf


# Example text
text = "Hausarzt Dr. med. Siebert"

# Process the text with the NER model
doc = nlp(text)

# Print the recognized entities and their labels
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")

for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}") # POS: NOUN(Common Noun) POS: PROPN (specific Names)

Siebert -> PER
Token: Hausarzt, POS: NOUN
Token: Dr., POS: NOUN
Token: med, POS: PROPN
Token: ., POS: PUNCT
Token: Siebert, POS: PROPN
