In [44]:
import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import joblib
import re

# Read the hospital names (one per line) from a UTF-8 text file
def load_hospital_names(text_file):
    """Return the non-blank lines of `text_file`, each stripped of surrounding whitespace.

    Args:
        text_file: Path to a UTF-8 encoded text file with one name per line.

    Returns:
        list[str]: The names in file order; whitespace-only lines are skipped.
    """
    with open(text_file, 'r', encoding='utf-8') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        return [name for name in stripped_lines if name]

# Encode all names in batches and stack the results into one matrix
def compute_embeddings(model, sentences, batch_size=512):
    """Embed `sentences` batch by batch and return the stacked embeddings.

    Every sentence is passed through `remove_non_alphanumeric` before encoding,
    so queries must be cleaned the same way to be comparable.

    Args:
        model: Object exposing `encode(list_of_str, show_progress_bar=..., convert_to_numpy=...)`.
        sentences: Sequence of strings to embed.
        batch_size: Number of sentences handed to `model.encode` per call.

    Returns:
        The per-batch arrays combined with `np.vstack`, in input order.
    """
    batch_starts = range(0, len(sentences), batch_size)
    chunks = []
    for start in tqdm(batch_starts, desc="Computing Embeddings"):
        cleaned = [remove_non_alphanumeric(name) for name in sentences[start:start + batch_size]]
        chunks.append(model.encode(cleaned, show_progress_bar=False, convert_to_numpy=True))
    return np.vstack(chunks)

# Build the nearest-neighbour index over the embeddings.
# NearestNeighbors is instance-based: it stores the fitted data and answers
# queries by comparing new points directly against the stored ones.
def fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine'):
    """Fit and return a `NearestNeighbors` index on `embeddings`.

    Args:
        embeddings: Array of vectors to index, one row per item.
        n_neighbors: Default number of neighbours returned by later queries.
        metric: Distance metric name passed through to `NearestNeighbors`.

    Returns:
        The fitted `NearestNeighbors` instance. Saving it is left to the caller.
    """
    # fit() returns the estimator itself, so construction and fitting can be chained.
    return NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(embeddings)

def remove_non_alphanumeric(input_string):
    """Drop every character that is not a word character, whitespace, '.' or ','.

    The explicit umlauts and 'ß' in the pattern are kept alongside `\\w`.
    Note that `\\w` also keeps the underscore.
    """
    allowed_char_matches = re.finditer(r'[\w\s.,äöüß]', input_string)
    return ''.join(match.group() for match in allowed_char_matches)

# Look up the names closest to a query string
def query_similar_hospitals(target_sentence, model, nn_model, hospital_names, top_k=5):
    """Return the `top_k` nearest hospital names to `target_sentence` and their scores.

    Args:
        target_sentence: Query string; cleaned with `remove_non_alphanumeric` before encoding.
        model: Embedding model exposing `encode(list_of_str, convert_to_numpy=...)`.
        nn_model: Fitted neighbour index exposing `kneighbors(X, n_neighbors=...)`.
        hospital_names: List of names; assumed to be in the same row order as the
            data `nn_model` was fitted on, since results are looked up by index.
        top_k: Number of neighbours to retrieve.

    Returns:
        tuple[list[str], list[float]]: The names and, aligned with them, `1 - distance`
        for each neighbour (a cosine similarity when the index uses cosine distance).
    """
    cleaned_query = remove_non_alphanumeric(target_sentence)
    query_vector = model.encode([cleaned_query], convert_to_numpy=True)

    neighbor_distances, neighbor_indices = nn_model.kneighbors(query_vector, n_neighbors=top_k)

    # Only one query row was submitted, so row 0 holds all of its neighbours.
    results = [hospital_names[position] for position in neighbor_indices[0]]
    similarity = [float(1 - gap) for gap in neighbor_distances[0]]
    return results, similarity

In [3]:
# Paths
data_path = '/home/mseiferling/vector_search/data/OpenStreetMap_data/Combined_healthcare_facilities.txt'
embeddings_path = '/home/mseiferling/vector_search/data/hospital_embeddings.npy'
nearest_neighbors_model_path = '/home/mseiferling/vector_search/data/nearest_neighbors_model.joblib'

# model
embedding_model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# The names and the encoder are needed whether or not a cache exists, so load them once.
hospital_names = load_hospital_names(data_path)
model = SentenceTransformer(embedding_model_name)

# Reuse the cached embeddings and index only when both files are present ...
cache_available = os.path.exists(embeddings_path) and os.path.exists(nearest_neighbors_model_path)
if cache_available:
    embeddings = np.load(embeddings_path)
    # NOTE: joblib.load unpickles the file; only load index files you created yourself.
    nn_model = joblib.load(nearest_neighbors_model_path)
    # ... and when they still describe the current names file. Neighbour indices are
    # used to look up hospital_names, so a row-count mismatch means the cache is stale.
    cache_available = embeddings.shape[0] == len(hospital_names)

if not cache_available:
    # Compute and save embeddings
    embeddings = compute_embeddings(model, hospital_names)
    np.save(embeddings_path, embeddings)

    # Fit and save the NearestNeighbors model
    nn_model = fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine')
    joblib.dump(nn_model, nearest_neighbors_model_path)

In [61]:
import spacy
import numpy as np
from Levenshtein import distance as levenshtein_distance

# Load the spaCy German pipeline once at module level; extract_sensitive_data
# reads the named-entity labels and POS tags it produces.
nlp = spacy.load("de_core_news_lg")

# Collect the words of a text that look like names (entities and proper nouns)
def extract_sensitive_data(text):
    """Return the distinct name-like words found in `text`.

    A word is collected when it is part of a spaCy entity labelled PER, ORG or
    LOC (multi-word entities are split on whitespace), or when its token is
    tagged PROPN.

    Args:
        text: The string to analyse with the module-level `nlp` pipeline.

    Returns:
        list[str]: Unique words in no guaranteed order, original casing preserved.
    """
    parsed = nlp(text)
    sensitive_entity_labels = ("PER", "ORG", "LOC")

    found = set()
    found.update(
        piece
        for entity in parsed.ents
        if entity.label_ in sensitive_entity_labels
        for piece in entity.text.split()
    )
    found.update(tok.text for tok in parsed if tok.pos_ == "PROPN")

    return list(found)

# Levenshtein distance scaled by the longer string's length
def normalize_levenshtein_distance(str1, str2):
    """Return the Levenshtein distance between the strings divided by the longer length.

    Two empty strings yield 0.0 instead of dividing by zero.
    """
    edit_count = levenshtein_distance(str1, str2)
    longest = max(len(str1), len(str2))
    return edit_count / longest if longest else 0.0

# Average best-match similarity between two word lists
def calculate_average_distance(target_sensitive_data, sampled_sensitive_data):
    """Score how well the target words are matched by the sampled words.

    For every target word the closest sampled word (smallest normalized
    Levenshtein distance) is found; those minima are averaged and the result is
    returned as `1 - average`. Despite the function name, the return value is
    therefore a similarity: higher means a closer match.

    Args:
        target_sensitive_data: Words taken from the query name.
        sampled_sensitive_data: Words of the candidate name to compare against.

    Returns:
        float: `1 - mean(min normalized distance)`. Returns 0.0 when either list
        is empty. Previously an empty candidate list left the running minimum
        at infinity and the function returned `-inf`.
    """
    if len(target_sensitive_data) == 0 or len(sampled_sensitive_data) == 0:
        return 0.0

    # Smallest distance from each target word to any sampled word.
    best_distances = [
        min(
            normalize_levenshtein_distance(target_substring, sampled_substring)
            for sampled_substring in sampled_sensitive_data
        )
        for target_substring in target_sensitive_data
    ]

    return 1 - (sum(best_distances) / len(best_distances))

# Lowercase substrings that mark a word as healthcare-related rather than a name.
# They are matched with a plain `keyword in word.lower()` substring test, so very
# short entries (e.g. "ct", "dr", "uro", "med") also match inside unrelated words.
# NOTE(review): "psych" appears twice (specialties and titles); this is harmless
# for membership tests.
keywords = [
    # General terms
    "arzt", "ärzt", "chirurg", "gemeinschaft", "klinik", "logie", "ologe", 
    "medizin", "praxis", "sanatorium", "therapie", "ambulanz", 

    # Specialties and treatments
    "anästhesie", "augen", "cardio", "dental", "derm", "endokrin", "gastro", "gyn", 
    "hämo", "hno", "kardio", "neuro", "onko", "optik", "ortho", "osteo", "pathie", 
    "pädie", "pneumo", "psych", "uro", "zahn", "zähne",

    # Procedures and diagnostics
    "blut", "ct", "diagnostik", "echo", "labor", "mrt", "radio", "rehabil", "spende",

    # Care and types of treatment
    "betreuung", "ernährung", "geriatr", "hospiz", "intensiv", "palliativ", "pflege", 
    "physio", "rehaklinik", "therapeut",

    # Alternative medicine
    "akupunkt", "heilpraktiker", "homöo", "naturheil",

    # Facilities and centres
    "fach", "kranken", "notfall", "reha", "zentrum", "haus", "test",

    # Paediatrics, women's health and specialised care
    "diabetes", "frauen", "kinder", "lungen",

    # Additional terms
    "apotheke", "behandl", "chirurgi", "gesundheitszentrum", 
    "klinisch", "untersuch",
    
    # Titles
    "dr","phil","univ","medic","dres","med","dipl","psych","dent","vet",
]

target_hospital = "Hausarzt Dr. med. Siebert"
top_k = 10
min_term_similarity = 0.5  # candidates scoring below this on the healthcare terms are dropped
temperature = 0.1          # values below 1 sharpen the sampling distribution
random_seed = None         # set to an int to make the sampled surrogate reproducible

print(f"\nQuerying for top {top_k} similar hospitals to '{target_hospital}':")
similar_hospitals, similarity_scores = query_similar_hospitals(
    target_hospital, model, nn_model, hospital_names, top_k=top_k
)

# Exclude exact matches (similarity score of 1). The comparison uses a tolerance
# because the score is a float, and names stay paired with their own scores so
# that dropping an entry cannot shift the scores onto the wrong names.
scored_candidates = [
    (hospital, score) for hospital, score in zip(similar_hospitals, similarity_scores)
    if not np.isclose(score, 1.0)
]
filtered_hospitals = [hospital for hospital, _ in scored_candidates]

print("\nTop similar hospitals:")
for hospital, score in scored_candidates:
    print(f"{hospital} (Similarity Score: {score:.4f})")

# Extract sensitive words from the target hospital name
extracted_sensitive_words = extract_sensitive_data(target_hospital)

# Remove sensitive words that falsely contain healthcare keywords
filtered_sensitive_words = [
    word for word in extracted_sensitive_words
    if not any(keyword in word.lower() for keyword in keywords)
]

# Exclude hospitals containing any of the filtered sensitive words.
# Both sides are lowercased: the extracted words keep their original casing
# (e.g. "Siebert"), so comparing them against a lowercased name never matched.
lowered_sensitive_words = [word.lower() for word in filtered_sensitive_words]
filtered_hospitals = [
    hospital for hospital in filtered_hospitals
    if not any(sensitive_word in hospital.lower() for sensitive_word in lowered_sensitive_words)
]

# Identify healthcare-related words in the target hospital name
healthcare_terms = [
    word for word in target_hospital.split()
    if any(keyword in word.lower() for keyword in keywords)
]

ranked_hospitals = []

# Score each candidate by how closely its words match the healthcare terms.
# calculate_average_distance returns 1 - average normalized Levenshtein distance,
# i.e. a similarity, so higher is better and the threshold is a lower bound.
for hospital in filtered_hospitals:
    term_similarity = calculate_average_distance(healthcare_terms, hospital.split())
    if term_similarity >= min_term_similarity:
        ranked_hospitals.append((hospital, term_similarity))

# Proceed only if there are ranked hospitals
if ranked_hospitals:
    # The variable names say "distance" for historical reasons; the values are similarities.
    distances = np.array([similarity for _, similarity in ranked_hospitals])

    # Apply sigmoid function to the scores
    sigmoid_distances = 1 / (1 + np.exp(-distances))

    # Apply temperature scaling to make the distribution sharper
    scaled_scores = sigmoid_distances ** (1 / temperature)

    # Normalize the scaled scores to create a probability distribution
    probabilities = scaled_scores / np.sum(scaled_scores)

    # Sample one hospital based on the computed probabilities
    rng = np.random.default_rng(random_seed)
    sampled_hospital = rng.choice(
        [hospital for hospital, _ in ranked_hospitals],
        p=probabilities
    )

    # Display the sampled hospital and associated probabilities
    print(f"\nSampled Hospital: {sampled_hospital}")
    print("Probabilities associated with each hospital:")
    for (hospital, _), probability in zip(ranked_hospitals, probabilities):
        print(f"{hospital} (Probability: {probability:.4f})")
else:
    print("\nNo hospitals met the distance criteria.")



Querying for top 10 similar hospitals to 'Hausarzt Dr. med. Siebert':

Top similar hospitals:
Hausarztpraxis Dr. Siegmund (Similarity Score: 0.9301)
Hausarztpraxis Dr. Wiese (Similarity Score: 0.8961)
Hausarztpraxis Dr. med. Steffen Walther (Similarity Score: 0.8832)
Hausarztpraxis Dr. med. Stephan Wismann (Similarity Score: 0.8802)
Praxis Dr. Siebecker (Similarity Score: 0.8640)
Arztpraxis Dr. med. Sibylle Diessner (Similarity Score: 0.8626)
Arztpraxis Dr. med. Christian Siebel (Similarity Score: 0.8604)
Hausarztpraxis Dr. Vondung und Kollegen (Similarity Score: 0.8435)
Hausarztpraxis Drs. Stütz (Similarity Score: 0.8409)
Arzt Drs. Kern & Wappelhorst (Similarity Score: 0.8403)

Sampled Hospital: Hausarztpraxis Dr. Wiese
Probabilities associated with each hospital:
Hausarztpraxis Dr. Siegmund (Probability: 0.0903)
Hausarztpraxis Dr. Wiese (Probability: 0.0903)
Hausarztpraxis Dr. med. Steffen Walther (Probability: 0.2035)
Hausarztpraxis Dr. med. Stephan Wismann (Probability: 0.2035)
Ar

# the following steps ensure no sensitive information is disclosed
step 1: remove exact matches (results with a similarity of 1 to the query)

step 2: detect sensitive names using a POS tagger model from spaCy, which can detect proper nouns
step 3: remove query results containing these sensitive proper nouns

# The following steps serve two purposes: they increase the likelihood of a semantically similar surrogate, and they ensure that the similarity among the top names is based on the healthcare facility rather than on the sensitive information, which decreases the likelihood of choosing a sensitive surrogate.
step 4: detect healthcare names with a list of facility keywords 
step 5: keep the candidate names with the lowest normalized Levenshtein distance to the healthcare terms of the query

# picking the surrogate
step 6: sample from the remaining results, which exclude results with sensitive proper nouns and prioritize results with a low Levenshtein distance to the semantically relevant healthcare facility names

### Comparison Between Semantic Similarity Approach and Random Sampling:

**Objective:**
To demonstrate that using a semantic similarity-based approach for data extraction—specifically for identifying and preserving the meaning of semantic categories like "hospital"—is more effective than random sampling, both in terms of preserving utility and maintaining anonymity.

**Approach:**

1. **Semantic Similarity Approach:**
   - **Keyword-Based Selection:** Using predefined keywords to generate a list of semantically related hospitals.
   - **Annotation and Marking:** Identifying and marking terms that semantically relate to hospitals within the dataset.
   - **Preservation of Meaning:** Ensuring that terms related to the "hospital" category retain their semantic information during data selection or anonymization.

2. **Random Sampling Approach:**
   - **Random Selection:** Extracting a random subset of the dataset without considering semantic similarity.
   - **Loss of Information:** Risk of losing key terms or relationships that are important to the category "hospital," resulting in decreased utility.

---
anonymity and utility evaluation
utility - preserve semantic meaning?
anonymity - if one is sampled how likely is it to sample the original one again? 
-------
Könnten Stationsnamen in der Hospital-Location-Annotation vorkommen?
Falls ja, wäre das nicht vom jetzigen Datensatz abgedeckt.

FA – Facharzt
Gyn – Gynäkologe: 
Uro – Urologe: 
Derm – Dermatologe: 
Päd – Pädiater: 
Radi – Radiologe: 
Neuro – Neurologe: 
Psych – Psychologe
ZA - Zahnarzt

KH – Krankenhaus
LKH - Landeskrankenhaus
MVZ – Medizinisches Versorgungszentrum
ZMVZ - Zahnmedizinisches Versorgungszentrum 
PHV – Patienten-Heimversorgung
ZAR - Zentrum für ambulante Rehabilitation
KJPP - Kinder- und Jugendpsychiatrie und Psychotherapie
UK – Universitätsklinikum
BG – Berufsgenossenschaftliches Krankenhaus
REHA – Rehabilitationsklinik
KHB – Krankenhausbetriebsgesellschaft
SPZ – Sozialpädiatrisches Zentrum
EVK – Evangelisches Krankenhaus
CVK – Christliches Krankenhaus
DRK – Deutsches Rotes Kreuz
VKK – Verbundkrankenhaus
MLK – Malteser Krankenhaus
KFO – Kieferorthopädische Fachklinik
ZPM – Zentrum für Psychische Gesundheit
ZNA – Zentrale Notaufnahme
KFH – Kuratorium für Dialyse und Nierentransplantation
PKV – Privatklinik für Versicherte

Preprocessing of Query and Hospital Data
Add `healthcare:speciality` information (the OpenStreetMap tag) to the hospital data embedding.


In [25]:
import overpy

# Initialize the Overpass API client
api = overpy.Overpass() # Read Only connection to OpenStreetMap

# Overpass QL query to get all relevant healthcare facilities in Germany.
# The query selects the area tagged ISO3166-1=DE with admin_level=2, then collects
# every node, way and relation inside it that carries a "healthcare" tag.
# The [timeout:180] setting in the query header is given in seconds.
overpass_query = """
[out:json][timeout:180];
area["ISO3166-1"="DE"][admin_level=2];
(
  // Healthcare facilities
  node["healthcare"](area);
  way["healthcare"](area);
  relation["healthcare"](area);
);
out body;
"""

# Execute the Overpass query (network call; the parsed response is kept in `result`)
result = api.query(overpass_query)

In [1]:
# for node in result.nodes:
#     print(node.tags)
    
#     healthcare:speciality

In [8]:
import spacy

# Load the spaCy German pipeline used for NER and POS tagging.
# NOTE(review): the previous inline hint was `python -m spacy download de_dep_news_trf`,
# which names a different pipeline from the one loaded here — confirm which is intended.
nlp = spacy.load("de_core_news_lg")


# Example text
text = "Hausarzt Dr. med. Siebert"

# Run the pipeline on the example text
doc = nlp(text)

# Print the recognized entities and their labels
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")

# Print the part-of-speech tag of every token
for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}") # POS: NOUN (common noun), POS: PROPN (proper noun, i.e. a specific name)

Siebert -> PER
Token: Hausarzt, POS: NOUN
Token: Dr., POS: NOUN
Token: med, POS: PROPN
Token: ., POS: PUNCT
Token: Siebert, POS: PROPN
