In [2]:
import json
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import joblib
import re

# Load hospital Names from a Text File
def load_hospital_names(text_file):
    """Read hospital names from a UTF-8 text file, one name per line.

    Args:
        text_file: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line, in file order. Leading and
        trailing whitespace is stripped; lines that are empty after stripping
        are skipped.
    """
    names = []
    with open(text_file, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            if cleaned:
                names.append(cleaned)
    return names

# Compute Embeddings
def compute_embeddings(model, sentences, batch_size=512):
    """Encode all sentences in batches and stack the results into one array.

    Every sentence is passed through remove_non_alphanumeric() before it is
    encoded, so the stored vectors describe the cleaned text.

    Args:
        model: Object exposing ``encode(list_of_str, show_progress_bar=...,
            convert_to_numpy=...)``.
        sentences: Sliceable sequence of strings.
        batch_size: Number of sentences handed to ``model.encode`` per call.

    Returns:
        The per-batch outputs of ``model.encode`` stacked row-wise with
        ``np.vstack``, in the order of ``sentences``.
    """
    batch_starts = range(0, len(sentences), batch_size)
    stacked_parts = []
    for start in tqdm(batch_starts, desc="Computing Embeddings"):
        cleaned_batch = [
            remove_non_alphanumeric(sentence)
            for sentence in sentences[start:start + batch_size]
        ]
        stacked_parts.append(
            model.encode(cleaned_batch, show_progress_bar=False, convert_to_numpy=True)
        )
    return np.vstack(stacked_parts)

# Fit and Save NearestNeighbors Model
# Instance-based learning is a category of machine learning that relies on storing the training data and making predictions based on the direct comparison of new instances with stored data
def fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine'):
    """Build a NearestNeighbors index over the given embeddings.

    Args:
        embeddings: Array of vectors to index (one row per item).
        n_neighbors: Default number of neighbours returned by later queries.
        metric: Distance metric passed to NearestNeighbors.

    Returns:
        The fitted NearestNeighbors estimator.
    """
    # fit() returns the estimator itself, so construction and fitting can be chained.
    return NearestNeighbors(n_neighbors=n_neighbors, metric=metric).fit(embeddings)

def remove_non_alphanumeric(input_string):
    """Drop every character that the keep-pattern does not match.

    Kept characters are regex word characters (letters, digits and the
    underscore), whitespace, '.' and ','; the German umlauts and 'ß' are
    listed explicitly in the pattern as well. Everything else (hyphens,
    brackets, quotes, ...) is removed without inserting a replacement.

    Args:
        input_string: Text to clean.

    Returns:
        The kept characters joined in their original order.
    """
    kept_chars = (match.group(0) for match in re.finditer(r'[\w\s.,äöüß]', input_string))
    return ''.join(kept_chars)

# Query Function
def query_similar_hospitals(target_sentence, model, nn_model, embeddings, hospital_names, top_k=5):
    """Return the ``top_k`` hospital names closest to ``target_sentence``.

    Args:
        target_sentence: Query text; cleaned with remove_non_alphanumeric()
            before encoding.
        model: Object exposing ``encode(list_of_str, convert_to_numpy=...)``.
        nn_model: Fitted neighbour index exposing ``kneighbors``.
        embeddings: Not used by this function; kept in the signature for
            existing callers.
        hospital_names: Sequence indexed by the positions ``nn_model`` returns.
        top_k: Number of neighbours to request.

    Returns:
        A list of ``(hospital_name, similarity)`` tuples in the order returned
        by ``nn_model``. ``similarity`` is ``1 - distance``, which is the
        cosine similarity when ``nn_model`` uses the cosine metric.
    """
    cleaned_query = remove_non_alphanumeric(target_sentence)
    query_vector = model.encode([cleaned_query], convert_to_numpy=True)

    neighbor_distances, neighbor_indices = nn_model.kneighbors(query_vector, n_neighbors=top_k)

    # Only one query row was submitted, so row 0 holds all neighbours.
    return [
        (hospital_names[position], float(1 - dist))
        for position, dist in zip(neighbor_indices[0], neighbor_distances[0])
    ]

  from tqdm.autonotebook import tqdm, trange


In [37]:
# Paths
# NOTE(review): absolute, user-specific paths; consider a configurable data directory.
data_path = '/home/mseiferling/vector_search/data/OpenStreetMap_data/Combined_healthcare_facilities.txt'
embeddings_path = '/home/mseiferling/vector_search/data/hospital_embeddings.npy'
nearest_neighbors_model_path = '/home/mseiferling/vector_search/data/nearest_neighbors_model.joblib'

# Embedding model
embedding_model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'

# The hospital names and the embedding model are needed whether or not a cache exists.
hospital_names = load_hospital_names(data_path)
model = SentenceTransformer(embedding_model_name)

if os.path.exists(embeddings_path) and os.path.exists(nearest_neighbors_model_path):
    # Reuse the cached embeddings and neighbour index.
    embeddings = np.load(embeddings_path)
    # NOTE: joblib.load unpickles the file; only load files you created yourself.
    nn_model = joblib.load(nearest_neighbors_model_path)
else:
    # Compute and cache the embeddings, then fit and cache the neighbour index.
    embeddings = compute_embeddings(model, hospital_names)
    np.save(embeddings_path, embeddings)

    nn_model = fit_nearest_neighbors(embeddings, n_neighbors=10, metric='cosine')
    joblib.dump(nn_model, nearest_neighbors_model_path)

# Neighbour indices are looked up in hospital_names, so a cache built from a
# different version of the names file would silently return wrong names.
if embeddings.shape[0] != len(hospital_names):
    raise ValueError(
        f"Cached embeddings ({embeddings.shape[0]} rows) do not match the hospital "
        f"list ({len(hospital_names)} names); delete the cache files and re-run."
    )

# Query
target_word = "child_psychiatry"
top_k = 10

print(f"\nQuerying for top {top_k} similar hospitals to '{target_word}':")
results = query_similar_hospitals(target_word, model, nn_model, embeddings, hospital_names, top_k=top_k)

# Remove candidates identical to the query (similarity = 1). Similarities are
# floats derived from cosine distances, so compare with a tolerance: an exact
# `!= 1` check misses self-matches that come out as e.g. 0.9999999.
results = [(hospital, score) for hospital, score in results if not np.isclose(score, 1.0)]

print("\nTop similar hospitals:")
for hospital, score in results:
    print(f"{hospital} (Similarity Score: {score:.4f})")


Querying for top 10 similar hospitals to 'child_psychiatry':

Top similar hospitals:
Kinder- und Jugendpsychiatrie (Similarity Score: 0.9098)
Facharzt für Kinder-/Jugendpsychiatrie und -psychotherapie (Similarity Score: 0.8957)
Praxis für Kinder- und Jugendpsychiatrie (Similarity Score: 0.8881)
Praxis für Kinder- und Jugendpsychiatrie und -psychotherapie (Similarity Score: 0.8825)
Praxis für Kinderneurologie & Jugendpsychiatrie (Similarity Score: 0.8784)
Praxisgemeinschaft Fachärzte für Kinder- und Jugendpsychiatrie und -psychotherapie (Similarity Score: 0.8766)
Praxis für Kinder- und Jugendpsychiatrie, –psychosomatik und –psychotherapie (Similarity Score: 0.8731)
Facharztpraxis für Kinder- und Jugendpsychiatrie und -psychotherapie (Similarity Score: 0.8696)
Kinder- und Jugendpsychiatrische Praxis (Similarity Score: 0.8685)
Kinder- und Jugendpsychotherapie (Similarity Score: 0.8656)


In [14]:
# `results` comes from the query cell: a list of (hospital_name, similarity) tuples.
# Every candidate may have been filtered out there, so fail with a clear message
# instead of an obscure NumPy error from sampling out of an empty list.
if not results:
    raise ValueError("No candidate hospitals to sample from: `results` is empty.")

# Split the (name, score) pairs once; both parts are reused below.
candidate_names = [hospital for hospital, _ in results]
scores = np.array([score for _, score in results])

# Apply sigmoid function to the scores
# NOTE(review): the similarities lie in a narrow range, so the sigmoid barely
# separates them and the resulting distribution is close to uniform.
sigmoid_scores = 1 / (1 + np.exp(-scores))

# Normalize the probabilities to sum to 1
probabilities = sigmoid_scores / np.sum(sigmoid_scores)

# Sample one of the hospitals using these probabilities (str() unwraps the NumPy string)
sampled_hospital = str(np.random.choice(candidate_names, p=probabilities))

# Output the sampled hospital and the probabilities
print(f"Sampled Hospital: {sampled_hospital}")
print("Probabilities associated with each hospital:")
for hospital, prob in zip(candidate_names, probabilities):
    print(f"{hospital} (Probability: {prob:.4f})")

Sampled Hospital: Dr. med. P. Müller - Chirurg
Probabilities associated with each hospital:
Facharzt Michael Müller (Probability: 0.1015)
Krankengymnastik-Praxis Müller (Probability: 0.1005)
Praxis Dr. Müller (Probability: 0.1001)
Dr.med. C. Müller HNO (Probability: 0.0999)
Hausarzt Mathias Müller (Probability: 0.0999)
Dr. med. P. Müller - Chirurg (Probability: 0.0998)
Facharzt für Orthopädie Bertram Müller (Probability: 0.0997)
Arztpraxis Christoph Müller (Probability: 0.0996)
Dr. Richard Müller (Orthopädie) (Probability: 0.0995)
Kinderarzt Dr. Müller (Probability: 0.0994)


In [15]:
import spacy
import numpy as np
from Levenshtein import distance as levenshtein_distance

# Load the spaCy German pipeline "de_core_news_lg" once at module level.
# extract_sensitive_data() uses it for named-entity labels and POS tags.
nlp = spacy.load("de_core_news_lg")

# Function to extract proper nouns and named entities
def extract_sensitive_data(text):
    """Collect the potentially identifying words of ``text``.

    The text is processed with the module-level spaCy pipeline ``nlp``. A word
    is collected when it is either
      * a whitespace-separated part of a named entity labelled "PER", "ORG"
        or "LOC", or
      * a token tagged as proper noun ("PROPN").

    Args:
        text: Text to analyse.

    Returns:
        A sorted list of the unique collected words. Sorting makes the result
        reproducible; iterating a set of strings gives a run-dependent order.
    """
    doc = nlp(text)

    # Multi-word entities are split so they can be compared word by word later.
    entity_words = {
        word
        for ent in doc.ents
        if ent.label_ in ("PER", "ORG", "LOC")
        for word in ent.text.split()
    }
    proper_nouns = {token.text for token in doc if token.pos_ == "PROPN"}

    return sorted(entity_words | proper_nouns)

# Function to normalize Levenshtein distance
def normalize_levenshtein_distance(str1, str2):
    """Return the Levenshtein distance of two strings scaled by the longer length.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        ``distance / max(len(str1), len(str2))``, or 0.0 when both strings
        are empty (avoids a division by zero).
    """
    edit_distance = levenshtein_distance(str1, str2)
    longest = len(str1) if len(str1) >= len(str2) else len(str2)
    return 0.0 if longest == 0 else edit_distance / longest

# Function to calculate average Levenshtein distance
def calculate_average_distance(target_sensitive_data, sampled_sensitive_data):
    """Average, over the target substrings, of the distance to the closest sampled substring.

    For each entry of ``target_sensitive_data`` the smallest normalized
    Levenshtein distance to any entry of ``sampled_sensitive_data`` is taken;
    the result is the mean of those minima.

    Args:
        target_sensitive_data: Sensitive substrings of the original text.
        sampled_sensitive_data: Sensitive substrings of the sampled surrogate.

    Returns:
        0.0 when there are no target substrings. Otherwise the mean minimum
        distance. When there is nothing to compare against
        (``sampled_sensitive_data`` is empty) each target substring counts
        with 1.0, the normalized distance of any non-empty string to the
        empty string, instead of propagating ``inf``.
    """
    num_comparisons = len(target_sensitive_data)
    if num_comparisons == 0:
        return 0.0

    total_distance = 0.0
    for target_substring in target_sensitive_data:
        # Closest match among the sampled substrings; 1.0 if there are none.
        total_distance += min(
            (
                normalize_levenshtein_distance(target_substring, sampled_substring)
                for sampled_substring in sampled_sensitive_data
            ),
            default=1.0,
        )

    return total_distance / num_comparisons


# Inputs: `target_word` and `sampled_hospital` are read from earlier cells
# (the query cell and the sampling cell). The commented-out pair below is a
# hand-made example that can be enabled to try the metric in isolation.
# target_word = "Urologie im 'Käthchenhof' Dr. Schönau"
# sampled_hospital = "Urologie im Käthhof dr. Schönstein"

# Extract the sensitive substrings (named-entity words and proper nouns) from both texts
target_sensitive_data = extract_sensitive_data(target_word)
sampled_sensitive_data = extract_sensitive_data(sampled_hospital)

# Average, over the target substrings, of the normalized Levenshtein distance
# to the closest substring of the sampled hospital
average_distance = calculate_average_distance(target_sensitive_data, sampled_sensitive_data)

# Output the results
print(f"Target sensitive data: {target_sensitive_data}")
print(f"Sampled hospital sensitive data: {sampled_sensitive_data}")
print(f"Average normalized Levenshtein distance: {average_distance:.4f}") # fraction of differing characters: 0 = every target substring has an identical match

Target sensitive data: ['Müller']
Sampled hospital sensitive data: ['P.', 'Müller']
Average normalized Levenshtein distance: 0.0000


### Comparison Between Semantic Similarity Approach and Random Sampling:

**Objective:**
To demonstrate that using a semantic similarity-based approach for data extraction—specifically for identifying and preserving the meaning of semantic categories like "hospital"—is more effective than random sampling, both in terms of preserving utility and maintaining anonymity.

**Approach:**

1. **Semantic Similarity Approach:**
   - **Keyword-Based Selection:** Using predefined keywords to generate a list of semantically related hospitals.
   - **Annotation and Marking:** Identifying and marking terms that semantically relate to hospitals within the dataset.
   - **Preservation of Meaning:** Ensuring that terms related to the "hospital" category retain their semantic information during data selection or anonymization.

2. **Random Sampling Approach:**
   - **Random Selection:** Extracting a random subset of the dataset without considering semantic similarity.
   - **Loss of Information:** Risk of losing key terms or relationships that are important to the category "hospital," resulting in decreased utility.

---
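**Evaluation of anonymity and utility (open questions):**

- **Utility:** Does the sampled surrogate preserve the semantic meaning of the original term?
- **Anonymity:** Once a surrogate has been sampled, how likely is it that the original term is sampled again?

---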
Könnten Stationsnamen in der Hospital-Location-Annotation vorkommen?
Falls ja, wäre das vom jetzigen Datensatz nicht abgedeckt.

FA – Facharzt
Gyn – Gynäkologe: 
Uro – Urologe: 
Derm – Dermatologe: 
Päd – Pädiater: 
Radi – Radiologe: 
Neuro – Neurologe: 
Psych – Psychologe
ZA - Zahnarzt

KH – Krankenhaus
LKH - Landeskrankenhaus
MVZ – Medizinisches Versorgungszentrum
ZMVZ - Zahnmedizinisches Versorgungszentrum 
PHV – Patienten-Heimversorgung
ZAR - Zentrum für ambulante Rehabilitation
KJPP - Kinder- und Jugendpsychiatrie und Psychotherapie
UK – Universitätsklinikum
BG – Berufsgenossenschaftliches Krankenhaus
REHA – Rehabilitationsklinik
KHB – Krankenhausbetriebsgesellschaft
SPZ – Sozialpädiatrisches Zentrum
EVK – Evangelisches Krankenhaus
CVK – Christliches Krankenhaus
DRK – Deutsches Rotes Kreuz
VKK – Verbundkrankenhaus
MLK – Malteser Krankenhaus
KFO – Kieferorthopädische Fachklinik
ZPM – Zentrum für Psychische Gesundheit
ZNA – Zentrale Notaufnahme
KFH – Kuratorium für Dialyse und Nierentransplantation
PKV – Privatklinik für Versicherte

Preprocessing of Query and Hospital Data
Add the `healthcare:speciality` information (OpenStreetMap tag) to the hospital data embedding.


In [25]:
import overpy

# Initialize the Overpass API client (read-only access to OpenStreetMap data)
api = overpy.Overpass()

# Overpass QL query to get all healthcare facilities in Germany.
# The area is stored in the named set `.searchArea`. Inside a union each
# statement writes its result to the default set, so an unnamed `(area)`
# filter only sees the area in the first statement; the `way` and `relation`
# statements would then not be filtered by the Germany area.
overpass_query = """
[out:json][timeout:180];
area["ISO3166-1"="DE"][admin_level=2]->.searchArea;
(
  // Healthcare facilities
  node["healthcare"](area.searchArea);
  way["healthcare"](area.searchArea);
  relation["healthcare"](area.searchArea);
);
out body;
"""

# Execute the Overpass query (network request; can take a while for a whole country)
result = api.query(overpass_query)

In [30]:
# for node in result.nodes:
#     print(node.tags)
    
#     healthcare:speciality