In [14]:
import overpy
import spacy
import re

# Initialize the Overpass API
api = overpy.Overpass()  # Read Only connection to OpenStreetMap

# Load the SpaCy German model for NER
nlp = spacy.load("de_core_news_lg")

# Function to modify name by appending specialty
def modify_name(name, speciality):
    # Process speciality to replace ';' with ' / '
    if speciality:
        speciality = speciality.replace(';', ' / ')
        # Append the healthcare:speciality at the end with '/'
        modified_name = name + ' / ' + speciality
    else:
        modified_name = name
    return modified_name

# Overpass QL query to get all relevant healthcare facilities in Germany
overpass_query = """
[out:json][timeout:180];
area["ISO3166-1"="DE"][admin_level=2];
(
  // Healthcare facilities
  node["healthcare"](area);
  way["healthcare"](area);
  relation["healthcare"](area);
);
out body;
"""

try:
    # Execute the Overpass query
    result = api.query(overpass_query)

    # Initialize sets to store unique names
    all_facility_names = set()
    filtered_facility_names = set()
    person_names_removed = set()

    # Function to process an element and extract both the original name and speciality
    def process_element(element):
        # Get the "name" tag of the element
        name = element.tags.get("name")
        # Check if a name exists
        if name:
            name = name.strip()  # Remove any leading or trailing whitespace
            if name and name.lower() != "no name":
                # Get the "healthcare:speciality" tag, which provides specialty info
                speciality = element.tags.get("healthcare:speciality")
                # Clean up the speciality if it exists, otherwise set it to an empty string
                if speciality:
                    speciality = speciality.strip()  # Remove any whitespace around the speciality
                else:
                    speciality = ''  # Default to an empty string if there's no speciality
                return name, speciality  # Return both name and speciality
        # Return None, None if there's no valid name
        return None, None

    # Function to filter out person names
    def filter_person_names(name):
        doc = nlp(name)
        # Check if any entity in the name is labeled as PERSON with Spacy
        for ent in doc.ents:
            if ent.label_ == "PER":
                return False  # It's a person name
        return True  # It's a facility name

    # Combine nodes, ways, and relations into a single list for easier processing
    elements = result.nodes + result.ways + result.relations

    # Process each element (each healthcare facility) individually
    for element in elements:
        # Extract the original name and speciality
        original_name, speciality = process_element(element)
        
        # Only proceed if a valid name was found
        if original_name:
            modified_name = modify_name(original_name, speciality)
            all_facility_names.add(modified_name)
        
            # Check if the name appears to be a facility, not a person's name
            if filter_person_names(original_name):
                # If it's a facility name, add it to the filtered set
                filtered_facility_names.add(modified_name)
            else:
                person_names_removed.add(modified_name)

    # Save all facility names to a text file
    all_output_file = './data/OpenStreetMap_data/all_healthcare_facilities.txt'
    with open(all_output_file, 'w', encoding='utf-8') as f:
        for name in sorted(all_facility_names):
            f.write(name + '\n')

    # Save the modified filtered facility names to a text file
    modified_filtered_output_file = './data/OpenStreetMap_data/filtered_healthcare_facilities.txt'
    with open(modified_filtered_output_file, 'w', encoding='utf-8') as f:
        for name in filtered_facility_names:
            f.write(name + '\n')

    # Save the person names that were removed to a separate text file
    person_removed_file = './data/OpenStreetMap_data/person_names_removed.txt'
    with open(person_removed_file, 'w', encoding='utf-8') as f:
        for name in sorted(person_names_removed):
            f.write(name + '\n')

    # Compare the two lists and report counts
    total_all = len(all_facility_names)
    total_filtered = len(filtered_facility_names)
    total_removed = len(person_names_removed)

    print("\nSummary of Results:")
    print(f"Total healthcare facility names retrieved: {total_all}")
    print(f"Total unique healthcare facility names after filtering: {total_filtered}")
    print(f"Total person name entries removed: {total_removed}")

except overpy.exception.OverpassTooManyRequests:
    print("Error: Too many requests. Please wait and try again later.")
except overpy.exception.OverpassGatewayTimeout:
    print("Error: Gateway timeout. The server took too long to respond.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Summary of Results:
Total healthcare facility names retrieved: 63837
Total unique healthcare facility names after filtering: 23450
Total person name entries removed: 40387


# Add wrongly classified strings back to the Hospital List based on keywords

In [15]:
# Function to read a file
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        file_content = f.readlines()
    return file_content

# Function to append to a file (without overwriting)
def append_to_file(file_path, content):
    with open(file_path, 'a', encoding='utf-8') as f:
        f.writelines(content)
        
# Function to write a file (overwriting content)
def write_to_file(file_path, content):
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(content)
        
# helper function to extract the main name before '/'
def get_main_name(facility):
    """
    Extracts the main facility name before any '/' characters.

    Parameters:
    facility (str): The full facility string containing the name and additional info.

    Returns:
    str: The main facility name.
    """
    return facility.split('/')[0].strip()

# File paths
person_names_removed_path = "./data/OpenStreetMap_data/person_names_removed.txt"
filtered_healthcare_facilities_path = "./data/OpenStreetMap_data/filtered_healthcare_facilities.txt"

# Load the list of person names
person_names_removed = read_file(person_names_removed_path)

# List of substrings to search for
keywords = [
    # Allgemeine Begriffe
    "arzt", "ärzt", "chirurg", "gemeinschaft", "klinik", "logie", "ologe", 
    "medizin", "praxis", "sanatorium", "therapie", "ambulanz", 

    # Fachrichtungen und Behandlungen
    "anästhesie", "augen", "cardio", "dental", "derm", "endokrin", "gastro", "gyn", 
    "hämo", "hno", "kardio", "neuro", "onko", "optik", "ortho", "osteo", "pathie", 
    "pädie", "pneumo", "psych", "uro", "zahn", "zähne", "internist",

    # Verfahren und Diagnostik
    "blut", "ct", "diagnostik", "echo", "labor", "mrt", "radio", "rehabil", "spende",

    # Pflege und Behandlungsarten
    "betreuung", "ernährung", "geriatr", "hospiz", "intensiv", "palliativ", "pflege", 
    "physio", "rehaklinik", "therapeut",

    # Alternative Medizin
    "akupunkt", "heilpraktiker", "homöo", "naturheil",

    # Einrichtungen und Zentren
    "fach", "kranken", "notfall", "reha", "zentrum", "haus", "test",

    # Pädiatrie, Frauen und Spezialversorgung
    "diabetes", "frauen", "kinder", "lungen",

    # Zusätzliche Begriffe
    "apotheke", "arztpraxis", "behandl", "chirurgi", "gesundheitszentrum", 
    "klinisch", "untersuch"
]


# Initialize lists to store the matched and unmatched facilities
matched_facilities = []
unmatched_facilities = []

# Loop through each healthcare facility name
for facility in person_names_removed:
    # Extract the main name before '/'
    main_name = get_main_name(facility)
    
    # Check if any of the keywords are in the main name (case insensitive)
    contains_keywords = any(keyword.lower() in main_name.lower() for keyword in keywords)

    # Add facility to the matched list if it contains a keyword
    if contains_keywords:
        matched_facilities.append(facility)
    else:
        unmatched_facilities.append(facility)

# Save the matched facilities to the filtered_healthcare_facilities.txt file
append_to_file(filtered_healthcare_facilities_path, matched_facilities)

# Save the remaining (unmatched) facilities back to the person_names_removed.txt file
write_to_file(person_names_removed_path, unmatched_facilities)

# Remove Apoptheke from lists

In [16]:
# Define paths for the input and output files
person_names_removed_path = "./data/OpenStreetMap_data/person_names_removed.txt"
filtered_healthcare_facilities_path = "./data/OpenStreetMap_data/filtered_healthcare_facilities.txt"
apotheke_path = "./data/OpenStreetMap_data/apotheke_entries.txt"

def remove_and_store_apotheke_entries(input_path, output_path):
    # Read the input file
    with open(input_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Filter out lines that contain 'apotheke' and store them in a new list
    apotheke_entries = [line for line in lines if 'apotheke' in line.lower()]
    filtered_lines = [line for line in lines if 'apotheke' not in line.lower()]

    # Write the filtered lines back to the original file
    with open(input_path, 'w', encoding='utf-8') as file:
        file.writelines(filtered_lines)

    # Append the 'apotheke' entries to the output file
    if apotheke_entries:
        with open(output_path, 'a', encoding='utf-8') as file:
            file.writelines(apotheke_entries)

# Run the function for both input files
remove_and_store_apotheke_entries(person_names_removed_path, apotheke_path)
remove_and_store_apotheke_entries(filtered_healthcare_facilities_path, apotheke_path)


# Remove Doctor Names from Hospital list with Regex Pattern

In [17]:
import re

# File paths
filtered_healthcare_facilities_path = "./data/OpenStreetMap_data/filtered_healthcare_facilities.txt"
person_names_removed_path = "./data/OpenStreetMap_data/person_names_removed.txt"

# 1. Pattern to match one or more titles followed by optional punctuation and spaces
title_pattern = r"(?:(?:dr|phil|univ|medic|dres|med|dipl|psych|dent|vet)(?:\.|\b)\s*)+"

# 2. Name pattern when a title is present: up to two words, allowing hyphens and apostrophes
name_pattern_with_title = r"([A-Z][\w'-.]+(?:\s[A-Z][\w'-]+)?)"

# 3. Name pattern when no title is present: initial followed by a word
name_pattern_without_title = r"([A-Z]\.\s[A-Z][\w'-]+)\s*"

# 4. Full pattern for names with titles
full_pattern_with_title = rf"""
    ^                                   # Start of the string
    (?P<title>{title_pattern})          # One or more titles (mandatory for the first name)
    (?P<name>{name_pattern_with_title}) # First name following the title
    (?:\s*(?:&|/|und|\+)\s*             # Optional connector (&, /, und, or +) with spaces
        (?:{title_pattern}\s*)?         # Optional second title (may or may not be present)
        {name_pattern_with_title}       # Second name (title is optional here)
    )*                                  # Zero or more additional title-name pairs
    \s*$                                # End of the string, allowing for trailing whitespace
"""

# 5. Full pattern for names without titles
full_pattern_without_title = rf"^{name_pattern_without_title}$"

# 6. Compile the regex patterns with VERBOSE and IGNORECASE flags for readability and case-insensitivity
compiled_pattern_with_title = re.compile(full_pattern_with_title, re.IGNORECASE | re.VERBOSE)
compiled_pattern_without_title = re.compile(full_pattern_without_title, re.IGNORECASE)

# Load the filtered healthcare facilities file
def read_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.readlines()

facilities = read_file(filtered_healthcare_facilities_path)

# List to store recognized names
recognized_names = []
# List to store lines that don't match the regex (i.e., non-names)
remaining_lines = []

# Iterate over each line in the facilities file
for line in facilities:
    line = line.strip()  # Strip leading/trailing whitespaces and newline characters
    line_name = get_main_name(line)
    match = compiled_pattern_with_title.fullmatch(line_name)
    if match:
        # If a match is found, append the name (with or without title) to the recognized names
        recognized_names.append(line + '\n')  # Ensure newline
    else:
        match = compiled_pattern_without_title.fullmatch(line_name)
        if match:
            recognized_names.append(line + '\n')  # Ensure newline
        else:
            # If no match, retain the line in the remaining_lines
            remaining_lines.append(line + '\n')  # Ensure newline

# Function to append to a file (without overwriting)
def append_to_file(file_path, content):
    with open(file_path, 'a', encoding='utf-8') as f:
        f.writelines(content)

# Function to write a file (overwriting content)
def write_to_file(file_path, content):
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(content)

# Append the recognized names to the person_names_removed.txt file
append_to_file(person_names_removed_path, recognized_names)

# Write the remaining lines back to the filtered_healthcare_facilities.txt file
write_to_file(filtered_healthcare_facilities_path, remaining_lines)

# Combine two Hospital Lists

In [18]:
# Define the file paths
healthcare_path = "/home/mseiferling/vector_search/data/OpenStreetMap_data/filtered_healthcare_facilities.txt"
krankenhaus_path = "/home/mseiferling/vector_search/data/Krankenhaus.txt"
output_file = "/home/mseiferling/vector_search/data/OpenStreetMap_data/Combined_healthcare_facilities.txt"

# Initialize a dictionary to map main names to full entries
combined_hospitals = {}

# Read the contents of the first file and populate the dictionary
with open(healthcare_path, 'r', encoding='utf-8') as file1:
    for line in file1:
        full_name = line.strip()
        if not full_name:
            continue  # Skip empty lines
        main_name = get_main_name(full_name)
        combined_hospitals[main_name.lower()] = full_name  # Use lowercase for case-insensitive matching

# Read the contents of the second file and add new unique entries to the dictionary
with open(krankenhaus_path, 'r', encoding='utf-8') as file2:
    for line in file2:
        full_name = line.strip()
        if not full_name:
            continue  # Skip empty lines
        main_name = get_main_name(full_name)
        main_name_lower = main_name.lower()
        if main_name_lower not in combined_hospitals:
            combined_hospitals[main_name_lower] = full_name

# Prepare the combined list of full names
updated_hospitals = sorted(combined_hospitals.values())

# Save the combined list to the output file
with open(output_file, 'w', encoding='utf-8') as output:
    for hospital in updated_hospitals:
        output.write(hospital + '\n')

print(f"Combined list saved to {output_file}")

Combined list saved to /home/mseiferling/vector_search/data/OpenStreetMap_data/Combined_healthcare_facilities.txt


In [19]:
# Define the input and output file paths
input_file = "/home/mseiferling/vector_search/data/OpenStreetMap_data/Combined_healthcare_facilities.txt"
output_file = "/home/mseiferling/vector_search/data/OpenStreetMap_data/new_Combined_healthcare_facilities.txt"

# Open the input file, clean the lines, and write to the output file
with open(input_file, 'r', encoding='utf-8', errors='replace') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        # Replace unusual line terminators (PS, LS) with standard newlines
        cleaned_line = line.replace('\u2028', '\n').replace('\u2029', '\n').strip()  # LS = \u2028, PS = \u2029
        outfile.write(cleaned_line + '\n')


# Playground

In [2]:
import spacy

# Load the SpaCy German model for NER
nlp = spacy.load("de_core_news_lg") # python -m spacy download de_dep_news_trf


# Example text
text = "Allgemeinarztpraxis Dr. Killer"

# Process the text with the NER model
doc = nlp(text)

# Print the recognized entities and their labels
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")

for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}") # POS: NOUN(Common Noun) POS: PROPN (specific Names)

Allgemeinarztpraxis Dr. Killer -> MISC
Token: Allgemeinarztpraxis, POS: NOUN
Token: Dr., POS: NOUN
Token: Killer, POS: NOUN
