In [140]:
import overpy
import spacy

# Overpass client; it is only used below to run a single query via api.query().
api = overpy.Overpass() # Read Only connection to OpenStreetMap

# German spaCy pipeline; its NER labels are used below to spot person names.
nlp = spacy.load("de_core_news_lg")

# Overpass QL query for healthcare-tagged OSM elements in Germany:
#   * [out:json][timeout:180]  -> JSON output, 180 s timeout setting
#   * area[...]                -> the area tagged ISO3166-1=DE with admin_level=2
#   * node/way/relation        -> union of all three element types that carry a
#                                 "healthcare" tag inside that area
#   * out body                 -> output statement for the collected elements
overpass_query = """
[out:json][timeout:180];
area["ISO3166-1"="DE"][admin_level=2];
(
  // Healthcare facilities
  node["healthcare"](area);
  way["healthcare"](area);
  relation["healthcare"](area);
);
out body;
"""

try:
    # Only the network call is guarded. Everything that follows runs in the
    # `else` branch so that NER or file-system failures surface with a full
    # traceback instead of being collapsed into a one-line message that lets
    # later cells continue with stale or missing files.
    result = api.query(overpass_query)
except overpy.exception.OverpassTooManyRequests:
    print("Error: Too many requests. Please wait and try again later.")
except overpy.exception.OverpassGatewayTimeout:
    print("Error: Gateway timeout. The server took too long to respond.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")
else:
    # Sets de-duplicate names that appear on more than one OSM element.
    all_facility_names = set()
    filtered_facility_names = set()
    person_names_removed = set()

    def extract_name(element):
        """Add the element's cleaned ``name`` tag to ``all_facility_names``.

        Elements without a name, with a whitespace-only name, or with the
        placeholder "no name" (any casing) are ignored.
        """
        name = element.tags.get("name")
        if not name:
            return
        name = name.strip()
        if name and name.lower() != "no name":
            all_facility_names.add(name)

    def filter_person_names(name):
        """Return True if ``name`` should be kept as a facility name.

        Returns False as soon as spaCy tags any entity in ``name`` with the
        label "PER", i.e. the string is treated as a person name.
        """
        doc = nlp(name)
        for ent in doc.ents:
            if ent.label_ == "PER":
                return False  # It's a person name
        return True  # It's a facility name

    def _write_sorted(path, names):
        """Write ``names`` to ``path`` in sorted order, one per line (UTF-8)."""
        with open(path, 'w', encoding='utf-8') as f:
            for name in sorted(names):
                f.write(name + '\n')

    # Nodes, ways and relations are handled identically.
    for element in (*result.nodes, *result.ways, *result.relations):
        extract_name(element)

    # Save all facility names to a text file
    all_output_file = './data/OpenStreetMap_data/all_healthcare_facilities.txt'
    _write_sorted(all_output_file, all_facility_names)

    # Split the names into "kept as facility" and "removed as person name"
    for name in all_facility_names:
        if filter_person_names(name):
            filtered_facility_names.add(name)
        else:
            person_names_removed.add(name)

    # Save the filtered facility names to a text file
    filtered_output_file = './data/OpenStreetMap_data/filtered_healthcare_facilities.txt'
    _write_sorted(filtered_output_file, filtered_facility_names)

    # Save the person names that were removed to a separate text file
    person_removed_file = './data/OpenStreetMap_data/person_names_removed.txt'
    _write_sorted(person_removed_file, person_names_removed)

    # Compare the two lists and report counts
    total_all = len(all_facility_names)
    total_filtered = len(filtered_facility_names)
    total_removed = len(person_names_removed)

    print("\nSummary of Results:")
    print(f"Total healthcare facility names retrieved: {total_all}")
    print(f"Total unique healthcare facility names after filtering: {total_filtered}")
    print(f"Total person name entries removed: {total_removed}")


Summary of Results:
Total healthcare facility names retrieved: 62584
Total unique healthcare facility names after filtering: 22647
Total person name entries removed: 39937


# Add wrongly classified strings back to the Hospital List based on keywords

In [141]:
def read_file(file_path):
    """Return the lines of the UTF-8 text file at ``file_path`` as a list.

    Each element keeps its line ending, exactly as stored in the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return list(handle)

def append_to_file(file_path, content):
    """Append every string in ``content`` to ``file_path`` (UTF-8).

    Existing file content is kept; no line endings are added, so each item
    must already carry its own newline.
    """
    with open(file_path, 'a', encoding='utf-8') as handle:
        for chunk in content:
            handle.write(chunk)
        
def write_to_file(file_path, content):
    """Replace the content of ``file_path`` with the strings in ``content``.

    The file is truncated first and written as UTF-8; no line endings are
    added, so each item must already carry its own newline.
    """
    with open(file_path, 'w', encoding='utf-8') as handle:
        for chunk in content:
            handle.write(chunk)

# Files written by the NER filtering cell: entries rejected as person names
# and entries kept as facility names.
person_names_removed_path = "./data/OpenStreetMap_data/person_names_removed.txt"
filtered_healthcare_facilities_path = "./data/OpenStreetMap_data/filtered_healthcare_facilities.txt"

# Load the rejected entries as a list of lines; read_file() returns
# f.readlines(), so every element still ends with its newline.
person_names_removed = read_file(person_names_removed_path)

# Lower-case substrings that mark an entry as a healthcare facility even
# though it was classified as a person name. Matching is done by plain
# substring search on the lower-cased entry, so word stems are sufficient.
# Every item must be followed by a comma: adjacent string literals are
# silently concatenated by Python ("therapeut" "heil" -> "therapeutheil").
keywords = [
    # General terms
    "klinik", "praxis", "arzt", "ärzt", "therapie", "haus", "medizin", "zentrum",
    "chirurg", "pflege", "ambulanz", "sanatorium", "ologe", "gemeinschaft", "logie", "psychiatrie",

    # Specialties and treatments
    "herz", "uro", "neuro", "kardio", "onko", "gyn", "pneumo",
    "derm", "endokrin", "psycho", "anästhesie", "zahn", "zähne", "optik", "hno",
    "ortho", "osteo", "pathie", "augen", "hämo", "dental", "gastro", "pädie",

    # Common procedures and diagnostic terms
    "labor", "mrt", "ct", "diagnostik", "radio", "rehabil", "blut",
    "spende", "echo",

    # Types of care and treatment
    "physio", "palliativ", "intensiv", "betreuung", "hospiz",
    "geriatr", "rehaklinik", "ernährung", "therapeut",

    # Alternative medicine
    "heil", "natur", "homöo", "akupunkt",

    # Facilities and centers
    "logo", "fach", "kranken", "notfall", "prax", "chir", "reha",

    # Pediatric, women's and specialty care
    "kinder", "frauen", "diabetes", "lungen",
]

# Partition the entries that were rejected as person names: anything that
# contains at least one facility keyword is treated as wrongly classified.
matched_facilities = []
unmatched_facilities = []

# Normalise the keywords once instead of once per (entry, keyword) pair.
keywords_lower = [keyword.lower() for keyword in keywords]

for facility in person_names_removed:
    # Case-insensitive substring test against every keyword.
    facility_lower = facility.lower()
    if any(keyword in facility_lower for keyword in keywords_lower):
        matched_facilities.append(facility)
    else:
        unmatched_facilities.append(facility)

# Save the matched facilities to the filtered_healthcare_facilities.txt file
# (appended, so the entries already in that file are kept).
append_to_file(filtered_healthcare_facilities_path, matched_facilities)

# Save the remaining (unmatched) facilities back to the person_names_removed.txt file
# (overwritten, so the matched entries disappear from it).
write_to_file(person_names_removed_path, unmatched_facilities)


# Remove Apotheke (pharmacy) entries from the lists

In [142]:
# The two name lists that are cleaned in place, plus the file that collects
# every entry containing "apotheke" removed from either of them.
person_names_removed_path = "./data/OpenStreetMap_data/person_names_removed.txt"
filtered_healthcare_facilities_path = "./data/OpenStreetMap_data/filtered_healthcare_facilities.txt"
apotheke_path = "./data/OpenStreetMap_data/apotheke_entries.txt"

def remove_and_store_apotheke_entries(input_path, output_path):
    """Move every line containing "apotheke" from one file to another.

    The match is a case-insensitive substring test. ``input_path`` is
    rewritten without the matching lines; the matching lines are appended to
    ``output_path``, which is only opened when there is something to append.
    """
    with open(input_path, 'r', encoding='utf-8') as source:
        lines = source.readlines()

    # Single pass: route each line to the list it belongs to.
    kept, pharmacies = [], []
    for line in lines:
        bucket = pharmacies if 'apotheke' in line.lower() else kept
        bucket.append(line)

    # Overwrite the input with the lines that stay.
    with open(input_path, 'w', encoding='utf-8') as target:
        target.writelines(kept)

    # Append the removed lines to the collection file.
    if pharmacies:
        with open(output_path, 'a', encoding='utf-8') as sink:
            sink.writelines(pharmacies)

# Clean both name lists; the removed entries of both end up in apotheke_path.
for source_path in (person_names_removed_path, filtered_healthcare_facilities_path):
    remove_and_store_apotheke_entries(source_path, apotheke_path)


# Remove Doctor Names from Hospital list with Regex Pattern

In [153]:
import re

# File paths: the facility list that is filtered in place and the person-name
# list that receives the entries recognised below.
filtered_healthcare_facilities_path = "./data/OpenStreetMap_data/filtered_healthcare_facilities.txt"
person_names_removed_path = "./data/OpenStreetMap_data/person_names_removed.txt"

# 1. Pattern to match one or more titles followed by optional punctuation and spaces.
#    Each title abbreviation must be followed by a literal "." or end at a word
#    boundary, then any amount of whitespace.
title_pattern = r"(?:(?:dr|phil|univ|medic|dres|med|dipl|psych|dent|vet)(?:\.|\b)\s*)+"

# 2. Name pattern when a title is present: up to two words, allowing hyphens and apostrophes.
#    NOTE(review): inside the first character class, `'-.` is a *range* from
#    "'" to "." and therefore also admits ( ) * + and , in the first word; the
#    second word uses `[\w'-]`, where the trailing hyphen is literal. Confirm
#    whether the wider set is intended before changing it, since it affects
#    which lines are classified as names.
name_pattern_with_title = r"([A-Z][\w'-.]+(?:\s[A-Z][\w'-]+)?)"

# 3. Name pattern when no title is present: initial followed by a word
#    (e.g. "A. Example"), optionally followed by trailing whitespace.
name_pattern_without_title = r"([A-Z]\.\s[A-Z][\w'-]+)\s*"

# 4. Full pattern for names with titles. The whitespace layout and the "#"
#    comments inside the string are only legal because the pattern is compiled
#    with re.VERBOSE below.
full_pattern_with_title = rf"""
    ^                                   # Start of the string
    (?P<title>{title_pattern})          # One or more titles (mandatory for the first name)
    (?P<name>{name_pattern_with_title}) # First name following the title
    (?:\s*(?:&|/|und|\+)\s*             # Optional connector (&, /, und, or +) with spaces
        (?:{title_pattern}\s*)?         # Optional second title (may or may not be present)
        {name_pattern_with_title}       # Second name (title is optional here)
    )*                                  # Zero or more additional title-name pairs
    \s*$                                # End of the string, allowing for trailing whitespace
"""

# 5. Full pattern for names without titles
full_pattern_without_title = rf"^{name_pattern_without_title}$"

# 6. Compile the regex patterns with VERBOSE and IGNORECASE flags for readability and case-insensitivity.
#    NOTE(review): IGNORECASE applies to the whole pattern, so `[A-Z]` in the
#    name patterns also matches lower-case letters and capitalisation is not
#    actually enforced. The `^`/`$` anchors are redundant with the fullmatch()
#    calls used in the classification loop.
compiled_pattern_with_title = re.compile(full_pattern_with_title, re.IGNORECASE | re.VERBOSE)
compiled_pattern_without_title = re.compile(full_pattern_without_title, re.IGNORECASE)

# Load the filtered healthcare facilities file
def read_file(file_path):
    """Read the UTF-8 text file at ``file_path`` and return its lines.

    Line endings are preserved on every element of the returned list.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        lines = list(handle)
    return lines

facilities = read_file(filtered_healthcare_facilities_path)

# Lines that consist solely of a doctor's name (with or without title) ...
recognized_names = []
# ... and every other line, which stays in the facility list.
remaining_lines = []

for raw_line in facilities:
    # Compare without surrounding whitespace / newline.
    entry = raw_line.strip()

    # The titled pattern is tried first, the "initial + surname" pattern second;
    # a full match of either one marks the whole line as a person name.
    is_person_name = (
        compiled_pattern_with_title.fullmatch(entry)
        or compiled_pattern_without_title.fullmatch(entry)
    )

    # Either way the stripped line is stored with exactly one trailing newline.
    destination = recognized_names if is_person_name else remaining_lines
    destination.append(entry + '\n')

def append_to_file(file_path, content):
    """Write the strings in ``content`` to the end of ``file_path`` (UTF-8).

    Nothing already in the file is touched and no newlines are inserted.
    """
    with open(file_path, 'a', encoding='utf-8') as handle:
        for piece in content:
            handle.write(piece)

def write_to_file(file_path, content):
    """Overwrite ``file_path`` with the strings in ``content`` (UTF-8).

    Any previous content is discarded and no newlines are inserted.
    """
    with open(file_path, 'w', encoding='utf-8') as handle:
        for piece in content:
            handle.write(piece)

# Append the recognized names to the person_names_removed.txt file
# (append mode: the entries already stored there are kept).
append_to_file(person_names_removed_path, recognized_names)

# Write the remaining lines back to the filtered_healthcare_facilities.txt file
# (write mode: the file now contains only the lines that did not match).
write_to_file(filtered_healthcare_facilities_path, remaining_lines)


# Combine two Hospital Lists

In [12]:
# Define the file paths
# NOTE(review): these are absolute, user-specific paths, whereas the earlier
# cells address the same data directory relative to the working directory
# ("./data/OpenStreetMap_data/..."). Confirm which form is intended.
healthcare_path = "/home/mseiferling/vector_search/data/OpenStreetMap_data/filtered_healthcare_facilities.txt"
krankenhaus_path = "/home/mseiferling/vector_search/data/Krankenhaus.txt"
output_file = "/home/mseiferling/vector_search/data/OpenStreetMap_data/Combined_healthcare_facilities.txt"

# Read both lists as sets of stripped lines. The encoding is stated explicitly
# (as in every other cell) so the result does not depend on the platform's
# default encoding.
with open(healthcare_path, 'r', encoding='utf-8') as file1:
    hospitals_in_file1 = {line.strip() for line in file1}

with open(krankenhaus_path, 'r', encoding='utf-8') as file2:
    hospitals_in_file2 = {line.strip() for line in file2}

# Blank lines strip to '' and would otherwise be written out as an empty
# first entry of the sorted result.
hospitals_in_file1.discard('')
hospitals_in_file2.discard('')

# Hospitals that are in the second file but not in the first file
new_hospitals = hospitals_in_file2 - hospitals_in_file1

# Combine the contents of the first file with the new hospitals
updated_hospitals = hospitals_in_file1 | new_hospitals

# Save the updated list, sorted for a stable, diff-friendly file
with open(output_file, 'w', encoding='utf-8') as output:
    output.writelines(hospital + '\n' for hospital in sorted(updated_hospitals))

In [13]:
# Source list and the cleaned copy written by this cell
input_file = "/home/mseiferling/vector_search/data/OpenStreetMap_data/Combined_healthcare_facilities.txt"
output_file = "/home/mseiferling/vector_search/data/OpenStreetMap_data/new_Combined_healthcare_facilities.txt"

# Undecodable bytes are replaced while reading (errors='replace'); every line is
# then normalised and written out with a plain "\n" terminator.
with open(input_file, 'r', encoding='utf-8', errors='replace') as infile, \
        open(output_file, 'w', encoding='utf-8') as outfile:
    for raw_line in infile:
        # LS (U+2028) and PS (U+2029) become ordinary newlines first, ...
        normalised = raw_line.replace('\u2028', '\n')
        normalised = normalised.replace('\u2029', '\n')
        # ... then surrounding whitespace is dropped and one "\n" is appended.
        outfile.write(f"{normalised.strip()}\n")


In [5]:
import spacy

# Load the SpaCy German model for NER
nlp = spacy.load("de_core_news_lg") # install with: python -m spacy download de_core_news_lg


# Example text: an all-caps person name, used to inspect how the model labels it
text = "PETER CARSTENS"

# Process the text with the NER model
doc = nlp(text)

# Print the recognized entities and their labels.
# In the recorded output only "CARSTENS" is reported, with label ORG rather
# than PER.
for ent in doc.ents:
    print(f"{ent.text} -> {ent.label_}")

# Print the part-of-speech tag of every token for comparison
for token in doc:
    print(f"Token: {token.text}, POS: {token.pos_}")

CARSTENS -> ORG
Token: PETER, POS: PROPN
Token: CARSTENS, POS: PROPN
