In [5]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from geopy.extra.rate_limiter import RateLimiter
import time

In [6]:


# Nominatim geocoder client used by the geocoding cell below.
geolocator = Nominatim(user_agent="HPI-DE-Disease-Scanner", timeout=5)  # <-- increase timeout

In [7]:
# Load datasets
# Table of stroke units read from CSV; later cells read its "name" column
# and, when present, its "address" column.
strokeunit_df = pd.read_csv("stroke_units_geocoded.csv")





In [8]:

# Load each OPS export and label its rows with the OPS code embedded in the
# file name (the text after the last "_" up to the first following ".").
dfs = []
for export_path in ('custom_results_8-98b.csv', 'custom_results_8-981.csv'):
    export_df = pd.read_csv(export_path)
    ops_code = export_path.split('_')[-1].split('.')[0]
    dfs.append(export_df.assign(code=ops_code))


def _join_unique_codes(codes):
    """Return the distinct values of ``codes`` in sorted order, joined by '&'."""
    return '&'.join(sorted(codes.unique()))


# One row per (ik, name, full_address): case counts summed, OPS codes combined.
results_df = (
    pd.concat(dfs)
    .groupby(['ik', 'name', 'full_address'], as_index=False)
    .agg(
        total_cases=('total_cases', 'sum'),
        code=('code', _join_unique_codes),
    )
)

In [9]:
def normalize_name(name):
    """Return ``name`` trimmed and lower-cased; any non-string value yields ""."""
    if not isinstance(name, str):
        return ""
    return name.strip().lower()

# Add a normalised copy of the clinic name to both tables for fuzzy matching.
for clinic_table in (strokeunit_df, results_df):
    clinic_table["name_cleaned"] = clinic_table["name"].apply(normalize_name)

In [10]:
# Preview the merged results, highest case count first, with a fresh 0..n-1 index.
(
    results_df
    .sort_values(by="total_cases", ascending=False)
    .reset_index(drop=True)
)

Unnamed: 0,ik,name,full_address,total_cases,code,name_cleaned
0,261101878,Vivantes - Netzwerk für Gesundheit GmbH,"Aroser Allee 72-76, 13407 Berlin",4176,8-981,vivantes - netzwerk für gesundheit gmbh
1,261101015,Charité - Universitätsmedizin Berlin,"Charitéplatz 1, 10117 Berlin",2065,8-981,charité - universitätsmedizin berlin
2,260530283,Universitätsklinikum Köln,"Kerpener Straße 62, 50937 Köln",1674,8-981,universitätsklinikum köln
3,260510666,Kliniken Maria Hilf GmbH,"Viersener Straße 450, 41063 Mönchengladbach",1636,8-981,kliniken maria hilf gmbh
4,260950099,Klinikum Nürnberg Süd,"Prof.-Ernst-Nathan-Str. 1, 90419 Nürnberg",1501,8-981,klinikum nürnberg süd
...,...,...,...,...,...,...
462,260660441,Asklepios Klinik Bad Wildungen GmbH - Stadtkli...,"Brunnenallee 19, 34537 Bad Wildungen",5,8-98b,asklepios klinik bad wildungen gmbh - stadtkli...
463,260510906,Helios Universitätsklinikum Wuppertal - Campus...,"Arrenberger Str. 20, 42117 Wuppertal",5,8-981,helios universitätsklinikum wuppertal - campus...
464,261200402,Immanuel Klinik Rüdersdorf,"Seebad 82/83, 15562 Rüdersdorf bei Berlin",5,8-981,immanuel klinik rüdersdorf
465,260510256,Helios St. Johannes Klinik,"Dieselstr. 185, 47166 Duisburg",5,8-98b,helios st. johannes klinik


In [11]:
def is_probably_same_clinic(name, address, strokeunit_df, threshold=90):
    """Tell whether a clinic fuzzy-matches any row of ``strokeunit_df``.

    Both inputs are passed through ``normalize_name`` and compared with
    ``fuzz.token_set_ratio`` against each row's "name_cleaned" value and its
    normalised "address" value (an absent "address" is treated as "").

    Parameters
    ----------
    name : clinic name to look up.
    address : clinic address to look up.
    strokeunit_df : DataFrame of known clinics; rows must carry "name_cleaned".
    threshold : score that must be strictly exceeded for a match (default 90).

    Returns
    -------
    bool
        True as soon as one row scores above ``threshold`` on the name or on
        the address; False when no row does.
    """
    wanted_name = normalize_name(name)
    wanted_address = normalize_name(address)

    def _best_score(known):
        # Higher of the two similarity scores for one known clinic.
        return max(
            fuzz.token_set_ratio(wanted_name, known["name_cleaned"]),
            fuzz.token_set_ratio(wanted_address, normalize_name(known.get("address", ""))),
        )

    return any(_best_score(known) > threshold for _, known in strokeunit_df.iterrows())

In [12]:
# Keep only clinics that reported more than 100 OPS procedures.
relevant_results = results_df.loc[results_df["total_cases"].gt(100)]

# Row-wise fuzzy check against the known stroke units; clinics without a
# match are the "missing" ones.
already_listed = relevant_results.apply(
    lambda clinic: is_probably_same_clinic(
        clinic["name"], clinic["full_address"], strokeunit_df
    ),
    axis=1,
)
missing_clinics = relevant_results[~already_listed].copy()

In [13]:
# Renumber the rows 0..n-1 (the old index is discarded) and display the table.
missing_clinics = missing_clinics.reset_index(drop=True)
missing_clinics

Unnamed: 0,ik,name,full_address,total_cases,code,name_cleaned
0,260100660,AMEOS Klinikum Heiligenhafen,"Friedrich-Ebert-Straße 100, 23774 Heiligenhafen",302,8-981,ameos klinikum heiligenhafen
1,260101865,Westküstenkliniken Brunsbüttel und Heide gGmbH,"Esmarchstraße 50, 25746 Heide",968,8-981,westküstenkliniken brunsbüttel und heide ggmbh
2,260310367,HELIOS Klinikum Gifhorn GmbH,"Campus 6, 38518 Gifhorn",334,8-98b,helios klinikum gifhorn gmbh
3,260310539,Helios St. Marienberg Klinik Helmstedt GmbH,"Conringstraße 26, 38350 Helmstedt",369,8-98b,helios st. marienberg klinik helmstedt gmbh
4,260310595,HELIOS Albert-Schweitzer-Klinik Northeim [NOM],"Albert-Schweitzer-Weg 1 , 37154 Northeim",256,8-98b,helios albert-schweitzer-klinik northeim [nom]
...,...,...,...,...,...,...
109,261500882,HELIOS Klinik Sangerhausen,"Am Beinschuh 2a, 06526 Sangerhausen",621,8-981,helios klinik sangerhausen
110,261501008,AMEOS Klinikum Haldensleben,"Kiefholzstraße 27, 39340 Haldensleben",225,8-981,ameos klinikum haldensleben
111,261520023,Asklepios Klinik Weißenfels,"Naumburger Straße 76, 06667 Weißenfels",116,8-98b,asklepios klinik weißenfels
112,261600521,Hufeland Klinikum GmbH,"Rudolph-Weiss-Straße 1-5, 99947 Bad Langensalza",322,8-98b,hufeland klinikum gmbh


In [14]:
# Throttle the geocoder call itself rather than the row-level wrapper:
#  * every HTTP request (address lookup AND name fallback) is spaced out, and
#  * RateLimiter actually sees the service errors it is meant to retry.
# Wrapping safe_geocode instead made max_retries / error_wait_seconds dead
# configuration, because safe_geocode catches every exception itself.
rate_limited_geocode = RateLimiter(
    geolocator.geocode,
    min_delay_seconds=2,
    max_retries=3,
    error_wait_seconds=10,
)


def safe_geocode(row):
    """Geocode one clinic row, falling back from its address to its name.

    Parameters
    ----------
    row : mapping with "full_address" and "name" entries (e.g. a DataFrame row).

    Returns
    -------
    The geocoder result for the address if there is one, otherwise the result
    for the clinic name, otherwise None. None is also returned when a lookup
    raises an error that the rate limiter does not handle itself.
    """
    try:
        result = rate_limited_geocode(row["full_address"])
        if result is None:
            print(f"Address failed, trying clinic name: {row['name']}")
            result = rate_limited_geocode(row["name"])
        return result
    except Exception as e:
        # Best effort: one bad row must not abort the whole batch.
        print(f"Geocoding failed for: {row['name']} ({row['full_address']}) – {e}")
        return None


# Backward-compatible alias: throttling now happens inside safe_geocode.
rate_limited = safe_geocode

# Keep the raw geocoder result, then split out the coordinates (NaN when the
# lookup produced nothing).
missing_clinics["geocode"] = missing_clinics.apply(safe_geocode, axis=1)
missing_clinics["latitude"] = missing_clinics["geocode"].apply(lambda x: x.latitude if x else np.nan)
missing_clinics["longitude"] = missing_clinics["geocode"].apply(lambda x: x.longitude if x else np.nan)

Address failed, trying clinic name: Hufeland Klinikum GmbH


In [15]:
# Placeholder values for the newly found clinics: no level, not a thrombectomy center.
missing_clinics = missing_clinics.assign(level=np.nan, is_thrombectomy_center=False)

# Mirror full_address into address when only the former is present.
missing_columns = missing_clinics.columns
if "full_address" in missing_columns and "address" not in missing_columns:
    missing_clinics["address"] = missing_clinics["full_address"]

# Append the new clinics, restricted to the columns both tables share.
cols_to_use = strokeunit_df.columns.intersection(missing_clinics.columns)
extended_df = pd.concat(
    [strokeunit_df, missing_clinics.loc[:, cols_to_use]],
    ignore_index=True,
)



In [16]:
# Save to file
# index=False keeps the DataFrame's row index out of the written CSV.
extended_df.to_csv("stroke_units_extended_geocoded.csv", index=False)

print("Extended file saved as: stroke_units_extended_geocoded.csv")

Extended file saved as: stroke_units_extended_geocoded.csv
