In [4]:
import requests
import pandas as pd
import google.generativeai as genai
import pandas as pd
from rapidfuzz import fuzz, process
import json

NPI API to get the addressed of the hospitals in NY

In [5]:
def get_addresses_from_npi(hospitals, state='NY'):
    base_url = "https://npiregistry.cms.hhs.gov/api/"
    results = {}

    for hospital in hospitals:
        params = {
            "version": "2.1",
            "organization_name": hospital,
            "state": state,
            "limit": 10  # Increase limit to get more matches
        }
        try:
            response = requests.get(base_url, params=params)
            data = response.json()

            if data["result_count"] > 0:
                names_addresses = [
                    (
                        result.get("basic", {}).get("organization_name", ""),
                        next((addr for addr in result.get("addresses", []) if addr["address_purpose"] == "LOCATION"), None)
                    )
                    for result in data["results"]
                ]

                # Fuzzy match the hospital name
                best_match = process.extractOne(
                    hospital,
                    [name for name, addr in names_addresses if name],
                    scorer=fuzz.token_sort_ratio
                )

                if best_match:
                    matched_name = best_match[0]
                    for name, address in names_addresses:
                        if name == matched_name and address:
                            street = address.get("address_1", "")
                            street_2 = address.get("address_2", "")
                            city = address.get("city", "")
                            state_code = address.get("state", "")
                            raw_zip = address.get("postal_code", "")

                            full_zip = raw_zip[:5] if raw_zip else ""
                            full_street = f"{street} {street_2}".strip()

                            # Build the full address
                            full_address = f"{full_street}, {city}, {state_code} {full_zip}".strip(', ')

                            results[hospital] = full_address
                            break
                    else:
                        results[hospital] = None
                else:
                    results[hospital] = None
            else:
                results[hospital] = None
        except Exception as e:
            results[hospital] = f"Error: {str(e)}"

    return results

# Load your CSV and get unique hospital names
SPARCS_df = pd.read_csv('./Hospital_Inpatient_Discharges__SPARCS_De-Identified___Cost_Transparency__Beginning_2009_20250419.csv')
unique_hospital_names = list(SPARCS_df['Facility Name'].unique())


So we have 324 unique hospital names

In [6]:
len(unique_hospital_names)

324

In [7]:
hospital_to_add = get_addresses_from_npi(unique_hospital_names)

Unfortunately we can only find 183 of these hospital addresses from NPI

In [8]:
error_hospitals = []
for key, value in hospital_to_add.items():
    if value == None:
        error_hospitals.append(key)
len(error_hospitals)

141

Lets use google's genai to get the rest using prompt engineering

In [9]:
genai.configure(api_key="AIzaSyD3E7aq4GC9LD7_Kodxo7dsRVQSJXcgPvs")

def generate_address(hospital_names_string):
    prompt = f"""
        **Task:** Find the full street address for each hospital listed below.

        **Context:** These hospitals are expected to be located either within New York State (NY) or in states immediately adjacent to it (New Jersey - NJ, Connecticut - CT, Pennsylvania - PA, Massachusetts - MA, Vermont - VT). Please prioritize results within these specific states.

        **Input Hospital List:**
        {hospital_names_string}

        **Instructions:**
        1.  For each hospital name provided in the input list:
        2.  Search for the official name and primary location of the hospital.
        3.  Extract its full street address, including street number, street name, city, state abbreviation (e.g., NY, NJ, CT, PA, MA, VT), and ZIP code.
        4.  Focus your search on locations within NY, NJ, CT, PA, MA, or VT.
        5.  If a hospital name is ambiguous (e.g., multiple hospitals with similar names exist, even within the target states), try to identify the most prominent or likely match based on the name provided. If significant ambiguity remains (e.g., "General Hospital" exists in multiple relevant cities), please note the ambiguity and provide potential matches if possible, or put "Not Found").
        6.  If you cannot confidently locate a specific hospital from the list, please list its address as 'Not Found'.
        7.  If the address comed out like this 95 BRADHURST AVE PHARMACY DEPARTMENT Dont put the pharmacy department just keep 95 BRADHURST AVE 
        8.  If the address comes out like this  101, 103, 105, 107, 109 Jones Memorial Dr just select the smallest number and pass it on like 101 Jones Memorial Dr
        7.  DO NOT PUT ANY COMMENTS INSIDE OF THE JSON 

        **Output Format:**
        Please structure the output as a list of items, where each item clearly pairs the original hospital name with its found full address or a status note ("Not Found"). A JSON-like list of objects is preferred:

        Example:
        [
        {{
            "hospital_name": "Mount Sinai Hospital",
            "full_address": "1 Gustave L. Levy Pl, New York, NY, 10029"
        }},
        {{
            "hospital_name": "Fictional Care Center",
            "full_address": "Not Found"
        }}
        ]

        **Begin Processing:**
    """

    model = genai.GenerativeModel("gemini-1.5-flash")
    response = model.generate_content(prompt)
    
    return response.text.strip().removeprefix("```json").removesuffix("```").strip() if response else "Error: No response for " + hospital_names_string

I will only pass it 25 names at a time to keep the results more accurate. the next 3 cells are looped until the error hospitals length is decreased to something I can easily just search.

In [41]:
current_hospitals = error_hospitals[0:25]
hospital_names_string = "\n".join([f"- {name}" for name in current_hospitals])
cleaned_json_string = generate_address(hospital_names_string)
cleaned_json_string

'[\n  {\n    "hospital_name": "Woman\'s Christian Association",\n    "full_address": "Not Found"\n  },\n  {\n    "hospital_name": "Beth Israel Med Center-Kings Hwy Div",\n    "full_address": "Not Found"\n  },\n  {\n    "hospital_name": "Coler-Goldwater Spec Hosp&Nurs Fac - Goldwater Hospital Site",\n    "full_address": "Not Found"\n  },\n  {\n    "hospital_name": "Montefiore - NY West Square Division",\n    "full_address": "Not Found" \n  },\n  {\n    "hospital_name": "Mount Sinai Hospital - Queens Division",\n    "full_address": "Not Found"\n  },\n  {\n    "hospital_name": "New York Hospital Medical Center of Queens",\n    "full_address": "25-20 80th St, Flushing, NY 11373"\n  },\n  {\n    "hospital_name": "Seton Health System-St Mary\'s Campus",\n    "full_address": "Not Found"\n  },\n  {\n    "hospital_name": "St Francis Hospital - St Francis Hospital Beacon Div",\n    "full_address": "Not Found"\n  },\n  {\n    "hospital_name": "TLC Health Network Tri-County Memorial Hospital",\n  

In [42]:
import json

fetched_data = json.loads(cleaned_json_string)

for dict in fetched_data:
    hospital_to_add[dict['hospital_name']] = dict['full_address']

In [43]:
error_hospitals = []
for key, value in hospital_to_add.items():
    if value == None or value.lower() == 'not found' or 'ambiguous' in value.lower():
        error_hospitals.append(key)
len(error_hospitals)

14

Now that we only have 14 hospitals that cannot be located I'll just use google maps to get their addresses manually

In [44]:
error_hospitals

["Woman's Christian Association",
 'Beth Israel Med Center-Kings Hwy Div',
 'Coler-Goldwater Spec Hosp&Nurs Fac - Goldwater Hospital Site',
 'Montefiore - NY West Square Division',
 'Mount Sinai Hospital - Queens Division',
 'New York Hospital Medical Center   of Queens',
 "Seton Health System-St Mary's Campus",
 'St Francis Hospital - St Francis Hospital Beacon Div',
 'TLC Health Network Tri-County Memorial Hospital',
 'Sisters of Charity Hospital - St. Joseph Campus',
 'St. Francis Hospital & Heart Center',
 "Samaritan Hospital - St. Mary's Campus",
 'Garnet Health Medical Center - Catskills - G. Hermann Site',
 "St. Joseph's MC-St. Vincent's Westchester Division"]

In [45]:
hospital_to_add["Woman's Christian Association"] = '207 Foote Ave, Jamestown, NY 14701'
hospital_to_add['Beth Israel Med Center-Kings Hwy Div'] = '3201 Kings Hwy, Brooklyn, NY 11234'
hospital_to_add['Coler-Goldwater Spec Hosp&Nurs Fac - Goldwater Hospital Site'] = '900 Main St, Roosevelt Island, NY 10044'
hospital_to_add['Montefiore - NY West Square Division'] = ' 2475 St Raymond Ave, Bronx, NY 10461'
hospital_to_add['Mount Sinai Hospital - Queens Division'] = '25-10 30th Ave., Long Island City, NY 11102'
hospital_to_add['New York Hospital Medical Center   of Queens'] = '56-45 Main St, Flushing, NY 11355'
hospital_to_add["Seton Health System-St Mary's Campus"] = '1300 Massachusetts Ave, Troy, NY 12180'
hospital_to_add['St Francis Hospital - St Francis Hospital Beacon Div'] = '11 Hastings Dr, Beacon, NY 12508'
hospital_to_add['TLC Health Network Tri-County Memorial Hospital'] = '529 Central Ave. Dunkirk, New York 14048'
hospital_to_add['Sisters of Charity Hospital - St. Joseph Campus'] = '2605 Harlem Rd, Cheektowaga, NY 14225'
hospital_to_add['St. Francis Hospital & Heart Center'] = '100 Port Washington Blvd, Roslyn, NY 11576'
hospital_to_add["Samaritan Hospital - St. Mary's Campus"] = '1300 Massachusetts Ave, Troy, NY 12180'
hospital_to_add['Garnet Health Medical Center - Catskills - G. Hermann Site'] = '707 E Main St, Middletown, NY 10940'
hospital_to_add["St. Joseph's MC-St. Vincent's Westchester Division"] = '127 South Broadway Yonkers, New York 10701'


Sanity check

In [46]:
error_hospitals = []
for key, value in hospital_to_add.items():
    if value == None or value.lower() == 'not found':
        error_hospitals.append(key)
len(error_hospitals)

0

And here we go we have mapped all the hospital names to an address. Lets now store this in a json format so I can use them with geopandas during EDA.

In [47]:
with open("hospital_to_add.json", "w") as file:
    json.dump(hospital_to_add, file, indent=4)