This notebook fetches every price entry with the code "84206", joins each entry with the details of the hospital that published it, and writes the result to ../data/84206.csv.

In [1]:
from urllib.parse import quote

from dotenv import dotenv_values
import pandas as pd
from sqlalchemy import create_engine, text
from sshtunnel import SSHTunnelForwarder

In [2]:
# Read the SSH and database settings from the .env file into a dict;
# the later cells look up every credential in envDict by key.
envDict = dotenv_values(dotenv_path="../secrets/.env")

In [3]:
# Open an SSH tunnel that forwards a local port to the mysql port on the server
server = SSHTunnelForwarder(
    # SSH host to log in to, and the account to log in as
    envDict["SSH_HOST"],
    ssh_username=envDict["SSH_USERNAME"],
    # Private key file used for the SSH login, and the password for that key
    ssh_pkey=envDict["SSH_PKEY_PATH"],
    ssh_private_key_password=envDict["SSH_PKEY_PASSWORD"],
    # Far end of the tunnel: (address, port) of mysql as reached from the SSH host;
    # the port comes out of the .env file as a string, hence the int()
    remote_bind_address=(
        envDict["DB_REMOTE_REF_URL"],
        int(envDict["DB_REMOTE_REF_PORT"]),
    ),
    # Rely only on the key given above: no SSH agent, no key directories to search
    allow_agent=False,
    host_pkey_directories=[],
)

# Bring the tunnel up; server.local_bind_port is read when the engine is created
server.start()

In [4]:
# Build the engine for the "hospital_price_transparency" database. The connection goes
# to 127.0.0.1 on the tunnel's local port, which forwards to the remote mysql server.
#
# The username and password are percent-encoded before being placed in the URL:
# a raw '@', ':', '/' or '%' in either value would otherwise be read as URL syntax
# and produce a wrong or unparseable connection string. safe="" makes quote() encode
# '/' as well. The .env file is expected to hold the raw, unencoded credentials.
engine = create_engine(
    "mysql+pymysql://{}:{}@{}:{}/{}".format(
        quote(envDict["DB_USERNAME"], safe=""),
        quote(envDict["DB_PASSWORD"], safe=""),
        "127.0.0.1",
        server.local_bind_port,
        "hospital_price_transparency",
    )
)

In [5]:
# Code whose price entries are fetched. It is defined once and sent to the database
# as a bound parameter (:code) instead of being spliced into each SQL string.
TARGET_CODE = "84206"

# All price rows for the code.
prices_query = text("SELECT * FROM prices WHERE code = :code")
# Only the hospitals that have at least one price row for the code.
hospitals_query = text(
    "SELECT * FROM hospitals "
    "WHERE npi_number IN (SELECT npi_number FROM prices WHERE code = :code)"
)
query_params = {"code": TARGET_CODE}

try:
    with engine.connect() as connection:
        prices_df = pd.read_sql_query(prices_query, con=connection, params=query_params)
        hospital_df = pd.read_sql_query(hospitals_query, con=connection, params=query_params)
finally:
    # Both frames are in memory now (or the fetch failed), so release the connection
    # pool and close the SSH tunnel instead of leaving them open for the life of the
    # kernel. The later cells shown in this notebook only use merged_df; to run this
    # cell again, re-run the tunnel and engine cells first.
    engine.dispose()
    server.stop()

# Attach the hospital columns to every price row of that hospital (inner join on NPI).
merged_df = pd.merge(prices_df, hospital_df, on='npi_number')

merged_df.head()

Unnamed: 0,code,npi_number,payer,price,name,url,street_address,city,state,zip_code,publish_date
0,84206,1003281452,Aetna,173.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,2021-01-01
1,84206,1003281452,Amerigroup,24.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,2021-01-01
2,84206,1003281452,Blue Cross,158.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,2021-01-01
3,84206,1003281452,CASH,908.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,2021-01-01
4,84206,1003281452,Cigna,391.0,Henderson Hospital,https://uhsfilecdn.eskycity.net/ac/henderson-h...,1050 West Galleria Drive,Henderson,NV,89011,2021-01-01


In [6]:
# Sanity check: list the distinct codes present in the merged result.
# The query filters on an exact code, so a single value is expected here.
uniques = pd.unique(merged_df["code"])
unique_total = len(uniques)
print("Unique count: ", unique_total)
print("Unique codes: ", uniques)

Unique count:  1
Unique codes:  ['84206']


In [7]:
# Write the merged result to disk as a UTF-8 encoded CSV file
output_path = '../data/84206.csv'
with open(output_path, mode='w', encoding="utf-8") as csv_file:
    merged_df.to_csv(
        csv_file,
        index=False,          # don't write the index to the csv as an unnamed column
        lineterminator='\n',  # prevents extra blank lines between rows
    )