In [22]:
from urllib.parse import quote

import pandas as pd
from dotenv import dotenv_values
from sqlalchemy import create_engine, text
from sshtunnel import SSHTunnelForwarder

In [23]:
# Load credentials from the .env file.
# dotenv_values() yields an empty mapping when the file cannot be found, which
# would otherwise only surface further down as a KeyError inside the tunnel or
# engine cells. Check up front that every key read by the cells below exists.
envDict = dotenv_values("../secrets/.env")

_required_env_keys = (
    "SSH_HOST",
    "SSH_USERNAME",
    "SSH_PKEY_PATH",
    "SSH_PKEY_PASSWORD",
    "DB_REMOTE_REF_URL",
    "DB_REMOTE_REF_PORT",
    "DB_USERNAME",
    "DB_PASSWORD",
)
# Only presence is checked: an empty value (e.g. a key without a passphrase)
# is still allowed.
_missing_env_keys = [key for key in _required_env_keys if key not in envDict]
if _missing_env_keys:
    raise KeyError(
        "Missing keys in ../secrets/.env: {}".format(", ".join(_missing_env_keys))
    )

In [24]:
# Create a bind to forward connections on a local port to the mysql port on the server
server = SSHTunnelForwarder(
    # Host URL and Login
    envDict['SSH_HOST'],
    ssh_username = envDict['SSH_USERNAME'],
    # Private key for SSH connections
    ssh_pkey = envDict["SSH_PKEY_PATH"],
    ssh_private_key_password = envDict['SSH_PKEY_PASSWORD'],
    # Listen on the loopback interface only. Without this the forwarder binds
    # to 0.0.0.0, exposing the tunneled database port to every host that can
    # reach this machine. Port 0 keeps the "pick any free port" behaviour; the
    # chosen port is read back through server.local_bind_port.
    local_bind_address = ("127.0.0.1", 0),
    # Bind to mysql port on server
    remote_bind_address = (envDict["DB_REMOTE_REF_URL"], int(envDict["DB_REMOTE_REF_PORT"])),
    # Don't look for keys on the local machine
    allow_agent = False,
    host_pkey_directories = [],
)
# Start the tunneled connection. It stays open until server.stop() is called,
# so re-running this cell without stopping first opens an additional tunnel.
server.start()

In [25]:
# Create the engine to connect to the database using the tunnel.
# The username and password are percent-encoded before being placed in the URL:
# characters such as "@", ":" or "/" in a credential would otherwise be parsed
# as URL delimiters and the connection would fail or target the wrong host.
engine = create_engine(
    "mysql+pymysql://{user}:{password}@{host}:{port}/{database}".format(
        user=quote(envDict["DB_USERNAME"], safe=""),
        password=quote(envDict["DB_PASSWORD"], safe=""),
        # Local end of the SSH tunnel started in the previous cell
        host="127.0.0.1",
        port=server.local_bind_port,
        database="hospital_price_transparency",
    )
)

In [84]:
# This wildcard match crashes locally, takes about 35-40 seconds on the server

# Prefix of the codes to pull, and the maximum number of rows fetched per query.
# Both are passed as bound parameters so the two queries cannot drift apart and
# the prefix can be changed in one place.
CODE_PREFIX = "842"
ROW_LIMIT = 1000000
_query_params = {"code_pattern": CODE_PREFIX + "%", "row_limit": ROW_LIMIT}

with engine.connect() as connection:
    # All price rows whose code starts with CODE_PREFIX
    prices_df = pd.read_sql_query(
        text("SELECT * FROM prices WHERE code LIKE :code_pattern LIMIT :row_limit"),
        con=connection,
        params=_query_params,
    )
    # Every hospital that has at least one matching price row
    hospital_df = pd.read_sql_query(
        text(
            "SELECT * FROM hospitals WHERE npi_number in "
            "(SELECT npi_number FROM prices WHERE code LIKE :code_pattern) "
            "LIMIT :row_limit"
        ),
        con=connection,
        params=_query_params,
    )

# Attach the hospital details to each price row (inner join on the NPI number)
merged_df = pd.merge(prices_df, hospital_df, on='npi_number')

merged_df.head()

Unnamed: 0,code,npi_number,payer,price,name,url,street_address,city,state,zip_code,publish_date
0,842,1700979465,AETNA,14.94,Baptist Hospital,https://baptisthealthcare.pt.panaceainc.com/MR...,1000 West Moreno St,Pensacola,FL,32501,
1,842,1700979465,BC FL,15.69,Baptist Hospital,https://baptisthealthcare.pt.panaceainc.com/MR...,1000 West Moreno St,Pensacola,FL,32501,
2,842,1700979465,CHA HEALTH PLAN,14.94,Baptist Hospital,https://baptisthealthcare.pt.panaceainc.com/MR...,1000 West Moreno St,Pensacola,FL,32501,
3,842,1700979465,FL COMMUNITY CARE LTC,15.99,Baptist Hospital,https://baptisthealthcare.pt.panaceainc.com/MR...,1000 West Moreno St,Pensacola,FL,32501,
4,842,1700979465,LIGHTHOUSE MCAID PSN,15.99,Baptist Hospital,https://baptisthealthcare.pt.panaceainc.com/MR...,1000 West Moreno St,Pensacola,FL,32501,


In [85]:
# Sanity check: number of rows produced by the merge
print(merged_df.shape[0])

140835


In [91]:
# Inspect which distinct values appear in the "state" column of the merged data.
# NOTE(review): the output recorded for this cell includes entries that are not
# two-letter state codes (None, '73', 'L '), so the column likely needs cleaning
# before any per-state analysis.
foo = merged_df["state"].unique()
print(foo)

['FL' 'NE' 'SC' 'MS' 'HI' 'CA' 'TX' 'ME' 'MA' None 'RI' 'KS' 'NC' 'TN'
 'NH' 'MN' 'ID' 'CT' 'MI' 'MD' 'MO' 'NV' 'WI' 'AL' 'KY' 'VA' 'CO' 'OK'
 'DE' 'IL' 'AZ' 'NY' 'IN' 'ND' 'NJ' 'WA' 'WY' 'NM' 'GA' 'OR' 'UT' 'DC'
 'VT' 'WV' 'OH' 'PA' 'IA' '73' 'SD' 'L ' 'AR' 'LA']


In [93]:
# Export the merged result as UTF-8 CSV.
# pandas writes the file itself: this avoids building the whole CSV as one
# in-memory string first, and avoids the doubled carriage returns (blank lines
# between rows) that appear on Windows when to_csv() text is written through a
# text-mode file handle. The index is still written as the first, unnamed
# column, exactly as before.
merged_df.to_csv('842entries.csv', encoding="utf-8")