In [1]:
import re
import time
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.select import Select

In [2]:
#Navigate the QCOR site to the Tennessee Critical Access Hospital listing and
#collect the six-digit provider numbers found in its drill-down links.

#seconds to pause after each navigation step before touching the next page
PAGE_LOAD_PAUSE = 2

#initiate Selenium and accept Terms of Use
driver = webdriver.Chrome()
#poll up to 10 seconds for each element so a slow page load does not fail the lookup immediately
driver.implicitly_wait(10)

try:
    try:
        driver.get("https://qcor.cms.gov/")
        ActionChains(driver)\
            .send_keys(Keys.RETURN)\
            .perform()
    except Exception:
        #best-effort step: errors are still ignored, but KeyboardInterrupt/SystemExit
        #are no longer swallowed the way a bare except would
        pass

    checkbox = driver.find_element(by=By.NAME, value="chkCertiCon")
    checkbox.click()

    button = driver.find_element(by=By.NAME, value="submit")
    button.click()

    #move through pages to the results of the Hospital Active Provider query
    time.sleep(PAGE_LOAD_PAUSE)

    hospital_link = driver.find_element(by=By.CSS_SELECTOR, value='[href="report_select.jsp?which=3"]')
    hospital_link.click()

    time.sleep(PAGE_LOAD_PAUSE)

    apsc_hospital = driver.find_element(by=By.CSS_SELECTOR, value='[href="hospital_wizard.jsp?which=3&report=active.jsp"]')
    apsc_hospital.click()

    time.sleep(PAGE_LOAD_PAUSE)

    #specifying current calendar, as opposed to fiscal, year
    year_box = driver.find_element(by=By.NAME, value="year_type")
    select_ybox = Select(year_box)
    select_ybox.select_by_value("CY")

    #selecting only Critical Access Hospitals ("All Other Hospital Types" is default, must deselect)
    ptype_box = driver.find_element(by=By.NAME, value="view_cah")
    select_ptype = Select(ptype_box)
    select_ptype.deselect_by_visible_text("All Other Hospital Types")
    select_ptype.select_by_visible_text("CAHs Only")

    time.sleep(PAGE_LOAD_PAUSE)

    run_report = driver.find_element(by=By.CSS_SELECTOR, value='[onclick="javascript: doSubmit();"]')
    run_report.click()

    time.sleep(PAGE_LOAD_PAUSE)

    #drilling down to Tennessee facilities
    atl_link = driver.find_element(by=By.CSS_SELECTOR, value='[name="R04"]')
    atl_link.click()

    time.sleep(PAGE_LOAD_PAUSE)

    tn_link = driver.find_element(by=By.CSS_SELECTOR, value='[name="STN"]')
    tn_link.click()

    time.sleep(PAGE_LOAD_PAUSE)

    #grab the rendered HTML while the session is still open
    content = driver.page_source
finally:
    #quit() ends the whole WebDriver session, and runs even when a step above raises,
    #so a failed run does not leave a browser behind
    driver.quit()

#create soup; the parser is named explicitly so the result does not depend on
#which optional parsers happen to be installed
soup = BS(content, "html.parser")

#pull out the facility certification numbers
drill = soup.find_all("a", attrs={"class" : "drill"})
href = [x.get("href") for x in drill]
#build from a dict so column 0 exists even when no drill links were found
href_df = pd.DataFrame({0: href}, dtype="object")
#raw string: \d is not a valid escape sequence in a normal string literal
prov_nums = href_df[0].str.extract(r'(\d{6})')
#links without a six-digit number extract as NaN and are dropped
prov_nums = prov_nums.rename(columns = {0 : 'prov_num'}).dropna()

In [3]:
#Visit the detail page of every provider number in prov_nums and collect the
#facility fields into cah_qcor_df (one row per facility).

def _value_at_rows(details, label, candidate_rows):
    """Return, as a string, the column-1 value of the first candidate row labelled ``label``.

    details        -- DataFrame whose column 0 holds field labels and column 1 the values
    label          -- exact label text to look for in column 0
    candidate_rows -- row index values to check, in order

    Returns the string 'None' when no candidate row carries the label. Candidate
    rows that are missing from the table are skipped rather than raising KeyError.
    """
    for row_pos in candidate_rows:
        if row_pos in details.index and details.loc[row_pos, 0] == label:
            return str(details.loc[row_pos, 1])
    return 'None'

#Create lists for desired facility information
facility_name = []
cert_num = []
facility_type = []
address = []
city_state = []
phone = []
participation_date = []
accreditation_org = []
accreditation_type = []
number_beds = []
ownership_type = []
facility_subtype = []

#initiate Selenium and accept Terms of Use
driver = webdriver.Chrome()

try:
    try:
        driver.get("https://qcor.cms.gov/")
        ActionChains(driver)\
            .send_keys(Keys.RETURN)\
            .perform()
    except Exception:
        #best-effort step: errors are still ignored, but KeyboardInterrupt/SystemExit
        #are no longer swallowed the way a bare except would
        pass

    checkbox = driver.find_element(by=By.NAME, value="chkCertiCon")
    checkbox.click()

    button = driver.find_element(by=By.NAME, value="submit")
    button.click()

    #access hospital facility certification numbers
    for prov_id in prov_nums['prov_num']:
        #access page with each facility's details (year_value is pinned to 2022)
        driver.get(f"https://qcor.cms.gov/active_popup.jsp?prvdr_intrnl_num={prov_id}&year_value=2022&jump=1&provider=01&which=3")

        #create soup; the parser is named explicitly so the result does not depend on
        #which optional parsers happen to be installed
        content = driver.page_source
        soup = BS(content, "html.parser")

        #pull out table that contains facility details, convert to df for easier access to contents
        table = soup.find_all("table")[1]
        #wrap in StringIO: passing literal HTML text to read_html is deprecated
        df = pd.read_html(StringIO(str(table)))[0]

        #pull out table components of interest (fixed row positions)
        name = df.loc[0, 1]
        cnum = df.loc[1, 1]
        ftype = df.loc[2, 1]
        addr = df.loc[3, 1]
        cit_st = df.loc[4, 1]
        pho = df.loc[5, 1]
        pdate = df.loc[6, 1]
        #these fields are looked for at either of two row positions, because the
        #'Accreditation Organization:' row is only present for some facilities
        aorg = _value_at_rows(df, 'Accreditation Organization:', (8,))
        atype = _value_at_rows(df, 'Accreditation Type:', (8, 9))
        beds = _value_at_rows(df, 'Number of Certified Beds:', (9, 10))
        otype = _value_at_rows(df, 'Ownership Type:', (10, 11))
        fsubtype = _value_at_rows(df, 'Subtype:', (11, 12))

        #append lists
        facility_name.append(name)
        cert_num.append(cnum)
        facility_type.append(ftype)
        address.append(addr)
        city_state.append(cit_st)
        phone.append(pho)
        participation_date.append(pdate)
        accreditation_org.append(aorg)
        accreditation_type.append(atype)
        number_beds.append(beds)
        ownership_type.append(otype)
        facility_subtype.append(fsubtype)
finally:
    #quit() ends the whole WebDriver session, and runs even when a page fails to parse,
    #so a failed run does not leave a browser behind
    driver.quit()

#create dictionary
cah_dict = {
    "facility_name" : facility_name,
    "cert_num" : cert_num,
    "facility_type" : facility_type,
    "facility_subtype" : facility_subtype,
    "address" : address,
    "city_state" : city_state,
    "phone" : phone,
    "participation_date" : participation_date,
    "ownership_type" : ownership_type,
    "accreditation_organization" : accreditation_org,
    "accreditation_type" : accreditation_type,
    "number_of_beds" : number_beds
}

#turn dictionary into a dataframe
cah_qcor_df = pd.DataFrame(cah_dict)

In [4]:
#display the facility details DataFrame assembled from the scraped lists
cah_qcor_df

Unnamed: 0,facility_name,cert_num,facility_type,facility_subtype,address,city_state,phone,participation_date,ownership_type,accreditation_organization,accreditation_type,number_of_beds
0,ASCENSION SAINT THOMAS THREE RIVERS,441303,Hospital,Critical Access Hospitals,451 HIGHWAY 13 SOUTH,"WAVERLY, TN 37185",931 296-4203,11/01/2000,Non-Profit,,Non-Accredited,25
1,BIG SOUTH FORK MEDICAL CENTER,441323,Hospital,Critical Access Hospitals,18797 ALBERTA STREET,"ONEIDA, TN 37841",423 286-5300,06/30/2021,For Profit,,Non-Accredited,25
2,ERLANGER BLEDSOE HOSPITAL,441306,Hospital,,71 WHEELERTOWN AVENUE,"PIKEVILLE, TN 37367",423 447-2112,11/29/2001,Government,DNV HEALTHCARE USA INC,Deemed Status,25
3,HANCOCK COUNTY HOSPITAL,441313,Hospital,Critical Access Hospitals,1519 MAIN STREET HWY 33,"SNEEDVILLE, TN 37869",423 733-5001,06/06/2005,Non-Profit,,Non-Accredited,10
4,HOUSTON COUNTY COMMUNITY HOSPITAL,441322,Hospital,,5001 EAST MAIN STREET,"ERIN, TN 37061",931 289-4211,04/20/2018,Government,JOINT COMMISSION,Deemed Status,25
5,JOHNSON COUNTY COMMUNITY HOSPITAL,441304,Hospital,Critical Access Hospitals,1901 S SHADY ST,"MOUNTAIN CITY, TN 37683",423 727-1110,09/13/2001,Non-Profit,,Non-Accredited,2
6,LAUDERDALE COMMUNITY HOSPITAL,441314,Hospital,Critical Access Hospitals,326 ASBURY AVENUE,"RIPLEY, TN 38063",731 221-2200,10/01/2005,For Profit,,Non-Accredited,25
7,MACON COMMUNITY HOSPITAL,441305,Hospital,,204 MEDICAL DRIVE,"LAFAYETTE, TN 37083",615 666-2147,07/01/2001,Non-Profit,DNV HEALTHCARE USA INC,Deemed Status,25
8,MARSHALL MEDICAL CENTER,441309,Hospital,,1080 NORTH ELLINGTON PARKWAY,"LEWISBURG, TN 37091",931 359-6276,01/01/2005,Government,JOINT COMMISSION,Deemed Status,25
9,RHEA MEDICAL CENTER,441310,Hospital,,9400 RHEA COUNTY HIGHWAY,"DAYTON, TN 37321",423 775-1121,01/10/2005,Government,JOINT COMMISSION,Deemed Status,25


In [5]:
import pickle

#write the facility details DataFrame to disk as a pickle file
pickle_path = '../capstone_data/cah_qcor_info.pickle'
with open(pickle_path, mode='wb') as pickle_file:
    pickle.dump(cah_qcor_df, pickle_file)

In [6]:
#Alternative approach: look up each field in the facility details table by its row label instead of by a fixed row position

In [27]:
#NOTE: relies on `df` left over from the last iteration of the scraping loop,
#and rebinds `address` (previously a list) to a single value
address = df.loc[df[0].isna(), 1].iloc[0]

In [29]:
#show the value taken from the first row whose label cell is missing
address

'CAMDEN, TN 38320'

In [None]:
#Visit the detail page of every provider number in prov_nums and collect the
#facility fields into cah_qcor_df, locating each field by its row label.

def _field_by_label(details, label, default=None):
    """Return the column-1 value of the first row whose column-0 label equals ``label``.

    details -- DataFrame whose column 0 holds field labels and column 1 the values
    label   -- exact label text to look for in column 0
    default -- value returned when no row carries the label

    Returns a single cell value (not a Series), so it can be appended straight
    to a list that later becomes a DataFrame column.
    """
    matches = details.loc[details[0] == label, 1]
    if matches.empty:
        return default
    return matches.iloc[0]

#Create lists for desired facility information
facility_name = []
cert_num = []
facility_type = []
address = []
city_state = []
phone = []
participation_date = []
accreditation_org = []
accreditation_type = []
number_beds = []
ownership_type = []
facility_subtype = []

#initiate Selenium and accept Terms of Use
driver = webdriver.Chrome()

try:
    try:
        driver.get("https://qcor.cms.gov/")
        ActionChains(driver)\
            .send_keys(Keys.RETURN)\
            .perform()
    except Exception:
        #best-effort step: errors are still ignored, but KeyboardInterrupt/SystemExit
        #are no longer swallowed the way a bare except would
        pass

    checkbox = driver.find_element(by=By.NAME, value="chkCertiCon")
    checkbox.click()

    button = driver.find_element(by=By.NAME, value="submit")
    button.click()

    #access hospital facility certification numbers
    for prov_id in prov_nums['prov_num']:
        #access page with each facility's details (year_value is pinned to 2022)
        driver.get(f"https://qcor.cms.gov/active_popup.jsp?prvdr_intrnl_num={prov_id}&year_value=2022&jump=1&provider=01&which=3")

        #create soup; the parser is named explicitly so the result does not depend on
        #which optional parsers happen to be installed
        content = driver.page_source
        soup = BS(content, "html.parser")

        #pull out table that contains facility details, convert to df for easier access to contents
        table = soup.find_all("table")[1]
        #wrap in StringIO: passing literal HTML text to read_html is deprecated
        df = pd.read_html(StringIO(str(table)))[0]

        #pull out table components of interest by label; each is a single value, or None if the label is absent
        name = _field_by_label(df, 'Provider or Supplier Name:')
        cnum = _field_by_label(df, 'CMS Certification Number:')
        ftype = _field_by_label(df, 'Provider or Supplier Type:')
        addr = _field_by_label(df, 'Address:')
        #city/state is taken from the first row whose label cell is missing;
        #guard against tables that have no such row instead of raising IndexError
        unlabeled_values = df.loc[df[0].isnull(), 1]
        cit_st = unlabeled_values.iloc[0] if not unlabeled_values.empty else None
        pho = _field_by_label(df, 'Phone Number:')
        pdate = _field_by_label(df, 'Participation Date:')
        #the remaining fields are stored as strings, with the string 'None' when the label is absent
        aorg = str(_field_by_label(df, 'Accreditation Organization:', 'None'))
        atype = str(_field_by_label(df, 'Accreditation Type:', 'None'))
        beds = str(_field_by_label(df, 'Number of Certified Beds:', 'None'))
        otype = str(_field_by_label(df, 'Ownership Type:', 'None'))
        fsubtype = str(_field_by_label(df, 'Subtype:', 'None'))

        #append lists
        facility_name.append(name)
        cert_num.append(cnum)
        facility_type.append(ftype)
        address.append(addr)
        city_state.append(cit_st)
        phone.append(pho)
        participation_date.append(pdate)
        accreditation_org.append(aorg)
        accreditation_type.append(atype)
        number_beds.append(beds)
        ownership_type.append(otype)
        facility_subtype.append(fsubtype)
finally:
    #quit() ends the whole WebDriver session, and runs even when a page fails to parse,
    #so a failed run does not leave a browser behind
    driver.quit()

#create dictionary
cah_dict = {
    "facility_name" : facility_name,
    "cert_num" : cert_num,
    "facility_type" : facility_type,
    "facility_subtype" : facility_subtype,
    "address" : address,
    "city_state" : city_state,
    "phone" : phone,
    "participation_date" : participation_date,
    "ownership_type" : ownership_type,
    "accreditation_organization" : accreditation_org,
    "accreditation_type" : accreditation_type,
    "number_of_beds" : number_beds
}

#turn dictionary into a dataframe
cah_qcor_df = pd.DataFrame(cah_dict)