In [1]:
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as BS
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.select import Select
import time

In [2]:
#Scrape the QCOR "RHC Active Provider" report for Tennessee and collect the
#six-digit facility certification numbers into the `prov_nums` dataframe
#(single column: 'prov_num').
QCOR_URL = "https://qcor.cms.gov/"
PAGE_LOAD_DELAY = 2  #seconds to pause after each click so the next page can render

#initiate Selenium and accept Terms of Use
driver = webdriver.Chrome()

try:
    #safety net on top of the fixed pauses: element lookups poll for up to
    #10 seconds instead of failing immediately when a page is slow
    driver.implicitly_wait(10)

    try:
        driver.get(QCOR_URL)
        #presumably dismisses a prompt shown when the site opens - TODO confirm
        ActionChains(driver)\
            .send_keys(Keys.RETURN)\
            .perform()
    except Exception as err:
        #best effort, as before, but no longer silent and no longer catching
        #KeyboardInterrupt/SystemExit
        print(f"Ignoring error while opening {QCOR_URL}: {err!r}")

    checkbox = driver.find_element(by=By.NAME, value="chkCertiCon")
    checkbox.click()

    button = driver.find_element(by=By.NAME, value="submit")
    button.click()

    #move through pages to the RHC Active Provider query
    time.sleep(PAGE_LOAD_DELAY)

    rhc_link = driver.find_element(by=By.CSS_SELECTOR, value='[href="report_select.jsp?which=12"]')
    rhc_link.click()

    time.sleep(PAGE_LOAD_DELAY)

    apsc_rhc = driver.find_element(by=By.CSS_SELECTOR, value='[href="RHC_wizard.jsp?which=12&report=active_nh.jsp"]')
    apsc_rhc.click()

    time.sleep(PAGE_LOAD_DELAY)

    #on the query page, select current calendar, as opposed to fiscal, year and run query
    year_box = driver.find_element(by=By.NAME, value="year_type")
    select_box = Select(year_box)
    select_box.select_by_value("CY")

    run_report = driver.find_element(by=By.CSS_SELECTOR, value='[onclick="javascript: doSubmit();"]')
    run_report.click()

    time.sleep(PAGE_LOAD_DELAY)

    #drill down to TN facilities
    atl_link = driver.find_element(by=By.CSS_SELECTOR, value='[name="R04"]')
    atl_link.click()

    time.sleep(PAGE_LOAD_DELAY)

    tn_link = driver.find_element(by=By.CSS_SELECTOR, value='[name="STN"]')
    tn_link.click()

    time.sleep(PAGE_LOAD_DELAY)

    #grab the rendered page before the browser is shut down
    content = driver.page_source
finally:
    #always end the browser session, even when a lookup above fails
    driver.quit()

#create soup
#NOTE(review): no parser is named, so bs4 picks the best one installed and emits
#a warning; pin one (e.g. "html.parser") once the output is confirmed unchanged
soup = BS(content)

#save facility certification numbers to a dataframe
drill = soup.find_all("a", attrs={"class" : "drill"})
if not drill:
    #without this check the column lookup below fails with an opaque KeyError
    raise RuntimeError("No 'drill' links found on the results page; the site navigation probably failed.")

href = [x.get("href") for x in drill]
href_df = pd.DataFrame(href)
#raw string so \d is a regex digit class rather than an invalid string escape
prov_nums = href_df[0].str.extract(r'(\d{6})')
prov_nums = prov_nums.rename(columns = {0 : 'prov_num'}).dropna()

In [3]:
#Visit the QCOR detail page of every certification number in `prov_nums` and
#assemble the facility details into the `rhc_qcor_df` dataframe (one row per facility).
QCOR_URL = "https://qcor.cms.gov/"
REPORT_YEAR = 2022  #year_value sent with every facility detail request

#output columns, in the order they appear in rhc_qcor_df
FACILITY_COLUMNS = [
    "facility_name",
    "cert_num",
    "facility_type",
    "address",
    "city_state",
    "phone",
    "participation_date",
    "ownership_type",
]

#one dict per facility; turned into a dataframe in a single step at the end
facility_records = []

#initiate Selenium and accept Terms of Use
driver = webdriver.Chrome()

try:
    #element lookups poll for up to 10 seconds instead of failing immediately
    #when the Terms of Use page is slow to render
    driver.implicitly_wait(10)

    try:
        driver.get(QCOR_URL)
        #presumably dismisses a prompt shown when the site opens - TODO confirm
        ActionChains(driver)\
            .send_keys(Keys.RETURN)\
            .perform()
    except Exception as err:
        #best effort, as before, but no longer silent and no longer catching
        #KeyboardInterrupt/SystemExit
        print(f"Ignoring error while opening {QCOR_URL}: {err!r}")

    checkbox = driver.find_element(by=By.NAME, value="chkCertiCon")
    checkbox.click()

    button = driver.find_element(by=By.NAME, value="submit")
    button.click()

    #access RHC facility certification numbers
    for prov_id in prov_nums['prov_num']:
        #access page with each facility's details
        driver.get(f"https://qcor.cms.gov/active_popup.jsp?prvdr_intrnl_num={prov_id}&year_value={REPORT_YEAR}&jump=1&provider=12&which=12")

        #create soup
        #NOTE(review): no parser is named, so bs4 picks the best one installed; the
        #row positions below were presumably validated against that parser - confirm
        #before pinning one
        content = driver.page_source
        soup = BS(content)

        #pull out facility details; values are read by table-row position
        tr = soup.find_all("tr")
        try:
            record = {
                "facility_name" : tr[1].find("td").text.strip(),
                "cert_num" : tr[2].find("td").text.strip(),
                "facility_type" : tr[3].find("td").text.strip(),
                "address" : tr[4].find("td").text.strip(),
                #city/state is taken from the whole row rather than its first <td>
                "city_state" : tr[5].text.strip(),
                "phone" : tr[6].find("td").text.strip(),
                "participation_date" : tr[7].find("td").text.strip(),
                "ownership_type" : tr[9].find("td").text.strip(),
            }
        except (IndexError, AttributeError) as err:
            #still fails, as before, but now names the page that did not match
            raise RuntimeError(f"Unexpected detail page layout for provider {prov_id}") from err

        facility_records.append(record)
finally:
    #always end the browser session, even when a page fails to parse
    driver.quit()

#turn the collected records into a dataframe
rhc_qcor_df = pd.DataFrame(facility_records, columns=FACILITY_COLUMNS)

In [4]:
#display the scraped facility dataframe as the cell output
rhc_qcor_df

Unnamed: 0,facility_name,cert_num,facility_type,address,city_state,phone,participation_date,ownership_type
0,"ACCESS MEDICAL CARE OF MONROE COUNTY, PC",448968,Rural Health Clinic,4233 HIGHWAY 411 N,"MADISONVILLE, TN 37354",423 253-4707,09/26/2016,For Profit Corporation
1,ACCESS MEDICAL CLINIC TENNESSEE LLC,883841,Rural Health Clinic,204 GLADES RD,"GATLINBURG, TN 37738",865 436-2811,04/26/2021,For Profit Corporation
2,"ACCESS MEDICAL CLINIC TENNESSEE, LLC",883834,Rural Health Clinic,13 BOB TOLLETT LOOP,"CROSSVILLE, TN 38555",931 456-6057,02/02/2021,For Profit Partnership
3,"ADVANCED FAMILY MEDICAL CENTER, PLLC",883807,Rural Health Clinic,101 N MAIN STREET,"MIDDLETON, TN 38052",731 472-2147,11/12/2019,For Profit Corporation
4,"AGAPE FAMILY HEALTH, LLC",883855,Rural Health Clinic,915 SOUTH GARDEN STREET,"COLUMBIA, TN 38401",931 548-8090,12/07/2021,For Profit Partnership
...,...,...,...,...,...,...,...,...
238,WEST TENNESSEE MEDICAL GROUP PRIMARY CARE MARTIN,443452,Rural Health Clinic,143 KENNEDY DRIVE,"MARTIN, TN 38237",731 587-5321,08/14/2017,For Profit Corporation
239,WOMEN'S CENTER OF EAST TENNESSEE,448970,Rural Health Clinic,"135 N MEADOWS DRIVE, SUITE B","ATHENS, TN 37303",423 507-8067,10/26/2016,For Profit Corporation
240,"WOMEN'S WELLNESS & MATERNITY CENTER, INC",443947,Rural Health Clinic,3459 HWY 68,"MADISONVILLE, TN 37354",423 442-6624,02/28/2005,Non-Profit Corporation
241,WOODLAND WELLNESS INC,883830,Rural Health Clinic,115 NORTH WOODLAND STREET,"MANCHESTER, TN 37335",931 954-5219,10/13/2020,For Profit Corporation


In [5]:
import pickle

#serialize the scraped facility dataframe to disk
pickle_path = '../capstone_data/rhc_qcor_info.pickle'
with open(pickle_path, 'wb') as pickle_file:
    pickle.dump(rhc_qcor_df, pickle_file)