In [1]:
import pandas as pd
import numpy as np
import requests
import lxml.html as lh
import re
from bs4 import BeautifulSoup

In [2]:
url = 'https://www.michigan.gov/coronavirus/0,9753,7-406-98163_98173-526911--,00.html'
#url = 'https://web.archive.org/web/20200720171507/https://www.michigan.gov/coronavirus/0,9753,7-406-98163_98173-526911--,00.html'
# TODO - Wayback machine adds two extra tables at top we need to skip past
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')

In [3]:
tables = soup.find_all('table')
#[len(a) for a in a[0].find_all('tr')]

In [4]:
tmp = re.split('/| ', tables[0].find_all('caption')[0].get_text())
reporting_date = '-'.join(tmp[0:3])
print(reporting_date)

11-09-2020


In [59]:
# Create list of facilities
rows = tables[2].find_all('tr')
# Get List of fields from Header Row
fieldList = []
fields = rows[0].find_all('th')
for field in fields:
    fieldList.append(field.get_text())
assert fieldList == ['FACILITY NAME', 'COUNTY','TYPE', 'NEW RESIDENT CASES', 'TOTAL RESIDENT CASES', 'NEW RESIDENT DEATHS', 'TOTAL RESIDENT DEATHS',
                     'NEW STAFF CASES', 'TOTAL STAFF CASES', 'NEW STAFF DEATHS', 'TOTAL STAFF DEATHS']
num_facilities = 0
facilities = []
first_row = True
for row in rows:
    if first_row:
        first_row = False
        continue
    facility = {}
    tds = row.find_all('td')
    num_reported_vals = 0
    facility['ReportingDate'] = reporting_date
    for i in range(len(tds)):
        val = tds[i].get_text()
        if val == '--':
            val = float('Nan')
        else:
            num_reported_vals = num_reported_vals +1
        facility[fieldList[i]] = val
    if num_reported_vals == 3:
        facility['Reporting'] = "No"
    else:
        facility['Reporting'] = "Yes"    
    facilities.append(facility)
print(len(facilities))
facilities[0]

1347


{'ReportingDate': '11-09-2020',
 'FACILITY NAME': 'Jamieson Nursing Home',
 'COUNTY': 'ALCONA',
 'TYPE': 'SNF',
 'NEW RESIDENT CASES': '0',
 'TOTAL RESIDENT CASES': '0',
 'NEW RESIDENT DEATHS': '0',
 'TOTAL RESIDENT DEATHS': '0',
 'NEW STAFF CASES': '0',
 'TOTAL STAFF CASES': '0',
 'NEW STAFF DEATHS': '0',
 'TOTAL STAFF DEATHS': '0',
 'Reporting': 'Yes'}

In [49]:
facilities_df = pd.DataFrame(facilities, dtype=np.int)
facilities_df = facilities_df.astype({'ReportingDate': str, 'FACILITY NAME': str, 'COUNTY': str, 'TYPE': str,
                                      'NEW RESIDENT CASES': float, 'TOTAL RESIDENT CASES': float, 'NEW RESIDENT DEATHS': float,
                      'TOTAL RESIDENT DEATHS': float, 'NEW STAFF CASES': float, 'TOTAL STAFF CASES': float,
                      'NEW STAFF DEATHS': float, 'TOTAL STAFF DEATHS': float})

In [46]:
facilities_df.dtypes

ReportingDate             object
FACILITY NAME             object
COUNTY                    object
TYPE                      object
NEW RESIDENT CASES       float64
TOTAL RESIDENT CASES     float64
NEW RESIDENT DEATHS      float64
TOTAL RESIDENT DEATHS    float64
NEW STAFF CASES          float64
TOTAL STAFF CASES        float64
NEW STAFF DEATHS         float64
TOTAL STAFF DEATHS       float64
Reporting                 object
dtype: object

In [51]:
facilities_df.sort_values(by=['NEW RESIDENT CASES'], ascending=False).head(10)

Unnamed: 0,ReportingDate,FACILITY NAME,COUNTY,TYPE,NEW RESIDENT CASES,TOTAL RESIDENT CASES,NEW RESIDENT DEATHS,TOTAL RESIDENT DEATHS,NEW STAFF CASES,TOTAL STAFF CASES,NEW STAFF DEATHS,TOTAL STAFF DEATHS,Reporting
215,11-09-2020,The Villa at the Bay,EMMET,SNF,57.0,58.0,4.0,4.0,32.0,39.0,0.0,0.0,Yes
569,11-09-2020,Valley View Care Center,KENT,SNF,44.0,77.0,2.0,2.0,27.0,36.0,0.0,0.0,Yes
151,11-09-2020,Medilodge Of Cheboygan,CHEBOYGAN,SNF,34.0,42.0,2.0,2.0,11.0,17.0,0.0,0.0,Yes
989,11-09-2020,Aspirus Ontonagon Hospital LTC,ONTONAGON,SNF,29.0,29.0,0.0,0.0,5.0,14.0,0.0,0.0,Yes
434,11-09-2020,Alamo Nursing Home,KALAMAZOO,SNF,25.0,84.0,8.0,9.0,14.0,43.0,0.0,0.0,Yes
1186,11-09-2020,Regency at Chene,WAYNE,SNF,24.0,24.0,0.0,14.0,0.0,22.0,0.0,0.0,Yes
822,11-09-2020,Sanctuary at McAuley,MUSKEGON,SNF,15.0,48.0,0.0,3.0,9.0,33.0,0.0,0.0,Yes
1126,11-09-2020,Thurston Woods Village,SAINT JOSEPH,SNF,14.0,19.0,0.0,0.0,9.0,25.0,0.0,0.0,Yes
419,11-09-2020,Faith Haven Senior Care Centre,JACKSON,SNF,14.0,21.0,0.0,0.0,4.0,8.0,0.0,0.0,Yes
266,11-09-2020,Regency at Grand Blanc,GENESEE,SNF,13.0,78.0,0.0,13.0,6.0,52.0,1.0,1.0,Yes


In [52]:
facilities_df.dtypes
facilities_df.convert_dtypes().dtypes

ReportingDate            string
FACILITY NAME            string
COUNTY                   string
TYPE                     string
NEW RESIDENT CASES        Int64
TOTAL RESIDENT CASES      Int64
NEW RESIDENT DEATHS       Int64
TOTAL RESIDENT DEATHS     Int64
NEW STAFF CASES           Int64
TOTAL STAFF CASES         Int64
NEW STAFF DEATHS          Int64
TOTAL STAFF DEATHS        Int64
Reporting                string
dtype: object

In [53]:
facilities_df.sort_values(by=['Reporting', 'NEW RESIDENT CASES'], ascending=False).to_csv('Reporting_data/MI_' + reporting_date + 'LTC_data.csv', index=False)

In [54]:
facilites_reporting_cnt = len(facilities_df[facilities_df['Reporting'] == 'Yes'])
facilites_reporting_cnt

1347

In [55]:
facilities_reporting_df = facilities_df[facilities_df['Reporting'] == 'Yes']
facilities_NOT_reporting_df = facilities_df[facilities_df['Reporting'] == 'No']

In [62]:
print(facilities_reporting_df['TYPE'].value_counts())
print(facilities_reporting_df.sum())

AFC    604
SNF    440
HFA    303
Name: TYPE, dtype: int64
ReportingDate            11-09-202011-09-202011-09-202011-09-202011-09-...
FACILITY NAME            Jamieson Nursing HomeLincoln Haven Nursing & R...
COUNTY                   ALCONAALCONAALGERALLEGANALLEGANALLEGANALLEGANA...
TYPE                     SNFSNFSNFSNFAFCHFAAFCSNFAFCAFCSNFAFCAFCHFAAFCA...
NEW RESIDENT CASES                                                     599
TOTAL RESIDENT CASES                                                 11384
NEW RESIDENT DEATHS                                                     83
TOTAL RESIDENT DEATHS                                                 2809
NEW STAFF CASES                                                        672
TOTAL STAFF CASES                                                     7507
NEW STAFF DEATHS                                                         2
TOTAL STAFF DEATHS                                                      35
Reporting                YesYesYesYesYesYe

In [57]:
facilities_NOT_reporting_df['TYPE'].value_counts()

Series([], Name: TYPE, dtype: int64)

In [58]:
facilities_df.head(10)

Unnamed: 0,ReportingDate,FACILITY NAME,COUNTY,TYPE,NEW RESIDENT CASES,TOTAL RESIDENT CASES,NEW RESIDENT DEATHS,TOTAL RESIDENT DEATHS,NEW STAFF CASES,TOTAL STAFF CASES,NEW STAFF DEATHS,TOTAL STAFF DEATHS,Reporting
0,11-09-2020,Jamieson Nursing Home,ALCONA,SNF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Yes
1,11-09-2020,Lincoln Haven Nursing & Rehab. Community,ALCONA,SNF,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,Yes
2,11-09-2020,Medilodge of Munising,ALGER,SNF,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Yes
3,11-09-2020,Allegan County Medical Care Community,ALLEGAN,SNF,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,Yes
4,11-09-2020,Beacon Home at The Oaks,ALLEGAN,AFC,,0.0,,0.0,,0.0,,0.0,Yes
5,11-09-2020,Briarwood Assisted Living,ALLEGAN,HFA,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,Yes
6,11-09-2020,Country Liv - Inn Inc,ALLEGAN,AFC,,0.0,,0.0,,0.0,,0.0,Yes
7,11-09-2020,Ely Manor Nursing & Rehab,ALLEGAN,SNF,1.0,1.0,0.0,0.0,2.0,3.0,0.0,0.0,Yes
8,11-09-2020,Golden Orchards I,ALLEGAN,AFC,,0.0,,0.0,,0.0,,0.0,Yes
9,11-09-2020,Golden Orchards II,ALLEGAN,AFC,,0.0,,0.0,,0.0,,0.0,Yes
