In [1]:
import glob
import json
import pandas as pd
import numpy as np

import urllib.request as urllib2

from IPython.display import JSON

# Functions

In [2]:
def facility2CMSNum (facilityName):
    if facilityName in ltc_name2cms_id:
        return ltc_name2cms_id[facilityName]
    else:
        return "No Match"

#  To be refactored
def get_matching_cms_date(date2Match):
    dates = list(state_cms_data['Week Ending'].unique())
    dates.sort()
    for date in dates:
        matchDate = date
        if np.datetime64(date2Match) <= date :
           break
    #     matchDate = date
    return matchDate

def add_cms_data_to_df(df, cms_df, df_key, cms_key):
    ''' 
        Steo 1 - Add CMS Provider Number based on Facility Name field
        Step 2 - Merge with CMS data
    '''
    # Specify columns from CMS data feed to add
    cols=['Week Ending', 'Federal Provider Number', 'Provider Name',
           'Provider Address', 'Provider City', 'Provider State', 'County', 
           'Provider Zip Code', 'Submitted Data',
           'Residents Weekly Admissions COVID-19',
           'Residents Total Admissions COVID-19',
           'Residents Weekly Confirmed COVID-19',
           'Residents Total Confirmed COVID-19',
           'Residents Weekly Suspected COVID-19',
           'Residents Total Suspected COVID-19', 
           'Residents Weekly All Deaths',
           'Residents Total All Deaths', 
           'Residents Weekly COVID-19 Deaths',
           'Residents Total COVID-19 Deaths', 'Number of All Beds',
           'Total Number of Occupied Beds',
           'Staff Weekly Confirmed COVID-19', 'Staff Total Confirmed COVID-19',
           'Staff Weekly Suspected COVID-19', 'Staff Total Suspected COVID-19',
           'Staff Weekly COVID-19 Deaths', 'Staff Total COVID-19 Deaths',
           'Shortage of Nursing Staff', 'Shortage of Clinical Staff',
           'Shortage of Aides', 'Shortage of Other Staff',
           'Weekly Resident Confirmed COVID-19 Cases Per 1,000 Residents',
           'Weekly Resident COVID-19 Deaths Per 1,000 Residents',
           'Total Resident Confirmed COVID-19 Cases Per 1,000 Residents',
           'Total Resident COVID-19 Deaths Per 1,000 Residents',
           'Total Residents COVID-19 Deaths as a Percentage of Confirmed COVID-19 Cases',
           'Three or More Confirmed COVID-19 Cases This Week',
           'Initial Confirmed COVID-19 Case This Week', 'Geolocation',
           'Resident Access to Testing in Facility',
           'Able to Test or Obtain Resources to Test All Current Residents Within Next 7 Days',
           'During Past Two Weeks Average Time to Receive Resident Test Results',
           'Has Facility Performed Resident Tests Since Last Report',
           'Tested Residents with New Signs or Symptoms',
           'Tested Asymptomatic Residents in a Unit or Section After a New Case',
           'Tested Asymptomatic Residents Facility-Wide After a New Case',
           'Tested Asymptomatic Residents Without Known Exposure as Surveillance',
           'Tested Another Subgroup of Residents',
           'Able to Test or Obtain Resources to Test All Staff and/or Personnel Within Next 7 Days',
           'During Past Two Weeks Average Time to Receive Staff and/or Personnel Test Results',
           'Has Facility Performed Staff and/or Personnel Tests Since Last Report',
           'Tested Staff and/or Personnel with New Signs or Symptoms',
           'Tested Asymptomatic Staff and/or Personnel in a Unit or Section After a New Case',
           'Tested Asymptomatic Staff and/or Personnel Facility-Wide After a New Case',
           'Tested Asymptomatic Staff and/or Personnel Without Known Exposure as Surveillance',
           'Tested Another Subgroup of Staff and/or Personnel',
           'In-House Point-of-Care Test Machine',
           'COVID-19 Point-of-Care Tests Performed on Residents Since Last Report',
           'COVID-19 Point-of-Care Tests Performed on Staff and/or Personnel Since Last Report',
           'Enough Supplies to Test All Staff and/or Personnel Using Point-of-Care Test Machine',
           'Any Current Supply of N95 Masks', 'One-Week Supply of N95 Masks',
           'Any Current Supply of Surgical Masks',
           'One-Week Supply of Surgical Masks',
           'Any Current Supply of Eye Protection',
           'One-Week Supply of Eye Protection', 'Any Current Supply of Gowns',
           'One-Week Supply of Gowns', 'Any Current Supply of Gloves',
           'One-Week Supply of Gloves', 'Any Current Supply of Hand Sanitizer',
           'One-Week Supply of Hand Sanitizer', 'Ventilator Dependent Unit',
           'Number of Ventilators in Facility',
           'Number of Ventilators in Use for COVID-19',
           'Any Current Supply of Ventilator Supplies',
           'One-Week Supply of Ventilator Supplies',
           'Geolocation'
           ]
    df_w_cms = pd.merge(df, cms_df[cols], left_on=df_key, right_on=cms_key, how='left')
    return df_w_cms

In [3]:
def outbreak_df_from_file(filename):
    """ From Json file:
        1) return DataFrame augmented and save to file
        2) return Summary data"""
    with open(filename) as f:
      ltc_data = json.load(f)

    # Extract Reporting Data
    reporting_date = '%d-%02d-%02d' %(ltc_data['LastUpdateDate']['year'], ltc_data['LastUpdateDate']['month'], ltc_data['LastUpdateDate']['day'])
    df = pd.DataFrame(ltc_data['FacilityValues'])
    df.insert(0, 'reporting_date', reporting_date)
    df['CFR'] = (df['deaths'] / df['confirmed_cases'])
    df['outbreaks'] = 1 # to allow counting # of outbreaks by Facility
    #Save Outbreak data to a file
    outbreak_file = 'Reporting_data/IL_' + reporting_date + '_Outbreaks_LTC_data_v2.csv'
    df.to_csv(outbreak_file, index = False)
    
    # Get summary data from feed - Note this may not match totals - ST-TODO: Check if summary data and totals from raw data match
    deaths = ltc_data['LTC_Reported_Cases']['deaths']
    confirmed_cases = ltc_data['LTC_Reported_Cases']['confirmed_cases']
    facility_cnt = len(df.groupby(['County', 'FacilityName']).size().reset_index().rename(columns={0:'count'}).sort_values(by='count', ascending=False))
    summary = {}
    summary['Date'] = reporting_date
    summary['Cases'] = confirmed_cases
    summary['Deaths'] = deaths
    summary['Outbreaks'] = df.reporting_date.value_counts()[0]
    summary['Facilities'] = facility_cnt
    
    return df, summary, reporting_date

In [4]:
def process_json_IL (filename, display_dfs=False, display_summary=True):
    """Process a JSON file to:
       1) Produce Summary Info
       2) Produce Outbreak file and dataframe
       3) Produce Facility file and dataframe
       4) Produce County file and dataframe
        
       TODO - make display dataframes optional
       TODO - make display summary info optional"""
    [outbreak_df, summary, reporting_date] = outbreak_df_from_file(filename)

    # Print Summary Data
    if display_summary:
        for k,v in summary.items():
            print(k + ": " + str(v))    

    # Save and Display Facility data
    df_facilities = outbreak_df.groupby(['County', 'FacilityName']).sum()
    df_facilities['CFR'] = df_facilities['deaths'] / df_facilities['confirmed_cases']
    df_facilities['facilities'] = 1
    df_facilities.insert(0, 'ReportingDate', reporting_date)
    df_facilities.sort_values(by='confirmed_cases', ascending=False).to_csv('Reporting_data/IL_' + reporting_date + '_Facilities_LTC_data_v2.csv')

    # Save and Display County Level Data
    df_county = df_facilities.groupby(by=['County']).sum()
    df_county['CFR'] = (df_county['deaths'] / df_county['confirmed_cases'])
    df_county.insert(0, 'ReportingDate', reporting_date)
    filename = 'Reporting_data/IL_' + reporting_date + '_County_LTC_stats_v2.csv'
    df_county.sort_values('confirmed_cases', ascending=False).to_csv('Reporting_data/IL_' + reporting_date + '_County_LTC_stats_v2.csv')
    
    
    if display_dfs:
        print("\nOutbreak Data\n=============")
        display(outbreak_df.sort_values(by='deaths', ascending=False).head(5))
        print("\nFacility Data\n=============")
        display(df_facilities.sort_values('deaths', ascending=False).head(10))
        print("\nCounty Data\n===========")
        display(df_county.sort_values(by='confirmed_cases', ascending=False).head(10))

    return reporting_date, summary, outbreak_df, df_facilities, df_county

In [5]:
def facility2CMSNum (facilityName):
    if facilityName in ltc_name2cms_id:
        return ltc_name2cms_id[facilityName]
    else:
        return "No Match"

In [6]:
# 1 - Get a list of all IL json source files - DONE
# 2 - For Each JSON Source file:
#     2.1 - Turn it into a DataFrame - DONE
#     2.2 - Match to CMS Federal Provider Number- DONE
# 3 - Merge with CMS data associated, as closely as possible with that date

# NEXT ACTIONS:
# - Alllow for Renaming of fields (ie: State - Raw, State - Calc, CMS Raw, CMS Calc, ...)

# Prep 1 - Load Supporting Files (CMS et al)

In [7]:
# Load Facility Name to CMS ID json file
fac2CMS_file = 'IL_FacilityName_to_CMS_ID.json'
with open(fac2CMS_file) as f:
  ltc_name2cms_id = json.load(f) 

In [8]:
# Load CMS Dataset from CMS website
url_csv = 'https://data.cms.gov/api/views/s2uc-8wxp/rows.csv?accessType=DOWNLOAD&api_foundry=true'

response = urllib2.urlopen(url_csv)
cms_data = pd.read_csv(response, parse_dates=['Week Ending'], dtype={'Provider Name': str})
max_date = cms_data['Week Ending'].max()
reporting_date = str(max_date)[0:10]
print(reporting_date)
display(cms_data.head(5))
cms_data['Week Ending'] = pd.to_datetime(cms_data['Week Ending'])

  interactivity=interactivity, compiler=compiler, result=result)


2020-11-22


Unnamed: 0,Week Ending,Federal Provider Number,Provider Name,Provider Address,Provider City,Provider State,Provider Zip Code,Submitted Data,Passed Quality Assurance Check,Residents Weekly Admissions COVID-19,...,"Weekly Resident Confirmed COVID-19 Cases Per 1,000 Residents","Weekly Resident COVID-19 Deaths Per 1,000 Residents","Total Resident Confirmed COVID-19 Cases Per 1,000 Residents","Total Resident COVID-19 Deaths Per 1,000 Residents",Total Residents COVID-19 Deaths as a Percentage of Confirmed COVID-19 Cases,County,Three or More Confirmed COVID-19 Cases This Week,Initial Confirmed COVID-19 Case This Week,Geolocation,Reporting Interval
0,2020-05-24,105045,BRADEN RIVER REHABILITATION CENTER LLC,2010 MANATEE AVE E,BRADENTON,FL,34208,N,,,...,,,,,,Manatee,,,POINT (-82.539097 27.496201000000003),Week 1 - May 24
1,2020-05-24,105384,CALUSA HARBOUR,2525 FIRST ST,FORT MYERS,FL,33901,N,,,...,,,,,,Lee,,,POINT (-81.864238 26.647471),Week 1 - May 24
2,2020-05-24,105453,KENSINGTON GARDENS REHAB AND NURSING CENTER,2055 PALMETTO ST,CLEARWATER,FL,33758,N,,,...,,,,,,Pinellas,,,POINT (-82.751998 27.975286),Week 1 - May 24
3,2020-05-24,105460,NORTH FLORIDA REHABILITATION AND SPECIALTY CARE,6700 NW 10TH PLACE,GAINESVILLE,FL,32605,N,,,...,,,,,,Alachua,,,POINT (-82.413587 29.661501),Week 1 - May 24
4,2020-05-31,105478,ADVANCED CARE CENTER,401 FAIRWOOD AVE,CLEARWATER,FL,33759,N,,,...,,,,,,Pinellas,,,POINT (-82.724356 27.970223),Week 2 - May 31


In [9]:
# Get CMS data for a particular state
myState = 'IL'
state_cms_data = cms_data[cms_data['Provider State'] == myState]

### 1 - Get a list of all IL json source files
### 2 - For Each JSON Source file:
####     2.1 - Turn it into a DataFrame - DONE
####    2.2 - Match to CMS Federal Provider Number- DONE
####     2.1 - Merge with CMS data associated, as closely as possible with that date

In [10]:
# 1 - Get a list of all IL json source files
files = glob.glob('Source_data/IL_*LTC_data_Source.json')
facility_dfs = []
outbreak_dfs = []
# 2 - For Each JSON Source file:
#     2.1 - Turn it into a DataFrame - DONE
#    2.2 - Match to CMS Federal Provider Number- DONE
#     2.1 - Merge with CMS data associated, as closely as possible with that date
for file in files:
    [reporting_date, summary, outbreak_df, df_facilities, df_county] = process_json_IL(file, )

    df_facilities.reset_index(inplace=True) # Needed because used group by to get facility level data ToDo: COnsider moving this code up
    df_facilities['county-facName']= df_facilities['County'].str.upper() + '-' + df_facilities['FacilityName'].str.upper()
    df_facilities['CMS_ProvNum'] = df_facilities['county-facName'].apply(lambda x: facility2CMSNum(x))
    facility_dfs.append(df_facilities)

    outbreak_df['county-facName'] = outbreak_df['County'].str.upper() + '-' + outbreak_df['FacilityName'].str.upper()
    outbreak_df['CMS_ProvNum'] = outbreak_df['county-facName'].apply(lambda x: facility2CMSNum(x))
    outbreak_dfs.append(outbreak_df)

Date: 2020-06-05
Cases: 18837
Deaths: 3053
Outbreaks: 554
Facilities: 554
Date: 2020-10-02
Cases: 30243
Deaths: 4697
Outbreaks: 967
Facilities: 891
Date: 2020-10-09
Cases: 30920
Deaths: 4792
Outbreaks: 1008
Facilities: 920
Date: 2020-10-23
Cases: 33440
Deaths: 5019
Outbreaks: 1151
Facilities: 1015
Date: 2020-11-06
Cases: 36683
Deaths: 5253
Outbreaks: 1309
Facilities: 1116
Date: 2020-06-19
Cases: 21390
Deaths: 3649
Outbreaks: 593
Facilities: 592
Date: 2020-11-20
Cases: 43222
Deaths: 5780
Outbreaks: 1532
Facilities: 1276
Date: 2020-09-18
Cases: 28941
Deaths: 4575
Outbreaks: 889
Facilities: 829
Date: 2020-06-12
Cases: 20550
Deaths: 3433
Outbreaks: 580
Facilities: 580
Date: 2020-10-30
Cases: 34278
Deaths: 5127
Outbreaks: 1209
Facilities: 1052
Date: 2020-08-28
Cases: 27126
Deaths: 4396
Outbreaks: 795
Facilities: 758
Date: 2020-08-21
Cases: 26355
Deaths: 4319
Outbreaks: 766
Facilities: 737
Date: 2020-07-10
Cases: 23324
Deaths: 3895
Outbreaks: 630
Facilities: 628
Date: 2020-09-11
Cases: 28189

In [11]:
all_facilities_df = pd.concat(facility_dfs)
all_facilities_df['ReportingDate']= pd.to_datetime(all_facilities_df['ReportingDate'])
all_facilities_df['MatchingDate'] = all_facilities_df['ReportingDate'].apply(lambda x: get_matching_cms_date(x))

all_outbreaks_df = pd.concat(outbreak_dfs)
all_outbreaks_df['reporting_date']= pd.to_datetime(all_outbreaks_df['reporting_date'])
all_outbreaks_df['MatchingDate'] = all_outbreaks_df['reporting_date'].apply(lambda x: get_matching_cms_date(x))

In [12]:
all_outbreaks_df.head()

Unnamed: 0,reporting_date,County,FacilityName,confirmed_cases,deaths,status,CFR,outbreaks,county-facName,CMS_ProvNum,ReportDate,MatchingDate
0,2020-06-05,Boone,Park Place of Belvidere,5,0,Open,0.0,1,BOONE-PARK PLACE OF BELVIDERE,146071,,2020-06-07
1,2020-06-05,Boone,Symphony Northwoods,87,13,Open,0.149425,1,BOONE-SYMPHONY NORTHWOODS,No Match,,2020-06-07
2,2020-06-05,Champaign,Reflections Memory Care,8,0,Open,0.0,1,CHAMPAIGN-REFLECTIONS MEMORY CARE,No Match,,2020-06-07
3,2020-06-05,Champaign,University Rehab Center,5,0,Open,0.0,1,CHAMPAIGN-UNIVERSITY REHAB CENTER,145364,,2020-06-07
4,2020-06-05,Christian,Villas of Holly Brook,2,0,Open,0.0,1,CHRISTIAN-VILLAS OF HOLLY BROOK,No Match,,2020-06-07


In [15]:
all_facilities_df_match_list = []
for date in all_facilities_df['MatchingDate'].unique():
    state_cms_data_match = state_cms_data[state_cms_data['Week Ending'] == date]
    facilities_df_match = all_facilities_df[all_facilities_df['MatchingDate'] == date]
    facilities_df_match = add_cms_data_to_df(facilities_df_match, state_cms_data_match, 'CMS_ProvNum', 'Federal Provider Number')
    all_facilities_df_match_list.append(facilities_df_match)

In [20]:
all_facilities_df_match = pd.concat(all_facilities_df_match_list)

In [22]:
filename =  "Reporting_data/IL_" + reporting_date + "_Facilities_FullSummary_LTC_data_Source.csv"
all_facilities_df_match.to_csv(filename, index=False)

# filename =  "Reporting_data/IL_" + reporting_date + "_Outbreaks_FullSummary_LTC_data_Source.csv"
# all_outbreaks_df.to_csv(filename, index=False)

In [23]:
reporting_date

'2020-08-07'

In [None]:
#df_facilities_w_cms = add_cms_data_to_df(all_facilities_df, state_cms_data_latest, 'CMS_ProvNum', 'Federal Provider Number')


# 2 - Load Supporting Files (CMS et al)

In [None]:
# get_matching_cms_date('2020-10-26T00:00:00.000000000')

# Play Area

In [None]:
#ltc_name2cms_id

# OLD CODE

# Individual Facility Data