## Imports

In [1]:
import numpy as np
import pandas as pd
import warnings
# NOTE(review): this silences every warning for the rest of the notebook, including
# pandas deprecation/FutureWarnings — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# Load the weekly nursing-home COVID-19 extract; 'Week Ending' is parsed to datetime
# so it can be compared against dates later in the notebook.
# NOTE(review): identifier-like columns ('Federal Provider Number', 'Provider Zip Code')
# are left to pandas' type inference and show up as plain numbers in the outputs
# (e.g. 45189) — confirm whether they should be read as strings instead.
df = pd.read_csv(
    "COVID-19_Nursing_Home_Dataset.csv",
    parse_dates=["Week Ending"],
)

## Basic Analysis 

In [18]:
# (rows, columns) of the full dataset; the row count is compared with the
# submitted-only subset further down.
all_shape = df.shape
all_shape

(567976, 122)

In [4]:
# Keep only the weekly rows whose 'Submitted Data' flag is 'Y'.
is_submitted = df['Submitted Data'].eq('Y')
submitted = df.loc[is_submitted]

In [5]:
# (rows, columns) of the submitted-only subset.
did_sub_shape = submitted.shape

In [6]:
# Only 6,097 rows are missing from the submitted-only subset, i.e. weekly rows
# whose 'Submitted Data' flag is not 'Y'.

total_rows, submitted_rows = all_shape[0], did_sub_shape[0]
total_rows - submitted_rows

6097

In [19]:
# 19,171 unique federal provider numbers for skilled nursing facilities
# (distinct non-null values of the identifier column).

provider_numbers = df['Federal Provider Number']
provider_numbers.nunique()

19171

In [20]:
# Peek at the first row to see the column layout.
df.head(1)

Unnamed: 0,Week Ending,Federal Provider Number,Provider Name,Provider Address,Provider City,Provider State,Provider Zip Code,Submitted Data,Passed Quality Assurance Check,Residents Weekly Admissions COVID-19,...,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result with Positive Antigen Test AND Negative NAAT (PCR) Test,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result with Any Other Combination of Antigen Test and/or NAAT (PCR) Test with At Least One Positive Test,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result who are Reinfected,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result who are Reinfected and Symptomatic,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result who are Reinfected and Asymptomatic,Number of Staff and/or Personnel with New Influenza,Number of Staff and/or Personnel with Acute Respiratory Illness Symptoms Excluding COVID-19 and/or Influenza,Number of Staff and/or Personnel with Confirmed Coinfection with Influenza and COVID-19,Submitted Data Counts,Passed Quality Counts
0,2020-05-24,45189,SOMERSET SENIOR LIVING AT PINE HILLS,900 MAGNOLIA RD,CAMDEN,AR,71701,N,,,...,,,,,,,,,0,0


In [21]:
# 19,100 distinct providers have at least one submitted row, so
# 71 of the 19,171 facilities did not submit data at all.

submitted_providers = submitted['Federal Provider Number']
submitted_providers.nunique()

19100

In [22]:
# First report for 2020 was filed the week of May 24th
# There are 31 weeks between those dates
# On average, 15,353 reports were filed weekly
#
# The cutoff is an explicit timestamp rather than the string '12/31/20': that
# month/day/two-digit-year form is ambiguous, and a strict "<" against Dec 31st
# would drop a week ending on that day from the 2020 subset.
# NOTE(review): the mean is total rows / distinct weeks; a result ending in .25
# means the number of distinct weeks is a multiple of 4, which suggests 32 weekly
# reports (May 24th through Dec 27th inclusive) rather than 31 — confirm.
START_OF_2021 = pd.Timestamp("2021-01-01")
year_2020 = df[df['Week Ending'] < START_OF_2021]
year_2020['Week Ending'].value_counts().mean()

15353.25

In [23]:
# finding distribution of reporting
# Per-row 0/1 indicators, so that summing them per facility gives the number of
# weeks reported and the number of weeks that passed the QA check.
# A row counts as submitted only when the flag is exactly 'Y' — the same rule
# used to build `submitted` and used for the QA flag on the next line. The
# previous `== 'N' -> 0, else 1` test counted missing or unexpected flag values
# as submissions.
df['Submitted Data Counts'] = np.where(df['Submitted Data'] == 'Y', 1, 0)
df['Passed Quality Counts'] = np.where(df['Passed Quality Assurance Check'] == 'Y', 1, 0)

In [24]:
# One row per facility with every numeric column summed across its weekly rows.
# `numeric_only=True` is spelled out because the frame also holds datetime and
# text columns ('Week Ending', addresses, Y/N flags): older pandas dropped those
# silently, while pandas >= 2.0 raises on them when the argument is omitted.
# NOTE(review): identifier-like numeric columns such as 'Provider Zip Code' are
# summed too, so their values in `grouped` are not meaningful.
grouped = (
    df.groupby(by=['Federal Provider Number', 'Provider Name', 'Provider State'])
    .sum(numeric_only=True)
    .reset_index()
)

In [25]:
# On average, facilities submitted for 29 of the 31 weeks
# That's a fairly good average
# NOTE(review): `grouped` is built from the full `df`, not from the `year_2020`
# subset — confirm that the 31-week denominator applies to this figure.
submission_counts = grouped['Submitted Data Counts']
submission_counts.mean()

29.308799749621823

In [30]:
# Preview the per-facility totals.
# NOTE(review): 'Provider Zip Code' shows up here as a summed value (e.g. 182020),
# an artifact of summing every numeric column — ignore it in this table.
grouped.head(5)

Unnamed: 0,Federal Provider Number,Provider Name,Provider State,Provider Zip Code,Residents Weekly Admissions COVID-19,Residents Total Admissions COVID-19,Residents Weekly Confirmed COVID-19,Residents Total Confirmed COVID-19,Residents Weekly Suspected COVID-19,Residents Total Suspected COVID-19,...,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result with Positive Antigen Test AND Negative NAAT (PCR) Test,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result with Any Other Combination of Antigen Test and/or NAAT (PCR) Test with At Least One Positive Test,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result who are Reinfected,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result who are Reinfected and Symptomatic,Number of Staff and/or Personnel with a New Positive COVID-19 Test Result who are Reinfected and Asymptomatic,Number of Staff and/or Personnel with New Influenza,Number of Staff and/or Personnel with Acute Respiratory Illness Symptoms Excluding COVID-19 and/or Influenza,Number of Staff and/or Personnel with Confirmed Coinfection with Influenza and COVID-19,Submitted Data Counts,Passed Quality Counts
0,55139,SANTA TERESITA MANOR,CA,182020,0.0,4.0,0.0,12.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2
1,55141,RAMONA NURSING & REHABILITATION CENTER,CA,91732,0.0,21.0,0.0,63.0,3.0,32.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
2,55147,MADERA REHABILITATION & NURSING CENTER,CA,374552,8.0,72.0,30.0,289.0,14.0,342.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4
3,55150,WINDSOR MANOR REHABILITATION CENTER,CA,94521,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1
4,55153,MONTEBELLO CARE CENTER,CA,181280,0.0,24.0,0.0,95.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2


In [28]:
# Findings recorded for this step (only the two subsets below are built in this cell;
# the 20-week and 5-week figures are not computed here):
# 12,739 facilities reported each of the 31 weeks
# Only 11 out of the 12,739 reports are missing due to bad inputs
# 15,027 facilities reported for 20 or more weeks
# 1,279 facilities reported fewer than 5 weeks
# 71 facilities reported 0 times
# NOTE(review): the filter below uses a threshold of 30, not 31 — confirm which
# cutoff the "each of the 31 weeks" figure refers to.
weeks_submitted = grouped['Submitted Data Counts']
weeks_passed_qa = grouped['Passed Quality Counts']

# Facilities with at least 30 submitted weeks that also passed QA in at least 30 weeks.
reported_with_no_errors = grouped.loc[weeks_submitted.ge(30) & weeks_passed_qa.ge(30)]
# Facilities with no submitted week at all.
no_reporting = grouped.loc[weeks_submitted.eq(0)]

In [29]:
# 71 facilities did not report at all
# (the first element of the shape is the number of never-reporting facilities)
no_reporting.shape

(71, 57)

In [31]:
# Number of never-reporting facilities per state.
no_reporting['Provider State'].value_counts()

TX    16
OH    12
FL     7
CA     6
OK     4
IN     3
IL     3
NC     3
VA     2
GA     2
KY     2
NJ     1
AZ     1
OR     1
MA     1
WA     1
MO     1
MN     1
TN     1
ME     1
KS     1
UT     1
Name: Provider State, dtype: int64