# Test DataIO Python Library routines

This Jupyter notebook can be used to test the recently written dataIO python functions, which have been collected into the data_IO library.

In [131]:
# Enable IPython's autoreload extension so that any external library file
# is reloaded if it changes.  Useful when developing external libraries,
# since otherwise Jupyter will not re-import any library without restarting
# the python kernel.
# Re-running this cell in a live kernel only prints an
# "already loaded" notice from %load_ext.

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [132]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import datetime

# Import COVID data IO routines from external python library
import COVIDlib.data_IO as COVID_IO

## Define variables of interest below
data_dir = 'our_data/'    # Data directory for the COVID datafiles
test_dir = 'test_data/'   # Data directory for storing test datafiles (prepended to the TEST_*.csv names in later cells)

## Define FIPS corresponding to various local areas
## (each 5-digit county code below starts with the 2-digit code of one of the states)
ClayFIPS = 27027   # Clay County; used with MNFIPS in the Minnesota tests below
CassFIPS = 38017   # presumably Cass County, North Dakota (prefix 38 == NDFIPS) -- TODO confirm
MNFIPS = 27        # Minnesota
NDFIPS = 38        # presumably North Dakota -- TODO confirm

## Test Johns Hopkins DataIO

Execute and test the Johns Hopkins DataIO routines first authored by Luke

In [3]:
##
## Retrieve the Johns Hopkins data
##
## Two sets of frames are built: one from the library's default files and one
## from the TEST_*.csv files under test_dir, so that later cells can compare
## the reader output against known values.

# Retrieve Johns Hopkins dataframes (no arguments, so the library defaults are used);
# the result is unpacked as (state-level, county-level)
(JH_state_df, JH_cnty_df) = COVID_IO.GetCDRDataFrames()
# Retrieve State-Level data for Minnesota
MN_df = COVID_IO.GetCDRState(MNFIPS, JH_state_df)
# Retrieve County-level data for Clay County
CLAY_df = COVID_IO.GetCDRCounty(ClayFIPS, JH_cnty_df)

# Retrieve Test Johns Hopkins dataframes from explicitly named TEST files
test_cntyfile = test_dir+"TEST_countylevel_combinedCDR.csv"
test_statefile = test_dir+"TEST_statelevel_combinedCDR.csv"
(testJH_state_df, testJH_cnty_df) = COVID_IO.GetCDRDataFrames(stateFile = test_statefile, countyFile = test_cntyfile)
# Retrieve TEST State-Level data for Minnesota
TESTMN_df = COVID_IO.GetCDRState(MNFIPS, testJH_state_df)
# Retrieve TEST County-level data for Clay County
TESTCLAY_df = COVID_IO.GetCDRCounty(ClayFIPS, testJH_cnty_df)

In [4]:
print("Testing Johns Hopkins Test Data")
print(("-"*100))


def _report_first_day(label, frame, expected_confirmed, expected_date=np.datetime64('2020-03-22')):
    """Check the first date and first confirmed count of a Johns Hopkins TEST frame.

    label              -- name of the area, used only in the failure message
    frame              -- frame returned by GetCDRState / GetCDRCounty; its
                          'Dates' and 'Confirmed' cells are indexed as
                          .values[0][0] to get the first entry
    expected_confirmed -- confirmed count expected on the first day
    expected_date      -- date expected for the first entry

    Prints one line describing the outcome; nothing is returned.
    """
    first_date = frame['Dates'].values[0][0]
    first_confirmed = frame['Confirmed'].values[0][0]
    if first_date == expected_date and first_confirmed == expected_confirmed:
        # The value checked here is the 'Confirmed' column, so the message says so.
        print("Correct Date format: {0} Compared to {1} & Correct Confirmed Format: {2} compared to {3}".format(
            first_date, expected_date, first_confirmed, expected_confirmed))
    else:
        # Name the area and show the offending values so a failure can be located.
        print("Incorrect Results for {0}: first date {1} (expected {2}), first confirmed {3} (expected {4})".format(
            label, first_date, expected_date, first_confirmed, expected_confirmed))


# State-level TEST data is expected to start with 0 confirmed, county-level with 10.
_report_first_day("Minnesota", TESTMN_df, 0)
_report_first_day("Clay County", TESTCLAY_df, 10)

Testing John Hopkins Test Data
----------------------------------------------------------------------------------------------------
Correct Date format: 2020-03-22 Compared to 2020-03-22 & Correct Death Format: 0 compared to 0
Correct Date format: 2020-03-22 Compared to 2020-03-22 & Correct Death Format: 10 compared to 10


In [5]:
##
## The Clay county data should have a steady 10 confirmed per day starting March 22, 2020, so the number of confirmed increases as 10 cases per day (means dConfirmed is 10/day)
## First death occurs 14 days later (since I "kill" 10% of the infected at the end of 14 days, the other 90% are recovered).  Check this!

##
## The Minnesota data should have an unreal situation of 10 additional new cases a day from March 22 to June 1.  We still "kill" 10% of the infected 14 days later and mark the
## other 90% "recovered."  Check this.

In [92]:
# Compare the three Clay County TEST series against the values implied by the
# synthetic data description: 72 days, 10 new confirmed cases per day, and
# from the 15th entry on 1 death and 9 recoveries added per day.

# --- Confirmed: 10, 20, ..., 720 ---
testConfirmed = TESTCLAY_df['Confirmed'][0]
expectedConfirmed = list(range(10, 730, 10))

print(
    "County Confirmed cases - Passed"
    if testConfirmed == expectedConfirmed
    else "County Confirmed cases - Failed\n\tExpected: List w/ 10 new cases per day\tActual: {0}".format(testConfirmed)
)

# --- Deaths: fourteen zeros, then 1, 2, ..., 58 ---
testDeaths = TESTCLAY_df['Deaths'][0]
expectedDeaths = [0] * 14 + list(range(1, 59))

# Index 14 (the 15th entry) is where the first death is expected.
if not (testDeaths == expectedDeaths and testDeaths[14] == 1):
    print("County Deaths - Failed\n\tExpected: List w/ 10% of infected patients dying, the first occuring 14 days in\tActual: {0}".format(testDeaths))
else:
    print("County Deaths - Passed")

# --- Recovered: fourteen zeros, then 9, 18, ..., 522 ---
testRecovery = TESTCLAY_df['Recovered'][0]
expectedRecovery = [0] * 14 + [9 * day for day in range(1, 59)]

print(
    "County Recovered - Passed"
    if testRecovery == expectedRecovery
    else "County Recovered - Failed\n\tExpected: List w/ 90% of infected patients having recovered\tActual: {0}".format(testRecovery)
)



County Confirmed cases - Passed
County Deaths - Passed
County Recovered - Passed


## Test Apple and Google Mobility DataIO

Execute and test the Apple and Google Mobility DataIO routines first authored by Dio

In [81]:
##
## Retrieve the Apple Mobility Data
##
## NOTE(review): the output saved with this cell is
##   ValueError: could not convert string to float: '2020-02-15'
## so one of the calls in this cell fails before the cell completes.  Which
## call raises is not visible here -- check the library's date handling.

# Retrieve Apple Mobility Dataframe (no arguments, so the library defaults are used);
# the result is unpacked as (county-level, state-level)
(aapl_cnty_df, aapl_state_df) = COVID_IO.initAaplMobilityDataframes()
# Get real Clay county and Minnesota mobility data
aapl_CLAY_df = COVID_IO.getAaplCountyMobility(ClayFIPS, aapl_cnty_df)
aapl_MN_df = COVID_IO.getAaplStateMobility(MNFIPS, aapl_state_df)

# Retrieve TEST Apple Mobility Dataframe
# (test_cntyfile / test_statefile are rebound here and again for Google below)
test_cntyfile = test_dir+"TEST_aapl_mobility_cnty.csv"
test_statefile = test_dir+"TEST_aapl_mobility_state.csv"
(testaapl_cnty_df, testaapl_state_df) = COVID_IO.initAaplMobilityDataframes(countyFile = test_cntyfile, stateFile = test_statefile)
# Get TEST Clay county and Minnesota mobility data
testaapl_CLAY_df = COVID_IO.getAaplCountyMobility(ClayFIPS, testaapl_cnty_df)
testaapl_MN_df = COVID_IO.getAaplStateMobility(MNFIPS, testaapl_state_df)

##
## Retrieve the Google Mobility Data
##

# Retrieve Google Mobility Dataframe; unpacked as (county-level, state-level)
(goog_cnty_df, goog_state_df) = COVID_IO.initgoogMobilityDataframes()
# Get real Clay county and Minnesota mobility data
goog_CLAY_df = COVID_IO.getGoogleCountyMobility(ClayFIPS, goog_cnty_df)
goog_MN_df = COVID_IO.getGoogleStateMobility(MNFIPS, goog_state_df)

# Retrieve TEST Google Mobility Dataframe
test_cntyfile = test_dir+"TEST_goog_mobility_cnty.csv"
test_statefile = test_dir+"TEST_goog_mobility_state.csv"
(testgoog_cnty_df, testgoog_state_df) = COVID_IO.initgoogMobilityDataframes(countyFile = test_cntyfile, stateFile = test_statefile)
# Get TEST Clay county and Minnesota mobility data
# (the next cell reads testgoog_CLAY_df and testgoog_MN_df)
testgoog_CLAY_df = COVID_IO.getGoogleCountyMobility(ClayFIPS, testgoog_cnty_df)
testgoog_MN_df = COVID_IO.getGoogleStateMobility(MNFIPS, testgoog_state_df)

ValueError: could not convert string to float: '2020-02-15'

In [80]:
print("Testing Mobility Test Data")
print(("-"*100))

# NOTE(review): the output saved with this cell is a NameError for
# testgoog_CLAY_df, i.e. the cell that loads the mobility data did not finish.
# This cell can only run after that load cell succeeds.
# NOTE(review): the two percent columns used below differ in capitalisation
# ('retail_recreation_Percent' vs 'residential_percent') -- confirm both match
# the column names produced by the library.

# Google TEST data for Clay County: first date and first retail/recreation value.
clay_first_date = testgoog_CLAY_df['dates'].values[0][0]
clay_first_retail = testgoog_CLAY_df['retail_recreation_Percent'].values[0][0]
if clay_first_date == np.datetime64('2020-02-15') and clay_first_retail == 0.0:
    print("Correct Date format: {0} Compared to 2020-02-15 & Retail/Rec % Format: {1} compared to 0.0".format(clay_first_date, clay_first_retail))
else:
    # Say which check failed and with what values.
    print("Incorrect Results for Google Clay County data: first date {0} (expected 2020-02-15), first retail/rec % {1} (expected 0.0)".format(clay_first_date, clay_first_retail))

# Google TEST data for Minnesota: state name and first residential value.
mn_state_name = testgoog_MN_df['State'].values[0]
mn_first_residential = testgoog_MN_df['residential_percent'].values[0][0]
if mn_state_name == "Minnesota" and mn_first_residential == 5.0:
    print("Correct State format: {0} Compared to Minnesota & Correct Res% Format: {1} compared to 5.0".format(mn_state_name, mn_first_residential))
else:
    print("Incorrect Results for Google Minnesota data: state {0} (expected Minnesota), first residential % {1} (expected 5.0)".format(mn_state_name, mn_first_residential))

Testing Mobility Test Data
----------------------------------------------------------------------------------------------------


NameError: name 'testgoog_CLAY_df' is not defined

In [None]:
#
## Test Apple Mobility Data should be checked
##  - For Clay county, the test data is a sawtooth pattern from -30 to +30 with a 7 day period starting on Feb. 15, 2020
##  - For Minnesota, the test data is a boxcar pattern: 4 days at 30 followed by 4 days at -30 and back again, starting on Feb. 15, 2020

In [None]:
##
## Test Google Mobility Data should be checked
##  - For Clay county, the test data is a sine wave with an amplitude of 20 and wavelength of 30 days starting on Feb. 15, 2020
##  - For Minnesota, the test data is a cosine wave with an amplitude of 20 and wavelength of 30 days starting on Feb. 15, 2020

## Test IMHE DataIO

Execute and test the IMHE DataIO routines first authored by Luke

In [133]:
##
## Retrieve the IMHE Data
##
## As with the other data sources, the same accessors are run twice: once on
## frames built from the library's default files and once on frames built from
## the TEST_*.csv files under test_dir.  Every accessor is called for MNFIPS.

# Retrieve IMHE Dataframes (no arguments, so the library defaults are used);
# the result is unpacked as (summary, hospitalization)
(summary_df, hospitalization_df) = COVID_IO.GetIMHEDataFrames()
# Retrieve specific Dataframes and Data for MN
equip_df = COVID_IO.GetEquipData(MNFIPS, summary_df)
icu_beds = COVID_IO.GetNumICUBeds(MNFIPS, summary_df)
all_beds = COVID_IO.GetNumAllBeds(MNFIPS, summary_df)
icu_usage = COVID_IO.GetICUBedUsage(MNFIPS, summary_df)
allbed_usage = COVID_IO.GetAllBedUsage(MNFIPS, summary_df)
hospital_df = COVID_IO.GetHospitalizationData(MNFIPS, hospitalization_df)

# Retrieve TEST IMHE Dataframes
testsummaryfile = test_dir+"TEST_imhe_summary.csv"
testhospitalizationfile = test_dir+"TEST_imhe_hospitalizations.csv"
(testsummary_df, testhospitalization_df) = COVID_IO.GetIMHEDataFrames(summaryFile = testsummaryfile, hospitalFile = testhospitalizationfile)
# Retrieve specific TEST Dataframes and Data for MN
# (these test* names are the inputs to the two checking cells that follow)
testequip_df = COVID_IO.GetEquipData(MNFIPS, testsummary_df)
testicu_beds = COVID_IO.GetNumICUBeds(MNFIPS, testsummary_df)
testall_beds = COVID_IO.GetNumAllBeds(MNFIPS, testsummary_df)
testicu_usage = COVID_IO.GetICUBedUsage(MNFIPS, testsummary_df)
testallbed_usage = COVID_IO.GetAllBedUsage(MNFIPS, testsummary_df)
testhospital_df = COVID_IO.GetHospitalizationData(MNFIPS, testhospitalization_df)

 

In [134]:
##
## Test IMHE data for Minnesota should be checked.
##  All summary dates were set to May 15, 2020 (lower May 1, upper June 15)
##  Bed capacity was set to 2000 and ICU capacity to 200. 
##  The test data assumes 100% usage.


In [135]:
# Testing Functions for IMHE Summary Data
print("----------------------------------------\nTesting Functions for IMHE Summary Data\n----------------------------------------\n")


def _report_value(label, actual, expected, trailer=""):
    """Print a Passed/Failed line for one scalar check, then the two values.

    label    -- name of the library function under test (shown in the output)
    actual   -- value returned by that function for the TEST data
    expected -- value the TEST data was built to produce
    trailer  -- text appended to the second line; "\n" leaves a blank line
                after a group of related checks
    """
    print("{0} : {1}".format(label, "Passed" if actual == expected else "Failed"))
    print("\tExpected value: {0} - Actual value: {1}".format(expected, actual) + trailer)


def _report_peak_day(label, frame, column, expected_day=datetime.date(2020, 5, 15)):
    """Print whether the first value of ``frame[column]`` equals ``expected_day``."""
    matches = frame[column].values[0] == expected_day
    print("{0} - {1}".format(label, "Passed" if matches else "Failed"))


# Capacity: the TEST summary sets 200 ICU beds and 2000 beds overall.
_report_value("GetNumICUBeds", testicu_beds, 200)
_report_value("GetNumAllBeds", testall_beds, 2000, trailer="\n")

# Usage: the TEST data assumes 100% usage, so usage equals capacity.
_report_value("GetICUBedUsage", testicu_usage, 200)
# This check exercises GetAllBedUsage, so it is labelled with that name.
_report_value("GetAllBedUsage", testallbed_usage, 2000, trailer="\n")

# Peak days: all summary dates in the TEST data were set to May 15, 2020.
_report_peak_day("Peak Bed Day Mean", testequip_df, 'peak_bed_day_mean')
_report_peak_day("Peak ICU Bed Day Mean", testequip_df, 'peak_icu_bed_day_mean')
_report_peak_day("Peak Vent Day Mean", testequip_df, 'peak_vent_day_mean')

----------------------------------------
Testing Functions for IMHE Summary Data
----------------------------------------

GetNumICUBeds : Passed
	Expected value: 200 - Actual value: 200
GetNumAllBeds : Passed
	Expected value: 2000 - Actual value: 2000

GetICUBedUsage : Passed
	Expected value: 200 - Actual value: 200
GetNumAllBeds : Passed
	Expected value: 2000 - Actual value: 2000

Peak Bed Day Mean - Passed
Peak ICU Bed Day Mean - Passed
Peak Vent Day Mean - Passed


In [98]:
# The hospitalization data starts on March 22, 2020
# test_imhe_hospitalizations['allbed_mean'] is increasing at 1000 per day (10% margins of error on lower/upper)
# test_imhe_hospitalizations['ICUbed_mean'] is increasing at 100 per day (10% margins of error on lower/upper)
# test_imhe_hospitalizations['InvVen_mean'] is increasing at 50 per day (10% margins of error on lower/upper)
# For the hospitalization deaths/admits/ICU use, the test data has a constant 10 deaths/100 admits/20 ICU a day 
# (with 10% margins of error on lower/upper)

In [99]:
print("-----------------------------------------------\nTesting Functions for IMHE Hospitalization Data\n-----------------------------------------------\n")


def _report(label, passed):
    """Print '<label> - Passed' or '<label> - Failed' depending on ``passed``."""
    print("{0} - {1}".format(label, "Passed" if passed else "Failed"))


def _all_equal(values, expected):
    """Return True when every entry of ``values`` equals ``expected`` (True for an empty sequence)."""
    return all(entry == expected for entry in values)


# The first recorded date of the TEST hospitalization data.
first_date = testhospital_df['dates'][0][0]
_report('Start date', first_date == datetime.date(2020, 3, 22))
print("\tExpected value: 2020-03-22 - Actual value: {0}\n".format(first_date))

# Expected cumulative series: 133 entries starting at 0 and growing by a
# fixed daily step (1000 beds, 100 ICU beds, 50 ventilators).
allBedList = list(range(0, 133000, 1000))   # 0, 1000, ..., 132000
icuBedList = list(range(0, 13300, 100))     # 0, 100, ..., 13200
ventList = list(range(0, 6650, 50))         # 0, 50, ..., 6600

_report("All bed data", testhospital_df['allbed_mean'][0] == allBedList)
_report("ICU bed data", testhospital_df['ICUbed_mean'][0] == icuBedList)
_report("Inv. Vent data", testhospital_df['InvVen_mean'][0] == ventList)

print("\n")

# Constant daily series: every entry is compared against 10.
# NOTE(review): the description of the test data in this notebook mentions
# 10 deaths / 100 admits / 20 ICU per day, yet all three series are checked
# against 10 here (and the saved output shows them passing) -- confirm which
# is right before changing the expected values.
_report("Num. Deaths Mean", _all_equal(testhospital_df['deaths_mean'][0], 10))
_report("Num. Admissions Mean", _all_equal(testhospital_df['admis_mean'][0], 10))
# This check reads 'newICU_mean', so it gets its own label instead of
# repeating the admissions one.
_report("Num. New ICU Mean", _all_equal(testhospital_df['newICU_mean'][0], 10))

-----------------------------------------------
Testing Functions for IMHE Hospitalization Data
-----------------------------------------------

Start date - Passed
	Expected value: 2020-03-22 - Actual value: 2020-03-22

All bed data - Passed
ICU bed data - Passed
Inv. Vent data - Passed


Num. Deaths Mean - Passed
Num. Admissions Mean - Passed
Num. Admissions Mean - Passed
