# Comparing DataIO Methods

This is a Notebook that runs (and times) the three approaches developed so far for transferring processed data into our program.  Note that all of these dataframes, when finally loaded, are absolutely identical in memory, so any speed/simplicity gains should be appreciated.

In [1]:
# Enable IPython's autoreload extension so that edits to external library
# files are picked up automatically.  Useful when developing external
# libraries, since otherwise Jupyter will not re-import an already-imported
# library without restarting the python kernel.

%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import datetime
import time

# Import COVID data IO routines from external python library
import COVIDlib.data_IO as COVID_IO

## Define variables of interest below
# NOTE(review): data_dir is not passed to any COVID_IO call in the cells shown
# in this notebook -- confirm whether it is still needed.
data_dir = 'our_data/'    # Data directory for the COVID datafiles

## Define FIPS corresponding to various local areas
# The two county codes begin with the matching state code defined below
# (27027 -> 27, 38017 -> 38); the state codes are the bare two-digit values.
ClayFIPS = 27027
CassFIPS = 38017
MNFIPS = 27
NDFIPS = 38

## Oldest DataIO approach

This approach loads the CSV files in memory, but then requires us to massage the data from strings to lists every time we select a subset of the data.

In [3]:
# Time the "oldest" workflow in two phases:
#   1. start -> midpt : read the complete datasets from CSV files
#   2. midpt -> end   : build the MN / ND / Clay / Cass subsets (this is the
#                       phase the progress message labels "restructuring data")
print("Loading Complete COVID Dataset into memory from CSV files ", end='')

start= time.perf_counter()
# Retrieve Johns Hopkins dataframes
(oldest_JH_state_df, oldest_JH_cnty_df) = COVID_IO.GetCDRDataFrames()
# Retrieve Apple Mobility Dataframe
# NOTE(review): the mobility results are unpacked county-first, while the
# Johns Hopkins call above is unpacked state-first -- confirm against COVID_IO.
(oldest_aapl_cnty_df, oldest_aapl_state_df) = COVID_IO.initAaplMobilityDataframes()
# Retrieve Google Mobility Dataframe
(oldest_goog_cnty_df, oldest_goog_state_df) = COVID_IO.initgoogMobilityDataframes()
# Retrieve IMHE Dataframes
(oldest_summary_df, oldest_hospitalization_df) = COVID_IO.GetIMHEDataFrames()

# Narrow data down to local subsets
# midpt closes the load phase and opens the subset phase
midpt= time.perf_counter()
print(f"({midpt-start:0.2f} sec elapsed) ... Building local datasets and restructuring data  ", end='')

# Build local dataframes for Johns Hopkins Data
# (state-level helper for the state FIPS, county-level helper for the county FIPS)
oldest_MN_CDR_df = COVID_IO.GetCDRState(MNFIPS, oldest_JH_state_df)
oldest_ND_CDR_df = COVID_IO.GetCDRState(NDFIPS, oldest_JH_state_df)
oldest_CLAY_CDR_df = COVID_IO.GetCDRCounty(ClayFIPS, oldest_JH_cnty_df)
oldest_CASS_CDR_df = COVID_IO.GetCDRCounty(CassFIPS, oldest_JH_cnty_df)

# Build local dataframes of mobility data (Apple first, then Google)
oldest_CLAY_aapl_df = COVID_IO.getAaplCountyMobility(ClayFIPS, oldest_aapl_cnty_df)
oldest_CASS_aapl_df = COVID_IO.getAaplCountyMobility(CassFIPS, oldest_aapl_cnty_df)
oldest_MN_aapl_df = COVID_IO.getAaplStateMobility(MNFIPS, oldest_aapl_state_df)
oldest_ND_aapl_df = COVID_IO.getAaplStateMobility(NDFIPS, oldest_aapl_state_df)
oldest_CLAY_goog_df = COVID_IO.getGoogleCountyMobility(ClayFIPS, oldest_goog_cnty_df)
oldest_CASS_goog_df = COVID_IO.getGoogleCountyMobility(CassFIPS, oldest_goog_cnty_df)
oldest_MN_goog_df = COVID_IO.getGoogleStateMobility(MNFIPS, oldest_goog_state_df)
oldest_ND_goog_df = COVID_IO.getGoogleStateMobility(NDFIPS, oldest_goog_state_df)

# Build local dataframes/variables of IMHE data
# Everything except the hospitalization frames is derived from the summary dataframe.
oldest_MN_equip_df = COVID_IO.GetEquipData(MNFIPS, oldest_summary_df)
oldest_MN_icu_beds = COVID_IO.GetNumICUBeds(MNFIPS, oldest_summary_df)
oldest_MN_all_beds = COVID_IO.GetNumAllBeds(MNFIPS, oldest_summary_df)
oldest_MN_icu_usage = COVID_IO.GetICUBedUsage(MNFIPS, oldest_summary_df)
oldest_MN_allbed_usage = COVID_IO.GetAllBedUsage(MNFIPS, oldest_summary_df)
oldest_MN_hospital_df = COVID_IO.GetHospitalizationData(MNFIPS, oldest_hospitalization_df)
oldest_ND_equip_df = COVID_IO.GetEquipData(NDFIPS, oldest_summary_df)
oldest_ND_icu_beds = COVID_IO.GetNumICUBeds(NDFIPS, oldest_summary_df)
oldest_ND_all_beds = COVID_IO.GetNumAllBeds(NDFIPS, oldest_summary_df)
oldest_ND_icu_usage = COVID_IO.GetICUBedUsage(NDFIPS, oldest_summary_df)
oldest_ND_allbed_usage = COVID_IO.GetAllBedUsage(NDFIPS, oldest_summary_df)
oldest_ND_hospital_df = COVID_IO.GetHospitalizationData(NDFIPS, oldest_hospitalization_df)

end= time.perf_counter()

# Report the subset-phase time and the overall time for both phases
print(f"({end-midpt:0.2f} sec elapsed) ... Done ({end-start:0.2f} sec total for all operations)")

Loading Complete COVID Dataset into memory from CSV files (0.30 sec elapsed) ... Building local datasets and restructuring data  (0.15 sec elapsed) ... Done (0.45 sec total for all operations)


## Previous DataIO approach

This approach loads the CSV files in memory and massages the entire dataframe at once to clean up the data.  This means any later subselections of the data are really fast.

In [4]:
# Time the "previous" workflow in two phases:
#   1. start -> midpt : read the CSV files; per the progress message, the
#                       CSVto* loaders also restructure the data in this phase
#   2. midpt -> end   : select the MN / ND / Clay / Cass subsets
print("Loading Complete COVID Dataset into memory from CSV files and restructuring data ", end='')

start = time.perf_counter()
# Retrieve Johns Hopkins dataframes
(old_JH_state_df, old_JH_cnty_df) = COVID_IO.CSVtoCDRDataFrames()
# Retrieve Apple Mobility Dataframe
# NOTE(review): the mobility results are unpacked county-first, while the
# Johns Hopkins call above is unpacked state-first -- confirm against COVID_IO.
(old_aapl_cnty_df, old_aapl_state_df) = COVID_IO.CSVtoAAPLMobilityDataFrames()
# Retrieve Google Mobility Dataframe
(old_goog_cnty_df, old_goog_state_df) = COVID_IO.CSVtoGOOGMobilityDataFrames()
# Retrieve IMHE Dataframes
(old_summary_df, old_hospitalization_df) = COVID_IO.CSVtoIMHEDataFrames()

# Narrow data down to local subsets
# midpt is the single checkpoint between the two phases; `end` is only
# recorded once, after all the work is finished.
midpt = time.perf_counter()
print(f"({midpt-start:0.2f} sec elapsed) ... Building local datasets  ", end='')

# Build local dataframes for Johns Hopkins Data
# (the same getLocalDataFrame helper is used for state and county FIPS alike)
old_MN_CDR_df = COVID_IO.getLocalDataFrame(MNFIPS, old_JH_state_df)
old_ND_CDR_df = COVID_IO.getLocalDataFrame(NDFIPS, old_JH_state_df)
old_CLAY_CDR_df = COVID_IO.getLocalDataFrame(ClayFIPS, old_JH_cnty_df)
old_CASS_CDR_df = COVID_IO.getLocalDataFrame(CassFIPS, old_JH_cnty_df)

# Build local dataframes of mobility data (Apple first, then Google)
old_CLAY_aapl_df = COVID_IO.getLocalDataFrame(ClayFIPS, old_aapl_cnty_df)
old_CASS_aapl_df = COVID_IO.getLocalDataFrame(CassFIPS, old_aapl_cnty_df)
old_MN_aapl_df = COVID_IO.getLocalDataFrame(MNFIPS, old_aapl_state_df)
old_ND_aapl_df = COVID_IO.getLocalDataFrame(NDFIPS, old_aapl_state_df)
old_CLAY_goog_df = COVID_IO.getLocalDataFrame(ClayFIPS, old_goog_cnty_df)
old_CASS_goog_df = COVID_IO.getLocalDataFrame(CassFIPS, old_goog_cnty_df)
old_MN_goog_df = COVID_IO.getLocalDataFrame(MNFIPS, old_goog_state_df)
old_ND_goog_df = COVID_IO.getLocalDataFrame(NDFIPS, old_goog_state_df)

# Build local dataframes/variables of IMHE data
# Everything except the hospitalization frames is derived from the summary dataframe.
old_MN_equip_df = COVID_IO.getLocalDataFrame(MNFIPS, old_summary_df)
old_MN_icu_beds = COVID_IO.GetNumICUBeds(MNFIPS, old_summary_df)
old_MN_all_beds = COVID_IO.GetNumAllBeds(MNFIPS, old_summary_df)
old_MN_icu_usage = COVID_IO.GetICUBedUsage(MNFIPS, old_summary_df)
old_MN_allbed_usage = COVID_IO.GetAllBedUsage(MNFIPS, old_summary_df)
old_MN_hospital_df = COVID_IO.getLocalDataFrame(MNFIPS, old_hospitalization_df)
old_ND_equip_df = COVID_IO.getLocalDataFrame(NDFIPS, old_summary_df)
old_ND_icu_beds = COVID_IO.GetNumICUBeds(NDFIPS, old_summary_df)
old_ND_all_beds = COVID_IO.GetNumAllBeds(NDFIPS, old_summary_df)
old_ND_icu_usage = COVID_IO.GetICUBedUsage(NDFIPS, old_summary_df)
old_ND_allbed_usage = COVID_IO.GetAllBedUsage(NDFIPS, old_summary_df)
old_ND_hospital_df = COVID_IO.getLocalDataFrame(NDFIPS, old_hospitalization_df)

end = time.perf_counter()

# Report the subset-phase time and the overall time for both phases
print(f"({end-midpt:0.2f} sec elapsed) ... Done ({end-start:0.2f} sec total for all operations)")

Loading Complete COVID Dataset into memory from CSV files and restructuring data (14.23 sec elapsed) ... Building local datasets  (0.02 sec elapsed) ... Done (14.25 sec total for all operations)


## Pickling DataIO approach

Pickling is an approach that takes a variable/object in memory and constructs a binary representation of it that is written to disk.  We can then simply read that binary representation and get back the exact object in memory.  No data processing necessary.  This cell implements that.  It's a bit slower than just reading the CSV and modifying the data for each FIPS as needed, BUT if we have to switch between different FIPS subsets, it will take only 0.02 sec versus ~0.15 sec to make the switch (based on my timing here).

In [5]:
# Time the pickle-based workflow in two phases:
#   1. start -> midpt : load the complete datasets via the Pto* loaders
#   2. midpt -> end   : select the MN / ND / Clay / Cass subsets
# NOTE(review): per the progress message these loaders read pickle files.
# Unpickling can execute arbitrary code, so only load pickle files that were
# produced by this project -- confirm where COVID_IO reads them from.
print("Loading Complete COVID Dataset into memory from Pickle files ... ", end='')

start = time.perf_counter()
# Retrieve Johns Hopkins dataframes
(JH_state_df, JH_cnty_df) = COVID_IO.PtoCDRDataFrames()
# Retrieve Apple Mobility Dataframe
# NOTE(review): the mobility results are unpacked county-first, while the
# Johns Hopkins call above is unpacked state-first -- confirm against COVID_IO.
(aapl_cnty_df, aapl_state_df) = COVID_IO.PtoAAPLMobilityDataFrames()
# Retrieve Google Mobility Dataframe
(goog_cnty_df, goog_state_df) = COVID_IO.PtoGOOGMobilityDataFrames()
# Retrieve IMHE Dataframes
(summary_df, hospitalization_df) = COVID_IO.PtoIMHEDataFrames()

# Narrow data down to local subsets
# midpt is the single checkpoint between the two phases; `end` is only
# recorded once, after all the work is finished.
midpt = time.perf_counter()
print(f"({midpt-start:0.2f} sec elapsed) ... Building local datasets  ", end='')

# Build local dataframes for Johns Hopkins Data
# (the same getLocalDataFrame helper is used for state and county FIPS alike)
MN_CDR_df = COVID_IO.getLocalDataFrame(MNFIPS, JH_state_df)
ND_CDR_df = COVID_IO.getLocalDataFrame(NDFIPS, JH_state_df)
CLAY_CDR_df = COVID_IO.getLocalDataFrame(ClayFIPS, JH_cnty_df)
CASS_CDR_df = COVID_IO.getLocalDataFrame(CassFIPS, JH_cnty_df)

# Build local dataframes of mobility data (Apple first, then Google)
CLAY_aapl_df = COVID_IO.getLocalDataFrame(ClayFIPS, aapl_cnty_df)
CASS_aapl_df = COVID_IO.getLocalDataFrame(CassFIPS, aapl_cnty_df)
MN_aapl_df = COVID_IO.getLocalDataFrame(MNFIPS, aapl_state_df)
ND_aapl_df = COVID_IO.getLocalDataFrame(NDFIPS, aapl_state_df)
CLAY_goog_df = COVID_IO.getLocalDataFrame(ClayFIPS, goog_cnty_df)
CASS_goog_df = COVID_IO.getLocalDataFrame(CassFIPS, goog_cnty_df)
MN_goog_df = COVID_IO.getLocalDataFrame(MNFIPS, goog_state_df)
ND_goog_df = COVID_IO.getLocalDataFrame(NDFIPS, goog_state_df)

# Build local dataframes/variables of IMHE data
# Everything except the hospitalization frames is derived from the summary dataframe.
MN_equip_df = COVID_IO.getLocalDataFrame(MNFIPS, summary_df)
MN_icu_beds = COVID_IO.GetNumICUBeds(MNFIPS, summary_df)
MN_all_beds = COVID_IO.GetNumAllBeds(MNFIPS, summary_df)
MN_icu_usage = COVID_IO.GetICUBedUsage(MNFIPS, summary_df)
MN_allbed_usage = COVID_IO.GetAllBedUsage(MNFIPS, summary_df)
MN_hospital_df = COVID_IO.getLocalDataFrame(MNFIPS, hospitalization_df)
ND_equip_df = COVID_IO.getLocalDataFrame(NDFIPS, summary_df)
ND_icu_beds = COVID_IO.GetNumICUBeds(NDFIPS, summary_df)
ND_all_beds = COVID_IO.GetNumAllBeds(NDFIPS, summary_df)
ND_icu_usage = COVID_IO.GetICUBedUsage(NDFIPS, summary_df)
ND_allbed_usage = COVID_IO.GetAllBedUsage(NDFIPS, summary_df)
ND_hospital_df = COVID_IO.getLocalDataFrame(NDFIPS, hospitalization_df)

end = time.perf_counter()

# Report the subset-phase time and the overall time for both phases
print(f"({end-midpt:0.2f} sec elapsed) ... Done ({end-start:0.2f} sec total for all operations)")

Loading Complete COVID Dataset into memory from Pickle files ... (0.59 sec elapsed) ... Building local datasets  (0.02 sec elapsed) ... Done (0.61 sec total for all operations)
