## COVID-19 Data Aggregation
Getting and combining COVID-19 data from different sources

In [6]:
import os

project_folder = os.getcwd()

# Path to chromedriver (built with os.path.join so the separator matches the OS)
driver_pth = os.path.join(project_folder, "assets", "chromedriver.exe")
# chrome_path = r"C:\Program Files (x86)\Google\Chrome\Application"

# PATH entries are directories, so register the folder that contains the
# driver executable rather than the executable itself. The membership check
# keeps PATH from growing every time this cell is re-run.
driver_dir = os.path.dirname(driver_pth)
current_path = os.environ.get("PATH", "")
if driver_dir not in current_path.split(os.pathsep):
    os.environ["PATH"] = (
        f"{current_path}{os.pathsep}{driver_dir}" if current_path else driver_dir
    )

In [7]:
import json
import pandas as pd
import urllib.request as ureq
from datetime import datetime as dt


from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait as wdw
from selenium.webdriver.support import expected_conditions as expcnd

In [8]:
class Source:
    """
    Data structure describing one data source.

    Holds three common attributes (``site``, ``csv_name`` and the derived
    ``csv_path``) and accepts ad-hoc key-value pairs, which are stored as
    additional instance attributes.

    Parameters
    ----------
    site : str
        Short label for the source; stored as ``self.site``.
    csv_name : str
        Base name (without extension) of the CSV file; stored as
        ``self.csv_name`` and used to derive ``self.csv_path``.
    **kwargs
        Extra attributes to attach to the instance (for example ``url``).
    """
    def __init__(self, site, csv_name, **kwargs):
        self.site = site
        self.csv_name = csv_name
        self.__set_path(csv_name)
        # Applied last, so a keyword that shares a name with one of the
        # attributes above replaces that attribute.
        self.__dict__.update(kwargs)

    def __set_path(self, value):
        """Set ``csv_path`` to ``src/csv-files/<value>.csv`` using the OS path separator."""
        # os.path.join avoids the invalid "\c" and "\{" escape sequences of a
        # hand-written backslash path and yields the same string on Windows.
        self.csv_path = os.path.join("src", "csv-files", f"{value}.csv")

#### Source: New York Times
URL: https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html

In [9]:
# Settings for the New York Times source, bundled in one Source record.
nyt = Source(
    site="nyt",
    csv_name="CasesByState",
    url="https://www.nytimes.com/interactive/2020/us/coronavirus-us-cases.html",
)

def process_df(dataframe):
    """
    Promote the first row of ``dataframe`` to lower-cased column headers.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose first row holds the column names.

    Returns
    -------
    pandas.DataFrame
        A new frame without the header row, re-indexed from 0. The input
        frame is left unmodified.

    Raises
    ------
    IndexError
        If ``dataframe`` has no rows.
    """
    header_row = dataframe.iloc[0].tolist()
    # Only strings can be lower-cased; other header cells (numbers, None, NaN)
    # are kept as they are instead of raising AttributeError.
    labels = [cell.lower() if isinstance(cell, str) else cell for cell in header_row]

    result = dataframe.iloc[1:].reset_index(drop=True)
    # Assign by position so the outcome does not depend on the incoming
    # column labels being 0..n-1.
    result.columns = labels
    return result

def scrape_nyt():
    """
    Scrape the cases-by-state table from the page at ``nyt.url``.

    Opens the page in Chrome, clicks the button found inside
    ``#g-cases-by-state`` (held in ``btn_show_more``; presumably it expands
    the table - confirm against the live page), then collects the text of
    every ``th``/``td`` cell of each table row.

    Returns
    -------
    pandas.DataFrame or None
        The scraped table passed through ``process_df``, or ``None`` when no
        rows were collected.
    """
    button_selector = "#g-cases-by-state button[class^='svelte']"
    row_selector = "#g-cases-by-state table[class^='svelte'] tr"

    data_list = []
    with webdriver.Chrome() as chdriver:
        chdriver.get(nyt.url)

        # Wait up to 10 seconds for the button to be present in the DOM.
        btn_show_more = wdw(chdriver, 10).until(
            expcnd.presence_of_element_located((By.CSS_SELECTOR, button_selector))
        )
        # The click is dispatched through JavaScript rather than WebElement.click().
        chdriver.execute_script("arguments[0].click()", btn_show_more)

        rows = wdw(chdriver, 10).until(
            expcnd.presence_of_all_elements_located((By.CSS_SELECTOR, row_selector))
        )
        for element in rows:
            # find_elements(By, value) works on Selenium 3 and 4; the
            # find_elements_by_* helpers were removed in Selenium 4.
            cells = element.find_elements(By.CSS_SELECTOR, "th,td")
            data_list.append([cell.text for cell in cells])

    if not data_list:
        return None
    return process_df(pd.DataFrame(data_list))

def run_nyt(testing=False):
    """
    Scrape the NYT table and tag it with the retrieval date and source.

    Parameters
    ----------
    testing : bool, default False
        When True the frame is returned and nothing is written. When False
        the frame is appended to ``nyt.csv_path``.

    Returns
    -------
    pandas.DataFrame or None
        The tagged frame when ``testing`` is True, otherwise ``None``.

    Raises
    ------
    RuntimeError
        If ``scrape_nyt`` returned ``None`` (no rows were scraped).
    """
    df = scrape_nyt()
    if df is None:
        # Fail with a clear message instead of a TypeError on the assignment below.
        raise RuntimeError("scrape_nyt() returned no data; nothing to process.")

    df["date_retrieved"] = dt.now().strftime("%Y-%m-%d")
    df["source"] = nyt.site

    if testing:
        return df
    else:
        # Append to csv document (not happening in this notebook).
        # The header row is written only when the file is still empty.
        # NOTE(review): `line_terminator` was renamed `lineterminator` in
        # pandas 1.5 - adjust when the environment is upgraded.
        with open(nyt.csv_path, mode="a", encoding="utf-8", newline="\n") as f:
            df.to_csv(f, index=False, header=f.tell()==0, line_terminator="\n", encoding="utf-8")
            

In [10]:
# Dry run: return the scraped frame instead of appending it to the CSV.
df = run_nyt(testing=True)
df.head()

Unnamed: 0,state,cases,deaths,date_retrieved,source
0,New York,38987,432,2020-03-27,nyt
1,New Jersey,6876,81,2020-03-27,nyt
2,California,4060,82,2020-03-27,nyt
3,Washington,3208,151,2020-03-27,nyt
4,Michigan,2879,63,2020-03-27,nyt


#### CDC Daily Cumulative US Cases
URL: https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/cases-in-us.html

In [13]:
# Settings for the CDC cumulative-cases JSON feed.
cdc_cumul = Source(
    site="cdc_gov",
    csv_name="DailyAccumulatedCases",
    url="https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/total-cases-onset.json",
)

def run_cdc_cumul(testing=False):
    """
    Download the JSON at ``cdc_cumul.url`` and tabulate its first two series.

    Parameters
    ----------
    testing : bool, default False
        When True the frame is returned; otherwise it is written to
        ``cdc_cumul.csv_path`` (overwriting the file).

    Returns
    -------
    pandas.DataFrame or None
        The frame when ``testing`` is True, otherwise ``None``.
    """
    with ureq.urlopen(cdc_cumul.url) as response:
        raw_text = response.read().decode("utf-8")

    payload = json.loads(raw_text)
    series = payload.get("data").get("columns")

    # The first element of each series is skipped
    # (NOTE(review): presumably a series label - confirm against the feed).
    report_dates = series[0][1:]
    case_totals = series[1][1:]

    # Day counters "1" .. str(len(dates) - 10). zip() below stops at the
    # shortest input, so the last 10 dates do not appear in the frame.
    # NOTE(review): confirm that dropping them is intended.
    day_numbers = [str(day) for day in range(1, len(report_dates) - 9)]

    records = list(zip(report_dates, day_numbers, case_totals))
    header = ["Reported Date", "Start of Tracking", "Total number of accumulated cases"]
    df_cdc = pd.DataFrame(data=records, columns=header)

    if not testing:
        df_cdc.to_csv(cdc_cumul.csv_path, index=False)
        return None
    return df_cdc

In [14]:
# Dry run: return the cumulative-cases frame instead of writing the CSV.
df2 = run_cdc_cumul(testing=True)
df2.head()

Unnamed: 0,Reported Date,Start of Tracking,Total number of accumulated cases
0,1/22/2020,1,1
1,1/23/2020,2,1
2,1/24/2020,3,2
3,1/25/2020,4,2
4,1/26/2020,5,5


#### CDC Daily Onset Data
URL: https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/cases-in-us.html

In [15]:
# Settings for the CDC daily-onset JSON feed.
cdc_onset = Source(
    site="cdc_gov",
    csv_name="DailyReportedCasesUS",
    url="https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/us-cases-epi-chart.json",
)

def run_cdc_onset(testing=False):
    """
    Download the JSON at ``cdc_onset.url`` and tabulate its first two series.

    Parameters
    ----------
    testing : bool, default False
        When True the frame is returned; otherwise it is written to
        ``cdc_onset.csv_path`` (overwriting the file).

    Returns
    -------
    pandas.DataFrame or None
        The frame when ``testing`` is True, otherwise ``None``.
    """
    with ureq.urlopen(cdc_onset.url) as response:
        raw_text = response.read().decode("utf-8")

    payload = json.loads(raw_text)
    series = payload.get("data").get("columns")

    # The first element of each series is skipped
    # (NOTE(review): presumably a series label - confirm against the feed).
    report_dates = series[0][1:]
    case_counts = series[1][1:]

    # Positions before the final 10 keep their own index; the final 10 are
    # set to 0. An earlier variant of this code labelled those rows
    # "Not all illnesses reported" instead.
    cutoff = len(report_dates) - 10
    start_of_day = [pos if pos < cutoff else 0 for pos in range(len(report_dates))]

    records = list(zip(report_dates, start_of_day, case_counts))
    header = ["Reported Date", "Start of Day", "Number of cases"]
    df_cdc = pd.DataFrame(data=records, columns=header)

    if not testing:
        df_cdc.to_csv(cdc_onset.csv_path, index=False)
        return None
    return df_cdc

In [16]:
# Dry run: return the daily-onset frame instead of writing the CSV.
df3 = run_cdc_onset(testing=True)
df3.head()

Unnamed: 0,Reported Date,Start of Day,Number of cases
0,1/12/2020,0,0
1,1/13/2020,1,0
2,1/14/2020,2,2
3,1/15/2020,3,0
4,1/16/2020,4,1


#### Michigan - COVID-19 Cases, by County
URL: https://www.michigan.gov/coronavirus/0,9753,7-406-98163-520743--,00.html

In [19]:
# Settings for the State of Michigan cases-by-county page.
mich = Source(
    site="mich_gov",
    csv_name="CasesByCounty",
    url="https://www.michigan.gov/coronavirus/0,9753,7-406-98163-520743--,00.html",
)

def run_michigan(testing=False):
    """
    Read the first HTML table at ``mich.url`` and tag it with date and source.

    The first table row is promoted to column headers via ``process_df``,
    rows whose ``county`` value contains "Total" are removed, and
    ``date_retrieved`` / ``source`` columns are added.

    Parameters
    ----------
    testing : bool, default False
        When True the frame is returned and nothing is written. When False
        the frame is appended to ``mich.csv_path``.

    Returns
    -------
    pandas.DataFrame or None
        The tagged frame when ``testing`` is True, otherwise ``None``.
    """
    tables = pd.read_html(mich.url)
    df_mich = process_df(tables[0])

    # Drop total row from bottom. na=False treats a missing county as
    # "not a total row"; without it a NaN in the mask makes `~` raise.
    is_total = df_mich["county"].str.contains("Total", na=False)
    df_mich = df_mich.loc[~is_total, :].copy()

    df_mich["date_retrieved"] = dt.now().strftime("%Y-%m-%d")
    df_mich["source"] = mich.site

    if testing:
        return df_mich
    else:
        # The header row is written only when the file is still empty.
        # NOTE(review): `line_terminator` was renamed `lineterminator` in
        # pandas 1.5 - adjust when the environment is upgraded.
        with open(mich.csv_path, mode="a", encoding="utf-8", newline="\n") as f:
            df_mich.to_csv(f, index=False, header=f.tell()==0, line_terminator="\n", encoding="utf-8")

In [20]:
# Dry run: return the Michigan frame instead of appending it to the CSV.
df4 = run_michigan(testing=True)
df4.head()

Unnamed: 0,county,cases,deaths,date_retrieved,source
0,Allegan,1,,2020-03-27,mich_gov
1,Barry,1,,2020-03-27,mich_gov
2,Bay,4,,2020-03-27,mich_gov
3,Berrien,11,,2020-03-27,mich_gov
4,Calhoun,7,,2020-03-27,mich_gov


#### Johns Hopkins Data from Github
URL: https://github.com/CSSEGISandData/COVID-19

In [21]:
# Settings for the Johns Hopkins daily reports hosted on GitHub.
gh = Source(
    site="github",
    csv_name="JohnsHopkinsCovidData",
    index_url="https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_daily_reports",
)


def run_jh(testing=False):
    """
    Download every daily-report CSV listed at ``gh.index_url`` and combine them.

    The file names are read from the first HTML table of the index page,
    processed in reverse listing order, and each CSV is fetched from
    raw.githubusercontent.com. Older column spellings are renamed to the
    underscore style, the FIPS/Lat/Long_/Combined_Key columns are dropped and
    ``Last_Update`` is reduced to a ``YYYY-MM-DD`` string.

    Parameters
    ----------
    testing : bool, default False
        When True the combined frame is returned; otherwise it is written to
        ``gh.csv_path`` (overwriting the file).

    Returns
    -------
    pandas.DataFrame or None
        The combined frame when ``testing`` is True, otherwise ``None``.
    """
    listing = pd.read_html(gh.index_url)[0]
    # na=False: rows without a name are simply not CSV files.
    is_csv = listing["Name"].str.endswith("csv", na=False)
    report_files = listing.loc[is_csv, "Name"].tolist()[::-1]

    col_dict = {
        'Province/State': 'Province_State',
        'Country/Region': 'Country_Region',
        'Last Update': 'Last_Update',
        'Latitude': 'Lat',
        'Longitude': "Long_",
    }
    base_columns = ['FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update', 'Lat', 'Long_', 'Active', 'Combined_Key']
    base_url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports"

    # Collect the frames and concatenate once: DataFrame.append inside a loop
    # copies the accumulated data on every pass and was removed in pandas 2.0.
    # The empty template goes first so its columns lead the column order.
    frames = [pd.DataFrame(columns=base_columns)]
    for file_name in report_files:
        daily = pd.read_csv(f"{base_url}/{file_name}")
        frames.append(daily.rename(columns=col_dict))
    dfs = pd.concat(frames, sort=False)

    df = dfs.drop(['FIPS', 'Lat', 'Long_', 'Combined_Key'], axis=1).reset_index(drop=True)

    # Replace the column outright: writing through df.loc[:, col] can keep the
    # old object dtype on newer pandas, which breaks the .dt accessor.
    # NOTE(review): `infer_datetime_format` is deprecated since pandas 2.0.
    parsed = pd.to_datetime(df["Last_Update"], infer_datetime_format=True)
    df["Last_Update"] = parsed.dt.strftime("%Y-%m-%d")

    if testing:
        return df
    else:
        df.to_csv(gh.csv_path, index=False, float_format="%.0f")
        
        # df.to_excel(xl_name, index=False, float_format="%.0f")

In [22]:
# Dry run: return the combined Johns Hopkins frame instead of writing the CSV.
df5 = run_jh(testing=True)
df5.head()

Unnamed: 0,Admin2,Province_State,Country_Region,Last_Update,Active,Confirmed,Deaths,Recovered
0,Abbeville,South Carolina,US,2020-03-26,0,3.0,0.0,0.0
1,Acadia,Louisiana,US,2020-03-26,0,3.0,0.0,0.0
2,Accomack,Virginia,US,2020-03-26,0,2.0,0.0,0.0
3,Ada,Idaho,US,2020-03-26,0,39.0,0.0,0.0
4,Adair,Iowa,US,2020-03-26,0,1.0,0.0,0.0
