In [95]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import os

# dir_path = os.path.dirname(os.path.realpath(__file__))
# repo_root = os.path.abspath(os.path.join(dir_path, '..', '..'))

In [96]:
def getCdcData():
    """Download the CDC PCR testing time series and split it into two long tables.

    The healthdata.gov dataset page is fetched and parsed, the ``href`` of the
    first ``<a>`` element carrying the CSS class ``data-link`` is taken as the
    CSV location, and that CSV is loaded with pandas. Only the columns
    ``state_fips``, ``overall_outcome``, ``date``, ``new_results_reported`` and
    ``total_results_reported`` are read from it.

    Returns:
        dict: two DataFrames keyed by name:

        * ``'totalNew'`` - columns ``state_fips``, ``date``, ``total``;
          ``new_results_reported`` summed over every outcome for each
          state/date pair.
        * ``'positiveNew'`` - columns ``state_fips``, ``date``, ``positive``;
          the ``new_results_reported`` values of the rows whose
          ``overall_outcome`` equals ``'Positive'`` (row index is the one
          left over from filtering, it is not reset).

    Raises:
        requests.RequestException: the page could not be fetched (network
            failure, timeout, or a non-success HTTP status).
        ValueError: the page contains no ``a.data-link`` element.
    """
    # front end CDC page with links
    URL = 'https://healthdata.gov/dataset/covid-19-diagnostic-laboratory-testing-pcr-testing-time-series'

    # download page as HTML; a timeout keeps a stalled server from hanging the
    # notebook forever, and raise_for_status stops us from parsing an error page
    page = requests.get(URL, timeout=30)
    page.raise_for_status()

    # parse HTML with BS4
    soup = BeautifulSoup(page.content, 'html.parser')

    # snag the HREF from the known "download" button
    downloadAnchor = soup.find('a', {'class': 'data-link'})
    if downloadAnchor is None:
        raise ValueError('No <a class="data-link"> download link found at ' + URL)
    accessLink = downloadAnchor['href']

    raw = pd.read_csv(accessLink)[['state_fips','overall_outcome','date','new_results_reported','total_results_reported']]

    # all outcomes added together -> total new tests per state and day
    totalNew = (
        raw[['state_fips','date','new_results_reported']]
        .groupby(['state_fips','date'])
        .sum()
        .reset_index()
        .rename(columns={'new_results_reported':'total'})
    )
    # positive outcomes only -> new positive tests per state and day
    positiveNew = (
        raw[raw['overall_outcome']=='Positive'][['state_fips','date','new_results_reported']]
        .rename(columns={'new_results_reported':'positive'})
    )

    return { 'totalNew': totalNew, 'positiveNew': positiveNew}


In [97]:
def parseCsvOutput(df, colName, operation=None):
    """Pivot a long state/date table into one row per state, one column per date.

    Args:
        df: DataFrame holding at least the columns ``state_fips``, ``date``
            and ``colName``; any other columns are ignored.
        colName: name of the value column to spread across the date columns.
        operation: unused; kept so existing calls that pass it keep working.

    Returns:
        DataFrame whose first column is ``state_fips`` followed by one column
        per distinct ``date`` value, in sorted order, with flat (non-MultiIndex)
        column labels. Cells are produced by ``pivot_table``, so several rows
        for the same state/date are combined with its default aggregation
        (mean) and a state/date pair with no row becomes NaN.
    """
    # thanks to @piRSquared on stackoverflow for this nifty pivot expression
    # https://stackoverflow.com/questions/54915215/expressing-time-series-data-in-the-columns-rather-than-the-rows-of-a-dataframe
    longDf = df[['state_fips','date',colName]]
    # `axis` is passed by keyword: DataFrame.sort_index no longer accepts it
    # positionally in pandas >= 2.0, so `.sort_index(1)` raises TypeError there.
    wideDf = (
        longDf.pivot_table(index='state_fips', columns='date')
        .swaplevel(0, 1, axis=1)
        .sort_index(axis=1)
        .reset_index()
    )
    # columns are (date, colName) tuples at this point; keep only the first level
    wideDf.columns = [column[0] for column in wideDf.columns]

    return wideDf

In [112]:
def parse7dayRolling(df, colName, preLoaded=False, normalize=False):
    """Replace each date column with the trailing 7-column average, per row.

    Args:
        df: when ``preLoaded`` is False, a long table that is first pivoted
            with ``parseCsvOutput(df, colName)``; when ``preLoaded`` is True,
            an already-wide table whose first column is the state key and
            whose remaining columns are the values to average. The caller's
            frame is not modified.
        colName: value column handed to ``parseCsvOutput``; ignored when
            ``preLoaded`` is True.
        preLoaded: skip the pivot step and use ``df`` as the wide table.
        normalize: if True, divide every averaged value by the state's
            ``population`` from ``state_populations.csv`` (read relative to
            the working directory, matched on ``FIPS``) and multiply by
            100000.

    Returns:
        DataFrame with ``state_fips`` followed by every column after the
        first, in sorted order. Each value is the sum of that column and up to
        six columns before it, divided by the number of columns in the window
        and rounded to 2 decimals. The first six columns use a shorter window.
        The divisor is the window width, so a missing value lowers the average
        rather than being excluded from it.
    """
    if preLoaded:
        # Copy first: the loop below overwrites columns, and the caller's
        # frame must not be silently turned into rolling averages.
        tempDf = df.copy()
    else:
        tempDf = parseCsvOutput(df, colName)

    colList = sorted(tempDf.columns[1:])

    # Walk from the last column to the first so that every window is still
    # reading original values: only columns to the right of the window have
    # been overwritten so far.
    for i in range(len(colList), 0, -1):
        n = max(i - 7, 0)
        length = i - n
        # Plain column assignment replaces the column, so integer inputs turn
        # into floats cleanly instead of being set in place through .loc.
        tempDf[colList[i-1]] = round(tempDf[colList[n:i]].sum(axis=1)/length,2)

    if normalize:
        popDf = pd.read_csv('state_populations.csv')[["FIPS","population"]]
        # left merge keeps every state row; a state missing from the
        # population file ends up with NaN rates
        tempDf = tempDf.merge(popDf, left_on="state_fips", right_on="FIPS", how="left")

        for column in colList:
            tempDf[column] = tempDf[column]/tempDf['population']*100000

    return tempDf[['state_fips']+colList]

In [113]:
# Fetch the CDC testing data once; the result is a dict with the
# 'totalNew' and 'positiveNew' long tables used by the cells below.
currentData = getCdcData()

In [169]:
# Unsmoothed daily test counts, one row per state and one column per date.
totalTesting = parseCsvOutput(currentData['totalNew'], 'total')
# 7-day average of daily tests, scaled by state population (x 100000).
testingPer100Rolling = parse7dayRolling(currentData['totalNew'], 'total', normalize=True).round(2)

positiveTestsRolling = parse7dayRolling(currentData['positiveNew'], 'positive')
testingRolling = parse7dayRolling(currentData['totalNew'], 'total')
testingByState = testingRolling.set_index('state_fips')

# Positivity = averaged positives / averaged tests.
# Both frames are keyed by state_fips before dividing, so every state is
# divided by its own test volume even if the two frames do not hold the same
# states in the same row order (a positional divide silently pairs up
# different states in that case).
# NOTE: a date with 0 averaged tests yields inf or NaN; that is left as-is.
positiveByState = positiveTestsRolling.set_index('state_fips')
testingPositivityRolling = (
    positiveByState
    .div(testingByState.reindex(positiveByState.index))
    .round(2)
    .reset_index()
)

# Cases table: same columns as testingRolling, already wide, so preLoaded=True.
casesRolling = parse7dayRolling(pd.read_csv('../../docs/csv/covid_confirmed_1p3a_state.csv')\
                                .rename(columns={"GEOID":"state_fips"})\
                                [testingRolling.columns], '', preLoaded=True)\
                                .sort_values('state_fips')
# keep only states that also appear in the testing data
casesRolling = casesRolling[casesRolling.state_fips.isin(testingRolling.state_fips)].reset_index(drop=True)

# Cases per test, aligned on state_fips for the same reason as above.
casesByState = casesRolling.set_index('state_fips')
ccptRolling = (
    casesByState
    .div(testingByState.reindex(casesByState.index))
    .round(2)
    .reset_index()
)
ccptRolling['state_fips'] = ccptRolling['state_fips'].astype(int)

In [173]:
# Inspect the unrounded cases-per-test ratio. state_fips is divided by itself
# here (both frames carry it as a column), which is why it displays as 1.0.
casesRolling.div(testingRolling, axis='columns')

Unnamed: 0,state_fips,2020-03-01,2020-03-02,2020-03-03,2020-03-04,2020-03-05,2020-03-06,2020-03-07,2020-03-08,2020-03-09,...,2021-01-21,2021-01-22,2021-01-23,2021-01-24,2021-01-25,2021-01-26,2021-01-27,2021-01-28,2021-01-29,2021-01-30
0,1.0,,,,,,0.0,0.0,0.0,0.0,...,0.183865,0.191446,0.192457,0.181373,0.188702,0.197218,0.198813,0.223454,0.266981,0.258843
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.031812,0.033357,0.03514,0.033771,0.03163,0.029924,0.028997,0.02891,0.029835,0.031691
2,1.0,,0.0,0.036667,0.03125,0.022727,0.026764,0.053926,0.053221,0.062888,...,0.225544,0.199007,0.188545,0.194811,0.195068,0.19689,0.196098,0.181539,0.178036,0.18517
3,1.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.205968,0.189481,0.181457,0.187313,0.178016,0.195907,0.188909,0.170045,0.170791,0.182629
4,1.0,0.087719,0.070175,0.077857,0.089231,0.082677,0.082487,0.063033,0.064927,0.081567,...,0.130488,0.122447,0.117217,0.117687,0.126807,0.116566,0.144997,0.189647,0.29065,0.470967
5,1.0,,,,,inf,inf,inf,inf,12.214286,...,0.053138,0.051507,0.053242,0.053965,0.051226,0.052747,0.0576,0.059912,0.062028,0.051244
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017199,0.010212,...,0.065697,0.055318,0.053033,0.05133,0.0457,0.042996,0.051113,0.06206,0.076909,0.099415
7,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.069219,0.06936,0.070559,0.065798,0.069539,0.068409,0.067238,0.066997,0.068757,0.074769
8,1.0,,,,,,0.0,0.325581,0.057613,0.159664,...,0.034374,0.035198,0.031301,0.030552,0.032761,0.032225,0.030062,0.028874,0.028879,0.031774
9,1.0,0.064516,0.016667,0.019559,0.022346,0.013867,0.012435,0.01501,0.01261,0.011202,...,0.108828,0.104573,0.105376,0.10344,0.103742,0.102636,0.097913,0.099083,0.103011,0.120039


In [174]:
# Preview the first rows of the 7-day averaged test counts.
testingRolling.head()

Unnamed: 0,state_fips,2020-03-01,2020-03-02,2020-03-03,2020-03-04,2020-03-05,2020-03-06,2020-03-07,2020-03-08,2020-03-09,...,2021-01-21,2021-01-22,2021-01-23,2021-01-24,2021-01-25,2021-01-26,2021-01-27,2021-01-28,2021-01-29,2021-01-30
0,1,0.0,0.0,0.0,0.0,0.0,0.83,1.14,2.0,2.29,...,13943.43,13843.57,13923.71,14627.29,14369.57,14027.29,13962.86,12912.86,10433.57,8910.0
1,2,4.0,2.0,2.0,4.5,5.4,6.67,7.14,7.71,10.43,...,6376.86,6025.71,5976.14,5719.14,5767.71,5862.57,6123.71,6157.0,5425.14,4643.0
2,4,0.0,5.5,9.0,8.0,8.8,12.33,10.57,10.71,11.29,...,31481.29,34224.29,35883.14,35706.43,35847.57,35256.86,34161.43,33622.86,32508.57,29651.57
3,5,0.0,0.0,0.0,0.25,0.4,0.33,0.29,0.71,0.86,...,10202.0,10364.43,10090.57,9848.29,9985.43,9912.14,9717.43,9775.57,9352.29,8911.14
4,6,57.0,85.5,85.67,81.25,101.6,119.17,172.29,189.29,201.43,...,239188.43,229997.71,228575.71,227568.71,219369.29,199311.29,158082.14,117055.0,77257.86,46036.0


In [175]:
# Preview the first rows of the 7-day averaged cases table.
casesRolling.head()

Unnamed: 0,state_fips,2020-03-01,2020-03-02,2020-03-03,2020-03-04,2020-03-05,2020-03-06,2020-03-07,2020-03-08,2020-03-09,...,2021-01-21,2021-01-22,2021-01-23,2021-01-24,2021-01-25,2021-01-26,2021-01-27,2021-01-28,2021-01-29,2021-01-30
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2563.71,2650.29,2679.71,2653.0,2711.57,2766.43,2776.0,2885.43,2785.57,2306.29
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,202.86,201.0,210.0,193.14,182.43,175.43,177.57,178.0,161.86,147.14
2,4,0.0,0.0,0.33,0.25,0.2,0.33,0.57,0.57,0.71,...,7100.43,6810.86,6765.57,6956.0,6992.71,6941.71,6699.0,6103.86,5787.71,5490.57
3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2101.29,1963.86,1831.0,1844.71,1777.57,1941.86,1835.71,1662.29,1597.29,1627.43
4,6,5.0,6.0,6.67,7.25,8.4,9.83,10.86,12.29,16.43,...,31211.29,28162.57,26793.0,26781.86,27817.57,23233.0,22921.43,22199.14,22455.0,21681.43


In [156]:
# Preview the cases-per-test table ordered by state_fips.
# NOTE(review): this cell's execution count (156) is older than the cell that
# builds ccptRolling (169), so the output shown may be stale - re-run to confirm.
ccptRolling.sort_values('state_fips').head()

Unnamed: 0,state_fips,2020-03-01,2020-03-02,2020-03-03,2020-03-04,2020-03-05,2020-03-06,2020-03-07,2020-03-08,2020-03-09,...,2021-01-21,2021-01-22,2021-01-23,2021-01-24,2021-01-25,2021-01-26,2021-01-27,2021-01-28,2021-01-29,2021-01-30
18,1.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.17,0.21,0.23,0.24,0.24,0.24,0.23,0.24,0.26,0.24
19,2.0,,,,,,,,,0.0,...,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03
20,4.0,,,inf,inf,inf,inf,inf,inf,5.07,...,0.17,0.16,0.16,0.17,0.17,0.16,0.16,0.15,0.16,0.18
21,5.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02
22,6.0,inf,inf,inf,14.5,21.0,29.79,25.26,28.58,28.82,...,0.78,0.7,0.67,0.68,0.71,0.6,0.6,0.6,0.64,0.63


In [130]:
# Scratch cell: same dataset page URL that getCdcData() uses, fetched at
# module level so the scraping steps can be inspected one at a time.
URL = 'https://healthdata.gov/dataset/covid-19-diagnostic-laboratory-testing-pcr-testing-time-series'

# download Page as HTML
page = requests.get(URL)


In [131]:

# parse the downloaded page bytes with BS4 using Python's built-in html.parser
soup = BeautifulSoup(page.content, 'html.parser')

In [134]:


# Find the first <a> tag with the "data-link" class (the known "download" button).
# Unlike getCdcData(), this keeps the whole tag rather than just its href;
# find() returns None when no such tag exists.
accessLink = soup.find('a', {'class': 'data-link'})

In [135]:
# Display the matched anchor tag; its href is the CSV download URL.
accessLink

<a class="btn btn-primary data-link" href="https://healthdata.gov/sites/default/files/covid-19_diagnostic_lab_testing_20210131_2252.csv"><i class="fa fa-download"></i> Download</a>