In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

from datetime import datetime as dt

In [2]:
# Base URL to edit when iterating through dates
# John Hopkins Data
BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/'

# States/Provinces to ignore - List is actually unused
excluded = ["Guam", "District of Columbia", "Grand Princess",
            "Diamond Princess","Northern Mariana Islands",
            "Puerto Rico", "Recovered", "American Samoa", "Virgin Islands"]

In [3]:
# Dict to convert full state name to initials for plotly compatibility
us_state_to_abbrev = {
    "Alabama": "AL",
    "Alaska": "AK",
    "Arizona": "AZ",
    "Arkansas": "AR",
    "California": "CA",
    "Colorado": "CO",
    "Connecticut": "CT",
    "Delaware": "DE",
    "Florida": "FL",
    "Georgia": "GA",
    "Hawaii": "HI",
    "Idaho": "ID",
    "Illinois": "IL",
    "Indiana": "IN",
    "Iowa": "IA",
    "Kansas": "KS",
    "Kentucky": "KY",
    "Louisiana": "LA",
    "Maine": "ME",
    "Maryland": "MD",
    "Massachusetts": "MA",
    "Michigan": "MI",
    "Minnesota": "MN",
    "Mississippi": "MS",
    "Missouri": "MO",
    "Montana": "MT",
    "Nebraska": "NE",
    "Nevada": "NV",
    "New Hampshire": "NH",
    "New Jersey": "NJ",
    "New Mexico": "NM",
    "New York": "NY",
    "North Carolina": "NC",
    "North Dakota": "ND",
    "Ohio": "OH",
    "Oklahoma": "OK",
    "Oregon": "OR",
    "Pennsylvania": "PA",
    "Rhode Island": "RI",
    "South Carolina": "SC",
    "South Dakota": "SD",
    "Tennessee": "TN",
    "Texas": "TX",
    "Utah": "UT",
    "Vermont": "VT",
    "Virginia": "VA",
    "Washington": "WA",
    "West Virginia": "WV",
    "Wisconsin": "WI",
    "Wyoming": "WY",
    "District of Columbia": "DC",
    "American Samoa": "AS",
    "Guam": "GU",
    "Northern Mariana Islands": "MP",
    "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "U.S. Virgin Islands": "VI"
}

# Dict to convert UID to FIPS code for database coherency and vaccine graph functionality
uid_to_fips = {
    84000001.0: 1.0,
    84000002.0: 2.0,
    16.0: 60.0,
    84000004.0: 4.0,
    84000005.0: 5.0,
    84000006.0: 6.0,
    84000008.0: 8.0,
    84000009.0: 9.0,
    84000010.0: 10.0,
    84088888.0: 88888.0,
    84000011.0: 11.0,
    84000012.0: 12.0,
    84000013.0: 13.0,
    84099999.0: 99999.0,
    316.0: 66.0,
    84000016.0: 16.0,
    84000017.0: 17.0,
    84000018.0: 18.0,
    84000019.0: 19.0,
    84000020.0: 20.0,
    84000021.0: 21.0,
    84000022.0: 22.0,
    84000023.0: 23.0,
    84000024.0: 24.0,
    84000025.0: 25.0,
    84000026.0: 26.0,
    84000027.0: 27.0,
    84000028.0: 28.0,
    84000029.0: 29.0,
    84000030.0: 30.0,
    84000031.0: 31.0,
    84000032.0: 32.0,
    84000033.0: 33.0,
    84000034.0: 34.0,
    84000035.0: 35.0,
    84000036.0: 36.0,
    84000037.0: 37.0,
    84000038.0: 38.0,
    580.0: 69.0,
    84000039.0: 39.0,
    84000040.0: 40.0,
    84000041.0: 41.0,
    84000042.0: 42.0,
    630.0: 72.0,
    84000044.0: 44.0,
    84000045.0: 45.0,
    84000046.0: 46.0,
    84000047.0: 47.0,
    84000048.0: 48.0,
    84000049.0: 49.0,
    84000050.0: 50.0,
    850.0: 78.0,
    84000051.0: 51.0,
    84000053.0: 53.0,
    84000054.0: 54.0,
    84000055.0: 55.0,
    84000056.0: 56.0  
}

In [4]:
# Generating Date Ranges for CSV retrieval
dates = pd.date_range(start='04-12-2020', end='07-26-2022').strftime('%m-%d-%Y')
for date in dates[0:5]:
    print(date)
print()
print(f'Total Days: {len(dates)}')

04-12-2020
04-13-2020
04-14-2020
04-15-2020
04-16-2020

Total Days: 836


In [5]:
def generateSmallTables(dates = dates):
    for date in dates:
        url = BASE_URL + date + ".csv"
        print(url)
        df = pd.read_csv(url).fillna(0)
        df.to_csv("data/SmallTables/" + date + ".csv", index=False)

#generateSmallTables()

In [6]:
def editSmallTables(dates = dates):
    for date in dates:
        path = "data/SmallTables/" + date + ".csv"
        print(path)
        df = pd.read_csv(path).fillna(0)
        df['Last_Update'] = date
        df.to_csv("data/SmallTables/" + date + ".csv", index=False)

#editSmallTables()

In [7]:
def generateBigTable(dates = dates):
    # Init main DataFrame
    allData = pd.DataFrame()
    # Iterate through days
    for date in dates:
        path = "data/SmallTables/" + date + ".csv"
        print(path)
        allData = pd.concat([allData, pd.read_csv(path).fillna(0)])
    # Pandas didn't like it when I tried anything but a boolean operator
    allData = allData[allData.Province_State != "Guam"]
    allData = allData[allData.Province_State != "District of Columbia"]
    allData = allData[allData.Province_State != "Grand Princess"]
    allData = allData[allData.Province_State != "Diamond Princess"]
    allData = allData[allData.Province_State != "Northern Mariana Islands"]
    allData = allData[allData.Province_State != "Puerto Rico"]
    allData = allData[allData.Province_State != "Recovered"]
    allData = allData[allData.Province_State != "American Samoa"]
    allData = allData[allData.Province_State != "Virgin Islands"]
    # Drop redundant columns
    allData = allData.drop(["ISO3", "FIPS", "Country_Region", "Date"], axis = 1)
    # Reorder so Primary Key is in front of DataFrame
    uid = allData.pop("UID")
    updated = allData.pop("Last_Update")
    allData.insert(0, "UID", uid)
    allData.insert(1, "Date", updated)
    #allData["Province_State"] = us_state_to_abbrev[allData["Province_State"]]
    allData["Province_State"] = allData["Province_State"].replace(us_state_to_abbrev)

    # Write to CSV
    allData.to_csv("data/bigTable.csv", index=False)

#generateBigTable()

In [8]:
testView = pd.read_csv("data/bigTable.csv")
print(testView.shape)
testView.head(100)

(41800, 17)


Unnamed: 0,UID,Date,Province_State,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,Total_Test_Results,People_Hospitalized,Case_Fatality_Ratio,Testing_Rate,Hospitalization_Rate,People_Tested,Mortality_Rate
0,84000001.00,04-12-2020,AL,32.32,-86.90,3667,93,0.00,0.00,74.79,0.00,437.00,0.00,1265.08,12.26,62029.00,2.54
1,84000002.00,04-12-2020,AK,61.37,-152.40,272,8,66.00,198.00,37.18,0.00,31.00,0.00,1344.71,11.40,8038.00,2.94
2,84000004.00,04-12-2020,AZ,33.73,-111.43,3542,115,0.00,0.00,48.66,0.00,0.00,0.00,578.52,0.00,52289.00,3.25
3,84000005.00,04-12-2020,AR,34.97,-92.37,1280,27,367.00,886.00,42.41,0.00,130.00,0.00,761.75,10.16,19722.00,2.11
4,84000006.00,04-12-2020,CA,36.12,-119.68,22201,632,0.00,0.00,56.19,0.00,5234.00,0.00,485.42,22.96,190328.00,2.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,84000051.00,04-13-2020,VA,37.77,-78.17,5747,149,0.00,0.00,67.33,0.00,1238.00,0.00,523.58,21.54,41401.00,2.59
96,84000053.00,04-13-2020,WA,47.40,-121.49,10635,582,0.00,0.00,139.66,0.00,527.00,0.00,1242.51,4.96,93802.00,5.47
97,84000054.00,04-13-2020,WV,38.49,-80.95,611,8,85.00,518.00,34.09,0.00,164.00,0.00,1258.88,26.84,16655.00,1.31
98,84000055.00,04-13-2020,WI,44.27,-89.62,3428,154,0.00,0.00,58.88,43120.00,993.00,0.00,0.00,28.97,40197.00,4.49


In [9]:
testView.describe()

Unnamed: 0,UID,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,Total_Test_Results,People_Hospitalized,Case_Fatality_Ratio,Testing_Rate,Hospitalization_Rate,People_Tested,Mortality_Rate
count,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41550.0,41550.0
mean,84000029.32,39.47,-93.67,749196.12,11167.77,30669.81,21378.32,11547.2,8670962.38,754.64,1.1,130538.98,1.47,312579.1,0.84
std,15.62,6.08,19.15,1177240.22,15585.13,123387.51,77075.36,9149.05,17554437.75,5235.15,0.79,127544.81,4.37,1207473.43,1.78
min,84000001.0,21.09,-157.5,270.0,0.0,0.0,0.0,28.74,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,84000017.0,35.63,-105.31,99544.25,1574.0,0.0,0.0,2855.76,0.0,0.0,0.0,26072.18,0.0,0.0,0.0
50%,84000029.5,40.0,-89.65,341046.0,5598.5,0.0,0.0,10274.52,3127878.0,0.0,1.24,105147.64,0.0,0.0,0.0
75%,84000042.0,43.33,-79.81,883742.25,14018.0,4780.5,4544.25,17587.51,9842443.0,0.0,1.58,188195.39,0.0,16097.0,0.69
max,84000056.0,61.37,-69.38,10644446.0,93319.0,2470308.0,1408516.0,58927.99,318233196.0,89995.0,6.28,1635860.89,38.5,19565151.0,9.74


In [10]:
# Age and Sex Data
path = "data/SexAndAge/COVID-19_Death_Counts_by_Age_2020-2022.csv"
sex_age = pd.read_csv(path)
sex_age.head()

#sex_age = sex_age.drop(["Data as of", "Start Date", "End Date"], axis=1)
#sex_age.to_csv(path, index=False)
sex_age.head(200)

Unnamed: 0,Sex,Age Years,Total deaths,COVID-19 Deaths
0,Female,0-05 Months,19659,103
1,Male,0-05 Months,24188,126
2,Female,06-11 Months,1639,32
3,Male,06-11 Months,2111,45
4,Female,01 Year,1567,34
...,...,...,...,...
169,Male,83 Years,107411,14379
170,Female,84 Years,111326,12230
171,Male,84 Years,105517,13767
172,Female,85 Years and over,1483023,148112


In [11]:
def clean_vaccine(vaccine):
    vaccine = vaccine[vaccine.FIPS != 0]
    vaccine["Date"] = pd.to_datetime(vaccine["Date"]).dt.strftime('%m-%d-%Y')
    vaccine = vaccine.drop(["ID"], axis=1)
    #vaccine.to_csv(path, index=False)
    return vaccine

In [12]:
# Load Vaccine Data
path = "data/vaccine2.csv"
vaccine = pd.read_csv(path, low_memory=False).fillna(0)
print(vaccine.shape)
vaccine.head(100)

(861191, 7)


Unnamed: 0,FIPS,Date,Vaccine,DoseType,DoseValue,Vax_Full,Vax_Partial
0,1,12-10-2020,All,Admin,0.00,0.00,0.00
1,1,12-10-2020,All,Alloc,0.00,0.00,0.00
2,1,12-10-2020,All,Ship,0.00,0.00,0.00
3,1,12-10-2020,All,Stage1,0.00,0.00,0.00
4,1,12-10-2020,All,Stage2,0.00,0.00,0.00
...,...,...,...,...,...,...,...
95,1,12-15-2020,Unassigned,Admin,0.00,0.00,0.00
96,1,12-15-2020,Unassigned,Alloc,0.00,0.00,0.00
97,1,12-15-2020,Unassigned,Ship,0.00,0.00,0.00
98,1,12-15-2020,Unassigned,Stage1,0.00,0.00,0.00


In [13]:
print(vaccine.shape)
vaccine.describe()

(861191, 7)


Unnamed: 0,FIPS,DoseValue,Vax_Full,Vax_Partial
count,861191.0,861191.0,861191.0,861191.0
mean,28.97,1837973.96,3148601.57,657819.53
std,15.69,11182745.94,4267680.77,978852.47
min,1.0,-271346.0,0.0,0.0
25%,16.0,0.0,621861.0,129009.0
50%,29.0,4647.0,1727264.0,351506.0
75%,42.0,1130426.0,3889031.0,654533.0
max,60.0,587903405.0,28893211.0,7310293.0
