In [29]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.2f' % x)

from datetime import datetime as dt

In [28]:
# Base URL to edit when iterating through dates
BASE_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports_us/'

# States/Provinces to ignore
excluded = ["Guam", "District of Columbia", "Grand Princess",
            "Diamond Princess","Northern Mariana Islands",
            "Puerto Rico", "Recovered", "American Samoa", "Virgin Islands"]

In [3]:
# Generating Date Ranges for CSV retrieval
dates = pd.date_range(start='04-12-2020', end='07-26-2022').strftime('%m-%d-%Y')
for date in dates[0:5]:
    print(date)
print()
print(f'Total Days: {len(dates)}')

04-12-2020
04-13-2020
04-14-2020
04-15-2020
04-16-2020

Total Days: 836


In [5]:
def generateSmallTables(dates = dates):
    for date in dates:
        url = BASE_URL + date + ".csv"
        print(url)
        df = pd.read_csv(url).fillna(0)
        df.to_csv("data/SmallTables/" + date + ".csv", index=False)

#generateSmallTables()

In [6]:
def editSmallTables(dates = dates):
    for date in dates:
        path = "data/SmallTables/" + date + ".csv"
        print(path)
        df = pd.read_csv(path).fillna(0)
        df['Last_Update'] = date
        df.to_csv("data/SmallTables/" + date + ".csv", index=False)

#editSmallTables()

In [27]:
def generateBigTable(dates = dates):
    # Init main DataFrame
    allData = pd.DataFrame()
    # Iterate through days
    for date in dates:
        path = "data/SmallTables/" + date + ".csv"
        print(path)
        allData = pd.concat([allData, pd.read_csv(path).fillna(0)])
    # There is probably a better way to clean this but it isn't very clear
    allData = allData[allData.Province_State != "Guam"]
    allData = allData[allData.Province_State != "District of Columbia"]
    allData = allData[allData.Province_State != "Grand Princess"]
    allData = allData[allData.Province_State != "Diamond Princess"]
    allData = allData[allData.Province_State != "Northern Mariana Islands"]
    allData = allData[allData.Province_State != "Puerto Rico"]
    allData = allData[allData.Province_State != "Recovered"]
    allData = allData[allData.Province_State != "American Samoa"]
    allData = allData[allData.Province_State != "Virgin Islands"]
    # Drop redundant columns
    allData = allData.drop(["ISO3", "FIPS", "Country_Region", "Date"] ,axis = 1)
    # Reorder so Primary Key is in front of DataFrame
    uid = allData.pop("UID")
    updated = allData.pop("Last_Update")
    allData.insert(0, "UID", uid)
    allData.insert(1, "Date", updated)
    # Write to CSV
    allData.to_csv("data/bigTable.csv", index=False)

#generateBigTable()

In [30]:
testView = pd.read_csv("data/bigTable.csv")
print(testView.shape)
testView.head(10)

(41800, 17)


Unnamed: 0,UID,Date,Province_State,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,Total_Test_Results,People_Hospitalized,Case_Fatality_Ratio,Testing_Rate,Hospitalization_Rate,People_Tested,Mortality_Rate
0,84000001.0,04-12-2020,Alabama,32.32,-86.9,3667,93,0.0,0.0,74.79,0.0,437.0,0.0,1265.08,12.26,62029.0,2.54
1,84000002.0,04-12-2020,Alaska,61.37,-152.4,272,8,66.0,198.0,37.18,0.0,31.0,0.0,1344.71,11.4,8038.0,2.94
2,84000004.0,04-12-2020,Arizona,33.73,-111.43,3542,115,0.0,0.0,48.66,0.0,0.0,0.0,578.52,0.0,52289.0,3.25
3,84000005.0,04-12-2020,Arkansas,34.97,-92.37,1280,27,367.0,886.0,42.41,0.0,130.0,0.0,761.75,10.16,19722.0,2.11
4,84000006.0,04-12-2020,California,36.12,-119.68,22201,632,0.0,0.0,56.19,0.0,5234.0,0.0,485.42,22.96,190328.0,2.84
5,84000008.0,04-12-2020,Colorado,39.06,-105.31,7307,289,0.0,0.0,126.89,0.0,1376.0,0.0,615.39,18.83,34873.0,3.96
6,84000009.0,04-12-2020,Connecticut,41.6,-72.76,12035,554,0.0,0.0,337.56,0.0,1654.0,0.0,1156.15,13.74,41220.0,4.6
7,84000010.0,04-12-2020,Delaware,39.32,-75.51,1625,49,191.0,1385.0,166.88,0.0,190.0,0.0,1140.21,11.69,11103.0,3.02
8,84000012.0,04-12-2020,Florida,27.77,-81.69,19895,461,0.0,0.0,92.63,0.0,2772.0,0.0,1508.52,13.93,323996.0,2.32
9,84000013.0,04-12-2020,Georgia,33.04,-83.64,12452,433,0.0,0.0,117.28,0.0,2505.0,0.0,537.04,20.12,54453.0,3.48


In [12]:
testView.describe()

Unnamed: 0,Lat,Long_,Confirmed,Deaths,Recovered,Active,Incident_Rate,Total_Test_Results,People_Hospitalized,Case_Fatality_Ratio,UID,Testing_Rate,Hospitalization_Rate,People_Tested,Mortality_Rate
count,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41800.0,41550.0,41550.0
mean,39.466,-93.668,749196.122,11167.77,30669.813,21378.317,11547.205,8670962.377,754.641,1.096,84000029.32,130538.978,1.473,312579.096,0.843
std,6.08,19.146,1177240.222,15585.131,123387.509,77075.362,9149.047,17554437.753,5235.154,0.79,15.624,127544.814,4.374,1207473.429,1.776
min,21.094,-157.498,270.0,0.0,0.0,0.0,28.743,0.0,0.0,0.0,84000001.0,0.0,0.0,0.0,0.0
25%,35.63,-105.311,99544.25,1574.0,0.0,0.0,2855.756,0.0,0.0,0.0,84000017.0,26072.185,0.0,0.0,0.0
50%,40.0,-89.648,341046.0,5598.5,0.0,0.0,10274.517,3127878.0,0.0,1.239,84000029.5,105147.639,0.0,0.0,0.0
75%,43.327,-79.806,883742.25,14018.0,4780.5,4544.25,17587.506,9842443.0,0.0,1.583,84000042.0,188195.39,0.0,16097.0,0.691
max,61.371,-69.382,10644446.0,93319.0,2470308.0,1408516.0,58927.992,318233196.0,89995.0,6.283,84000056.0,1635860.889,38.501,19565151.0,9.741


In [33]:
# Country level data
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_daily_reports/01-01-2021.csv"
df = pd.read_csv(url).fillna(0)
df = df.drop(["FIPS"])
df.head(20)

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incident_Rate,Case_Fatality_Ratio
0,0.0,0,0,Afghanistan,2021-01-02 05:22:33,33.94,67.71,52513,2201,41727,8585,Afghanistan,134.9,4.19
1,0.0,0,0,Albania,2021-01-02 05:22:33,41.15,20.17,58316,1181,33634,23501,Albania,2026.41,2.03
2,0.0,0,0,Algeria,2021-01-02 05:22:33,28.03,1.66,99897,2762,67395,29740,Algeria,227.81,2.76
3,0.0,0,0,Andorra,2021-01-02 05:22:33,42.51,1.52,8117,84,7463,570,Andorra,10505.4,1.03
4,0.0,0,0,Angola,2021-01-02 05:22:33,-11.2,17.87,17568,405,11146,6017,Angola,53.45,2.31
5,0.0,0,0,Antigua and Barbuda,2021-01-02 05:22:33,17.06,-61.8,159,5,148,6,Antigua and Barbuda,162.36,3.14
6,0.0,0,0,Argentina,2021-01-02 05:22:33,-38.42,-63.62,1629594,43319,1426676,159599,Argentina,3605.63,2.66
7,0.0,0,0,Armenia,2021-01-02 05:22:33,40.07,45.04,159738,2828,143355,13555,Armenia,5390.66,1.77
8,0.0,0,Australian Capital Territory,Australia,2021-01-02 05:22:33,-35.47,149.01,118,3,114,1,"Australian Capital Territory, Australia",27.56,2.54
9,0.0,0,New South Wales,Australia,2021-01-02 05:22:33,-33.87,151.21,4947,54,0,4893,"New South Wales, Australia",60.94,1.09
