# Getting the Data

Data Source:<br> 
<br> 
https://github.com/CSSEGISandData/COVID-19
<br> 
https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series

## Naming conventions

### `group`

`group` refers to the two separate "groups" of data.

* `"world"` - represents data from each country.
* `"usa"` - represents data from each state in the United States.

### `kind`

`kind` refers to the two different kinds of COVID-19 data.

* `"deaths"`
* `"cases"`


### `area`

`area` refers to a specific country (in the `"world"` group) or state (in the `"usa"` group).


## Downloading the data


In [9]:
import pandas as pd

# Notebook-facing names for the two data scopes and the two metrics.
GROUPS = ("world", "usa")
KINDS = ("deaths", "cases")

# URL template for one raw time-series CSV. The {kind} and {group}
# placeholders take the upstream file-name tokens (e.g. "confirmed",
# "global"), not the notebook-facing names above.
DOWNLOAD_URL = (
    "https://raw.githubusercontent.com/"
    "CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
    "time_series_covid19_{kind}_{group}.csv"
)


# Function 1
def download_data(group, kind):
    """
    Download one COVID-19 time-series table from the Johns Hopkins GitHub
    repository.

    The notebook's names for the scope and the metric are translated to the
    tokens used in the upstream file names before the URL is built.

    Parameters
    ----------
    group : str
        'world' (upstream token 'global') or 'usa' (upstream token 'US').
    kind : str
        'deaths' (upstream token 'deaths') or 'cases' (upstream token
        'confirmed').

    Returns
    -------
    DataFrame
        The table parsed by ``pd.read_csv`` from the resolved URL.

    Raises
    ------
    KeyError
        If `group` or `kind` is not one of the values listed above.
    """
    # Translate first so an unknown name fails before any network access.
    upstream_group = {"world": "global", "usa": "US"}[group]
    upstream_kind = {"deaths": "deaths", "cases": "confirmed"}[kind]
    url = DOWNLOAD_URL.format(kind=upstream_kind, group=upstream_group)
    return pd.read_csv(url)


In [10]:
# Fetch the world-level deaths table and preview its first five rows.
df_world_deaths = download_data(group="world", kind="deaths")
df_world_deaths.head(n=5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7896,7896,7896,7896,7896,7896,7896,7896,7896,7896
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,3598,3598,3598,3598,3598,3598,3598,3598,3598,3598
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,6881,6881,6881,6881,6881,6881,6881,6881,6881,6881
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,165,165,165,165,165,165,165,165,165,165
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,1933,1933,1933,1933,1933,1933,1933,1933,1933,1933


In [11]:
# The group and kind names that the loops below iterate over.
GROUPS = ("world", "usa")
KINDS = ("deaths", "cases")

def read_all_data():
    """
    Download the table for every group/kind combination.

    Calls ``download_data`` once per pair, iterating over ``GROUPS`` in the
    outer position and ``KINDS`` in the inner position.

    Returns
    -------
    dict
        Maps "{group}_{kind}" (for example "world_deaths") to the DataFrame
        downloaded for that pair.
    """
    return {
        f"{group}_{kind}": download_data(group, kind)
        for group in GROUPS
        for kind in KINDS
    }


In [12]:
# Download all four tables, then preview the first five rows of world cases.
data = read_all_data()
data["world_cases"].head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,209322,209340,209358,209362,209369,209390,209406,209436,209451,209451
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,334391,334408,334408,334427,334427,334427,334427,334427,334443,334457
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,271441,271448,271463,271469,271469,271477,271477,271490,271494,271496
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,47866,47875,47875,47875,47875,47875,47875,47875,47890,47890
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,105255,105277,105277,105277,105277,105277,105277,105277,105288,105288


In [13]:
# Preview the first five rows of the US cases table.
data["usa_cases"].head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,84001001,US,USA,840,1001.0,Autauga,Alabama,US,32.539527,-86.644082,...,19732,19759,19759,19759,19759,19759,19759,19759,19790,19790
1,84001003,US,USA,840,1003.0,Baldwin,Alabama,US,30.72775,-87.722071,...,69641,69767,69767,69767,69767,69767,69767,69767,69860,69860
2,84001005,US,USA,840,1005.0,Barbour,Alabama,US,31.868263,-85.387129,...,7451,7474,7474,7474,7474,7474,7474,7474,7485,7485
3,84001007,US,USA,840,1007.0,Bibb,Alabama,US,32.996421,-87.125115,...,8067,8087,8087,8087,8087,8087,8087,8087,8091,8091
4,84001009,US,USA,840,1009.0,Blount,Alabama,US,33.982109,-86.567906,...,18616,18673,18673,18673,18673,18673,18673,18673,18704,18704


## Save the data locally

In [14]:
def write_data(data, directory, **kwargs):
    """
    Save each DataFrame in `data` as "<directory>/<name>.csv".

    The target directory (including any missing parents) is created first,
    so the function also works on a fresh checkout where it does not exist
    yet.

    Parameters
    ----------
    data : dict
        Dictionary mapping a name to the DataFrame to save; the name becomes
        the file stem.
    directory : str
        Target local directory for the CSV files.
    **kwargs
        Forwarded unchanged to ``DataFrame.to_csv`` (for example
        ``index=False``).

    Returns
    -------
    None
    """
    # Local import keeps this cell self-contained.
    from pathlib import Path

    target_dir = Path(directory)
    # exist_ok=True makes repeated runs of the cell safe.
    target_dir.mkdir(parents=True, exist_ok=True)
    for name, df in data.items():
        df.to_csv(target_dir / f"{name}.csv", **kwargs)


In [15]:
# Save every downloaded table under data/raw, omitting the index column.
write_data(data, directory="data/raw", index=False)

In [16]:
def read_local_data(group, kind, directory):
    """
    Load one previously saved table from "<directory>/<group>_<kind>.csv".

    Parameters
    ----------
    group : str
        'world' or 'usa'.
    kind : str
        'deaths' or 'cases'.
    directory : str
        Directory that contains the CSV file.

    Returns
    -------
    DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    filename = f"{group}_{kind}.csv"
    csv_path = f"{directory}/{filename}"
    return pd.read_csv(csv_path)


In [17]:
# Read one saved file back from disk and preview its first three rows.
read_local_data(group="world", kind="deaths", directory="data/raw").head(n=3)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7896,7896,7896,7896,7896,7896,7896,7896,7896,7896
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,3598,3598,3598,3598,3598,3598,3598,3598,3598,3598
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,6881,6881,6881,6881,6881,6881,6881,6881,6881,6881


In [18]:
def run(directory="data/raw"):
    """
    Load the locally saved CSV for every group/kind combination.

    Reads one file per pair through ``read_local_data``. No transformation
    is applied at this point; the frames are returned as read.

    Parameters
    ----------
    directory : str, optional
        Directory that holds the "<group>_<kind>.csv" files. Defaults to
        "data/raw", the location used when saving the downloaded data.

    Returns
    -------
    dict
        Maps "{group}_{kind}" to the DataFrame read for that pair.
    """
    return {
        f"{group}_{kind}": read_local_data(group, kind, directory)
        for group in GROUPS
        for kind in KINDS
    }


In [8]:
# Reload every saved table from disk, then preview the last three rows of
# the US deaths table.
data = run()
data["usa_deaths"].tail(n=3)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,...,2/28/23,3/1/23,3/2/23,3/3/23,3/4/23,3/5/23,3/6/23,3/7/23,3/8/23,3/9/23
3339,84090056,US,USA,840,90056.0,Unassigned,Wyoming,US,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3340,84056043,US,USA,840,56043.0,Washakie,Wyoming,US,43.904516,-107.680187,...,50,50,50,50,50,50,50,51,51,51
3341,84056045,US,USA,840,56045.0,Weston,Wyoming,US,43.839612,-104.567488,...,23,23,23,23,23,23,23,23,23,23
