# Dataset - Global

In [1]:
import pandas as pd

## Initialize

In [2]:
srcGlobal = "./time_series_covid19_confirmed_global.csv"
dest = "./time_series_covid19_confirmed_global_transformed.csv"

# Read data
globalDf = pd.read_csv(srcGlobal)
globalDf.head(20)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,12/26/20,12/27/20,12/28/20,12/29/20,12/30/20,12/31/20,1/1/21,1/2/21,1/3/21,1/4/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,50886,51039,51280,51350,51405,51526,51526,51526,51526,53011
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,55755,56254,56572,57146,57727,58316,58316,58991,59438,59623
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,97857,98249,98631,98988,99311,99610,99897,100159,100408,100645
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,7806,7821,7875,7919,7983,8049,8117,8166,8192,8249
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,17149,17240,17296,17371,17433,17553,17568,17608,17642,17684
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,155,155,158,158,158,159,159,159,160,160
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,1578267,1583297,1590513,1602163,1613928,1625514,1629594,1634834,1640718,1648940
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,157349,157834,157948,158296,158878,159409,159738,159798,160027,160220
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,118,118,118,118,118,118,118,118,118,118
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,4858,4872,4881,4906,4923,4928,4947,4958,4965,4973


## Data Manipulation

In [3]:
# Renaming columns
globalDf.rename(columns = {
    "Province/State": "Province_State", 
    "Country/Region": "Country_Region"
}, inplace = True)

# Name the "Province_State" as "Country_Region" if marked as NaN
globalDf["Province_State"] = globalDf["Province_State"].mask(pd.isnull, globalDf["Country_Region"])

# Derive confirmed cases per day and attach back to the source dataframe
locationsDf = globalDf.iloc[:, :4]
datesDf = globalDf.iloc[:, 4:].diff(axis = 1)
diffDf = pd.concat([locationsDf, datesDf], axis = 1)

# Transform spreading "date & confirmed cases" data into "Date" adn "Confirmed Cases"
globalDf = diffDf.melt(
    id_vars = ["Province_State", "Country_Region", "Lat", "Long"],
    var_name = "Date",
    value_name = "Confirmed Cases")

In [4]:
globalDf

Unnamed: 0,Province_State,Country_Region,Lat,Long,Date,Confirmed Cases
0,Afghanistan,Afghanistan,33.939110,67.709953,1/22/20,
1,Albania,Albania,41.153300,20.168300,1/22/20,
2,Algeria,Algeria,28.033900,1.659600,1/22/20,
3,Andorra,Andorra,42.506300,1.521800,1/22/20,
4,Angola,Angola,-11.202700,17.873900,1/22/20,
...,...,...,...,...,...,...
94923,Vietnam,Vietnam,14.058324,108.277199,1/4/21,3.0
94924,West Bank and Gaza,West Bank and Gaza,31.952200,35.233200,1/4/21,1009.0
94925,Yemen,Yemen,15.552727,48.516388,1/4/21,0.0
94926,Zambia,Zambia,-13.133897,27.849332,1/4/21,411.0


## Save Dataframe

In [5]:
globalDf.to_csv(dest, index = False)