# Dataset - Global

In [1]:
import pandas as pd

## Initialize

In [2]:
srcGlobal = "./time_series_covid19_confirmed_global.csv"
dest = "./time_series_covid19_confirmed_global_transformed.csv"

# Read data
globalDf = pd.read_csv(srcGlobal)
globalDf.head(20)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,10/30/20,10/31/20,11/1/20,11/2/20,11/3/20,11/4/20,11/5/20,11/6/20,11/7/20,11/8/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,41334,41425,41501,41633,41728,41814,41935,41975,42033,42092
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,20634,20875,21202,21523,21904,22300,22721,23210,23705,24206
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,57651,57942,58272,58574,58979,59527,60169,60800,61381,62051
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,4665,4756,4825,4888,4910,5045,5135,5135,5319,5383
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,10558,10805,11035,11228,11577,11813,12102,12223,12335,12433
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,127,128,128,128,128,130,130,130,131,131
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,1157179,1166924,1173533,1183131,1195276,1205928,1217028,1228814,1236851,1242182
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,87432,89813,92254,93448,94776,97150,99563,101773,104249,106424
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,114,114,114,114,114,114,114,114,114,114
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,4421,4425,4432,4435,4443,4445,4454,4459,4462,4469


## Data Manipulation

In [3]:
# Renaming columns
globalDf.rename(columns = {
    "Province/State": "Province_State", 
    "Country/Region": "Country_Region"
}, inplace = True)

# Name the "Province_State" as "Country_Region" if marked as NaN
globalDf["Province_State"] = globalDf["Province_State"].mask(pd.isnull, globalDf["Country_Region"])

# Derive confirmed cases per day and attach back to the source dataframe
locationsDf = globalDf.iloc[:, :4]
datesDf = globalDf.iloc[:, 4:].diff(axis = 1)
diffDf = pd.concat([locationsDf, datesDf], axis = 1)

# Transform spreading "date & confirmed cases" data into "Date" adn "Confirmed Cases"
globalDf = diffDf.melt(
    id_vars = ["Province_State", "Country_Region", "Lat", "Long"],
    var_name = "Date",
    value_name = "Confirmed Cases")

In [4]:
globalDf

Unnamed: 0,Province_State,Country_Region,Lat,Long,Date,Confirmed Cases
0,Afghanistan,Afghanistan,33.939110,67.709953,1/22/20,
1,Albania,Albania,41.153300,20.168300,1/22/20,
2,Algeria,Algeria,28.033900,1.659600,1/22/20,
3,Andorra,Andorra,42.506300,1.521800,1/22/20,
4,Angola,Angola,-11.202700,17.873900,1/22/20,
...,...,...,...,...,...,...
78251,West Bank and Gaza,West Bank and Gaza,31.952200,35.233200,11/8/20,501.0
78252,Western Sahara,Western Sahara,24.215500,-12.885800,11/8/20,0.0
78253,Yemen,Yemen,15.552727,48.516388,11/8/20,0.0
78254,Zambia,Zambia,-13.133897,27.849332,11/8/20,46.0


## Save Dataframe

In [5]:
globalDf.to_csv(dest, index = False)