# Dataset - Global

In [1]:
import pandas as pd

## Initialize

In [2]:
srcGlobal = "./time_series_covid19_confirmed_global.csv"
dest = "./time_series_covid19_confirmed_global_transformed.csv"

# Read data
globalDf = pd.read_csv(srcGlobal)
globalDf.head(20)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,11/20/20,11/21/20,11/22/20,11/23/20,11/24/20,11/25/20,11/26/20,11/27/20,11/28/20,11/29/20
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,44443,44503,44706,44988,45280,45490,45716,45839,45966,46215
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,31459,32196,32761,33556,34300,34944,35600,36245,36790,37625
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,72755,73774,74862,75867,77000,78025,79110,80168,81212,82221
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,6142,6207,6256,6304,6351,6428,6534,6610,6610,6712
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,14267,14413,14493,14634,14742,14821,14920,15008,15087,15103
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,139,139,139,139,139,140,141,141,141,141
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,1359042,1366182,1370366,1374631,1381795,1390388,1399431,1407277,1413375,1418807
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,123646,124839,126224,126709,127522,129085,130870,132346,133594,134768
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,115,115,115,115,115,115,116,117,117,117
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,4527,4538,4542,4548,4552,4552,4556,4564,4568,4577


## Data Manipulation

In [3]:
# Renaming columns
globalDf.rename(columns = {
    "Province/State": "Province_State", 
    "Country/Region": "Country_Region"
}, inplace = True)

# Name the "Province_State" as "Country_Region" if marked as NaN
globalDf["Province_State"] = globalDf["Province_State"].mask(pd.isnull, globalDf["Country_Region"])

# Derive confirmed cases per day and attach back to the source dataframe
locationsDf = globalDf.iloc[:, :4]
datesDf = globalDf.iloc[:, 4:].diff(axis = 1)
diffDf = pd.concat([locationsDf, datesDf], axis = 1)

# Transform spreading "date & confirmed cases" data into "Date" adn "Confirmed Cases"
globalDf = diffDf.melt(
    id_vars = ["Province_State", "Country_Region", "Lat", "Long"],
    var_name = "Date",
    value_name = "Confirmed Cases")

In [4]:
globalDf

Unnamed: 0,Province_State,Country_Region,Lat,Long,Date,Confirmed Cases
0,Afghanistan,Afghanistan,33.939110,67.709953,1/22/20,
1,Albania,Albania,41.153300,20.168300,1/22/20,
2,Algeria,Algeria,28.033900,1.659600,1/22/20,
3,Andorra,Andorra,42.506300,1.521800,1/22/20,
4,Angola,Angola,-11.202700,17.873900,1/22/20,
...,...,...,...,...,...,...
84818,West Bank and Gaza,West Bank and Gaza,31.952200,35.233200,11/29/20,1695.0
84819,Western Sahara,Western Sahara,24.215500,-12.885800,11/29/20,0.0
84820,Yemen,Yemen,15.552727,48.516388,11/29/20,17.0
84821,Zambia,Zambia,-13.133897,27.849332,11/29/20,19.0


## Save Dataframe

In [5]:
globalDf.to_csv(dest, index = False)