# Dataset - Global

In [1]:
import pandas as pd

## Initialize

In [2]:
srcGlobal = "./time_series_covid19_confirmed_global.csv"
dest = "./time_series_covid19_confirmed_global_transformed.csv"

# Read data
globalDf = pd.read_csv(srcGlobal)
globalDf.head(20)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,1/12/21,1/13/21,1/14/21,1/15/21,1/16/21,1/17/21,1/18/21,1/19/21,1/20/21,1/21/21
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,53584,53584,53775,53831,53938,53984,54062,54141,54278,54403
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,64627,65334,65994,66635,67216,67690,67982,68568,69238,69916
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,102641,102860,103127,103381,103611,103833,104092,104341,104606,104852
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,8682,8818,8868,8946,9038,9083,9083,9194,9308,9379
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,18343,18425,18613,18679,18765,18875,18926,19011,19093,19177
5,,Antigua and Barbuda,17.0608,-61.7964,0,0,0,0,0,0,...,176,176,184,184,187,189,189,190,190,192
6,,Argentina,-38.4161,-63.6167,0,0,0,0,0,0,...,1744704,1757429,1770715,1783047,1791979,1799243,1807428,1819569,1831681,1843077
7,,Armenia,40.0691,45.0382,0,0,0,0,0,0,...,162643,163128,163576,163972,164235,164586,164676,164912,165221,165528
8,Australian Capital Territory,Australia,-35.4735,149.0124,0,0,0,0,0,0,...,118,118,118,118,118,118,118,118,118,118
9,New South Wales,Australia,-33.8688,151.2093,0,0,0,0,3,4,...,5041,5043,5045,5057,5066,5074,5076,5079,5084,5083


## Data Manipulation

In [3]:
# Renaming columns
globalDf.rename(columns = {
    "Province/State": "Province_State", 
    "Country/Region": "Country_Region"
}, inplace = True)

# Name the "Province_State" as "Country_Region" if marked as NaN
globalDf["Province_State"] = globalDf["Province_State"].mask(pd.isnull, globalDf["Country_Region"])

# Derive confirmed cases per day and attach back to the source dataframe
locationsDf = globalDf.iloc[:, :4]
datesDf = globalDf.iloc[:, 4:].diff(axis = 1)
diffDf = pd.concat([locationsDf, datesDf], axis = 1)

# Transform spreading "date & confirmed cases" data into "Date" adn "Confirmed Cases"
globalDf = diffDf.melt(
    id_vars = ["Province_State", "Country_Region", "Lat", "Long"],
    var_name = "Date",
    value_name = "Confirmed Cases")

In [4]:
globalDf

Unnamed: 0,Province_State,Country_Region,Lat,Long,Date,Confirmed Cases
0,Afghanistan,Afghanistan,33.939110,67.709953,1/22/20,
1,Albania,Albania,41.153300,20.168300,1/22/20,
2,Algeria,Algeria,28.033900,1.659600,1/22/20,
3,Andorra,Andorra,42.506300,1.521800,1/22/20,
4,Angola,Angola,-11.202700,17.873900,1/22/20,
...,...,...,...,...,...,...
99913,Vietnam,Vietnam,14.058324,108.277199,1/21/21,2.0
99914,West Bank and Gaza,West Bank and Gaza,31.952200,35.233200,1/21/21,473.0
99915,Yemen,Yemen,15.552727,48.516388,1/21/21,0.0
99916,Zambia,Zambia,-13.133897,27.849332,1/21/21,1264.0


## Save Dataframe

In [5]:
globalDf.to_csv(dest, index = False)