# Dataset - All

Merge the "global", "US", "Brazil", and "Argentina" confirmed-case datasets into a single CSV file.

In [1]:
import pandas as pd

## Initialize

In [2]:
# Input CSVs (one per data source) and the path of the merged output file.
srcGlobal = "./time_series_covid19_confirmed_global_transformed.csv"
srcUS = "./time_series_covid19_confirmed_US_transformed.csv"
srcBrazil = "../vinicius_da_silva/output_brazil.csv"
srcArgentina = "../vinicius_da_silva/output_argentina.csv"
dest = "./time_series_covid19_confirmed_all.csv"

# Load every source into its own DataFrame, in the order the paths are listed.
globalDf, usDf, brazilDf, argentinaDf = (
    pd.read_csv(csvPath)
    for csvPath in (srcGlobal, srcUS, srcBrazil, srcArgentina)
)

## Data Manipulation

In [3]:
# Keep only the regional rows of brazilDf: rows whose Province_State is
# "Brazil" are removed (Brazil has been subsampled to regions).
brazilDf = brazilDf.loc[brazilDf["Province_State"] != "Brazil"]

# Brazil, Argentina and the US each have a dedicated frame that is merged
# below, so their rows are removed from globalDf first.
separatelySourced = ["Brazil", "Argentina", "US"]
globalDf = globalDf.loc[~globalDf["Country_Region"].isin(separatelySourced)]

# Stack the frames row-wise, one after another (global, US, Brazil,
# Argentina), then renumber the rows from 0.
finalDf = globalDf
for regionalDf in (usDf, brazilDf, argentinaDf):
    finalDf = pd.concat([finalDf, regionalDf], axis=0)
finalDf = finalDf.reset_index(drop=True)

# Sanitize the case counts: missing values become 0, then negative values
# become 0.
confirmedCases = finalDf["Confirmed Cases"]
confirmedCases = confirmedCases.mask(confirmedCases.isnull(), 0)
confirmedCases = confirmedCases.mask(confirmedCases < 0, 0)
finalDf["Confirmed Cases"] = confirmedCases

# Remove rows without usable coordinates, i.e. (Lat, Long) == (0, 0) or
# (NaN, NaN). According to the original author's inspection this affects:
#   - (0, 0)    : 'Diamond Princess', 'Grand Princess', 'MS Zaandam'
#   - (NaN, NaN): 'Repatriated Travellers'
# To list the affected regions, run the following before the drop:
#   finalDf[atOrigin | missingCoords]["Province_State"].unique()
atOrigin = (finalDf["Lat"] == 0) & (finalDf["Long"] == 0)
missingCoords = finalDf["Lat"].isnull() & finalDf["Long"].isnull()
finalDf = finalDf.drop(index=finalDf.loc[atOrigin | missingCoords].index)


In [4]:
# Preview the merged, cleaned frame; the rendered output elides the middle rows.
finalDf

Unnamed: 0,Province_State,Country_Region,Lat,Long,Date,Confirmed Cases
0,Afghanistan,Afghanistan,33.93911,67.709953,1/22/20,0.0
1,Albania,Albania,41.15330,20.168300,1/22/20,0.0
2,Algeria,Algeria,28.03390,1.659600,1/22/20,0.0
3,Andorra,Andorra,42.50630,1.521800,1/22/20,0.0
4,Angola,Angola,-11.20270,17.873900,1/22/20,0.0
...,...,...,...,...,...,...
125289,Jujuy,Argentina,-23.75000,-65.500000,12/19/20,4.0
125290,Jujuy,Argentina,-23.75000,-65.500000,10/03/20,55.0
125291,Jujuy,Argentina,-23.75000,-65.500000,09/07/20,247.0
125292,Jujuy,Argentina,-23.75000,-65.500000,06/10/20,1.0


## Save Dataframe

In [5]:
# Write the merged frame to `dest`; index=False keeps the row index out of the CSV.
finalDf.to_csv(dest, index=False)