# Data prep
Here the data is prepared for exploration and analysis.

## Geopandas
The geo file is needed for drawing the map. It is read using geopandas and modified for better usability.

In [176]:
import geopandas as gpd

#Shape file
shapefile = '../data/geo_data/ne_110m_admin_0_countries.shp'
#Read shapefile using Geopandas. Only take the columns needed
gdf = gpd.read_file(shapefile)[['ADMIN', 'ADM0_A3', 'geometry']]
#Rename columns
gdf.columns = ['country', 'country_code', 'geometry']
#Remove antartica because it is irrelevant and takes up a lot of space
gdf = gdf.drop(gdf.index[159]) 
gdf.head()

Unnamed: 0,country,country_code,geometry
0,Fiji,FJI,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,United Republic of Tanzania,TZA,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,Western Sahara,SAH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,Canada,CAN,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,United States of America,USA,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."


## Death count
The death count file is read using pandas and also modified for better usability.

In [177]:
import pandas as pd 

#Death count datafile
datafile = '../data/time_series_covid19_deaths_global_iso3_regions.csv'
#Read using pandas. Only include relevant columns
deaths_df = pd.read_csv(datafile,usecols = lambda column : column not in ['Lat', 'Long','Region Code', 'Region Name', 'Sub-region Code', 'Sub-region Name', 'Intermediate Region Code','Intermediate Region Name'])
#Rename column for consistency
deaths_df = deaths_df.rename(columns={"ISO 3166-1 Alpha 3-Codes": "country_code"})
deaths_df.head()

Unnamed: 0,Province/State,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,...,4/26/20,4/27/20,4/28/20,4/29/20,4/30/20,5/1/20,5/2/20,5/3/20,5/4/20,country_code
0,#adm1+name,#country+name,,,,,,,,,...,,,,,,,,,,#country+code
1,,Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.0,57.0,58.0,60.0,64.0,68.0,72.0,85.0,90.0,AFG
2,,Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,28.0,28.0,30.0,30.0,31.0,31.0,31.0,31.0,31.0,ALB
3,,Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,425.0,432.0,437.0,444.0,450.0,453.0,459.0,463.0,465.0,DZA
4,,Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,40.0,40.0,41.0,42.0,42.0,43.0,44.0,45.0,45.0,AND


For some countries the death count has been entered for individual provinces or states, therefore the data is grouped by country and summed to get the count for each country.

In [66]:
country_total_df = deaths_df.groupby(['country_code']).sum().reset_index()

Finally the dataframe is merged with the geo data in order to add geometry for each country and NaNs are replaced with 'No data'.

In [67]:
country_total_df = gdf.merge(country_total_df, on = 'country_code', how = 'left')
country_total_df.fillna('No data', inplace = True)
country_total_df.head()

Unnamed: 0,country,country_code,geometry,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/19/20,4/20/20,4/21/20,4/22/20,4/23/20,4/24/20,4/25/20,4/26/20,4/27/20,4/28/20
0,Fiji,FJI,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,United Republic of Tanzania,TZA,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...",0,0,0,0,0,0,0,...,7,10,10,10,10,10,10,10,10,10
2,Western Sahara,SAH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data
3,Canada,CAN,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742...",0,0,0,0,0,0,0,...,1564,1726,1909,2077,2240,2401,2570,2686,2840,2982
4,United States of America,USA,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000...",0,0,0,0,0,0,0,...,40945,42659,45086,47412,49724,51493,53755,54881,56259,58355


## Countermeasures
The countermeasures data file is read using pandas. Flag and notes columns are emitted as these are not relevant for this analysis. 

In [68]:
countermeasures_file = '../data/OxCGRT.csv'
#Read csv file using pandas
countermeasures_df = pd.read_csv(countermeasures_file, usecols = lambda column : ('_Flag' not in column and '_Notes' not in column))
countermeasures_df = countermeasures_df.rename(columns={"CountryCode": "country_code"})
countermeasures_df.head()

Unnamed: 0,CountryName,country_code,Date,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,...,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,M1_Wildcard,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyIndexForDisplay,LegacyStringencyIndex,LegacyStringencyIndexForDisplay
0,Aruba,ABW,20200101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
1,Afghanistan,AFG,20200101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
2,Angola,AGO,20200101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
3,Albania,ALB,20200101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
4,Andorra,AND,20200101,0.0,0.0,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0


Columns names are modified so they do not include spaces.

In [69]:
countermeasures_df.columns = countermeasures_df.columns.str.replace(' ', '_')

The date column is formatted so it has the same format as in the death count dataframe

In [70]:
def convertDate(date): 
    # convert 20200122 into 1/22/20
    s = str(date)
    year = s[2:4]
    month = s[5:6]
    day = str(int(s[6:8]))
    return month + '/' + day + '/' + year

countermeasures_df['Date'] = countermeasures_df['Date'].apply(convertDate)

In [71]:
countermeasures_df.head()

Unnamed: 0,CountryName,country_code,Date,C1_School_closing,C2_Workplace_closing,C3_Cancel_public_events,C4_Restrictions_on_gatherings,C5_Close_public_transport,C6_Stay_at_home_requirements,C7_Restrictions_on_internal_movement,...,H3_Contact_tracing,H4_Emergency_investment_in_healthcare,H5_Investment_in_vaccines,M1_Wildcard,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyIndexForDisplay,LegacyStringencyIndex,LegacyStringencyIndexForDisplay
0,Aruba,ABW,1/1/20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
1,Afghanistan,AFG,1/1/20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
2,Angola,AGO,1/1/20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
3,Albania,ALB,1/1/20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
4,Andorra,AND,1/1/20,0.0,0.0,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0


### Create separate dataframes for each countermeasure
To enable map interactions a separate dataframe is created for each countermeasure.

A list of items to include is defined. Only the containments and closures are selected. These are the columns starting with uppercase C. 

In [72]:
columns = list(countermeasures_df.columns)
items_to_include = [item for item in columns if (item.startswith('C') and '_' in item)]

The countermeasure dataframes are created and saved in a dictionary. Each dataframe is also merged with the geodata. 

In [73]:
countermeasure_dfs = {}
for item in items_to_include:
    t = countermeasures_df[['country_code', 'Date', item]]
    countermeasure_dfs[item] = t.set_index(['country_code', 'Date'])[item].unstack().reset_index()
    countermeasure_dfs[item] = gdf.merge(countermeasure_dfs[item], on = 'country_code', how = 'left')
    countermeasure_dfs[item].fillna('No data', inplace = True)

# Population
To be able to show dead per million population count for each country is needed. 
source: https://data.worldbank.org/indicator/sp.pop.totl

In [143]:
#Death count datafile
datafile = '../data/population.csv'
#Read using pandas. Only include relevant columns
population = pd.read_csv(datafile,header=2, usecols = lambda column : column in ['Country Code', '2018'])
#Rename columns
population.columns=['country_code', 'population']

Add population info to country death count dataframe by merging population with dataframe.

In [144]:
#Merge on contry code
country_total_df_with_population = country_total_df.merge(population, on = 'country_code', how = 'left')
country_total_df_with_population.head()

Unnamed: 0,country,country_code,geometry,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/20/20,4/21/20,4/22/20,4/23/20,4/24/20,4/25/20,4/26/20,4/27/20,4/28/20,population
0,Fiji,FJI,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,883483.0
1,United Republic of Tanzania,TZA,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...",0,0,0,0,0,0,0,...,10,10,10,10,10,10,10,10,10,56318348.0
2,Western Sahara,SAH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,
3,Canada,CAN,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742...",0,0,0,0,0,0,0,...,1726,1909,2077,2240,2401,2570,2686,2840,2982,37057765.0
4,United States of America,USA,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000...",0,0,0,0,0,0,0,...,42659,45086,47412,49724,51493,53755,54881,56259,58355,326687501.0


The population is missing for some countries. The following lists all rows with missing population.

In [145]:
country_total_df_with_population[country_total_df_with_population['population'].isna()]

Unnamed: 0,country,country_code,geometry,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/20/20,4/21/20,4/22/20,4/23/20,4/24/20,4/25/20,4/26/20,4/27/20,4/28/20,population
2,Western Sahara,SAH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,
20,Falkland Islands,FLK,"POLYGON ((-61.20000 -51.85000, -60.00000 -51.2...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
23,French Southern and Antarctic Lands,ATF,"POLYGON ((68.93500 -48.62500, 69.58000 -48.940...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,
79,Palestine,PSX,"POLYGON ((35.39756 31.48909, 34.92741 31.35344...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,
140,Taiwan,TWN,"POLYGON ((121.77782 24.39427, 121.17563 22.790...",0,0,0,0,0,0,0,...,6,6,6,6,6,6,6,6,6,
154,Eritrea,ERI,"POLYGON ((36.42951 14.42211, 36.32322 14.82249...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
159,Northern Cyprus,CYN,"POLYGON ((32.73178 35.14003, 32.80247 35.14550...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,
166,Somaliland,SOL,"POLYGON ((48.94820 11.41062, 48.94820 11.41062...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,
173,Kosovo,KOS,"POLYGON ((20.59025 41.85541, 20.52295 42.21787...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,
175,South Sudan,SDS,"POLYGON ((30.83385 3.50917, 29.95350 4.17370, ...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,


The population count is missing for 10 of the countries in the death count dataset. Since there are only 10 missing values these are added manually. The population for every country but two is found on www.worldometers.info. The sources for the other two are listed below.

Western Sahara: 567,402

Falkland Islands: 3,234

French Southern and Antarctic Lands	: 0 (source: https://en.wikipedia.org/wiki/French_Southern_and_Antarctic_Lands)

Palestine: 4,862,979

Taiwan: 23,726,460

Eritrea: 3,457,786

Northern Cyprus: 1,189,265

Somaliland: 15,008,226

Kosovo: 1,797,086 (source: https://en.wikipedia.org/wiki/Demographics_of_Kosovo)

South Sudan: 10,975,927



The population is added manually by index.

In [146]:
#Set population value for the missing countries
country_total_df_with_population.at[2,'population'] = 567402
country_total_df_with_population.at[20,'population'] = 3234
country_total_df_with_population.at[23,'population'] = 0
country_total_df_with_population.at[79,'population'] = 4862979
country_total_df_with_population.at[140,'population'] = 23726460
country_total_df_with_population.at[154,'population'] = 3457786
country_total_df_with_population.at[159,'population'] = 1189265
country_total_df_with_population.at[166,'population'] = 15008226
country_total_df_with_population.at[173,'population'] = 1797086
country_total_df_with_population.at[175,'population'] = 10975927

A final check to insure that there are no missing values.

In [147]:
country_total_df_with_population[country_total_df_with_population['population'].isna()]

Unnamed: 0,country,country_code,geometry,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/20/20,4/21/20,4/22/20,4/23/20,4/24/20,4/25/20,4/26/20,4/27/20,4/28/20,population


In [148]:
country_total_df_with_population.isnull().values.any()

False

The dataframe is now complete. 

# Dataframe with dead per million
To make it easier to work with a separate dataframe is created where the death count is listed as number of dead per million instead of total number of dead. 

First a list of the relevant columns is created. These are all the date columns. There are a few special columns that do not contain other values for the countries than dead count. These are the columns: country, country code, geometry and the newly added population. 

In [169]:
#List special columns
special_columns = ['country', 'country_code', 'geometry', 'population']
#List date columns by taking every column that is not in the other list
date_columns = [x for x in country_total_df_with_population.columns if x not in special_columns]

The new dataframe is created by dividing by population divided with a million. There are some 'no data' values which must be skipped and there are some countries with a population of zero witch must also be skipped. 

In [174]:
#Copy the 
dead_per_million_df = country_total_df_with_population.copy()
for date in date_columns: 
    #Convert column to numeric
    dead_per_million_df[date]= pd.to_numeric(dead_per_million_df[date], errors='coerce')
    #Calculate dead per million
    dead_per_million_df[date] = (dead_per_million_df[date]/((country_total_df_with_population['population'])/1000000)).where(not(isinstance(country_total_df_with_population[date], str) ) and (country_total_df_with_population['population']>0), dead_per_million_df[date])

#Fill missing values with 'No data' again
dead_per_million_df.fillna('No data', inplace = True)

Take a look at the new dataframe. 

In [175]:
dead_per_million_df.head()

Unnamed: 0,country,country_code,geometry,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/20/20,4/21/20,4/22/20,4/23/20,4/24/20,4/25/20,4/26/20,4/27/20,4/28/20,population
0,Fiji,FJI,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,883483.0
1,United Republic of Tanzania,TZA,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...",0,0,0,0,0,0,0,...,0.177562,0.177562,0.177562,0.177562,0.177562,0.177562,0.177562,0.177562,0.177562,56318348.0
2,Western Sahara,SAH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,567402.0
3,Canada,CAN,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742...",0,0,0,0,0,0,0,...,46.5759,51.5142,56.0476,60.4462,64.7907,69.3512,72.4814,76.6371,80.469,37057765.0
4,United States of America,USA,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000...",0,0,0,0,0,0,0,...,130.58,138.01,145.13,152.207,157.622,164.546,167.992,172.21,178.626,326687501.0
