# Data prep
Here the data is prepared for exploration and analysis.

## Geopandas
The geo file is needed for drawing the map. It is read using geopandas and modified for better usability.

In [1]:
import geopandas as gpd

#Shape file
shapefile = '../data/geo_data/ne_110m_admin_0_countries.shp'
#Read shapefile using Geopandas. Only take the columns needed
gdf = gpd.read_file(shapefile)[['ADMIN', 'ADM0_A3', 'geometry']]
#Rename columns
gdf.columns = ['country', 'country_code', 'geometry']
#Remove antartica because it is irrelevant and takes up a lot of space
gdf = gdf.drop(gdf.index[159]) 
gdf.head()

Unnamed: 0,country,country_code,geometry
0,Fiji,FJI,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000..."
1,United Republic of Tanzania,TZA,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982..."
2,Western Sahara,SAH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948..."
3,Canada,CAN,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742..."
4,United States of America,USA,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000..."


## Death count
The death count file is read using pandas and also modified for better usability.

In [15]:
import pandas as pd 

#Death count datafile
datafile = '../data/time_series_covid19_deaths_global_iso3_regions.csv'
#Read using pandas. Only include relevant columns
deaths_df = pd.read_csv(datafile,usecols = lambda column : column not in ['Lat', 'Long','Region Code', 'Region Name', 'Sub-region Code', 'Sub-region Name', 'Intermediate Region Code','Intermediate Region Name'])
#Rename column for consistency
deaths_df = deaths_df.rename(columns={"ISO 3166-1 Alpha 3-Codes": "country_code"})
deaths_df.head()

Unnamed: 0,Province/State,Country/Region,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,...,4/20/20,4/21/20,4/22/20,4/23/20,4/24/20,4/25/20,4/26/20,4/27/20,4/28/20,country_code
0,#adm1+name,#country+name,,,,,,,,,...,,,,,,,,,,#country+code
1,,Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,36.0,36.0,40.0,42.0,43.0,47.0,50.0,57.0,58.0,AFG
2,,Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,26.0,26.0,27.0,27.0,27.0,27.0,28.0,28.0,30.0,ALB
3,,Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,384.0,392.0,402.0,407.0,415.0,419.0,425.0,432.0,437.0,DZA
4,,Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,37.0,37.0,37.0,37.0,40.0,40.0,40.0,40.0,41.0,AND


For some countries the death count has been entered for individual provinces or states, therefore the data is grouped by country and summed to get the count for each country.

In [3]:
country_total_df = deaths_df.groupby(['country_code']).sum().reset_index()

Finally the dataframe is merged with the geo data in order to add geometry for each country and NaNs are replaced with 'No data'.

In [4]:
country_total_df = gdf.merge(country_total_df, on = 'country_code', how = 'left')
country_total_df.fillna('No data', inplace = True)
country_total_df.head()

Unnamed: 0,country,country_code,geometry,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,...,4/19/20,4/20/20,4/21/20,4/22/20,4/23/20,4/24/20,4/25/20,4/26/20,4/27/20,4/28/20
0,Fiji,FJI,"MULTIPOLYGON (((180.00000 -16.06713, 180.00000...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,United Republic of Tanzania,TZA,"POLYGON ((33.90371 -0.95000, 34.07262 -1.05982...",0,0,0,0,0,0,0,...,7,10,10,10,10,10,10,10,10,10
2,Western Sahara,SAH,"POLYGON ((-8.66559 27.65643, -8.66512 27.58948...",No data,No data,No data,No data,No data,No data,No data,...,No data,No data,No data,No data,No data,No data,No data,No data,No data,No data
3,Canada,CAN,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742...",0,0,0,0,0,0,0,...,1564,1726,1909,2077,2240,2401,2570,2686,2840,2982
4,United States of America,USA,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000...",0,0,0,0,0,0,0,...,40945,42659,45086,47412,49724,51493,53755,54881,56259,58355


## Countermeasures
The countermeasures data file is read using pandas. Flag and notes columns are emitted as these are not relevant for this analysis. 

In [13]:
countermeasures_file = '../data/OxCGRT.csv'
#Read csv file using pandas
countermeasures_df = pd.read_csv(countermeasures_file, usecols = lambda column : ('_Flag' not in column and '_Notes' not in column))
countermeasures_df = countermeasures_df.rename(columns={"CountryCode": "country_code"})
countermeasures_df.head()

Unnamed: 0,CountryName,country_code,Date,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,...,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,M1_Wildcard,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyIndexForDisplay,LegacyStringencyIndex,LegacyStringencyIndexForDisplay
0,Aruba,ABW,20200101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
1,Afghanistan,AFG,20200101,0.0,0.0,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
2,Angola,AGO,20200101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
3,Albania,ALB,20200101,0.0,0.0,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
4,Andorra,AND,20200101,0.0,0.0,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0


Columns names are modified so they do not include spaces.

In [16]:
countermeasures_df.columns = countermeasures_df.columns.str.replace(' ', '_')

The date column is formatted so it has the same format as in the death count dataframe

In [17]:
def convertDate(date): 
    # convert 20200122 into 1/22/20
    s = str(date)
    year = s[2:4]
    month = s[5:6]
    day = str(int(s[6:8]))
    return month + '/' + day + '/' + year

countermeasures_df['Date'] = countermeasures_df['Date'].apply(convertDate)

In [7]:
countermeasures_df.head()

Unnamed: 0,CountryName,country_code,Date,C1_School_closing,C1_Flag,C1_Notes,C2_Workplace_closing,C2_Flag,C2_Notes,C3_Cancel_public_events,...,H5_Investment_in_vaccines,H5_Notes,M1_Wildcard,M1_Notes,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyIndexForDisplay,LegacyStringencyIndex,LegacyStringencyIndexForDisplay
0,Aruba,ABW,1/1/20,0.0,,,0.0,,,0.0,...,0.0,,,,,,0.0,0.0,0.0,0.0
1,Afghanistan,AFG,1/1/20,0.0,,https://en.unesco.org/themes/education-emergen...,0.0,,"Form January 1 to March 25th, there were no cl...",0.0,...,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0
2,Angola,AGO,1/1/20,0.0,,,0.0,,,0.0,...,0.0,,,,,,0.0,0.0,0.0,0.0
3,Albania,ALB,1/1/20,0.0,,,0.0,,,0.0,...,0.0,,,,,,0.0,0.0,0.0,0.0
4,Andorra,AND,1/1/20,0.0,,,0.0,,,0.0,...,0.0,,,,,,0.0,0.0,0.0,0.0


### Create separate dataframes for each countermeasure
To enable map interactions separate dataframes are need for each countermeasure.

A list of items to include is defined. Only the containments and closures are selected. These are the columns starting with uppercase C. 

In [9]:
columns = list(countermeasures_df.columns)
items_to_include = [item for item in columns if (item.startswith('C') and '_' in item)]

The countermeasure dataframes are created and saved in a dictionary. Each dataframe is also merged with the geodata. 

In [10]:
countermeasure_dfs = {}
for item in items_to_include:
    t = countermeasures_df[['country_code', 'Date', item]]
    countermeasure_dfs[item] = t.set_index(['country_code', 'Date'])[item].unstack().reset_index()
    countermeasure_dfs[item] = gdf.merge(countermeasure_dfs[item], on = 'country_code', how = 'left')
    countermeasure_dfs[item].fillna('No data', inplace = True)