In [148]:
import pandas as pd

# Retrieve COVID data from ECDC 

In [149]:
covid_url = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
import json
import urllib

In [150]:
covid_json_unformated = urllib.request.urlopen(covid_url).read().decode("utf-8")
covid_json = json.loads(covid_json_unformated)
cdf = pd.DataFrame(covid_json['records'])

In [151]:
cdf.shape

(61900, 12)

In [152]:
cdf.sample(10)

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2019,continentExp,Cumulative_number_for_14_days_of_COVID-19_cases_per_100000
16019,15/09/2020,15,9,2020,1,0,Djibouti,DJ,DJI,973557.0,Africa,0.9244451
28659,21/06/2020,21,6,2020,0,0,Isle_of_Man,IM,IMN,84589.0,Europe,0.0
27145,24/10/2020,24,10,2020,4369,118,Indonesia,ID,IDN,270625567.0,Asia,21.15542912
56150,09/11/2020,9,11,2020,33,0,Togo,TG,TGO,8082359.0,Africa,3.90974962
36262,30/06/2020,30,6,2020,0,0,Malta,MT,MLT,493559.0,Europe,2.83654031
5677,07/06/2020,7,6,2020,69,11,Belgium,BE,BEL,11455519.0,Europe,18.06116336
30376,14/11/2020,14,11,2020,4469,71,Jordan,JO,JOR,10101697.0,Asia,665.71982905
55884,07/11/2020,7,11,2020,0,0,Timor_Leste,TL,TLS,1293120.0,Asia,0.07733234
11631,17/09/2020,17,9,2020,3,0,Chad,TD,TCD,15946882.0,Africa,0.45776974
24743,23/03/2020,23,3,2020,0,0,Guinea,GN,GIN,12771246.0,Africa,


Rename columns to something more Pythonian. If you think they look already great, then at least rename `notification_rate_per_100000_population_14-days` to `14d-incidence`

In [153]:
cdf.rename(columns={"dateRep": "date", "countriesAndTerritories": "region", "geoId": "Geographical ID", "countryterritoryCode": "Region Code", "popData2019" : "population (2019)", "continentExp": "continent", "Cumulative_number_for_14_days_of_COVID-19_cases_per_100000": "14-day incidence"}, inplace = True)

cdf.head()

Unnamed: 0,date,day,month,year,cases,deaths,region,Geographical ID,Region Code,population (2019),continent,14-day incidence
0,14/12/2020,14,12,2020,746,6,Afghanistan,AF,AFG,38041757.0,Asia,9.01377925
1,13/12/2020,13,12,2020,298,9,Afghanistan,AF,AFG,38041757.0,Asia,7.05277624
2,12/12/2020,12,12,2020,113,11,Afghanistan,AF,AFG,38041757.0,Asia,6.86876792
3,11/12/2020,11,12,2020,63,10,Afghanistan,AF,AFG,38041757.0,Asia,7.13426564
4,10/12/2020,10,12,2020,202,16,Afghanistan,AF,AFG,38041757.0,Asia,6.96865815


Identify which columns have not been casted to an appropriate type during loading!

In [154]:
cdf.dtypes

date                  object
day                   object
month                 object
year                  object
cases                  int64
deaths                 int64
region                object
Geographical ID       object
Region Code           object
population (2019)    float64
continent             object
14-day incidence      object
dtype: object

We did not cover datetime objects in pandas, however they are quite powerful!

Try:

In [155]:
cdf['date'] = pd.to_datetime(cdf['date'])

Now you can treat the column as a datetime objects using `df[col].dt` , e.g. https://docs.python.org/3/library/datetime.html#datetime.date.year

In [156]:
cdf['date'].dt.day.head()

0    14
1    13
2    12
3    12
4    12
Name: date, dtype: int64

In [157]:
cdf.head()

Unnamed: 0,date,day,month,year,cases,deaths,region,Geographical ID,Region Code,population (2019),continent,14-day incidence
0,2020-12-14,14,12,2020,746,6,Afghanistan,AF,AFG,38041757.0,Asia,9.01377925
1,2020-12-13,13,12,2020,298,9,Afghanistan,AF,AFG,38041757.0,Asia,7.05277624
2,2020-12-12,12,12,2020,113,11,Afghanistan,AF,AFG,38041757.0,Asia,6.86876792
3,2020-11-12,11,12,2020,63,10,Afghanistan,AF,AFG,38041757.0,Asia,7.13426564
4,2020-10-12,10,12,2020,202,16,Afghanistan,AF,AFG,38041757.0,Asia,6.96865815


Create a new column `deltaTime_since_start_of_recording`

In [158]:
cdf.loc[0, 'date']

Timestamp('2020-12-14 00:00:00')

In [159]:
minimum = min(cdf['date'])

In [160]:
cdf.loc[0, 'date'] - min(cdf['date'])

Timedelta('349 days 00:00:00')

In [161]:
subset = cdf.head().copy()

In [162]:
cdf['deltaTime_since_start_of_recording'] = cdf['date'].apply(lambda x: x - minimum)

In [163]:
cdf

Unnamed: 0,date,day,month,year,cases,deaths,region,Geographical ID,Region Code,population (2019),continent,14-day incidence,deltaTime_since_start_of_recording
0,2020-12-14,14,12,2020,746,6,Afghanistan,AF,AFG,38041757.0,Asia,9.01377925,349 days
1,2020-12-13,13,12,2020,298,9,Afghanistan,AF,AFG,38041757.0,Asia,7.05277624,348 days
2,2020-12-12,12,12,2020,113,11,Afghanistan,AF,AFG,38041757.0,Asia,6.86876792,347 days
3,2020-11-12,11,12,2020,63,10,Afghanistan,AF,AFG,38041757.0,Asia,7.13426564,317 days
4,2020-10-12,10,12,2020,202,16,Afghanistan,AF,AFG,38041757.0,Asia,6.96865815,286 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61895,2020-03-25,25,03,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,,85 days
61896,2020-03-24,24,03,2020,0,1,Zimbabwe,ZW,ZWE,14645473.0,Africa,,84 days
61897,2020-03-23,23,03,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,,83 days
61898,2020-03-22,22,03,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,,82 days


Change 14-day incidence column from object to float.

In [164]:
cdf['14-day incidence'] = pd.to_numeric(cdf['14-day incidence'])

In [165]:
cdf.dtypes

date                                   datetime64[ns]
day                                            object
month                                          object
year                                           object
cases                                           int64
deaths                                          int64
region                                         object
Geographical ID                                object
Region Code                                    object
population (2019)                             float64
continent                                      object
14-day incidence                              float64
deltaTime_since_start_of_recording    timedelta64[ns]
dtype: object

Create histograms for different columns or describe the df. Can you spot the inconsistency in the data? Fix it! :)

In [166]:
import plotly.express as px

import numpy as np

In [167]:
!pip install -U kaleido



In [168]:
cdf.describe()

Unnamed: 0,cases,deaths,population (2019),14-day incidence,deltaTime_since_start_of_recording
count,61900.0,61900.0,61777.0,59021.0,61900
mean,1155.147237,26.05546,40987700.0,66.320586,188 days 16:08:43.890145394
std,6779.224479,131.227055,153129400.0,162.32924,95 days 19:39:14.114916173
min,-8261.0,-1918.0,815.0,-147.419587,0 days 00:00:00
25%,0.0,0.0,1293120.0,0.757526,111 days 00:00:00
50%,15.0,0.0,7169456.0,6.724045,192 days 00:00:00
75%,273.0,4.0,28515830.0,52.572719,270 days 00:00:00
max,234633.0,4928.0,1433784000.0,1900.83621,349 days 00:00:00


In [169]:
#remove negative values
cdf_cleaned = cdf[(cdf['cases']>=0) & (cdf['deaths'] >=0) & (cdf['14-day incidence']>=0)]

cdf_cleaned.describe()

Unnamed: 0,cases,deaths,population (2019),14-day incidence,deltaTime_since_start_of_recording
count,58959.0,58959.0,58959.0,58959.0,58959
mean,1212.754168,27.316135,41241310.0,66.402666,191 days 20:55:00.870096166
std,6941.109276,133.162595,153765500.0,162.373044,94 days 23:25:18.727953598
min,0.0,0.0,815.0,0.0,2 days 00:00:00
25%,0.0,0.0,1324820.0,0.763241,117 days 00:00:00
50%,20.0,0.0,7813207.0,6.732521,196 days 00:00:00
75%,310.0,5.0,28608720.0,52.601692,272 days 00:00:00
max,234633.0,4928.0,1433784000.0,1900.83621,349 days 00:00:00


In [170]:
def create_plots(df, directory):
    for col in df.columns:
        if df[col].dtypes not in [np.int64,np.float64]:
            continue
        else:
            fig = px.histogram(df, x=col)
            fig.write_image(f"{directory}/"+col+".png")

In [171]:
create_plots(cdf_cleaned, 'histograms')

Identify those countries (grouped by continent) which showed the most drastic increase most drastic and decrease of the `14d-incidence` within the different years since recording. Visualize intuitively!

In [172]:
def calculate_in_decrease(df, col_meta = 'region', col_1 = 'date', col_2 = '14-day incidence'):

    set_of_meta_values = set(df[col_meta])

    dict = {}

    for meta_value in set_of_meta_values:

        subset = df[df[col_meta] == meta_value]

        maximum_date = max(subset[col_1])

        minimum_date = min(subset[col_1])
        
        incidence_2 = subset[subset[col_1] == maximum_date][col_2]  

        incidence_2 = incidence_2.values[0]

        incidence_1 = subset[subset[col_1] == minimum_date][col_2] 

        incidence_1 = incidence_1.values[0]

        dif = incidence_2 - incidence_1

        dict.update({meta_value: dif})

    return dict

In [173]:
dict_region = calculate_in_decrease(cdf_cleaned)

In [183]:
df_region = pd.DataFrame.from_dict(dict_region, orient='index')

df_region.rename(columns = {0: "14-day incidence change within year"}, inplace=True)

df_region

Unnamed: 0,14-day incidence change within year
India,33.109055
Ireland,80.420208
Turks_and_Caicos_islands,52.364246
Togo,3.167392
Laos,0.027896
...,...
South_Sudan,0.596631
Yemen,0.003429
Trinidad_and_Tobago,9.964379
Democratic_Republic_of_the_Congo,1.977173


In [184]:
list_continent = []

for region in df_region.index.values.tolist():
    continent = cdf[cdf['region'] == region]['continent'].values[0]

    list_continent.append(continent)

df_region['continent'] = list_continent

df_region

Unnamed: 0,14-day incidence change within year,continent
India,33.109055,Asia
Ireland,80.420208,Europe
Turks_and_Caicos_islands,52.364246,America
Togo,3.167392,Africa
Laos,0.027896,Asia
...,...,...
South_Sudan,0.596631,Africa
Yemen,0.003429,Asia
Trinidad_and_Tobago,9.964379,America
Democratic_Republic_of_the_Congo,1.977173,Africa


In [187]:
df_region['region'] = df_region.index

In [193]:
grps = df_region.groupby(['continent'])

increase = grps.max()

increase

Unnamed: 0_level_0,14-day incidence change within year,region
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,139.107096,Zimbabwe
America,909.950276,Venezuela
Asia,557.933056,Yemen
Europe,1388.098666,United_Kingdom
Oceania,531.714915,Vanuatu


In [195]:
lat = [-22.21667, 4.60226, 12.64881, 50.10319, -19.55]

lng = [25.83066, -72.55212, 42.95452, -7.64133, 167.16235]

increase['lat'] = lat

increase['lng'] = lng

increase

Unnamed: 0_level_0,14-day incidence change within year,region,lat,lng
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,139.107096,Zimbabwe,-22.21667,25.83066
America,909.950276,Venezuela,4.60226,-72.55212
Asia,557.933056,Yemen,12.64881,42.95452
Europe,1388.098666,United_Kingdom,50.10319,-7.64133
Oceania,531.714915,Vanuatu,-19.55,167.16235


In [210]:
fig = px.scatter_mapbox(
    increase, 
    lat='lat', 
    lon='lng', 
    hover_name='14-day incidence change within year',
    zoom=1,
    size='14-day incidence change within year',
    mapbox_style="stamen-terrain"
)
fig.show()


In [197]:
decrease = grps.min()

decrease

Unnamed: 0_level_0,14-day incidence change within year,region
continent,Unnamed: 1_level_1,Unnamed: 2_level_1
Africa,-9.166862,Algeria
America,-5.574505,Anguilla
Asia,-18.603108,Afghanistan
Europe,-613.496933,Albania
Oceania,-5.102822,Australia


In [214]:
lat = [36.737232,18.227230,34.543896,41.327953,-33.865143]
lng = [3.086472,-63.048988,69.160652,19.819025,151.209900]

decrease['lat'] = lat

decrease['lng'] = lng

decrease

Unnamed: 0_level_0,14-day incidence change within year,region,lat,lng
continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Africa,-9.166862,Algeria,36.737232,3.086472
America,-5.574505,Anguilla,18.22723,-63.048988
Asia,-18.603108,Afghanistan,34.543896,69.160652
Europe,-613.496933,Albania,41.327953,19.819025
Oceania,-5.102822,Australia,-33.865143,151.2099


In [225]:
fig = px.scatter_mapbox(
    decrease, 
    lat='lat', 
    lon='lng', 
    hover_name='14-day incidence change within year',
    zoom=1,
    size= decrease["14-day incidence change within year"].abs(),
    size_max=50,
    color_discrete_sequence=["red"],
    mapbox_style="stamen-terrain"
)
fig.show()


Which country showed the highest/lowest fluctuation in `14d-incidence` within a year?

In [226]:
def calculate_fluctuation(df, col_meta = 'region', col = '14-day incidence'):

    set_of_meta_values = set(df[col_meta])

    dict = {}

    for meta_value in set_of_meta_values:

        subset = df[df[col_meta] == meta_value]

        sd = np.std(subset[col])

        dict.update({meta_value: sd})

    return dict

In [228]:
dict_fluct = calculate_fluctuation(cdf_cleaned)

print(f'The country with the highest fluctutation is {max(dict_fluct)}.')
print(f'The country with the lowest fluctutation is {min(dict_fluct)}.')

The country with the highest fluctutation is Zimbabwe.
The country with the lowest fluctutation is Afghanistan.


Create a line plot showing the `14-incidence` for all European countries. Use `groupby` operation to generate the data list for the plotly plot. 

In [246]:
subset_europe = cdf_cleaned[cdf_cleaned['continent'] == 'Europe']

fig = px.line(subset_europe, x="date", y="14-day incidence", color='region')
fig.show()

Create a smoothed version of the `14d-incidence` by averaging 3 months.

In [248]:
from collections import deque

In [249]:
def sliding_window(y_data, window_size):

    sliding_window = deque([], maxlen = window_size)
    mean_list = []
    for y in y_data:
        sliding_window.append(y)
        mean = np.mean(list(sliding_window))
        mean_list.append(mean)
    return mean_list

In [252]:
mean_list = sliding_window(cdf_cleaned['14-day incidence'], 90)

cdf_averaged = cdf_cleaned.copy()

cdf_averaged['14-day incidence'] = mean_list

cdf_averaged

Unnamed: 0,date,day,month,year,cases,deaths,region,Geographical ID,Region Code,population (2019),continent,14-day incidence,deltaTime_since_start_of_recording
0,2020-12-14,14,12,2020,746,6,Afghanistan,AF,AFG,38041757.0,Asia,9.013779,349 days
1,2020-12-13,13,12,2020,298,9,Afghanistan,AF,AFG,38041757.0,Asia,8.033278,348 days
2,2020-12-12,12,12,2020,113,11,Afghanistan,AF,AFG,38041757.0,Asia,7.645108,347 days
3,2020-11-12,11,12,2020,63,10,Afghanistan,AF,AFG,38041757.0,Asia,7.517397,317 days
4,2020-10-12,10,12,2020,202,16,Afghanistan,AF,AFG,38041757.0,Asia,7.407649,286 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...
61882,2020-07-04,07,04,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.601248,186 days
61883,2020-06-04,06,04,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.585695,156 days
61884,2020-05-04,05,04,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.575149,125 days
61885,2020-04-04,04,04,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.564073,95 days


In [253]:
subset_europe = cdf_averaged[cdf_averaged['continent'] == 'Europe']

fig = px.line(subset_europe, x="date", y="14-day incidence", color='region')
fig.show()

Create a radial plot of death rate / 100000 people (see popData2019), where one year completes a circle, i.e. 360˚. Visualize the recored years for Italy, Germany, Sweden and Greece. Hint you might need to turn the dateTime into `day within the year` (%j) and adjust 365 to 360 degrees. 

In [257]:
cdf_cleaned['death rate'] = cdf_cleaned.apply(lambda x: x['deaths']/x['population (2019)']*100000, axis=1)

cdf_cleaned



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,date,day,month,year,cases,deaths,region,Geographical ID,Region Code,population (2019),continent,14-day incidence,deltaTime_since_start_of_recording,death rate
0,2020-12-14,14,12,2020,746,6,Afghanistan,AF,AFG,38041757.0,Asia,9.013779,349 days,0.015772
1,2020-12-13,13,12,2020,298,9,Afghanistan,AF,AFG,38041757.0,Asia,7.052776,348 days,0.023658
2,2020-12-12,12,12,2020,113,11,Afghanistan,AF,AFG,38041757.0,Asia,6.868768,347 days,0.028916
3,2020-11-12,11,12,2020,63,10,Afghanistan,AF,AFG,38041757.0,Asia,7.134266,317 days,0.026287
4,2020-10-12,10,12,2020,202,16,Afghanistan,AF,AFG,38041757.0,Asia,6.968658,286 days,0.042059
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61882,2020-07-04,07,04,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.047796,186 days,0.000000
61883,2020-06-04,06,04,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.047796,156 days,0.000000
61884,2020-05-04,05,04,2020,0,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.047796,125 days,0.000000
61885,2020-04-04,04,04,2020,1,0,Zimbabwe,ZW,ZWE,14645473.0,Africa,0.054624,95 days,0.000000


In [None]:
import plotly.graph_objects as go

In [262]:
subset_radial_plot = cdf_cleaned[(cdf_cleaned['region'] == 'Germany') | (cdf_cleaned['region'] == 'Italy') | (cdf_cleaned['region'] == 'Sweden') | (cdf_cleaned['region'] == 'Greece')]

subset_radial_plot

Unnamed: 0,date,day,month,year,cases,deaths,region,Geographical ID,Region Code,population (2019),continent,14-day incidence,deltaTime_since_start_of_recording,death rate
21880,2020-12-14,14,12,2020,16362,188,Germany,DE,DEU,83019213.0,Europe,341.136696,349 days,0.226454
21881,2020-12-13,13,12,2020,20200,321,Germany,DE,DEU,83019213.0,Europe,334.881517,348 days,0.386657
21882,2020-12-12,12,12,2020,28438,496,Germany,DE,DEU,83019213.0,Europe,328.149341,347 days,0.597452
21883,2020-11-12,11,12,2020,29875,598,Germany,DE,DEU,83019213.0,Europe,320.027124,317 days,0.720315
21884,2020-10-12,10,12,2020,23679,440,Germany,DE,DEU,83019213.0,Europe,311.512228,286 days,0.529998
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54291,2020-01-17,17,01,2020,0,0,Sweden,SE,SWE,10230185.0,Europe,0.000000,17 days,0.000000
54292,2020-01-16,16,01,2020,0,0,Sweden,SE,SWE,10230185.0,Europe,0.000000,16 days,0.000000
54293,2020-01-15,15,01,2020,0,0,Sweden,SE,SWE,10230185.0,Europe,0.000000,15 days,0.000000
54294,2020-01-14,14,01,2020,0,0,Sweden,SE,SWE,10230185.0,Europe,0.000000,14 days,0.000000


In [263]:
subset_radial_plot['theta'] = subset_radial_plot['deltaTime_since_start_of_recording']/360



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [265]:
px.line_polar(subset_radial_plot, r='death rate', theta='theta', color='region')

# Illuminaty confirmed.