# Coronavirus in the world

This report uses three datasets that are online:
<ul>
    <li>The main dataset provides data in JSON format with the number of cases and deaths by country and date. It is external and is updated many times a day.</li>
    <li>A dataset created by me provides the population of the countries that lack it in the main dataset.</li>
    <li>A dataset created by me shortens the names of some countries.</li>
</ul>

First it shows a table and a plot. It uses a dataset summarized by country, to which the measures divided by population (scaled per 100,000 people) are added. These are the parameters: 
<ul>
    <li>Metric to be ordered by&nbsp;(string) : It orders by the metric chosen.</li>
    <li>Min pop/country&emsp;&emsp;&emsp;&ensp;(int)&emsp;&nbsp; : It filters by the countries with a minimum population</li>
    <li>Top n rows&emsp;&emsp;&emsp;&emsp;&emsp;&ensp;&nbsp;(int)&emsp;&nbsp; : It filters by the number of top countries</li>
</ul>

Afterwards it shows a plot. It uses a dataset with a granularity of country and date, filtered from the start to the countries selected in the first dataset. These are the parameters:
<ul>
    <li>Metric&emsp;&emsp; (string) : Metric to be analyzed.</li>
    <li>From Date (date)&ensp; : Date from which data is shown. By default it is the earliest date on which the selected metric is above 0 in any of the selected countries.</li>
    <li>To Date&emsp;&nbsp; (date)&ensp; : Date up to which data is shown. By default it is the latest date on which the selected metric is above 0 in any of the selected countries.</li>
    <li>Plot Kind&ensp;&nbsp; (string) : Plot kind applied to plot.</li>
    <li>Countries&ensp; (string) : Countries to be analyzed. It is restricted to the countries selected in the previous table and plot.</li>
</ul>

In [None]:
# run only once
#import sys
#!{sys.executable} -m pip install pandas

In [None]:
# run only once
#import sys
#!{sys.executable} -m pip install matplotlib

In [None]:
# run only once
#import sys
#!{sys.executable} -m pip install seaborn

In [17]:
# import libraries
import datetime
import json
import urllib.request
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

try:
    # pandas >= 1.0 exposes json_normalize at the top level
    from pandas import json_normalize
except ImportError:
    # older pandas only provides it under pandas.io.json
    from pandas.io.json import json_normalize

from IPython.display import display, HTML
import ipywidgets as widgets
from ipywidgets import interact

In [18]:
# constants
# Main dataset: fetched over HTTP and parsed as JSON; the rows live under its 'records' key.
path_source = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
# Custom ';'-separated CSV that supplies a population for countries missing one in the main dataset.
path_countries_with_no_population = "https://raw.githubusercontent.com/JoanFerrerSerrat/Reports/master/Source/countries_with_no_population.csv"
# Custom ';'-separated CSV that maps long country names (long_name) to shorter ones (short_name).
path_countries_shorter_names = "https://raw.githubusercontent.com/JoanFerrerSerrat/Reports/master/Source/countries_shorter_names.csv"

In [19]:
# read main source
# The response is used as a context manager so the HTTP connection is always
# closed, even if reading the payload fails.
with urllib.request.urlopen(path_source) as response:
    fileWeb = response.read()
coronavirus_json_raw = json.loads(fileWeb)
# one row per entry of the 'records' list
coronavirus_initial_df = json_normalize(coronavirus_json_raw['records'])

# custom data for some countries without population 
country_without_population_df = pd.read_csv(path_countries_with_no_population, sep=';', encoding='utf-8')

# custom data to shorten country names
country_shorter_names_df = pd.read_csv(path_countries_shorter_names, sep=';', encoding='utf-8')

In [20]:
# Cleansing dataset

# just needed columns; take an explicit copy so the in-place edits below work on
# an independent frame and never raise chained-assignment warnings
coronavirus_df = coronavirus_initial_df.loc[:, ["countriesAndTerritories", "dateRep", "cases", "deaths", "popData2018"]].copy()

# rename columns
rename_column = {"countriesAndTerritories" : "country", "dateRep" : "date", "popData2018": "population"}
coronavirus_df.rename(columns=rename_column, inplace=True)

# a missing population (empty string or NaN) becomes "0" so the float cast below succeeds
coronavirus_df["population"] = np.where((coronavirus_df["population"] == "") | (coronavirus_df["population"].isna()), "0", coronavirus_df["population"])

# shorten country names: use short_name wherever the long name has a match
coronavirus_df = pd.merge(coronavirus_df, country_shorter_names_df, left_on="country", right_on="long_name", how="left")
coronavirus_df["country"] = np.where(coronavirus_df["long_name"].notna(), coronavirus_df["short_name"], coronavirus_df["country"])
coronavirus_df.drop(columns=["long_name", "short_name"], inplace=True)

# convert to the proper data type
datatype_ict={'cases':int, 'deaths':int, 'population':float}
coronavirus_df = coronavirus_df.astype(datatype_ict)

# fill blanks in population: a population of 0 is replaced by the custom value (NaN when there is none)
coronavirus_df = pd.merge(coronavirus_df, country_without_population_df, on="country", how="left", suffixes=('', '_temp'))
coronavirus_df["population"] = np.where(coronavirus_df["population"] == 0, coronavirus_df["population_temp"], coronavirus_df["population"])
# rows that still have no population are dropped; copy so later column writes
# do not target a filtered view
coronavirus_df = coronavirus_df[coronavirus_df["population"].notna()].copy()
coronavirus_df.drop(columns=["population_temp"], inplace=True)

# cast to datetime (vectorized parse of the dd/mm/yyyy strings)
coronavirus_df['date'] = pd.to_datetime(coronavirus_df['date'], format='%d/%m/%Y')

# sort by country and date
coronavirus_df.sort_values(by = ['country', 'date'], inplace = True)

All the metrics divided by population use a scale of 100,000, e.g. deaths per 100,000 people.

In [28]:
# every "/pop" measure below is expressed per `scale` inhabitants
scale = 100000

# Calculate aggregates by country.
# "last" picks each country's final row, which is its most recent date because
# coronavirus_df is sorted by country and date.
# The string "mean" is used instead of np.mean: same result, but it avoids the
# warning newer pandas versions emit for NumPy callables inside agg().
coronavirus_agg_df = coronavirus_df.groupby(["country"]).agg(
                             {"cases": ["sum", "last"], 
                              "deaths": ["sum", "last"],
                              "population" : "mean"}).reset_index()

# replace the two-level column index produced by agg() with flat names
coronavirus_agg_df.columns = ["country", "cases_sum", "cases_last", "deaths_sum", "deaths_last", "population"]

# relative measures to population
for _measure in ("cases_sum", "cases_last", "deaths_sum", "deaths_last"):
    coronavirus_agg_df[_measure + "/pop"] = scale * (coronavirus_agg_df[_measure] / coronavirus_agg_df["population"])

It shows a table and a plot. It uses a dataset summarized by country, to which the measures divided by population (scaled per 100,000 people) are added. These are the parameters: 
<ul>
    <li>Metric to be ordered by&nbsp;(string) : It orders by the metric chosen.</li>
    <li>Min pop/country&emsp;&emsp;&emsp;&ensp;(int)&emsp;&nbsp;   : It filters by the countries with a minimum population</li>
    <li>Top n rows&emsp;&emsp;&emsp;&emsp;&emsp;&ensp;&nbsp;(int)&emsp;&nbsp;   : It filters by the number of top countries</li>
</ul>
The countries selected in this dataset will also be pre-filtered in the next plot.

In [14]:

from ipywidgets import HBox, Label

# shared style dictionary handed to every widget below
style = dict(description_width='initial')

# --- "metric_order_by" dropdown --------------------------------------------
# sortable columns: every numeric column of the aggregate table but population
list_columns_can_be_ordered = list(coronavirus_agg_df.select_dtypes(np.number).columns)
list_columns_can_be_ordered.remove('population')
description_metric, default_metric = "Metric to be ordered by", "deaths_sum/pop"
# filled in by the table callback with the countries it currently shows
country_list = []

# --- "min_pop" slider --------------------------------------------------------
description_n_population = "Min pop/country"
min_n_population, max_n_population = 0, 150000000
step_n_population, default_n_population = 100000, 40000000

# --- "top" number box --------------------------------------------------------
top_n_rows_description = "Top n rows"
top_n_rows_min, top_n_rows_default = 2, 30
top_n_rows_disabled = False

# columns displayed in the table: all of them except population
list_columns_show = coronavirus_agg_df.columns.tolist()
list_columns_show.remove("population")

@interact
def show_metric_by_country(metric_order_by=widgets.Dropdown(options = list_columns_can_be_ordered,
                                                             description = description_metric,
                                                             style = style,
                                                             value = default_metric), 
                            min_pop=widgets.IntSlider(min = min_n_population, 
                                                      max = max_n_population, 
                                                      description = description_n_population,
                                                      style = style,
                                                      step = step_n_population, 
                                                      value = default_n_population),
                            top=widgets.IntText(value = top_n_rows_default, 
                                                description = top_n_rows_description,
                                                style = style,
                                                disabled = top_n_rows_disabled)):
    """Show the per-country summary as a table and a plot, ordered by one metric.

    Parameters
    ----------
    metric_order_by : str
        Column of ``coronavirus_agg_df`` used to sort, in descending order.
    min_pop : int
        Only countries whose population is strictly greater than this are kept.
    top : int
        Number of rows shown in the table; values below ``top_n_rows_min``
        are raised to that minimum.

    Side effects
    ------------
    Sorts the global ``coronavirus_agg_df`` in place and stores the countries
    shown in the table in the global ``country_list``.
    """
    global country_list
    # never show fewer rows than the configured minimum
    head = max(top, top_n_rows_min)

    coronavirus_agg_df.sort_values(by = metric_order_by, ascending = False, inplace = True)
    # apply the population filter once and reuse it for table, plot and country list
    selected_df = coronavirus_agg_df.loc[coronavirus_agg_df["population"] > min_pop]

    if selected_df.empty:
        # nothing to tabulate or plot; plotting an empty frame would raise
        display(HTML(f'<h2>No country has more than {min_pop} people.</h2>'))
        country_list = []
        return

    display(HTML(f'<h2>Ordered by the column {metric_order_by}.<br/>Countries with more than {min_pop} people.<br/>It retrieves the first {head} rows.</h2>'))
    display(selected_df[list_columns_show].head(head))

    # NOTE(review): the plot is capped at 10 countries regardless of `top`;
    # confirm whether it should follow `head` like the table does.
    selected_df[["country", metric_order_by]].head(10).plot(x = "country",
                                                            y = metric_order_by,
                                                            figsize = (50, 10))
    plt.xticks(fontsize=30, rotation=90)
    plt.gca().margins(x=0)
    plt.yticks(fontsize=30)

    country_list = selected_df["country"].head(head).to_list()

interactive(children=(Dropdown(description='Metric to be ordered by', index=6, options=('cases_sum', 'cases_la…

It shows a plot. It uses a dataset with a granularity of country and date, filtered from the start to the countries selected in the first dataset. These are the parameters:
<ul>
    <li>Metric&emsp;&emsp; (string) : Metric to be analyzed.</li>
    <li>From Date (date)&ensp; : Date from which data is shown. By default it is the earliest date on which the selected metric is above 0 in any of the selected countries.</li>
    <li>To Date&emsp;&nbsp; (date)&ensp; : Date up to which data is shown. By default it is the latest date on which the selected metric is above 0 in any of the selected countries.</li>
    <li>Plot Kind&ensp;&nbsp; (string) : Plot kind applied to plot.</li>
    <li>Countries&ensp; (string) : Countries to be analyzed. It is restricted to the countries selected in the previous table and plot.</li>
</ul>

In [15]:
# --- "metric_" dropdown ------------------------------------------------------
# selectable metrics: numeric columns of the daily dataset, population excluded
list_metrics = list(coronavirus_df.select_dtypes(np.number).columns)
list_metrics.remove('population')
metric_description, metric_default = "Metric", "deaths"

# --- "date_from" / "date_to" pickers -----------------------------------------
date_from_description, date_from_disabled = 'From Date', False
date_to_description, date_to_disabled = 'To Date', False

# --- "kind_plot" dropdown ----------------------------------------------------
plot_description = "Plot Kind"
list_graphic_kind = ["swarm", "point", "bar"]
plot_kind_default = "point"

# --- "countries" multi-select ------------------------------------------------
countries_description = "Countries"
# pre-select the first two countries of the current country_list
countries_default = country_list[:2]


@interact
def show_metric_by_country_date(metric_   = widgets.Dropdown( options = list_metrics, 
                                                description = metric_description, 
                                                style = style,
                                                value = metric_default ),
                  date_from = widgets.DatePicker( description = date_from_description,
                                                  style = style,
                                                  disabled = date_from_disabled ),
                  date_to   = widgets.DatePicker( description = date_to_description,
                                                  style = style,
                                                  disabled = date_to_disabled ),
                  kind_plot = widgets.Dropdown( options = list_graphic_kind, 
                                                description = plot_description,
                                                style = style,
                                                value = plot_kind_default ),
                  countries = widgets.SelectMultiple( description = countries_description, 
                                                      style = style,
                                                      value = countries_default, 
                                                      options = country_list )):
    """Plot one metric per date for the selected countries.

    Parameters
    ----------
    metric_ : str
        Column of ``coronavirus_df`` to plot.
    date_from, date_to : datetime.date or None
        Requested date window. Each bound is clamped to the first/last date on
        which ``metric_`` is above 0 for the selected countries; ``None`` means
        "use that first/last date".
    kind_plot : str
        ``kind`` passed to ``seaborn.catplot``.
    countries : sequence of str
        Countries to plot.

    Side effects
    ------------
    Writes a ``date_plot`` column (dates formatted as dd-mm-YYYY) into the
    global ``coronavirus_df``.
    """
    # rows of the selected countries; computed once and reused below
    in_selection = coronavirus_df.country.isin(countries)
    # select the column by name so min()/max() return scalars instead of a
    # one-element Series that would need positional indexing
    dates_with_data = coronavirus_df.loc[in_selection & (coronavirus_df[metric_] > 0), "date"]

    display(countries)

    if dates_with_data.empty:
        # no country selected, or the metric never rises above 0: nothing to plot
        display(HTML(f'No data above 0 for the metric {metric_} in the selected countries.'))
        return

    min_date_with_data = dates_with_data.min()
    max_date_with_data = dates_with_data.max()

    # clamp the requested window to the range that actually holds data
    if date_from is None:
        dt_from = min_date_with_data
    else:
        dt_from = max(pd.Timestamp(date_from), min_date_with_data)
    if date_to is None:
        dt_to = max_date_with_data
    else:
        dt_to = min(pd.Timestamp(date_to), max_date_with_data)

    # string version of the date, used as the categorical x axis
    coronavirus_df['date_plot'] = coronavirus_df['date'].dt.strftime('%d-%m-%Y')
    plot_df = coronavirus_df.loc[in_selection & (coronavirus_df.date >= dt_from) & (coronavirus_df.date <= dt_to)]

    if plot_df.empty:
        # happens when the chosen "from" date lies after the chosen "to" date
        display(HTML('No data in the selected date range.'))
        return

    chart = sns.catplot(x = 'date_plot', y = metric_, hue = 'country', kind = kind_plot,
                        data = plot_df, legend_out=False)
    chart.fig.set_size_inches(40,10)
    chart.set_xticklabels(rotation=90, horizontalalignment='center', fontsize=20)
    chart.set_yticklabels(fontsize=20)
    plt.xlabel("")
    plt.ylabel(metric_, fontsize=24)
    plt.legend(loc='upper left')
interactive(children=(Dropdown(description='Metric', index=1, options=('cases', 'deaths'), style=DescriptionSt…