In [1]:
# import libraries
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pandas.io.json import json_normalize
import urllib.request
from datetime import datetime

#from IPython.display import Image, display, HTML
from IPython.display import display, HTML
import ipywidgets as widgets
from ipywidgets import interact
#import scipy

#import chart_studio.plotly as py
#import plotly.graph_objs as go
#from plotly.offline import iplot, init_notebook_mode
#init_notebook_mode(connected=True)

#import cufflinks as cf
#cf.go_offline(connected=True)
#cf.set_config_file(colorscale='plotly', world_readable=True)

# Extra options
#pd.options.display.max_rows = 30
#pd.options.display.max_columns = 25

# Show all code cells outputs
#from IPython.core.interactiveshell import InteractiveShell
#InteractiveShell.ast_node_interactivity = 'all'
#
#pd.set_option('display.max_rows', None)

In [2]:
# constants
# ECDC open-data endpoint read as JSON by the loading cell.
path_source = "https://opendata.ecdc.europa.eu/covid19/casedistribution/json/"
# Custom CSV used to fill in populations the main source leaves blank.
# The original value started with "https://https://", which is not a valid URL.
path_countries_with_no_population = "https://raw.githubusercontent.com/JoanFerrerSerrat/Reports/master/Source/countries_with_no_population.csv"
# Custom CSV used to shorten country names.
# NOTE(review): this is the same file as path_countries_with_no_population, yet the cleansing
# cell reads "long_name"/"short_name" columns from it -- confirm the intended file name.
path_countries_shorter_names = "https://raw.githubusercontent.com/JoanFerrerSerrat/Reports/master/Source/countries_with_no_population.csv"

In [3]:
# read main source
# The context manager closes the HTTP response even if reading fails.
with urllib.request.urlopen(path_source) as response:
    fileWeb = response.read()
coronavirus_json_raw = json.loads(fileWeb)
# The payload keeps the rows under the "records" key; flatten them into a DataFrame.
coronavirus_initial_df = json_normalize(coronavirus_json_raw['records'])

# custom data for some countries without population
country_without_population_df = pd.read_csv(path_countries_with_no_population, sep=';', encoding='utf-8')

# custom data to shorten country names
country_shorter_names_df = pd.read_csv(path_countries_shorter_names, sep=';', encoding='utf-8')

In [4]:
# Cleansing dataset

# Keep only the needed columns. The explicit copy decouples the working frame from
# coronavirus_initial_df, so the assignments below never write into a slice of it.
coronavirus_df = coronavirus_initial_df.loc[:, ["countriesAndTerritories", "dateRep", "cases", "deaths", "popData2018"]].copy()

# rename columns
rename_column = {"countriesAndTerritories" : "country", "dateRep" : "date", "popData2018": "population"}
coronavirus_df = coronavirus_df.rename(columns=rename_column)

# Blank or missing populations become "0" so the float cast below succeeds;
# those zeros are replaced from the custom population table further down.
coronavirus_df["population"] = np.where((coronavirus_df["population"] == "") | (coronavirus_df["population"].isna()), "0", coronavirus_df["population"])

# shorten country names: rows with a match in the custom table take its short name
coronavirus_df = pd.merge(coronavirus_df, country_shorter_names_df, left_on="country", right_on="long_name", how="left")
coronavirus_df["country"] = np.where(coronavirus_df["long_name"].notna(), coronavirus_df["short_name"], coronavirus_df["country"])
coronavirus_df = coronavirus_df.drop(columns=["long_name", "short_name"])

# convert to the proper data type
datatype_ict={'cases':int, 'deaths':int, 'population':float}
coronavirus_df = coronavirus_df.astype(datatype_ict)

# fill the blanks in population from the custom table; rows that still have no population are dropped
coronavirus_df = pd.merge(coronavirus_df, country_without_population_df, on="country", how="left", suffixes=['','_temp'])
coronavirus_df["population"] = np.where(coronavirus_df["population"] == 0, coronavirus_df["population_temp"], coronavirus_df["population"])
coronavirus_df = coronavirus_df.loc[coronavirus_df["population"].notna()].drop(columns="population_temp")

# cast to datetime (vectorised parse of the day/month/year strings)
coronavirus_df['date'] = pd.to_datetime(coronavirus_df['date'], format='%d/%m/%Y')

# sort by country and date; the aggregation cell takes "last" per country and relies on this order
coronavirus_df = coronavirus_df.sort_values(by = ['country', 'date'])

All metrics divided by population are scaled by `escala` = 100,000 (e.g. deaths per 100,000 people).

In [5]:
# Scale factor applied to every per-population metric (values are expressed per 100000 people).
escala = 100000

# Per-country totals and most recent values. "last" picks the final row of each group,
# so it relies on coronavirus_df already being sorted by country and date.
# A dict of lists replaces the nested-dict renaming form, which pandas deprecated and
# later removed; the two-level column index is flattened to "<column>_<function>" below.
coronavirus_agg_df = coronavirus_df.groupby(["country"]).agg(
                             {"cases": ["sum", "last"],
                              "deaths": ["sum", "last"],
                              "population": ["mean"]})

coronavirus_agg_df.columns = ["_".join(column_levels) for column_levels in coronavirus_agg_df.columns]
# Keep the plain "population" name that the cells below filter on.
coronavirus_agg_df = coronavirus_agg_df.rename(columns={"population_mean": "population"})

# Each metric expressed per `escala` inhabitants.
for _metric_name in ("cases_sum", "cases_last", "deaths_sum", "deaths_last"):
    coronavirus_agg_df[f"{_metric_name}/pop"] = escala * (coronavirus_agg_df[_metric_name] / coronavirus_agg_df["population"])

# Turn the "country" group key back into an ordinary column.
coronavirus_agg_df = coronavirus_agg_df.reset_index()

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [8]:
# metric_order_by control: every numeric column except population can be used for ordering
list_columns_can_be_ordered = coronavirus_agg_df.select_dtypes(np.number).columns.tolist()
list_columns_can_be_ordered.remove('population')
# Filled by the callback below; the time-series cell reads it to build its country selector.
country_list = []
default_metric = "deaths_sum/pop"

# min_pop slider settings
min_n_population = 0
max_n_population = 150000000
step_n_population = 100000
default_n_population = 40000000

# list columns shown in table (remove population)
list_columns_show = list(coronavirus_agg_df.columns)
list_columns_show.remove("population")

@interact
def show_articles_more_than(metric_order_by=widgets.Dropdown(options=list_columns_can_be_ordered,
                                                             value=default_metric),
                            min_pop=widgets.IntSlider(min=min_n_population,
                                                      max=max_n_population,
                                                      step=step_n_population,
                                                      value=default_n_population)):
    """Show the countries above a population threshold, ranked by one metric.

    Displays a heading, the ranked table (without the population column) and a
    line plot of the first ten countries, and stores the ranked country names
    in the module-level ``country_list``.

    Parameters
    ----------
    metric_order_by : str
        Column of ``coronavirus_agg_df`` used to order the countries, descending.
    min_pop : int
        Only countries whose population is strictly greater than this are kept.
    """
    global country_list

    # Work on a sorted, filtered copy so the shared aggregate frame is not reordered
    # on every widget change; the fresh index makes the table show the rank.
    ranked_df = coronavirus_agg_df.sort_values(by = metric_order_by, ascending = False)
    ranked_df = ranked_df.loc[ranked_df["population"] > min_pop].reset_index(drop=True)

    display(HTML(f'<h2>Ordered by the column {metric_order_by}. Countries with more than {min_pop} people</h2>'))
    display(ranked_df[list_columns_show])
    country_list = ranked_df["country"].to_list()

    # Nothing to plot when no country passes the population filter.
    if ranked_df.empty:
        return

    ranked_df.loc[:, ["country", metric_order_by]].head(10).plot(x = "country",
                                                                 y = metric_order_by,
                                                                 figsize = (50, 10))
    plt.xticks(fontsize=30, rotation=90)
    plt.gca().margins(x=0)
    plt.yticks(fontsize=30)

interactive(children=(Dropdown(description='metric_order_by', index=5, options=('cases_sum', 'cases_last', 'de…

The selector below offers the countries that were shown in the previous table and chart.

In [7]:
# metric_: every numeric column except population can be plotted
list_metrics = coronavirus_df.select_dtypes(np.number).columns.tolist()
list_metrics.remove('population')

metric_default = "deaths"
metric_description = "Metric"

# date_from
date_from_description = 'From Date'
date_from_disabled = False

# date_to
date_to_description = 'To Date'
date_to_disabled = False

# kind_plot
list_graphic_kind = ["swarm", "point", "bar"]
plot_description = "Plot kind"
plot_kind_default = "point"

# countries: options come from country_list, which the ranking cell above fills in
countries_description = "Countries"
countries_default = country_list[:2]


@interact
def plot_temporal(metric_   = widgets.Dropdown( options = list_metrics,
                                                description = metric_description,
                                                value = metric_default ),
                  date_from = widgets.DatePicker( description = date_from_description,
                                                  disabled = date_from_disabled ),
                  date_to   = widgets.DatePicker( description = date_to_description,
                                                  disabled = date_to_disabled ),
                  kind_plot = widgets.Dropdown( options = list_graphic_kind,
                                                description = plot_description,
                                                value = plot_kind_default ),
                  countries = widgets.SelectMultiple( description = countries_description,
                                                      value = countries_default,
                                                      options = country_list )):
    """Plot one metric over time for the selected countries.

    Parameters
    ----------
    metric_ : str
        Column of ``coronavirus_df`` to plot.
    date_from, date_to : date-like or None
        Optional bounds picked by the user. Each bound is clamped to the first/last
        date on which the selected countries have a value above zero for the metric;
        ``None`` means "use that first/last date".
    kind_plot : str
        ``kind`` passed to ``sns.catplot``.
    countries : sequence of str
        Country names to include.
    """
    display(countries)

    in_selected_countries = coronavirus_df["country"].isin(countries)
    dates_with_data = coronavirus_df.loc[in_selected_countries & (coronavirus_df[metric_] > 0), "date"]
    if dates_with_data.empty:
        display(HTML('<p>No data to plot for the current selection.</p>'))
        return

    # interact passes the picker values themselves (or None), so they are used directly.
    # pd.Timestamp converts them without going through the name `datetime`, which in this
    # notebook is bound to the datetime class rather than the module.
    dt_from = dates_with_data.min()
    dt_to = dates_with_data.max()
    if date_from is not None:
        dt_from = max(pd.Timestamp(date_from), dt_from)
    if date_to is not None:
        dt_to = min(pd.Timestamp(date_to), dt_to)

    # Build the plotting frame locally so the shared coronavirus_df does not gain a
    # helper column every time a widget changes. `between` includes both bounds.
    in_date_range = coronavirus_df["date"].between(dt_from, dt_to)
    plot_df = coronavirus_df.loc[in_selected_countries & in_date_range]
    if plot_df.empty:
        display(HTML('<p>No data to plot for the current selection.</p>'))
        return
    plot_df = plot_df.assign(date_plot = plot_df["date"].dt.strftime('%d-%m-%Y'))

    chart = sns.catplot(x = 'date_plot', y = metric_, hue = 'country', kind = kind_plot,
                        data = plot_df, legend_out=False)
    chart.fig.set_size_inches(40,10)
    chart.set_xticklabels(rotation=90, horizontalalignment='center', fontsize=20)
    chart.set_yticklabels(fontsize=20)
    plt.xlabel("")
    plt.ylabel(metric_, fontsize=24)
    plt.legend(loc='upper left')

interactive(children=(Dropdown(description='Metric', index=1, options=('cases', 'deaths'), value='deaths'), Da…