In [None]:
# initial setup
# %run "../../../common/0_notebooks_base_setup.py"


---

<img src='../../../common/logo_DH.png' align='left' width=35%/>


# Visualización 2 - bokeh y plotly

---

## Dataset

En esta práctica vamos a usar datos de COVID-19 que disponibiliza Johns Hopkins University en 

https://github.com/CSSEGISandData/COVID-19

Novel Coronavirus COVID-19 (2019-nCoV) Data Repository by Johns Hopkins CSSE (CSSE = Computer Science and Software Engineering)

Los datasets que usamos se actualizan diariamente y llevan el registro de cantidad de confirmados, muertos y recuperados por país y región.

Usando estos datos vamos a construir algunas visualizaciones con las bibliotecas bokeh y plotly.

(Nota: la actualizacion es diaria en mayo 2020.)


## Imports

In [2]:
import pandas as pd
import numpy as np
import datetime

## Leemos los datos desde github

In [3]:
world_confirmed_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
world_deaths_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
world_recovered_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv'

world_confirmed = pd.read_csv(world_confirmed_url)
world_deaths = pd.read_csv(world_deaths_url)
world_recovered = pd.read_csv(world_recovered_url)

In [4]:
world_confirmed.head(3)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,1/14/22,1/15/22,1/16/22,1/17/22,1/18/22,1/19/22,1/20/22,1/21/22,1/22/22,1/23/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,158639,158678,158717,158826,158974,159070,159303,159516,159548,159649
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,228777,230940,232637,233654,236486,239129,241512,244182,246412,248070
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,224979,225484,226057,226749,227559,228918,230470,232325,234536,236670


In [5]:
world_deaths.head(3)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,1/14/22,1/15/22,1/16/22,1/17/22,1/18/22,1/19/22,1/20/22,1/21/22,1/22/22,1/23/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,7376,7378,7379,7381,7383,7386,7386,7390,7390,7393
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,3262,3265,3269,3271,3277,3283,3286,3292,3297,3305
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,6393,6404,6412,6423,6435,6443,6453,6468,6481,6495


In [6]:
world_recovered.head(3)

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,1/14/22,1/15/22,1/16/22,1/17/22,1/18/22,1/19/22,1/20/22,1/21/22,1/22/22,1/23/22
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Ejercicio 1

Construyamos tres DataFrame (confirmed_data, deaths_data, recovered_data) con las siguiente columnas
* Country, de tipo string
* Fecha, de tipo timestamp
* Cantidad, de tipo int

Como esta operación tenemos que repetirla para los tres DataFrame, definamos una función que lo resuelva.

Ayuda: usemos la función `melt` de pandas

https://pandas.pydata.org/docs/reference/api/pandas.melt.html


In [7]:
def prepare_data(raw_data):

    # vemos cuáles son las columnas en el DataFrame
    #print(world_deaths.columns)

    # armamos una lista con las columnas que queremos exluir
    exclude_columns = ['Province/State', 'Country/Region', 'Lat', 'Long']

    id_vars = ['Country/Region']
    value_vars = [element for element in raw_data.columns if element not in exclude_columns]
    result = pd.melt(raw_data, id_vars=id_vars, value_vars=value_vars)

    result["Fecha"] = result.variable.apply(lambda x: pd.Timestamp(x))

    result = result.drop("variable", axis = 1)

    result = result.rename({'value': 'Cantidad', 'Country/Region': 'Country'}, axis=1)
    
    return result

#variable_timestamp
#world_deaths_latam_plot["variable_timestamp"] = variable_timestamp

#world_deaths_latam_plot.variable.apply(lambda x: pd.Timestamp(x))

world_deaths_melt = prepare_data(world_deaths)
world_deaths_melt.head(3)

Unnamed: 0,Country,Cantidad,Fecha
0,Afghanistan,0,2020-01-22
1,Albania,0,2020-01-22
2,Algeria,0,2020-01-22


In [8]:
world_recovered_melt = prepare_data(world_recovered)
world_recovered_melt.head(3)

Unnamed: 0,Country,Cantidad,Fecha
0,Afghanistan,0,2020-01-22
1,Albania,0,2020-01-22
2,Algeria,0,2020-01-22


In [9]:
world_confirmed_melt = prepare_data(world_confirmed)
world_confirmed_melt.head(3)

Unnamed: 0,Country,Cantidad,Fecha
0,Afghanistan,0,2020-01-22
1,Albania,0,2020-01-22
2,Algeria,0,2020-01-22


In [10]:
world_deaths_melt.dtypes

Country             object
Cantidad             int64
Fecha       datetime64[ns]
dtype: object

In [11]:
world_recovered_melt.dtypes

Country             object
Cantidad             int64
Fecha       datetime64[ns]
dtype: object

In [12]:
world_confirmed_melt.dtypes

Country             object
Cantidad             int64
Fecha       datetime64[ns]
dtype: object

## Ejercicio 2 - Preparación de datos

Vamos a construir dos DataFrames.

Elijamos los 10 países con mayor cantidad de infectados para la fecha 2 de mayo de 2020.

**DataFrame 1: data_confirmed_countries**

Construyamos un dataset con los datos de casos confirmados para esos países y los campos:

* Country, de tipo string
* Fecha, de tipo timestamp
* Cantidad, de tipo int
* Delta_Days, de tipo int que representa la cantidad de días desde el primer confirmado en alguno de los 10 países

**DataFrame 2: data_confirmed_recovered_countries**

Combinemos los datos de infectados y recuperados para los 10 países que elegimos arriba.

Los campos deben ser:
* Country, de tipo string
* Fecha, de tipo timestamp
* Cantidad_Recovered, de tipo int
* Cantidad_Confirmed, de tipo int


**DataFrame 1: data_confirmed_countries**

In [13]:
fecha_mask = world_confirmed_melt.Fecha == pd.Timestamp(day=2, month=5, year=2020)
world_confirmed_fecha = world_confirmed_melt.loc[fecha_mask, :]
world_confirmed_fecha_sort = world_confirmed_fecha.sort_values(by="Cantidad", ascending = False)
countries_to_plot = world_confirmed_fecha_sort["Country"][0:10].values
countries_to_plot

array(['US', 'Spain', 'Italy', 'United Kingdom', 'France', 'Germany',
       'Turkey', 'Russia', 'Brazil', 'Iran'], dtype=object)

In [14]:
world_confirmed_countries_mask = [ x in countries_to_plot for x in world_confirmed_melt.Country]
data_confirmed_countries = world_confirmed_melt.loc[world_confirmed_countries_mask, :]

Calculamos la fecha del primer caso confirmado en esos países.

Para eso vemos cuál es la fecha mínima para los registros cuyo valor en el campo Cantidad es mayor que 0.

In [15]:
data_confirmed_countries_gt0_mask = data_confirmed_countries.Cantidad > 0
data_confirmed_countries_gt0_fecha = data_confirmed_countries.loc[data_confirmed_countries_gt0_mask, "Fecha"]
fecha_umbral = data_confirmed_countries_gt0_fecha.min()
fecha_umbral

Timestamp('2020-01-22 00:00:00')

Creo la columna Delta_Days en el DataFrame original:

In [16]:
delta = world_confirmed_melt.Fecha - fecha_umbral
world_confirmed_melt["Delta_Days"] = [x.days for x in delta]
data_confirmed_countries = world_confirmed_melt.loc[world_confirmed_countries_mask, :]
data_confirmed_countries

Unnamed: 0,Country,Cantidad,Fecha,Delta_Days
30,Brazil,0,2020-01-22,0
119,France,0,2020-01-22,0
120,France,0,2020-01-22,0
121,France,0,2020-01-22,0
122,France,0,2020-01-22,0
...,...,...,...,...
205226,United Kingdom,20425,2022-01-23,732
205227,United Kingdom,155,2022-01-23,732
205228,United Kingdom,4,2022-01-23,732
205229,United Kingdom,5524,2022-01-23,732


In [17]:
data_confirmed_countries.dtypes

Country               object
Cantidad               int64
Fecha         datetime64[ns]
Delta_Days             int64
dtype: object

**DataFrame 2: data_confirmed_recovered_countries**

Comenazamos haciendo un join de los dos DataFrame

In [18]:

world_confirmed_melt.index = [world_confirmed_melt.Country, world_confirmed_melt.Fecha]

world_confirmed_melt_join = world_confirmed_melt.drop(['Country', 'Fecha', 'Delta_Days'], axis = 1)

world_recovered_melt.index = [world_recovered_melt.Country, world_recovered_melt.Fecha]
world_recovered_melt_join = world_recovered_melt.drop(['Country', 'Fecha'], axis = 1)
world_recovered_melt_join

data_confirmed_recovered = world_confirmed_melt_join.join(world_recovered_melt_join, lsuffix='_Confirmed', rsuffix='_Recovered')
data_confirmed_recovered
data_confirmed_recovered = data_confirmed_recovered.reset_index(drop = False)
#data_confirmed_recovered

country_mask = [ x in countries_to_plot for x in data_confirmed_recovered.Country]
data_confirmed_recovered_countries = data_confirmed_recovered.loc[country_mask, :]
data_confirmed_recovered_countries


Unnamed: 0,Country,Fecha,Cantidad_Confirmed,Cantidad_Recovered
63038,Brazil,2020-01-22,0,0
63039,Brazil,2020-01-23,0,0
63040,Brazil,2020-01-24,0,0
63041,Brazil,2020-01-25,0,0
63042,Brazil,2020-01-26,0,0
...,...,...,...,...
1276148,United Kingdom,2022-01-23,15859288,0
1276149,United Kingdom,2022-01-23,15859288,0
1276150,United Kingdom,2022-01-23,15859288,0
1276151,United Kingdom,2022-01-23,15859288,0


# Ejercicio 3 - bokeh

Grafiquemos usando la paleta Category20 de bokeh un gráfico de lineas y puntos de cantidad de días desde el primer confirmado vs. cantidad de confirmados, en escala logarítmica, usando los datos del primer DatFrame resultado del ejercicio 3

In [19]:
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.palettes import Category20
from bokeh.models import HoverTool


In [20]:
Category20[20]

('#1f77b4',
 '#aec7e8',
 '#ff7f0e',
 '#ffbb78',
 '#2ca02c',
 '#98df8a',
 '#d62728',
 '#ff9896',
 '#9467bd',
 '#c5b0d5',
 '#8c564b',
 '#c49c94',
 '#e377c2',
 '#f7b6d2',
 '#7f7f7f',
 '#c7c7c7',
 '#bcbd22',
 '#dbdb8d',
 '#17becf',
 '#9edae5')

In [21]:
keys = data_confirmed_countries["Country"].unique()

colors = [Category20[20][i] for i in range(len(keys)) ]

colormap = dict(zip(keys, colors))
colormap


{'Brazil': '#1f77b4',
 'France': '#aec7e8',
 'Germany': '#ff7f0e',
 'Iran': '#ffbb78',
 'Italy': '#2ca02c',
 'Russia': '#98df8a',
 'Spain': '#d62728',
 'Turkey': '#ff9896',
 'US': '#9467bd',
 'United Kingdom': '#c5b0d5'}

In [22]:

#Specify the selection tools to be made available
select_tools = ['box_zoom', 'pan', 'wheel_zoom', 'reset', 'crosshair', 'save']

# Format the tooltip
tooltips = [
    ('', '@Country'),
    ("(x,y)", "($x, $y)")
]

# Create a blank figure with labels
p = figure(y_axis_type="log",
           plot_width = 800, plot_height = 600, 
           title = 'Confirmed in top 10 countries',
           x_axis_label = 'Days from first confirmed', 
           y_axis_label = 'Confirmed count',
           tools=select_tools)

## en el checkpoint dibujar solo dos series

for country in colormap.keys():
    country_mask =  data_confirmed_countries['Country'] == country
    country_data = data_confirmed_countries.loc[country_mask]
    
    # Add line glyph
    #p.line('Delta_Days', 'Cantidad', line_width = 2,
    #       color = colormap[country], alpha=0.8, 
    #       legend_label = country, source = country_data)
    
    # Add circle glyph
    p.circle('Delta_Days', 'Cantidad', size = 6,
        color = colormap[country], 
        legend_label = country, source = country_data)

p.legend.location = "top_left"
p.legend.click_policy='hide'

p.add_tools(HoverTool(tooltips=tooltips))


# Set to output the plot in the notebook
output_notebook()
# Show the plot
show(p)

## Ejercicio 4 - plotly

https://chart-studio.plotly.com

Usemos una animación para mostrar la cantidad de confirmados vs. cantidad de recuperados para la última semana de datos.

Para ello, filtremos los datos del segundo DataFrame resultado del ejercicio 3.



Comenzamos calculando la fecha máxima de `data_confirmed_recovered_countries`

In [23]:
max_fecha = data_confirmed_recovered_countries.Fecha.max()
print(max_fecha)
fecha_umbral = max_fecha + datetime.timedelta(days=-7)
print(fecha_umbral)

data_confirmed_recovered["Day"] = [x.days for x in (data_confirmed_recovered.Fecha - fecha_umbral)]

#data_confirmed_recovered

country_mask = [ x in countries_to_plot for x in data_confirmed_recovered.Country]
data_confirmed_recovered_countries = data_confirmed_recovered.loc[country_mask, :]

data_to_plot_mask = data_confirmed_recovered_countries.Fecha >= fecha_umbral
data_to_plot = data_confirmed_recovered_countries.loc[data_to_plot_mask, ]
data_to_plot

2022-01-23 00:00:00
2022-01-16 00:00:00


Unnamed: 0,Country,Fecha,Cantidad_Confirmed,Cantidad_Recovered,Day
63763,Brazil,2022-01-16,23015128,0,0
63764,Brazil,2022-01-17,23089509,0,1
63765,Brazil,2022-01-18,23229851,0,2
63766,Brazil,2022-01-19,23425392,0,3
63767,Brazil,2022-01-20,23595178,0,4
...,...,...,...,...,...
1276148,United Kingdom,2022-01-23,15859288,0,7
1276149,United Kingdom,2022-01-23,15859288,0,7
1276150,United Kingdom,2022-01-23,15859288,0,7
1276151,United Kingdom,2022-01-23,15859288,0,7


In [24]:
import plotly
import plotly.express as px
import chart_studio

In [25]:
#chart-studio api
username = 'you_username' # your username
api_key = 'your_apikey' # your api api_key

chart_studio.tools.set_credentials_file(username=username, api_key=api_key)
import chart_studio.plotly as py

In [26]:
fig = px.scatter(data_frame=data_to_plot, 
                 x='Cantidad_Confirmed', y='Cantidad_Recovered', 
                 color='Country', size='Cantidad_Confirmed', height=500, width=900,
                 animation_frame="Day",
                 text='Country', title= f'Recuperados vs Confirmados',
                 range_x=[100,1200000], range_y=[0,200000])

fig.update_traces(textposition='top center')

fig.layout.update(showlegend=True, 
                  yaxis =  {"title": {"text": "Cantidad de recuperados"}},
                  xaxis =  {"title": {"text": "Cantidad de confirmados"}}
                 ) 


if api_key: py.plot(fig, filename = 'scatter_recuperados_confirmados_top10', auto_open=True)
fig.show()


PlotlyRequestError: No such user