# Wykład 9 - addendum - aktualizowana wersja obróbki danych o COVID-19


https://github.com/MichalKorzycki/WarsztatPythonDataScience.git/

*API*

https://api.covid19api.com/

_Impact of non-pharmaceutical interventions (NPIs) to reduce COVID19 mortality and healthcare demand_ - Neil M Ferguson et al. 

https://www.imperial.ac.uk/media/imperial-college/medicine/sph/ide/gida-fellowships/Imperial-College-COVID19-NPI-modelling-16-03-2020.pdf

In [None]:
import json
import requests 
import pandas as pd
import numpy as np
from datetime import datetime 
import warnings
warnings.filterwarnings("ignore")

url = "https://api.covid19api.com/all"

# Download the complete data set in one request and keep a dated JSON
# snapshot on disk next to the notebook.
with requests.Session() as s:
    # (connect, read) timeouts: without them a stalled connection would
    # block the notebook forever.
    response = s.get(url, timeout=(10, 120))
    # Fail loudly on an HTTP error instead of parsing an error payload as data.
    response.raise_for_status()
    input_data = response.json()

now = datetime.now().strftime("%Y-%m-%d")
fname = "all-"+now+".json"
with open(fname, 'w', encoding="utf-8") as f:
    json.dump(input_data, f)

print("Przeczytano %d wierszy z %s" % (len(input_data), url) )

# One row per record returned by the API.
raw_data = pd.DataFrame(input_data)

---
## Parametry sterujące analizą
- `DAYS_WINDOW` - z ilu dni wstecz (licząc od najświeższych danych w zbiorze) ma być wykonana analiza, np. `DAYS_WINDOW=7` - dane z ostatniego tygodnia
- `N` - dla ilu krajów naraz ma być wykonana analiza
- `SET` - dla której _"N-ki"_  ma być wykonana analiza; np. przy `N=10` dla `SET=0` będzie to 1-sza dziesiątka, `SET=1` będzie to 2-ga dziesiątka itd.
- `ROLL` - za ile dni ma być liczona średnia krocząca
- `DELTADAYS` - przy ustalaniu składu kolejnych `N` brana jest wartość `Confirmed` sprzed `DELTADAYS` dni. Przydaje się w sytuacji, gdy nie "spłynęły" jeszcze wszystkie dane dla bieżącego dnia
---

In [None]:
# Length of the analysed period, in days back from the newest record.
DAYS_WINDOW = 39
# Number of countries analysed together.
N = 10
# Which group of N countries to analyse (0 = first group, 1 = second, ...).
SET = 0
# Window length, in days, of the rolling averages.
ROLL = 7
# The country ranking uses `Confirmed` from this many days before the newest
# record, in case the latest day is still incomplete.
DELTADAYS = 1

In [None]:

# --- Select and clean the input columns --------------------------------------
# Take an explicit copy so the column assignments below work on an independent
# frame instead of a slice of `raw_data`.
data = raw_data[ ["Country", "CountryCode", "Confirmed", "Deaths", "Recovered", "Active", "Date"] ].copy()
# Timestamps that do not match the format become NaT and are dropped below.
data['Date'] = pd.to_datetime(data['Date'], errors='coerce', format='%Y-%m-%dT%H:%M:%S')
data['Day'] = data['Date'].dt.date

data = data.dropna()
print("Na wejściu mamy %d rekordów i %d kolumn" % (data.shape[0],data.shape[1]))

# --- Restrict to the analysis window -----------------------------------------
lastday = data["Date"].max()
daysbefore = lastday - pd.Timedelta(days=DAYS_WINDOW)
print("Dane od %s do %s" % (str(daysbefore).split(' ')[0], str(lastday).split(' ')[0]) )

# From here on `lastday` is the day used to rank the countries.
lastday = lastday - pd.Timedelta(days=DELTADAYS)

df = data[ data["Date"] > daysbefore ]
print("Zostało %d rekordów i %d kolumn" % (df.shape[0],df.shape[1]))

# --- Unify alternative spellings of country names -----------------------------
country_aliases = {
    'Iran (Islamic Republic of)': 'Iran',
    'Iran, Islamic Republic of': 'Iran',
    'Korea, South': 'South Korea',
    'Korea (South)': 'South Korea',
    'Republic of Korea': 'South Korea',
    'Russian Federation': 'Russia',
    ' Azerbaijan': 'Azerbaijan',
    'Republic of Ireland': 'Ireland',
    'Republic of Moldova': 'Moldova',
    'Hong Kong SAR': 'Hong Kong',
    'Taipei and environs': 'Taiwan',
    'Taiwan*': 'Taiwan',
}
df = df.replace(country_aliases)

# --- "Manual" melt: one long frame with a `status` label and a `cases` value --
key_columns = ['Country', 'CountryCode', 'Date', 'Day']
confirmed = df[key_columns + ['Confirmed']].rename(columns={'Confirmed': 'cases'}).assign(status="Confirmed")
deaths = df[key_columns + ['Deaths']].rename(columns={'Deaths': 'cases'}).assign(status="Deaths")
recovered = df[key_columns + ['Recovered']].rename(columns={'Recovered': 'cases'}).assign(status="Recovered")
active = df[key_columns + ['Active']].rename(columns={'Active': 'cases'}).assign(status="Active")

df = pd.concat([confirmed, deaths, recovered, active], axis=0, sort=False)
print("Po 'ręcznym' melt mamy %d rekordów i %d kolumn: %s" % ( df.shape[0],df.shape[1], " ".join(df.columns) ))

# --- Aggregate provinces into one row per country/day/status ------------------
# `cases` must be the summed value, not a grouping key: grouping by it would
# merge rows that happen to report the same count and lose them from the total.
df = df.groupby(key_columns + ['status'], as_index=False)['cases'].sum()
print("Po agregacji prowincji mamy %d rekordów i %d kolumn: %s" % ( df.shape[0],df.shape[1], " ".join(df.columns) ))

# --- Pivot back to one column per status --------------------------------------
df = df.pivot_table(
        values='cases',
        index=key_columns,
        columns='status',
        aggfunc="sum")

df.reset_index(inplace=True)

print("Po operacji pivot mamy %d rekordów i %d kolumn:  %s" % ( df.shape[0], df.shape[1], " ".join(df.columns) ))

# --- Rank countries by confirmed cases on the ranking day ---------------------
topdf = (
    df[ df["Date"] == lastday ]
    .reset_index()
    .sort_values(by=['Confirmed'], ascending=False)
    .reset_index(drop=True)
)

# The SET-th group of N countries, and the first half of that group.
first_N_countries = topdf.iloc[N*SET:N*SET+N]["Country"]
smaller_top_N = topdf.iloc[N*SET:(N*SET+N//2)]["Country"]

italy = df[ df['Country'] == 'Italy'  ]
us = df[ df['Country'] == 'United States of America'  ]

# --- Keep only the selected countries, ordered by country and date ------------
df = (
    df[ df['Country'].isin(first_N_countries) ]
    .sort_values(by=['Country', 'Date'])
    .reset_index()
)

print("Po odfiltrowaniu mamy %d rekordów i %d kolumn: %s" % ( df.shape[0],df.shape[1]," ".join(df.columns) ))
print("Przygotowane dane z %d dni dla %d krajów" % 
      ( len(df["Date"].value_counts()), len(df["Country"].value_counts()) ))

df.head()

In [None]:
# Display the first 30 rows of `topdf`, the ranking-day rows sorted by
# confirmed cases in descending order.
topdf.head(30)

In [None]:
# Subset of the prepared data limited to the shorter country list, ordered by
# country and then by date, with a fresh 0..n-1 index.
smaller_df = (
    df[df['Country'].isin(smaller_top_N)]
    .sort_values(by=['Country', 'Date'])
    .reset_index(drop=True)
)

print(
    "Po odfiltrowaniu mniejszych danych mamy %d rekordów i %d kolumn: %s"
    % (*smaller_df.shape, " ".join(smaller_df.columns))
)
print(
    "Przygotowane mniejsze dane z %d dni dla %d krajów"
    % (smaller_df["Date"].nunique(), smaller_df["Country"].nunique())
)
smaller_df.head()

# Wizualizacja

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

Wykres dla `N` krajów

In [None]:
def fix_legend(chart, marker="o", order=None):
    """Return the legend handles and labels of *chart* in ranking order.

    Args:
        chart: Object exposing ``get_legend_handles_labels()``; in this
            notebook, the axes returned by ``sns.lineplot``.
        marker: Marker style set on every returned handle.
        order: Iterable of labels giving the desired legend order. Defaults
            to the notebook-level ``first_N_countries``.

    Returns:
        A ``(handles, labels)`` pair of lists, sorted by position in *order*.
        Labels missing from *order* sort to the front.
    """
    if order is None:
        order = first_N_countries
    rank = {label: position for position, label in enumerate(order)}

    handles, labels = chart.get_legend_handles_labels()
    entries = sorted(zip(labels, handles), key=lambda entry: rank.get(entry[0], -1))

    # Some seaborn releases emit the hue variable name as a leading
    # pseudo-entry. Drop the first entry only when it is not one of the ranked
    # labels, so a real country is never removed when no such entry exists.
    if entries and entries[0][0] not in rank:
        entries = entries[1:]

    labels = [label for label, _ in entries]
    handles = [handle for _, handle in entries]
    for handle in handles:
        handle.set_marker(marker)
        handle.set_markeredgecolor("black")

    return handles, labels

In [None]:
# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Cumulative confirmed cases, one line per country.
chart = sns.lineplot(
    data=df,
    x='Day',
    y='Confirmed',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

chart.set_title('Confirmed COVID-19 Cases')

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc="best")

plt.show()

In [None]:
# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Cumulative confirmed cases, one line per country (log y axis, set below).
chart = sns.lineplot(
    data=df,
    x='Day',
    y='Confirmed',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

chart.set_title('Confirmed COVID-19 Cases')

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc="best")

plt.yscale("log")

plt.show()

In [None]:
# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Cumulative deaths, one line per country, drawn with square markers.
chart = sns.lineplot(
    data=df,
    x='Day',
    y='Deaths',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='s',
    markeredgecolor="black",
)

chart.set_title('COVID-19 Deaths')

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart, marker='s')
plt.legend(handles, labels, frameon=False, loc="best")

plt.show()

Dwa zestawy danych 

In [None]:
plotdata = df

# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Left y axis: cumulative confirmed cases (round markers).
chart = sns.lineplot(
    data=plotdata,
    x='Day',
    y='Confirmed',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

# Report the number of countries actually present in the plotted data.
chart.set_title('Confirmed COVID-19 cases vs number of deaths for %d countries' % plotdata["Country"].nunique())

handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc=2, title="Confirmed")

# Right y axis sharing the same x axis: cumulative deaths (square markers).
ax2 = chart.twinx()

chart2 = sns.lineplot(
    data=plotdata,
    x='Day',
    y='Deaths',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='s',
    markeredgecolor="black",
    ax=ax2,
)

# Second legend, shifted right so it does not cover the "Confirmed" one.
handles, labels = fix_legend(chart2, marker="s")
legend2 = plt.legend(handles, labels, loc=2, frameon=False, title="Deaths", bbox_to_anchor=(0.15, 1))

plt.show()

Mniej krajów

In [None]:
plotdata = smaller_df

# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Left y axis: cumulative confirmed cases (round markers).
chart = sns.lineplot(
    data=plotdata,
    x='Day',
    y='Confirmed',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

# This cell plots the smaller country subset, so the title must report the
# number of countries in `plotdata`, not N.
chart.set_title('Confirmed COVID-19 cases vs number of deaths for %d countries' % plotdata["Country"].nunique())

handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc=2, title="Confirmed")

# Right y axis sharing the same x axis: cumulative deaths (square markers).
ax2 = chart.twinx()

chart2 = sns.lineplot(
    data=plotdata,
    x='Day',
    y='Deaths',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='s',
    markeredgecolor="black",
    ax=ax2,
)

# Second legend, shifted right so it does not cover the "Confirmed" one.
handles, labels = fix_legend(chart2, marker="s")
legend2 = plt.legend(handles, labels, loc=2, frameon=False, title="Deaths", bbox_to_anchor=(0.15, 1))

plt.show()

## Dodanie wymiarów - stosunek między wymiarami

In [None]:
# Derived ratio columns: each source column as a percentage of confirmed cases.
for ratio_column, source_column in (("Mortality", "Deaths"), ("Recovery", "Recovered")):
    df[ratio_column] = 100 * df[source_column] / df["Confirmed"]

In [None]:
# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Deaths as a percentage of confirmed cases, one line per country.
chart = sns.lineplot(
    data=df,
    x='Day',
    y='Mortality',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

chart.set_title('COVID-19 Mortality')

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc="best")

plt.show()

In [None]:
plotdata = df

# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Recovered as a percentage of confirmed cases, one line per country.
chart = sns.lineplot(
    data=plotdata,
    x='Day',
    y='Recovery',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

chart.set_title('COVID-19 Recovery rate')

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc="best")

plt.show()

## Wymiary jako funkcja kilku wierszy - różnica i średnia krocząca 

In [None]:
result = []

countries = df["Country"].unique()
dataframes = [ df[ df["Country"] == x] for x in countries ]

# Build, per country, the day-over-day changes and their rolling averages.
for m_df in dataframes:
    country = m_df['Country'].iloc[0]
    # diff() and rolling() depend on row order, so sort chronologically
    # *before* computing them. The chained calls return new frames, so the
    # per-country slice of `df` is never modified in place.
    m_df = (
        m_df[ ["Day", "Confirmed", "Deaths", "Recovered"] ]
        .sort_values(by="Day")
        .set_index("Day")
    )
    df_diff = m_df.diff()
    df_diff.columns=["confirmed change", "deaths change", "recovered change"]

    m_df = pd.concat([m_df, df_diff], axis=1, sort=False)

    # NOTE(review): the change is divided by the *current* day's total, not the
    # previous day's. With that denominator the 399 upper clamp below cannot
    # trigger for non-decreasing totals - confirm which base was intended.
    confirmed_pct = 100.0 * m_df["confirmed change"] / m_df["Confirmed"]
    # Values that are not above -50 (including NaN) are replaced with 0.
    m_df["confirmed pct change"] = confirmed_pct.where(confirmed_pct > -50.0, 0.0)

    deaths_pct = 100.0 * m_df["deaths change"] / m_df["Deaths"]
    # Keep only values strictly between -50 and 399; everything else becomes 0.
    m_df["deaths pct change"] = deaths_pct.where((deaths_pct > -50.0) & (deaths_pct < 399.0), 0.0)

    m_df["rolling deaths change"] = m_df["deaths change"].rolling(window=ROLL).mean()
    m_df["rolling confirmed pct change"] = m_df["confirmed pct change"].rolling(window=ROLL).mean()
    m_df["rolling deaths pct change"] = m_df["deaths pct change"].rolling(window=ROLL).mean()

    # Drop rows with any missing value (the first diff row and the rolling
    # warm-up rows), then restore `Day` and `Country` as ordinary columns.
    m_df = m_df.dropna().assign(Country=country).reset_index()
    result.append(m_df)

new_df = pd.concat(result, axis=0, sort=False)

# Summary table for the newest remaining day, largest `Confirmed` first, with
# the country name moved into the `index` column.
today = max(new_df["Day"])
d = new_df[ new_df["Day"] == today].sort_values(by=['Confirmed'], ascending=False).reset_index()
d["index"] = d["Country"]
d.drop(["Country"], axis=1, inplace=True)
d.head(N)

In [None]:
plotdata = new_df

# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Daily percentage change of confirmed cases, one line per country.
chart = sns.lineplot(
    data=plotdata,
    x='Day',
    y='confirmed pct change',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

chart.set_title('COVID-19 Confirmed percentage change')

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc="best")

# Faint dotted grid lines for the x axis only.
plt.grid(color='grey', linestyle=':', linewidth=1, alpha=0.3, axis="x")

plt.show()

In [None]:
plotdata = new_df

# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# ROLL-day rolling mean of the daily percentage change of confirmed cases.
chart = sns.lineplot(
    data=plotdata,
    x='Day',
    y='rolling confirmed pct change',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

chart.set_title('COVID-19 Confirmed percentage change daily rolling average over %d days' % ROLL)

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc="best")

plt.show()

In [None]:
plotdata = new_df

# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Daily percentage change of deaths, one line per country.
chart = sns.lineplot(
    data=plotdata,
    x='Day',
    y='deaths pct change',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

chart.set_title('COVID-19 Deaths change in pct daily')

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc="best")

# Faint dotted grid lines for the x axis only.
plt.grid(color='grey', linestyle=':', linewidth=1, alpha=0.3, axis="x")

plt.show()

In [None]:
plotdata = new_df

# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# ROLL-day rolling mean of the daily percentage change of deaths.
chart = sns.lineplot(
    data=plotdata,
    x='Day',
    y='rolling deaths pct change',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
)

chart.set_title('COVID-19 deaths change in pct daily rolling average over %d days' % ROLL)

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc="best")

plt.show()

In [None]:
plotdata = new_df

# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# Day-over-day difference of cumulative deaths, drawn with square markers.
chart = sns.lineplot(
    data=plotdata,
    x='Day',
    y='deaths change',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='s',
    markeredgecolor="black",
)

chart.set_title('COVID-19 Daily Deaths')

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart, marker="s")
plt.legend(handles, labels, frameon=False, loc="best")

# Faint dotted grid lines for the y axis only.
plt.grid(color='grey', linestyle=':', linewidth=1, alpha=0.3, axis="y")

plt.show()

In [None]:
plotdata = new_df

# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))

# ROLL-day rolling mean of the day-over-day difference of cumulative deaths.
chart = sns.lineplot(
    data=plotdata,
    x='Day',
    y='rolling deaths change',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='s',
    markeredgecolor="black",
)

chart.set_title('COVID-19 Daily Deaths Rolling average over %d days' % ROLL)

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart, marker='s')
plt.legend(handles, labels, frameon=False, loc="best")

plt.show()

## Zmiana wymiaru _X_

In [None]:
# Apply the style before creating the figure: a figure takes its background
# colour from the rcParams at creation time, so styling afterwards is too late.
plt.style.use("dark_background")
plt.figure(figsize=(20,10))
# Both axes are logarithmic; the scales are set on the current axes, which
# the plot below then draws into.
plt.xscale("log")
plt.yscale("log")

# Deaths plotted against confirmed cases (instead of time), one line per country.
chart = sns.lineplot(
    data=df,
    x='Confirmed',
    y='Deaths',
    hue='Country',
    palette='bright',
    linestyle='-',
    marker='o',
    markeredgecolor="black",
    alpha=0.5,
)

chart.set_title('COVID-19 Mortality')

# Replace the automatic legend with one ordered by the country ranking.
handles, labels = fix_legend(chart)
plt.legend(handles, labels, frameon=False, loc="best")

plt.show()

---