In [44]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
from pylab import *
import pylab

from scipy.optimize import curve_fit, leastsq

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

sns.set_style('whitegrid')

In [45]:
df_cases_raw = pd.read_csv("./data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv", index_col=False)
df_deaths_raw = pd.read_csv("./data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv", index_col=False)
df_recoveries_raw = pd.read_csv("./data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv", index_col=False)

df_population_raw = pd.read_csv("./data/population_data/API_SP.POP.TOTL_DS2_en_csv_v2_887275.csv")

In [46]:
df_cases_raw.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20,2/20/20,2/21/20,2/22/20,2/23/20,2/24/20,2/25/20,2/26/20,2/27/20,2/28/20,2/29/20,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20,3/6/20,3/7/20,3/8/20,3/9/20,3/10/20,3/11/20,3/12/20,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,4,4,5,7,7,7,11,16,21,22,22,22,24,24,40,40,74,84,94,110,110
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,10,12,23,33,38,42,51,55,59,64,70,76,89,104,123,146,174,186,197
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,3,5,12,12,17,17,19,20,20,20,24,26,37,48,54,60,74,87,90,139,201,230,264,302,367,409,454
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,39,39,53,75,88,113,133,164,188,224,267,308
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,3,3,3,4,4,5


In [47]:
df_cases_grouped = df_cases_raw.groupby(by='Country/Region', as_index=False).agg('sum')
df_deaths_grouped = df_deaths_raw.groupby(by='Country/Region', as_index=False).agg('sum')

df_cases_grouped.head()

Unnamed: 0,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,1/28/20,1/29/20,1/30/20,1/31/20,2/1/20,2/2/20,2/3/20,2/4/20,2/5/20,2/6/20,2/7/20,2/8/20,2/9/20,2/10/20,2/11/20,2/12/20,2/13/20,2/14/20,2/15/20,2/16/20,2/17/20,2/18/20,2/19/20,2/20/20,2/21/20,2/22/20,2/23/20,2/24/20,2/25/20,2/26/20,2/27/20,2/28/20,2/29/20,3/1/20,3/2/20,3/3/20,3/4/20,3/5/20,3/6/20,3/7/20,3/8/20,3/9/20,3/10/20,3/11/20,3/12/20,3/13/20,3/14/20,3/15/20,3/16/20,3/17/20,3/18/20,3/19/20,3/20/20,3/21/20,3/22/20,3/23/20,3/24/20,3/25/20,3/26/20,3/27/20,3/28/20
0,Afghanistan,33.0,65.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,4,4,5,7,7,7,11,16,21,22,22,22,24,24,40,40,74,84,94,110,110
1,Albania,41.1533,20.1683,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,10,12,23,33,38,42,51,55,59,64,70,76,89,104,123,146,174,186,197
2,Algeria,28.0339,1.6596,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,3,5,12,12,17,17,19,20,20,20,24,26,37,48,54,60,74,87,90,139,201,230,264,302,367,409,454
3,Andorra,42.5063,1.5218,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,39,39,53,75,88,113,133,164,188,224,267,308
4,Angola,-11.2027,17.8739,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,3,3,3,4,4,5


In [48]:
def preprocess_frame(df):
    df = df.groupby(by='Country/Region', as_index=False).agg('sum')
    df = df.drop(['Lat', 'Long'], 1)
    for i in range(1,len(df_cases.columns.values)):
        df_cases.columns.values[i] = i
    return df

df_cases = preprocess_frame(df_cases_raw)
df_deaths = preprocess_frame(df_deaths_raw)
df_recoveries = preprocess_frame(df_recoveries_raw)

df_population = df_population_raw[["Country Name", "2018"]]

df_cases.head()

Unnamed: 0,Country/Region,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67
0,Afghanistan,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,4,4,5,7,7,7,11,16,21,22,22,22,24,24,40,40,74,84,94,110,110
1,Albania,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,10,12,23,33,38,42,51,55,59,64,70,76,89,104,123,146,174,186,197
2,Algeria,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,3,5,12,12,17,17,19,20,20,20,24,26,37,48,54,60,74,87,90,139,201,230,264,302,367,409,454
3,Andorra,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,39,39,53,75,88,113,133,164,188,224,267,308
4,Angola,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,3,3,3,4,4,5


In [49]:
%matplotlib notebook

top_cases = df_cases_grouped.nlargest(10, df_cases_raw.columns[-1])["Country/Region"]

print ("Country cases")
print ("-------------")
for country in top_cases:
    print (country, " : ", df_cases_grouped[df_cases_grouped["Country/Region"] == country].iloc()[0,-1])
    
top_deaths = df_deaths_grouped.nlargest(10, df_deaths_raw.columns[-1])['Country/Region']
print ("\nCountry deaths")
print ("-------------")
for country in top_deaths:
    print (country, " : ", df_deaths_grouped[df_deaths_grouped["Country/Region"] == country].iloc()[0,-1])

Country cases
-------------
US  :  121478
Italy  :  92472
China  :  81999
Spain  :  73235
Germany  :  57695
France  :  38105
Iran  :  35408
United Kingdom  :  17312
Switzerland  :  14076
Netherlands  :  9819

Country deaths
-------------
Italy  :  10023
Spain  :  5982
China  :  3299
Iran  :  2517
France  :  2317
US  :  2026
United Kingdom  :  1021
Netherlands  :  640
Germany  :  433
Belgium  :  353


In [50]:
print ("Mortality rates")
print ("---------------")
for country in top_cases:
    print (country, ": ", round(float(df_deaths[df_deaths["Country/Region"] == country].iloc[0,-1])/float(df_cases[df_cases["Country/Region"] == country].iloc[0,-1])*100, 3), "%")
    
    
print ("\nPopulation percentage infected")
print ("---------------")
for country in top_cases:
    country_pop = country
    if country == "US":
        country_pop = "United States"
    if country == "Iran":
        country_pop = "Iran, Islamic Rep."
    if country == "Korea, South":
        country_pop = "Korea, Rep."
    print (country, ": ", \
    round(float(df_cases[df_cases["Country/Region"] == country].iloc[0,-1])/float(df_population[df_population["Country Name"]==country_pop]["2018"])*100, 3), "%")
    
print ("\nPopulation permil dead")
print ("---------------")
for country in top_cases:
    country_pop = country
    if country == "US":
        country_pop = "United States"
    if country == "Iran":
        country_pop = "Iran, Islamic Rep."
    if country == "Korea, South":
        country_pop = "Korea, Rep."
    print (country, ": ", \
    round(float(df_deaths[df_deaths["Country/Region"] == country].iloc[0,-1])/float(df_population[df_population["Country Name"]==country_pop]["2018"])*1000, 5), "permil")

Mortality rates
---------------
US :  1.668 %
Italy :  10.839 %
China :  4.023 %
Spain :  8.168 %
Germany :  0.75 %
France :  6.081 %
Iran :  7.109 %
United Kingdom :  5.898 %
Switzerland :  1.876 %
Netherlands :  6.518 %

Population percentage infected
---------------
US :  0.037 %
Italy :  0.153 %
China :  0.006 %
Spain :  0.157 %
Germany :  0.07 %
France :  0.057 %
Iran :  0.043 %
United Kingdom :  0.026 %
Switzerland :  0.165 %
Netherlands :  0.057 %

Population permil dead
---------------
US :  0.00619 permil
Italy :  0.16586 permil
China :  0.00237 permil
Spain :  0.12803 permil
Germany :  0.00522 permil
France :  0.03459 permil
Iran :  0.03077 permil
United Kingdom :  0.01536 permil
Switzerland :  0.031 permil
Netherlands :  0.03714 permil


In [59]:
def shift_to_day_zero(df, df_reference):
    for key in df["Country/Region"]:
        shift = 0
        for i in df[df["Country/Region"] == key]:
            if i == 0:
                ++shift
            else:
                continue
        df[df["Country/Region"] == key] = \
        df[df["Country/Region"] == key].shift(shift, axis=1)
    

df_cases_reference = df_cases.copy()
shift_to_day_zero(df_cases, df_cases_reference)
shift_to_day_zero(df_deaths, df_cases_reference)
#shift_to_day_zero(df_recoveries, df_cases_reference)

In [60]:
df_cases

Unnamed: 0,Country/Region,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67
0,Afghanistan,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,4,4,5,7,7,7,11,16,21,22,22,22,24,24,40,40,74,84,94,110,110
1,Albania,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,10,12,23,33,38,42,51,55,59,64,70,76,89,104,123,146,174,186,197
2,Algeria,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,3,5,12,12,17,17,19,20,20,20,24,26,37,48,54,60,74,87,90,139,201,230,264,302,367,409,454
3,Andorra,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,39,39,53,75,88,113,133,164,188,224,267,308
4,Angola,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,2,3,3,3,4,4,5
5,Antigua and Barbuda,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,3,3,3,7,7,7
6,Argentina,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,2,8,12,12,17,19,19,31,34,45,56,68,79,97,128,158,266,301,387,387,502,589,690
7,Armenia,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,4,8,18,26,52,78,84,115,136,160,194,235,249,265,290,329,407
8,Australia,0,0,0,0,4,5,5,6,9,9,12,12,12,13,13,14,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,25,27,30,39,52,55,60,63,76,91,107,128,128,200,250,297,377,452,568,681,791,1071,1549,1682,2044,2364,2810,3143,3640
9,Austria,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,3,3,9,14,18,21,29,41,55,79,104,131,182,246,302,504,655,860,1018,1332,1646,2013,2388,2814,3582,4474,5283,5588,6909,7657,8271


In [57]:
%matplotlib notebook

countries = top_cases
#countries = ["Germany", "US", "Italy", "Spain", "China"]
def plot_confirmed_cases(df, countries):
    g = sns.lineplot(df.columns[1:], df[df["Country/Region"] == 'Greece'], label='Greece')
    #sns.lineplot(df['Day'], df['notChina'], label='notChina')
    for country in countries:
        sns.lineplot(df.columns[1:], df[df["Country/Region"] == country], label=country)
    g.set(xlabel='Days since first confirmed case', ylabel='Confirmed cases')
    g.set_yscale("log")
    plt.legend()
 
plot_confirmed_cases(df_cases, countries)

TypeError: 'int' object is not iterable

In [None]:
%matplotlib notebook

def plot_case_death_recovery(country):
    g = sns.lineplot(df_cases['Day'], df_cases[country], label=country+' cases')
    sns.lineplot(df_recoveries['Day'], df_recoveries[country], label=country+' recovered')
    sns.lineplot(df_deaths['Day'], df_deaths[country], label=country+' deaths')
    g.set(ylabel='entries')
    g.set_yscale("log")
    plt.legend()

plot_case_death_recovery('China')
plot_case_death_recovery('notChina')

In [None]:
%matplotlib notebook

def sigmoid(x, x0, k, a, c):
     y = a / (1 + np.exp(-k*(x-x0))) + c
     return y

xdata, ydata = df_cases['Day'], df_cases['China']
popt, pcov = curve_fit(sigmoid, xdata, ydata, p0=(1.0, -1.0, 1.0, 0.0))
print ("Fit:")
print ("x0 =", popt[0])
print ("k  =", popt[1])
print ("a  =", popt[2])
print ("c  =", popt[3])
print ("Asymptotes are", popt[3], "and", popt[3] + popt[2] )

print (popt)

x = np.linspace(-1, df_cases['China'].shape[0], 50)
y = sigmoid(x, *popt)

pylab.plot(xdata, ydata, 'k.', label='data')
pylab.plot(x,y, 'r', label='fit')
pylab.ylim(0, 100000)
pylab.xlabel("days since first case")
pylab.ylabel("number of confirmed cases")
pylab.legend(loc='best')
pylab.show()

In [None]:
%matplotlib notebook

def func(x, a, b, c):
    return a * np.exp(b * x) + c

def fit_cases_data(country, func, df, interval):
    firstday = interval[0]
    lastday = interval[1]

    xdata = df['Day'][(df['Day']>=firstday) & (df['Day']<lastday)]
    ydata = df[country][(df['Day']>=firstday) & (df['Day']<lastday)]
    

    popt, pcov = curve_fit(func, xdata, ydata, [0.1,0.1,0.1], bounds=[[-100, -100, 0],[100, 100, 100]])
    print(popt)
    print("covariance matrix")
    print(pcov)
    firstday = 0
    lastday = df_cases[country].dropna().shape[0]
    x = np.linspace(firstday, lastday+7 , 100)
    xdata = df['Day'][(df['Day']>=firstday) & (df['Day']<lastday)]
    ydata = df[country][(df['Day']>=firstday) & (df['Day']<lastday)]
    plt.plot(xdata, ydata, 'k.', label='data')
    plt.plot(x, func(x, *popt), 'r-',label='fit: a=%5.3f, b=%5.3f, c=%5.3f' % tuple(popt))

    perr=np.sqrt(np.diag(pcov)) #standard errors
    plt.plot(x,func(x, *popt+perr), 'g--', linewidth=0.5)
    plt.plot(x,func(x, *popt-perr), 'g--', linewidth=0.5)

    plt.xlabel('days since first case')
    plt.ylabel('number of confirmed cases')
    plt.legend()
    plt.yscale('log')
    plt.show()


In [None]:
%matplotlib notebook
country = "India"
print("Last day:", df_cases["Date"][-1])
fit_cases_data(country, func, df_cases, [0, df_cases[country].dropna().shape[0]])
#fit_cases_data(country, func, df_cases, [16, df_cases[country].dropna().shape[0]])

In [None]:
def fit_cases_data_sigmoid(country, func, df):
    firstday = 0
    lastday = df[country].dropna().shape[0]

    xdata = df['Day'][(df['Day']>=firstday) & (df['Day']<lastday)]
    ydata = df[country][(df['Day']>=firstday) & (df['Day']<lastday)]
    
    plt.plot(xdata, ydata, 'k.', label='data')

    popt, pcov = curve_fit(func, xdata, ydata, [1.0, 0.1, -1e5, 1e5], maxfev=100000)
    print(popt)
    print("covariance matrix")
    print(pcov)
    x = np.linspace(firstday, lastday+10 , 100)
    plt.plot(x, func(x, *popt), 'r-',label='fit: a=%5.3f, b=%5.3f, c=%5.3f, d=%5.3f' % tuple(popt))

    perr=np.sqrt(np.diag(pcov)) #standard errors
    plt.plot(x,func(x, *popt+perr), 'g--', linewidth=0.5)
    plt.plot(x,func(x, *popt-perr), 'g--', linewidth=0.5)

    plt.xlabel('days since first case')
    plt.ylabel('number of confirmed cases')
    plt.legend()
    #plt.yscale('log')
    plt.show()

In [None]:
%matplotlib notebook

fit_cases_data_sigmoid("Germany", sigmoid, df_cases)

In [None]:
%matplotlib notebook

#Gaussian function
def gauss_function(x, a, x0, sigma):
    return a*np.exp(-(x-x0)**2/(2*sigma**2))

def fit_gauss(df, func, country, startday):
    df_perday = df[country].dropna()
    arr = df_perday.values
    arr2 = arr
    for i in reversed(range(arr.size)):
        if i!=0:
            arr2[i] = arr2[i]-arr[i-1]
    x = np.linspace(startday,arr2.size, arr2.size-startday)
    y = arr2[startday:]
    #yerr = sqrt(y)
    #for i in range(yerr.size):
    #    if yerr[i] == 0:
    #        yerr[i] = 100
    #    else:
    #        yerr[i]*10
    #print(yerr)
    mean = 50
    sigma = 10
    popt, pcov = curve_fit(gauss_function, x, y, p0 = [100, mean, sigma])#, sigma=yerr)
    plt.errorbar(x,y,fmt='+')#, yerr=yerr)
    x = np.linspace(startday,2*arr2.size, (arr2.size-startday)*1000)
    plot(x,gauss_function(x, *popt))
    print(pcov)
    perr=np.sqrt(np.diag(pcov)) #standard errors
    plt.plot(x,func(x, *popt+perr), 'r')
    plt.plot(x,func(x, *popt-perr), 'r')
    #plt.yscale("log")
    
    


In [None]:
%matplotlib notebook

fit_gauss(df_cases, gauss_function, "Germany", 0)

In [None]:
%matplotlib notebook

data = np.linspace(0.5, df_cases["China"].size+0.5, df_cases["China"].size)

myHist, myBinEdges = np.histogram(data, bins=df_cases["China"].size)
print(data)
print(myHist)
print(myBinEdges)
wid = myBinEdges[1:] - myBinEdges[:-1]
plt.bar(myBinEdges[:-1], df_cases["China"], width=wid)
plt.show() 