In [45]:
import pandas as pd

import numpy as np

from datetime import datetime

%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt

import seaborn as sn
import plotly
import plotly.graph_objects as go


In [46]:

# Load the ';'-separated flat table; the first column is parsed as dates.
# NOTE(review): the path is an absolute, machine-specific location.
df_analyse = pd.read_csv(
    r'C:\Windows\System32\applied_data_science_-_covid-19\data\processed\COVID_small_flat_table.csv',
    sep=';',
    parse_dates=[0],
)
# Display the five latest rows by date.
df_analyse.sort_values('date', ascending=True).tail()

Unnamed: 0,date,Italy,US,Spain,Germany,India,"Korea, South"
213,2020-08-22,258136,5667070,386054,233861,3044940,17399
214,2020-08-23,259345,5701645,386054,234494,3106348,17665
215,2020-08-24,260298,5739536,405436,236122,3167323,17945
216,2020-08-25,261174,5777710,412553,237583,3224547,18265
217,2020-08-26,262540,5821819,419849,239010,3310234,18706


In [47]:
def quick_plot(x_in, df_input, y_scale='log', slider=False):
    """Draw every column of ``df_input`` as a line trace in one Plotly figure.

    Parameters
    ----------
    x_in : array-like
        Values shared by all traces on the x-axis.
    df_input : pandas.DataFrame
        One line is drawn per column; the column label becomes the trace name.
    y_scale : str, default 'log'
        Forwarded as the y-axis ``type`` (the notebook uses 'log' and 'linear').
    slider : bool, default False
        When truthy, a range slider is shown on the x-axis.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``; nothing is returned.
    """
    fig = go.Figure()

    # One line trace per DataFrame column, labelled with the column name.
    for column in df_input.columns:
        fig.add_trace(
            go.Scatter(
                x=x_in,
                y=df_input[column],
                mode='lines',
                opacity=0.8,
                name=column,
            )
        )

    fig.update_layout(
        autosize=True,
        width=1024,
        height=768,
        font=dict(family="PT Sans, monospace", size=18, color="#7f7f7f"),
    )
    # No trailing comma here: the original built and discarded a 1-tuple.
    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(
        tickangle=-45,
        nticks=20,
        tickfont=dict(size=14, color="#7f7f7f"),
    )

    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)

    fig.show()





In [48]:
# Overview: every column after the first one, plotted against the date
# column on a logarithmic y-axis with a range slider.
quick_plot(df_analyse.date, df_analyse.iloc[:, 1:], y_scale='log', slider=True)

In [49]:
# Values must exceed this threshold to be kept when the per-country
# timelines are built in the next cell.
threshold=100

In [50]:
# For each column after the first, keep only the entries above `threshold`
# and store them as a NumPy array (one array per column, in column order).
compare_list = [
    np.array(df_analyse[country][df_analyse[country] > threshold])
    for country in df_analyse.columns[1:]
]

In [51]:
# Build a frame with one row per entry of compare_list (labelled with the
# corresponding column name), then transpose so each label becomes a column.
pd_sync_timelines = pd.DataFrame(compare_list,index=df_analyse.columns[1:]).T

In [52]:
# Despite its name, 'date' here is a running row counter 0..n-1, not a
# calendar date. It is appended as the LAST column of the frame.
pd_sync_timelines['date'] = np.arange(pd_sync_timelines.shape[0])

In [53]:
# Plot the synchronized timelines against the running row counter.
# 'date' was appended as the last column, so the data columns are
# everything except the last one. The previous `iloc[:, 1:]` dropped the
# first data column and plotted 'date' itself as a trace.
quick_plot(pd_sync_timelines.date,
          pd_sync_timelines.iloc[:, :-1],
          y_scale='log',
          slider=True)

# N(t) = N_0 * 2^{t/T_d}, where T_d is the doubling time

In [54]:
def doubling_rate(N_0,t,T_d):
    """Evaluate the exponential growth curve ``N_0 * 2**(t / T_d)``.

    Parameters
    ----------
    N_0 : number
        Value of the curve at ``t == 0``.
    t : number or array-like
        Point(s) at which the curve is evaluated.
    T_d : number
        Interval after which the curve has doubled.

    Returns
    -------
    Same shape as ``t``: the curve values.
    """
    doublings_elapsed = t / T_d
    growth_factor = np.power(2, doublings_elapsed)
    return N_0 * growth_factor


In [55]:
max_days = 165

# Reference curves that start at 100 and double every 2, 4 and 10 steps,
# evaluated at 0 .. max_days-1.
# (A 'doubling every twenty days' curve with T_d=20 was left disabled.)
norm_slopes = {
    f'doubling every {label} days': doubling_rate(100, np.arange(max_days), period)
    for label, period in (('two', 2), ('four', 4), ('ten', 10))
}

In [56]:
# Place the reference curves to the left of the synchronized timelines
# (column-wise concatenation, aligned on the row index).
pd_sync_timelines_w_slope = pd.concat([pd.DataFrame(norm_slopes),pd_sync_timelines], axis=1)

In [57]:
# Columns 0-4 are the three reference curves followed by the first two
# columns of the synchronized timelines.
first_five_columns = pd_sync_timelines_w_slope.iloc[:, 0:5]
quick_plot(pd_sync_timelines_w_slope.date, first_five_columns, y_scale='log', slider=True)

In [58]:
# Persist the combined table as ';'-separated CSV without the row index.
# NOTE(review): the path is an absolute, machine-specific location.
pd_sync_timelines_w_slope.to_csv(
    r'C:\Windows\System32\applied_data_science_-_covid-19\data\processed\COVID_small_sync_timeline_table.csv',
    sep=';',
    index=False,
)

In [59]:
from sklearn import linear_model
# Linear model without an intercept term (fit_intercept=False).
# `reg` is a module-level object that later cells fit in place and rebind.
reg = linear_model.LinearRegression(fit_intercept=False)

In [60]:
from scipy import signal

In [61]:
# Reload the flat table from disk so the following steps start from the
# file contents again; the first column is parsed as dates.
df_analyse = pd.read_csv(
    r'C:\Windows\System32\applied_data_science_-_covid-19\data\processed\COVID_small_flat_table.csv',
    sep=';',
    parse_dates=[0],
)

# Every column after the first one is treated as a country series.
country_list = df_analyse.columns[1:]

In [62]:
# Add a '<name>_filter' column per series: Savitzky-Golay filter with
# window length 3 and polynomial order 1.
for each in country_list:
    filtered_name = each + '_filter'
    df_analyse[filtered_name] = signal.savgol_filter(
        df_analyse[each], window_length=3, polyorder=1
    )

In [63]:
# Filtered columns used for plotting and for the filtered rate columns below.
# NOTE(review): 'Italy_filter' is not in this list — confirm this is intended.
filter_cols=['US_filter','Spain_filter','Germany_filter','India_filter', 'Korea, South_filter']

In [64]:
# Plot the selected filtered series from row `start_pos` onwards.
start_pos = 5
filtered_tail = df_analyse[filter_cols].iloc[start_pos:, :]
quick_plot(df_analyse.date[start_pos:], filtered_tail, y_scale='log', slider=True)

In [65]:
# Regression inputs for 'Germany', skipping the first 5 rows:
# x counts 0, 1, 2, ... starting at row 5; Y is the natural log of the values.
l_vec = len(df_analyse['Germany'])
x = np.arange(l_vec - 5).reshape(-1, 1)
germany_from_row_5 = df_analyse['Germany'][5:]
Y = np.log(np.array(germany_from_row_5))

In [66]:
# Fit the shared `reg` model in place on the log-transformed series.
reg.fit(x,Y)

LinearRegression(fit_intercept=False)

In [67]:
# Spot check: model output (in log space) for x = 5.
reg.predict([[5]])

array([0.41330959])

In [68]:
# The model was fitted with x = 0 at row 5 of the table (the first 5 rows
# were skipped). To predict for every row of the table, row i must be
# evaluated at x = i - 5. Using 0..l_vec-1 directly would shift the
# prediction by 5 rows relative to the date axis.
x_hat = (np.arange(l_vec) - 5).reshape(-1, 1)
Y_hat = reg.predict(x_hat)

In [69]:
# Independent copy of the two columns needed to compare data and prediction.
LR_inspect=df_analyse[['date', 'Germany']].copy()

In [70]:
# The model was fitted on log values, so exponentiate to get back to the
# original scale.
LR_inspect['prediction']=np.exp(Y_hat)

In [71]:
# Compare the 'Germany' series with the regression prediction (log y-axis).
series_to_plot = LR_inspect.iloc[:, 1:]
quick_plot(LR_inspect.date, series_to_plot, y_scale='log', slider=True)

In [72]:
# Second attempt: rebind `reg` to a model WITH an intercept and use only the
# rows from position 50 onwards (x counts 0, 1, 2, ... starting at row 50).
reg = linear_model.LinearRegression(fit_intercept=True)
l_vec = len(df_analyse['Germany'])
x = np.arange(l_vec - 50).reshape(-1, 1)
germany_from_row_50 = df_analyse['Germany'][50:]
Y = np.log(np.array(germany_from_row_50))

In [73]:
# Fit the intercept-enabled model in place on the log-transformed series.
reg.fit(x,Y)

LinearRegression()

In [74]:
# Fitted intercept (in log space, since Y holds log values).
reg.intercept_

10.860708290885487

In [75]:
# Fitted slope per unit of x (in log space); a length-1 array.
reg.coef_

array([0.01162982])

In [76]:
# Ratio of fitted slope to fitted intercept (both from the log-space fit).
# NOTE(review): get_rate_via_regression below returns the inverse ratio
# (intercept/slope) on untransformed data — confirm which one is wanted here.
reg.coef_/reg.intercept_

array([0.00107082])

In [77]:
def get_rate_via_regression(in_array):
    """Fit a line through three consecutive samples and return intercept/slope.

    The samples are placed at x = -1, 0, 1 and fitted with the module-level
    ``reg`` model, which is re-fitted in place on every call (side effect).

    Parameters
    ----------
    in_array : array-like of length 3
        The three consecutive values of one rolling window.

    Returns
    -------
    float
        ``reg.intercept_ / reg.coef_[0]`` as a scalar. If the fitted slope is
        zero, NumPy division yields ``inf``/``nan`` instead of raising.

    Raises
    ------
    AssertionError
        If ``in_array`` does not contain exactly three values.
    """
    # Validate first so a wrong window size fails before any fitting happens.
    assert len(in_array)==3

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1,1)

    reg.fit(X,y)
    intercept = reg.intercept_
    # coef_ is a length-1 array; take the scalar so that callers such as
    # rolling(...).apply receive a plain number rather than a 1-element array.
    slope = reg.coef_[0]

    return intercept/slope

In [78]:
# Rolling 3-sample regression rate for every series in country_list,
# stored as '<name>_DR'. Windows with fewer than 3 samples yield NaN.
days_back = 3
for country in country_list:
    windows = df_analyse[country].rolling(window=days_back, min_periods=days_back)
    df_analyse[country + '_DR'] = windows.apply(get_rate_via_regression, raw=False)

In [79]:
# Same rolling 3-sample regression rate, now on the filtered columns listed
# in filter_cols, stored as '<name>_filter_DR'.
days_back = 3
for country in filter_cols:
    windows = df_analyse[country].rolling(window=days_back, min_periods=days_back)
    df_analyse[country + '_DR'] = windows.apply(get_rate_via_regression, raw=False)

In [80]:
# Rolling doubling time for 'Germany' over windows of `days_back` samples.
# NOTE(review): `doubling_time` is only defined further down, in cell
# In [86]. On a fresh kernel this cell raises NameError unless that
# definition is executed first.
germany_windows = df_analyse['Germany'].rolling(window=days_back, min_periods=days_back)
df_analyse['Germany_DT_math'] = germany_windows.apply(doubling_time, raw=False)

In [81]:
# Inspect the column order; the plotting cells below select columns by
# integer position via .iloc.
df_analyse.columns

Index(['date', 'Italy', 'US', 'Spain', 'Germany', 'India', 'Korea, South',
       'Italy_filter', 'US_filter', 'Spain_filter', 'Germany_filter',
       'India_filter', 'Korea, South_filter', 'Italy_DR', 'US_DR', 'Spain_DR',
       'Germany_DR', 'India_DR', 'Korea, South_DR', 'US_filter_DR',
       'Spain_filter_DR', 'Germany_filter_DR', 'India_filter_DR',
       'Korea, South_filter_DR', 'Germany_DT_math'],
      dtype='object')

In [82]:
# Positions 13-16 are 'Italy_DR', 'US_DR', 'Spain_DR', 'Germany_DR'
# according to the column listing printed above.
start_pos = 40
dr_columns = [13, 14, 15, 16]
quick_plot(
    df_analyse.date[start_pos:],
    df_analyse.iloc[start_pos:, dr_columns],
    y_scale='log',
    slider=True,
)

In [83]:
# Positions, per the column listing printed above:
#   9 'Spain_filter', 10 'Germany_filter', 11 'India_filter',
#   12 'Korea, South_filter', 8 'US_filter', 6 'Korea, South', 7 'Italy_filter'
# NOTE(review): this mixes one raw series with filtered ones and contains no
# '*_filter_DR' column — confirm the positions are the intended ones.
start_pos = 40
selected_positions = [9, 10, 11, 12, 8, 6, 7]
quick_plot(
    df_analyse.date[start_pos:],
    df_analyse.iloc[start_pos:, selected_positions],
    y_scale='linear',
    slider=True,
)

In [84]:
# Compute '<name>_DR' for every raw and '_filter' series.
# Columns that are themselves derived ('_DR', '_DT_math', '_DT_wiki') are
# excluded. Taking *all* columns after 'date' would feed rate columns back
# into the regression (creating e.g. 'Italy_DR_DR') and add more columns on
# every re-run of this cell.
derived_suffixes = ('_DR', '_DT_math', '_DT_wiki')
candidate_columns = df_analyse.columns[1:]
country_list = candidate_columns[
    [not name.endswith(derived_suffixes) for name in candidate_columns]
]
for each in country_list:
    df_analyse[each+'_DR']=df_analyse[each].rolling(window=3,
                             min_periods=3).apply(get_rate_via_regression)

In [85]:
# Positions 8 and 14 are 'US_filter' and 'US_DR' per the column listing
# printed earlier.
# NOTE(review): this puts a filtered count series and a rate on one axis —
# confirm the positions are the intended ones.
us_positions = [8, 14]
quick_plot(
    df_analyse.date[40:],
    df_analyse.iloc[40:, us_positions],
    y_scale='log',
    slider=True,
)

In [86]:
def doubling_time (in_array):
    """Doubling time from the first and last sample of a window.

    Uses ``T_d = (t_last - t_first) * ln(2) / ln(y_last / y_first)`` with
    equally spaced samples one step apart.

    Parameters
    ----------
    in_array : array-like
        Consecutive samples of the growing quantity.

    Returns
    -------
    float
        Number of steps needed to double. If the first and last values are
        equal or the first value is zero, NumPy division yields
        ``inf``/``nan``/0 rather than raising.
    """
    y = np.array(in_array)
    # len(y) samples span only len(y) - 1 steps between first and last.
    # Using len(y) overestimates the doubling time (1.5 instead of 1.0 for
    # the series 1, 2, 4).
    elapsed_steps = len(y) - 1
    return elapsed_steps*np.log(2)/np.log(y[-1]/y[0])

In [87]:
# Rolling doubling time for 'Germany' over windows of 3 samples; windows
# with fewer than 3 samples yield NaN.
germany_windows = df_analyse['Germany'].rolling(window=3, min_periods=3)
df_analyse['Germany_DT_wiki'] = germany_windows.apply(doubling_time)