In [51]:
import os
import pandas as pd
import numpy as np


import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns


import plotly.graph_objects as go

from sklearn import linear_model
from scipy import signal


In [52]:
%matplotlib inline
mpl.rcParams['figure.figsize'] = (20, 16)
pd.set_option('display.max.rows',1000)
sns.set(style='darkgrid')
reg = linear_model.LinearRegression(fit_intercept=False)

![CRISP_DM](../reports/figures/CRISP_DM.png)

## Data Load

* The focus at this stage is usually a visual inspection of the data

In [53]:
# Read the semicolon-separated flat table; the first column is parsed as dates.
df_analyse = pd.read_csv(
    '../data/processed/COVID_small_flat_table.csv',
    sep=';',
    parse_dates=[0],
)
# Show the most recent rows (ascending sort by date is the default order).
df_analyse.sort_values('date').tail()

Unnamed: 0,date,Albania,Egypt,Germany,Italy,Spain,United Kingdom,US
885,2022-06-25,278793,515645,27771112,18184917,12681820,22786805,87023834
886,2022-06-26,279077,515645,27771911,18234242,12681820,22786805,87042385
887,2022-06-27,279077,515645,27914240,18259261,12681820,22834153,87167855
888,2022-06-28,279167,515645,28048190,18343422,12734038,22855565,87293787
889,2022-06-29,280298,515645,28180861,18438877,12734038,22895669,87487627


## Helper functions

In [54]:
def quick_plot(x_in, df_input, y_scale='log',slider=False):
    """ Quick basic plot for quick static evaluation of a time series

        you can push selective columns of your data frame by .iloc[:,[0,6,7,8]]

        Parameters:
        ----------
        x_in : array
            array of date time object, or array of numbers
        df_input : pandas dataframe
            the plotting matrix where each column is plotted
            the name of the column will be used for the legend
        y_scale: str
            y-axis scale as 'log' or 'linear'
        slider: bool
            True or False for x-axis slider

        Returns:
        ----------
        None
            the figure is displayed with fig.show() and not returned
    """
    fig = go.Figure()

    # one trace per column, labelled with the column name
    for each in df_input.columns:
        fig.add_trace(go.Scatter(
                x=x_in,
                y=df_input[each],
                opacity=0.8,
                name=each,
        ))
    fig.update_layout(autosize=True,
                      width = 1024,
                      height = 768,
                      font = dict(
                          family="PT Sans, monospace",
                          size = 18,
                          color= "#7f7f7f"
                      )
                      )
    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(tickangle=-45,
                     nticks=20,
                     tickfont=dict(size=14,color= "#7f7f7f")
                    )
    # truthiness test instead of comparing against the literal True
    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()
        
    

In [99]:
# Table dimensions as (rows, columns). The recorded (890, 30) comes from a later
# kernel state; right after the load above the table has 8 columns.
df_analyse.shape

(890, 30)

In [55]:
# Plot every column except the leading date column against the calendar date, linear y-axis.
quick_plot(df_analyse.date,
           df_analyse.iloc[:,1:],
           y_scale='linear',
           slider=True)

In [56]:
# Case count a column must exceed before its values enter the synchronised timeline below.
threshold=100

In [57]:
# For each column after the date column keep only the values above the
# threshold, so that every series starts at its own "day 0".
# Built with a comprehension (no unused enumerate index) and a single .loc
# selection instead of chained indexing.
compare_list=[
    df_analyse.loc[df_analyse[country]>threshold, country].to_numpy()
    for country in df_analyse.columns[1:]
]
                       

In [58]:
# One row per series, then transposed so that each series becomes a column.
# NOTE(review): the series presumably differ in length and are padded with NaN - confirm.
pd_sync_timelines=pd.DataFrame(compare_list, index=df_analyse.columns[1:]).T

In [59]:
# Despite its name, 'date' here is a plain row counter 0..n-1, not a calendar date.
pd_sync_timelines['date']=np.arange(pd_sync_timelines.shape[0])

In [60]:
# Preview the first rows of the synchronised timelines.
pd_sync_timelines.head()

Unnamed: 0,Albania,Egypt,Germany,Italy,Spain,United Kingdom,US,date
0,104.0,109.0,117.0,155.0,120.0,134.0,107.0,0
1,123.0,110.0,150.0,229.0,165.0,189.0,184.0,1
2,146.0,150.0,188.0,322.0,222.0,246.0,237.0,2
3,174.0,196.0,240.0,453.0,259.0,295.0,403.0,3
4,186.0,196.0,349.0,655.0,400.0,374.0,519.0,4


In [61]:
# Plot all synchronised series (every column except the trailing 'date' counter) on a log y-axis.
quick_plot(pd_sync_timelines.date,
           pd_sync_timelines.iloc[:,:-1],
           y_scale='log',
           slider=True)

$N(t)=N_0 \cdot 2^{t/T}$, where $N_0$ is the initial count and $T$ is the doubling time.

In [62]:
def doubling_rate(N_0,t,T_d):
    ''' Exponential growth curve N(t) = N_0 * 2**(t/T_d)

        Parameters:
        ----------
        N_0 : number
            value at t = 0
        t : number or numpy array
            time value(s) at which the curve is evaluated
        T_d : number
            doubling time, in the same unit as t

        Returns:
        ----------
        number or numpy array with the curve value for every t
    '''
    exponent = t/T_d
    growth_factor = np.power(2, exponent)
    return N_0*growth_factor

In [63]:
max_days=20

# Reference growth curves starting at 100 and evaluated for days 0..max_days-1,
# one curve per (legend label, doubling time in days) pair.
norm_slopes={
    label: doubling_rate(100,np.arange(max_days),T_d)
    for label,T_d in (('doubling every two days',2),
                      ('doubling every 4 days',4),
                      ('doubling every 10 days',10))
}

In [104]:
# Place the reference growth curves next to the synchronised timelines (column-wise concat on the row index).
pd_sync_timelines_w_slopes = pd.concat([pd.DataFrame(norm_slopes),pd_sync_timelines], axis=1)

In [105]:
# Preview the first rows only instead of rendering the whole frame.
pd_sync_timelines_w_slopes.head(10)

Unnamed: 0,doubling every two days,doubling every 4 days,doubling every 10 days,Albania,Egypt,Germany,Italy,Spain,United Kingdom,US,date
0,100.0,100.0,100.0,104.0,109.0,117.0,155.0,120.0,134.0,107.0,0
1,141.421356,118.920712,107.177346,123.0,110.0,150.0,229.0,165.0,189.0,184.0,1
2,200.0,141.421356,114.869835,146.0,150.0,188.0,322.0,222.0,246.0,237.0,2
3,282.842712,168.179283,123.114441,174.0,196.0,240.0,453.0,259.0,295.0,403.0,3
4,400.0,200.0,131.950791,186.0,196.0,349.0,655.0,400.0,374.0,519.0,4
5,565.685425,237.841423,141.421356,197.0,256.0,534.0,888.0,500.0,429.0,594.0,5
6,800.0,282.842712,151.571657,212.0,285.0,684.0,1128.0,673.0,483.0,782.0,6
7,1131.37085,336.358566,162.450479,223.0,294.0,847.0,1694.0,1073.0,630.0,1147.0,7
8,1600.0,400.0,174.110113,243.0,327.0,1112.0,2036.0,1695.0,888.0,1586.0,8
9,2262.7417,475.682846,186.606598,259.0,366.0,1296.0,2502.0,2277.0,1299.0,2219.0,9


In [65]:
# Plot the first five columns: the three reference curves plus the first two series (Albania, Egypt in the table above).
quick_plot(pd_sync_timelines_w_slopes.date,
           pd_sync_timelines_w_slopes.iloc[:,0:5],
           y_scale='log',
           slider=True)

### Understanding Linear Regression

In [66]:
# Regression setup: log of the Germany counts against a running day index.
l_vec=len(df_analyse['Germany'])
# Day index 0..l_vec-6 as a single-feature column vector.
X=np.arange(l_vec-5).reshape(-1,1)
# The first 5 rows are skipped; Germany is 0 there in the table preview, which has no finite log.
y=np.log(np.array(df_analyse['Germany'][5:]))

In [67]:
# Fit log(cases) against the day index with the current `reg` model.
reg.fit(X,y)

LinearRegression(fit_intercept=False)

In [68]:
# Predict for every table row. The model was trained with x = 0 at the first
# row that was kept (the leading rows were dropped), so the day index is
# shifted by that offset; otherwise the prediction column stored next to
# the table rows is displaced by the number of dropped rows.
X_hat=(np.arange(l_vec)-(l_vec-len(X))).reshape(-1,1)
Y_hat=reg.predict(X_hat)

In [106]:
# Independent copy of the date and Germany columns for inspecting the fit.
LR_inspect=df_analyse[['date','Germany']].copy()
# Preview the first rows only instead of rendering the whole frame.
LR_inspect.head(10)

Unnamed: 0,date,Germany
0,2020-01-22,0
1,2020-01-23,0
2,2020-01-24,0
3,2020-01-25,0
4,2020-01-26,0
5,2020-01-27,1
6,2020-01-28,4
7,2020-01-29,4
8,2020-01-30,4
9,2020-01-31,5


In [70]:
# The model was fitted on log(cases), so exponentiate to get back to case counts.
LR_inspect['prediction']=np.exp(Y_hat)

In [71]:
# Compare the Germany counts with the regression prediction on a log y-axis.
quick_plot(LR_inspect.date,
           LR_inspect.iloc[:,1:],
           y_scale='log',
           slider=True)

### Doubling Rate - Piecewise Linear Regression

In [107]:
# Replace the shared model with one that also estimates an intercept.
reg = linear_model.LinearRegression(fit_intercept=True)
# X and y are rebuilt exactly as in the first regression cell (Germany, first 5 rows skipped).
l_vec=len(df_analyse['Germany'])
X=np.arange(l_vec-5).reshape(-1,1)
y=np.log(np.array(df_analyse['Germany'][5:]))


In [108]:
# Preview the first rows only instead of rendering the whole table.
df_analyse.head(10)

Unnamed: 0,date,Albania,Egypt,Germany,Italy,Spain,United Kingdom,US
0,2020-01-22,0,0,0,0,0,0,1
1,2020-01-23,0,0,0,0,0,0,1
2,2020-01-24,0,0,0,0,0,0,2
3,2020-01-25,0,0,0,0,0,0,2
4,2020-01-26,0,0,0,0,0,0,5
5,2020-01-27,0,0,1,0,0,0,5
6,2020-01-28,0,0,4,0,0,0,5
7,2020-01-29,0,0,4,0,0,0,6
8,2020-01-30,0,0,4,0,0,0,6
9,2020-01-31,0,0,5,2,0,2,8


In [102]:
# Re-read the same file as in the Data Load section; this resets df_analyse to the columns stored in the CSV.
df_analyse=pd.read_csv('../data/processed/COVID_small_flat_table.csv',sep=';',
                       parse_dates=[0])

In [103]:
# Preview the first rows only instead of rendering the whole table.
df_analyse.head(10)

Unnamed: 0,date,Albania,Egypt,Germany,Italy,Spain,United Kingdom,US
0,2020-01-22,0,0,0,0,0,0,1
1,2020-01-23,0,0,0,0,0,0,1
2,2020-01-24,0,0,0,0,0,0,2
3,2020-01-25,0,0,0,0,0,0,2
4,2020-01-26,0,0,0,0,0,0,5
5,2020-01-27,0,0,1,0,0,0,5
6,2020-01-28,0,0,4,0,0,0,5
7,2020-01-29,0,0,4,0,0,0,6
8,2020-01-30,0,0,4,0,0,0,6
9,2020-01-31,0,0,5,2,0,2,8


In [74]:
# Country columns = everything after the date column, excluding columns this
# notebook derives itself. Without the exclusion, re-running the cell on an
# already processed table would filter the derived columns again and create
# names such as '<country>_filter_filter'.
country_list=[col for col in df_analyse.columns[1:]
              if not col.endswith(('_filter','_DR','_DT'))]
for each in country_list:
    # Savitzky-Golay smoothing of each country's series
    df_analyse[each+'_filter']=signal.savgol_filter(df_analyse[each],
                           5, # window size used for filtering
                           1) # order of fitted polynomial

In [75]:
# Names of the smoothed columns, in the same order as the country columns.
filter_cols=[name+'_filter' for name in ('Albania',
                                         'Egypt',
                                         'Germany',
                                         'Italy',
                                         'Spain',
                                         'United Kingdom',
                                         'US')]


In [100]:
# Plot the smoothed series from row start_pos onwards; dates and values are sliced identically.
start_pos=5
quick_plot(df_analyse.date[start_pos:],
           df_analyse[filter_cols].iloc[start_pos:,:],
           y_scale='log',
           slider=True
          )

In [101]:
# Inspect the current column names and their order.
df_analyse.columns

Index(['date', 'Albania', 'Egypt', 'Germany', 'Italy', 'Spain',
       'United Kingdom', 'US', 'Albania_filter', 'Egypt_filter',
       'Germany_filter', 'Italy_filter', 'Spain_filter',
       'United Kingdom_filter', 'US_filter', 'Albania_DR', 'Egypt_DR',
       'Germany_DR', 'Italy_DR', 'Spain_DR', 'United Kingdom_DR', 'US_DR',
       'Albania_filter_DR', 'Egypt_filter_DR', 'Germany_filter_DR',
       'Italy_filter_DR', 'Spain_filter_DR', 'United Kingdom_filter_DR',
       'US_filter_DR', 'Germany_DT'],
      dtype='object')

In [77]:
# Fit the current `reg` model on the X, y arrays prepared for the Germany series.
reg.fit(X,y)

LinearRegression()

In [78]:
def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling time

        A straight line y = intercept + slope*x is fitted through three
        consecutive samples placed at x = -1, 0, 1. The ratio intercept/slope
        is the number of time steps the fitted line needs to grow by the
        value it has at the centre of the window.

        The fit is computed in closed form inside the function, so the result
        does not depend on (and does not re-fit) any shared model object.

        Parameters:
        ----------
        in_array : array-like of length 3
            three consecutive values of the time series

        Returns:
        ----------
        float
            intercept/slope; +/-inf for a flat window with non-zero level,
            nan when level and slope are both zero
    '''
    # validate before doing any work
    assert len(in_array)==3

    y = np.asarray(in_array, dtype=float)
    x = np.arange(-1,2)

    # Ordinary least squares in closed form: x is centred on zero, so the
    # intercept is the mean of y and the slope is sum(x*y)/sum(x*x).
    intercept = float(y.mean())
    slope = float(np.dot(x, y)/np.dot(x, x))

    if slope == 0:
        # no growth inside the window: the ratio is unbounded (or undefined)
        if intercept == 0:
            return float('nan')
        return float(np.copysign(np.inf, intercept))

    return intercept/slope

In [79]:
days_back = 3 # this gives a smoothing effect
# One estimate per country over a sliding window of `days_back` rows; rows
# without a full window stay NaN (min_periods). raw=True hands plain numpy
# arrays to the function, which only needs the values of each window.
for country in country_list:
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=True)

In [97]:
start_pos=40
# Select the plotted columns by name rather than by position (20, 21), so the
# plot does not silently change when columns are added or reordered.
quick_plot(df_analyse.date[start_pos:],
           df_analyse[['United Kingdom_DR','US_DR']].iloc[start_pos:],
           y_scale='linear',
           slider=True)

In [93]:
days_back = 3 # this gives a smoothing effect
# Same sliding-window estimate as for the raw counts, applied to the smoothed
# columns; raw=True hands plain numpy arrays to the function.
for country in filter_cols:
    df_analyse[country+'_DR']=df_analyse[country].rolling(
                                window=days_back,
                                min_periods=days_back).apply(get_doubling_time_via_regression, raw=True)

In [98]:
start_pos=40
# Select the plotted columns by name rather than by position (27, 28), so the
# plot does not silently change when columns are added or reordered.
quick_plot(df_analyse.date[start_pos:],
           df_analyse[['United Kingdom_filter_DR','US_filter_DR']].iloc[start_pos:],
           y_scale='linear',
           slider=True)

In [96]:
# Inspect the current column names and their order.
df_analyse.columns

Index(['date', 'Albania', 'Egypt', 'Germany', 'Italy', 'Spain',
       'United Kingdom', 'US', 'Albania_filter', 'Egypt_filter',
       'Germany_filter', 'Italy_filter', 'Spain_filter',
       'United Kingdom_filter', 'US_filter', 'Albania_DR', 'Egypt_DR',
       'Germany_DR', 'Italy_DR', 'Spain_DR', 'United Kingdom_DR', 'US_DR',
       'Albania_filter_DR', 'Egypt_filter_DR', 'Germany_filter_DR',
       'Italy_filter_DR', 'Spain_filter_DR', 'United Kingdom_filter_DR',
       'US_filter_DR', 'Germany_DT'],
      dtype='object')

In [83]:
def doubling_time(in_array):
    ''' Use a classical doubling time formula,
     see https://en.wikipedia.org/wiki/Doubling_time

     T_d = (t_end - t_start) * ln(2) / ln(y_end / y_start)

     The samples are assumed to be one time step apart, so n samples span
     n-1 time steps (three samples cover two days, not three).

     Parameters:
     ----------
     in_array : array-like
         consecutive values of the time series, oldest first

     Returns:
     ----------
     float
         doubling time in time steps (negative for a shrinking series);
         inf when first and last value are equal;
         nan when fewer than two samples are given or when the first or
         last value is not positive (the ratio is undefined)
    '''
    y = np.asarray(in_array, dtype=float)
    elapsed = len(y)-1  # number of time steps between first and last sample

    if elapsed < 1 or y[0] <= 0 or y[-1] <= 0:
        return np.nan
    if y[-1] == y[0]:
        # no growth inside the window
        return np.inf
    return elapsed*np.log(2)/np.log(y[-1]/y[0])

In [84]:
# Doubling time for Germany from the closed-form formula, over a sliding 3-row window (NaN until the window is full).
df_analyse['Germany_DT']=df_analyse['Germany'].rolling(window=3, min_periods=3).apply(doubling_time)

In [85]:
# Slice the date axis exactly like the plotted rows. Previously the full date
# column was paired with a frame that starts at row 40, so x and y differed
# in length and the values were drawn against the wrong dates.
start_pos=40
# NOTE(review): columns 6 and 7 are 'United Kingdom' and 'US' (raw counts),
# not the Germany_DT column computed just above - confirm the intended columns.
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[6,7]],
           y_scale='linear')