![](CRISP_DM.png)

In [None]:
#required packages
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
from scipy import signal
# for plotting
%matplotlib inline
mpl.rcParams['figure.figsize'] = (16, 10)
pd.set_option('display.max_rows', 500)
import plotly.graph_objects as go

In [None]:
# package for linear regression
from sklearn import linear_model

In [None]:
# Load the processed flat table (semicolon-separated CSV) into a DataFrame.
# Later cells treat the first column as 'date' and every remaining column as one country.
df_analyse=pd.read_csv('../data/processed/COVID_small_flat_table.csv',sep=';')  
# Preview the first rows in ascending date order (display only; df_analyse itself is not re-ordered).
df_analyse.sort_values('date',ascending=True).head()

In [None]:
# All column labels except the first one (the first column is skipped as the date column).
country_list=df_analyse.columns[1:]

## 60.i functions

In [None]:
# quick plotting function to plot time series
def quick_plot(x_in, df_input, y_scale='log', slider=False,
               x_title='Timeline in days',
               y_title='Confirmed infected people (Source:Johns-hopkins csse)'):
    """ Quick basic plot for quick static evaluation of a time series

        You can push selected columns of your data frame by .iloc[:,[0,6,7,8]]

        Parameters:
        ----------
        x_in : array
            array of date time objects, or array of numbers (shared x values)
        df_input : pandas dataframe
            the plotting matrix where each column is plotted as one trace;
            the name of the column is used for the legend
        y_scale : str
            y-axis scale as 'log' or 'linear'
        slider : bool
            True to show a range slider below the x-axis
        x_title : str
            x-axis title (defaults to the original hard-coded label)
        y_title : str
            y-axis title (defaults to the original hard-coded label);
            override it when the plotted columns are not case counts

        Returns:
        ----------
        None
            the figure is rendered with fig.show() and not returned
    """
    fig = go.Figure()

    # one line trace per column, all sharing the same x values
    for each in df_input.columns:
        fig.add_trace(go.Scatter(x=x_in, y=df_input[each], name=each, opacity=1.0))

    fig.update_layout(autosize=True, width=800, height=800,
                      xaxis_title=x_title,
                      yaxis_title=y_title,
                      font=dict(family="PT Sans, monospace", size=14, color="#850303"))
    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(tickangle=-45, nticks=20, tickfont=dict(size=12, color="#850303"))
    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()

In [None]:
# Smoke test of quick_plot: every column after the first plotted against the date column,
# on a linear y-axis with the range slider enabled.
quick_plot(df_analyse.date,df_analyse.iloc[:,1:],y_scale='linear', slider=True)

### 60.i.a. Modification of dataframe structure

In [None]:
# a data point is kept only if its value is strictly greater than this threshold
threshold=100
# one array per column after the first, holding only that column's values above the threshold;
# the arrays can have different lengths because each column is filtered independently
empty_list=[
    np.array(df_analyse[name][df_analyse[name]>threshold])
    for name in df_analyse.columns[1:]
]

In [None]:
# Convert the list of per-column arrays into a DataFrame (one row per original column),
# then transpose it so that each original column becomes a column again.
pd_sync_timeline=pd.DataFrame(empty_list,index=df_analyse.columns[1:]).T

In [None]:
# Add a 'date' column that is a plain running index 0..n_rows-1 (not a calendar date):
# row k holds each column's k-th value above the threshold.
pd_sync_timeline['date']=np.arange(pd_sync_timeline.shape[0])
pd_sync_timeline.head()

In [None]:
# Plot the synchronised timelines on a log y-axis; the last column ('date') is the x-axis
# and is therefore excluded from the plotted columns via iloc[:,:-1].
quick_plot(pd_sync_timeline.date,pd_sync_timeline.iloc[:,:-1],y_scale='log',slider=True)

## 60.ii  Doubling Rate
*  Formula: $N(t)=N_0 \cdot 2^{t/T}$, where $N_0$ is the initial count, $t$ the elapsed time and $T$ the doubling time

In [None]:
# exponential growth curve for a given doubling time
def calc_double_rate(N_0,t,T_d):
    """Evaluate N(t) = N_0 * 2**(t / T_d).

    Parameters:
    ----------
    N_0 : number
        value at t = 0
    t : number or array
        elapsed time(s), in the same unit as T_d
    T_d : number
        doubling time

    Returns:
    ----------
    N_0 scaled by 2 raised to t/T_d (element-wise when t is an array)
    """
    exponent = t/T_d
    growth_factor = np.power(2, exponent)
    return N_0*growth_factor

In [None]:
# NOTE(review): max_days is assigned here but not referenced by the curves below,
# which are evaluated on a fixed range of 20 steps — confirm whether it was meant to be used.
max_days=34
# reference growth curves starting at 100, evaluated for steps 0..19,
# one curve per doubling interval (2, 4 and 10 steps)
norm_slopes={
    'doubling every two days':calc_double_rate(100,np.arange(20),2),
    'doubling every 4 days':calc_double_rate(100,np.arange(20),4),
    'doubling every 10 days':calc_double_rate(100,np.arange(20),10),
}

In [None]:
# Turn the reference curves into a DataFrame and place them column-wise (axis=1) in front of
# the synchronised timelines; rows are aligned on the index, so a shorter frame is padded with NaN.
pd_sync_tl_with_slope=pd.concat([pd.DataFrame(norm_slopes),pd_sync_timeline], axis=1)
pd_sync_tl_with_slope.head()

In [None]:
# Plot the first five columns (the three reference curves come first in the concatenated frame)
# against the running 'date' index on a log y-axis.
quick_plot(pd_sync_tl_with_slope.date,pd_sync_tl_with_slope.iloc[:,0:5],
           y_scale='log',slider=True)

In [None]:
# Write the combined table to a semicolon-separated CSV file, without the row index.
pd_sync_tl_with_slope.to_csv('../data/processed/COVID_small_sync_timeline_table.csv',sep=';',index=False)

## 60.iii Linear Regression

In [None]:
# Linear model WITHOUT an intercept term: the fitted line is forced through the origin,
# so reg.intercept_ is 0.0 after fitting.
reg = linear_model.LinearRegression(fit_intercept=False)

In [None]:
# linear regression for the US 
# Number of rows in the US column.
l_vec=len(df_analyse['US'])
# Feature matrix: step indices 0..l_vec-6 as a single column (shape (l_vec-5, 1)).
X=np.arange(l_vec-5).reshape(-1, 1)
# Target: natural log of the US values from row 5 onward, so X index 0 corresponds to row 5.
# NOTE(review): np.log gives -inf for a zero value — assumes no zeros in this slice, TODO confirm.
y=np.log(np.array(df_analyse['US'][5:]))

In [None]:
# Fit the intercept-free model on the step index vs. the log of the US values.
reg.fit(X,y)

In [None]:
# Predict log-values for step indices 0..l_vec-1 (one prediction per row of the table).
# NOTE(review): the model was fitted with index 0 mapped to row 5 of the data, while these
# predictions are later stored against rows starting at 0 — this looks like a 5-row offset;
# confirm that it is intended.
X_hat=np.arange(l_vec).reshape(-1, 1)
Y_hat=reg.predict(X_hat)

In [None]:
# Independent copy of the 'date' and 'US' columns, so the added column does not touch df_analyse.
LR=df_analyse[['date','US']].copy()
# The model was fitted on log-values, so exponentiate to get predictions on the original scale.
LR['prediction']=np.exp(Y_hat)

In [None]:
# Plot the original US values and the regression prediction on a log y-axis.
quick_plot(LR.date,LR.iloc[:,1:],y_scale='log',slider=True)

In [None]:
# Linear model WITH an intercept term.
# NOTE(review): no later cell visible in this notebook references `regression`;
# the functions below use the global `reg` instead — confirm which model was intended.
regression = linear_model.LinearRegression(fit_intercept=True)

In [None]:
# Re-read the flat table from disk (this replaces the df_analyse object loaded earlier)
# and rebuild the country list from every column except the first.
df_analyse=pd.read_csv('../data/processed/COVID_small_flat_table.csv',sep=';')  
country_list=df_analyse.columns[1:]

In [None]:
# Smooth every country column with a Savitzky-Golay filter
# (window length 5, polynomial order 1) and store the result as '<country>_filter'.
for country in country_list:
    smoothed = signal.savgol_filter(df_analyse[country], 5, 1)
    df_analyse[country+'_filter'] = smoothed

In [None]:
# Names of the smoothed columns used for the plots and the doubling-time calculation below.
filter_cols=['Italy_filter','US_filter', 'Spain_filter', 'Germany_filter', 'Brazil_filter']

In [None]:
# Plot the smoothed columns on a log y-axis, skipping the first start_pos rows.
start_pos=5
quick_plot(df_analyse.date[start_pos:],
           df_analyse[filter_cols].iloc[start_pos:,:], #['US','US_filter']
           y_scale='log',slider=True)

In [None]:
# Preview the table, which now also contains the '<country>_filter' columns.
df_analyse.head()

In [None]:
# function for calculation of doubling time through regression
def cal_double_t_reg(in_array):
    ''' Use a linear regression to approximate the doubling time.

        A straight line (with intercept) is fitted by least squares through three
        consecutive values placed at x = -1, 0, 1. The returned ratio
        intercept/slope is the number of steps the fitted line needs to add one
        more "intercept" worth of cases, i.e. the doubling time under linear
        extrapolation of the current window.

        The fit is computed directly with NumPy and does not depend on any
        module-level regression object; an intercept-free model would always
        yield an intercept of 0 and therefore a result of 0.

        Parameters:
        ----------
        in_array : array-like of length 3
            three consecutive values (list, numpy array or pandas Series)

        Returns:
        ----------
        float
            intercept/slope of the fitted line; inf (or -inf) when the slope is 0
            and the intercept is not, nan when both are 0

        Raises:
        ----------
        AssertionError
            if in_array does not contain exactly three values
    '''
    assert len(in_array)==3

    y = np.asarray(in_array, dtype=float)
    x = np.arange(-1,2)

    # x is centred on 0, so the least-squares solution simplifies to
    # slope = sum(x*y)/sum(x*x) and intercept = mean(y)
    slope = np.dot(x, y)/np.dot(x, x)
    intercept = y.mean()

    # a flat window has slope 0: return inf/nan quietly instead of emitting a RuntimeWarning
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(intercept/slope)

In [None]:
# function for doubling time by numpy package
def cal_double_t(in_array):
    ''' Use the classical doubling time formula,
        see https://en.wikipedia.org/wiki/Doubling_time

        T_d = (t_last - t_first) * ln(2) / ln(y_last / y_first)

        The samples are taken to be equally spaced by one step, so the elapsed
        time between the first and the last sample is len(y) - 1 steps
        (not len(y)).

        Parameters:
        ----------
        in_array : array-like
            consecutive values (list, numpy array or pandas Series)

        Returns:
        ----------
        float
            doubling time in steps; inf when first and last value are equal,
            nan/inf when the first value is 0 or only one value is given
    '''
    y = np.asarray(in_array, dtype=float)
    elapsed_steps = len(y) - 1

    # equal end points (log(1) = 0) or a zero first value lead to a division by zero:
    # return inf/nan quietly instead of emitting a RuntimeWarning
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(elapsed_steps*np.log(2)/np.log(y[-1]/y[0]))

In [None]:
# Doubling time per country from a regression over the last `days_back` rows;
# the first days_back-1 rows of each result are NaN because min_periods equals the window size.
days_back = 3 # this gives a smoothing effect
for country in country_list:
    rolling_windows = df_analyse[country].rolling(window=days_back, min_periods=days_back)
    df_analyse[country+'_DR'] = rolling_windows.apply(cal_double_t_reg, raw=False)

In [None]:
# Same rolling doubling-time calculation, applied to the smoothed columns;
# results are stored as '<country>_filter_DR'.
days_back = 3
for filtered_name in filter_cols:
    rolling_windows = df_analyse[filtered_name].rolling(window=days_back, min_periods=days_back)
    df_analyse[filtered_name+'_DR'] = rolling_windows.apply(cal_double_t_reg, raw=False)

In [None]:
# Mathematical verification: compute the doubling time for the raw 'Germany' column with the
# closed-form formula (cal_double_t) over the same rolling window, for comparison with the
# regression-based values.
df_analyse['Germany_DR_cross_check']=df_analyse['Germany']\
    .rolling(window=days_back,min_periods=days_back)\
    .apply(cal_double_t, raw=False)

In [None]:
# run all filtered data
# NOTE(review): this cell repeats the earlier rolling calculation on filter_cols verbatim;
# it overwrites the '<country>_filter_DR' columns with the same computation.
days_back = 3 # this gives a smoothing effect
for pos,country in enumerate(filter_cols):
    df_analyse[country+'_DR']=df_analyse[country]\
    .rolling(window=days_back,min_periods=days_back)\
    .apply(cal_double_t_reg, raw=False)

In [None]:
# List all column labels; the plots below select columns by integer position.
df_analyse.columns

In [None]:
# Plot four columns selected by integer position, from row start_pos onward, on a linear y-axis.
# NOTE(review): positions 11-14 are presumably '<country>_DR' columns; this depends on the
# number and order of columns in the CSV — verify against df_analyse.columns.
start_pos=40
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[11,12,13,14]], 
           y_scale='linear',slider=True)

In [None]:
# Plot four further columns selected by integer position, from row start_pos onward.
# NOTE(review): positions 16-19 are presumably '<country>_filter_DR' columns; this depends on
# the number and order of columns in the CSV — verify against df_analyse.columns.
start_pos=40
quick_plot(df_analyse.date[start_pos:],
           df_analyse.iloc[start_pos:,[16,17,18,19]], #17,18,19   # US comparison 12,17
           y_scale='linear',slider=True)