# Data Load


In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np

%matplotlib inline
mpl.rcParams['figure.figsize']=(16,10)
pd.set_option('display.max_rows',500)

import plotly.graph_objects as go

In [3]:
# Load the processed flat table through a path relative to the notebook
# (the same '../data/processed' directory the sync-timeline CSV is written to
# further down) instead of a machine-specific absolute path, so the notebook
# also runs on other checkouts of the repository.
df_analyse = pd.read_csv('../data/processed/COVID_small_flat_table.csv',
                         sep=';',
                         parse_dates=[0])  # first CSV column is parsed as datetime
# Preview the last rows in chronological order.
df_analyse.sort_values('date',ascending=True).tail()

Unnamed: 0,data,Italy,US,Spain,Germany,"Korea, South",date
864,2022-06-04,17490451,84748884,12403245,26493235,18163686,2022-06-04
865,2022-06-05,17505973,84762022,12403245,26496611,18168708,2022-06-05
866,2022-06-06,17514589,84882287,12403245,26498361,18174880,2022-06-06
867,2022-06-07,17543136,85003945,12436538,26583016,18188200,2022-06-07
868,2022-06-08,17566061,85214036,12436538,26660652,18200346,2022-06-08


# Helper Functions

In [4]:
def quick_plot(x_in, df_input,y_scale='log',slider=False):
    """ Quick basic plot for a fast visual evaluation of a time series

        Adds one plotly Scatter trace per column of df_input, all sharing x_in
        as x values, and displays the figure with fig.show().
        You can push selective columns of your data frame by .iloc[:,[0,6,7,8]]

        Parameters:
        ----------
        x_in : array
            array of date time object, or array of numbers
        df_input : pandas dataframe
            the plotting matrix where each column is plotted
            the name of the column will be used for the legend
        y_scale : str
            y-axis scale as 'log' or 'linear'
        slider : bool
            True or False for x-axis slider

        Returns:
        ----------
        None
            the figure is displayed, not returned
    """
    fig = go.Figure()

    # One trace per column; the column name becomes the legend entry.
    for column_name in df_input.columns:
        fig.add_trace(go.Scatter(
                        x=x_in,
                        y=df_input[column_name],
                        name=column_name,
                        opacity=0.8))

    fig.update_layout(autosize=True,
        width=1024,
        height=768,
        font=dict(
            family="PT Sans, monospace",
            size=18,
            color="#7f7f7f"
            )
        )
    fig.update_yaxes(type=y_scale)
    fig.update_xaxes(tickangle=-45,
                 nticks=20,
                 tickfont=dict(size=14,color="#7f7f7f")
                )
    # Truthiness check instead of comparing against True.
    if slider:
        fig.update_layout(xaxis_rangeslider_visible=True)
    fig.show()
      

In [5]:
# Plot only the numeric (case count) columns. Per the table preview shown after
# loading, 'date' is the last column of df_analyse, so slicing with .iloc[:,1:]
# would also pass the date column to the plot as a y-series.
quick_plot(df_analyse.date,
           df_analyse.select_dtypes(include='number'),
           y_scale='linear',
           slider=True)

In [20]:
# Cut-off value: when compare_list is built, only the entries of each country
# column that are greater than this value are kept.
threshold=25000

In [21]:
# For each of the four country columns at positions 1-4 of df_analyse, keep only
# the values above `threshold`, as a plain numpy array (original index dropped).
# A single .loc lookup replaces the chained indexing, and the unused
# enumerate counter is gone.
compare_list=[
    df_analyse.loc[df_analyse[country] > threshold, country].to_numpy()
    for country in df_analyse.columns[1:5]
]

In [22]:
# Inspect the collected arrays (one array per selected country column).
compare_list

[array([   27980,    31506,    35713,    41035,    47021,    53578,
           59138,    63927,    69176,    74386,    80589,    86498,
           92472,    97689,   101739,   105792,   110574,   115242,
          119827,   124632,   128948,   132547,   135586,   139422,
          143626,   147577,   152271,   156363,   159516,   162488,
          165155,   168941,   172434,   175925,   178972,   181228,
          183957,   187327,   189973,   192994,   195351,   197675,
          199414,   201505,   203591,   205463,   207428,   209328,
          210717,   211938,   213013,   214457,   215858,   217185,
          218268,   219070,   219814,   221216,   222104,   223096,
          223885,   224760,   225435,   225886,   226699,   227364,
          228006,   228658,   229327,   229858,   230158,   230555,
          231139,   231732,   232248,   232664,   232997,   233197,
          233515,   233836,   234013,   234531,   234801,   234998,
          235278,   235561,   235763,   236142, 

In [23]:

# Build one row per country from the arrays, then transpose so that every
# country becomes a column and the row number is the position in its array.
pd_sync_timelines = pd.DataFrame(
    data=compare_list,
    index=df_analyse.columns[1:5],
).transpose()

In [24]:
# Show the synchronized table; shorter country series are padded with NaN.
pd_sync_timelines

Unnamed: 0,Italy,US,Spain,Germany
0,27980.0,26025.0,25374.0,27436.0
1,31506.0,34944.0,28768.0,31554.0
2,35713.0,46096.0,35136.0,36508.0
3,41035.0,56714.0,39885.0,42288.0
4,47021.0,68841.0,49515.0,48582.0
...,...,...,...,...
810,17490451.0,,,
811,17505973.0,,,
812,17514589.0,,,
813,17543136.0,,,


In [25]:
# 'date' here is a running row counter (0, 1, 2, ...), not a calendar date.
pd_sync_timelines['date'] = np.arange(len(pd_sync_timelines))

In [26]:
# Preview the first rows, now including the running 'date' counter column.
pd_sync_timelines.head()

Unnamed: 0,Italy,US,Spain,Germany,date
0,27980.0,26025.0,25374.0,27436.0,0
1,31506.0,34944.0,28768.0,31554.0,1
2,35713.0,46096.0,35136.0,36508.0,2
3,41035.0,56714.0,39885.0,42288.0,3
4,47021.0,68841.0,49515.0,48582.0,4


In [27]:
# Plot every column except the last one ('date'), which serves as the x-axis.
quick_plot(
    x_in=pd_sync_timelines.date,
    df_input=pd_sync_timelines.iloc[:, :-1],
    y_scale='log',
    slider=True,
)

# Doubling Rate
$N(t) = N_0 \cdot 2^{t/T_d}$, where $N_0$ is the initial count, $t$ the elapsed time and $T_d$ the doubling time.

In [29]:
def doubling_rate(N_0,t,T_d):
    """Evaluate the exponential growth curve N(t) = N_0 * 2**(t / T_d).

    Parameters:
    ----------
    N_0 : number
        value of the curve at t = 0
    t : number or array-like
        time points at which the curve is evaluated; lists, ranges and
        numpy arrays are all accepted
    T_d : number
        doubling time, in the same unit as t

    Returns:
    ----------
    numpy float or numpy array
        curve values, one per entry of t
    """
    # Coerce to a float array so that plain Python sequences can be divided by T_d.
    t = np.asarray(t, dtype=float)
    return N_0*np.power(2,t/T_d)

In [30]:
# Example: start at 100 and double every day over max_days days.
max_days = 800
doubling_rate(N_0=100, t=np.arange(max_days), T_d=1)

array([1.00000000e+002, 2.00000000e+002, 4.00000000e+002, 8.00000000e+002,
       1.60000000e+003, 3.20000000e+003, 6.40000000e+003, 1.28000000e+004,
       2.56000000e+004, 5.12000000e+004, 1.02400000e+005, 2.04800000e+005,
       4.09600000e+005, 8.19200000e+005, 1.63840000e+006, 3.27680000e+006,
       6.55360000e+006, 1.31072000e+007, 2.62144000e+007, 5.24288000e+007,
       1.04857600e+008, 2.09715200e+008, 4.19430400e+008, 8.38860800e+008,
       1.67772160e+009, 3.35544320e+009, 6.71088640e+009, 1.34217728e+010,
       2.68435456e+010, 5.36870912e+010, 1.07374182e+011, 2.14748365e+011,
       4.29496730e+011, 8.58993459e+011, 1.71798692e+012, 3.43597384e+012,
       6.87194767e+012, 1.37438953e+013, 2.74877907e+013, 5.49755814e+013,
       1.09951163e+014, 2.19902326e+014, 4.39804651e+014, 8.79609302e+014,
       1.75921860e+015, 3.51843721e+015, 7.03687442e+015, 1.40737488e+016,
       2.81474977e+016, 5.62949953e+016, 1.12589991e+017, 2.25179981e+017,
       4.50359963e+017, 9

In [31]:
max_days = 800

# Reference curves starting at 100 with doubling times of 2, 4 and 10 days;
# the labels are used as column names later on.
norm_slopes = {
    label: doubling_rate(100, np.arange(max_days), doubling_time)
    for label, doubling_time in (
        ('doubling every two days', 2),
        ('doubling every 4 days', 4),
        ('doubling every 10 days', 10),
    )
}

In [32]:
# Preview: reference curves and country timelines side by side, aligned on the row index.
pd.concat(
    objs=[pd.DataFrame(norm_slopes), pd_sync_timelines],
    axis='columns',
)

Unnamed: 0,doubling every two days,doubling every 4 days,doubling every 10 days,Italy,US,Spain,Germany,date
0,100.000000,100.000000,100.000000,27980.0,26025.0,25374.0,27436.0,0
1,141.421356,118.920712,107.177346,31506.0,34944.0,28768.0,31554.0,1
2,200.000000,141.421356,114.869835,35713.0,46096.0,35136.0,36508.0,2
3,282.842712,168.179283,123.114441,41035.0,56714.0,39885.0,42288.0,3
4,400.000000,200.000000,131.950791,47021.0,68841.0,49515.0,48582.0,4
...,...,...,...,...,...,...,...,...
810,,,,17490451.0,,,,810
811,,,,17505973.0,,,,811
812,,,,17514589.0,,,,812
813,,,,17543136.0,,,,813


In [33]:
# Keep the combined table: reference curves first, then the country timelines
# and the running 'date' counter, joined column-wise on the row index.
pd_sync_timelines_w_slope = pd.concat(
    [pd.DataFrame(data=norm_slopes), pd_sync_timelines],
    axis='columns',
)

In [34]:
# Show the combined table of reference curves and country timelines.
pd_sync_timelines_w_slope

Unnamed: 0,doubling every two days,doubling every 4 days,doubling every 10 days,Italy,US,Spain,Germany,date
0,100.000000,100.000000,100.000000,27980.0,26025.0,25374.0,27436.0,0
1,141.421356,118.920712,107.177346,31506.0,34944.0,28768.0,31554.0,1
2,200.000000,141.421356,114.869835,35713.0,46096.0,35136.0,36508.0,2
3,282.842712,168.179283,123.114441,41035.0,56714.0,39885.0,42288.0,3
4,400.000000,200.000000,131.950791,47021.0,68841.0,49515.0,48582.0,4
...,...,...,...,...,...,...,...,...
810,,,,17490451.0,,,,810
811,,,,17505973.0,,,,811
812,,,,17514589.0,,,,812
813,,,,17543136.0,,,,813


In [37]:
# Plot all reference curves and all country timelines against the row counter.
# The previous slice .iloc[:,0:6] stopped before the seventh column (Germany),
# silently leaving it out; dropping only the 'date' column keeps every series.
quick_plot(pd_sync_timelines_w_slope.date,
           pd_sync_timelines_w_slope.drop(columns=['date']),
           y_scale='log',
           slider=True)

In [38]:
# Persist the combined table as a ';'-separated CSV without the row index.
pd_sync_timelines_w_slope.to_csv(
    '../data/processed/COVID_small_sync_timeline_table.csv',
    sep=';',
    index=False,
)

# Understanding Linear Regression

In [5]:

from sklearn import linear_model
#from sklearn.linear_model import LinearRegression
#reg = linear_model.LinearRegression(fit_intercept=False)

In [None]:
# Regression inputs: row position -> log of the Germany column.
# The first 5 rows are skipped before taking the logarithm (presumably because
# the earliest entries are zero and log(0) is -inf; verify against the data).
# X carries the real row positions 5, 6, ... of the fitted samples. It used to
# start at 0, so predictions made over np.arange(l_vec) were shifted by five
# rows relative to the dates they are compared against.
l_vec=len(df_analyse['Germany'])
X=np.arange(5,l_vec).reshape(-1, 1)
y=np.log(np.array(df_analyse['Germany'][5:]))

In [None]:
# `reg` is not created anywhere in the visible notebook (its only definition is
# commented out), so this cell raised NameError on a fresh kernel. Create the
# estimator here. The intercept is fitted because the model is fitted on
# log values: log N(t) = log N_0 + t * log(2) / T_d has a non-zero offset log N_0.
reg = linear_model.LinearRegression(fit_intercept=True)
reg.fit(X,y)

In [None]:
# Predict for every row position of the data set; the model expects a column vector.
X_hat = np.arange(l_vec)[:, np.newaxis]
Y_hat = reg.predict(X_hat)

In [None]:
# Independent copy of the two columns needed to compare data and fit.
LR_inspect = df_analyse.loc[:, ['date', 'Germany']].copy()

In [None]:
# The regression target was the logarithm of the Germany column, so the
# predictions are exponentiated to bring them back to the original scale.
LR_inspect['prediction']=np.exp(Y_hat)

In [None]:
# Compare the Germany column with the fitted prediction on a log scale;
# every column after 'date' is plotted.
quick_plot(
    x_in=LR_inspect.date,
    df_input=LR_inspect.iloc[:, 1:],
    y_scale='log',
    slider=True,
)