In [23]:
import pandas as pd
import numpy as np

from datetime import datetime

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Allow up to 500 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)

In [24]:
# Read the semicolon-separated relational table of confirmed cases (first
# column parsed as dates), order it chronologically and renumber the rows.
pd_JH_data = (
    pd.read_csv('../data/processed/COVID_relational_confirmed.csv',
                sep=';',
                parse_dates=[0])
    .sort_values('date', ascending=True)
    .reset_index(drop=True)
    .copy()
)
pd_JH_data.head()

Unnamed: 0,date,state,country,confirmed
0,2020-01-22,Alberta,Canada,0.0
1,2020-01-22,no,Kosovo,0.0
2,2020-01-22,no,Kuwait,0.0
3,2020-01-22,no,Kyrgyzstan,0.0
4,2020-01-22,no,Laos,0.0


In [25]:
# Two-country slice (Brazil and Belgium, dates after 2020-03-20) used to try
# out the processing steps on a small frame.
test_data = pd_JH_data[
    pd_JH_data['country'].isin(['Brazil', 'Belgium'])
    & (pd_JH_data['date'] > '2020-03-20')
]

In [26]:
# Show the first rows of the two-country test slice.
test_data.head()

Unnamed: 0,date,state,country,confirmed
17040,2020-03-21,no,Brazil,1021.0
17080,2020-03-21,no,Belgium,2815.0
17325,2020-03-22,no,Brazil,1546.0
17365,2020-03-22,no,Belgium,3401.0
17610,2020-03-23,no,Brazil,1924.0


In [27]:
# Per-country maximum of every remaining column. The string alias 'max' is
# used instead of the NumPy callable, which recent pandas versions deprecate
# as an argument to agg(); the result is the same.
test_data.groupby(['country']).agg('max')

Unnamed: 0_level_0,date,state,confirmed
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Belgium,2022-06-25,no,4211511.0
Brazil,2022-06-25,no,32023166.0


In [28]:
import numpy as np
from sklearn import linear_model
# Least-squares linear model that also fits an intercept term; created once
# at module level.
reg = linear_model.LinearRegression(fit_intercept=True)

def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling time.

        A straight line is fitted by least squares through three consecutive
        values placed at x = -1, 0, 1. The result is intercept / slope, i.e.
        the fitted level at the centre of the window divided by the fitted
        increase per step.

        Parameters
        ----------
        in_array : array-like
            Exactly three numeric values (one-dimensional).

        Returns
        -------
        float
            intercept / slope as a plain Python float. When the fitted slope
            is zero the result is inf (or nan if the intercept is zero too);
            no NumPy warning is emitted for that case.

        Raises
        ------
        ValueError
            If the input is not one-dimensional with exactly three values.
    '''
    y = np.asarray(in_array, dtype=float)

    # Validate before fitting so that a wrong window size fails with a clear
    # message instead of an error from inside the solver.
    if y.ndim != 1 or len(y) != 3:
        raise ValueError('in_array must contain exactly 3 values')

    x = np.arange(-1, 2)

    # Degree-1 least-squares fit; polyfit returns the highest power first.
    slope, intercept = np.polyfit(x, y, 1)

    # A flat window gives slope == 0: keep the inf/nan result but silence the
    # divide/invalid warnings, and return a scalar rather than a 1-element array.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(np.divide(intercept, slope))

In [29]:
# Per (state, country) maximum of every remaining column. The string alias
# 'max' replaces the NumPy callable, which recent pandas versions deprecate
# as an argument to agg(); the result is the same.
test_data.groupby(['state','country']).agg('max')

Unnamed: 0_level_0,Unnamed: 1_level_0,date,confirmed
state,country,Unnamed: 2_level_1,Unnamed: 3_level_1
no,Belgium,2022-06-25,4211511.0
no,Brazil,2022-06-25,32023166.0


In [30]:
def rolling_reg(df_input,col='confirmed'):
    ''' Rolling doubling-time estimate over one column of a data frame.

        Parameters
        ----------
        df_input : pandas.DataFrame
            Input frame; has to be a data frame containing column `col`.
        col : str
            Name of the column the rolling regression is applied to.

        Returns
        -------
        pandas.Series
            A single series with the same index as `df_input` (mandatory for
            groupby-apply). The first two entries of the series are NaN
            because a full three-value window is required.
    '''
    # get_doubling_time_via_regression accepts exactly three values.
    days_back=3
    # raw=True hands each window over as a plain ndarray, which avoids
    # building a Series per window; the regression converts its input to an
    # array anyway, so the computed values are unchanged.
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=True)
    return result

In [31]:
# Try the rolling regression on the small test slice: one result series per
# (state, country) group.
test_data[['state','country','confirmed']].groupby(['state','country']).apply(rolling_reg,'confirmed')

state  country        
no     Belgium  17080             NaN
                17365             NaN
                17650        7.154454
                17935        8.765745
                18220        7.230039
                             ...     
       Brazil   251310     306.468271
                251595     467.040562
                251881     443.065967
                252165     482.642448
                252449    1059.984036
Name: confirmed, Length: 1654, dtype: float64

In [32]:
# Rolling doubling-time estimate for every (state, country) group of the full
# data set; reset_index turns the group keys and the original row index into
# ordinary columns.
pd_DR_result = (
    pd_JH_data[['state', 'country', 'confirmed']]
    .groupby(['state', 'country'])
    .apply(rolling_reg, 'confirmed')
    .reset_index()
)

In [33]:
# Give the result column its final name and expose the original row index
# (named 'level_2' by reset_index) under the key 'index'.
column_names = {
    'level_2': 'index',
    'confirmed': 'confirmed_DR',
}
pd_DR_result = pd_DR_result.rename(columns=column_names)
pd_DR_result.head()

Unnamed: 0,state,country,index,confirmed_DR
0,Alberta,Canada,0,
1,Alberta,Canada,499,
2,Alberta,Canada,762,
3,Alberta,Canada,1069,
4,Alberta,Canada,1333,


In [34]:
# Expose the row index as an 'index' column so it can serve as a merge key.
# The guard keeps the cell idempotent: re-running an unguarded reset_index
# would insert an extra 'level_0' column and eventually raise.
if 'index' not in pd_JH_data.columns:
    pd_JH_data=pd_JH_data.reset_index()
pd_JH_data.head()

Unnamed: 0,index,date,state,country,confirmed
0,0,2020-01-22,Alberta,Canada,0.0
1,1,2020-01-22,no,Kosovo,0.0
2,2,2020-01-22,no,Kuwait,0.0
3,3,2020-01-22,no,Kyrgyzstan,0.0
4,4,2020-01-22,no,Laos,0.0


In [35]:
# Attach the doubling-time column to the base table; the left join on
# 'index' keeps every row of pd_JH_data.
pd_result_larg = pd_JH_data.merge(
    pd_DR_result[['index', 'confirmed_DR']],
    how='left',
    on=['index'],
)
pd_result_larg.head()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR
0,0,2020-01-22,Alberta,Canada,0.0,
1,1,2020-01-22,no,Kosovo,0.0,
2,2,2020-01-22,no,Kuwait,0.0,
3,3,2020-01-22,no,Kyrgyzstan,0.0,
4,4,2020-01-22,no,Laos,0.0,


In [36]:
from scipy import signal

def savgol_filter(df_input,column='confirmed',window=5,degree=1):
    ''' Savitzky-Golay filter which can be used in a groupby-apply call;
        it ensures that the data structure is kept.

        Parameters
        ----------
        df_input : pandas.DataFrame
            Frame containing `column`. It is not modified.
        column : str
            Name of the column to smooth.
        window : int
            Window length handed to scipy.signal.savgol_filter.
        degree : int
            Polynomial order handed to scipy.signal.savgol_filter.

        Returns
        -------
        pandas.DataFrame
            A copy of `df_input` with the additional column
            `column + '_filtered'` holding the smoothed values.

        Note: scipy validates `window` against the data (e.g. a group shorter
        than the window is rejected in its default mode); such errors are
        propagated unchanged.
    '''
    # Work on a copy so the caller's frame (or the groupby group) is left untouched.
    df_result=df_input.copy()

    filter_in=df_input[column].fillna(0) # attention with the neutral element here

    # Use the window / degree arguments instead of hard-coded constants so
    # that the parameters passed by the caller actually take effect.
    result=signal.savgol_filter(np.array(filter_in),
                           window, # window size used for filtering
                           degree) # order of the fitted polynomial
    df_result[column+'_filtered']=result
    return df_result
        

In [37]:
# Smooth the confirmed counts per (state, country) group.
# group_keys=False keeps the original row index on the result, so that
# reset_index() yields the 'index' column used as merge key afterwards.
# Without it, pandas >= 2.0 prepends the group keys to the index and
# reset_index() then collides with the existing state/country columns.
pd_filtered_result=(
    pd_JH_data[['state','country','confirmed']]
    .groupby(['state','country'],group_keys=False)
    .apply(savgol_filter)
    .reset_index()
)

In [38]:
# Attach the smoothed counts to the result table via a left join on 'index'.
pd_result_larg = pd_result_larg.merge(
    pd_filtered_result[['index', 'confirmed_filtered']],
    how='left',
    on=['index'],
)
pd_result_larg.head()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered
0,0,2020-01-22,Alberta,Canada,0.0,,0.0
1,1,2020-01-22,no,Kosovo,0.0,,0.0
2,2,2020-01-22,no,Kuwait,0.0,,0.0
3,3,2020-01-22,no,Kyrgyzstan,0.0,,0.0
4,4,2020-01-22,no,Laos,0.0,,0.0


In [39]:

# Rolling doubling-time estimate on the smoothed counts, per (state, country)
# group; afterwards the value column gets its final name and the original
# row index ('level_2' after reset_index) becomes the 'index' merge key.
pd_filtered_doubling = (
    pd_result_larg[['state', 'country', 'confirmed_filtered']]
    .groupby(['state', 'country'])
    .apply(rolling_reg, 'confirmed_filtered')
    .reset_index()
    .rename(columns={'confirmed_filtered': 'confirmed_filtered_DR',
                     'level_2': 'index'})
)

pd_filtered_doubling.tail()

Unnamed: 0,state,country,index,confirmed_filtered_DR
252505,no,Zimbabwe,251226,3753.702504
252506,no,Zimbabwe,251439,2861.453797
252507,no,Zimbabwe,251746,2586.681542
252508,no,Zimbabwe,251988,2556.58517
252509,no,Zimbabwe,252509,2454.311538


In [40]:
# Attach the doubling time of the smoothed counts via a left join on 'index'.
pd_result_larg = pd_result_larg.merge(
    pd_filtered_doubling[['index', 'confirmed_filtered_DR']],
    how='left',
    on=['index'],
)
pd_result_larg.tail()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
252505,252505,2022-06-25,no,Andorra,43774.0,inf,43839.0,673.4462
252506,252506,2022-06-25,no,Algeria,266030.0,35469.78,266032.4,28604.63
252507,252507,2022-06-25,no,Albania,278793.0,957.0538,278787.0,986.2061
252508,252508,2022-06-25,no,Argentina,9341492.0,inf,9341492.0,-3343450000000000.0
252509,252509,2022-06-25,no,Zimbabwe,255309.0,2950.882,255352.4,2454.312


In [20]:
# Keep the filtered doubling time only where more than 100 cases are
# confirmed; all other rows are set to NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
mask=pd_result_larg['confirmed']>100
pd_result_larg['confirmed_filtered_DR']=pd_result_larg['confirmed_filtered_DR'].where(mask, other=np.nan)

In [21]:
# Show the last rows for Brazil as a spot check of the computed columns.
pd_result_larg[pd_result_larg['country']=='Brazil'].tail()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
251310,251310,2022-06-21,no,Brazil,31818827,306.468271,31807715.2,503.864679
251595,251595,2022-06-22,no,Brazil,31890733,467.040562,31889994.6,417.252407
251881,251881,2022-06-23,no,Brazil,31962782,443.065967,31943734.8,468.763054
252165,252165,2022-06-24,no,Brazil,32023166,482.642448,31997845.9,592.368538
252449,252449,2022-06-25,no,Brazil,32023166,1059.984036,32051957.0,591.336083


In [22]:
# Write the final table as a semicolon-separated CSV, without the DataFrame index.
pd_result_larg.to_csv('../data/processed/COVID_final_set.csv',sep=';',index=False)