![](CRISP_DM.png)

In [1]:
#Importing required packages
import pandas as pd
import numpy as np
from sklearn import linear_model
from scipy import signal
%matplotlib inline
pd.set_option('display.max_rows', 500)

## 30.i Large relational dataset

In [2]:
# Load the relational confirmed-cases table written by the previous notebook,
# order it chronologically and rebuild a clean 0..n-1 row index.
pd_John_data = (
    pd.read_csv('../data/processed/COVID_relational_confirmed.csv', sep=';', parse_dates=[0])
    .sort_values('date', ascending=True)
    .reset_index(drop=True)
    .copy()
)
pd_John_data.head()

Unnamed: 0,date,state,country,confirmed
0,2020-01-22,Alberta,Canada,0
1,2020-01-22,no,"Korea, South",1
2,2020-01-22,no,Kosovo,0
3,2020-01-22,no,Kuwait,0
4,2020-01-22,no,Kyrgyzstan,0


## 30.ii Test data 

In [3]:
# Rows for the US and Germany dated strictly after 2020-03-20
test_data = pd_John_data.loc[
    pd_John_data['country'].isin(['US', 'Germany'])
    & (pd_John_data['date'] > '2020-03-20')
]

In [4]:
# Show the last rows of the US/Germany subset as a quick sanity check
test_data.tail()

Unnamed: 0,date,state,country,confirmed
61527,2020-09-09,no,US,6360212
61755,2020-09-10,no,Germany,258149
61796,2020-09-10,no,US,6396100
62020,2020-09-11,no,Germany,259735
62062,2020-09-11,no,US,6443743


In [5]:
# Per-country maximum of every column (latest date, highest confirmed count).
# The string alias 'max' is used instead of np.max: passing NumPy callables to
# agg() is deprecated in recent pandas, and 'max' keeps the current result.
test_data.groupby(['country']).agg('max')

Unnamed: 0_level_0,date,state,confirmed
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Germany,2020-09-11,no,259735
US,2020-09-11,no,6443743


### 30.ii.a Doubling rate calculation through linear regression

In [6]:
reg = linear_model.LinearRegression(fit_intercept=True)
# function for calculating the doubling time
def get_double_t_regression(in_array):
    '''Approximate the doubling time of a 3-point window with a linear regression.

    The three values are fitted against x = [-1, 0, 1], so the intercept is the
    fitted value at the centre of the window and the slope is the change per
    sample step. The returned ratio intercept/slope is the number of sample
    steps needed to add the centre value once more at the current slope.

    Parameters
    ----------
    in_array : array-like of length 3
        Consecutive observations (e.g. a rolling window of a Series).

    Returns
    -------
    float
        intercept / slope as a plain scalar. For a flat window (slope 0) the
        result is inf, or nan when the intercept is 0 as well.

    Raises
    ------
    AssertionError
        If in_array does not contain exactly 3 values.
    '''
    # validate first, before any array work is done
    assert len(in_array)==3, 'in_array must contain exactly 3 values'

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    reg.fit(X,y)
    intercept=reg.intercept_
    # coef_ is a 1-element array; take the scalar so a scalar is returned
    # (Rolling.apply expects one number per window)
    slope=reg.coef_[0]

    # flat windows divide by zero: keep the inf/nan result, drop the warning
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(np.float64(intercept)/np.float64(slope))

### 30.ii.c Rolling regression

In [7]:
# rolling-window wrapper around get_double_t_regression
def roll_regression(df_input,col='confirmed'):
    '''Apply get_double_t_regression over a sliding window of 3 rows.

    df_input has to be a data frame; the return value is a single Series
    (mandatory for groupby apply). Windows with fewer than 3 rows yield NaN
    because min_periods equals the window size.
    '''
    window_size = 3
    rolling_view = df_input[col].rolling(window=window_size,
                                         min_periods=window_size)
    return rolling_view.apply(get_double_t_regression, raw=False)

In [8]:
# Group the test subset by 'state' and 'country', then run the rolling
# regression on every group (result discarded; the trailing ';' hides the output)
(
    test_data[['state','country','confirmed']]
    .groupby(['state','country'])
    .apply(roll_regression,'confirmed')
);

In [9]:
# Rolling regression for every (state, country) series of the full data set;
# reset_index() turns the group keys and the original row label into columns
pd_DR_result = (
    pd_John_data[['state','country','confirmed']]
    .groupby(['state','country'])
    .apply(roll_regression,'confirmed')
    .reset_index()
)

In [10]:
# Give the regression output and the former row-label level descriptive names
pd_DR_result = pd_DR_result.rename(
    {'confirmed':'confirmed_DR','level_2':'index'},
    axis='columns',
)
pd_DR_result.head()

Unnamed: 0,state,country,index,confirmed_DR
0,Alberta,Canada,0,
1,Alberta,Canada,465,
2,Alberta,Canada,701,
3,Alberta,Canada,966,
4,Alberta,Canada,1263,


In [11]:
# Expose the row labels as an 'index' column; it is the merge key for the
# per-group results. Guarded so that re-running this cell does not add a
# stray 'level_0' column on top of the existing 'index' column.
if 'index' not in pd_John_data.columns:
    pd_John_data=pd_John_data.reset_index()

In [12]:
# Attach the rolling-regression result to the main data frame; rows are
# matched on the shared 'index' column and every main row is kept (left join)
pd_result_large = pd_John_data.merge(
    pd_DR_result[['index','confirmed_DR']],
    on='index',
    how='left',
)
pd_result_large.head()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR
0,0,2020-01-22,Alberta,Canada,0,
1,1,2020-01-22,no,"Korea, South",1,
2,2,2020-01-22,no,Kosovo,0,
3,3,2020-01-22,no,Kuwait,0,
4,4,2020-01-22,no,Kyrgyzstan,0,


## 30.iii Data Filtering 

In [13]:
# Savitzky-Golay smoothing helper for use with groupby().apply()
def savgol_filter(df_input,column='confirmed',window=5,degree=1):
    '''Smooth one column with a Savitzky-Golay filter and store the result.

    Intended for groupby().apply(): the incoming frame is returned with one
    additional column, so the data structure is kept.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Frame holding the column to smooth. The new column is added to this
        frame in place and the same frame is returned.
    column : str, default 'confirmed'
        Name of the column to smooth; the result is written to
        column + '_filtered'.
    window : int, default 5
        Window length handed to scipy.signal.savgol_filter.
    degree : int, default 1
        Polynomial order handed to scipy.signal.savgol_filter.

    Returns
    -------
    pandas.DataFrame
        df_input including the '<column>_filtered' column.

    Raises
    ------
    ValueError
        Propagated from scipy.signal.savgol_filter when window/degree are not
        valid for the given data (e.g. a window longer than the series).
    '''
    # missing values are treated as 0 before filtering
    filter_in=df_input[column].fillna(0)
    # window and degree are passed through; previously the call hard-coded
    # 5 and 1, so the window argument had no effect
    result=signal.savgol_filter(np.array(filter_in),
                           window,
                           degree)
    df_input[column+'_filtered']=result
    return df_input

In [14]:
# Smooth 'confirmed' separately for every (state, country) series.
# group_keys=False keeps the original row labels as the only index level, so
# reset_index() exposes them as the 'index' column needed for the merge that
# follows; without it newer pandas prepends the group keys to the index and
# reset_index() collides with the existing 'state'/'country' columns.
pd_filtered_result=pd_John_data[['state','country','confirmed']]\
                                .groupby(['state','country'],group_keys=False)\
                                .apply(savgol_filter).reset_index()

In [15]:
# Add the smoothed values to the large data frame, matching rows on the
# shared 'index' column (left join keeps every existing row)
pd_result_large = pd_result_large.merge(
    pd_filtered_result[['index','confirmed_filtered']],
    on='index',
    how='left',
)
pd_result_large.head()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered
0,0,2020-01-22,Alberta,Canada,0,,0.0
1,1,2020-01-22,no,"Korea, South",1,,0.8
2,2,2020-01-22,no,Kosovo,0,,0.0
3,3,2020-01-22,no,Kuwait,0,,0.0
4,4,2020-01-22,no,Kyrgyzstan,0,,0.0


## 30.iv Filtered doubling rate

In [16]:
# Rolling regression on the smoothed series of every (state, country) group,
# stored in a new data frame with descriptive column names
pd_filter_double = (
    pd_result_large[['state','country','confirmed_filtered']]
    .groupby(['state','country'])
    .apply(roll_regression,'confirmed_filtered')
    .reset_index()
    .rename(columns={'confirmed_filtered':'confirmed_filtered_DR',
                     'level_2':'index'})
)

In [17]:
# Left-merge the filtered doubling result into the large data frame,
# using the 'index' column as the key
pd_result_large = pd_result_large.merge(
    pd_filter_double[['index','confirmed_filtered_DR']],
    on='index',
    how='left',
)
pd_result_large.head()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
0,0,2020-01-22,Alberta,Canada,0,,0.0,
1,1,2020-01-22,no,"Korea, South",1,,0.8,
2,2,2020-01-22,no,Kosovo,0,,0.0,
3,3,2020-01-22,no,Kuwait,0,,0.0,
4,4,2020-01-22,no,Kyrgyzstan,0,,0.0,


In [18]:
# Keep the filtered doubling estimate only where more than 100 cases are
# confirmed; all other rows are set to NaN.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
df_mask=pd_result_large['confirmed']>100
pd_result_large['confirmed_filtered_DR']=pd_result_large['confirmed_filtered_DR'].where(df_mask, other=np.nan)

In [19]:
# Inspect the most recent rows for Germany, including all derived columns
pd_result_large[pd_result_large['country']=='Germany'].tail()

Unnamed: 0,index,date,state,country,confirmed,confirmed_DR,confirmed_filtered,confirmed_filtered_DR
60955,60955,2020-09-07,no,Germany,253626,196.36864,253560.4,205.700334
61223,61223,2020-09-08,no,Germany,254957,156.975534,254978.6,191.543791
61487,61487,2020-09-09,no,Germany,256433,181.692436,256580.0,168.922815
61755,61755,2020-09-10,no,Germany,258149,160.722431,258121.0,163.289121
62020,62020,2020-09-11,no,Germany,259735,156.33293,259662.0,167.502271


In [20]:
# Write the enriched data frame to disk as a ';'-separated CSV, without the row index
pd_result_large.to_csv('../data/processed/COVID_final_set.csv',sep=';',index=False)