# 1. Update all data

In [1]:
# %load ../src/data/get_data.py
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
import subprocess
import os
from datetime import datetime

def get_johns_hopkins_data():
    git_pull = subprocess.Popen( "git pull" ,
                     cwd = os.path.dirname( '../data/raw/COVID-19/' ),
                     shell = True,
                     stdout = subprocess.PIPE,
                     stderr = subprocess.PIPE )
    (out, error) = git_pull.communicate()
    print("Error : " + str(error))
    print("out : " + str(out))

def get_current_data_germany():
    #fetching data of 16 states.
    data = requests.get('https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/Coronaf%C3%A4lle_in_den_Bundesl%C3%A4ndern/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json')
    json_object = json.loads(data.content)
    full_list = []
    for pos, each_dict in enumerate (json_object['features']):
        full_list.append(each_dict['attributes'])

    pd_full_list = pd.DataFrame(full_list)
    pd_full_list.to_csv('../data/raw/NPGEO/GER_state_data.csv',sep=';')
    print('Number of regions row: '+str(pd_full_list.shape[0]))

if __name__ == '__main__':
    print('Get Data Started.')
    get_johns_hopkins_data()
    get_current_data_germany()
    print('Get Data Started.')


Get Data Started.
Error : b'From https://github.com/CSSEGISandData/COVID-19\n   3d7b0cc0..f8e8a5c8  master     -> origin/master\n   a449659b..2cca6425  web-data   -> origin/web-data\n'
out : b'Updating 3d7b0cc0..f8e8a5c8\nFast-forward\n csse_covid_19_data/README.md                                            | 1 +\n .../csse_covid_19_time_series/time_series_covid19_recovered_global.csv  | 2 +-\n 2 files changed, 2 insertions(+), 1 deletion(-)\n'
Number of regions row: 16
Get Data Started.


# 2. Process Pipeline

In [2]:
# %load ../src/data/process_JH_data.py
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import json
import subprocess
import os
from datetime import datetime

def store_relational_JH_data():
    data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw = pd.read_csv(data_path)

    pd_data_base = pd_raw.rename(columns={'Country/Region':'country', 'Province/State':'state'})

    pd_data_base['state'] = pd_data_base['state'].fillna('no')

    pd_data_base = pd_data_base.drop(['Lat','Long'],axis=1)

    pd_relational_model = pd_data_base.set_index(['state','country']) \
                                        .T                            \
                                        .stack(level=[0,1])           \
                                        .reset_index()                \
                                        .rename(columns={'level_0':'date',
                                                            0:'confirmed'}
                                                )

    pd_relational_model['date'] = pd_relational_model.date.astype('datetime64[ns]')
    pd_relational_model.to_csv('../data/processed/COVID_relational_confirmed.csv',sep=';', index = False)
    #print(pd_relational_model.head())
    print('Number of rows stored - Relational Model: '+str(pd_relational_model.shape[0]))

def store_confimed_data_for_sir():
    data_path = '../data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw = pd.read_csv(data_path)
    pd_raw = pd_raw.drop(['Lat','Long','Province/State'],axis=1)
    pd_raw = pd_raw.rename(columns={'Country/Region':'country'})
    pd_flat_table = pd_raw.set_index('country') \
                    .T \
                    .stack(level=[0]) \
                    .reset_index() \
                    .rename(columns={'level_0':'date',
                                    0:'confirmed'}
                                    )
    pd_flat_table['date'] = pd_flat_table.date.astype('datetime64[ns]')
    pd_flat_table = pd.pivot_table(pd_flat_table, values='confirmed', index='date', columns='country', aggfunc=np.sum, fill_value=0).reset_index()
    pd_flat_table.to_csv('../data/processed/COVID_full_flat_table.csv',sep=';',index = False)
    #print(pd_flat_table.tail())
    print('Number of rows stored - Full Flat Table: '+str(pd_flat_table.shape[0]))

if __name__ == '__main__':
    print('Process Pipeline Started.')
    store_relational_JH_data()
    store_confimed_data_for_sir()
    print('Process Pipeline Ended.')


Process Pipeline Started.
Number of rows stored - Relational Model: 59584
Number of rows stored - Full Flat Table: 224
Process Pipeline Ended.


# 3. Filter and Doubling Rate Calculation

In [3]:
# %load ../src/features/build_features.py
import numpy as np
import pandas as pd
from sklearn import linear_model
from datetime import datetime
from scipy import signal

reg = linear_model.LinearRegression(fit_intercept=True)

def get_doubling_time_via_regression(in_array):
    '''uses linear regression to approximate the slope'''
    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1,1)

    assert len(in_array)==3

    reg.fit(X,y)
    intercept = reg.intercept_
    slope = reg.coef_

    return intercept/slope

def savgol_filter(df_input, column ='confirmed', window = 5):
    '''savgol filter. to ensure data structre is kept'''
    degree = 1
    df_result = df_input
    filter_in = df_input[column].fillna(0)

    result = signal.savgol_filter(np.array(filter_in),
                                 window,
                                 degree)
    df_result[column+'_filtered'] = result
    return df_result

def rolling_reg(df_input, col='confirmed'):
    '''df_input -> data frame'''
    days_back = 3
    result = df_input[col].rolling(
                                    window = days_back,
                                    min_periods = days_back,
                                    ).apply(get_doubling_time_via_regression, raw=False)
    return result

def calc_filtered_data(df_input, filter_on='confirmed'):
    '''
        Calculate savgol filter and return merged data frame
    '''
    must_contain = set(['state','country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), 'Error in calc_filtered_data not all columns in data frame'

    pd_filtered_result = df_input[['state','country',filter_on]].groupby(['state','country']).apply(savgol_filter).reset_index()
    df_output = pd.merge(df_input,pd_filtered_result[['index',filter_on+'_filtered']], on=['index'], how='left')

    return df_output

def calc_doubling_rate(df_input, filter_on = 'confirmed'):
    '''
        Calcualte approximated doubling rate and return merged data frame
    '''
    must_contain = set(['state','country',filter_on])
    #print(must_contain)
    #print(df_input.columns)
    assert must_contain.issubset(set(df_input.columns)), 'Error in calc_doubling_rate not all columns in data frame'

    pd_DR_result = df_input.groupby(['state','country']).apply(rolling_reg,filter_on).reset_index()
    pd_DR_result = pd_DR_result.rename(columns={filter_on:filter_on+'_DR','level_2':'index'})

    df_output = pd.merge(df_input,pd_DR_result[['index',filter_on+'_DR']], on=['index'],how='left')

    return df_output

if __name__ == '__main__':
    print('Feature Building Started.')
    test_data = np.array([2,4,6])
    result = get_doubling_time_via_regression(test_data)
    print('the test slope is: '+str(result))

    pd_JH_data = pd.read_csv('../data/processed/COVID_relational_confirmed.csv', sep=';', parse_dates=[0])
    # sorting is required - becasue we are assuming sliding window approch; so we are going top to down from first to last sample step by step
    pd_JH_data = pd_JH_data.sort_values('date',ascending=True).reset_index().copy()
    #print(pd_JH_data.columns)
    #must_contain = set(['state','country','confirmed'])
    #assert must_contain.issubset(set(pd_JH_data.columns)), 'Error in calc_filtered_data not all columns in data frame'

    pd_result_large = calc_filtered_data(pd_JH_data)
    #print(pd_result_large.columns)
    pd_result_large = calc_doubling_rate(pd_result_large)
    #print(pd_result_large.columns)
    pd_result_large = calc_doubling_rate(pd_result_large,'confirmed_filtered')
    # Mask the data to NaN which has lower than 100 doubling rate. Result in good visualization
    mask=pd_result_large['confirmed']>100
    pd_result_large['confirmed_filtered_DR']=pd_result_large['confirmed_filtered_DR'].where(mask, other=np.NaN)
    pd_result_large.to_csv('../data/processed/COVID_final_set.csv',sep=';',index=False)
    print(pd_result_large.head())
    print('Feature Building Started.')


Feature Building Started.
the test slope is: [2.]
   index       date    state       country  confirmed  confirmed_filtered  \
0      0 2020-01-22  Alberta        Canada        0.0                 0.0   
1    169 2020-01-22       no  Korea, South        1.0                -4.8   
2    170 2020-01-22       no        Kosovo        0.0                 0.0   
3    171 2020-01-22       no        Kuwait        0.0                 0.0   
4    172 2020-01-22       no    Kyrgyzstan        0.0                10.8   

   confirmed_DR  confirmed_filtered_DR  
0           NaN                    NaN  
1           NaN                    NaN  
2           NaN                    NaN  
3           NaN                    NaN  
4           NaN                    NaN  
Feature Building Started.


# 4. SIR Model calculation

In [None]:
# %load ../src/models/sir_model.py
# %load ../src/models/sir_model.py
import pandas as pd
import numpy as np
from scipy import optimize
from scipy import integrate
import warnings

'''
    Default paramter initializations
'''
N0 = 5000000
I0 = 20   #Infected population
S0 = N0 - I0    #Suspected population
R0 = 0          #Recovered population
beta = 0.4      #Rate of infection
gamma = 0.1     #Rate of recovery
t = 0
def Handle_SIR_Modelling(ydata):
    global t
    t = np.arange(len(ydata))

    global I0
    I0 = ydata[0]   #Infected population
    popt, pcov = optimize.curve_fit(fit_odeint, t, ydata, maxfev=10000)
    fitted = fit_odeint(t, *popt)
    return t, ydata, fitted




def SIR_model_fit(SIR, time, beta, gamma):
    '''
    Simple SIR model implementation.
    S: Suspected population
    I: Infected population
    R: Recovered population
    beta: rate of infection
    gamma: rate of recovery
    time: for integral as define in odeint function of scipy.integrate
    as per slides: ds+dI+dR = 0 and S+R+I=N (total population)

    Make a note tht in this model a recovered person can not get infected again.
    '''

    S,I,R = SIR
    dS_dt = -beta*S*I/N0
    dI_dt = beta*S*I/N0 - gamma*I
    dR_dt = gamma*I

    return dS_dt, dI_dt, dR_dt

def fit_odeint(x, beta, gamma):
    return integrate.odeint(SIR_model_fit, (S0, I0, R0), t, args=(beta, gamma))[:,1]  #we are only fetching dI

if __name__ == '__main__':
    print('SIR Modelling Started.')
    warnings.filterwarnings('ignore')
    df_analyse = pd.read_csv('../data/processed/COVID_full_flat_table.csv', sep=';')
    df_analyse.sort_values('date', ascending=True)
    df_analyse = df_analyse.drop(['date'],axis=1)
    df_SIR_model = pd.DataFrame()
    start_count = 0
    total_rows = len(df_analyse.US)
    for each_country in df_analyse:
        #if each_country == 'Germany':
        #print(each_country)
        temp_fitted = np.full(total_rows, np.NaN)
        df_SIR_model[each_country] = temp_fitted
        nonzero_row = (df_analyse[each_country] > start_count).idxmax(1)
        ydata = np.array(df_analyse[each_country][nonzero_row:])
        t, ydata, fitted = Handle_SIR_Modelling(ydata)
        df_SIR_model[each_country].iloc[0:len(fitted)] = fitted
    df_SIR_model.to_csv('../data/processed/COVID_SIR_Model_Data.csv',sep=';', index = False)
    print('Number of rows stored - Full Flat Table: '+str(df_SIR_model.shape[0]))
    print('SIR Modelling Ended.')


# 4. Visual Board

In [None]:
# %load ../src/visualization/visualize.py
# %load ../src/visualization/visualize.py
# %load ../src/visualization/visualize.py
import pandas as pd
import numpy as np
import dash
dash.__version__
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output,State
import plotly.graph_objects as go
import os

print(os.getcwd())
df_input_large = pd.read_csv('../data/processed/COVID_final_set_1.csv', sep=';')
df_analyse = pd.read_csv('../data/processed/COVID_full_flat_table.csv', sep=';')
df_SIR_data = pd.read_csv('../data/processed/COVID_SIR_Model_Data.csv', sep=';')

fig = go.Figure()

app = dash.Dash()
app.layout = html.Div([

    dcc.Markdown('''
            # Applied Data science on COVID-19 DataSet

            Goal of the project is to learn data science concpet by applying CRISP_DM,
            it covers full walkthrough of: automated data gathering, data transformations,
            filtering and machine learning to approximating the doubling time, and
            {static} deployement of responsive dashboard.
            '''),


    dcc.Tabs([
        dcc.Tab(label='Time-Series Visualization', children=[

            dcc.Markdown('''
            ## Multi-Select country for visualization
            '''),

            dcc.Dropdown(
                id = 'country_drop_down',
                options = [{'label':name, 'value':name} for name in df_input_large['country'].unique()],
                value = ['US', 'Germany', 'India'], # default selected values
                multi = True # for allowing multi value selection
            ),

            dcc.Markdown('''
            ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
            '''),

            dcc.Dropdown(
                id = 'doubling_time',
                options = [
                    {'label':'Timeline Confirmed', 'value':'confirmed'},
                    {'label':'Timeline Confirmed Filtered', 'value':'confirmed_filtered'},
                    {'label':'Timeline Doubling Rate', 'value':'doubling_rate'},
                    {'label':'Timeline Doubling Rate Filtered','value':'doubling_rate_filtered'}
                ],
                value = 'confirmed', # default selected values
                multi = False # Not allowing multi value selection
            ),

            dcc.Graph(figure=fig, id='main_window_slope')

        ]),

        dcc.Tab(label='SIR Model', children=[

            dcc.Markdown('''
            ## Multi-Select country for visualization
            '''),

            dcc.Dropdown(
                id = 'country_drop_down_sir',
                options = [{'label':name, 'value':name} for name in df_input_large['country'].unique()],
                value = ['Germany'], # default selected values
                multi = True # for allowing multi value selection
            ),
            dcc.Graph(figure=fig, id='sir_chart')
        ])

    ])
])

@app.callback(
    Output('sir_chart', 'figure'),
    [Input('country_drop_down_sir', 'value')])
def update_figure(country_list):
    traces = []
    if(len(country_list) > 0):
        for each in country_list:
            nonzero_row = (df_analyse[each] > 0).idxmax(1)
            country_data = df_analyse[each][nonzero_row:]
            ydata = np.array(country_data)
            t = np.arange(len(ydata))
            fitted = np.array(df_SIR_data[each])
            #t, ydata, fitted = Handle_SIR_Modelling(ydata)

            traces.append(dict(
                x = t,
                y = ydata,
                mode = 'markers+lines',
                name = each+str(' - Truth'),
                opacity = 0.9
            ))
            traces.append(
            dict(
                x = t,
                y = fitted,
                mode = 'markers+lines',
                name = each+str(' - Simulation'),
                opacity = 0.9
            ))
    return {
        'data': traces,
        'layout': dict(
            width = 1280,
            height = 720,
            title = 'Fit of SIR model for: '+', '.join(country_list),
            xaxis = {
                'title': 'Days', #'Fit of SIR model for '+str(each)+' cases',
                'tickangle': -45,
                'nticks' : 20,
                'tickfont' : dict(size = 14, color = '#7f7f7f')
            },
            yaxis = {
                'title': 'Population Infected',
                'type': 'log'
            }
        )
    }

@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time','value')])
def update_figure(country_list, show_doubling):
    ''' Updates figure based on passed coutry list and filter/doubling value'''

    if 'doubling_rate' in show_doubling:
        my_yaxis = {'type':'log',
                    'title':'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
                    }
    else:
        my_yaxis = {'type':'log',
                    'title':'Confirmed infected people (source Johns Hopkins CSSE, log-scale)'
                    }

    traces = []
    print(df_input_large.head())
    for each in country_list:

        df_plot = df_input_large[df_input_large['country']==each]

        # To handle case where we have multiple entry for the same country.
        #improvement can be done by removing this call from callback
        if show_doubling =='doubling_rate_filtered':
            df_plot = df_plot[['state','country','confirmed','confirmed_filtered','doubling_rate','doubling_rate_filtered','date']].groupby(['country','date']).agg(np.mean).reset_index()
        else:
            df_plot = df_plot[['state','country','confirmed','confirmed_filtered','doubling_rate','doubling_rate_filtered','date']].groupby(['country','date']).agg(np.sum).reset_index()

        traces.append(dict(
            x = df_plot.date,
            y = df_plot[show_doubling],
            mode = 'markers+lines',
            name = each,
            opacity = 0.9
        ))
    return {
        'data': traces,
        'layout': dict(
            width = 1280,
            height = 720,
            xaxis = {
                'title': 'Timeline',
                'tickangle': -45,
                'nticks' : 20,
                'tickfont' : dict(size = 14, color = '#7f7f7f')
            },
            yaxis = my_yaxis
        )
    }

if __name__ == '__main__':
    app.run_server(debug=True, use_reloader=False)


D:\College\Semester_2\EnterpriseDataScience\Code\EDS_Covid\eda_covid_19\notebooks
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Running on http://127.0.0.1:8050/
Debugger PIN: 790-864-761
Debugger PIN: 790-864-761
Debugger PIN: 790-864-761
Debugger PIN: 790-864-761
Debugger PIN: 790-864-761
Debugger PIN: 790-864-761
Debugger PIN: 790-864-761
Debugger PIN: 790-864-761
Debugger PIN: 790-864-761
Debugger PIN: 790-864-761
 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
   index        date    state       country  confirmed  confirmed_filtered  \
0      0  2020-01-22  Alberta        Canada        0.0                 0.0   
1    169  2020-01-22       no