# 1. Update all data

In [1]:
# %load ..\src\data\get_data.py

import subprocess
import os

import pandas as pd
import numpy as np

from datetime import datetime

import requests
import json

def get_john_hopkins():
    git_pull = subprocess.Popen("/usr/bin/git pull",
                         cwd = os.path.dirname('../data/raw/COVID-19_New/COVID-19/'),
                         shell = True,
                         stdout = subprocess.PIPE,
                         stderr = subprocess.PIPE)
    (out,error) = git_pull.communicate()

    print("Error : " + str(error))
    print("out : " + str(out))

if __name__ == '__main__':
    get_john_hopkins()


Error : b'The system cannot find the path specified.\r\n'
out : b''


# 2. Process Pipeline

In [2]:
# %load ..\src\data\process_JH_data.py
import pandas as pd
import numpy as np

from datetime import datetime


def store_relational_JH_data():
    ''' Transformes the COVID data in a relational data set
    '''

    data_path='../data/raw/COVID-19_New/COVID-19/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
    pd_raw=pd.read_csv(data_path)

    pd_data_base=pd_raw.rename(columns={'Country/Region':'Country',
                      'Province/State':'State'})

    pd_data_base['State']=pd_data_base['State'].fillna('no')

    pd_data_base=pd_data_base.drop(['Lat','Long'],axis=1)


    pd_relational_model=pd_data_base.set_index(['State','Country']) \
                                .T                              \
                                .stack(level=[0,1])             \
                                .reset_index()                  \
                                .rename(columns={'level_0':'date',
                                                   0:'Confirmed'},
                                                  )

    pd_relational_model['date']=pd_relational_model.date.astype('datetime64[ns]')

    pd_relational_model.to_csv('../data/processed/COVID_relational_confirmed.csv',sep=';',index=False)
    print(' Number of rows stored: '+str(pd_relational_model.shape[0]))

if __name__ == '__main__':

    store_relational_JH_data()

 Number of rows stored: 261345


# 3. Filter and Doubling Rate Calculation

In [3]:
# %load ..\src\features\build_features.py
import numpy as np
from sklearn import linear_model
reg = linear_model.LinearRegression(fit_intercept=True)
import pandas as pd

from scipy import signal


def get_doubling_time_via_regression(in_array):
    ''' Use a linear regression to approximate the doubling rate
        Parameters:
        ----------
        in_array : pandas.series
        Returns:
        ----------
        Doubling rate: double
    '''

    y = np.array(in_array)
    X = np.arange(-1,2).reshape(-1, 1)

    assert len(in_array)==3
    reg.fit(X,y)
    intercept=reg.intercept_
    slope=reg.coef_

    return intercept/slope

from scipy import signal

def savgol_filter(df_input,column='Confirmed',window=5):
    ''' Savgol Filter which can be used in groupby apply function 
        it ensures that the data structure is kept'''
    window=5, 
    degree=1
    df_result=df_input
    
    filter_in=df_input[column].fillna(0) 
    
    result=signal.savgol_filter(np.array(filter_in),
                           5, 
                           1)
    df_result[column+'_filtered']=result
    return df_result

def rolling_reg(df_input,col='Confirmed'):
    ''' input has to be a data frame'''
    ''' return is single series (mandatory for group by apply)'''
    days_back=3
    result=df_input[col].rolling(
                window=days_back,
                min_periods=days_back).apply(get_doubling_time_via_regression,raw=False)
    return result 

def calc_filtered_data(df_input,filter_on = 'Confirmed'):
    must_contain = set(['State','Country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), 'Error in calc_filtered_data not all columns in data frame'

    df_output=df_input.copy() 
    
    pd_filtered_result=df_output[['State','Country',filter_on]].groupby(['State','Country']).apply(savgol_filter)
 
    df_output=pd.merge(df_output,pd_filtered_result[[str(filter_on+'_filtered')]],left_index=True,right_index=True,how='left')

    return df_output.copy()

def calc_doubling_rate(df_input,filter_on='Confirmed'):

    must_contain=set(['State','Country',filter_on])
    assert must_contain.issubset(set(df_input.columns)), 'Error in calc_filtered_data not all columns in data frame'


    pd_DR_result= df_input.groupby(['State','Country']).apply(rolling_reg,filter_on).reset_index()

    pd_DR_result=pd_DR_result.rename(columns={filter_on:filter_on+'_DR',
                             'level_2':'index'})

    df_output=pd.merge(df_input,pd_DR_result[['index',str(filter_on+'_DR')]],left_index=True,right_on=['index'],how='left')
    df_output=df_output.drop(columns=['index'])

    return df_output

if __name__ == '__main__':
    test_data_reg=np.array([2,4,6])
    result=get_doubling_time_via_regression(test_data_reg)
    print('the test slope is: '+str(result))

    pd_raw=pd.read_csv('../data/processed/COVID_relational_confirmed.csv',sep=';',parse_dates=[0])
    pd_raw=pd_raw.sort_values('date',ascending=True).copy()

    pd_result_large=calc_filtered_data(pd_raw)
    pd_result_large=calc_doubling_rate(pd_result_large)
    pd_result_large=calc_doubling_rate(pd_result_large,'Confirmed_filtered')


    mask=pd_result_large['Confirmed']>100
    pd_result_large['Confirmed_filtered_DR']=pd_result_large['Confirmed_filtered_DR'].where(mask, other=np.NaN)
    pd_result_large.to_csv('../data/processed/COVID_final_set.csv',sep=';',index=False)
    print(pd_result_large[pd_result_large['Country']=='Germany'].tail())

the test slope is: [2.]
             date State  Country   Confirmed  Confirmed_filtered  \
141213 2022-07-22    no  Germany  30331131.0          30272764.4   
141214 2022-07-23    no  Germany  30331133.0          30341824.8   
141215 2022-07-24    no  Germany  30331133.0          30413677.4   
141216 2022-07-25    no  Germany  30476605.0          30481675.4   
141217 2022-07-26    no  Germany  30598385.0          30549673.4   

        Confirmed_DR  Confirmed_filtered_DR  
141213  3.025988e+02             371.001523  
141214  6.586269e+02             444.049693  
141215  3.033113e+07             430.659422  
141216  4.176697e+02             434.926880  
141217  2.280148e+02             448.273117  


# 4. Visual Board

In [None]:
# %load ../src/visualization/visualize.py
import pandas as pd
import numpy as np

import dash
dash.__version__
from dash import dcc
from dash import html
from dash.dependencies import Input, Output,State

import plotly.graph_objects as go

import os
print(os.getcwd())
df_input_large=pd.read_csv('../data/processed/COVID_final_set.csv',sep=';')


fig = go.Figure()

app = dash.Dash()
app.layout = html.Div([

    dcc.Markdown('''
    #  Applied Data Science on COVID-19 data
    Goal is to create a dynamic dashboard for visualising COVID-19 data of multiple countries.
    '''),

    dcc.Markdown('''
    ## Multi-Select Country for visualization
    '''),


    dcc.Dropdown(
        id='country_drop_down',
        options=[ {'label': each,'value':each} for each in df_input_large['Country'].unique()],
        value=['US', 'Germany','Italy'], # which are pre-selected
        multi=True
    ),

    dcc.Markdown('''
        ## Select Timeline of confirmed COVID-19 cases or the approximated doubling time
        '''),


    dcc.Dropdown(
    id='doubling_time',
    options=[
        {'label': 'Timeline Confirmed ', 'value': 'Confirmed'},
        {'label': 'Timeline Confirmed Filtered', 'value': 'Confirmed_filtered'},
        {'label': 'Timeline Doubling Rate', 'value': 'Confirmed_DR'},
        {'label': 'Timeline Doubling Rate Filtered', 'value': 'Confirmed_filtered_DR'},
    ],
    value='Confirmed',
    multi=False
    ),

    dcc.Graph(figure=fig, id='main_window_slope')
])



@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('country_drop_down', 'value'),
    Input('doubling_time', 'value')])
def update_figure(country_list,show_doubling):


    if 'doubling_rate' in show_doubling:
        my_yaxis={'type':"log",
               'title':'Approximated doubling rate over 3 days'
              }
    else:
        my_yaxis={'type':"log",
                  'title':'Confirmed infected people (Source: Johns Hopkins CSSE, log-scale)'
              }


    traces = []
    for each in country_list:

        df_plot=df_input_large[df_input_large['Country']==each]

        if show_doubling=='doubling_rate_filtered':
            df_plot=df_plot[['State','Country','Confirmed','Confirmed_filtered','Confirmed_DR','Confirmed_filtered_DR','date']].groupby(['Country','date']).agg(np.mean).reset_index()
        else:
            df_plot=df_plot[['State','Country','Confirmed','Confirmed_filtered','Confirmed_DR','Confirmed_filtered_DR','date']].groupby(['Country','date']).agg(np.sum).reset_index()
       #print(show_doubling)


        traces.append(dict(x=df_plot.date,
                                y=df_plot[show_doubling],
                                mode='markers+lines',
                                opacity=0.9,
                                name=each
                        )
                )

    return {
            'data': traces,
            'layout': dict (
                width=1280,
                height=720,

                xaxis={'title':'Timeline',
                        'tickangle':-45,
                        'nticks':20,
                        'tickfont':dict(size=14,color="#7f7f7f"),
                      },

                yaxis=my_yaxis
        )
    }

if __name__ == '__main__':

    app.run_server(debug=True, use_reloader=False)


C:\Users\kaust\ads_covid-19_roy\notebooks
Dash is running on http://127.0.0.1:8050/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: on
