## Final submission notebook for exercise 3

#### Student name: Karan Sanjay Dhage
#### Student ID: 419197

# Data gathering, updating and saving

In [None]:
# %load src/Source codes/Data_preparation/Data_preparation.py

import subprocess
import os
import pandas as pd
import requests
import json


def JH_Data_repo():
    """ Refresh the local Johns Hopkins data checkout.

        The update command is run through the shell with ``data/raw/COVID-19``
        as working directory, so the repository has to be cloned there first.
        Once the process has finished, its captured stderr and stdout are
        printed (as raw bytes representations).
    """
    repo_dir = os.path.dirname('data/raw/COVID-19/')  # Change directory path as per your need
    pipes = dict(stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    process = subprocess.Popen("user/bin/JH_Data_gitPull", cwd=repo_dir, shell=True, **pipes)

    # communicate() blocks until the command exits and yields (stdout, stderr)
    std_out, std_err = process.communicate()

    print("Error : ", std_err, sep="")
    print("out : ", std_out, sep="")


def Example_for_Germany(output_path=r'C:\Users\HP\Desktop\pythonProject\data/GER_state_data.csv'):
    """ Get current data from germany, attention API endpoint not too stable
        The 'attributes' entry of every returned feature becomes one row of a
        pd.DataFrame, which is then saved as a ';'-separated CSV file.

        Parameters:
        ----------
        output_path : str
            target CSV file; defaults to the path used so far, change it as
            per your folder structure

        Raises:
        ----------
        requests.HTTPError
            if the endpoint answers with an HTTP error status
        requests.Timeout
            if the endpoint does not answer within 30 seconds
        KeyError
            if the JSON answer carries no 'features' entry
    """
    response = requests.get(
        'https://services7.arcgis.com/mOBPykOjAyBO2ZKk/arcgis/rest/services/RKI_Landkreisdaten/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&f=json',
        timeout=30)  # the endpoint is unstable, never wait forever
    response.raise_for_status()  # fail loudly instead of parsing an error page

    json_object = json.loads(response.content)  # load all data
    German_data_list = [feature['attributes'] for feature in json_object['features']]

    df_Germany = pd.DataFrame(German_data_list)  # prepare a Dataframe containing final data
    df_Germany.to_csv(output_path, sep=';')  # save data as CSV file in needed folder
    print(' Regions rows: ' + str(df_Germany.shape[0]))


if __name__ == '__main__':
    # Run both data-gathering steps when this cell/script is executed directly.
    JH_Data_repo()
    Example_for_Germany()

# Data processing pipeline

In [None]:
# %load src/Source codes/JH_Data_Extraction/JH_Data_Extraction.py
import pandas as pd


def JH_data_Extraction(raw_path=None, output_path=None):
    """ Creation of relative data

        Reads the wide Johns Hopkins time series (one column per date) and
        stores it in long/relational form with the columns
        date, state, country, confirmed as a ';'-separated CSV file.

        Parameters:
        ----------
        raw_path : str, optional
            CSV file with the raw time series; defaults to the location inside
            the local COVID-19 checkout
        output_path : str, optional
            target CSV file; defaults to the project data folder
    """
    if raw_path is None:
        # NOTE: the file name must not end with a blank, otherwise the file is
        # not found on file systems that do not strip trailing spaces
        raw_path = r'C:\Users\HP\Desktop\pythonProject/data/raw/COVID-19/csse_covid_19_data/csse_covid_19_time_series' \
                   r'/time_series_covid19_confirmed_global.csv'
    if output_path is None:
        output_path = r'C:\Users\HP\Desktop\pythonProject\data/COVID_relational_confirmed.csv'  # Change directory path as per your need

    df_raw = pd.read_csv(raw_path, delimiter=",")
    df_db = df_raw.rename(columns={'Country/Region': 'country',
                                   'Province/State': 'state'})

    # countries without provinces get the placeholder 'no' so that they
    # survive the index/stack operations below
    df_db['state'] = df_db['state'].fillna('no')

    df_db = df_db.drop(['Lat', 'Long'], axis=1)

    # dates become the rows, then both column levels are stacked into rows
    df_rel = df_db.set_index(['state', 'country']) \
        .T \
        .stack(level=[0, 1]) \
        .reset_index() \
        .rename(columns={'level_0': 'date',
                         0: 'confirmed'},
                )

    df_rel['date'] = df_rel['date'].astype('datetime64[ns]')

    df_rel.to_csv(output_path, sep=';', index=False)
    print(' Stored rows: ' + str(df_rel.shape[0]))
    print(' Latest date is: ' + str(df_rel['date'].max()))


if __name__ == '__main__':
    # Build the relational CSV when this cell/script is executed directly.
    JH_data_Extraction()


# Data filtering

In [None]:
# %load src/Source codes/Filtration of data/Data_Filtration.py
import pandas as pd
import numpy as np

from sklearn import linear_model
from scipy import signal

# Shared module-level regression model; doubling_T_via_reg() refits it on every call.
reg = linear_model.LinearRegression(fit_intercept=True)


def doubling_T_via_reg(in_array):
    """ Using a linear regression to calculate the doubling rate

        A straight line is fitted through the three values at x = -1, 0, 1;
        the result is intercept / slope of that line.

        Parameters:
        ----------
        in_array : pandas.series
            exactly three consecutive values

        Returns:
        ----------
        Doubling rate: double
            scalar; inf or nan (with a numpy warning) if the slope is zero
    """
    # validate before doing any work
    assert len(in_array) == 3

    y = np.array(in_array)
    X = np.arange(-1, 2).reshape(-1, 1)  # shaping x, y arrays

    reg.fit(X, y)  # regression
    intercept = reg.intercept_
    # coef_ has shape (n_features,) == (1,); take the single slope so that a
    # scalar is returned (rolling().apply() needs a scalar, not an array)
    slope = reg.coef_[0]

    return intercept / slope


def savgol_fil(input_data, column='confirmed', window=5):
    """ Savgol Filter which can be used in groupby apply function (data structure kept)
        parameters:
        ----------
        input_data : pd.DataFrame
            has to contain `column`; it is not modified
        column : str
            column the filter is applied to
        window : int
            used data points to calculate the filter result

        Returns:
        ----------
        df_result: pd.DataFrame
            copy of input_data with the additional column `<column>_filtered`,
            the index of the input_data is preserved in result
    """

    degree = 1  # polynomial order of the filter
    # work on a copy so that the caller's frame is not changed as a side effect
    df_result = input_data.copy()

    filter_in = input_data[column].fillna(0)  # attention with the neutral element here

    result = signal.savgol_filter(np.array(filter_in),
                                  window,  # window size used for filtering
                                  degree)
    df_result[str(column + '_filtered')] = result
    return df_result


def rolling_reg(input_data, col='confirmed'):
    """ Approximate the doubling time with a regression over a sliding window

        Parameters:
        ----------
        input_data: pd.DataFrame
        col: str
            column the rolling window runs over
        Returns:
        ----------
        result: pd.Series
            doubling_T_via_reg evaluated on every complete 3-value window,
            incomplete windows yield NaN
    """
    window_size = 3  # number of consecutive values per regression
    series = input_data[col]
    windows = series.rolling(window=window_size, min_periods=window_size)
    return windows.apply(doubling_T_via_reg, raw=False)


def filtered_data(input_data, filter_on='confirmed'):
    """  Calculate savgol filter and return merged data frame
        Parameters:
        ----------
        input_data: pd.DataFrame
            has to contain the columns 'state', 'country' and `filter_on`
        filter_on: str
            defines the used column
        Returns:
        ----------
        output: pd.DataFrame
            the result will be joined as a new column `<filter_on>_filtered`
            on a copy of the input data frame
    """

    must_contain = {'state', 'country', filter_on}
    assert must_contain.issubset(set(input_data.columns)), ' Error in filtered_data not all columns in data frame'

    output = input_data.copy()  # we need a copy here otherwise the filter_on column will be overwritten
    filtered_column = str(filter_on + '_filtered')

    # group_keys=False keeps the original row index on the result (otherwise
    # newer pandas versions prepend the group keys and the index merge below
    # no longer matches); filter_on has to be forwarded, savgol_fil would
    # otherwise always look for its default column 'confirmed'
    pd_filtered_result = output[['state', 'country', filter_on]] \
        .groupby(['state', 'country'], group_keys=False) \
        .apply(savgol_fil, column=filter_on)

    output = pd.merge(output, pd_filtered_result[[filtered_column]], left_index=True,
                      right_index=True, how='left')
    return output


def doubling_rate(input_data, filter_on='confirmed'):
    """ Calculate approximated doubling rate and return merged data frame
        Parameters:
        ----------
        input_data: pd.DataFrame
            has to contain the columns 'state', 'country' and `filter_on`
        filter_on: str
            defines the used column
        Returns:
        ----------
        output: pd.DataFrame
            the result will be joined as a new column `<filter_on>_DR`
            on the input data frame
    """

    must_contain = {'state', 'country', filter_on}
    assert must_contain.issubset(set(input_data.columns)), ' Error in doubling_rate not all columns in data frame'

    # one rolling regression per (state, country) group; after reset_index the
    # original row index of input_data shows up as column 'level_2'
    pd_DR_result = input_data.groupby(['state', 'country']).apply(rolling_reg, filter_on).reset_index()

    pd_DR_result = pd_DR_result.rename(columns={filter_on: filter_on + '_DR',
                                                'level_2': 'index'})

    # Merging on the index of our big table and on the index column after groupby
    output = pd.merge(input_data, pd_DR_result[['index', str(filter_on + '_DR')]], left_index=True, right_on=['index'],
                      how='left')
    output = output.drop(columns=['index'])  # helper column is not part of the result

    return output


if __name__ == '__main__':
    # select directory path as per your folder structure
    # (the input is the file written by the data extraction step, so it is read
    # from the same project data folder that step stores it in)
    JH_data = pd.read_csv(r'C:\Users\HP\Desktop\pythonProject\data/COVID_relational_confirmed.csv', sep=';',
                          parse_dates=[0])
    JH_data = JH_data.sort_values('date', ascending=True).copy()

    # filtered time series first, then the doubling rate of raw and filtered data
    result_larg = filtered_data(JH_data)
    result_larg = doubling_rate(result_larg)
    result_larg = doubling_rate(result_larg, 'confirmed_filtered')

    # blank out the filtered doubling rate where there are 100 or fewer confirmed cases
    mask_data = result_larg['confirmed'] > 100
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
    result_larg['confirmed_filtered_DR'] = result_larg['confirmed_filtered_DR'].where(mask_data, other=np.nan)
    result_larg.to_csv(r'C:\Users\HP\Desktop\pythonProject\data/processed/COVID_final_set.csv', sep=';', index=False)


# Final dashboard for visualization

In [None]:
# %load src/Source codes/Visualisation of data/Dashboard.py
import pandas as pd
import numpy as np
import dash
import dash_core_components as dcc
import dash_html_components as html

from dash.dependencies import Input, Output, State
import plotly.graph_objects as go
import os

dash.__version__

print(os.getcwd())
# processed data set the dashboard is built on
df_input_large = pd.read_csv(r'C:\Users\HP\Desktop\pythonProject\data\processed\COVID_final_set.csv', sep=';')

fig = go.Figure()
app = dash.Dash()

# one dropdown entry per country found in the data
_country_options = [dict(label=country, value=country)
                    for country in df_input_large['country'].unique()]

# selectable data columns, the value is the column name in df_input_large
_timeline_options = [
    dict(label='Timeline Confirmed ', value='confirmed'),
    dict(label='Timeline Confirmed Filtered', value='confirmed_filtered'),
    dict(label='Timeline Doubling Rate', value='confirmed_DR'),
    dict(label='Timeline Doubling Rate Filtered', value='confirmed_filtered_DR'),
]

_headline = dcc.Markdown('''
    #  Applied Data Science on COVID-19 dataset
    This dashboard is being implemented to demonstrate the understanding and knowledge gained during the data science 
    course. This will show different aspects of data science such as automated data scrapping, data filtration and 
    machine learning concepts.

    ''')

_country_caption = dcc.Markdown('''
    ## Selection of various countries for visualization purpose
    ''')

_country_dropdown = dcc.Dropdown(
    id='countries',
    options=_country_options,
    value=['India', 'Germany', 'Poland'],  # which are pre-selected
    multi=True
)

_timeline_caption = dcc.Markdown('''
        ## Selection for Timeline of confirmed COVID-19 cases or the approximated doubling time
        ''')

_timeline_dropdown = dcc.Dropdown(
    id='doubling_time',
    options=_timeline_options,
    value='confirmed',
    multi=False
)

# page layout: headline, country selection, column selection, graph
app.layout = html.Div([
    _headline,
    _country_caption,
    _country_dropdown,
    _timeline_caption,
    _timeline_dropdown,
    dcc.Graph(figure=fig, id='main_window_slope'),
])


@app.callback(
    Output('main_window_slope', 'figure'),
    [Input('countries', 'value'),
     Input('doubling_time', 'value')])
def update_figure(country_list, show_doubling):
    """ Build the figure for the selected countries and the selected data column

        Parameters:
        ----------
        country_list : list of str
            countries selected in the 'countries' dropdown (may be empty or None)
        show_doubling : str
            value of the 'doubling_time' dropdown, i.e. one of the columns
            'confirmed', 'confirmed_filtered', 'confirmed_DR',
            'confirmed_filtered_DR'; None (cleared dropdown) falls back to
            'confirmed'

        Returns:
        ----------
        dict
            figure description with one trace per country under 'data' and
            the axis/size settings under 'layout'
    """
    # a cleared dropdown delivers None, fall back to the default selections
    show_doubling = show_doubling or 'confirmed'
    country_list = country_list or []

    # the doubling rate columns are the ones carrying 'DR' in their name
    if 'DR' in show_doubling:
        my_yaxis = {'type': "log",
                    'title': 'Approximated doubling rate over 3 days (larger numbers are better #stayathome)'
                    }
    else:
        my_yaxis = {'type': "log",
                    'title': 'Logarithmic scaled Confirmed infected people (source johns hopkins csse)'
                    }

    # the filtered doubling rate is averaged over the states of a country,
    # every other column is summed up
    aggregation = 'mean' if show_doubling == 'confirmed_filtered_DR' else 'sum'

    # 'state' is left out on purpose: it is a text column and must not be
    # summed or averaged
    plot_columns = ['country', 'date',
                    'confirmed', 'confirmed_filtered', 'confirmed_DR', 'confirmed_filtered_DR']

    traces = []
    for each in country_list:

        df_plot = df_input_large[df_input_large['country'] == each]
        df_plot = df_plot[plot_columns].groupby(['country', 'date']).agg(aggregation).reset_index()

        traces.append(dict(x=df_plot.date,
                           y=df_plot[show_doubling],
                           mode='markers+lines',
                           opacity=0.9,
                           name=each
                           )
                      )
    return {
        'data': traces,
        'layout': dict(
            width=1280,
            height=720,
            title="Graphical representation of log scaled confirmed cases to timeline",
            xaxis={'title': 'Timeline',
                   'tickangle': -45,
                   'nticks': 20,
                   'tickfont': dict(size=14, color="#7f7f7f"),
                   },

            yaxis=my_yaxis
        )
    }


if __name__ == '__main__':
    # Start the Dash server in debug mode; the reloader is switched off explicitly.
    app.run_server(debug=True, use_reloader=False)