# Data Explorer Notebook

In [None]:
# Importing `display` from `IPython.core.display` is deprecated (since IPython 7.14);
# the public entry point is `IPython.display`.
from IPython.display import display, HTML

# Inject CSS so that elements with the `.container` class use the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import sys
import os

import numpy as np
import pandas as pd
import holoviews as hv
import networkx as nx
import bokeh
from holoviews import opts

hv.extension('bokeh')

# Move two directories up from the starting working directory and make that
# folder importable (needed for the `src.*` imports and the relative `./data/...` paths).
# The guard keeps the cell idempotent: re-running it must not climb two more
# levels or append another `sys.path` entry.
if "_PROJECT_ROOT" not in globals():
    os.chdir("../../")
    _PROJECT_ROOT = os.getcwd()
if _PROJECT_ROOT not in sys.path:
    sys.path.append(_PROJECT_ROOT)

print(os.getcwd())

import configparser
import logging
import pathlib
import networkx as nx

from src.features.dataloader import DataLoader
from src.models.networkx_graph import SurfaceModel

In [None]:
# Timestamped, INFO-level logging for the whole notebook.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s: %(message)s',
    datefmt='%d.%m.%Y %H:%M:%S',
)

logging.info("Initiating data_loader")

# Switches telling the loader which parts of the dataset to prepare.
dataset_flags = {
    "load_patients": True,
    "load_cases": True,
    "load_stays": True,
    "load_appointments": True,
    "load_devices": True,
    "load_care_data": False,  # enable if visualization works with other data
    "load_employees": True,
    "load_rooms": True,
    "load_buildings": True,
    "load_partners": False,
    "load_medications": False,
    "load_risks": False,
    "load_chop_codes": False,
    "load_surgeries": False,
    "load_icd_codes": False,
}

loader = DataLoader(hdfs_pipe=False)
dataset = loader.prepare_dataset(**dataset_flags)

# Preparing Dashboard Data

## Building Data

In [None]:
# Collect the record of every loaded building and turn the collection into one table.
records = [building.get_record() for building in dataset["buildings"].values()]

# Drop the first SAP abbreviation column and keep one row per Waveware building ID.
waveware_buildings_df = (
    pd.DataFrame.from_records(records)
    .drop(columns=["SAP Building Abbreviation 1"])
    .drop_duplicates(subset=["WW Building ID"])
)
waveware_buildings_df

In [None]:
# # Show building data we have directly from Waveware
# base_folder = "./data/raw/model_data/"
# waveware_buildings_df = pd.read_csv(base_folder + "Waveware_Auszug Gebaeudeinformation Stand 03.12.2020.csv", encoding="ISO-8859-1", dtype=str)

# waveware_buildings_df = waveware_buildings_df.drop(["Standort", "Parzellennummer", "Zonenplan", "Denkmalpflege", "Anlage-ID", "Bemerkung", "Eigentümer (SAP)", "Vermietung (SAP)", "Portfolio (SAP)", "Baujahr", "Gebäudetyp", "GVB-Nummer", "Amtlicher Wert", "Gebäudeversicherungswert", "Gebäudezustand", "Technologiestand HLKSE", "Techn. Ausb.standard", "Zustand Technik", "Klimatisierung", "Aufzug", "Gebäudezustand Bem.", "Status"], axis=1)
# waveware_buildings_df.columns = ["Waveware Building Full ID", "Building Code", "Waveware Building ID", "Building abbreviation", "Building Common Name", "Street", "Zip Code", "Location", "SAP-Anlage Nr."]
# waveware_buildings_df.drop(["Zip Code", "Location","SAP-Anlage Nr.", "Building Code"], axis=1, inplace=True)
# waveware_buildings_df = waveware_buildings_df[waveware_buildings_df["Building Common Name"] != "Grundstück Inselareal"]

# waveware_buildings_df = waveware_buildings_df[~pd.isna(waveware_buildings_df["Building abbreviation"])]
# waveware_buildings_df.sort_values(by=["Building abbreviation"], inplace=True)
# # waveware_buildings_df.set_index("Waveware Building ID", inplace=True)
# waveware_buildings_df

In [None]:
# # augment building data with building coordinates
# # TODO: Move this to dataset improvement
# import requests

# def get_long_lat(street_string):
#     response = requests.get(f"https://nominatim.openstreetmap.org/search?q={street_string.replace(' ', '+')}+Bern&format=json")
#     types = []
#     for loc in response.json():
#         types.append(loc["type"] + ": " + loc["display_name"][:15])
#         if loc["type"] in ["hospital", "childcare", "clinic"]:
#             id_string = loc["type"] + ": " + loc["display_name"][:15]
#             long_lat = (loc["lon"], loc["lat"])
#             return pd.Series({'Type': id_string, 'Long/Lat': long_lat})
        
#     id_string = response.json()[0]["type"] + ": " + response.json()[0]["display_name"][:15]
#     long_lat = (response.json()[0]["lon"], response.json()[0]["lat"])
#     return pd.Series({'Type': id_string, 'Long/Lat': long_lat})


# waveware_buildings_coords_df = pd.concat([waveware_buildings_df, waveware_buildings_df["Street"].apply(lambda s: get_long_lat(s))], axis=1)
# waveware_buildings_coords_df["Longitude"] = waveware_buildings_coords_df["Long/Lat"].apply(lambda ll: float(ll[0]))
# waveware_buildings_coords_df["Latitude"] = waveware_buildings_coords_df["Long/Lat"].apply(lambda ll: float(ll[1]))
# waveware_buildings_coords_df.drop(["Long/Lat"], axis=1, inplace=True)
# waveware_buildings_coords_df

In [None]:
# waveware_buildings_coords_df["Waveware Building ID"].to_list()

In [None]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx
# Turn the building table into point geometries. The Longitude/Latitude columns are
# declared as EPSG:4326 and then reprojected to EPSG:3857 (the projection used by
# the web-map tiles further down).
building_points = gpd.points_from_xy(
    waveware_buildings_df["Longitude"],
    waveware_buildings_df["Latitude"],
)
gdf = (
    gpd.GeoDataFrame(waveware_buildings_df, geometry=building_points)
    .set_crs(epsg=4326)
    .to_crs(epsg=3857)
)

In [None]:
# Store each geometry's representative point as a coordinate list, plus its
# x and y components as separate columns.
gdf['coords'] = gdf['geometry'].apply(lambda geom: geom.representative_point().coords[:])
first_points = [point_list[0] for point_list in gdf['coords']]
gdf['coords_x'] = [point[0] for point in first_points]
gdf['coords_y'] = [point[1] for point in first_points]

# Lookup table: Waveware building ID -> (x, y) in the projected coordinate system.
building_coordinates = dict(
    zip(gdf["WW Building ID"], zip(gdf["coords_x"], gdf["coords_y"]))
)

gdf

In [None]:
# Show the building-ID -> (x, y) lookup built in the previous cell.
building_coordinates

# Preparing Dashboard Components

In [None]:
import panel as pn
import random
import holoviews as hv
from holoviews import opts
import xarray as xr
import hvplot.pandas  # noqa
import hvplot.xarray  # noqa
hv.extension('bokeh')
pn.extension()

## Side Pane

In [None]:
# Inspect the building IDs available in the loaded dataset.
dataset["buildings"].keys()

In [None]:
# Number of patients in the loaded dataset.
len(list(dataset["patients"].keys()))

## Time Selector

In [None]:
# Build the timeline: weekly VRE screening counts with a range-selection overview.
from holoviews.plotting.links import RangeToolLink

base_folder = "./data/interim/model_data/"
vre_screening_df = pd.read_csv(
    base_folder + "VRE_SCREENING_DATA.csv",
    encoding="ISO-8859-1",
    parse_dates=["Birth Date", "Measurement Date"],
    dtype="str",
)

# Preprocessing: left-pad patient IDs with zeros to 11 characters and make sure the
# measurement date is a real datetime column. `pd.to_datetime` replaces the
# unit-less `.astype("datetime64")`, which pandas 2.x rejects.
vre_screening_df["Patient ID"] = vre_screening_df["Patient ID"].astype(str).str.zfill(11)
vre_screening_df["Measurement Date d64"] = pd.to_datetime(vre_screening_df["Measurement Date"])

# Leave out screenings whose result is "nn". The explicit copy lets us add a column
# without writing into a slice of `vre_screening_df`.
vre_results_df = vre_screening_df.loc[vre_screening_df["Result"] != "nn"].copy()

# Bucket every measurement by the Monday that starts its week. The week start is
# computed directly from the date: combining an ISO week number with the calendar
# year and parsing it with `%W` puts weeks on the wrong date, because the two week
# numbering schemes differ.
measurement_dates = vre_results_df["Measurement Date d64"]
vre_results_df["Measurement Week"] = (
    measurement_dates.dt.normalize()
    - pd.to_timedelta(measurement_dates.dt.weekday, unit="D")
)

# Rows without a valid date have no week and are dropped by the groupby.
weekly_counts_df = (
    vre_results_df.groupby("Measurement Week")
    .size()
    .rename("Count")
    .reset_index()
)

timeline_curve = hv.Curve(weekly_counts_df, 'Measurement Week', ('Count', 'Screenings'))
timeline_histogram = hv.Histogram(timeline_curve)

# `src` is the overview plot that carries the range tool, `tgt` the detail plot whose
# x-range it controls. The statement order is kept on purpose: `.opts` is applied to
# the relabelled copy first and to the original histogram afterwards.
tgt = timeline_histogram.relabel('VRE Screenings').opts(width=500, height=150, labelled=['y'], toolbar='disable')
src = timeline_histogram.opts(width=800, height=150, yaxis=None, default_tools=[])

RangeToolLink(src, tgt)

layout = (src + tgt)
timeline = layout.opts(opts.Layout(shared_axes=False, merge_tools=False))

## Geographic Map

In [None]:
# Preview the Wikipedia tile source (used as the map background in the dashboard cell).
hv.element.tiles.Wikipedia().opts(width=600 * 2, height=550 * 2)

In [None]:
# Preview the CartoLight tile source as an alternative map background.
hv.element.tiles.CartoLight().opts(width=600 * 2, height=550 * 2)

## Network

In [None]:
def create_networkx_graph_from_patients(patients):
    """Build the networkx graph for a subset of patients.

    A fresh ``SurfaceModel`` is filled from the notebook-level ``dataset``,
    restricted to ``patients``; isolated nodes are removed before the model's
    ``S_GRAPH`` attribute is returned.

    Args:
        patients: Patient IDs forwarded as ``patient_subset``.

    Returns:
        The ``S_GRAPH`` attribute of the populated model.
    """
    model = SurfaceModel()
    model.add_network_data(patient_subset=patients, patient_dict=dataset)
    model.remove_isolated_nodes()
    return model.S_GRAPH

from networkx.drawing.layout import _process_params

def hospital_layout(G, building_coordinates, circular_scale=1, force_scale=0.01, center=None, dim=2,
                    default_position=(826469.588389, 5.933624e06)):
    """Lay out a graph so that room nodes sit at their building's map position.

    Nodes whose ``type`` attribute equals ``"Room"`` are grouped by their
    ``building_id`` attribute and placed on a circle around that building's
    coordinates. These room positions are then kept fixed while all remaining
    nodes, which start at ``default_position``, are arranged by a
    Fruchterman-Reingold (force-directed) pass.

    Parameters
    ----------
    G : NetworkX graph
        Graph to lay out; node attributes ``type`` and ``building_id`` are read.
    building_coordinates : dict
        Maps a building id to an ``(x, y)`` pair.
    circular_scale : number (default: 1)
        ``scale`` passed to ``nx.circular_layout`` for the rooms of one building.
    force_scale : number (default: 0.01)
        ``k`` passed to ``nx.fruchterman_reingold_layout``.
    center : array-like or None
        Only validated by networkx's parameter helper; it does not influence the
        returned positions.
    dim : int
        Must be 2.
    default_position : pair of numbers
        Position used for non-room nodes before the force-directed pass and for
        buildings that have no usable entry in ``building_coordinates``.

    Returns
    -------
    pos : dict
        A dictionary of positions keyed by node.

    Raises
    ------
    ValueError
        If ``dim != 2``.
    """
    if dim != 2:
        raise ValueError("cannot handle dimensions != 2")

    # Reuse networkx's argument normalisation; the returned centre is not used below.
    G, center = _process_params(G, center, dim)

    fallback = np.asarray(default_position, dtype=float)
    positions = {}

    # Group room nodes by building; every other node starts at the fallback position.
    buildings_rooms = {}
    for node, attrs in G.nodes(data=True):
        if attrs.get("type") == "Room":
            # A room without a building id ends up under the key None and is
            # therefore placed around the fallback position below.
            buildings_rooms.setdefault(attrs.get("building_id"), []).append(node)
        else:
            positions[node] = fallback.copy()

    for building_id, building_rooms in buildings_rooms.items():
        # Best effort: unknown or malformed building entries fall back to the default
        # position instead of aborting the layout, but unrelated errors still surface.
        try:
            building = building_coordinates[building_id]
            x, y = building[0], building[1]
        except (KeyError, IndexError, TypeError):
            x, y = fallback[0], fallback[1]

        positions.update(nx.circular_layout(building_rooms, center=[x, y], scale=circular_scale))

    room_nodes = [room for rooms in buildings_rooms.values() for room in rooms]

    # Skip the force-directed pass when it has nothing to work with: without room
    # nodes there is nothing to anchor it to (and an empty `fixed` list / empty `pos`
    # is not handled well by networkx), and for graphs with fewer than two nodes
    # networkx ignores `pos` altogether.
    if not room_nodes or len(G) <= 1:
        return positions

    # Arrange all remaining nodes while the room nodes stay where they are.
    return nx.fruchterman_reingold_layout(G, fixed=room_nodes, pos=positions, k=force_scale)

from collections import defaultdict
import random

def from_networkx(G, positions, curved_edges=0.0, nodes=None, **kwargs):
    """
    Generate a HoloViews Graph from a networkx.Graph object and
    networkx layout function or dictionary of node positions.
    Any keyword arguments will be passed to the layout
    function. By default it will extract all node and edge
    attributes from the networkx.Graph but explicit node
    information may also be supplied. Any non-scalar attributes,
    such as lists or dictionaries will be ignored.
    Args:
        G (networkx.Graph): Graph to convert to Graph element
        positions (dict or callable): Node positions
            Node positions defined as a dictionary mapping from
            node id to (x, y) tuple or networkx layout function
            which computes a positions dictionary
        curved_edges (float): Edge curvature
            When non-zero, every edge is drawn as a quadratic Bezier
            curve sampled at 100 points. The control point is the edge
            midpoint shifted perpendicular to the edge by
            ``curved_edges`` times the edge vector's length. With the
            default of 0.0 no explicit edge paths are supplied.
        nodes (optional): Additional node information
            HoloViews element whose first key dimension holds the node
            ids; its remaining columns are appended to the node data.
        kwargs (dict): Keyword arguments for layout function
            (only used when ``positions`` is a callable)
    Returns:
        Graph element
    """
    # A callable is treated as a layout function and evaluated on the graph.
    if not isinstance(positions, dict):
        positions = positions(G, **kwargs)

    # Unpack edges
    edges = defaultdict(list)
    for start, end in G.edges():
        for attr, value in sorted(G.adj[start][end].items()):
            if isinstance(value, (list, dict)):
                continue # Cannot handle list or dict attrs
            edges[attr].append(value)

        # Handle tuple node indexes (used in 2D grid Graphs)
        if isinstance(start, tuple):
            start = str(start)
        if isinstance(end, tuple):
            end = str(end)
        edges['start'].append(start)
        edges['end'].append(end)
    # Keep only edge attributes that were collected once per edge, i.e. whose
    # column is as long as the start/end columns.
    edge_cols = sorted([k for k in edges if k not in ('start', 'end')
                        and len(edges[k]) == len(edges['start'])])
    edge_vdims = [str(col) if isinstance(col, int) else col for col in edge_cols]
    edge_data = tuple(edges[col] for col in ['start', 'end']+edge_cols)

    # Unpack user node info
    xdim, ydim, idim = hv.Graph.node_type.kdims[:3]
    if nodes:
        node_columns = nodes.columns()
        idx_dim = nodes.kdims[0].name
        info_cols, values = zip(*((k, v) for k, v in node_columns.items() if k != idx_dim))
        # Map each node id to the tuple of its extra column values.
        node_info = {i: vals for i, vals in zip(node_columns[idx_dim], zip(*values))}
    else:
        info_cols = []
        node_info = None
    # From here on `node_columns` collects the columns of the output node table.
    node_columns = defaultdict(list)

    # Unpack node positions
    for idx, pos in sorted(positions.items()):
        node = G.nodes.get(idx)
        # Positions for ids that are not nodes of G are skipped.
        if node is None:
            continue
        x, y = pos
        node_columns[xdim.name].append(x)
        node_columns[ydim.name].append(y)
        for attr, value in node.items():
            if isinstance(value, (list, dict)):
                continue
            node_columns[attr].append(value)
        for i, col in enumerate(info_cols):
            node_columns[col].append(node_info[idx][i])
        if isinstance(idx, tuple):
            idx = str(idx) # Tuple node indexes handled as strings
        node_columns[idim.name].append(idx)
    # Keep only node attributes that are present on every positioned node
    # (column length equals the number of x coordinates).
    node_cols = sorted([k for k in node_columns if k not in hv.Graph.node_type.kdims
                        and len(node_columns[k]) == len(node_columns[xdim.name])])
    columns = [xdim.name, ydim.name, idim.name]+node_cols+list(info_cols)
    node_data = tuple(node_columns[col] for col in columns)

    # Construct nodes
    vdims = []
    for col in node_cols:
        if isinstance(col, int):
            dim = str(col)
        elif nodes is not None and col in nodes.vdims:
            dim = nodes.get_dimension(col)
        else:
            dim = col
        vdims.append(dim)
    nodes = hv.Graph.node_type(node_data, vdims=vdims)

    # Construct edges
    if curved_edges != 0:
        # Compute edge paths
        # Quadratic Bezier interpolation between start and end for one coordinate.
        def bezier(start, end, control, steps=np.linspace(0, 1, 100)):
            return (1 - steps)**2 * start + 2 * (1 - steps) * steps * control + steps**2 * end
        # One path per edge, built in the same G.edges() order as `edge_data`.
        paths = []
        for edge in G.edges():
            sx, sy = positions[edge[0]]
            ex, ey = positions[edge[1]]

            # get vector leading from start to end
            vx = ex - sx
            vy = ey - sy

            # perpendicular vector to vector above
            perpendicular_x = -vy
            perpendicular_y = vx

            offset = curved_edges # random.randint(-1, 1) * curved_edges #random.uniform(-curved_edges, curved_edges)

            # define bezier control point as a slight perpendicular offset from the midpoint
            mx = (ex + sx) / 2.0 + perpendicular_x * offset
            my = (ey + sy) / 2.0 + perpendicular_y * offset

            paths.append(np.column_stack([bezier(sx, ex, mx), bezier(sy, ey, my)]))

        graph = hv.Graph((edge_data, nodes, paths), vdims=edge_vdims)
    else:
        graph = hv.Graph((edge_data, nodes), vdims=edge_vdims)

    # Construct graph
    return graph

def create_holoviz_graph_from_patients(patients):
    """Create the HoloViews graph and its node labels for a set of patients.

    The networkx graph for ``patients`` is laid out with ``hospital_layout``
    (using the notebook-level ``building_coordinates``) and converted with
    ``from_networkx`` using curved edges.

    Args:
        patients: Patient IDs to include in the graph.

    Returns:
        Tuple ``(graph, labels)``: the styled ``hv.Graph`` and an ``hv.Labels``
        element that shows every node's index at the node position.
    """
    # Build the graph once, from the argument. It used to be rebuilt from the
    # global `selected_patients`, which silently ignored `patients`.
    networkx_graph = create_networkx_graph_from_patients(patients)

    positions = hospital_layout(networkx_graph, building_coordinates, circular_scale=10, force_scale=0.01)

    colors = hv.Cycle('Set3').values
    graph = from_networkx(networkx_graph, positions, curved_edges=0.15).opts(
        fontscale=2, width=4000, height=4000, title='Interaction network graph',
        node_color='type', node_size=20, node_line_width=0,
        edge_color='grey', edge_line_width=0.5, cmap=colors,
        xaxis=None, yaxis=None)

    labels = hv.Labels(graph.nodes, ['x', 'y'], 'index').opts(
        text_font_size='14pt', text_color='black', bgcolor='white')
    return graph, labels


# Holoviz Dashboard

In [None]:
# define selectors for explorer
# Source: https://panel.holoviz.org/reference/widgets/MultiSelect.html

# Offering every patient in the selector is too much, so only a random subset is used.
# The sample sizes are capped at the number of available IDs (`random.sample` raises
# ValueError when asked for more), and a dedicated seeded generator makes
# Restart & Run All reproduce the same dashboard.
MAX_SELECTABLE_PATIENTS = 1000
INITIAL_SELECTED_PATIENTS = 100
SAMPLING_SEED = 42

patient_sampler = random.Random(SAMPLING_SEED)
all_patient_ids = list(dataset["patients"].keys())
patient_ids = patient_sampler.sample(all_patient_ids, min(MAX_SELECTABLE_PATIENTS, len(all_patient_ids)))
selected_patients = patient_sampler.sample(patient_ids, min(INITIAL_SELECTED_PATIENTS, len(patient_ids)))

graph, graph_labels = create_holoviz_graph_from_patients(selected_patients)

# agent selectors
patient_selector = pn.widgets.MultiSelect(name='Patient ID Selector', value=selected_patients, options=patient_ids, size=10)
# TODO: add device / employee selectors and building / floor / room selectors
# once their option lists are available.

# Source: https://panel.holoviz.org/reference/layouts/Column.html
data_selection_col = pn.Column("# Data Selection")
data_selection_col.append(patient_selector)

# define a control tab group, source: https://panel.holoviz.org/user_guide/Components.html#Tabs
control_tabs = pn.Tabs()
control_tabs.append(("Data Selection", data_selection_col))

# define a map
map_bg = hv.element.tiles.Wikipedia()
# map_bg = hv.element.tiles.CartoLight()

# define a data plot
building_plot = gdf.hvplot.points('coords', color='black')
building_labels = hv.Labels(
    {('x', 'y'): gdf[["coords_x", "coords_y"]].to_numpy(), 'text': gdf["SAP Building Abbreviation 2"].to_list()},
    ['x', 'y'], 'text')

# Define a dashboard grid, source: https://panel.holoviz.org/user_guide/Components.html#GridSpec
# NOTE(review): rows 0-8 hold the controls and the map, row 10 holds the timeline;
# row 9 is left unassigned - confirm whether that gap is intentional.
dashboard_grid = pn.GridSpec(sizing_mode='stretch_both')
dashboard_grid[0:9, 0] = control_tabs
dashboard_grid[0:9, 1:5] = map_bg * building_plot * building_labels * graph * graph_labels
dashboard_grid[10, 1:5] = timeline

In [None]:
import panel as pn

# Wrap the grid in a Panel object and mark it as servable under the given title.
pn.panel(dashboard_grid).servable(title='Spread Explorer')

In [None]:

# # find top30 patients
# n = 30
# patients = set()
# for node in networkx_graph.nodes(data=True):
#     if "type" not in node[1]:
#         continue
        
#     if node[1]["type"] == "Patient":
#         patients.add(node[0])

# highest_degrees = [node[0] for node in sorted(list(networkx_graph.degree), key=lambda x: x[1], reverse=True) if node[0] in patients]
# top_n_patient_ids = highest_degrees[:n]
# top_n_patient_ids