In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Resolve the project root to an absolute path exactly once per kernel.
# Re-running this cell must not climb another two directories, which a bare
# os.chdir("../..") would do on every execution.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../..")
    os.chdir(PROJECT_ROOT)

# Register the absolute path: a relative "../.." entry would be interpreted
# against whatever the working directory is when the import system consults
# it, i.e. potentially after the chdir above.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print(os.getcwd())

import logging

from datetime import datetime
import pathlib
import pandas as pd

import click
from tqdm import tqdm

from src.features.dataloader import DataLoader
from src.models.networkx_graph import SurfaceModel

/media/benelot/SPACE/loci/IDSC/Infectology/vre-spark


In [3]:
# Load the VRE screening records used to look up tested/positive patients.
encoding = "iso-8859-1"
csv_path = "./data/interim/model_data/VRE_SCREENING_DATA.csv"
# dtype=str reads every column as text (patient IDs keep their leading zeros);
# "Record Date" is additionally parsed into datetimes so it can be range-filtered.
risk_df = pd.read_csv(csv_path, encoding=encoding, parse_dates=["Record Date"], dtype=str)

def get_risk_pids_of_range(from_range=None, to_range=None, is_positive=False, screening_df=None):
    """Return the patient IDs of screening records inside a date window.

    Args:
        from_range: Exclusive lower bound on "Record Date"; ``None`` means
            no lower bound.
        to_range: Inclusive upper bound on "Record Date"; ``None`` means no
            upper bound.
        is_positive: When True, keep only rows whose "Pathogen Result" is
            not the string ``"nn"``. Rows with a missing result also pass
            this test, because a missing value compares unequal to ``"nn"``.
        screening_df: Frame to query. It must provide the columns
            "Record Date", "Pathogen Result" and "Patient ID". Defaults to
            the notebook-level ``risk_df``.

    Returns:
        list: The "Patient ID" values of the matching rows, in frame order.
        One entry per record, so a patient may appear more than once.
    """
    if screening_df is None:
        screening_df = risk_df

    # Build a single mask on the frame that is actually being filtered,
    # instead of masks derived from a different frame that only work via
    # implicit index alignment.
    keep = pd.Series(True, index=screening_df.index, dtype=bool)

    if from_range is not None:
        keep = keep & (screening_df["Record Date"] > from_range)

    if to_range is not None:
        keep = keep & (screening_df["Record Date"] <= to_range)

    if is_positive:
        keep = keep & (screening_df["Pathogen Result"] != "nn")

    return screening_df.loc[keep, "Patient ID"].tolist()

# Example query: records dated after 1 Dec 2017, up to and including 1 Jan 2018.
start_span = datetime(year=2017, month=12, day=1)
end_span = datetime(year=2018, month=1, day=1)

get_risk_pids_of_range(from_range=start_span, to_range=end_span)

['00013253506',
 '00006240593',
 '00011897031',
 '00014303736',
 '000504077-9',
 '00014404079',
 '00007420510',
 '00082310750',
 '00014401584',
 '00012552631',
 '00011739495',
 '00002477068',
 '00012631086',
 '00000722073',
 '00014244241',
 '00003324494']

In [4]:
from datetime import date, datetime, timedelta

# --- Logging setup: INFO level, "dd.mm.YYYY HH:MM:SS - LEVEL: message" lines ---
logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s', level=logging.INFO,
                    datefmt='%d.%m.%Y %H:%M:%S')

# Timestamp of this run as YYYYmmddHHMMSS. Within this notebook it is only
# referenced by the commented-out CSV export paths in the last cell.
now_str = datetime.now().strftime("%Y%m%d%H%M%S")

# --- Validating algorithms ---
logging.info("Running sliding window validation...")

def datespan(start_date, end_date, delta=timedelta(days=1)):
    """Yield evenly spaced dates from ``start_date`` up to, but excluding, ``end_date``.

    Args:
        start_date: First value yielded (if it is before ``end_date``).
        end_date: Exclusive upper bound of the sequence.
        delta: Step between consecutive values; defaults to one day.

    Yields:
        ``start_date``, ``start_date + delta``, ... while the value is
        strictly less than ``end_date``. Nothing is yielded when
        ``start_date >= end_date``.

    Raises:
        ValueError: If there is a range to cover but ``delta`` is a zero or
            negative ``timedelta``, which could never reach ``end_date``.
    """
    # Only reject the case that would otherwise never terminate; every input
    # for which the loop below finishes keeps its previous behaviour.
    if isinstance(delta, timedelta) and delta <= timedelta(0) and start_date < end_date:
        raise ValueError(f"delta must be positive to advance from start_date to end_date, got {delta!r}")

    current_date = start_date
    while current_date < end_date:
        yield current_date
        current_date += delta

# Overall validation period; these rebind the start_span/end_span set in the earlier cell.
start_span = datetime(2017, 12, 1)
end_span = datetime(2021, 3, 31)

# Slide a 24-week (6 * 4 weeks) snapshot window forward in one-week steps.
for start_date in datespan(start_span, end_span, timedelta(weeks=1)):
    end_date = start_date + timedelta(weeks=6 * 4)
    # Data is loaded with a 4-week margin on either side of the snapshot window.
    print(f"Loading data within range: {start_date - timedelta(weeks=4)} - {end_date + timedelta(weeks=4)}")
    loader = DataLoader(hdfs_pipe=False)  # hdfs_pipe = False --> files will be loaded directly from CSV
    patient_data = loader.prepare_dataset(load_medications=False,
                                          load_icd_codes=False,
                                          load_chop_codes=False,
                                          load_surgeries=False,
                                          load_partners=False,
                                          from_range=start_date - timedelta(weeks=4),
                                          to_range=end_date + timedelta(weeks=4))
    
    # Build the graph from the loaded data, then restrict it to the snapshot window.
    print(f"Calculating snapshot: {start_date} - {end_date}")
    surface_graph = SurfaceModel(data_dir='./data/processed/networkx')
    surface_graph.add_network_data(patient_dict=patient_data, case_subset='relevant_case')
    surface_graph.trim_model(start_date, end_date)
    surface_graph.remove_isolated_nodes()
    surface_graph.inspect_network()
    
    # NOTE(review): this unconditional break ends the loop after the first
    # window, so nothing below it in the loop body ever runs (including
    # `del loader`). Later cells read `surface_graph` and `patient_data` as
    # left behind by this first iteration.
    break
    
    surface_graph.add_edge_infection()
    
    # No date bounds are passed, so this is every positive record in risk_df,
    # not only those inside the current window.
    infected_patients = get_risk_pids_of_range(is_positive=True)
    print(f"Number of patients ever infected: {len(infected_patients)}")

    # Predict "at risk" as patient nodes with Risk Status "neg" and Degree Ratio >= 1.0,
    # then count how many of them appear among the positive patient IDs.
    patient_degree_ratio_df = surface_graph.calculate_patient_degree_ratio()
    print(patient_degree_ratio_df[(patient_degree_ratio_df["Node Type"] == "Patient") & (patient_degree_ratio_df["Risk Status"] == "neg")].head(50))
    pdr_prediction = patient_degree_ratio_df[(patient_degree_ratio_df["Node Type"] == "Patient") & (patient_degree_ratio_df["Degree Ratio"] >= 1.0) & (patient_degree_ratio_df["Risk Status"] == "neg")]["Node ID"].to_list()
    print(f"Number of patients at risk predicted by Patient Degree Ratio (PDR): {len(pdr_prediction)}")
    print(f"Number of patients correctly predicted by PDR to be ever positive: {len(set(infected_patients).intersection(set(pdr_prediction)))}")

#     total_degree_ratio_df = surface_graph.calculate_total_degree_ratio()
#     #print(total_degree_ratio_df.head(50))
#     tdr_prediction = total_degree_ratio_df[(total_degree_ratio_df["Node Type"] == "Patient") & (total_degree_ratio_df["Total Degree Ratio"] >= 1.0)]["Node ID"].to_list()
#     print(f"Number of patients at risk predicted by Total Degree Ratio (TDR): {len(tdr_prediction)}")
#     print(f"Number of patients correctly predicted by TDR to be ever positive: {len(set(infected_patients).intersection(set(tdr_prediction)))}")
    
    # Drop the loader reference at the end of each window (unreachable while the break above is in place).
    del loader
    #del patient_data
    #del surface_graph

02.07.2021 15:30:27 - INFO: Running sliding window validation...
  from pandas import Panel
02.07.2021 15:30:27 - INFO: Processing data (load_test_data is False, hdfs_pipe is False, base_path set to ./data/interim/model_data/).
02.07.2021 15:30:27 - INFO: [AGENT] loading patient data...


Loading data within range: 2017-11-03 00:00:00 - 2018-06-15 00:00:00


100%|██████████| 2026966/2026966 [00:23<00:00, 85859.71it/s] 
02.07.2021 15:30:53 - INFO: 2004681 patients created
02.07.2021 15:30:53 - INFO: [AGENT ATTRIBUTE] loading building data..
100%|██████████| 24/24 [00:00<00:00, 707.23it/s]
02.07.2021 15:30:53 - INFO: [AGENT] loading room data...
100%|██████████| 1241/1241 [00:00<00:00, 27532.95it/s]
02.07.2021 15:30:53 - INFO: 1056 rooms created, 33 buildings created, 61 floors created
02.07.2021 15:30:53 - INFO: [INTERACTION] loading case data...
100%|██████████| 2162978/2162978 [00:31<00:00, 69150.28it/s] 
02.07.2021 15:31:30 - INFO: 2160192 cases ok, 0 patients not found, 2786 cases not active
02.07.2021 15:31:30 - INFO: [AGENT ATTRIBUTE] loading partner data omitted.
02.07.2021 15:31:30 - INFO: [INTERACTION] loading stay data...
100%|██████████| 2785840/2785840 [00:41<00:00, 67658.92it/s] 
100%|██████████| 2785840/2785840 [00:09<00:00, 289858.74it/s]
02.07.2021 15:32:28 - INFO: 2785834 stays ok, 6 cases not found, 0 malformed, 2785834 wa

Calculating snapshot: 2017-12-01 00:00:00 - 2018-05-18 00:00:00


100%|██████████| 2004681/2004681 [00:31<00:00, 64476.18it/s] 
02.07.2021 15:35:04 - INFO: ##################################################################################
02.07.2021 15:35:04 - INFO: Encountered 0 stays without associated room, 2785834 rooms identified.
02.07.2021 15:35:04 - INFO: ------------------------------------------------------------------
02.07.2021 15:35:04 - INFO: ------------------------------------------------------------------
02.07.2021 15:35:04 - INFO: ##################################################################################
02.07.2021 15:35:04 - INFO: ###############################################################
02.07.2021 15:35:04 - INFO: Running network statistics...
02.07.2021 15:35:07 - INFO: --> Model Snapshot date: from 01.01.1 00:00:00 to 02.07.2021 15:30:27
02.07.2021 15:35:07 - INFO: --> Total 2008503 nodes, out of which 1872805 are isolated
02.07.2021 15:35:08 - INFO: --> Total 3803560 edges
02.07.2021 15:35:08 - INFO: ------------

In [5]:
# Entries of this view are indexed below as (node_id, attribute_dict) pairs.
all_nodes = surface_graph.S_GRAPH.nodes(data=True)

# Keep the patient nodes flagged VRE-positive, ignoring nodes with a missing ID.
pos_pats = [
    entry
    for entry in all_nodes
    if not pd.isna(entry[0])
    and entry[1]['type'] == 'Patient'
    and entry[1]['vre_status'] == 'pos'
]

# List the matching node IDs, followed by their count.
for pos_pat in pos_pats:
    print(pos_pat[0])

print("------\n", len(pos_pats))

00000018996
00001255975
00003672018
00001248766
00001259776
00001256122
00001271733
00002504871
00002469146
00001283375
00003805298
00003846210
00002792176
00003850242
00002807432
00001410407
00002920182
00001490044
00002972425
00001516175
00003041913
00004026098
00003144305
00004049446
00004043448
00001711342
00003324494
00001769391
00003349314
00003398501
00004197305
00001874047
00001894374
00004227816
00003479862
00003455203
00004267079
00001957767
00000177008
00002073072
00005070759
00005069009
00002132516
00002156504
00005142377
00002191490
00004452925
00004461843
00002254174
00002267764
00002283832
00002350858
00005928850
00004661672
00006029230
00006057241
00006136770
00007035853
00007043481
00006291910
00006356109
00006403859
00007124023
00007126417
00007141920
00006511023
00004827791
00006593917
00006614426
00004869354
00004872770
00006727948
00007348037
00006895301
00009154515
00007420510
00009231285
00009261400
00009284664
00008273790
00008270449
00008332037
00009394796
0000

In [17]:
from src.features.model import Patient

# Look up the patient records for the first five VRE-positive nodes, keyed by node ID.
pos_pats_details = {
    node[0]: patient_data["patients"][node[0]]
    for node in pos_pats[:5]
}

Patient.get_contact_patients(pos_pats_details, with_details=False)

  0%|          | 0/5 [00:00<?, ?it/s]
100%|██████████| 6/6 [00:00<00:00, 453.36it/s]

100%|██████████| 6/6 [00:00<00:00, 188.82it/s]

100%|██████████| 4/4 [00:00<00:00, 698.96it/s]

100%|██████████| 11/11 [00:00<00:00, 210.85it/s]

100%|██████████| 2/2 [00:00<00:00, 602.93it/s]

100%|██████████| 6/6 [00:00<00:00, 428.26it/s]

100%|██████████| 4/4 [00:00<00:00, 1346.92it/s]
 20%|██        | 1/5 [00:00<00:00,  7.33it/s]
100%|██████████| 10/10 [00:00<00:00, 182.20it/s]

100%|██████████| 4/4 [00:00<00:00, 166.63it/s]

100%|██████████| 3/3 [00:00<00:00, 486.37it/s]

100%|██████████| 6/6 [00:00<00:00, 249.18it/s]

100%|██████████| 8/8 [00:00<00:00, 244.27it/s]
 40%|████      | 2/5 [00:00<00:00,  6.82it/s]
  0%|          | 0/10 [00:00<?, ?it/s][A
100%|██████████| 10/10 [00:00<00:00, 84.03it/s][A

100%|██████████| 9/9 [00:00<00:00, 184.45it/s]

  0%|          | 0/10 [00:00<?, ?it/s][A
100%|██████████| 10/10 [00:00<00:00, 85.60it/s][A

100%|██████████| 6/6 [00:00<00:00, 190.29it/s]

100%|████

KeyboardInterrupt: 

In [8]:
# Spot check: contact lookup for a single hard-coded patient ID, using default options.
# NOTE(review): the saved output repeats identical contact tuples several times — confirm whether that is expected.
Patient.get_contact_patients({"00090898729": patient_data["patients"]["00090898729"]})

100%|██████████| 1/1 [00:00<00:00, 31.21it/s]


{'00014565943': [('00090898729',
   Timestamp('2018-02-01 22:36:00'),
   Timestamp('2018-02-02 01:05:00'),
   'INO C 10',
   'contact_room'),
  ('00090898729',
   Timestamp('2018-02-01 22:36:00'),
   Timestamp('2018-02-02 01:05:00'),
   'INO C 10',
   'contact_room'),
  ('00090898729',
   Timestamp('2018-02-01 22:36:00'),
   Timestamp('2018-02-02 01:05:00'),
   'INO C 10',
   'contact_room'),
  ('00090898729',
   Timestamp('2018-02-01 22:36:00'),
   Timestamp('2018-02-02 01:05:00'),
   'INO C 10',
   'contact_room')],
 '00014565960': [('00090898729',
   Timestamp('2018-02-01 22:36:00'),
   Timestamp('2018-02-02 00:30:00'),
   'INO C 10',
   'contact_room'),
  ('00090898729',
   Timestamp('2018-02-01 22:36:00'),
   Timestamp('2018-02-02 00:30:00'),
   'INO C 10',
   'contact_room'),
  ('00090898729',
   Timestamp('2018-02-01 22:36:00'),
   Timestamp('2018-02-02 00:30:00'),
   'INO C 10',
   'contact_room'),
  ('00090898729',
   Timestamp('2018-02-01 22:36:00'),
   Timestamp('2018-02-02 

In [None]:
# surface_graph.add_edge_infection()

# patient_data = None  # free up memory before graph processing!

# # Extract positive patient nodes
# # positive_patient_nodes = [node for node, nodedata in surface_graph.S_GRAPH.nodes(data=True)
# #                           if nodedata['type'] == 'Patient' and nodedata['vre_status'] == 'pos']

# patient_degree_ratio_df = surface_graph.calculate_patient_degree_ratio()
# print(patient_degree_ratio_df.head(50))
# #patient_degree_ratio_df.to_csv(f"./data/processed/metrics/{now_str}_patient_degree_ratio.csv", index=False)

# total_degree_ratio_df = surface_graph.calculate_total_degree_ratio()
# print(total_degree_ratio_df.head(50))
# #total_degree_ratio_df.to_csv(f"./data/processed/metrics/{now_str}_total_degree_ratio.csv", index=False)

# # TODO: Reenable node betweenness statistics. Deactivated as it uses a lot of resources!
# # node_betweenness_df = surface_graph.calculate_node_betweenness()
# # print(node_betweenness_df.head(50))
# # node_betweenness_df.to_csv(f"./data/processed/metrics/{now_str}_node_betweenness.csv", index=False)
# #####################################