In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import sys

# The notebook is started two directory levels below the project root; the
# imports below (`src.…`) and all `./data/...` paths are relative to that root.
#
# Guard the directory change so that re-running this cell does not climb two
# more levels every time it is executed.
# NOTE(review): the guard assumes the project root is the directory that
# contains the `src` package imported further down — confirm for this repo.
if not os.path.isdir("src"):
    os.chdir("../..")

# Register the root as an absolute path. A relative "../.." entry would be
# re-resolved against the *new* working directory after the chdir above and
# would then point two levels above the project root.
PROJECT_ROOT = os.getcwd()
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print(PROJECT_ROOT)

import logging

from datetime import datetime
import pathlib
import pandas as pd

import click
from tqdm import tqdm

from src.features.dataloader import DataLoader
from src.models.networkx_graph import SurfaceModel

In [None]:
# Load the VRE screening records; get_screening_pids_of_range() below reads
# this frame to look up tested/positive patients within a date range.
encoding = "iso-8859-1"
csv_path = "./data/interim/model_data/VRE_SCREENING_DATA.csv"
# Every column is read as a string (dtype=str); "Record Date" is additionally
# listed in parse_dates so it can be compared against datetime bounds below.
risk_df = pd.read_csv(csv_path, encoding=encoding, parse_dates=["Record Date"], dtype=str)
# NOTE(review): a bare expression only renders when it is the last statement of
# a cell; more code follows in this cell, so this line likely shows nothing.
risk_df

def get_screening_pids_of_range(from_range=None, to_range=None, is_positive=False, screening_df=None):
    """Return the patient IDs of all screening records inside a date range.

    The range is half-open: ``from_range < "Record Date" <= to_range``.

    Args:
        from_range: Exclusive lower bound for "Record Date"; ``None`` means no
            lower bound.
        to_range: Inclusive upper bound for "Record Date"; ``None`` means no
            upper bound.
        is_positive: When true, keep only records whose "Pathogen Result" is
            different from ``"nn"``.
        screening_df: Frame to query. It must provide the columns
            "Record Date", "Pathogen Result" and "Patient ID". Defaults to the
            notebook-level ``risk_df``.

    Returns:
        A list with the "Patient ID" of every matching record. There is one
        entry per record, so a patient with several matching records appears
        several times.
    """
    if screening_df is None:
        screening_df = risk_df

    selected = screening_df

    # Each mask is built from the frame it is applied to. Building it from the
    # unfiltered frame only works through implicit index alignment and breaks
    # as soon as the index is not unique.
    if from_range is not None:
        selected = selected.loc[selected["Record Date"] > from_range]

    if to_range is not None:
        selected = selected.loc[selected["Record Date"] <= to_range]

    if is_positive:
        # NOTE(review): a missing "Pathogen Result" (NaN) also compares unequal
        # to "nn" and is therefore kept as positive — confirm this is intended.
        selected = selected.loc[selected["Pathogen Result"] != "nn"]

    return selected["Patient ID"].tolist()

# Quick manual check of the helper on a single month of screenings.
# Note: start_span / end_span are assigned again in the sliding-window cell
# further down, so the values set here only matter for this call.
start_span = datetime(2017, 12, 1)
end_span = datetime(2018, 1, 1)

# Last expression of the cell: the resulting list of patient IDs is displayed.
get_screening_pids_of_range(start_span, end_span)

In [None]:
from datetime import date, datetime, timedelta

# Root logger configuration: INFO level, timestamps formatted as
# "dd.mm.YYYY HH:MM:SS".
logging.basicConfig(format='%(asctime)s - %(levelname)s: %(message)s', level=logging.INFO, datefmt='%d.%m.%Y %H:%M:%S')

# Timestamp of this run as a compact string; the only visible use is in the
# output file names of the commented-out export code in the last cell.
now_str = datetime.now().strftime("%Y%m%d%H%M%S")

# Window sizes in "months". The sliding-window loop below converts the training
# window to weeks as `training_window_month_qty * 4`.
training_window_month_qty = 2
# Presumably the length of the prediction window in the same 4-week months —
# confirm against the loop below.
prediction_window_month_qty = 1

#####################################
# Validating algorithms
logging.info("Running sliding window validation...")

def datespan(start_date, end_date, delta=timedelta(days=1)):
    """Yield evenly spaced dates from ``start_date`` up to, excluding, ``end_date``.

    Args:
        start_date: First value to yield (a ``date`` or ``datetime``).
        end_date: Exclusive upper bound; iteration stops once it is reached.
        delta: Step added after every yielded value. Defaults to one day.

    Yields:
        ``start_date``, ``start_date + delta``, ``start_date + 2 * delta``, ...
        for as long as the value is strictly smaller than ``end_date``.

    Raises:
        ValueError: If the range is non-empty but ``delta`` does not move the
            date forward. Because this is a generator, the error surfaces when
            the first value is requested.
    """
    # A zero or negative step would never reach end_date and loop forever.
    if start_date < end_date and start_date + delta <= start_date:
        raise ValueError("delta must be positive when start_date is before end_date")

    current_date = start_date
    while current_date < end_date:
        yield current_date
        current_date += delta

# Overall period covered by the sliding-window validation. A new training
# window starts every week inside [start_span, end_span).
start_span = datetime(2017, 12, 1)
end_span = datetime(2021, 3, 31)

# Window lengths; one "month" is approximated as 4 weeks throughout this cell.
training_window = timedelta(weeks=training_window_month_qty * 4)
prediction_window = timedelta(weeks=prediction_window_month_qty * 4)
# Extra data loaded before and after the training window.
# NOTE(review): if prediction_window_month_qty is raised above 1, this padding
# no longer covers the prediction range — confirm whether it has to.
load_padding = timedelta(weeks=4)

# Loop-invariant: patients with a positive screening at any point in time.
infected_patients = get_screening_pids_of_range(is_positive=True)
ever_positive_pids = set(infected_patients)


def _report_id_prediction(predicted_pids):
    """Print the size of a prediction and its overlap with ever-positive patients."""
    print(f"Number of patients at risk predicted by Infection Degree (ID): {len(predicted_pids)}")
    print(f"Number of patients correctly predicted by ID to be ever positive: {len(ever_positive_pids.intersection(predicted_pids))}")


for start_date in datespan(start_span, end_span, timedelta(weeks=1)):
    end_date = start_date + training_window
    load_from = start_date - load_padding
    load_to = end_date + load_padding
    prediction_end = end_date + prediction_window

    # The dataset is reloaded for every window because the load range moves
    # with the window.
    print(f"Loading data within range: {load_from} - {load_to}")
    loader = DataLoader()
    patient_data = loader.prepare_dataset(load_medications=False,
                                          load_icd_codes=False,
                                          load_chop_codes=False,
                                          load_surgeries=False,
                                          load_partners=False,
                                          from_range=load_from,
                                          to_range=load_to)

    # Build the graph snapshot for the training window.
    print(f"Calculating snapshot: {start_date} - {end_date}")
    surface_graph = SurfaceModel(data_dir='./data/processed/networkx')
    surface_graph.add_network_data(patient_dict=patient_data, case_subset='relevant_case')
    surface_graph.trim_model(start_date, end_date)
    surface_graph.remove_isolated_nodes()
    surface_graph.inspect_network()

    surface_graph.add_edge_infection(infection_distance=2)

    # print general stats on screenings and infections
    print(f"Number of patients ever infected: {len(infected_patients)}")

    # Each list holds one entry per screening record, so the printed numbers
    # count records rather than distinct patients.
    screened_in_window = get_screening_pids_of_range(from_range=start_date, to_range=end_date, is_positive=False)
    positive_in_window = get_screening_pids_of_range(from_range=start_date, to_range=end_date, is_positive=True)
    print(f"{len(screened_in_window)} patients screened in graph range {start_date.date()} - {end_date.date()}")
    print(f"{len(positive_in_window)} patients infected in graph range {start_date.date()} - {end_date.date()}")

    # graph and screening range numbers can differ because patients without interactions are discarded as isolated nodes -> Unpredictable nodes
    # TODO: What to do with those?
    # NOTE(review): get_positive_patients() is queried once and reused below;
    # this assumes it is a side-effect-free getter — confirm.
    graph_positive_pids = surface_graph.get_positive_patients()
    print(f"Number of equal patients in graph and in screening range: {len(set(positive_in_window).intersection(set(graph_positive_pids)))}")

    screened_in_prediction = get_screening_pids_of_range(from_range=end_date, to_range=prediction_end, is_positive=False)
    positive_in_prediction = get_screening_pids_of_range(from_range=end_date, to_range=prediction_end, is_positive=True)
    print(f"Number of patients screened in prediction range {end_date} - {prediction_end}: {len(screened_in_prediction)}")
    print(f"Number of patients infected in prediction range {end_date} - {prediction_end}: {len(positive_in_prediction)}")
    print(f"Number of positive patients in graph: {len(graph_positive_pids)}")

    # calculate infection degree
    infection_degree_df = surface_graph.calculate_infection_degree()
    # Prediction candidates: patient nodes that are currently flagged negative.
    is_negative_patient = (infection_degree_df["Node Type"] == "Patient") & (infection_degree_df["Risk Status"] == "neg")
    print(infection_degree_df[is_negative_patient].head(5))

    # sanity checks: are positive patients in ranking?
    ranked_node_ids = infection_degree_df["Node ID"].to_list()
    print(f"Positive patients in infection degree ranking: {len(ever_positive_pids.intersection(ranked_node_ids))}")

    # The thresholds below are computed over all rows of the ranking, not only
    # over the patient rows.
    infected_edge_counts = infection_degree_df["Number of Infected Edges"]

    # print prediction based on degree ratio
    id_prediction1 = infection_degree_df[is_negative_patient & (infection_degree_df["Degree Ratio"] >= 1.0)]["Node ID"].to_list()
    _report_id_prediction(id_prediction1)

    # print prediction based on number of infected edges median
    id_prediction2 = infection_degree_df[is_negative_patient & (infected_edge_counts > infected_edge_counts.median())]["Node ID"].to_list()
    _report_id_prediction(id_prediction2)

    # print prediction based on number of infected edges 75% quartile
    id_prediction3 = infection_degree_df[is_negative_patient & (infected_edge_counts > infected_edge_counts.quantile(0.75))]["Node ID"].to_list()
    _report_id_prediction(id_prediction3)


In [None]:
# Every node of the graph built in the last loop iteration above, as
# (node_id, attribute_dict) pairs.
all_nodes = surface_graph.S_GRAPH.nodes(data=True)

# Keep the patient nodes flagged VRE-positive; entries without a node ID are skipped.
pos_pats = []
for node_entry in all_nodes:
    node_id = node_entry[0]
    node_attrs = node_entry[1]
    if pd.isna(node_id):
        continue
    if node_attrs['type'] == 'Patient' and node_attrs['vre_status'] == 'pos':
        pos_pats.append(node_entry)

# One node ID per line, followed by the total count.
for positive_node in pos_pats:
    print(positive_node[0])

print("------\n", len(pos_pats))

In [None]:
from src.features.model import Patient

# Look up the patient records of the first five positive patient nodes.
# patient_data comes from the last iteration of the sliding-window loop.
first_positive_ids = [node_entry[0] for node_entry in pos_pats[0:5]]
loaded_patients = patient_data["patients"]
pos_pats_details = {patient_id: loaded_patients[patient_id] for patient_id in first_positive_ids}

# Last expression of the cell, so the result is displayed.
Patient.get_contact_patients(pos_pats_details, with_details=False)

In [None]:
# Contact lookup for one specific patient, using the default of `with_details`
# (the cell above passes with_details=False explicitly).
# NOTE(review): the patient identifier is hard-coded and raises KeyError when
# that patient is not part of the currently loaded `patient_data`. If this is a
# real patient ID, consider removing it before the notebook is shared.
Patient.get_contact_patients({"00090898729": patient_data["patients"]["00090898729"]})

In [None]:
# surface_graph.add_edge_infection()

# patient_data = None  # free up memory before graph processing!

# # Extract positive patient nodes
# # positive_patient_nodes = [node for node, nodedata in surface_graph.S_GRAPH.nodes(data=True)
# #                           if nodedata['type'] == 'Patient' and nodedata['vre_status'] == 'pos']

# patient_degree_ratio_df = surface_graph.calculate_patient_degree_ratio()
# print(patient_degree_ratio_df.head(50))
# #patient_degree_ratio_df.to_csv(f"./data/processed/metrics/{now_str}_patient_degree_ratio.csv", index=False)

# total_degree_ratio_df = surface_graph.calculate_total_degree_ratio()
# print(total_degree_ratio_df.head(50))
# #total_degree_ratio_df.to_csv(f"./data/processed/metrics/{now_str}_total_degree_ratio.csv", index=False)

# # TODO: Reenable node betweenness statistics. Deactivated as it uses a lot of resources!
# # node_betweenness_df = surface_graph.calculate_node_betweenness()
# # print(node_betweenness_df.head(50))
# # node_betweenness_df.to_csv(f"./data/processed/metrics/{now_str}_node_betweenness.csv", index=False)
# #####################################