Load and parse the Dataset (Run this first!)

Update the path below so that it points to the directory containing all of the xlsx files.

In [None]:
import pandas as pd

from ParseData import parse_dataset
from utils.PropertyNames import ColumnNames as Cols

# Location of the dataset on disk; change this to match your own machine.
DATA_DIR = "/home/lumpus/Documents/deBruijnData/Diabetes/"

# Load the dataset and collect the distinct patient identifiers.
patient_data = parse_dataset(DATA_DIR, silent=False)
patients = patient_data[Cols.patient].unique()


Filter prune benchmark Test

In [None]:
from utils.PropertyNames import MethodOptions as Opts
from Benchmark import benchmark

# Benchmark configuration using the `Opts.filter` pruning method.
params = dict(
    k=5,
    risky_chars={0, 1},
    risk_threshold=0.2,
    prune=True,
    prune_method=Opts.filter,
    prune_threshold=1,
    weight_thresholds=[1, 3, 8],
    value_ranges=[(0, 2), (2, 3), (3, float('inf'))],
    max_steps=6,
    naive_threshold=15,
)

# Despite its name, `excluded` ends up holding the patients that are KEPT:
# it starts as the full patient list and the ids below are removed from it.
excluded = patients.copy().tolist()
for dropped_patient in ('P17', 'P26'):
    excluded.remove(dropped_patient)

# Work on a copy restricted to the remaining patients.
benchmark_input = patient_data[patient_data[Cols.patient].isin(excluded)].copy()
benchmark(benchmark_input, start_time_range_hours=0, end_time_range_hours=1, **params)


Adaptive prune benchmark Test

In [None]:
from utils.PropertyNames import MethodOptions as Opts
from Benchmark import benchmark


# Benchmark configuration using the `Opts.adaptive` pruning method.
params = dict(
    k=6,
    risky_chars={0, 1},
    risk_threshold=0.2,
    prune=True,
    prune_method=Opts.adaptive,
    prune_threshold=1,
    weight_thresholds=[1, 4, 10],
    value_ranges=[(0, 2), (2, 3), (3, float('inf'))],
    max_steps=6,
    naive_threshold=15,
)

# Show floats with four decimals in any pandas output.
pd.set_option('display.float_format', '{:.4f}'.format)

# All patients take part in this run. To leave some out, remove their ids
# from the list, e.g. excluded.remove('P11') / excluded.remove('P26').
excluded = patients.copy().tolist()

benchmark_input = patient_data[patient_data[Cols.patient].isin(excluded)].copy()
benchmark(benchmark_input, start_time_range_hours=0, end_time_range_hours=1, **params)

Adaptive prune benchmark Test on 2 patients

Plot Probability Distribution

In [None]:
from utils.VisualizationUtils import draw_histogram
from deBruijn.ProbabilityGraph import ProbabilityGraph
from utils.PropertyNames import MethodOptions as Opts
from utils.PropertyNames import ColumnNames as Cols

# `k` is handed to ProbabilityGraph; `params` is forwarded to
# get_probability_model (pruning is switched off for this plot).
k = 4
params = {
    "prune": False,
    "prune_method": Opts.filter,
    "prune_threshold": 3,
    "max_steps": 3,
}

# One sequence of Cols.char values per patient, ordered by Cols.date.
sequences = [
    patient_data[patient_data[Cols.patient] == p]
    .sort_values(Cols.date, ascending=True)[Cols.char]
    for p in patients
]

probability_graph = ProbabilityGraph(sequences=sequences, k=k)

print(f"Resulting graph: {probability_graph}")

probability_model = probability_graph.get_probability_model(**params)

# Histogram over the values stored in the model's probability_dict.
draw_histogram(
    list(probability_model.probability_dict.values()),
    "Node Probability Distribution",
    "Probability",
    "Count",
    bins=20,
)


Plot a timeline of the target (ideal model) for every data point. Alerted data points are marked in red.

In [None]:
from utils.VisualizationUtils import draw_timeline
from utils.PropertyNames import ColumnNames as Cols
from utils.PropertyNames import MethodOptions as Opts

from Benchmark import add_target_column

# NOTE: `naive_threshold` and `params` are assigned here but are not passed to
# anything in this cell; only the target column is computed and plotted.
naive_threshold = 20
params = dict(
    k=4,
    risky_chars=None,
    risk_threshold=0.5,
    prune=True,
    prune_method=Opts.filter,
    prune_threshold=3,
    max_steps=3,
)

# Column whose alerts are drawn on the timeline.
alert_to_plot = Cols.target

patient_data_with_alerts = add_target_column(patient_data)

# One timeline per patient, rows ordered by date.
for p in patients:
    patient_rows = patient_data_with_alerts[patient_data_with_alerts[Cols.patient] == p]
    patient_timeline = patient_rows.sort_values(Cols.date, ascending=True)
    draw_timeline(patient_timeline, p, alert_to_plot, include_already_dangerous=False)

Draw timeline of one of our models

In [None]:
from utils.VisualizationUtils import draw_timeline
from utils.PropertyNames import ColumnNames as Cols
from utils.PropertyNames import MethodOptions as Opts

from Benchmark import add_alerts, add_target_column

naive_threshold = 15

# Model configuration forwarded to add_alerts.
params = dict(
    k=6,
    risky_chars={0, 1},
    risk_threshold=0.2,
    prune=True,
    prune_method=Opts.adaptive,
    prune_threshold=1,
    weight_thresholds=[1, 4, 10],
    value_ranges=[(0, 2), (2, 3), (3, float('inf'))],
    max_steps=6,
)

# NOTE: assigned but not consulted below; the plotted columns are listed
# explicitly in the drawing loop.
alert_to_plot = Cols.combined_alert_and

# Target column first, then the alert columns on top of it.
patient_data_with_alerts = add_alerts(add_target_column(patient_data), naive_threshold, **params)

# Rows of patient P20 that are not dangerous, where the target is set but the
# combined AND alert is not.
missed_rows_mask = (
    patient_data_with_alerts[Cols.patient].eq('P20')
    & patient_data_with_alerts[Cols.isDangerous].eq(False)
    & patient_data_with_alerts[Cols.target].eq(True)
    & patient_data_with_alerts[Cols.combined_alert_and].eq(False)
)
print(patient_data_with_alerts[missed_rows_mask])

excluded = patients.copy().tolist()

# For each selected patient draw the probability alert timeline, then the
# combined AND alert timeline.
for p in ['P21']:
    for alert_column in (Cols.prob_alert, Cols.combined_alert_and):
        patient_timeline = patient_data_with_alerts[
            patient_data_with_alerts[Cols.patient] == p
        ].sort_values(Cols.date, ascending=True)
        draw_timeline(patient_timeline, p, alert_column, include_already_dangerous=False)


In [None]:
from utils.PropertyNames import ColumnNames as Cols
from utils.PropertyNames import MethodOptions as Opts

from Benchmark import add_alerts, add_target_column

naive_threshold = 15

# Model configuration forwarded to add_alerts.
params = dict(
    k=6,
    risky_chars={0, 1},
    risk_threshold=0.2,
    prune=True,
    prune_method=Opts.adaptive,
    prune_threshold=1,
    weight_thresholds=[1, 4, 10],
    value_ranges=[(0, 2), (2, 3), (3, float('inf'))],
    max_steps=6,
)

# Target column first, then the alert columns on top of it.
patient_data_with_alerts = add_alerts(add_target_column(patient_data), naive_threshold, **params)

# Patients considered by the evaluation cells that follow. To leave some out,
# remove their ids here, e.g. excluded.remove('P17') / excluded.remove('P26').
excluded = patients.copy().tolist()


In [None]:
from Benchmark import calculate_metrics

# Show floats with four decimals in the tables below.
pd.set_option('display.float_format', '{:.4f}'.format)

# One metrics record per patient for the combined AND alert.
metrics = []

for p in patients:
    data_patient = patient_data_with_alerts[patient_data_with_alerts[Cols.patient] == p].copy()
    metric = calculate_metrics(data_patient, Cols.combined_alert_and)
    metric['Patient'] = p

    # The confusion matrix is shown on its own; it and the plain accuracy are
    # left out of the summary table.
    confusion_matrix = metric['Confusion Matrix']
    del metric['Accuracy'], metric['Confusion Matrix']
    print(p)
    display(confusion_matrix)

    # Percentage of this patient's rows whose 'Value' is below 70.
    rows_under_70 = len(data_patient[data_patient['Value'] < 70])
    metric['% Hypo'] = (rows_under_70 / len(data_patient)) * 100
    metrics.append(metric)

# Summary table, best balanced accuracy first.
df = pd.DataFrame(metrics)
df_sorted = df.sort_values(by='Balanced Accuracy', ascending=False)
display(df_sorted)

In [None]:
import numpy as np

# Count the crossings below 70 (value < 70 while the previous row was >= 70)
# whose previous row had the target set, split by whether the combined AND
# alert was also set on that previous row.
#
# NOTE(review): the alert columns are read from `patient_data`, which assumes
# the earlier add_target_column / add_alerts calls added them to that frame.
# Confirm, or switch to `patient_data_with_alerts`.
no_warning = 0
gave_warning = 0

alert_columns = [Cols.target, Cols.naive_alert, Cols.prob_alert, Cols.combined_alert_and, Cols.combined_alert_or]

for p in excluded:
    # Order by date so that shift(1) refers to the preceding measurement
    # (the forecast-time cell sorts the same way).
    df = patient_data[patient_data[Cols.patient] == p].sort_values(Cols.date, ascending=True)
    df = df.dropna(subset=alert_columns)

    crossed_70 = (df[Cols.value] < 70) & (df[Cols.value].shift(1) >= 70)
    # State of the alert / target on the row just before each row.
    alert_true = df[Cols.combined_alert_and].shift(1).astype(bool)
    target_true = df[Cols.target].shift(1).astype(bool)

    gave_warning += np.sum(crossed_70 & alert_true & target_true)
    no_warning += np.sum(crossed_70 & ~alert_true & target_true)


print('gave_warning:', gave_warning)
print('no_warning:', no_warning)

# Share of counted crossings that were preceded by an alert; NaN when no
# crossing was counted, instead of dividing by zero.
total_crossings = gave_warning + no_warning
print(gave_warning / total_crossings if total_crossings else float('nan'))


In [None]:
from utils.VisualizationUtils import draw_histogram

# For every crossing below 70, walk backwards through the preceding rows and
# record how long before the crossing the combined AND alert was last off.
#
# NOTE(review): the alert columns are read from `patient_data`, which assumes
# the earlier add_target_column / add_alerts calls added them to that frame.
# Confirm, or switch to `patient_data_with_alerts`.
time_diff_list = []

alert_columns = [Cols.target, Cols.naive_alert, Cols.prob_alert, Cols.combined_alert_and, Cols.combined_alert_or]

for p in excluded:
    df = (
        patient_data[patient_data[Cols.patient] == p]
        .copy()
        .sort_values(Cols.date, ascending=True)
        .reset_index(drop=True)
    )
    # Re-number after dropna so positions i and j below are contiguous labels.
    df = df.dropna(subset=alert_columns).reset_index(drop=True)
    crossed_70 = (df[Cols.value] < 70) & (df[Cols.value].shift(1) >= 70)

    for i in range(1, len(df)):
        if not crossed_70[i]:
            continue
        start_time = df.loc[i, Cols.date]
        # Scan backwards, starting two rows before the crossing.
        for j in range(i - 2, -1, -1):
            # An earlier value below 70 ends the search without a result.
            if df.loc[j, Cols.value] < 70:
                break
            # Stop at the first row (going backwards) where the alert is off.
            # The row inspected must be `j`; a fixed row would make this test
            # independent of the scan.
            # NOTE(review): the timestamp of row j (alert off) is used as the
            # reference; if the first row WITH an alert is wanted, that would
            # be row j + 1. Confirm the intended definition.
            if not bool(df.loc[j, Cols.combined_alert_and]):
                end_time = df.loc[j, Cols.date]
                time_diff_list.append(start_time - end_time)
                break

# Timedeltas -> seconds -> minutes, keeping only lead times of up to one hour.
time_diff_seconds = [td.total_seconds() for td in time_diff_list]
time_diff_minutes = [seconds / 60 for seconds in time_diff_seconds]
time_diff_minutes = [minutes for minutes in time_diff_minutes if minutes <= 60]

print(time_diff_minutes)
draw_histogram(time_diff_minutes, 'Forecast Time Distribution of the Model', 'Minutes', 'Count', bins=7
            , color='#0000FF', edgecolor='black')

In [None]:
from utils.PropertyNames import ColumnNames as Cols

# Number of rows per patient. This has to be displayed explicitly: a bare
# expression is only rendered when it is the last statement of the cell.
display(patient_data[Cols.patient].value_counts())

# Total number of rows in the dataset.
print(len(patient_data))

In [None]:
# Percentage of all rows whose 'Value' is below 70.
under_70_mask = patient_data['Value'] < 70
values_under_70 = patient_data.loc[under_70_mask]
percentage_under_70 = (len(values_under_70) / len(patient_data)) * 100
print(percentage_under_70)