Load and parse the Dataset (Run this first!)

Update the path in the cell below so that it points to the directory containing all of the xlsx files.

In [None]:
import pandas as pd

from ParseData import parse_dataset
from utils.PropertyNames import ColumnNames as Cols

# Directory handed to parse_dataset; adjust it to wherever the dataset lives locally.
dataset_dir = "/home/meocakir/Documents/Datasets/Diabetes"
patient_data = parse_dataset(dataset_dir, silent=False)

# Distinct patient identifiers, reused by the cells further down.
patients = patient_data[Cols.patient].unique()

# Number of rows in the parsed dataset (rendered as the cell output).
len(patient_data)

Filter prune benchmark Test

In [None]:
from utils.PropertyNames import MethodOptions as Opts
from Benchmark import benchmark

# Keyword arguments forwarded to benchmark() for the filter-pruning run.
params = dict(
    k=5,
    risky_chars={0, 1},
    risk_threshold=0.2,
    prune=True,
    prune_method=Opts.filter,
    prune_threshold=1,
    weight_thresholds=[1, 3, 8],
    value_ranges=[(0, 2), (2, 3), (3, float('inf'))],
    max_steps=6,
    naive_threshold=15,
)

# NOTE: despite its name, `excluded` holds the patients that are KEPT (it feeds
# the isin() mask below); P17 and P26 are the ones taken out of it.
excluded = patients.copy().tolist()
excluded.remove('P17')
excluded.remove('P26')

# Row mask selecting only the patients still present in `excluded`.
kept_rows_mask = patient_data[Cols.patient].isin(excluded)

benchmark(patient_data[kept_rows_mask].copy(), start_time_range_hours=0, end_time_range_hours=1, **params)


Adaptive prune benchmark Test

In [None]:
from utils.PropertyNames import MethodOptions as Opts
from Benchmark import benchmark

# Keyword arguments forwarded to benchmark() for the adaptive-pruning run.
params = {
    "k": 6,
    "risky_chars": {0, 1},
    "risk_threshold": 0.2,
    # NOTE(review): pruning is switched off here although an adaptive prune
    # method is configured and the section is titled "Adaptive prune" - confirm.
    "prune": False,
    "prune_method": Opts.adaptive,
    "prune_threshold": 1,
    "weight_thresholds": [1, 4, 10],
    "value_ranges": [(0, 2), (2, 3), (3, float('inf'))],
    "max_steps": 6,
    "naive_threshold": 15
}
# NOTE: despite its name, `excluded` is the list of patients that are KEPT by
# the isin() mask below; with both removals disabled it contains every patient.
excluded = patients.copy().tolist()

#excluded.remove('P11')
#excluded.remove('P26')
# Global pandas display option: floats are rendered with four decimals from here on.
pd.set_option('display.float_format', '{:.4f}'.format)

# Last expression of the cell, so whatever benchmark() returns is rendered as output.
benchmark(patient_data[patient_data[Cols.patient].isin(excluded)].copy(), start_time_range_hours=0,
          end_time_range_hours=1, **params)

Adaptive prune benchmark Test on 2 patients

Plot Probability Distribution

In [None]:
from utils.VisualizationUtils import draw_histogram
from deBruijn.ProbabilityGraph import ProbabilityGraph
from utils.PropertyNames import MethodOptions as Opts
from utils.PropertyNames import ColumnNames as Cols

k = 4
# Plain assignment: the previous `risky_chars: None` was a bare annotation,
# which is evaluated but never binds the name.
# NOTE(review): the value is not passed to anything in this cell.
risky_chars = None
# Keyword arguments forwarded to ProbabilityGraph.get_probability_model().
params = {
    "prune": False,
    "prune_method": Opts.filter,
    "prune_threshold": 3,
    "max_steps": 3,
}

# One Cols.char sequence per patient, sorted by Cols.date.
sequences = []
for p in patients:
    float_seq = patient_data[patient_data[Cols.patient] == p]
    float_seq = float_seq.sort_values(Cols.date, ascending=True)[Cols.char]
    sequences.append(float_seq)

probability_graph = ProbabilityGraph(sequences=sequences, k=k)

print(f"Resulting graph: {probability_graph}")

probability_model = probability_graph.get_probability_model(**params)

# Histogram over the values stored in the model's probability_dict.
draw_histogram(list(probability_model.probability_dict.values()), "Node Probability Distribution", "Probability",
               "Count", bins=20)


Plot the timeline of the target (ideal model) for every data point. Alerted data points are marked in red.

In [None]:
from utils.VisualizationUtils import draw_timeline
from utils.PropertyNames import ColumnNames as Cols
from utils.PropertyNames import MethodOptions as Opts

from Benchmark import add_target_column

# NOTE: neither `naive_threshold` nor `params` is read by this cell; only the
# target column is plotted.
naive_threshold = 20
params = dict(
    k=4,
    risky_chars=None,
    risk_threshold=0.5,
    prune=True,
    prune_method=Opts.filter,
    prune_threshold=3,
    max_steps=3,
)

# Pick an alert model here
alert_to_plot = Cols.target

patient_data_with_alerts = add_target_column(patient_data)

for p in ['P1']:
    # Rows of this patient only, sorted by Cols.date before plotting.
    patient_rows = patient_data_with_alerts[Cols.patient] == p
    patient_timeline = patient_data_with_alerts[patient_rows].sort_values(Cols.date, ascending=True)
    draw_timeline(patient_timeline, p, alert_to_plot, include_already_dangerous=False)

Draw timeline of one of our models

In [None]:
from utils.VisualizationUtils import draw_timeline
from utils.PropertyNames import ColumnNames as Cols
from utils.PropertyNames import MethodOptions as Opts

from Benchmark import add_alerts, add_target_column

# Second positional argument of add_alerts() below.
naive_threshold = 15

# Keyword arguments forwarded to add_alerts().
params = {
    "k": 6,
    "risky_chars": {0, 1},
    "risk_threshold": 0.2,
    "prune": True,
    "prune_method": Opts.adaptive,
    "prune_threshold": 1,
    "weight_thresholds": [1, 4, 10],
    "value_ranges": [(0, 2), (2, 3), (3, float('inf'))],
    "max_steps": 6,
}

# Pick an alert model here
# NOTE(review): `alert_to_plot` is assigned but not read in this cell; both
# draw_timeline() calls below name their column explicitly.
alert_to_plot = Cols.combined_alert_and

# Target column first, then the alert columns produced by add_alerts().
patient_data_with_alerts = add_target_column(patient_data)
patient_data_with_alerts = add_alerts(patient_data_with_alerts, naive_threshold, **params)

# NOTE(review): `excluded` is assigned but not read in this cell.
excluded = patients.copy().tolist()

for p in ['P1']:
    # Two timelines per patient, each from a freshly selected and date-sorted
    # frame: one for Cols.prob_alert, one for Cols.combined_alert_and.
    draw_timeline(
        patient_data_with_alerts[patient_data_with_alerts[Cols.patient] == p].sort_values(Cols.date, ascending=True), p,
        Cols.prob_alert, include_already_dangerous=False)
    draw_timeline(
        patient_data_with_alerts[patient_data_with_alerts[Cols.patient] == p].sort_values(Cols.date, ascending=True), p,
        Cols.combined_alert_and, include_already_dangerous=False)


In [None]:
from utils.PropertyNames import ColumnNames as Cols
from utils.PropertyNames import MethodOptions as Opts

from Benchmark import add_alerts, add_target_column

# Second positional argument of add_alerts() below.
naive_threshold = 15

# Keyword arguments forwarded to add_alerts().
params = dict(
    k=6,
    risky_chars={0, 1},
    risk_threshold=0.2,
    prune=True,
    prune_method=Opts.adaptive,
    prune_threshold=1,
    weight_thresholds=[1, 4, 10],
    value_ranges=[(0, 2), (2, 3), (3, float('inf'))],
    max_steps=6,
)

# Target column first, then the alert columns; later cells read this frame.
patient_data_with_alerts = add_alerts(add_target_column(patient_data), naive_threshold, **params)

# excluded.remove('P17')
# excluded.remove('P26')


In [None]:
from Benchmark import calculate_metrics

# One metrics record per patient, collected into a table at the end.
metrics = []

pd.set_option('display.float_format', '{:.4f}'.format)

for p in patients:
    patient_mask = patient_data_with_alerts[Cols.patient] == p
    data_patient = patient_data_with_alerts[patient_mask].copy()

    metric = calculate_metrics(data_patient, Cols.combined_alert_and)
    metric['Patient'] = p
    metric.pop('Accuracy')

    # The confusion matrix is shown per patient and kept out of the summary table.
    print(p)
    display(metric.pop('Confusion Matrix'))

    # Share of this patient's readings that are below 70.
    values_under_70 = data_patient[data_patient['Value'] < 70]
    percentage_under_70 = (len(values_under_70) / len(data_patient)) * 100
    metric['% Hypo'] = percentage_under_70

    metrics.append(metric)

# Summary table, best balanced accuracy first.
df = pd.DataFrame(metrics)
df_sorted = df.sort_values(by='Balanced Accuracy', ascending=False)
display(df_sorted)

In [None]:
import numpy as np

# Counters over all patients: crossings below 70 whose preceding reading had
# target set, split by whether the combined (AND) alert was also set there.
no_warning = 0
gave_warning = 0

# Rows missing any of these columns are ignored.
alert_columns = [Cols.target, Cols.naive_alert, Cols.prob_alert, Cols.combined_alert_and, Cols.combined_alert_or]

for p in patients:
    # NOTE(review): this reads `patient_data`, not `patient_data_with_alerts`;
    # that only works if add_target_column()/add_alerts() add their columns to
    # the original frame in place - confirm.
    # Sorting by date is required: shift(1) only means "previous reading" when
    # the rows are in chronological order.
    df = (
        patient_data[patient_data[Cols.patient] == p]
        .sort_values(Cols.date, ascending=True)
        .dropna(subset=alert_columns)
    )

    # True where this reading is below 70 and the previous one was not.
    crossed_70 = (df[Cols.value] < 70) & (df[Cols.value].shift(1) >= 70)
    # Alert / target state of the previous reading. The first row has no
    # predecessor, but crossed_70 is False there, so it never contributes.
    alert_true = df[Cols.combined_alert_and].shift(1).astype(bool)
    target_true = df[Cols.target].shift(1).astype(bool)

    gave_warning += int((crossed_70 & alert_true & target_true).sum())
    no_warning += int((crossed_70 & ~alert_true & target_true).sum())

print('gave_warning:', gave_warning)
print('no_warning:', no_warning)

# Fraction of qualifying crossings that were preceded by a warning; NaN when
# there were none, instead of dividing by zero.
total_crossings = gave_warning + no_warning
print(gave_warning / total_crossings if total_crossings else float('nan'))


In [None]:
from utils.VisualizationUtils import draw_histogram

# Time between each crossing below 70 and the preceding reading at which the
# combined (AND) alert and the target were both set.
time_diff_list = []

# Number of recorded lead times per patient.
pdict = dict()

# Rows missing any of these columns are ignored.
alert_columns = [Cols.target, Cols.naive_alert, Cols.prob_alert, Cols.combined_alert_and, Cols.combined_alert_or]

for p in patients:
    print(p)
    df = patient_data[patient_data[Cols.patient] == p].copy().sort_values(Cols.date, ascending=True).reset_index(
        drop=True)
    # Reset the index again after dropna so that positions i / j below are contiguous.
    df = df.dropna(subset=alert_columns).reset_index(drop=True)
    crossed_70 = (df[Cols.value] < 70) & (df[Cols.value].shift(1) >= 70)

    for i in range(1, len(df)):
        # Only readings where the value crosses below 70 are of interest.
        if not crossed_70[i]:
            continue
        start_time = df.loc[i, Cols.date]
        # Walk backwards from the crossing and stop at the first (i.e. closest)
        # reading where both alert and target are set.
        # NOTE(review): the scan starts at i - 2, so the reading directly
        # before the crossing is never considered - confirm this is intended.
        for j in range(i - 2, -1, -1):
            # A previous reading below 70 ends the search for this crossing.
            if df.loc[j, Cols.value] < 70:
                break
            # Fixed: the row inspected is j (it used to be the constant row 1),
            # and both flags must be set, as the counting logic requires.
            if bool(df.loc[j, Cols.combined_alert_and]) and bool(df.loc[j, Cols.target]):
                end_time = df.loc[j, Cols.date]
                time_diff_list.append(start_time - end_time)
                pdict[p] = pdict.get(p, 0) + 1
                break  # Found the required point, no need to check further

# Convert the timedeltas to minutes and keep only lead times of at most one hour.
time_diff_seconds = [td.total_seconds() for td in time_diff_list]
time_diff_minutes = [td / 60 for td in time_diff_seconds]
time_diff_minutes = [x for x in time_diff_minutes if x <= 60]

print(len(pdict), pdict)
print(time_diff_minutes)
# Disabled plot, kept as a comment so the cell no longer echoes it as a string:
# draw_histogram(time_diff_minutes, 'Forecast Time Distribution of the Model', 'Minutes', 'Count', bins=7
#                , color='#0000FF', edgecolor='black')

In [None]:
from utils.PropertyNames import ColumnNames as Cols

# Rows per patient. A bare expression is only rendered when it is the last
# statement of a cell, so without display() these counts were silently dropped.
display(patient_data[Cols.patient].value_counts())

# Total number of rows in the dataset.
print(len(patient_data))

In [None]:
# Share of all readings, across every patient, whose 'Value' is below 70.
is_under_70 = patient_data['Value'] < 70
values_under_70 = patient_data[is_under_70]
percentage_under_70 = (len(values_under_70) / len(patient_data)) * 100
print(percentage_under_70)

Spectral Clustering

In [None]:
from utils.PropertyNames import ColumnNames as Cols
from utils.PropertyNames import MethodOptions as Opts
from deBruijn.ProbabilityGraph import ProbabilityGraph
from sklearn.cluster import SpectralClustering
import networkx as nx
import csv

# Only params['k'] is read in this cell; the other entries mirror the benchmark setup.
params = {
    "k": 5,
    "risky_chars": {0, 1},
    "risk_threshold": 0.2,
    "prune": True,
    "prune_method": Opts.adaptive,
    "prune_threshold": 1,
    "weight_thresholds": [1, 4, 10],
    "value_ranges": [(0, 2), (2, 3), (3, float('inf'))],
    "max_steps": 6,
    "naive_threshold": 15
}

# NOTE(review): not used in this cell; both exports below write to hard-coded paths.
csv_filename = "edge_properties.csv"

# A gap of this size or more between readings ends the current sequence.
max_gap = pd.Timedelta(minutes=20)

for p in ['P21']:
    patient_df = patient_data[patient_data[Cols.patient] == p].sort_values(Cols.date, ascending=True)

    # Split the patient's readings into gap-free runs of characters.
    # Assumes Cols.date_gap is the time since the previous reading (missing on
    # the first row) - TODO confirm against ParseData.
    patient_sequences = []
    sequence = []
    for _, row in patient_df.iterrows():
        date_gap = row[Cols.date_gap]
        seq_char = row[Cols.char]
        if pd.isna(date_gap) or (date_gap < max_gap):
            sequence.append(seq_char)
            continue
        # Gap reached: keep the finished run if it is longer than k, then start
        # a new run with the current reading. Previously that reading was
        # dropped and a short run simply continued across the gap.
        if len(sequence) > params['k']:
            patient_sequences.append(sequence)
        sequence = [seq_char]
    # The last run is not followed by a gap, so it has to be flushed here.
    if len(sequence) > params['k']:
        patient_sequences.append(sequence)

    probability_graph = ProbabilityGraph(k=params['k'], sequences=patient_sequences)
    graph_copy = probability_graph.graph.copy()
    print(graph_copy)

    # Two-way spectral clustering on the adjacency matrix of the undirected graph.
    # NOTE(review): no random_state is passed, so labels may vary between runs.
    adj_mat = nx.to_numpy_array(graph_copy.to_undirected())
    sc = SpectralClustering(2, affinity='precomputed', n_init=100)
    sc.fit(adj_mat)
    print('spectral clustering')
    print(len(sc.labels_), sc.labels_)

    # Assign cluster labels to nodes as properties in the original directed graph
    nodelist = list(graph_copy.nodes())  # Ensure the node list is in the same order as used in adjacency matrix
    for i, label in enumerate(sc.labels_):
        graph_copy.nodes[nodelist[i]]['cluster'] = label

    # Convert edge data to a Pandas DataFrame; column names come from the
    # attribute keys of the first edge, so the graph must have at least one edge.
    edge_df = pd.DataFrame([(*e[:2], *e[2].values()) for e in graph_copy.edges(data=True)],
                           columns=['Source', 'Target'] + list(list(graph_copy.edges(data=True))[0][2].keys()))

    edge_df['weight'] = edge_df['weight'].astype(int)
    # Export edge DataFrame to CSV
    edge_df.to_csv('/home/lumpus/Documents/Classes/Indiana/deBruijn/Clustering/network_edges.csv', index=False)

    # Create a list of dictionaries for node attributes
    node_list = []
    for node, attrs in graph_copy.nodes(data=True):
        attr_dict = {'Id': node}
        attr_dict.update(attrs)
        node_list.append(attr_dict)

    # Convert list of dictionaries to DataFrame
    node_df = pd.DataFrame(node_list)

    # Export node attributes to CSV
    node_df.to_csv('/home/lumpus/Documents/Classes/Indiana/deBruijn/Clustering/network_nodes.csv', index=False)



In [None]:
import matplotlib.pyplot as plt
from collections import Counter

def draw_timeline(event_times, threshold=15):
    """Plot warning times on a mirrored timeline that ends at the event (t=0).

    Every distinct value in ``event_times`` is drawn at ``-value`` on the x
    axis. A value occurring more than ``threshold`` times is collapsed into a
    single green 'X' annotated with its count; otherwise one blue 'x' is drawn
    per occurrence, stacked vertically around y=0.

    Args:
        event_times: iterable of hashable, sortable, negatable time values.
        threshold: occurrence count above which a value is shown as one marker.
    """
    row_gap = 0.2  # vertical distance between markers sharing a time value
    occurrences_by_time = sorted(Counter(event_times).items())

    plt.figure(figsize=(15, 4))
    # Dashed reference line marking the event itself.
    plt.axvline(x=0, color='red', linestyle='--', label='Hypoglycemia Event (t=0)')

    # Only the very first 'x' marker carries a label, so the legend lists it once.
    legend_entry_pending = True

    for minutes_before, occurrences in occurrences_by_time:
        x_pos = -minutes_before  # mirror so that earlier warnings sit further left

        if occurrences > threshold:
            # Too many to stack: one marker plus the count as text.
            plt.scatter(x_pos, 0, marker='X', s=100, color='green')
            plt.annotate(f'{occurrences}\nWarnings', (x_pos, 0), textcoords="offset points", xytext=(0,10), ha='center')
            continue

        # Start low enough that the stack ends up centred on y=0.
        y_pos = -(occurrences - 1) * row_gap / 2
        for _ in range(occurrences):
            marker_label = 'Warning Instance' if legend_entry_pending else ""
            plt.scatter(x_pos, y_pos, marker='x', s=50, color='blue', label=marker_label)
            legend_entry_pending = False
            y_pos += row_gap

    plt.xlabel('Time (minutes)')
    plt.title('Hypoglycemia Forecast Timeline Compilation')
    plt.yticks([])  # Hide y-axis
    plt.xticks(range(-35, 1, 1))  # Set x-ticks
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()

# Exercise draw_timeline() with a fixed list of sample values.
# NOTE(review): presumably pasted from the printed lead times (in minutes) of
# the forecast-time cell - verify before relying on the figure.
event_times = [30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 21.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 15.0, 30.0, 30.0, 28.0, 30.0, 30.0, 30.0, 30.0, 30.0, 31.0, 30.0, 30.0, 26.0, 30.0, 14.0, 24.0, 30.0, 5.0, 32.0, 30.0, 24.0, 10.0, 23.0, 30.0, 15.0, 18.0, 11.0, 30.0, 28.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 16.0, 30.0, 21.0, 30.0, 30.0, 28.0, 16.0, 30.0, 30.0, 30.0, 30.0, 30.0, 22.0, 20.0, 30.0, 23.0, 30.0, 30.0, 16.0, 27.0, 20.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 23.0, 30.0, 30.0, 30.0, 31.0, 30.0, 30.0, 30.0, 30.0, 32.0, 30.0, 15.0, 30.0, 30.0, 30.0, 20.0, 30.0, 30.0, 30.0, 31.0, 23.0, 30.0, 30.0, 30.0, 30.0, 30.0, 15.0, 32.0, 31.0, 30.0, 30.0, 14.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 28.0, 30.0, 21.0, 22.0, 22.0, 15.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 11.0, 7.0, 17.0, 6.0, 15.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 21.0, 30.0, 30.0, 30.0, 31.0, 30.0, 30.0, 30.0, 20.0, 30.0, 31.0, 30.0, 30.0, 30.0, 30.0, 31.0, 32.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 31.0, 30.0, 30.0, 30.0, 30.0, 19.0, 30.0, 30.0, 28.0, 30.0, 30.0, 30.0, 30.0, 30.0, 15.0, 30.0, 15.0, 15.0, 30.0, 30.0, 29.0, 15.0, 7.0, 31.0, 30.0, 30.0, 30.0, 28.0, 33.0, 30.0, 30.0, 29.0, 29.0, 30.0, 26.0, 30.0, 28.0, 30.0, 28.0, 30.0, 31.0, 30.0, 23.0, 30.0, 28.0, 22.0, 18.0, 30.0, 30.0, 30.0, 30.0, 30.0, 29.0, 30.0, 30.0, 30.0, 30.0, 31.0, 30.0, 30.0, 25.0, 31.0, 30.0, 30.0, 30.0]
# Disabled: would add ten extra samples at t=0.
# event_times.extend([0]*10)
# Echo the raw samples, then render them with the default threshold of 15.
print(event_times)
draw_timeline(event_times)


In [None]:
import time
import pickle

# Load the pickled REPLACE_BG object and report its length.
# NOTE: unpickling can execute arbitrary code, so only open trusted files here.
with open('Data/REPLACE_BG.dat', 'rb') as pickle_file:
    test = pickle.load(pickle_file)
print(len(test))


In [None]:
import pickle
import pandas as pd
from utils.PropertyNames import ColumnNames as Cols

# Load the pickled REPLACE_BG object; the code below indexes and groups it by
# column, i.e. uses it like a pandas DataFrame.
# NOTE: unpickling can execute arbitrary code, so only open trusted files here.
with open('Data/REPLACE_BG.dat', 'rb') as f:
    replaceBg = pickle.load(f)
print(len(replaceBg))

# NOTE(review): this rebinds the notebook-wide `patients` set by the loading
# cell; re-running earlier cells afterwards would iterate over REPLACE_BG
# patients instead - confirm this is intended.
patients = replaceBg[Cols.patient].unique()

# Per patient: sum of Cols.isDangerous, and the number of rows.
result = replaceBg.groupby(Cols.patient)[Cols.isDangerous].sum().reset_index()
total_count = replaceBg.groupby(Cols.patient).size().reset_index(name='Datapoints')

# Join both aggregates on the patient column into one table.
final_result = pd.merge(result, total_count, on=Cols.patient)
print(final_result)
