In [None]:
%load_ext autoreload
%autoreload 2
import warnings
# Silence every warning for the rest of the session: no category, module or
# message filter is passed, so the 'ignore' action applies to all of them.
# NOTE(review): this also hides pandas / matplotlib warnings raised by the
# cells below — consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [None]:
import os
import sys
import pandas as pd

In [None]:
# Make the parent directory importable so the project-local `vadbp` package
# (imported in the next cell) can be resolved. The membership guard keeps the
# cell idempotent when it is re-run.
module_path = os.path.abspath('..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)

In [None]:
from vadbp.variant_comparator import VariantComparator
from vadbp.visual_variant_comparator import VisualVariantComparator

In [None]:
def clean_df(df):
    """Drop the CSV index artefact and convert the frame to event-log format.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw log as read from CSV. It is not modified; a new frame is built.

    Returns
    -------
    Whatever ``VariantComparator.format_df`` returns for the cleaned frame,
    called with ``hadm_id`` as case id, ``department`` as activity key and
    ``intime`` as timestamp key.
    """
    # "Unnamed: 0" is the column pandas creates for an unnamed index column in
    # the CSV. errors="ignore" keeps this working for files saved without that
    # column, and the non-inplace drop leaves the caller's frame untouched.
    without_index_column = df.drop(columns=["Unnamed: 0"], errors="ignore")
    return VariantComparator.format_df(
        without_index_column,
        case_id='hadm_id',
        activity_key='department',
        timestamp_key='intime',
    )

# Data Loading

In [None]:
# Read the kidney event log (path is relative to the notebook's working
# directory) and pass it straight through clean_df.
kidney_data = clean_df(pd.read_csv('Kidney_Log.csv'))


In [None]:
# Departments kept for the analysis; events from any other department are dropped.
department_list = [
    'Emergency Department',
    'Medical ICU',
    'Surgical ICU',
    'Post-ICU Medicine',
    'Discharged',
]
kidney_data = kidney_data.loc[kidney_data['department'].isin(department_list)]

# Case ids that have at least one event with age > 60 / age <= 60.
# Rows with a missing age match neither comparison.
subject_over_60_kidney = kidney_data.loc[kidney_data['age'] > 60, 'case:concept:name']
subject_under_60_kidney = kidney_data.loc[kidney_data['age'] <= 60, 'case:concept:name']

# Keep the complete traces of the selected cases, not only the matching rows.
kidney_over_60 = kidney_data.loc[kidney_data['case:concept:name'].isin(subject_over_60_kidney)]
kidney_under_60 = kidney_data.loc[kidney_data['case:concept:name'].isin(subject_under_60_kidney)]

# Visualization

## Kidney Data Age Comparator

In [None]:
# Compare the two age cohorts; the full (department-filtered) log is passed as
# the third argument and the last two arguments are the cohort display names.
varcop_kidney = VariantComparator(
    kidney_over_60,
    kidney_under_60,
    kidney_data,
    'Over 60',
    'Under 60',
)
# NOTE(review): prepare() presumably computes what the visual comparator needs
# — confirm against vadbp.variant_comparator.
varcop_kidney.prepare()

In [None]:
# Put the Graphviz binaries on PATH for this kernel session.
# NOTE(review): the directory below looks like a placeholder — point it at the
# real Graphviz "bin" folder on the machine running the notebook.
GRAPHVIZ_BIN = 'path/Graphviz-12.1.1-win64/bin'

# Only append when missing so that re-running the cell does not keep growing
# PATH; .get() also covers an environment where PATH is not set at all.
if GRAPHVIZ_BIN not in os.environ.get("PATH", "").split(os.pathsep):
    os.environ["PATH"] = os.environ.get("PATH", "") + os.pathsep + GRAPHVIZ_BIN

In [None]:
# Wrap the comparator built above in its visual counterpart and display it.
# NOTE(review): rendering presumably relies on Graphviz being reachable via
# PATH (see the previous cell) — confirm against vadbp.visual_variant_comparator.
visual_comp_kidney = VisualVariantComparator(varcop_kidney)
visual_comp_kidney.show()

In [None]:
from scipy.stats import mannwhitneyu
import pingouin as pg
import numpy as np
from matplotlib.ticker import PercentFormatter
from matplotlib import pyplot as plt, image as mpimg

In [None]:
# Hex colours shared by the plotting helpers in this cell: old_color is used
# for the series labelled 'over 60', young_color for the one labelled 'under 60'.
old_color = '#1f77b4'
young_color = '#ff7f0e'

def prepare_logs_node(data, attribute, department_attr, continuous=True):
    """Reduce one attribute to per-case values for a single department.

    Only events whose ``concept:name`` equals ``department_attr`` are kept. A
    case can visit the same department several times (loops), so its events
    are collapsed per ``(case:concept:name, concept:name)`` pair.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with ``case:concept:name``, ``concept:name`` and
        ``attribute`` columns.
    attribute : str
        Column to summarise.
    department_attr : str
        Department (activity name) to restrict the log to.
    continuous : bool, default True
        ``True`` -> mean of the attribute per case;
        ``False`` -> mode(s) of the attribute per case.

    Returns
    -------
    pandas.Series
        Named ``attribute``. Continuous: one mean per case, indexed by the two
        grouping keys (NaN when a case has no value). Categorical: the modes of
        every case in group order with a fresh 0..n-1 index; a case contributes
        one entry per tied mode and none when all of its values are missing.
    """
    group_keys = ["case:concept:name", "concept:name"]
    visits = data[data['concept:name'] == department_attr]

    # Select the attribute *before* aggregating. Aggregating the whole frame
    # makes mean() raise on unrelated non-numeric columns in pandas >= 2, and
    # makes mode() pad this attribute with NaN rows whenever some other column
    # has more tied modes.
    per_case = visits.groupby(group_keys)[attribute]

    if continuous:
        return per_case.mean()

    modes = [value for _, values in per_case for value in values.mode()]
    # An explicit dtype for the empty case avoids dtype-inference warnings.
    return pd.Series(modes, name=attribute, dtype=None if modes else object)

def do_statistics_node(young, old, attribute, department_attr, continuous=True, show_fig=False):
    """Test whether an attribute differs between the cohorts within one department.

    Both logs are reduced with ``prepare_logs_node`` and the number of
    non-missing values per cohort is printed.

    * ``continuous=True``: returns the result of ``pg.mwu`` on the two samples;
      with ``show_fig`` the histogram from ``make_figure`` is drawn first.
    * ``continuous=False``: builds an ``Age`` x ``attribute`` table, runs
      ``pg.chi2_independence`` and returns only the rows whose ``test`` is
      ``'pearson'``; with ``show_fig`` the bar chart from ``make_figure_cat``
      is drawn afterwards.
    """
    young_values = prepare_logs_node(young, attribute, department_attr, continuous=continuous)
    old_values = prepare_logs_node(old, attribute, department_attr, continuous=continuous)
    print('young count:', len(young_values.dropna()))
    print('old count', len(old_values.dropna()))

    if continuous:
        if show_fig:
            make_figure(young_values, old_values, attribute)
        return pg.mwu(young_values, old_values)

    # Categorical path: one long frame with a cohort tag per observation.
    young_labels = young_values.tolist()
    old_labels = old_values.tolist()
    tagged_frames = [
        pd.DataFrame(data={attribute: pd.Series(labels)}).assign(Age=tag)
        for labels, tag in ((young_labels, '<60'), (old_labels, '>60'))
    ]
    df_combined = pd.concat(tagged_frames)

    _expected, _observed, test_table = pg.chi2_independence(df_combined, x='Age', y=attribute)
    pearson_stats = test_table[test_table['test'] == 'pearson']
    if show_fig:
        make_figure_cat(young_labels, old_labels, attribute)
    return pearson_stats

def make_figure(young, old, attribute):
    """Overlay step histograms of a continuous attribute for both cohorts.

    Parameters
    ----------
    young, old : array-like of float
        Observations of the under-60 / over-60 cohort. Missing values (NaN)
        are ignored.
    attribute : str
        Attribute name, used for the x-axis label and the title.

    Returns
    -------
    matplotlib.figure.Figure
        The created figure (``show()`` is also called on it, as before).
    """
    def _finite(values):
        # NaNs carry no information for the histogram and would distort the
        # per-cohort normalisation below.
        as_float = np.asarray(values, dtype=float)
        return as_float[~np.isnan(as_float)]

    young_values = _finite(young)
    old_values = _finite(old)

    # Both cohorts must share the same bin edges, otherwise bar heights of the
    # two outlines refer to different intervals and cannot be compared.
    bin_edges = np.histogram_bin_edges(
        np.concatenate([young_values, old_values]), bins=plt.rcParams['hist.bins']
    )

    fig, ax = plt.subplots(figsize=(15, 5))

    series = (
        (young_values, young_color, 'under 60'),
        (old_values, old_color, 'over 60'),
    )
    for values, color, label in series:
        # The y-axis is a percentage, so every observation contributes 1/N of
        # its own cohort. density=True would additionally divide by the bin
        # width, which is a probability density and not a share of cases.
        weights = np.full(values.shape, 1 / values.size) if values.size else None
        ax.hist(values, bins=bin_edges, weights=weights, alpha=0.5, edgecolor=color,
                histtype='step', label=label, linewidth=3)

    ax.set_xlabel(attribute)
    ax.set_ylabel('%')
    ax.set_title(f'Comparison of {attribute} values between over 60 and under 60')
    ax.legend()
    ax.yaxis.set_major_formatter(PercentFormatter(1))

    fig.show()
    return fig
    
def make_figure_cat(labels_y, labels_o, attribute):
    """Draw a grouped bar chart of category shares for both cohorts.

    Parameters
    ----------
    labels_y, labels_o : sequence
        Categorical observations of the under-60 / over-60 cohort.
    attribute : str
        Attribute name, used for the x-axis label and the title.

    Returns
    -------
    matplotlib.figure.Figure
        Figure with one pair of bars per category; each bar is annotated with
        its share of the cohort.
    """
    cohorts = (
        (labels_y, 'under 60', young_color),
        (labels_o, 'over 60', old_color),
    )

    # Share of every category within its own cohort. The plain count / total
    # is used, so the shares of a cohort add up to 100%.
    shares = []
    for labels, _, _ in cohorts:
        values, counts = np.unique(labels, return_counts=True)
        total = len(labels)
        shares.append({value: count / total
                       for value, count in zip(values.tolist(), counts.tolist())})

    # Union of categories in a deterministic order: sorted when the labels are
    # mutually comparable, otherwise first-seen order.
    seen = list(dict.fromkeys(label for cohort_shares in shares for label in cohort_shares))
    try:
        all_labels = sorted(seen)
    except TypeError:
        all_labels = seen

    fig, ax = plt.subplots(figsize=(15, 5))
    ax.set_title(f'Comparison of {attribute} values between over 60 and under 60')

    ind = np.arange(len(all_labels))
    width = 0.2

    for position, ((_, legend_label, color), cohort_shares) in enumerate(zip(cohorts, shares)):
        # Categories that never occur in this cohort get a zero-height bar.
        heights = [cohort_shares.get(label, 0) for label in all_labels]
        bars = ax.bar(ind + position * width, heights, width, label=legend_label, color=color)
        # Print the exact share on top of each bar.
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width() / 2, height, f'{height:.2%}',
                    ha='center', va='bottom')

    # Centre each tick between the two bars of its category.
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels([str(label) for label in all_labels], rotation=90)
    ax.legend(loc='best')
    ax.set_xlabel(attribute)
    ax.set_ylabel('%')
    ax.yaxis.set_major_formatter(PercentFormatter(1))
    return fig


In [None]:
# Compare the 'heartrate' attribute of Emergency Department events between the
# under-60 cohort (first argument) and the over-60 cohort (second argument).
# continuous is left at its default (True) and the comparison plot is drawn.
do_statistics_node(kidney_under_60, kidney_over_60, 'heartrate', 'Emergency Department', show_fig=True)

In [None]:
def prepare_edge_df(data, continuous=True):
    """Collect, per department-to-department transition, how attributes change.

    Every row is paired with the row that follows it (``shift(-1)``); pairs
    whose two rows belong to different cases are discarded. The remaining
    pairs are grouped by (source department, target department) and, for each
    attribute column, the source and target values are combined pairwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log with ``case:concept:name`` and ``concept:name`` columns.
        NOTE(review): rows are presumably already ordered by case and
        timestamp, with a unique index — confirm, since the pairing relies on
        row order and an index join.
    continuous : bool, default True
        ``True``  -> ``value`` holds ``float(target) - float(source)`` for
        pairs where both sides are numeric and not missing.
        ``False`` -> ``value`` holds ``"<source>-<target>"`` strings for pairs
        where neither side is missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``concept:name_l``, ``concept:name_r``, ``variable`` and
        ``value`` (a list per row). Attributes whose source and target lists
        are identical on an edge are left out.
    """
    edge_keys = ['concept:name_l', 'concept:name_r']

    # Put each event and its successor on the same row, then keep only the
    # pairs that stay within one case.
    joined = data.join(data.shift(-1), lsuffix='_l', rsuffix='_r')
    joined = joined[joined['case:concept:name_l'] == joined['case:concept:name_r']]

    # One list of values per (edge, column), reshaped to long format.
    df_aggregated = joined.groupby(edge_keys).agg(lambda x: x.tolist())
    df_melted = pd.melt(df_aggregated.reset_index(), id_vars=edge_keys,
                        value_vars=df_aggregated.columns)
    df_final = df_melted[df_melted['value'].map(len) > 0]

    def _one_side(suffix):
        # Slice the suffix off by length. str.rstrip('_r') would strip the
        # character set {'_', 'r'} and turn e.g. 'gender_r' into 'gende', so
        # the two sides would no longer merge on the attribute name.
        side = df_final[df_final['variable'].str.endswith(suffix)]
        return side.assign(variable=side['variable'].str[:-len(suffix)])

    merged = _one_side('_l').merge(_one_side('_r'), on=['variable'] + edge_keys,
                                   suffixes=['_l', '_r'])

    # Attributes that never change along the edge carry no information.
    merged = merged[merged['value_l'] != merged['value_r']]

    def _is_number(value):
        return pd.api.types.is_numeric_dtype(type(value)) and not pd.isna(value)

    # Plain comprehensions instead of DataFrame.apply(axis=1): they also work
    # when no row is left.
    list_pairs = zip(merged['value_l'], merged['value_r'])
    if continuous:
        values = [
            [float(right) - float(left)
             for left, right in zip(lefts, rights)
             if _is_number(left) and _is_number(right)]
            for lefts, rights in list_pairs
        ]
    else:
        values = [
            [str(left) + '-' + str(right)
             for left, right in zip(lefts, rights)
             if not pd.isna(left) and not pd.isna(right)]
            for lefts, rights in list_pairs
        ]

    return merged.drop(columns=['value_l', 'value_r']).assign(value=values)

def filter_edge_df(df, attr, depart_A, depart_B):
    """Return the value list of one attribute on one department transition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``variable``, ``concept:name_l``, ``concept:name_r`` and
        ``value`` columns.
    attr : str
        Attribute name to look up in ``variable``.
    depart_A, depart_B : str
        Source and target department of the edge.

    Returns
    -------
    The ``value`` entry of the first matching row.

    Raises
    ------
    IndexError
        If no row matches the attribute and the edge.
    """
    matches = df.loc[
        (df['variable'] == attr)
        & (df['concept:name_l'] == depart_A)
        & (df['concept:name_r'] == depart_B),
        'value',
    ]
    if matches.empty:
        # Same exception type as the bare .iloc[0] lookup, but the message
        # says which combination is missing.
        raise IndexError(
            f"No values found for attribute {attr!r} on edge {depart_A!r} -> {depart_B!r}"
        )
    return matches.iloc[0]

def do_statistics_edge(young, old, attribute, depart_A, depart_B, continuous=True, show_fig=False):
    """Test whether an attribute's change along one edge differs between cohorts.

    Both logs go through ``prepare_edge_df`` and ``filter_edge_df`` to obtain
    the per-transition values for ``depart_A -> depart_B``; the two sample
    sizes are printed.

    * ``continuous=True``: returns the result of ``pg.mwu`` on the two value
      lists; with ``show_fig`` the histogram from ``make_figure`` is drawn first.
    * ``continuous=False``: builds an ``Age`` x ``attribute`` table, runs
      ``pg.chi2_independence`` and returns only the rows whose ``test`` is
      ``'pearson'``; with ``show_fig`` the bar chart from ``make_figure_cat``
      is drawn afterwards.
    """
    young_edges = prepare_edge_df(young, continuous=continuous)
    old_edges = prepare_edge_df(old, continuous=continuous)

    young_values = filter_edge_df(young_edges, attribute, depart_A, depart_B)
    old_values = filter_edge_df(old_edges, attribute, depart_A, depart_B)

    print(len(young_values))
    print(len(old_values))

    if continuous:
        if show_fig:
            make_figure(young_values, old_values, attribute)
        return pg.mwu(young_values, old_values)

    # Categorical path: one long frame with a cohort tag per observation.
    tagged_frames = [
        pd.DataFrame(data={attribute: pd.Series(values)}).assign(Age=tag)
        for values, tag in ((young_values, '<60'), (old_values, '>60'))
    ]
    df_combined = pd.concat(tagged_frames)

    _expected, _observed, test_table = pg.chi2_independence(df_combined, x='Age', y=attribute)
    pearson_stats = test_table[test_table['test'] == 'pearson']
    if show_fig:
        make_figure_cat(young_values, old_values, attribute)
    return pearson_stats


In [None]:
# Compare how 'Abnormal Phosphate' changes on the Surgical ICU -> Post-ICU
# Medicine transition between the under-60 cohort (first argument) and the
# over-60 cohort (second argument). continuous=False selects the categorical
# path, and the bar chart is drawn.
do_statistics_edge(kidney_under_60, kidney_over_60, 'Abnormal Phosphate', 'Surgical ICU', 'Post-ICU Medicine', continuous=False, show_fig=True)