In [1]:
# Automatically reload imported modules before each cell runs, so edits to
# the project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render inline figures in the high-resolution "retina" format.
%config InlineBackend.figure_format = "retina"

In [2]:
import sys

In [3]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

sys.path.append('./../src/')
from manuscript import sankey_side_by_side as sankey
from manuscript import clustering, datasets, inout, export

# Display / plotting defaults used by every figure in this notebook.
pd.options.display.max_columns = 200
mpl.rcParams["figure.figsize"] = (10, 8)
# Embed fonts as Type 42 (TrueType) in exported PDFs.
mpl.rcParams["pdf.fonttype"] = 42
mpl.rcParams["font.family"] = "Arial"

# Inject CSS that widens the notebook's `.container` element to 90% width.
import IPython.display
IPython.display.display(IPython.display.HTML("<style>.container { width:90% !important; }</style>"))

# Register every .ttf file found in the project's 'fonts' resource folder
# with matplotlib's font manager.
fonts = inout.get_resource_path('fonts')
for f in os.listdir(fonts):
    if f.endswith(".ttf"):
        mpl.font_manager.fontManager.addfont(f"{fonts}/{f}")

In [4]:
# Output location: both values are combined into the paths passed to the
# `export` / `inout` helpers further down.
user = 'general'     # defines top hierarchy of output folder
outfolder = '04c_clustering_stability_against_removal'    # name of notebook

# Read by dump_figure(): when False, dump_figure() does not export anything.
save = True

In [5]:
def dump_figure(name):
    """Export the current figure under this notebook's output folder.

    Nothing is exported unless the notebook-level ``save`` flag is truthy.

    Parameters
    ----------
    name : str
        File name (relative to ``outfolder``) handed to ``export.image``.
    """
    if not save:
        return

    target = f'{outfolder}/{name}'
    export.image(user, target)

In [6]:
import scipy.cluster.hierarchy

In [7]:
import random

In [8]:
# Cluster count used for the main per-patient heatmap export
# (selects `assignments[number_of_clusters]` and names the output folder).
number_of_clusters = 14

# Get Data, as in reference

In [9]:
def reweight(data_with_mortality, data_mtx, columns=None, threshold_for_relatedness=0.7):
    """Percentile-rank the feature matrix and derive a reweighted copy of it.

    Parameters
    ----------
    data_with_mortality : pandas.DataFrame
        Table from which groups of related features are identified; only the
        ``columns`` subset is passed to ``clustering.identify_related_features``.
    data_mtx : pandas.DataFrame
        Feature matrix to transform. It is not modified.
    columns : list-like, optional
        Feature columns used to identify related features. When omitted, the
        notebook-level ``data_columns`` is used (the original behaviour), so
        that global must exist by the time the function is called.
    threshold_for_relatedness : float, optional
        Key selecting which set of feature groups (as returned by
        ``clustering.identify_related_features``) is used for reweighting.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(data_mtx, data_mtx_for_similarity)``: the column-wise percentile
        ranks of the input, and the result of
        ``clustering.reweight_related_features`` applied to a copy of them.
    """
    if columns is None:
        # Backward-compatible fallback to the notebook global.
        columns = data_columns

    # rank() returns a new frame, so the caller's matrix is left untouched.
    ranked_mtx = data_mtx.rank(axis=0, pct=True)

    # Reweight features
    cutoff_groups = clustering.identify_related_features(data_with_mortality[columns])

    # Hand a copy to the reweighting helper so the ranked matrix returned to
    # the caller cannot be altered by it.
    data_mtx_for_similarity = clustering.reweight_related_features(
        ranked_mtx.copy(),
        approach='mean_rank',
        groups=cutoff_groups[threshold_for_relatedness])

    return ranked_mtx, data_mtx_for_similarity

In [10]:
# Load the full input table; the first CSV column becomes the row index.
# The 'Patient_id' column of this table is used below to pick patients to drop.
data_with_mortality_raw = pd.read_csv(
    inout.get_material_path('general/03_overwrite_PF_Cr/03data-external_220901_1010.csv.gz'), 
    index_col=0)

In [11]:
# Feature columns used for clustering; also read as a global by reweight().
data_columns = clustering.get_reference_data_columns()
# Independent copy of just the feature columns (same rows/index as the raw table).
data_mtx_raw = data_with_mortality_raw[data_columns].copy()

In [13]:
# Number of random patient draws in the loop below. This is an upper bound on
# the patients actually processed: draws whose assignments file already exists
# are skipped.
number_of_randomizations = 100

In [14]:
%%time
counter = 0
for j in range(0, number_of_randomizations):

    patient_to_drop = random.choice(
        data_with_mortality_raw['Patient_id'].unique()
    )
    p = inout.get_material_path(f'{user}/{outfolder}/assignments/{patient_to_drop}.csv')
    if os.path.exists(p):
        continue
    else:
        counter = counter+1
        print(counter)
    
    f = data_with_mortality_raw['Patient_id'] == patient_to_drop

    data_with_mortality = data_with_mortality_raw.loc[~f, :].copy()
    data_mtx = data_mtx_raw.loc[~f, :]. copy()
    data_mtx, data_mtx_for_similarity = reweight(data_with_mortality, data_mtx)

    corr_mtx = data_mtx_for_similarity.transpose().corr("pearson")
    data_dist = clustering.get_distances(corr_mtx, approach='euclidean')   
    tree = clustering.get_tree(df_dist=data_dist, approach='ward')
    
    range_to_probe = range(4, 21)
    
    _, assignments = clustering.table_with_assignments(tree, data_with_mortality.index, threshold_range=range_to_probe)
    
    agg = []
    for threshold in range_to_probe:
        with_cluster_by_mortality, _ = clustering.infer_clustermortality_and_add_cluster_id(
            data_with_mortality, data_mtx, assignments[threshold])
        with_cluster_by_mortality = with_cluster_by_mortality[['Patient_id', 'ICU_stay', 'ICU_day', 'cluster_order']].rename_axis('pt_day').reset_index()
        with_cluster_by_mortality.loc[:, 'max_cluster'] = threshold
        with_cluster_by_mortality.loc[:, 'dropped_patient'] = patient_to_drop
        agg.append(with_cluster_by_mortality)
    with_cluster_by_mortality = pd.concat(agg)
    export.full_frame(user, f'{outfolder}/assignments/{patient_to_drop}.csv', with_cluster_by_mortality)
    

    clustering.quilt(
        data_with_mortality=data_with_mortality,
        data_mtx=data_mtx,
        assignment=assignments[number_of_clusters]
    )
    plt.title(patient_to_drop, fontsize=20)
    export.image(user, f'{outfolder}/heatmap_at_{number_of_clusters}/{patient_to_drop}.pdf')
    plt.close()

    clustering.quilt(
        data_with_mortality=data_with_mortality,
        data_mtx=data_mtx,
        assignment=assignments[5]
    )
    plt.title(patient_to_drop, fontsize=20)
    export.image(user, f'{outfolder}/heatmap_at_5/{patient_to_drop}.pdf')
    plt.close()

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
CPU times: user 5h 3s, sys: 11min 59s, total: 5h 12min 2s
Wall time: 2h 58min 23s
