In [None]:
import os
import math
import pickle
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from bidict import bidict

# Default resolution for figures created in this notebook; the savefig calls
# further down pass their own dpi explicitly.
matplotlib.rcParams['figure.dpi'] = 200

In [None]:
# Load the diagnoses table and display the rows belonging to subject 20.
icds = pd.read_csv("MIMIC_III_DIAGNOSES_ICD.csv")
icds.loc[icds["SUBJECT_ID"] == 20]

From the MixEHR training run log (condition `with_ecg_quantiles`):
* numOfPheTypes: 4
* numOfLabTypes: 1
* numOfPhenotypes: 27811
* numOfLabTests: 389
* numOfPats: 3932
* C_train: 5061127
* Training data file parsing completed.

### Summary of the model outputs
* alpha gives a single weight for each topic
* beta gives a single weight for each phenotype
* phi gives, for each phenotype (apart from labs), a weight for each topic
* psi does the same as phi, but for labs
* psiHyper, eta, and zeta are also specific to labs

In [None]:
condition = "with_ecg_quantiles"


def _read_param(name):
    """Read one header-less parameter CSV of the K75 / iter500 run for `condition`."""
    return pd.read_csv(f"train_mixehr_{condition}_JCVB0_nmar_K75_iter500_{name}.csv", header=None)


# One table per parameter file written by the training run.
alpha = _read_param("alpha")
beta = _read_param("beta")
eta = _read_param("eta")
eta_normalized = _read_param("eta_normalized")
phi = _read_param("phi")
phi_normalized = _read_param("phi_normalized")
psi = _read_param("psi")
psiHyper = _read_param("psiHyper")
zeta = _read_param("zeta")

In [None]:
# Inspect the normalized phi table loaded in the previous cell.
phi_normalized

In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, MeanShift, DBSCAN
from sklearn import decomposition
import seaborn as sns

from sklearn.preprocessing import StandardScaler

# Cluster the patient rows of the metaphe table with k-means, then show the
# clusters on a 2-D t-SNE embedding of the same rows.
n_clusters = 12

# NOTE: despite the variable name, the file read here is the *train* metaphe
# output; the validation alternative is kept below for reference.
test_pt_dist = pd.read_csv(
    "train_mixehr_with_ecg_quantiles_train_mixehr_with_ecg_quantiles_JCVB0_nmar_K75_iter500_metaphe.csv",
    header=None,
)
#test_pt_dist = pd.read_csv("vali_mixehr_no_waveforms_train_mixehr_no_waveforms_JCVB0_nmar_K75_iter500_metaphe.csv", header=None)

# Snapshot of the raw values, taken before any plotting columns are added.
scaled_data = test_pt_dist.to_numpy()

# Clustering runs on the full-dimensional data; t-SNE is only for display.
kmeans = KMeans(init="random", n_clusters=n_clusters, n_init=10, max_iter=1000, random_state=0)
kmeans.fit(scaled_data)
#meanshift = MeanShift().fit(scaled_data)
#dbscan = DBSCAN(eps=0.6, min_samples=15).fit(scaled_data)

tsne = TSNE(n_components=2, verbose=1, random_state=123)
z = tsne.fit_transform(scaled_data)

# The embedding axes are swapped on purpose or by accident (column 1 -> comp-1,
# column 0 -> comp-2); kept exactly as in the original figure.
test_pt_dist["comp-1"] = z[:, 1]
test_pt_dist["comp-2"] = z[:, 0]
# Cluster ids are shifted to start at 1 for the legend.
test_pt_dist["Cluster"] = kmeans.labels_ + 1

sns.set(rc={"figure.figsize": (6, 5)})
ax = sns.scatterplot(
    data=test_pt_dist,
    x="comp-1",
    y="comp-2",
    hue="Cluster",
    style="Cluster",
    palette="bright",
    legend="full",
)

# Fixed view window, hidden axes, then write the figure to disk.
ax.set_xlim(-100, 75)
ax.set_ylim(-100, 75)
ax.axis("off")
plt.tight_layout()
plt.savefig("train_pt_cluster_with_ecg_quantiles.png", dpi=300)
plt.show()

In [None]:
from sklearn import decomposition
import seaborn as sns

# 2-D PCA of the topics based on the ECG rows of the *normalized* phi table.
# Each topic (one phi column) becomes one point, annotated with its topic index.
phi = pd.read_csv("./train_mixehr_with_ecg_quantiles_JCVB0_nmar_K75_iter500_phi_normalized.csv", header=None)

phi = phi[phi[0] == 5] # Only ecg features

# Topics to exclude from the projection. Topic t sits in column t + 2, because
# the first two columns are not topic weights (they are sliced off below).
#dropped_topics = [67, 27, 39, 73, 6, 12, 19, 37, 49, 63, 51, 65, 72, 74]
dropped_topics = []
phi = phi.drop(labels=[t+2 for t in dropped_topics], axis=1)

# Transpose so that each row is a topic and each column an ECG feature.
scaled_data = phi.to_numpy()[:, 2:].T

pca = decomposition.PCA(n_components=2)
pca.fit(scaled_data)

pcad = pca.transform(scaled_data)

scaled_data = pcad

df = pd.DataFrame()
df["comp1"] = pcad[:, 0]
df["comp2"] = pcad[:, 1]

plt.figure(figsize=(6,5))
# NOTE(review): `size` is seaborn's semantic-mapping argument, not a marker size
# in points; if a fixed marker size was intended, `s=` is the matplotlib keyword.
# Left unchanged so the figure keeps its current appearance.
p1 = sns.scatterplot(x="comp1", # Horizontal axis
       y="comp2", # Vertical axis
       data=df, # Data source
       size = 8,
       legend=False)

# Topic index of every row that survived `dropped_topics`, in row order.
#labelled_points = [12, 63]
labels = [t for t in range(75) if t not in dropped_topics]
df["label"] = labels
#df["label"] = df.apply(lambda row: str(int(row["label"])) if row["label"] in labelled_points else "", axis=1)

#df = df[(df["comp1"] > -0.05) & (df["comp1"] < 0.05) & (df["comp2"] > -0.05) & (df["comp2"] < 0.05)]

# Annotate every row that is present in `df`. Iterating the rows directly
# (instead of range(len) plus `line in df["label"]`, which tests the *index*,
# not the label values) stays correct if the optional filter above is enabled.
# Only `fontsize` is passed: `size` is an alias of it and matplotlib rejects
# receiving both.
for row in df.itertuples(index=False):
    p1.text(row.comp1 + 0.001, row.comp2, str(row.label),
            horizontalalignment='left',
            fontsize=16, color='black', weight='semibold')

# plt.xlim([-0.02, 0.02])
# plt.ylim([-0.02, 0.02])

#plt.axis("off")
plt.tight_layout()
plt.savefig("ecg_pca_norm.png", dpi=300)
plt.show()

In [None]:
from sklearn import decomposition
import seaborn as sns

# 2-D PCA of the topics based on the ECG rows of the (unnormalized) phi table;
# only a hand-picked subset of topics gets a text label.
phi = pd.read_csv("./train_mixehr_with_ecg_quantiles_JCVB0_nmar_K75_iter500_phi.csv", header=None)

phi = phi[phi[0] == 5] # Only ecg features

# Drop the two leading non-topic columns and transpose: one row per topic.
scaled_data = phi.to_numpy()[:, 2:].T

pca = decomposition.PCA(n_components=2)
pca.fit(scaled_data)

pcad = pca.transform(scaled_data)

scaled_data = pcad

df = pd.DataFrame()
df["comp1"] = pcad[:, 0]
df["comp2"] = pcad[:, 1]

plt.figure(figsize=(4,2))
# NOTE(review): `size` is seaborn's semantic-mapping argument, not a marker size
# in points; use `s=` if a fixed marker size was intended. Left unchanged.
p1 = sns.scatterplot(x="comp1", # Horizontal axis
       y="comp2", # Vertical axis
       data=df, # Data source
       size = 8,
       legend=False)

# Topics to annotate; every other topic gets an empty label.
labelled_points = [32, 34, 50, 52, 66, 68, 72, 74]
df["label"] = [str(t) if t in labelled_points else "" for t in range(75)]

# Only `fontsize` is passed: `size` is an alias of it and matplotlib rejects
# receiving both keywords in the same call.
for row in df.itertuples(index=False):
    p1.text(row.comp1 + 0.01, row.comp2, row.label,
            horizontalalignment='left',
            fontsize=10, color='black', weight='semibold')

plt.axis("off")
plt.tight_layout()
# NOTE(review): two later cells also save to "ecg_pca.png", so on a full run
# this file ends up holding whichever of them executed last.
plt.savefig("ecg_pca.png", dpi=300)
plt.show()

In [None]:
from sklearn import decomposition
import seaborn as sns

# 2-D PCA of the topics based on the ECG rows of the (unnormalized) phi table,
# zoomed to a fixed window; only topics inside the window are labelled.
phi = pd.read_csv("./train_mixehr_with_ecg_quantiles_JCVB0_nmar_K75_iter500_phi.csv", header=None)

phi = phi[phi[0] == 5] # Only ecg features

# Drop the two leading non-topic columns and transpose: one row per topic.
scaled_data = phi.to_numpy()[:, 2:].T

pca = decomposition.PCA(n_components=2)
pca.fit(scaled_data)

pcad = pca.transform(scaled_data)

scaled_data = pcad

df = pd.DataFrame()
df["comp1"] = pcad[:, 0]
df["comp2"] = pcad[:, 1]

plt.figure(figsize=(6,5))
# All topics are drawn; the axis limits set further down crop the view.
# NOTE(review): `size` is seaborn's semantic-mapping argument, not a marker size
# in points; use `s=` if a fixed marker size was intended. Left unchanged.
p1 = sns.scatterplot(x="comp1", # Horizontal axis
       y="comp2", # Vertical axis
       data=df, # Data source
       size = 8,
       legend=False)

#labelled_points = [19, 32, 34, 50, 52, 66, 68, 72, 74]
df["label"] = list(range(75))
#df["label"] = df.apply(lambda row: str(int(row["label"])) if row["label"] in labelled_points else "", axis=1)

# Keep only the topics that fall inside the zoom window, so that no label is
# placed outside the visible area.
df = df[(df["comp1"] > -1000) & (df["comp1"] < 1000) & (df["comp2"] > -1000) & (df["comp2"] < 1000)]

# Label every remaining row. The rows are iterated directly: after filtering,
# the index has gaps, so looping over range(df.shape[0]) with label-based
# lookups silently skipped every surviving topic whose index was >= the number
# of remaining rows. Only `fontsize` is passed because `size` is an alias of it
# and matplotlib rejects receiving both.
for row in df.itertuples(index=False):
    p1.text(row.comp1 + 0.01, row.comp2, str(row.label),
            horizontalalignment='left',
            fontsize=16, color='black', weight='semibold')

plt.xlim([-1000, 1010])
plt.ylim([-1000, 1000])
plt.axis("off")
plt.tight_layout()
# NOTE(review): other cells in this notebook also save to "ecg_pca.png", so on
# a full run this file ends up holding whichever of them executed last.
plt.savefig("ecg_pca.png", dpi=300)
plt.show()

In [None]:
from sklearn import decomposition
import seaborn as sns

# 2-D PCA of the topics for the `only_ecg_quantiles` run, based on the ECG rows
# of its (unnormalized) phi table; every topic is labelled with its index.
phi = pd.read_csv("./train_mixehr_only_ecg_quantiles_JCVB0_nmar_K75_iter500_phi.csv", header=None)

phi = phi[phi[0] == 5] # Only ecg features

# Drop the two leading non-topic columns and transpose: one row per topic.
scaled_data = phi.to_numpy()[:, 2:].T

pca = decomposition.PCA(n_components=2)
pca.fit(scaled_data)

pcad = pca.transform(scaled_data)

scaled_data = pcad

df = pd.DataFrame()
df["comp1"] = pcad[:, 0]
df["comp2"] = pcad[:, 1]

plt.figure(figsize=(6,5))
# NOTE(review): `size` is seaborn's semantic-mapping argument, not a marker size
# in points; use `s=` if a fixed marker size was intended. Left unchanged.
p1 = sns.scatterplot(x="comp1", # Horizontal axis
       y="comp2", # Vertical axis
       data=df, # Data source
       size = 8,
       legend=False)

#labelled_points = [19, 32, 34, 50, 52, 66, 68, 72, 74]
df["label"] = list(range(75))
#df["label"] = df.apply(lambda row: str(int(row["label"])) if row["label"] in labelled_points else "", axis=1)

# Only `fontsize` is passed: `size` is an alias of it and matplotlib rejects
# receiving both keywords in the same call.
for row in df.itertuples(index=False):
    p1.text(row.comp1 + 0.01, row.comp2, str(row.label),
            horizontalalignment='left',
            fontsize=16, color='black', weight='semibold')

plt.axis("off")
plt.tight_layout()
# NOTE(review): earlier cells save the `with_ecg_quantiles` figures under this
# same file name, so this call overwrites them on a full run.
plt.savefig("ecg_pca.png", dpi=300)
plt.show()

In [None]:
# Inspect the patient table from the t-SNE / k-means cell; by now it also
# carries the "comp-1", "comp-2" and "Cluster" columns added there.
test_pt_dist

In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, MeanShift, DBSCAN
from sklearn import decomposition
import seaborn as sns

from sklearn.preprocessing import StandardScaler

# Project the validation-set metaphe rows onto two principal components and
# cluster the projected points with MeanShift.
n_clusters = 9  # only used by the commented-out k-means alternative below

val_pt_dist = pd.read_csv(
    "vali_mixehr_with_ecg_quantiles_train_mixehr_with_ecg_quantiles_JCVB0_nmar_K75_iter500_metaphe.csv",
    header=None,
)

scaled_data = val_pt_dist.to_numpy()

pca = decomposition.PCA(n_components=2).fit(scaled_data)
pcad = pca.transform(scaled_data)

# From here on the clustering operates on the 2-D projection, not the raw data.
scaled_data = pcad

#kmeans = KMeans(init="random", n_clusters=n_clusters, n_init=10, max_iter=1000, random_state=0).fit(scaled_data)
meanshift = MeanShift()
meanshift.fit(scaled_data)
#dbscan = DBSCAN(eps=0.6, min_samples=15).fit(scaled_data)

# tsne = TSNE(n_components=2, verbose=1, random_state=123)
# z = tsne.fit_transform(scaled_data)

# Plot table: the two components plus 1-based cluster ids for the legend.
df = pd.DataFrame({
    "comp-1": pcad[:, 0],
    "comp-2": pcad[:, 1],
    "Cluster": meanshift.labels_ + 1,
})

sns.set(rc={"figure.figsize": (6, 5)})
ax = sns.scatterplot(
    data=df,
    x="comp-1",
    y="comp-2",
    hue="Cluster",
    style="Cluster",
    palette="bright",
    legend="full",
)

ax.axis("off")
plt.tight_layout()
plt.savefig("vali_pca_cluster_with_ecg_quantiles.png", dpi=300)
plt.show()

In [None]:
# Inspect the alpha table (per the summary above: one weight per topic).
alpha

In [None]:
print(beta[0].unique()) # typeIds present in column 0; the "lab" typeId is missing
beta # TODO: does this show the fraction of patients who had this phenotype?

In [None]:
eta # 778 (presumably 2 x the 389 lab tests - TODO confirm); seems to be specific to labs

In [None]:
eta_normalized # TODO: find out how this is normalized; all the values are between 0 and 1

In [None]:
# NOTE(review): on a top-to-bottom run, `phi` is no longer the table read in the
# parameter-loading cell. The PCA cells above reassign it, the last one to the
# `only_ecg_quantiles` phi filtered to the rows where column 0 == 5.
print(phi[0].unique()) # Also missing lab typeIds
phi

In [None]:
psi # Seems to cover labs as well, but without regard to state

In [None]:
psiHyper # Also appears to be lab-related

In [None]:
zeta # Also lab-related, this time with states

In [None]:
# Raw training input for the `with_ecg_quantiles` condition; the next cell
# reads its "typeId" and "pheId" columns.
traindata = pd.read_csv("train_mixehr_with_ecg_quantiles.csv")

In [None]:
# Build a combined "<typeId>-<pheId>" key per row and count its distinct values.
type_part = traindata["typeId"].astype(str)
phe_part = traindata["pheId"].astype(str)
traindata["ALL_PHE"] = type_part.str.cat(phe_part, sep="-")
traindata["ALL_PHE"].nunique()