In [None]:
from datetime import datetime
import os
import math
import pickle
import pandas as pd
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from bidict import bidict
import scipy
from scipy.signal import savgol_filter
from wordcloud import WordCloud

# Render every figure in this notebook at 200 dots per inch.
matplotlib.rcParams.update({'figure.dpi': 200})

From mixehr training run (with_ecg_quantiles):
* numOfPheTypes: 4
* numOfLabTypes: 1
* numOfPhenotypes: 27811
* numOfLabTests: 389
* numOfPats: 3932
* C_train: 5061127
* Training data file parsing completed.

### Summary
* `phi` gives, for each non-lab phenotype (ICD codes, prescriptions, note words, ECG quantile features), a weight for each topic
* `eta` gives the per-topic weights for lab tests only

In [None]:
# Name of the MixEHR training run to analyse; it is embedded in every output file name.
condition = "with_ecg_quantiles"

# The four parameter files share one prefix and differ only in their suffix.
mixehr_prefix = "./../data/train_mixehr_" + condition + "_JCVB0_nmar_K75_iter500_"

# The files have no header row, so columns are addressed by integer position later on.
eta, eta_normalized, phi, phi_normalized = (
    pd.read_csv(mixehr_prefix + suffix, header=None)
    for suffix in ("eta.csv", "eta_normalized.csv", "phi.csv", "phi_normalized.csv")
)

In [None]:
# Inspect the normalized phi table loaded above.
phi_normalized

In [None]:
# For each of the k topics, list the top_n_pheno highest-weighted phenotypes of every non-lab modality

k = 75  # number of topics to iterate over; presumably the "K75" in the MixEHR file names -- TODO confirm
top_n_pheno = 10  # how many phenotypes to list per topic and modality
verbose=True  # when True, ICD codes are printed together with their description


def describe_icd(icd9_code, icd_descrips_dict):
    """Look up the description of an ICD-9 code.

    Args:
        icd9_code: Code to describe; it is converted with ``str`` before the lookup.
        icd_descrips_dict: Mapping from code strings to descriptions.

    Returns:
        The mapped description, or ``str(icd9_code)`` when the code is not a key.
    """
    code_key = str(icd9_code)
    if code_key in icd_descrips_dict:
        return icd_descrips_dict[code_key]
    return code_key


# Integer type id of each data modality; column 0 of phi is compared against these values.
types = dict(icds=1, prescrips=2, labs=3, note_words=4, ecg_quantile_features=5)

# Stack the diagnosis and procedure description tables into one frame with a fresh 0..n-1 index.
icd_descrips_df = pd.concat(
    [
        pd.read_csv(descrips_csv)
        for descrips_csv in (
            "./../data/MIMIC_III_D_ICD_DIAGNOSES.csv",
            "./../data/D_ICD_PROCEDURES.csv",
        )
    ]
).reset_index(drop=True)

# Lookup from the code (as a string) to its long title; if a code string occurs more than once,
# the later row wins.
# NOTE(review): diagnoses and procedures share this one lookup, so a code string present in both
# files resolves to the procedure title -- confirm this is intended.
icd_descrips_dict = {
    code: title
    for code, title in zip(icd_descrips_df.ICD9_CODE.astype(str), icd_descrips_df.LONG_TITLE)
}

# The per-modality slice of phi and the id -> name mapper do not depend on the topic, so build
# them once up front instead of re-filtering and re-unpickling for every topic.
modalities = ["icds", "prescrips", "note_words", "ecg_quantile_features"]
modality_phi = {}
modality_mapper = {}
for modality in modalities:
    # Re-index from 0 so the labels returned by nlargest() can be used as positions with iloc.
    modality_phi[modality] = phi[phi[0] == types[modality]].reset_index(drop=True)
    # NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
    with open("./../data/mixehr/" + modality + "_mapper.pkl", 'rb') as mapper_file:
        modality_mapper[modality] = pickle.load(mapper_file)

for topic in range(k):
    print("----------TOPIC " + str(topic) + "----------")

    for modality in modalities:
        type_phi = modality_phi[modality]
        mapper = modality_mapper[modality]

        # Columns 0 and 1 hold the type id and the phenotype id, so topic t is column t + 2.
        top_rows = type_phi[topic+2].nlargest(n=top_n_pheno, keep='first').index.tolist()
        top_phe_ids = type_phi.iloc[top_rows][1]

        if verbose and modality == "icds":
            # Show the raw ICD code next to its description.
            top_phenotypes = [str(mapper[x]) + ": " + describe_icd(mapper[x], icd_descrips_dict) for x in top_phe_ids]
        else:
            top_phenotypes = [mapper[x] for x in top_phe_ids]
        print(modality + ": " + str(top_phenotypes) + "\n")

In [None]:
# Topics (0-based, as printed in the listing above) that the word-cloud cells below iterate over.
interesting_topics = [1, 7, 9, 10, 16, 23, 33, 34, 45, 50, 55]
# Alternative selection without topics 1 and 9:
#interesting_topics = [7, 10, 16, 23, 33, 34, 45, 50, 55]

In [None]:
# Plot one word cloud per interesting topic from that topic's top `top_n_pheno` ICD-9 codes

top_n_pheno = 10

# Use the DejaVu font when it is installed; otherwise pass None so WordCloud uses its default font
# instead of failing on machines without this absolute path.
icd_font_path = "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf"
if not os.path.exists(icd_font_path):
    icd_font_path = None

# Re-index from 0 so the labels returned by nlargest() can be used as positions with iloc.
icd9_phi = phi[phi[0] == types["icds"]].reset_index(drop=True)

# The mapper does not depend on the topic: load it once and close the file afterwards.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/icds_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

for topic in interesting_topics:
    print("----------TOPIC " + str(topic) + "----------")

    # Columns 0 and 1 hold the type id and the phenotype id, so topic t is column t + 2.
    top_rows = icd9_phi[topic+2].nlargest(n=top_n_pheno, keep='first').index.tolist()
    top_phe_ids = icd9_phi.iloc[top_rows][1]

    # Keep only the text before the first comma so the labels stay short.
    top_phenotypes = [describe_icd(mapper[x], icd_descrips_dict).split(",")[0] for x in top_phe_ids]

    # Shortened labels can collide; a label's frequency is the number of top codes sharing it.
    frequencies = {}
    for phe in top_phenotypes:
        frequencies[phe] = frequencies.get(phe, 0) + 1

    word_cloud = WordCloud(
        font_path=icd_font_path,
        width = 2000,
        height = 1000,
        random_state = 1,
        background_color = "white",
        colormap = "Set2",
        collocations = False
    ).generate_from_frequencies(frequencies)

    plt.imshow(word_cloud, interpolation = "bilinear")
    plt.axis("off")
    plt.savefig("topic_" + str(topic) + "_icds_word_cloud.png", format = "png")
    plt.show()
    # Close the figure so iterations do not pile up on backends where show() leaves it open.
    plt.close()

In [None]:
# Plot a single word cloud built from every ICD-9 phenotype in phi.
# NOTE(review): this cell was labelled "Topic 0 as the example", but no topic weight is used: a
# label's frequency is just how often it occurs in the list, so words are not ranked by topic.
# If weighting by topic 0 was intended, the values of icd9_phi[2] would be needed -- TODO confirm.

# All phenotype ids of the ICD slice (column 1), in row order.
phe_ids = icd9_phi[1]

# Load the mapper from the same data directory as the per-topic ICD cell and close the file.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/icds_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

phenotypes = [str(mapper[x]) + ": " + describe_icd(mapper[x], icd_descrips_dict) for x in phe_ids]

frequencies = {}
for phe in phenotypes:
    frequencies[phe] = frequencies.get(phe, 0) + 1

word_cloud = WordCloud(
    width = 3000,
    height = 2000,
    random_state = 1,
    background_color = "white",
    colormap = "Set2",
    collocations = False
).generate_from_frequencies(frequencies)

plt.imshow(word_cloud)
plt.axis("off")
# plt.savefig("topics_icds_all_word_cloud.png", format = "png")
plt.show()

In [None]:
# Plot one word cloud per interesting topic from that topic's top `top_n_pheno` prescriptions

top_n_pheno = 10

# Re-index from 0 so the labels returned by nlargest() can be used as positions with iloc.
prescrips_phi = phi[phi[0] == types["prescrips"]].reset_index(drop=True)

# The mapper does not depend on the topic: load it once and close the file afterwards.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/prescrips_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

for topic in interesting_topics:
    print("----------TOPIC " + str(topic) + "----------")

    # Columns 0 and 1 hold the type id and the phenotype id, so topic t is column t + 2.
    top_rows = prescrips_phi[topic+2].nlargest(n=top_n_pheno, keep='first').index.tolist()
    top_phe_ids = prescrips_phi.iloc[top_rows][1]

    top_phenotypes = [mapper[x] for x in top_phe_ids]

    # A label's frequency is the number of top phenotypes that map to it.
    frequencies = {}
    for phe in top_phenotypes:
        frequencies[phe] = frequencies.get(phe, 0) + 1

    word_cloud = WordCloud(
        width = 2000,
        height = 1000,
        random_state = 1,
        background_color = "white",
        colormap = "Set2",
        collocations = False
    ).generate_from_frequencies(frequencies)

    plt.imshow(word_cloud, interpolation = "bilinear")
    plt.axis("off")
    plt.savefig("topic_" + str(topic) + "_prescrips_word_cloud.png", format = "png")
    plt.show()
    # Close the figure so iterations do not pile up on backends where show() leaves it open.
    plt.close()

In [None]:
# Plot a single word cloud built from every prescription phenotype in phi.
# NOTE(review): this cell was labelled "Topic 0 as the example", but no topic weight is used: a
# label's frequency is just how often it occurs in the list, so words are not ranked by topic.
# If weighting by topic 0 was intended, the values of prescrips_phi[2] would be needed -- TODO confirm.

# All phenotype ids of the prescription slice (column 1), in row order.
phe_ids = prescrips_phi[1]

# Load the mapper from the same data directory as the per-topic prescription cell and close the file.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/prescrips_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

phenotypes = [mapper[x] for x in phe_ids]

frequencies = {}
for phe in phenotypes:
    frequencies[phe] = frequencies.get(phe, 0) + 1

word_cloud = WordCloud(
    width = 3000,
    height = 2000,
    random_state = 1,
    background_color = "white",
    colormap = "Set2",
    collocations = False
).generate_from_frequencies(frequencies)

plt.imshow(word_cloud)
plt.axis("off")
# plt.savefig("topics_prescrips_all_word_cloud.png", format = "png")
plt.show()

In [None]:
# Plot one word cloud per interesting topic from that topic's top `top_n_pheno` note words

top_n_pheno = 10

# Re-index from 0 so the labels returned by nlargest() can be used as positions with iloc.
notes_phi = phi[phi[0] == types["note_words"]].reset_index(drop=True)

# The mapper does not depend on the topic: load it once and close the file afterwards.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/note_words_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

for topic in interesting_topics:
    print("----------TOPIC " + str(topic) + "----------")

    # Columns 0 and 1 hold the type id and the phenotype id, so topic t is column t + 2.
    top_rows = notes_phi[topic+2].nlargest(n=top_n_pheno, keep='first').index.tolist()
    top_phe_ids = notes_phi.iloc[top_rows][1]

    top_phenotypes = [mapper[x] for x in top_phe_ids]

    # A label's frequency is the number of top phenotypes that map to it.
    frequencies = {}
    for phe in top_phenotypes:
        frequencies[phe] = frequencies.get(phe, 0) + 1

    word_cloud = WordCloud(
        width = 2000,
        height = 1000,
        random_state = 1,
        background_color = "white",
        colormap = "Set2",
        collocations = False
    ).generate_from_frequencies(frequencies)

    plt.imshow(word_cloud, interpolation = "bilinear")
    plt.axis("off")
    plt.savefig("topic_" + str(topic) + "_note_words_word_cloud.png", format = "png")
    plt.show()
    # Close the figure so iterations do not pile up on backends where show() leaves it open.
    plt.close()

In [None]:
# Plot a single word cloud built from every note-word phenotype in phi.
# NOTE(review): this cell was labelled "Topic 0 as the example", but no topic weight is used: a
# label's frequency is just how often it occurs in the list, so words are not ranked by topic.
# If weighting by topic 0 was intended, the values of notes_phi[2] would be needed -- TODO confirm.

# All phenotype ids of the note-word slice (column 1), in row order.
phe_ids = notes_phi[1]

# Load the mapper from the same data directory as the per-topic note-word cell and close the file.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/note_words_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

phenotypes = [mapper[x] for x in phe_ids]

frequencies = {}
for phe in phenotypes:
    frequencies[phe] = frequencies.get(phe, 0) + 1

word_cloud = WordCloud(
    width = 3000,
    height = 2000,
    random_state = 1,
    background_color = "white",
    colormap = "Set2",
    collocations = False
).generate_from_frequencies(frequencies)

plt.imshow(word_cloud)
plt.axis("off")
# plt.savefig("topics_note_words_all_word_cloud.png", format = "png")
plt.show()

In [None]:
# Plot one word cloud per interesting topic from that topic's top `top_n_pheno` ECG quantile features

top_n_pheno = 10

# Re-index from 0 so the labels returned by nlargest() can be used as positions with iloc.
ecg_phi = phi[phi[0] == types["ecg_quantile_features"]].reset_index(drop=True)

# The mapper does not depend on the topic: load it once and close the file afterwards.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/ecg_quantile_features_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

for topic in interesting_topics:
    print("----------TOPIC " + str(topic) + "----------")

    # Columns 0 and 1 hold the type id and the phenotype id, so topic t is column t + 2.
    top_rows = ecg_phi[topic+2].nlargest(n=top_n_pheno, keep='first').index.tolist()
    top_phe_ids = ecg_phi.iloc[top_rows][1]

    top_phenotypes = [mapper[x] for x in top_phe_ids]

    # A label's frequency is the number of top phenotypes that map to it.
    frequencies = {}
    for phe in top_phenotypes:
        frequencies[phe] = frequencies.get(phe, 0) + 1

    word_cloud = WordCloud(
        width = 2000,
        height = 1000,
        random_state = 1,
        background_color = "white",
        colormap = "Set2",
        collocations = False
    ).generate_from_frequencies(frequencies)

    plt.imshow(word_cloud, interpolation = "bilinear")
    plt.axis("off")
    plt.savefig("topic_" + str(topic) + "_ecg_quantile_features_word_cloud.png", format = "png")
    plt.show()
    # Close the figure so iterations do not pile up on backends where show() leaves it open.
    plt.close()

In [None]:
# Plot a single word cloud built from every ECG quantile feature in phi.
# NOTE(review): this cell was labelled "Topic 0 as the example", but no topic weight is used: a
# label's frequency is just how often it occurs in the list, so words are not ranked by topic.
# If weighting by topic 0 was intended, the values of ecg_phi[2] would be needed -- TODO confirm.

# All phenotype ids of the ECG slice (column 1), in row order.
phe_ids = ecg_phi[1]

# Load the mapper from the same data directory as the per-topic ECG cell and close the file.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/ecg_quantile_features_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

phenotypes = [mapper[x] for x in phe_ids]

frequencies = {}
for phe in phenotypes:
    frequencies[phe] = frequencies.get(phe, 0) + 1

word_cloud = WordCloud(
    width = 3000,
    height = 2000,
    random_state = 1,
    background_color = "white",
    colormap = "Set2",
    collocations = False
).generate_from_frequencies(frequencies)

plt.imshow(word_cloud)
plt.axis("off")
# plt.savefig("topics_ecg_quantile_features_all_word_cloud.png", format = "png")
plt.show()

In [None]:
# For each of the k topics, list the top_n_pheno highest-weighted lab tests among the eta rows
# whose column 2 equals 1 (the subset this notebook calls "abnormal")

k = 75  # number of topics to iterate over; presumably the "K75" in the MixEHR file names -- TODO confirm
top_n_pheno = 10  # how many lab tests to list per topic


def describe_lab(lab_code, lab_descrips_dict):
    """Look up the label of a lab item code.

    Args:
        lab_code: Code to describe; it is converted with ``str`` before the lookup.
        lab_descrips_dict: Mapping from code strings to labels.

    Returns:
        The mapped label, or ``str(lab_code)`` when the code is not a key.
    """
    code_key = str(lab_code)
    return lab_descrips_dict[code_key] if code_key in lab_descrips_dict else code_key

    
# Re-read the eta file so this cell can run on its own.
eta_csv_path = "./../data/train_mixehr_" + condition + "_JCVB0_nmar_K75_iter500_eta.csv"
eta = pd.read_csv(eta_csv_path, header=None)

# Keep only rows whose column 2 equals 1, re-indexed from 0 so labels double as positions.
# NOTE(review): the name implies column 2 flags abnormal results -- confirm against MixEHR's eta format.
is_abnormal = eta[2] == 1
eta_abnormal = eta[is_abnormal].reset_index(drop=True)

lab_descrips_df = pd.read_csv("./../data/D_LABITEMS.csv")

# Lookup from the item id (as a string) to its label; if an id occurs more than once, the later row wins.
lab_descrips_dict = {
    item_id: label
    for item_id, label in zip(lab_descrips_df.ITEMID.astype(str), lab_descrips_df.LABEL)
}

# The mapper does not depend on the topic: load it once and close the file afterwards.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/labs_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

for topic in range(k):
    print("----------TOPIC " + str(topic) + "----------")

    # Topic t is read from column t + 3: eta has three leading non-topic columns, of which
    # column 1 is the lab id passed to the mapper and column 2 is the flag eta_abnormal was filtered on.
    top_rows = eta_abnormal[topic+3].nlargest(n=top_n_pheno, keep='first').index.tolist()
    top_phe_ids = eta_abnormal.iloc[top_rows][1]

    # Show the raw lab item code next to its label.
    top_phenotypes = [str(mapper[x]) + ": " + describe_lab(mapper[x], lab_descrips_dict) for x in top_phe_ids]

    print("labs" + ": " + str(top_phenotypes) + "\n")

In [None]:
# Plot one word cloud per interesting topic from that topic's top `top_n_pheno` lab tests
# (taken from eta_abnormal, i.e. the eta rows whose column 2 equals 1)

top_n_pheno = 10

# The mapper does not depend on the topic: load it once and close the file afterwards.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/labs_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

for topic in interesting_topics:
    print("----------TOPIC " + str(topic) + "----------")

    # Topic t is read from column t + 3 of eta; column 1 is the lab id passed to the mapper.
    top_rows = eta_abnormal[topic+3].nlargest(n=top_n_pheno, keep='first').index.tolist()
    top_phe_ids = eta_abnormal.iloc[top_rows][1]

    top_phenotypes = [describe_lab(mapper[x], lab_descrips_dict) for x in top_phe_ids]

    # Different item ids can share a label; a label's frequency is the number of top labs carrying it.
    frequencies = {}
    for phe in top_phenotypes:
        frequencies[phe] = frequencies.get(phe, 0) + 1

    word_cloud = WordCloud(
        width = 2000,
        height = 1000,
        random_state = 1,
        background_color = "white",
        colormap = "Set2",
        collocations = False
    ).generate_from_frequencies(frequencies)

    plt.imshow(word_cloud)
    plt.axis("off")
    plt.savefig("topic_" + str(topic) + "_labs_word_cloud.png", format = "png")
    plt.show()
    # Close the figure so iterations do not pile up on backends where show() leaves it open.
    plt.close()

In [None]:
# Plot a single word cloud built from every lab test in eta_abnormal.
# NOTE(review): this cell was labelled "Topic 0 as the example", but no topic weight is used: a
# label's frequency is just how often it occurs in the list, so words are not ranked by topic.
# If weighting by topic 0 was intended, the values of eta_abnormal[3] would be needed -- TODO confirm.

# All lab ids of the filtered eta rows (column 1), in row order.
phe_ids = eta_abnormal[1]

# Load the mapper from the same data directory as the per-topic lab cells and close the file.
# NOTE(review): pickle can execute arbitrary code on load; only use trusted mapper files.
with open("./../data/mixehr/labs_mapper.pkl", 'rb') as mapper_file:
    mapper = pickle.load(mapper_file)

phenotypes = [describe_lab(mapper[x], lab_descrips_dict) for x in phe_ids]

frequencies = {}
for phe in phenotypes:
    frequencies[phe] = frequencies.get(phe, 0) + 1

word_cloud = WordCloud(
    width = 3000,
    height = 2000,
    random_state = 1,
    background_color = "white",
    colormap = "Set2",
    collocations = False
).generate_from_frequencies(frequencies)

plt.imshow(word_cloud)
plt.axis("off")
# plt.savefig("topics_labs_all_word_cloud.png", format = "png")
plt.show()

In [None]:
# Inspect the ICD-9 code -> long title lookup (diagnosis and procedure tables combined).
# Earlier spot checks, kept for reference:
#icd_descrips_dict['414']

#icd_descrips_df[icd_descrips_df["ICD9_CODE"] == 8964]
icd_descrips_dict

In [None]:
# Inspect the ITEMID -> LABEL lookup built from D_LABITEMS.csv.
lab_descrips_dict

In [None]:
# Displays the ITEMID -> LABEL lookup again (same expression as the previous cell).
lab_descrips_dict

In [None]:
# eta appears to hold only lab-test rows. NOTE(review): 778 is presumably the row count,
# i.e. 389 lab tests x 2 values of column 2 -- TODO confirm.
eta # 778, seems to be related to specifically labs

In [None]:
# TODO: find out how MixEHR normalizes eta; the values observed here all lie between 0 and 1.
eta_normalized # Normalized how?? All the values are between 0 and 1

In [None]:
# Print the distinct type ids found in column 0 of phi, then display the full table.
print(phi[0].unique()) # the lab type id is absent here; lab tests are read from the eta file instead
phi