In [12]:
# Load the skip-thoughts model once. This cell only loads it; a later cell
# passes `model` to skipthoughts.encode() to embed the diagnosis strings.
import skipthoughts
model = skipthoughts.load_model()

Loading model parameters...
Compiling encoders...
Loading tables...
Packing up...


In [13]:
import codecs
import glob
import io
import os
import pandas as pd
import re

# Directory holding the intermediate data files used by this notebook.
# NOTE(review): this is a hard-coded absolute path; adjust it when running on
# another machine.
intermediate_directory = os.path.join('/home/ubuntu/workspace/notebooks/clustering/data')

filename = os.path.join(intermediate_directory, 'diag_10K.txt')

# Read the whole file inside a context manager so the handle is closed
# deterministically (the previous `io.open(...).read()` never closed it).
with io.open(filename, 'r', encoding='latin-1') as diag_file:
    FI = diag_file.read()

# One list entry per '\n'-separated line. If the file ends with a newline,
# split() yields a final empty string, which is why empty entries are
# filtered out before encoding.
diags_list = FI.split('\n')
print('Num diagnoses =', len(diags_list))
print('Num unique diagnoses = ', len(set(diags_list)))

# Keep only the distinct diagnosis strings for the rest of the notebook.
diags_list = set(diags_list)

('Num diagnoses =', 10001)
('Num unique diagnoses = ', 2563)


In [14]:
# Drop empty strings and materialise the result as a real list.
# `filter(None, ...)` returns a list on Python 2 but a one-shot lazy iterator
# on Python 3; `str_list` is consumed here by the encoder and again when it is
# attached to the DataFrame as the "diags" column, so it must be reusable.
str_list = [diag for diag in diags_list if diag]

# Encode every diagnosis string with the skip-thoughts model; the result has
# one row per entry of `str_list`, in the same order.
all_diags_encoded_vectors = skipthoughts.encode(model, str_list, verbose=False)

In [15]:
# Wrap the encoded vectors in a DataFrame (one row per diagnosis) and attach
# the source text as a trailing "diags" column, then preview the first rows.
data_vectors = pd.DataFrame(all_diags_encoded_vectors).assign(diags=str_list)
data_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4791,4792,4793,4794,4795,4796,4797,4798,4799,diags
0,0.004081,-0.010978,-0.008206,0.003614,0.020669,0.004152,-0.020804,-0.071421,-0.022898,0.013983,...,0.004625,-0.003665,-0.025637,0.058502,-0.012533,0.01038,-0.026784,0.014798,-0.009865,Diseases of the respiratory system complicatin...
1,-0.012826,-0.018188,-0.015756,0.0007,-0.010938,0.019007,0.009468,-0.047084,-0.012875,0.006693,...,0.004653,0.004854,-0.007443,0.025474,-0.014876,0.023749,0.009612,0.007752,8.1e-05,Malignant neoplasm of overlapping sites of vulva
2,-0.010098,-0.01415,-0.011749,0.002176,-0.026151,0.013937,-0.016538,-0.091504,0.001925,0.019181,...,0.004945,0.027034,0.025409,0.009612,-0.004007,0.02502,-0.038544,-0.042793,-0.014311,Malignant neoplasm of parietal lobe
3,-0.012855,-0.011,-0.009148,0.004236,0.014994,0.001925,0.004141,-0.027751,9.2e-05,0.009492,...,0.005043,-0.001699,-0.003718,0.009594,1.7e-05,-0.015444,-0.008781,-0.010518,0.009912,"Major depressive disorder, recurrent, mild"
4,0.010658,-0.005366,0.01277,0.003583,-0.004319,0.015482,-0.005549,-0.095059,0.018238,-0.012403,...,0.003563,-0.047192,0.002699,0.002866,-0.007163,0.00497,-0.003704,-0.0014,-0.0004,Rheumatoid arthritis without rheumatoid factor...


In [16]:
# Sanity check: (rows, columns) of the frame holding the vectors plus "diags".
data_vectors.shape

(2562, 4801)

In [17]:
# Two-stage dimensionality reduction of the skip-thought vectors:
#   1. TruncatedSVD down to 50 components,
#   2. t-SNE on the SVD output down to 2 dimensions for plotting.
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

svd = TruncatedSVD(
    n_components=50,
    random_state=0,
)
svd_skipthought = svd.fit_transform(all_diags_encoded_vectors)

# `tsne_model` is kept as a notebook-level name because a later cell reuses it.
tsne_model = TSNE(
    n_components=2,
    verbose=1,
    random_state=0,
)
tsne_skipthought = tsne_model.fit_transform(svd_skipthought)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 2562
[t-SNE] Computed conditional probabilities for sample 2000 / 2562
[t-SNE] Computed conditional probabilities for sample 2562 / 2562
[t-SNE] Mean sigma: 0.289886
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.090928
[t-SNE] Error after 350 iterations: 1.090928


In [18]:
# Sanity check: the t-SNE output should have one 2-D point per diagnosis.
tsne_skipthought.shape

(2562, 2)

In [19]:
# Interactive Bokeh scatter plot of the 2-D t-SNE embedding of the
# skip-thought vectors, with the diagnosis text shown on hover.
import bokeh.plotting as bp
import numpy as np
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
output_notebook()

colormap = np.array([
    "#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9",
    "#68af4e", "#6e6cd5", "#e3be38", "#4e2d7c", "#5fdfa8",
])

# Plot data: t-SNE coordinates plus the diagnosis text (aligned on the index).
skip_df = pd.DataFrame(tsne_skipthought, columns=['x', 'y']).assign(
    diags=data_vectors["diags"]
)

plot_skipthought = bp.figure(
    plot_width=700,
    plot_height=600,
    title="SkipThought Clustering of Patient Diagnoses",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None,
    y_axis_type=None,
    min_border=1,
)
plot_skipthought.scatter(x='x', y='y', source=skip_df)

# Configure the hover tool created by the "hover" entry in `tools`.
hover = plot_skipthought.select({'type': HoverTool})
hover.tooltips = {"Diagnosis": "@diags"}
show(plot_skipthought)

In [23]:
# Cluster the skip-thought vectors with mini-batch k-means.
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.cluster import MiniBatchKMeans

num_clusters = 20
# random_state is pinned (matching the SVD and t-SNE cells) so the centroid
# initialisation and mini-batch sampling, and therefore the cluster labels,
# are reproducible across kernel restarts.
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                               init_size=1000, batch_size=1000, verbose=False, max_iter=1000,
                               random_state=0)
kmeans = kmeans_model.fit(all_diags_encoded_vectors)

# Cluster label for each diagnosis vector.
kmeans_clusters = kmeans.predict(all_diags_encoded_vectors)
# Representation of each vector in cluster-distance space (one column per
# cluster centre); this is what gets fed to t-SNE afterwards.
kmeans_distances = kmeans.transform(all_diags_encoded_vectors)

In [24]:
 # Re-fit the existing t-SNE model on the k-means distance features to get a
 # 2-D layout for the cluster plot (this replaces the model's previous fit).
 tsne_kmeans = tsne_model.fit_transform(kmeans_distances)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 2562
[t-SNE] Computed conditional probabilities for sample 2000 / 2562
[t-SNE] Computed conditional probabilities for sample 2562 / 2562
[t-SNE] Mean sigma: 0.115069
[t-SNE] KL divergence after 100 iterations with early exaggeration: 1.072368
[t-SNE] Error after 325 iterations: 1.072368


In [26]:
# Interactive Bokeh scatter plot of the k-means clusters, laid out with the
# t-SNE embedding of the cluster-distance features and coloured by cluster.
import numpy as np

# One colour per cluster id; indexed by the integer labels below, so it must
# contain at least as many entries as there are clusters.
colormap = np.array(["#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9", "#68af4e", "#6e6cd5",
                     "#e3be38", "#4e2d7c", "#5fdfa8", "#d34690", "#3f6d31", "#d44427", "#7fcdd8", 
                     "#cb4053", "#5e9981", "#803a62", "#9b9e39", "#c88cca", "#e1c37b", "#34223b"])

plot_kmeans = bp.figure(plot_width=700, plot_height=600, title="KMeans clustering of Patient Diagnoses",
                        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                        x_axis_type=None, y_axis_type=None, min_border=1)

kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y'])
kmeans_df['cluster'] = kmeans_clusters
# The column keeps its historical name 'question', but it holds the diagnosis
# text for each point.
kmeans_df['question'] = data_vectors["diags"]
# Store the per-point colour in the data source itself. Passing an array for
# `color` together with `source=` mixes two ways of supplying data, which
# newer Bokeh releases reject; referencing a source column works everywhere.
kmeans_df['cluster_color'] = colormap[kmeans_clusters]

plot_kmeans.scatter(x='x', y='y',
                    color='cluster_color',
                    source=kmeans_df)
hover = plot_kmeans.select(dict(type=HoverTool))
# A list of (label, value) tuples keeps the tooltip rows in a fixed order; the
# label says "Diagnosis" to match the skip-thought plot and the actual data.
hover.tooltips = [("Diagnosis", "@question"), ("cluster", "@cluster")]
show(plot_kmeans)