In [22]:
import pandas as pd
pd.options.mode.chained_assignment = None
# nltk for nlp
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# list of stopwords like articles, preposition
stop = set(stopwords.words('english'))
from string import punctuation
from collections import Counter
import re
import numpy as np
import os
# The file holds one free-text diagnosis per line. Read it line by line instead of
# using read_csv(sep='\n'): recent pandas releases reject a separator that equals the
# line terminator with a ValueError.
with open('data/diag_10K.txt', encoding='utf-8') as diag_file:
    diagnosis_lines = [line.rstrip('\r\n') for line in diag_file]
# Blank lines are dropped (read_csv skips them by default). Lines are otherwise kept
# verbatim: no CSV quote processing is applied.
data = pd.DataFrame({'diagnosis': [line for line in diagnosis_lines if line.strip()]})
data.head()

Unnamed: 0,diagnosis
0,"Major depressive disorder, single episode, in ..."
1,Congenital renal artery stenosis
2,Deformity of right orbit due to bone disease
3,Wegener's granulomatosis without renal involve...
4,Malignant neoplasm of heart


In [23]:
print('data shape:', data.shape)
# Keep the first occurrence of each diagnosis, then rebuild a 0..n-1 index.
# Without the reset, `data` keeps the gappy labels of the original 10,000 rows, and
# any later `some_df['diagnosis'] = data['diagnosis']` on a freshly built frame
# (which has a plain RangeIndex) is aligned by label and pairs rows incorrectly.
data = data.drop_duplicates('diagnosis').reset_index(drop=True)
print('data shape:', data.shape)

data shape: (10000, 1)
data shape: (2562, 1)


In [24]:
def tokenizer(text):
    """Split a diagnosis string into lowercase word tokens.

    The text is sentence-split and word-tokenized with NLTK. A token is then
    discarded when it is an English stopword (compared case-insensitively),
    is found in ``string.punctuation``, is one of a few tokenizer artefacts
    ("'s", "n't", quote marks, dashes, ellipses), or contains no ASCII letter.

    Parameters
    ----------
    text : str
        Raw diagnosis description.

    Returns
    -------
    list of str
        The surviving tokens, lowercased, in their original order. If `text`
        is not a string (for example a NaN cell), the error is printed and
        an empty list is returned so callers can still iterate the result.
    """
    artefacts = (u"'s", u"n't", u"...", u"''", u'``',
                 u'\u2014', u'\u2026', u'\u2013')
    try:
        words = [word
                 for sentence in sent_tokenize(text)
                 for word in word_tokenize(sentence)]
    except (TypeError, AttributeError) as e:
        # The original handler named an undefined `Error` class, so any failure
        # surfaced as a NameError. Only bad input types are handled here; other
        # problems (e.g. missing NLTK resources) are left to propagate.
        print(e)
        return []

    return [
        word.lower()
        for word in words
        if word.lower() not in stop
        # `punctuation` is a str, so this is a substring test, as in the original.
        and word not in punctuation
        and word not in artefacts
        and re.search('[a-zA-Z]', word)
    ]

# Tokenize every diagnosis once and store the token list next to the raw text.
data['tokens'] = data['diagnosis'].map(tokenizer)

# Sanity check: show the first five diagnoses together with their tokens.
preview_rows = data[['diagnosis', 'tokens']].head(5).itertuples(index=False, name=None)
for diagnosis, tokens in preview_rows:
    print('diagnosis: {}\ntokens: {}\n'.format(diagnosis, tokens))

def keywords(top_n=100, token_lists=None):
    """Report corpus-wide token statistics and return the most frequent tokens.

    Prints the total number of tokens and the number of distinct tokens.

    Parameters
    ----------
    top_n : int, optional
        How many of the most common tokens to return (default 100).
    token_lists : iterable of list of str, optional
        One token list per document. Defaults to the notebook-level
        ``data['tokens']`` column when omitted.

    Returns
    -------
    list of (str, int)
        ``(token, count)`` pairs, most frequent first.
    """
    if token_lists is None:
        token_lists = data['tokens']

    # Count incrementally instead of first concatenating every document into
    # one large list.
    counter = Counter()
    for token_list in token_lists:
        counter.update(token_list)

    print('Num tokens =', sum(counter.values()))
    print('Num unique tokens =', len(counter))
    return counter.most_common(top_n)

diagnosis: Major depressive disorder, single episode, in partial remission
tokens: ['major', 'depressive', 'disorder', 'single', 'episode', 'partial', 'remission']

diagnosis: Congenital renal artery stenosis
tokens: ['congenital', 'renal', 'artery', 'stenosis']

diagnosis: Deformity of right orbit due to bone disease
tokens: ['deformity', 'right', 'orbit', 'due', 'bone', 'disease']

diagnosis: Wegener's granulomatosis without renal involvement
tokens: ['wegener', 'granulomatosis', 'without', 'renal', 'involvement']

diagnosis: Malignant neoplasm of heart
tokens: ['malignant', 'neoplasm', 'heart']



In [25]:
# Print the corpus token totals and display the most frequent tokens with their counts.
keywords()

Num tokens = 13554
Num unique tokens = 1431


[('neoplasm', 573),
 ('disease', 423),
 ('malignant', 370),
 ('rheumatoid', 365),
 ('diseases', 314),
 ('disorders', 283),
 ('right', 246),
 ('left', 246),
 ('elsewhere', 243),
 ('classified', 242),
 ('disorder', 222),
 ('diabetes', 196),
 ('mellitus', 194),
 ('due', 192),
 ('arthritis', 161),
 ('benign', 135),
 ('type', 127),
 ('without', 119),
 ('renal', 107),
 ('pregnancy', 107),
 ('chronic', 105),
 ('diabetic', 95),
 ('complicating', 93),
 ('leukemia', 77),
 ('heart', 76),
 ('pulmonary', 74),
 ('acute', 69),
 ('shoulder', 68),
 ('foot', 66),
 ('behavior', 65),
 ('uncertain', 64),
 ('system', 63),
 ('ankle', 62),
 ('hand', 60),
 ('trimester', 58),
 ('sites', 58),
 ('condition', 57),
 ('hip', 55),
 ('cardiac', 55),
 ('remission', 53),
 ('neoplastic', 53),
 ('complications', 53),
 ('kidney', 52),
 ('abuse', 51),
 ('artery', 50),
 ('coronary', 50),
 ('limb', 49),
 ('retinopathy', 48),
 ('wrist', 47),
 ('gout', 47),
 ('impairment', 47),
 ('factor', 46),
 ('underlying', 45),
 ('elbow', 4

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df is the minimum number of documents that must contain a term for it to be kept
# (max_features is not set here, so the vocabulary size is not capped)
# TfidfVectorizer splits the descriptions using the tokenizer we defined above

vectorizer = TfidfVectorizer(min_df=10, tokenizer=tokenizer)
vz = vectorizer.fit_transform(list(data['diagnosis']))
print('data shape:', vz.shape)

# One row per vocabulary term holding its inverse document frequency (idf_).
# The frame and its column keep the name 'tfidf' because later cells refer to them.
# Built directly instead of via `pd.DataFrame(columns=...).from_dict(...)`, which
# called a classmethod on a throwaway instance and then had to rename the column.
tfidf = pd.DataFrame({'tfidf': vectorizer.idf_}, index=vectorizer.get_feature_names())
tfidf['tfidf'].hist(bins=50, figsize=(15, 7))


data shape: (2562, 233)


<matplotlib.axes._subplots.AxesSubplot at 0x7fade566ba58>

In [27]:
# The 30 terms with the lowest IDF weight, i.e. the ones occurring in the most diagnoses.
tfidf.sort_values(by=['tfidf'], ascending=True).head(30)


Unnamed: 0,tfidf
neoplasm,2.496304
disease,2.813452
malignant,2.932732
diseases,3.096361
disorders,3.250512
left,3.339545
right,3.339545
elsewhere,3.351766
classified,3.355872
disorder,3.441762


In [28]:
# The 30 terms with the highest IDF weight, i.e. the rarest terms that still passed min_df.
tfidf.sort_values(by=['tfidf'], ascending=False).head(30)

Unnamed: 0,tfidf
immunodeficiency,6.451038
séry,6.451038
chagas,6.451038
ureter,6.451038
underdosing,6.451038
loss,6.451038
blood,6.451038
teeth,6.451038
cerebrovascular,6.451038
vessels,6.451038


In [29]:
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

# Two-stage reduction: TruncatedSVD compresses the sparse tf-idf matrix to 10 dense
# components, then t-SNE embeds those into 2-D for plotting. Both are seeded.
svd = TruncatedSVD(n_components=10, random_state=0)
svd_tfidf = svd.fit_transform(vz)

tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

# Only the last expression of a cell is displayed, so the bare `.shape` expressions
# that used to sit between the steps were no-ops and have been dropped.
tsne_tfidf

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 2562
[t-SNE] Computed conditional probabilities for sample 2000 / 2562
[t-SNE] Computed conditional probabilities for sample 2562 / 2562
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.736943
[t-SNE] Error after 350 iterations: 0.736943


array([[ -8.14671387,  -9.69770862],
       [  5.62743827,  14.22614782],
       [  1.75142164, -11.27123109],
       ..., 
       [ -6.79362576, -15.34408223],
       [  9.06261689,  -7.65758356],
       [  1.39495361, -17.8947697 ]])

In [30]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()

plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="tf-idf clustering of the diagnoses",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)
tfidf_df = pd.DataFrame(tsne_tfidf, columns=['x', 'y'])
# Assign by position, not by index label: row i of the embedding belongs to the i-th
# row of `data`. Assigning the Series itself would align on `data`'s index labels,
# which need not be 0..n-1 after de-duplication, and attach wrong/NaN diagnoses.
tfidf_df['diagnosis'] = data['diagnosis'].values
plot_tfidf.scatter(x='x', y='y', source=tfidf_df)
# Show the diagnosis text when hovering over a point.
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"diagnosis": "@diagnosis"}
show(plot_tfidf)

In [31]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn.cluster import MiniBatchKMeans

num_clusters = 20
# Seeded so that cluster ids (and the colours/labels derived from them in later cells)
# are stable across re-runs, in line with the SVD and t-SNE steps that fix random_state.
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                               init_size=1000, batch_size=1000, verbose=False, max_iter=1000,
                               random_state=0)
kmeans = kmeans_model.fit(vz)
# Cluster index assigned to each document.
kmeans_clusters = kmeans.predict(vz)
# Documents mapped into cluster-distance space (one column per centroid).
kmeans_distances = kmeans.transform(vz)

In [32]:
# For every centroid, order the vocabulary columns from heaviest to lightest weight,
# then print the ten heaviest terms of each cluster.
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    top_terms = [terms[j] for j in sorted_centroids[i, :10]]
    print("Cluster %d:" % i)
    print(''.join(term + ' | ' for term in top_terms))
    print()


Cluster 0:
benign | neoplasm | leukemia | vascular | acute | remission | relapse | achieved | myeloid | gland | 

Cluster 1:
malignant | neoplasm | overlapping | sites | right | secondary | left | gland | limb | wall | 

Cluster 2:
disorder | ligament | bipolar | episode | due | mixed | sleep | type | depressive | cervical | 

Cluster 3:
disorders | metabolism | due | visual | related | nerve | inflammatory | disc | intervertebral | side | 

Cluster 4:
mellitus | diabetes | diabetic | underlying | type | condition | due | peripheral | coma | without | 

Cluster 5:
nodule | rheumatoid | deformans | osteitis | neoplastic | ankle | left | foot | diseases | right | 

Cluster 6:
disease | neoplastic | pathological | fracture | reiter | chronic | kaschin-beck | intestine | right | deposition | 

Cluster 7:
pulmonary | acute | valve | vessels | chronic | artery | blood | congenital | edema | laceration | 

Cluster 8:
rheumatoid | arthritis | factor | right | left | without | bursitis | diseas

In [33]:
# 2-D embedding of the documents based on their distances to the k-means centroids.
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)

# Per-cluster listing of the ten highest-weighted vocabulary terms.
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    heaviest_columns = sorted_centroids[i, :10]
    line = ''.join(map('{} | '.format, (terms[j] for j in heaviest_columns)))
    print("Cluster %d:" % i)
    print(line)
    print()

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 2562
[t-SNE] Computed conditional probabilities for sample 2000 / 2562
[t-SNE] Computed conditional probabilities for sample 2562 / 2562
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.762624
[t-SNE] Error after 250 iterations: 0.762624
Cluster 0:
benign | neoplasm | leukemia | vascular | acute | remission | relapse | achieved | myeloid | gland | 

Cluster 1:
malignant | neoplasm | overlapping | sites | right | secondary | left | gland | limb | wall | 

Cluster 2:
disorder | ligament | bipolar | episode | due | mixed | sleep | type | depressive | cervical | 

Cluster 3:
disorders | metabolism | due | visual | related | nerve | inflammatory | disc | intervertebral | side | 

Cluster 4:
mellitus | diabetes | diabetic | underlying | type | condition | due | peripheral | coma | without | 

Cluster 5

In [34]:
# NOTE(review): this repeats the t-SNE fit already performed in the previous cell on the
# same input with the same seeded model (random_state=0); it looks redundant - confirm
# and drop one of the two to avoid paying for the embedding twice.
tsne_kmeans = tsne_model.fit_transform(kmeans_distances)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 2562
[t-SNE] Computed conditional probabilities for sample 2000 / 2562
[t-SNE] Computed conditional probabilities for sample 2562 / 2562
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.762624
[t-SNE] Error after 250 iterations: 0.762624


In [43]:
# One colour per cluster id; the palette has 20 entries, matching num_clusters = 20.
colormap = np.array(["#6d8dca", "#69de53", "#723bca", "#c3e14c", "#c84dc9", 
                     "#68af4e", "#6e6cd5", "#e3be38", "#4e2d7c", "#5fdfa8", 
                     "#d34690", "#3f6d31", "#d44427", "#7fcdd8", "#cb4053", 
                     "#5e9981", "#803a62", "#9b9e39", "#c88cca", "#e1c37b"])

plot_kmeans = bp.figure(plot_width=700, plot_height=600, title="KMeans clustering of the diagnoses",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)
kmeans_df = pd.DataFrame(tsne_kmeans, columns=['x', 'y'])
kmeans_df['cluster'] = kmeans_clusters
# Assign by position, not by index label: row i of the embedding belongs to the i-th
# row of `data`. Assigning the Series itself would align on `data`'s index labels,
# which need not be 0..n-1 after de-duplication, and attach wrong/NaN diagnoses.
kmeans_df['diagnosis'] = data['diagnosis'].values
plot_kmeans.scatter(x='x', y='y', 
                    color=colormap[kmeans_clusters], 
                    source=kmeans_df)
# Show diagnosis text and cluster id when hovering over a point.
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips={"diagnosis": "@diagnosis", 'cluster': '@cluster'}
show(plot_kmeans)

In [36]:
# Peek at the first rows: t-SNE coordinates, assigned cluster id and diagnosis text.
kmeans_df.head()

Unnamed: 0,x,y,cluster,diagnosis
0,-9.055941,-15.589382,2,"Major depressive disorder, single episode, in ..."
1,-3.716584,12.633177,13,Congenital renal artery stenosis
2,-2.052942,-12.069255,6,Deformity of right orbit due to bone disease
3,-3.187004,14.22815,13,Wegener's granulomatosis without renal involve...
4,-16.585605,1.550624,1,Malignant neoplasm of heart


In [37]:
import lda
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
import logging
# Raise the "lda" logger's threshold to WARNING so lower-severity messages are hidden.
logging.getLogger("lda").setLevel(logging.WARNING)

In [39]:
# Bag-of-words counts over unigrams and bigrams; terms must occur in at least 4 documents.
cvectorizer = CountVectorizer(min_df=4, max_features=10000, tokenizer=tokenizer, ngram_range=(1,2))
cvz = cvectorizer.fit_transform(data['diagnosis'])
n_topics = 20
n_iter = 2000
# Seeded so topic ids and the printed summaries are reproducible across re-runs.
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter, random_state=0)
X_topics = lda_model.fit_transform(cvz)
n_top_words = 8
topic_summaries = []
topic_word = lda_model.topic_word_  # get the topic words
vocab = cvectorizer.get_feature_names()
# Build the array form of the vocabulary once rather than on every loop iteration.
vocab_array = np.array(vocab)
for i, topic_dist in enumerate(topic_word):
    # Indices of the n_top_words highest-probability terms, most probable first.
    top_indices = np.argsort(topic_dist)[:-(n_top_words+1):-1]
    summary = ' '.join(vocab_array[top_indices])
    topic_summaries.append(summary)
    print('Topic {}: {}'.format(i, summary))



Topic 0: disorders disease vascular left due metabolism neoplastic disease neoplastic
Topic 1: disorder bipolar disorders ligament bipolar disorder episode related disorder ligament
Topic 2: retinopathy diabetic retinopathy diabetes diabetic mellitus edema diabetes mellitus nonproliferative
Topic 3: foot ankle ankle foot diseases neoplastic right osteitis deformans
Topic 4: disease intestine right kaschin-beck disease reiter kaschin-beck reiter disease deposition
Topic 5: behavior leukemia neoplasm uncertain uncertain uncertain behavior neoplasm remission acute
Topic 6: disorder abuse psychotic disorder multiple psychotic multiple sites skin sleep
Topic 7: disease chronic acute virus liver myeloid myeloid leukemia syndrome
Topic 8: pulmonary cardiac disorders following disc valve region pregnancy
Topic 9: benign benign neoplasm neoplasm neoplasm left disorders left poisoning gland
Topic 10: renal due gout gout due impairment renal impairment due renal chronic
Topic 11: elsewhere classi

In [40]:
# Re-fit the shared t-SNE model on the LDA output (X_topics) to get 2-D plot coordinates.
tsne_lda = tsne_model.fit_transform(X_topics)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 2562
[t-SNE] Computed conditional probabilities for sample 2000 / 2562
[t-SNE] Computed conditional probabilities for sample 2562 / 2562
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.759037
[t-SNE] Error after 300 iterations: 0.759037


In [41]:
doc_topic = lda_model.doc_topic_
# Dominant topic of each document: the column with the largest weight in its row.
lda_keys = doc_topic.argmax(axis=1).tolist()
plot_lda = bp.figure(plot_width=700, plot_height=600, title="LDA topic visualization",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)
lda_df = pd.DataFrame(tsne_lda, columns=['x','y'])
# Assign by position, not by index label: row i of the embedding belongs to the i-th
# row of `data`. Assigning the Series itself would align on `data`'s index labels,
# which need not be 0..n-1 after de-duplication, and attach wrong/NaN diagnoses.
lda_df['diagnosis'] = data['diagnosis'].values
plot_lda.scatter(source=lda_df, x='x', y='y', color=colormap[lda_keys])

# Show the diagnosis text when hovering over a point.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips={"diagnosis":"@diagnosis"}
show(plot_lda)

In [42]:
# Token count per document, assigned by position: `lda_df` has a plain 0..n-1 index,
# while `data` may carry non-contiguous labels after de-duplication, so a label-aligned
# assignment would produce NaN or mismatched lengths.
lda_df['len_docs'] = data['tokens'].map(len).values
def prepareLDAData():
    """Assemble the keyword arguments passed to ``pyLDAvis.prepare``.

    Reads the notebook-level objects ``vocab``, ``lda_model``, ``lda_df`` and
    ``cvz``.

    Returns
    -------
    dict
        Keys ``vocab``, ``doc_topic_dists``, ``doc_lengths``,
        ``term_frequency`` and ``topic_term_dists``.
    """
    # `cvectorizer.vocabulary_` maps each term to its column index, not to a count.
    # The corpus-wide frequency of every term is the column sum of the count matrix,
    # which is ordered like `vocab`.
    term_frequency = np.asarray(cvz.sum(axis=0)).ravel()
    lda_inputs = {
        'vocab': vocab,
        'doc_topic_dists': lda_model.doc_topic_,
        'doc_lengths': list(lda_df['len_docs']),
        'term_frequency': term_frequency,
        'topic_term_dists': lda_model.components_
    }
    return lda_inputs
# Collect the model outputs in the keyword layout pyLDAvis.prepare is called with.
ldadata = prepareLDAData()
import pyLDAvis
pyLDAvis.enable_notebook()
prepared_data = pyLDAvis.prepare(**ldadata)
# Write the prepared visualization to an HTML file next to the notebook.
pyLDAvis.save_html(prepared_data,'./pyldadavis.html')