In [1]:
import os
import pandas as pd
import numpy as np
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec, LineSentence
from pprint import pprint
from copy import deepcopy
from multiprocessing import cpu_count
import gensim
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.manifold import TSNE
from sklearn import cluster

import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from bokeh.io import output_notebook
from bokeh.palettes import viridis
from bokeh.plotting import figure, show, output_file
from bokeh.models import HoverTool, CategoricalColorMapper
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label

# Render matplotlib figures inline in the notebook output area.
%matplotlib inline
# Load BokehJS into the notebook so that bokeh's show() renders plots inline.
output_notebook()

In [2]:
import plotly.offline as py
# Enable plotly's offline notebook mode so py.iplot() can draw figures in cell output.
# NOTE(review): connected=True presumably loads plotly.js from the CDN rather than
# embedding it in the notebook — confirm against the installed plotly version.
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [3]:
# Load a saved gensim Word2Vec model from the local file 'med_w2v'.
# NOTE(review): gensim's load() is pickle-based — only load model files from a trusted source.
model = Word2Vec.load('med_w2v')

In [4]:
# Russian Snowball stemmer; later cells stem tokens with it before looking them up in the model.
stemmer = SnowballStemmer("russian")

In [5]:
# Quick check of the stemmer on a sample word (the stem is shown as the cell output).
stemmer.stem("бронхит")

'бронх'

In [6]:
# Sanity check of the embedding: the 20 nearest neighbours of the stem 'ларинг'.
# The query goes through model.wv (the KeyedVectors) because Word2Vec.most_similar
# is a deprecated alias that newer gensim releases removed.
model.wv.most_similar('ларинг', topn=20)

[('ринофаринг', 0.7727226614952087),
 ('фаринг', 0.7435891032218933),
 ('ларинготрахе', 0.741643488407135),
 ('назофаринг', 0.7340249419212341),
 ('ларингит', 0.7321699857711792),
 ('трахе', 0.7102281451225281),
 ('тонзилл', 0.6485297679901123),
 ('трахеит', 0.636513352394104),
 ('стомат', 0.6119980216026306),
 ('аденоидит', 0.6078621745109558),
 ('катаральн', 0.6012295484542847),
 ('тонзилит', 0.5922653079032898),
 ('эндобронх', 0.5864366292953491),
 ('трахеобронх', 0.5848177075386047),
 ('трахеобронхит', 0.5798566341400146),
 ('глосс', 0.5764217972755432),
 ('ринофарингит', 0.5702421069145203),
 ('ангин', 0.563210666179657),
 ('рин', 0.5592923760414124),
 ('фаренг', 0.5573338270187378)]

In [7]:
# Load the disease table produced by the sibling parse-html step.
# NOTE(review): later cells read the first column as the disease name, the second
# as its text, and a 'topics' column holding ' $ '-separated categories — confirm
# against the CSV schema.
bolezni = pd.read_csv('../parse-html/bolezni.csv')

In [8]:
# Build (or load from cache) one vector per row of `bolezni`:
#   X — 2-D array, row i is the mean word2vec vector of the stems of row i's text;
#   Y — 1-D array, element i is the name taken from the first column of row i.
# Both are cached as text files so the slow loop only runs once.
X = []
Y = []

X_path = 'topic_vectors.txt'
Y_path = 'topic_names.txt'

if os.path.isfile(X_path) and os.path.isfile(Y_path):
    # Cache hit: reuse the vectors and names written by a previous run.
    X = np.loadtxt(X_path)
    # Names are Cyrillic, so the encoding is explicit instead of locale-dependent.
    # A cache written under a non-UTF-8 locale must be deleted and rebuilt.
    with open(Y_path, 'r', encoding='utf-8') as f:
        Y = np.array([line.rstrip('\n') for line in f])
else:
    # Read the stop-word list once; calling stopwords.words() for every token
    # re-reads the corpus file and makes the membership test a list scan.
    russian_stopwords = set(stopwords.words("russian"))

    for _, row in tqdm(bolezni.iterrows(), total=len(bolezni)):
        # Positional access via .iloc: first column is the name, second the text.
        name, text = row.iloc[0], row.iloc[1]

        tokens = gensim.utils.simple_preprocess(text)
        stems = (stemmer.stem(tok) for tok in tokens if tok not in russian_stopwords)
        # Keep only stems the model knows; model.wv[...] replaces the deprecated model[...].
        word_vectors = [model.wv[stem] for stem in stems if stem in model.wv.vocab]

        if not word_vectors:
            # np.mean of an empty list is a scalar NaN, which would silently make X
            # ragged and only blow up later; fail here with the offending row instead.
            raise ValueError('no in-vocabulary tokens for %r' % (name,))

        X.append(np.mean(word_vectors, axis=0))
        Y.append(name)

    X = np.array(X)
    Y = np.array(Y)

    # Write the cache to the same paths the cache check above reads from.
    np.savetxt(X_path, X)
    with open(Y_path, 'w', encoding='utf-8') as f:
        for name in Y:
            f.write('%s\n' % name)

In [9]:
# Sanity check: X and Y should have the same number of rows (one per disease).
X.shape, Y.shape

((3565, 250), (3565,))

In [10]:
# Category of each disease: the first entry of its ' $ '-separated 'topics' string.
X_categ = [topics.split(' $ ')[0] for topics in bolezni['topics']]

In [11]:
# Convert the category list to a NumPy array (same name, now an ndarray).
X_categ = np.array(X_categ)

In [12]:
def cos_sim(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length.

    Returns
    -------
    float
        dot(a, b) / (||a|| * ||b||), or 0.0 when either vector has zero norm.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # 0/0 would yield NaN, and np.argsort places NaN last, so a zero vector
        # would be ranked as the *most* similar item. Treat it as "no similarity".
        return 0.0
    return np.dot(a, b) / norm_product

In [13]:
def getSim(vec, topn=7):
    """Return the names of the `topn` documents most similar to `vec`.

    Similarity is the cosine similarity (via cos_sim) between `vec` and every
    row of the module-level matrix X; names come from the parallel array Y.

    Parameters
    ----------
    vec : array-like
        Query vector with the same dimensionality as the rows of X.
    topn : int, optional
        Number of names to return (default 7, the previously hard-coded value).

    Returns
    -------
    list
        Up to `topn` entries of Y in ascending order of similarity, so the
        best match is last. If `vec` is itself a row of X, that row's own
        name is among the results.
    """
    if topn <= 0:
        # order[-0:] would return every document, so handle this explicitly.
        return []
    similarities = np.array([cos_sim(vec, doc_vec) for doc_vec in X])
    order = np.argsort(similarities)
    return [Y[idx] for idx in order[-topn:]]

## Visualization

In [14]:
# 2-D t-SNE projection of the document vectors. t-SNE is stochastic, so the
# seed is fixed to make the embedding (and every plot below) reproducible.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
# svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)

In [15]:
%%time
# Project the document vectors X to 2-D with t-SNE; the recorded run took about 1.5 minutes.
X_embedded = tsne.fit_transform(X)

CPU times: user 1min 25s, sys: 7.47 s, total: 1min 33s
Wall time: 1min 36s


In [16]:
# Sanity check: one 2-D point per document.
X_embedded.shape

(3565, 2)

In [17]:
# Plotly scatter of the t-SNE points; the disease name is attached as hover text.
trace = go.Scatter(x=X_embedded[:, 0], y=X_embedded[:, 1], mode='markers', text=Y)

In [18]:
# Wrap the single trace in a list to pass to py.iplot.
data = [trace]

In [19]:
# Render the interactive plotly scatter in the notebook.
py.iplot(data)

In [20]:
# Column data shared by the bokeh glyphs: t-SNE coordinates plus the name and
# category of every disease (used by the hover tool and the colour mapper).
plot_columns = dict(
    X=X_embedded[:, 0],
    Y=X_embedded[:, 1],
    labels=Y,
    cat=X_categ,
)
source = ColumnDataSource(data=plot_columns)

In [21]:
# Hover tooltip rows: (caption, '@column' reference into the ColumnDataSource).
hover_fields = [("labels", "@labels"), ("cat", "@cat")]
hover = HoverTool(tooltips=hover_fields)

In [22]:
# Comma-separated names of the built-in bokeh tools to enable on the figure.
TOOLS="crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"
# The figure receives the custom hover tool object together with the tool-name string.
# NOTE(review): relies on bokeh accepting a list that mixes Tool objects and a
# comma-separated string (and tolerating the trailing comma) — confirm on upgrade.
p = figure(title='Diseases', tools=[hover, TOOLS])

In [23]:
# One viridis colour per distinct category. The categories are sorted so that the
# category -> colour assignment is the same on every run; iterating a bare set of
# strings depends on hash randomisation and changes between kernel sessions.
# NOTE(review): bokeh's viridis() is limited in how many colours it can return
# (256, presumably) — confirm if the number of categories grows.
categories = sorted(set(X_categ))
palette = viridis(len(categories))
color_map = CategoricalColorMapper(factors=categories, palette=palette)

In [24]:
# Draw one marker per disease, coloured by its category through the colour mapper.
p.scatter(x='X', y='Y', color={'field': 'cat', 'transform': color_map}, source=source)

In [25]:
# Display the bokeh figure.
show(p)

## Graph

In [None]:
# Undirected graph that will link each disease to its most similar diseases.
graph = nx.Graph()

In [None]:
# One node per disease name (identical names in Y collapse into a single node).
graph.add_nodes_from(Y)

In [None]:
# Connect every disease to the names returned by getSim for its own vector.
for i in tqdm(range(Y.shape[0])):
    for neighbour in getSim(X[i]):
        # getSim ranks X[i] against all rows of X, including itself, so the
        # document's own name comes back as a "neighbour"; skip it to avoid
        # adding a self-loop on every node.
        if neighbour != Y[i]:
            graph.add_edge(Y[i], neighbour)

In [None]:
# plt.figure(figsize=(25,25))
# options = {
#     'edge_color': '#FFDEA2',
#     'width': 1,
#     'with_labels': True,
#     'font_weight': 'regular',
# }

# nx.draw(graph, pos=nx.spring_layout(graph, k=0.25, iterations=50), **options)
# plt.savefig("Graph.png", format="PNG")