In [1]:
import os
import pandas as pd
import numpy as np
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec, LineSentence
from pprint import pprint
from copy import deepcopy
from multiprocessing import cpu_count
import gensim
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.manifold import TSNE
from sklearn import cluster

import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from bokeh.io import output_notebook
from bokeh.palettes import viridis
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import HoverTool, CategoricalColorMapper
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label

%matplotlib inline
output_notebook()

In [2]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [3]:
model = Word2Vec.load('med_w2v')

In [4]:
stemmer = SnowballStemmer("russian")

In [5]:
stemmer.stem("бронхит")

'бронх'

In [6]:
model.most_similar('ларинг', topn=20)

[('ринофаринг', 0.7727226614952087),
 ('фаринг', 0.7435891032218933),
 ('ларинготрахе', 0.741643488407135),
 ('назофаринг', 0.7340249419212341),
 ('ларингит', 0.7321699857711792),
 ('трахе', 0.7102281451225281),
 ('тонзилл', 0.6485297679901123),
 ('трахеит', 0.636513352394104),
 ('стомат', 0.6119980216026306),
 ('аденоидит', 0.6078621745109558),
 ('катаральн', 0.6012295484542847),
 ('тонзилит', 0.5922653079032898),
 ('эндобронх', 0.5864366292953491),
 ('трахеобронх', 0.5848177075386047),
 ('трахеобронхит', 0.5798566341400146),
 ('глосс', 0.5764217972755432),
 ('ринофарингит', 0.5702421069145203),
 ('ангин', 0.563210666179657),
 ('рин', 0.5592923760414124),
 ('фаренг', 0.5573338270187378)]

In [7]:
bolezni = pd.read_csv('../parse-html/bolezni.csv')

In [8]:
bolezni.head()

Unnamed: 0,name,descr,topics
0,Hallux valgus,Hallux valgus вальгусная деформация первого па...,Болезни ОДС и травмы
1,HELLP-синдром,Хотя последние годы HELLP синдром наблюдается ...,Женские болезни
2,Абдоминальная мигрень,Термин абдоминальная мигрень используется невр...,Нервные болезни
3,Абиотрофия сетчатки,Абиотрофия сетчатки дегенерация сетчатки дистр...,Глазные болезни $ Наследственные болезни
4,Абсанс,Первые упоминания абсанс датируются годом .Тер...,Нервные болезни


In [9]:
X = []
Y = []

X_path = 'topic_vectors.txt'
Y_path = 'topic_names.txt'

if(os.path.isfile(X_path) and os.path.isfile(Y_path)):
    print('load from .txt file')
    X = np.loadtxt(X_path)
    with open(Y_path, 'r') as f:
        for line in f.readlines():
            Y.append(line.replace('\n',''))
    Y = np.array(Y)
else:
    print('compute vectors')
    for d, ind in tqdm(bolezni.iterrows()):
        i = ind[1]
        tokens = gensim.utils.simple_preprocess(i)
        tmp2 = [word for word in tokens if word not in stopwords.words("russian")]
        word_vectors = []
        for tok in tmp2:
            s = stemmer.stem(tok)
            if(s in model.wv.vocab):
                word_vectors.append(model[s])
        word_vectors = np.array(word_vectors)
        final_vec = np.mean(word_vectors, axis=0)
        X.append(final_vec)
        Y.append(ind[0])

    X = np.array(X)
    Y = np.array(Y)
    
    np.savetxt('topic_vectors.txt', X)
    np.savetxt('topic_names.txt', Y, fmt="%s")

load from .txt file


In [10]:
X.shape, Y.shape

((3565, 250), (3565,))

In [11]:
X_categ = []
for i in bolezni['topics']:
    toks = i.split(' $ ')
    X_categ.append(toks[0])

In [12]:
X_categ = np.array(X_categ)

In [13]:
# np.savetxt('topic_class.txt', X_categ, fmt="%s")

In [14]:
def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

In [15]:
def getSim(vec):
    sim = []
    for i in X:
        sim.append(cos_sim(vec, i))
    indexs = np.argsort(np.array(sim))
    
    res = [(Y[i]) for i in indexs[-7:]]
    return res

## Visualization

In [16]:
tsne = TSNE(n_components=3, perplexity=30, n_iter=2000)
# svd = TruncatedSVD(n_components=2, n_iter=7, random_state=42)

In [17]:
%%time
X_embedded = tsne.fit_transform(X)

CPU times: user 6min 24s, sys: 20.2 s, total: 6min 45s
Wall time: 6min 55s


In [18]:
X_embedded.shape

(3565, 3)

In [19]:
# np.savetxt('topic_tsne_vectors.txt', X_embedded)

In [20]:
name2color = {}
for i in list(set(X_categ)):
    name2color[i] = len(name2color)
X_colors = np.array([name2color[i] for i in X_categ])

In [21]:
trace = go.Scatter3d(
    x = X_embedded[:, 0],
    y = X_embedded[:, 1],
    z = X_embedded[:, 2],
    mode = 'markers',
    marker = {
        'color': X_colors,
        'colorscale':'Viridis',
    },
    text=Y
)

In [22]:
data = [trace]

In [23]:
py.iplot(data)

In [24]:
source = ColumnDataSource(data=dict(X=X_embedded[:, 0], Y=X_embedded[:, 1], labels=Y, cat=X_categ))

In [25]:
hover = HoverTool(tooltips=[
    ("labels", "@labels"),
    ("cat", "@cat")
])

In [26]:
TOOLS="crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"
p = figure(title='Diseases', tools=[hover, TOOLS], plot_width=600, plot_height=500)

In [27]:
palette = viridis(len(set(X_categ)))
color_map = CategoricalColorMapper(factors=list(set(X_categ)), palette=palette)

In [28]:
p.scatter(x='X', y='Y', color={'field': 'cat', 'transform': color_map}, source=source)

In [29]:
# output_file("vis.html")
# save(p)

In [30]:
show(p)

## Data for visualization

In [24]:
colors = viridis(len(set(X_categ)))
name2color_bokeh = {}
for i in list(set(X_categ)):
    name2color_bokeh[i] = colors[len(name2color_bokeh)]

In [25]:
vis_data = pd.DataFrame()
vis_data['name'] = Y
vis_data['categ'] = X_categ
vis_data['X'] = X_embedded[:, 0]
vis_data['Y'] = X_embedded[:, 1]
vis_data['Z'] = X_embedded[:, 2]
vis_data['color'] = vis_data['categ'].apply(lambda x: name2color_bokeh[x])

In [26]:
vis_data.head()

Unnamed: 0,name,categ,X,Y,Z,color
0,Hallux valgus,Болезни ОДС и травмы,-14.711287,5.173281,0.08706,#30678D
1,HELLP-синдром,Женские болезни,7.911613,-1.022318,-8.712523,#48196B
2,Абдоминальная мигрень,Нервные болезни,-1.937836,-2.5802,-12.138154,#440154
3,Абиотрофия сетчатки,Глазные болезни,-3.191781,-11.44012,-2.863823,#E7E419
4,Абсанс,Нервные болезни,-2.348987,-4.450676,-10.594405,#440154


In [27]:
# vis_data.to_csv('vis_data_3d.csv', index=False)

## Graph

In [None]:
graph = nx.Graph()

In [None]:
graph.add_nodes_from(Y)

In [None]:
for i in tqdm(range(Y.shape[0])):
    top = getSim(X[i])
    
    for j in top:
        graph.add_edge(Y[i], j)

In [None]:
# plt.figure(figsize=(25,25))
# options = {
#     'edge_color': '#FFDEA2',
#     'width': 1,
#     'with_labels': True,
#     'font_weight': 'regular',
# }

# nx.draw(graph, pos=nx.spring_layout(graph, k=0.25, iterations=50), **options)
# plt.savefig("Graph.png", format="PNG")