In [1]:
import warnings
warnings.filterwarnings("ignore")

import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import os

import os
import pandas as pd
import numpy as np
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec, LineSentence
from pprint import pprint
from copy import deepcopy
from multiprocessing import cpu_count
import gensim
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

from tqdm import tqdm

from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

from sklearn.manifold import TSNE
from sklearn import cluster

import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from bokeh.io import output_notebook
from bokeh.palettes import viridis
from bokeh.plotting import figure, show, output_file, save
from bokeh.models import HoverTool, CategoricalColorMapper
from bokeh.models import ColumnDataSource, Range1d, LabelSet, Label

%matplotlib inline
output_notebook()

Instructions for updating:
Use the retry module or similar alternatives.


In [3]:
class SiameseNetwork:
    def __init__(self):
        self.x1 = tf.placeholder(tf.float32, [None, 250])
        self.x2 = tf.placeholder(tf.float32, [None, 250])

        with tf.variable_scope("siamese") as scope:
            self.o1 = self.network(self.x1)
            scope.reuse_variables()
            self.o2 = self.network(self.x2)
        
        self.y_ = tf.placeholder(tf.float32, [None])
        self.loss = self.loss_with_spring()

    def network(self, x):
        weights = []
        fc1 = self.fc_layer(x, 128, "fc1")
        ac1 = tf.nn.relu(fc1)
        fc2 = self.fc_layer(ac1, 64, "fc2")
        ac2 = tf.nn.relu(fc2)
        fc3 = self.fc_layer(ac2, 3, "fc3")
        return fc3

    def fc_layer(self, bottom, n_weight, name):
        assert len(bottom.get_shape()) == 2
        n_prev_weight = bottom.get_shape()[1]
        initer = tf.truncated_normal_initializer(stddev=0.01)
        W = tf.get_variable(name+'W', dtype=tf.float32, shape=[n_prev_weight, n_weight], initializer=initer)
        b = tf.get_variable(name+'b', dtype=tf.float32, initializer=tf.constant(0.01, shape=[n_weight], dtype=tf.float32))
        fc = tf.nn.bias_add(tf.matmul(bottom, W), b)
        return fc

    def loss_with_spring(self):
        margin = 5.0
        labels_t = self.y_
        labels_f = tf.subtract(1.0, self.y_, name="1-yi")
        eucd2 = tf.pow(tf.subtract(self.o1, self.o2), 2)
        eucd2 = tf.reduce_sum(eucd2, 1)
        eucd = tf.sqrt(eucd2+1e-6, name="eucd")
        C = tf.constant(margin, name="C")
        # yi*||CNN(p1i)-CNN(p2i)||^2 + (1-yi)*max(0, C-||CNN(p1i)-CNN(p2i)||^2)
        pos = tf.multiply(labels_t, eucd2, name="yi_x_eucd2")
        # neg = tf.multiply(labels_f, tf.subtract(0.0,eucd2), name="yi_x_eucd2")
        # neg = tf.multiply(labels_f, tf.maximum(0.0, tf.subtract(C,eucd2)), name="Nyi_x_C-eucd_xx_2")
        neg = tf.multiply(labels_f, tf.pow(tf.maximum(tf.subtract(C, eucd), 0), 2), name="Nyi_x_C-eucd_xx_2")
        losses = tf.add(pos, neg, name="losses")
        loss = tf.reduce_mean(losses, name="loss")
        return loss

    def loss_with_step(self):
        margin = 5.0
        labels_t = self.y_
        labels_f = tf.subtract(1.0, self.y_, name="1-yi")          # labels_ = !labels;
        eucd2 = tf.pow(tf.subtract(self.o1, self.o2), 2)
        eucd2 = tf.reduce_sum(eucd2, 1)
        eucd = tf.sqrt(eucd2+1e-6, name="eucd")
        C = tf.constant(margin, name="C")
        pos = tf.multiply(labels_t, eucd, name="y_x_eucd")
        neg = tf.multiply(labels_f, tf.maximum(0.0, tf.subtract(C, eucd)), name="Ny_C-eucd")
        losses = tf.add(pos, neg, name="losses")
        loss = tf.reduce_mean(losses, name="loss")
        return loss   

## Data

In [4]:
X_topic = []
Y_topic = []

X_path = 'topic_vectors.txt'
Y_path = 'topic_names.txt'

if(os.path.isfile(X_path) and os.path.isfile(Y_path)):
    print('load from .txt file')
    X_topic = np.loadtxt(X_path)
    with open(Y_path, 'r') as f:
        for line in f.readlines():
            Y_topic.append(line.replace('\n',''))
    Y_topic = np.array(Y_topic)

load from .txt file


In [5]:
bolezni = pd.read_csv('../parse-html/bolezni.csv')
X_categ = []
for i in bolezni['topics']:
    toks = i.split(' $ ')
    X_categ.append(toks[0])
X_categ = np.array(X_categ)

name2topic = {}
for i in range(X_topic.shape[0]):
    name2topic[str(Y_topic[i])] = str(X_categ[i])

In [6]:
name2vector = {}
for i in range(X_topic.shape[0]):
    name2vector[str(Y_topic[i])] = X_topic[i]

In [7]:
siamese_data = pd.read_csv('siamese_data.csv')

In [8]:
emb_size = 250
X = np.zeros((siamese_data.shape[0], 2, emb_size))
y = np.array(siamese_data['target'].values)

In [9]:
for d, i in siamese_data.iterrows():
    X[d, 0, :] = name2vector[i[0]]
    X[d, 1, :] = name2vector[i[1]]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [11]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((40270, 2, 250), (19835, 2, 250), (40270,), (19835,))

In [12]:
np.bincount(y_train), np.bincount(y_test)

(array([37881,  2389]), array([18659,  1176]))

## Model

In [13]:
sess = tf.InteractiveSession()

In [14]:
siamese = SiameseNetwork()

In [15]:
train_step = tf.train.AdamOptimizer(0.001).minimize(siamese.loss)
saver = tf.train.Saver()
tf.global_variables_initializer().run()

In [16]:
for step in range(10000):
    _, loss_v = sess.run([train_step, siamese.loss], feed_dict={
                        siamese.x1: X[:,0,:],
                        siamese.x2: X[:,1,:],
                        siamese.y_: y})

    if np.isnan(loss_v):
        print('Model diverged with loss = NaN')
        quit()

    if(step % 10 == 0):
        print('step %d: loss %.3f' % (step, loss_v))

    if(step % 1000 == 0 and step > 0):
        saver.save(sess, './model')
#         embed = siamese.o1.eval({siamese.x1: mnist.test.images})
#         embed.tofile('embed.txt')

step 0: loss 23.507
step 10: loss 22.575
step 20: loss 16.291
step 30: loss 6.739
step 40: loss 5.951
step 50: loss 5.725
step 60: loss 5.240
step 70: loss 4.560
step 80: loss 3.466
step 90: loss 2.936
step 100: loss 2.885
step 110: loss 2.806
step 120: loss 2.723
step 130: loss 2.663
step 140: loss 2.603
step 150: loss 2.547
step 160: loss 2.497
step 170: loss 2.455
step 180: loss 2.420
step 190: loss 2.389
step 200: loss 2.359
step 210: loss 2.328
step 220: loss 2.296
step 230: loss 2.264
step 240: loss 2.232
step 250: loss 2.202
step 260: loss 2.176
step 270: loss 2.152
step 280: loss 2.131
step 290: loss 2.113
step 300: loss 2.095
step 310: loss 2.077
step 320: loss 2.058
step 330: loss 2.037
step 340: loss 2.016
step 350: loss 1.994
step 360: loss 1.973
step 370: loss 1.952
step 380: loss 1.932
step 390: loss 1.912
step 400: loss 1.891
step 410: loss 1.872
step 420: loss 1.852
step 430: loss 1.832
step 440: loss 1.811
step 450: loss 1.787
step 460: loss 1.757
step 470: loss 1.715


KeyboardInterrupt: 

In [17]:
embed = siamese.o1.eval({siamese.x1: X_topic})

In [18]:
embed.shape

(3565, 3)

In [19]:
X_embedded = embed

In [20]:
X_embedded.shape

(3565, 3)

In [20]:
source = ColumnDataSource(data=dict(X=X_embedded[:, 0], Y=X_embedded[:, 1], labels=Y_topic, cat=X_categ))

In [21]:
hover = HoverTool(tooltips=[
    ("labels", "@labels"),
    ("cat", "@cat")
])

In [22]:
TOOLS="crosshair,pan,wheel_zoom,zoom_in,zoom_out,box_zoom,undo,redo,reset,tap,save,box_select,poly_select,lasso_select,"
p = figure(title='Diseases', tools=[hover, TOOLS], plot_width=600, plot_height=500)

In [23]:
palette = viridis(len(set(X_categ)))
color_map = CategoricalColorMapper(factors=list(set(X_categ)), palette=palette)

In [24]:
p.scatter(x='X', y='Y', color={'field': 'cat', 'transform': color_map}, source=source)

In [25]:
show(p)

## Data for Dash

In [21]:
colors = viridis(len(set(X_categ)))
name2color_bokeh = {}
for i in list(set(X_categ)):
    name2color_bokeh[i] = colors[len(name2color_bokeh)]

In [22]:
vis_data = pd.DataFrame()
vis_data['name'] = Y_topic
vis_data['categ'] = X_categ
vis_data['X'] = X_embedded[:, 0]
vis_data['Y'] = X_embedded[:, 1]
vis_data['Z'] = X_embedded[:, 2]
vis_data['color'] = vis_data['categ'].apply(lambda x: name2color_bokeh[x])

In [23]:
vis_data.head()

Unnamed: 0,name,categ,X,Y,Z,color
0,Hallux valgus,Болезни ОДС и травмы,-5.077133,-5.757468,-0.827155,#443982
1,HELLP-синдром,Женские болезни,3.681569,-12.854797,-3.708415,#460C5F
2,Абдоминальная мигрень,Нервные болезни,0.273175,-3.364479,6.76651,#BADE27
3,Абиотрофия сетчатки,Глазные болезни,6.142817,-2.510729,5.723158,#462F7C
4,Абсанс,Нервные болезни,-1.427816,-1.204047,6.859466,#BADE27


In [24]:
# vis_data.to_csv('vis_data_siamese_3d.csv', index=False)