In [57]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb
import numpy as np
max_features = 20000
# cut texts after this number of words (among top max_features most common words)
maxlen = 80
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)


In [58]:
# A dictionary mapping words to an integer index
word_index = imdb.get_word_index()

# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] = 0
word_index["<START>"] = 1
word_index["<UNK>"] = 2  # unknown
word_index["<UNUSED>"] = 3

reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [59]:
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import scipy
import math
import os
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

tf.logging.set_verbosity(tf.logging.ERROR)
embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")

In [60]:
X = [decode_review(x) for x in x_train]

In [61]:
X_test = [decode_review(x) for x in x_test]

In [62]:
sentences = tf.placeholder(dtype=tf.string, shape=[None])
embedding_fun = embed(sentences)
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    context_embed = sess.run(embedding_fun, feed_dict={sentences: X})

In [63]:
with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    context_embed_test = sess.run(embedding_fun, feed_dict={sentences: X_test})

In [64]:
context_embed.shape

(25000, 512)

In [65]:
from annoy import AnnoyIndex
f = 512
t = AnnoyIndex(f, 'euclidean')  # Length of item vector that will be indexed
for i in range(len(context_embed)):
    v = context_embed[i]
    t.add_item(i, v)

t.build(10) # 10 trees

True

In [20]:
t.get_nns_by_item(11, 10, search_k=-1, include_distances=False)

[11, 3591, 12255, 5261, 21721, 4362, 24156, 17745, 18086, 3463]

In [49]:
result = t.get_nns_by_vector(context_embed_test[0], 10, search_k=-1, include_distances=False)
print(result)

[23846, 10785, 16767, 9178, 24976, 9201, 7767, 13811, 22644, 3756]


In [42]:
y_test[3]

0

In [50]:
sum(y_train[result]==0)

8

In [55]:
suc_count =0 
for i in range(len(x_test)):
    result = t.get_nns_by_vector(context_embed_test[i], 21, search_k=-1, include_distances=False)
    label_result = y_train[result]
    num_zero = sum(label_result==0)
    pred_label = 0
    if num_zero<=10:
        pred_label = 1
    if (pred_label == y_test[i]):
        suc_count += 1
    else:
        print(i,pred_label,y_test[i])

8 1 0
10 0 1
25 0 1
26 1 0
27 0 1
28 0 1
31 0 1
32 1 0
33 1 0
39 1 0
46 1 0
49 0 1
52 0 1
53 0 1
56 0 1
59 1 0
70 0 1
78 1 0
83 0 1
91 0 1
95 1 0
96 0 1
105 1 0
115 1 0
125 0 1
126 0 1
130 0 1
135 1 0
136 1 0
139 1 0
140 0 1
143 1 0
144 1 0
152 1 0
156 1 0
164 0 1
166 1 0
167 0 1
172 1 0
177 1 0
178 1 0
180 1 0
184 0 1
185 1 0
187 0 1
194 1 0
199 1 0
202 1 0
206 0 1
210 0 1
214 1 0
220 1 0
224 0 1
225 0 1
231 0 1
242 0 1
244 0 1
247 1 0
260 1 0
263 0 1
264 0 1
266 0 1
267 1 0
276 1 0
279 1 0
280 0 1
282 0 1
283 0 1
293 1 0
296 0 1
297 1 0
299 1 0
300 0 1
301 0 1
302 1 0
306 1 0
316 1 0
320 1 0
325 0 1
331 0 1
334 1 0
335 1 0
336 1 0
343 0 1
345 0 1
347 0 1
348 0 1
349 0 1
352 1 0
355 0 1
360 1 0
363 0 1
366 0 1
374 0 1
376 1 0
378 0 1
379 1 0
383 1 0
384 0 1
385 0 1
390 1 0
394 1 0
396 0 1
399 0 1
400 1 0
403 1 0
407 1 0
408 0 1
411 0 1
419 0 1
420 1 0
422 1 0
433 1 0
434 0 1
438 1 0
443 1 0
449 1 0
453 1 0
461 0 1
468 1 0
469 0 1
471 1 0
472 0 1
476 1 0
477 0 1
479 0 1
480 1 0
483 0 1

3808 0 1
3809 0 1
3813 0 1
3816 1 0
3819 0 1
3824 0 1
3826 0 1
3842 1 0
3844 1 0
3845 1 0
3850 0 1
3851 1 0
3857 1 0
3860 1 0
3861 0 1
3864 1 0
3871 0 1
3878 1 0
3880 1 0
3883 1 0
3887 1 0
3890 0 1
3891 0 1
3894 1 0
3898 1 0
3901 1 0
3903 1 0
3904 0 1
3908 1 0
3909 1 0
3910 0 1
3913 1 0
3916 0 1
3917 1 0
3918 1 0
3920 1 0
3923 0 1
3928 0 1
3929 1 0
3930 1 0
3933 0 1
3937 0 1
3938 0 1
3939 1 0
3940 1 0
3946 1 0
3953 1 0
3955 1 0
3956 0 1
3958 0 1
3960 0 1
3963 1 0
3964 0 1
3967 1 0
3968 1 0
3969 0 1
3972 1 0
3984 1 0
3985 1 0
3987 0 1
3992 1 0
3993 0 1
3994 0 1
3995 0 1
3998 0 1
4010 1 0
4018 0 1
4022 0 1
4024 1 0
4025 1 0
4027 0 1
4045 0 1
4046 1 0
4047 0 1
4048 0 1
4063 1 0
4071 0 1
4075 0 1
4077 1 0
4079 1 0
4082 1 0
4085 0 1
4092 1 0
4093 1 0
4104 0 1
4109 1 0
4111 1 0
4116 0 1
4118 1 0
4127 1 0
4131 1 0
4132 1 0
4135 1 0
4138 0 1
4139 1 0
4145 1 0
4147 1 0
4148 1 0
4153 1 0
4154 1 0
4155 1 0
4157 0 1
4159 0 1
4160 0 1
4169 1 0
4179 1 0
4188 1 0
4189 1 0
4194 1 0
4195 0 1
4206 1 0
4

7652 1 0
7654 0 1
7659 1 0
7660 0 1
7663 1 0
7667 0 1
7668 1 0
7672 1 0
7674 0 1
7680 1 0
7686 0 1
7689 1 0
7692 1 0
7697 1 0
7702 1 0
7710 0 1
7712 1 0
7717 0 1
7718 1 0
7722 0 1
7730 1 0
7734 0 1
7741 1 0
7751 0 1
7762 1 0
7765 1 0
7766 0 1
7767 0 1
7772 0 1
7773 1 0
7775 1 0
7782 1 0
7784 0 1
7785 1 0
7790 1 0
7795 0 1
7799 0 1
7801 1 0
7814 1 0
7817 0 1
7824 0 1
7828 0 1
7829 1 0
7832 0 1
7834 1 0
7838 0 1
7843 1 0
7846 1 0
7855 0 1
7857 1 0
7860 1 0
7863 0 1
7869 1 0
7874 1 0
7888 0 1
7889 0 1
7890 1 0
7893 1 0
7899 1 0
7900 1 0
7901 0 1
7903 0 1
7906 0 1
7917 1 0
7920 0 1
7923 1 0
7925 1 0
7926 0 1
7927 1 0
7931 1 0
7937 1 0
7940 0 1
7947 0 1
7949 1 0
7953 0 1
7955 0 1
7966 0 1
7967 1 0
7969 0 1
7974 0 1
7975 0 1
7981 1 0
7984 0 1
7987 0 1
8003 1 0
8008 0 1
8011 0 1
8015 1 0
8018 1 0
8019 0 1
8020 1 0
8022 1 0
8026 1 0
8029 0 1
8033 0 1
8036 1 0
8037 1 0
8042 0 1
8043 1 0
8049 1 0
8052 1 0
8053 1 0
8056 1 0
8057 0 1
8058 1 0
8060 0 1
8061 0 1
8063 1 0
8067 0 1
8068 0 1
8074 1 0
8

11441 0 1
11442 1 0
11443 1 0
11448 1 0
11455 0 1
11459 0 1
11461 1 0
11462 0 1
11463 1 0
11465 0 1
11470 0 1
11473 0 1
11478 0 1
11479 1 0
11480 1 0
11481 0 1
11484 1 0
11488 0 1
11492 1 0
11496 1 0
11498 1 0
11505 1 0
11506 0 1
11508 0 1
11523 0 1
11532 0 1
11539 0 1
11545 0 1
11546 0 1
11551 1 0
11552 0 1
11555 0 1
11564 0 1
11566 1 0
11567 0 1
11569 1 0
11570 1 0
11571 1 0
11572 0 1
11573 0 1
11577 1 0
11579 0 1
11584 0 1
11586 1 0
11588 1 0
11595 0 1
11596 1 0
11598 1 0
11600 1 0
11606 1 0
11608 1 0
11624 0 1
11626 1 0
11633 1 0
11635 0 1
11636 1 0
11648 1 0
11657 0 1
11665 1 0
11666 1 0
11674 1 0
11675 0 1
11677 0 1
11679 0 1
11680 0 1
11682 1 0
11684 0 1
11688 1 0
11692 1 0
11694 1 0
11696 1 0
11698 1 0
11705 0 1
11707 1 0
11709 1 0
11714 0 1
11718 0 1
11720 1 0
11721 1 0
11725 1 0
11727 0 1
11730 1 0
11739 1 0
11746 1 0
11756 0 1
11762 1 0
11765 1 0
11767 0 1
11771 1 0
11775 0 1
11789 0 1
11795 1 0
11801 0 1
11810 1 0
11814 1 0
11827 0 1
11829 0 1
11834 1 0
11839 0 1
11843 0 1


15208 1 0
15214 1 0
15217 0 1
15224 0 1
15226 1 0
15228 0 1
15236 1 0
15240 1 0
15242 0 1
15244 0 1
15250 0 1
15260 1 0
15261 1 0
15267 0 1
15275 0 1
15278 1 0
15281 1 0
15284 0 1
15287 1 0
15291 0 1
15293 1 0
15295 1 0
15303 0 1
15304 0 1
15306 1 0
15307 0 1
15312 0 1
15313 0 1
15317 1 0
15319 1 0
15323 0 1
15325 1 0
15333 1 0
15334 0 1
15338 1 0
15339 1 0
15345 0 1
15346 1 0
15350 0 1
15353 0 1
15354 1 0
15355 1 0
15357 1 0
15361 1 0
15365 1 0
15367 1 0
15368 1 0
15372 1 0
15375 1 0
15378 1 0
15379 1 0
15386 0 1
15388 1 0
15394 1 0
15395 1 0
15398 1 0
15404 1 0
15405 1 0
15408 1 0
15409 1 0
15419 1 0
15432 1 0
15439 0 1
15441 1 0
15442 0 1
15446 0 1
15448 1 0
15449 1 0
15451 0 1
15458 0 1
15463 1 0
15466 1 0
15469 1 0
15470 1 0
15474 1 0
15479 1 0
15484 0 1
15502 0 1
15509 1 0
15510 1 0
15515 1 0
15518 1 0
15519 1 0
15520 1 0
15526 0 1
15533 0 1
15540 0 1
15546 0 1
15550 1 0
15554 0 1
15557 1 0
15563 0 1
15572 1 0
15576 1 0
15579 0 1
15583 1 0
15590 1 0
15596 1 0
15608 0 1
15613 0 1


19002 1 0
19009 1 0
19011 1 0
19014 0 1
19015 1 0
19017 1 0
19019 0 1
19020 1 0
19031 0 1
19036 0 1
19040 1 0
19047 1 0
19049 0 1
19053 1 0
19062 0 1
19064 1 0
19066 1 0
19069 1 0
19073 1 0
19074 1 0
19080 1 0
19083 1 0
19084 1 0
19086 1 0
19087 1 0
19094 0 1
19095 0 1
19097 1 0
19102 1 0
19109 1 0
19112 0 1
19113 1 0
19119 0 1
19123 0 1
19124 1 0
19126 0 1
19133 1 0
19141 1 0
19146 1 0
19152 1 0
19159 1 0
19164 0 1
19168 1 0
19175 0 1
19177 1 0
19179 1 0
19181 0 1
19183 1 0
19187 0 1
19194 1 0
19195 0 1
19196 0 1
19203 1 0
19206 1 0
19209 0 1
19212 0 1
19217 1 0
19220 1 0
19223 0 1
19224 1 0
19225 1 0
19226 0 1
19230 0 1
19239 0 1
19243 1 0
19249 0 1
19259 0 1
19260 0 1
19261 0 1
19268 0 1
19272 0 1
19273 0 1
19276 1 0
19280 1 0
19283 1 0
19286 1 0
19296 1 0
19299 1 0
19303 0 1
19308 1 0
19309 0 1
19322 1 0
19329 1 0
19331 1 0
19333 0 1
19336 0 1
19339 0 1
19341 0 1
19343 1 0
19345 0 1
19346 0 1
19348 0 1
19349 0 1
19353 1 0
19358 1 0
19362 1 0
19368 0 1
19369 0 1
19370 1 0
19372 0 1


22771 0 1
22774 1 0
22776 0 1
22777 1 0
22779 1 0
22781 1 0
22782 0 1
22786 0 1
22790 0 1
22792 0 1
22794 0 1
22797 0 1
22805 0 1
22807 0 1
22811 0 1
22818 0 1
22823 1 0
22825 0 1
22830 0 1
22834 0 1
22835 1 0
22838 1 0
22841 0 1
22844 1 0
22849 1 0
22851 1 0
22854 0 1
22857 0 1
22860 1 0
22862 1 0
22868 1 0
22870 1 0
22872 0 1
22875 1 0
22878 1 0
22889 1 0
22890 0 1
22891 0 1
22892 0 1
22899 1 0
22905 0 1
22912 0 1
22918 0 1
22929 1 0
22930 0 1
22932 1 0
22937 1 0
22938 0 1
22939 0 1
22943 1 0
22948 1 0
22949 0 1
22952 1 0
22954 0 1
22965 0 1
22972 1 0
22973 1 0
22976 1 0
22977 0 1
22978 0 1
22995 0 1
22997 0 1
23007 0 1
23009 1 0
23016 0 1
23025 0 1
23027 0 1
23030 0 1
23032 1 0
23034 0 1
23035 0 1
23036 0 1
23042 1 0
23049 0 1
23057 1 0
23059 1 0
23060 0 1
23061 1 0
23073 0 1
23078 1 0
23085 1 0
23089 0 1
23091 1 0
23097 0 1
23102 1 0
23110 1 0
23112 0 1
23118 1 0
23119 1 0
23128 0 1
23130 1 0
23146 0 1
23155 0 1
23164 1 0
23166 1 0
23170 0 1
23172 0 1
23175 1 0
23179 0 1
23181 1 0


In [53]:
len(x_test)

25000

In [56]:
suc_count/25000

0.7508