In [0]:
from keras.datasets import imdb
from keras import preprocessing
from keras.models import Sequential
from keras.layers import Flatten, Dense
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [8]:
!wget ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar xzvf aclImdb_v1.tar.gz >/dev/null

import os

imdb_dir = '/content/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

# Parallel lists: texts[k] is the raw review string and labels[k] is its
# sentiment label (0 for the 'neg' directory, 1 for the 'pos' directory).
labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting makes the order of
    # `texts` (and anything derived from it) the same on every run.
    for fname in sorted(os.listdir(dir_name)):
        if not fname.endswith('.txt'):
            continue
        # The context manager closes the file even if read() raises. The
        # encoding is pinned so the cell does not depend on the platform
        # default (presumably UTF-8 on the runtime this notebook was run on).
        with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
            texts.append(f.read())
        labels.append(0 if label_type == 'neg' else 1)

# Every review is represented by exactly `maxlen` token ids. Note that
# pad_sequences' default truncating='pre' keeps the LAST `maxlen` tokens of a
# longer review, and its default padding='pre' left-pads shorter ones with 0.
maxlen = 100
training_samples = 200  # We will be training on 200 samples
validation_samples = 10000  # We will be validating on 10000 samples
max_words = 10000  # We will only consider the top 10,000 words in the dataset
shuffle_seed = 42  # fixed seed so the train/validation split is reproducible

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word_index covers every token seen in `texts`, not only the top `max_words`.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Slicing past the end would silently yield a smaller split, so fail loudly.
assert training_samples + validation_samples <= data.shape[0], \
    'not enough samples for the requested train/validation split'

# Split the data into a training set and a validation set.
# But first, shuffle the data, since we started from data
# where samples are ordered (all negative first, then all positive).
# A dedicated, seeded RandomState gives the same permutation on every run
# without touching NumPy's global random state.
indices = np.random.RandomState(shuffle_seed).permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples: training_samples + validation_samples]

--2018-12-11 23:27:43--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz.1’


2018-12-11 23:27:47 (17.7 MB/s) - ‘aclImdb_v1.tar.gz.1’ saved [84125825/84125825]

Found 88582 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


In [9]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip >/dev/null

glove_dir = '/content'

# embeddings_index maps each token in the GloVe file to its coefficient vector
# (a float32 NumPy array). Each line of the file is "<token> <c1> <c2> ...".
embeddings_index = {}
glove_path = os.path.join(glove_dir, 'glove.6B.100d.txt')
# The context manager closes the file even if parsing fails part-way. The
# encoding is pinned so parsing does not depend on the platform default
# (presumably UTF-8 on the runtime this notebook was run on).
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

--2018-12-11 23:29:52--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2018-12-11 23:29:52--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip.1’


2018-12-11 23:30:32 (20.9 MB/s) - ‘glove.6B.zip.1’ saved [862182613/862182613]

replace glove.6B.50d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
Found 400000 word vectors.


In [0]:
embedding_dim = 100

# Row k of the matrix holds the pretrained vector of the word whose tokenizer
# index is k. Rows for words missing from the embedding index stay all-zeros.
embedding_matrix = np.zeros((max_words, embedding_dim))
# Words that are inside the `max_words` index range AND have a pretrained
# vector, in word_index iteration order.
words = []

for token, token_id in word_index.items():
    if token_id >= max_words:
        continue  # outside the index range covered by the matrix
    pretrained = embeddings_index.get(token)
    if pretrained is None:
        continue  # no pretrained vector: leave the zero row untouched
    embedding_matrix[token_id] = pretrained
    words.append(token)

In [0]:
def find_synonym(word, rank=1, vec=False):
    """Return the vocabulary word whose vector is `rank`-th closest to `word`.

    Distances are Euclidean (L2) and are measured against every entry of the
    module-level ``words`` list, using the vectors stored in the module-level
    ``embeddings_index`` dictionary.

    Args:
        word: The token to look up in ``embeddings_index``, or, when ``vec``
            is true, the query vector itself.
        rank: Position in the ascending-distance ordering. A query word that
            is itself in ``words`` is at distance 0 from itself, so rank 0 is
            normally the word itself and the default of 1 is its nearest
            neighbour. Negative values index from the farthest word.
        vec: When true, ``word`` is treated as a vector rather than a token.

    Returns:
        The entry of ``words`` at the requested rank. Words at exactly the
        same distance are ranked in their ``words`` order.

    Raises:
        KeyError: ``vec`` is false and ``word`` has no embedding vector.
        IndexError: ``words`` is empty or ``rank`` is out of range.
    """
    if vec:
        word_vec = np.asarray(word)
    else:
        word_vec = embeddings_index.get(word)
        if word_vec is None:
            # Fail with a clear message instead of an obscure TypeError from
            # the `None - ndarray` subtraction further down.
            raise KeyError('no embedding vector for word %r' % (word,))
    if not words:
        raise IndexError('vocabulary is empty')

    # One vectorised distance computation instead of a Python-level loop of
    # np.linalg.norm calls (one call per vocabulary word).
    candidates = np.asarray([embeddings_index[other_word] for other_word in words])
    distances = np.linalg.norm(candidates - word_vec, axis=1)

    # Rank by position, not by value: sorting the distances and then searching
    # for the value returns the first of several tied words for every tied
    # rank. A stable argsort gives each tied word its own rank.
    order = np.argsort(distances, kind='mergesort')
    return words[int(order[rank])]


In [0]:
# Map every word in `words` to the result of find_synonym with its default rank.
pairs = {word: find_synonym(word) for word in words}

In [13]:
# Bare last expression: renders the whole `pairs` dictionary as the cell output.
pairs

{'the': 'part',
 'and': 'well',
 'a': 'another',
 'of': 'the',
 'to': 'would',
 'is': 'now',
 'br': 'er',
 'in': 'since',
 'it': 'this',
 'i': 'you',
 'this': 'it',
 'that': 'not',
 'was': 'being',
 'as': 'well',
 'for': 'as',
 'with': 'over',
 'movie': 'film',
 'but': 'because',
 'film': 'movie',
 'on': 'the',
 'not': 'would',
 'you': 'know',
 'are': 'have',
 'his': 'her',
 'have': 'they',
 'he': 'she',
 'be': 'not',
 'one': 'only',
 'all': 'those',
 'at': 'close',
 'by': 'with',
 'an': 'another',
 'they': 'have',
 'who': 'whom',
 'so': 'even',
 'from': 'came',
 'like': 'even',
 'her': 'she',
 'or': 'either',
 'just': 'going',
 'about': 'some',
 'out': 'up',
 'has': 'also',
 'if': 'because',
 'some': 'many',
 'there': 'no',
 'what': 'how',
 'good': 'better',
 'more': 'than',
 'when': 'then',
 'very': 'quite',
 'up': 'down',
 'no': 'there',
 'time': 'when',
 'she': 'he',
 'even': 'though',
 'my': 'your',
 'would': 'could',
 'which': 'that',
 'only': 'one',
 'story': 'stories',
 'really