## Skipgrams in Keras

Here we will implement skipgrams in `Keras`. Skipgrams are a technique widely used in the field of speech processing, whereby n-grams are formed (bi-grams, tri-grams, etc.), but in addition to allowing adjacent sequences of words, we allow tokens to be “skipped”. With the skipgram model we try to predict the context words (output) from a target word (input). Conversely, with continuous bag of words (CBOW) we aim to predict a word given its context.

### Loading libraries and preprocessing data

In [1]:
import warnings
warnings.simplefilter("ignore")

from __future__ import print_function, division
import pandas as pd 
import numpy as np
import random
%matplotlib inline

from nltk import sent_tokenize

# Seed every RNG the notebook relies on. Keras' `skipgrams` shuffles its
# pairs with the stdlib `random` module, so seeding NumPy alone leaves the
# order of the training pairs different on every run.
np.random.seed(13)
random.seed(13)
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Reshape, Activation, Lambda
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import model_to_dot 
from keras.preprocessing.sequence import skipgrams

Using TensorFlow backend.


In [2]:
# Alice in Wonderland (Project Gutenberg plain-text edition)
path = get_file('carrol-alice.txt', origin="http://www.gutenberg.org/files/11/11-0.txt")
# Read through a context manager so the file handle is closed immediately
# instead of lingering until garbage collection.
with open(path, encoding="utf8") as corpus_file:
    corpus = corpus_file.read()

In [3]:
# Split document into sentences
# NOTE: this rebinds `corpus`, so re-running the cell without re-running the
# loading cell strips one more leading paragraph each time.
corpus = corpus[corpus.index('\n\n')+2:]  # remove header.
sentences = sent_tokenize(corpus)

# Tokenize using Keras
base_filter='!"#$%&()*+,-./:;`<=>?@[\\]^_{|}~\t\n' + "'"
tokenizer = Tokenizer(filters=base_filter)
tokenizer.fit_on_texts(sentences)

# Convert tokenized sentences to sequence format
sequences = tokenizer.texts_to_sequences(sentences)
# Total number of word tokens. This must iterate over `sequences`: iterating
# over `corpus` (a plain string) would count characters instead.
nb_samples = sum(len(s) for s in sequences)

print(len(sequences), tokenizer.document_count)

1093 1093


In [4]:
# Sanity check on one sentence: the raw text, the tokenizer's integer
# encoding, and the same encoding rebuilt by hand through `word_index`.
example_sentence = sentences[324]
print(example_sentence)
print(sequences[324])
print([tokenizer.word_index[token.lower().replace('.', '')]
       for token in example_sentence.split()])

The Caterpillar was the first to speak.
[1, 182, 13, 1, 98, 4, 330]
[1, 182, 13, 1, 98, 4, 330]


#### Skipgrams: Generating Input and Output Labels
Generate `X_train` and `y_train`

In [5]:
# Let's first see how Keras' skipgrams function works.

# With negative_samples=0 only true (target, context) pairs are returned.
couples, labels = skipgrams(sequences[324], len(tokenizer.word_index) + 1,
    window_size=2, negative_samples=0, shuffle=True,
    categorical=False, sampling_table=None)

# Reverse lookup: word index -> word.
index_2_word = {val: key for key, val in tokenizer.word_index.items()}

# Look the probe word up by name rather than hard-coding its index (13 in
# this vocabulary), so the cell still shows "was" if the vocabulary changes.
probe_index = tokenizer.word_index['was']
for w1, w2 in couples:
    if w1 == probe_index:
        print(index_2_word[w1], index_2_word[w2])

was the
was the
was first
was caterpillar


In [6]:
# Function to generate the inputs and outputs for all windows

# Hyper-parameters shared by the skipgram cells below.
window_size = 2  # context window passed to `skipgrams`
dim = 100        # dimension to reduce to (size of each word vector)
# One more than the number of known words, leaving room for index 0.
vocab_size = len(tokenizer.word_index) + 1

def generate_data(sequences, window_size, vocab_size):
    """Yield one (inputs, targets) skipgram training batch per sentence.

    Parameters
    ----------
    sequences : iterable of list of int
        Sentences encoded as word indices.
    window_size : int
        Context window size forwarded to `skipgrams`.
    vocab_size : int
        Vocabulary size forwarded to `skipgrams`; also the width of the
        one-hot targets.

    Yields
    ------
    X : np.ndarray, shape (n_pairs, 1)
        Index of the input word of every (input, output) pair.
    y : np.ndarray, shape (n_pairs, vocab_size)
        One-hot encoding of the matching output word.

    Sentences for which `skipgrams` returns no pairs are skipped.
    """
    for seq in sequences:
        couples, _ = skipgrams(
            seq, vocab_size,
            window_size=window_size, negative_samples=0, shuffle=True,
            categorical=False, sampling_table=None)
        if not couples:
            continue
        in_words, out_words = zip(*couples)
        X = np.array(in_words).reshape(-1, 1)
        # A single vectorised call already yields an (n_pairs, vocab_size)
        # matrix, so no per-pair encoding or follow-up reshape is needed.
        y = np_utils.to_categorical(out_words, vocab_size)
        yield X, y
        
# Stand-alone generator over the skipgram batches.
# NOTE(review): the training loop calls generate_data() itself on every
# iteration and never reads this object.
data_generator = generate_data(sequences=sequences,
                               window_size=window_size,
                               vocab_size=vocab_size)

### Skipgrams: Creating the Model
- Lastly, we create the (shallow) network!

In [7]:
# Create the Keras model: embedding lookup for the single input word,
# reshape to a flat vector of size `dim`, then a softmax over the vocabulary.
skipgram = Sequential([
    Embedding(input_dim=vocab_size, output_dim=dim,
              embeddings_initializer='glorot_uniform', input_length=1),
    Reshape((dim,)),
    Dense(input_dim=dim, units=vocab_size, activation='softmax'),
])
#SVG(model_to_dot(skipgram, show_shapes=True).create(prog='dot', format='svg'))

### Skipgrams: Compiling and Training
- Time to compile and train
- We use categorical cross-entropy, a common loss for classification

In [8]:
# Compile the Keras Model
# Adadelta is the optimizer actually used for training. No separate SGD
# instance is created here: one that is never handed to compile() only
# suggests a configuration that is not in effect.
skipgram.compile(loss='categorical_crossentropy', optimizer="adadelta")

# Fit the Skipgrams
# Each sentence is one batch; `loss` is the sum of the per-batch losses over
# one full pass, so it is comparable between iterations but is not a mean.
for iteration in range(10):
    loss = 0
    for x, y in generate_data(sequences, window_size, vocab_size):
        loss += skipgram.train_on_batch(x, y)
    print('iteration {}, loss is {}'.format(iteration, loss))

iteration 0, loss is 8478.365273475647
iteration 1, loss is 7775.790303230286
iteration 2, loss is 7389.946382522583
iteration 3, loss is 7170.007953643799
iteration 4, loss is 7037.916015148163
iteration 5, loss is 6950.1233451366425
iteration 6, loss is 6886.156407356262
iteration 7, loss is 6835.780858516693
iteration 8, loss is 6793.287910699844
iteration 9, loss is 6755.321224927902


### Skipgrams: Looking at the vectors

To get word_vectors now, we look at the weights of the first layer.

Let's also write functions giving us similarity of two words.

In [9]:
# The embedding matrix is the first array in the model's weight list.
skipgram_weights = skipgram.get_weights()
word_vectors = skipgram_weights[0]

from scipy.spatial.distance import cosine

def get_dist(w1, w2, word_index=None, vectors=None):
    """Return the cosine distance between the embeddings of two words.

    Parameters
    ----------
    w1, w2 : str
        Words to compare; each must be a key of `word_index`.
    word_index : dict, optional
        Maps a word to its row in `vectors`. Defaults to the notebook-level
        `tokenizer.word_index`, looked up at call time.
    vectors : array-like, optional
        Embedding matrix indexed by word index. Defaults to the
        notebook-level `word_vectors`.

    Returns
    -------
    float
        Result of `scipy.spatial.distance.cosine` on the two rows.

    Raises
    ------
    KeyError
        If either word is missing from `word_index`.

    Notes
    -----
    The defaults read notebook globals, and `tokenizer` is rebound by the
    CBOW section later in the notebook. Pass both arguments explicitly when
    calling this after that section has run.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if vectors is None:
        vectors = word_vectors
    return cosine(vectors[word_index[w1]], vectors[word_index[w2]])

def get_similarity(w1, w2):
    """Return the cosine similarity of two words (1 minus `get_dist`)."""
    distance = get_dist(w1, w2)
    return 1 - distance

def get_most_similar(w1, n=10):
    """Return the `n` vocabulary words most similar to `w1`.

    Every other key of `tokenizer.word_index` is scored with
    `get_similarity`; the result is a pandas Series (word -> similarity)
    sorted from most to least similar and cut to the first `n` entries.
    """
    scores = {}
    for candidate in tokenizer.word_index.keys():
        if candidate == w1:
            continue
        scores[candidate] = get_similarity(w1, candidate)
    ranked = pd.Series(scores).sort_values(ascending=False)
    return ranked.iloc[:n]


# Report one pairwise similarity, then the nearest neighbours of "queen".
king_queen_similarity = get_similarity('king', 'queen')
print('Similarity between king and queen:', king_queen_similarity)
print('')
queen_neighbours = get_most_similar('queen')
print('Most similar to queen:', queen_neighbours)

Similarity between king and queen: 0.9519025683403015

Most similar to queen: gryphon     0.969170
dormouse    0.966102
duchess     0.965379
march       0.957100
hatter      0.956557
king        0.951903
cat         0.945495
white       0.943481
mouse       0.939788
rabbit      0.938334
dtype: float64


### Create a CBOW Model

In [10]:
path = get_file('alice.txt', origin='http://www.gutenberg.org/files/11/11-0.txt')
# Keep only the first 300 lines; the context manager closes the file as soon
# as they have been read.
with open(path, encoding='utf-8') as corpus_file:
    corpus = corpus_file.readlines()[:300]
# Drop lines containing fewer than two spaces.
corpus = [sentence for sentence in corpus if sentence.count(' ') >= 2]

# NOTE: `tokenizer`, `corpus`, `nb_samples`, `dim` and `window_size` are
# rebound here, replacing the values used by the skipgram section above.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
corpus = tokenizer.texts_to_sequences(corpus)
nb_samples = sum(len(s) for s in corpus)  # total number of word tokens
V = len(tokenizer.word_index) + 1         # vocabulary size incl. index 0
dim = 100
window_size = 2

In [11]:
def generate_data_cbow(corpus, window_size, V):
    """Yield one (context, target) CBOW training example per word.

    For every word of every sequence in `corpus`, the context is made of the
    words at most `window_size` positions to its left and right (the word
    itself excluded, positions outside the sequence ignored).

    Yields
    ------
    x : array of shape (1, 2 * window_size)
        The context indices, passed through `pad_sequences` with
        `maxlen = 2 * window_size`.
    y : array
        `to_categorical` encoding of the centre word over `V` classes.
    """
    maxlen = 2 * window_size
    for words in corpus:
        n_words = len(words)
        for index, word in enumerate(words):
            # Clamp the window to the bounds of the sequence.
            first = max(index - window_size, 0)
            last = min(index + window_size + 1, n_words)
            context = [words[i] for i in range(first, last) if i != index]

            x = sequence.pad_sequences([context], maxlen=maxlen)
            y = np_utils.to_categorical([word], V)
            yield (x, y)

In [12]:
# CBOW network: embed the context words, average their vectors along the
# context axis, then predict the centre word with a softmax over V classes.
context_length = window_size*2
cbow = Sequential([
    Embedding(input_dim=V, output_dim=dim, input_length=context_length),
    Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim,)),
    Dense(V, activation='softmax'),
])
cbow.compile(loss='categorical_crossentropy', optimizer='adadelta')

In [13]:
# Ten passes over the data. Every (context, target) example is its own
# batch, and the printed value is the sum of the batch losses for the pass.
for ite in range(10):
    loss = 0.
    examples = generate_data_cbow(corpus, window_size, V)
    for x, y in examples:
        batch_loss = cbow.train_on_batch(x, y)
        loss += batch_loss
    print(ite, loss)

0 17415.323275089264
1 16149.458847522736
2 16020.484440684319
3 15928.479923248291
4 15818.480538725853
5 15715.989834606647
6 15625.664679527283
7 15544.10433664918
8 15466.378872230649
9 15390.164903387427


In [14]:
# Start the vectors file with its header line: "<number of words> <vector size>".
# The handle `f` is left open here; the next cell writes the vectors and closes it.
header = '{} {}\n'.format(V-1, dim)
f = open('vectors.txt', 'w', encoding='utf-8')
f.write(header)

8

In [15]:
# The embedding matrix is the first array in the CBOW model's weight list.
vectors = cbow.get_weights()[0]
# `f` is the handle opened (and given its header line) by the previous cell.
# try/finally guarantees it is closed even if a write fails part-way, so the
# file is never left open and half-flushed.
try:
    for word, i in tokenizer.word_index.items():
        # One line per word: the word followed by its space-separated vector.
        str_vec = ' '.join(map(str, vectors[i, :]))
        f.write('{} {}\n'.format(word, str_vec))
finally:
    f.close()

In [16]:
import gensim
# Load the text-format vectors file back through gensim.
vectors_path = './vectors.txt'
w2v = gensim.models.KeyedVectors.load_word2vec_format(vectors_path, binary=False)

In [17]:
# Nearest neighbours of "the" among the CBOW vectors loaded into `w2v`.
w2v.most_similar(positive=['the'])

[('this', 0.6434246897697449),
 ('a', 0.6397069096565247),
 ('any', 0.6323063373565674),
 ('nothing', 0.5520201921463013),
 ('no', 0.5337472558021545),
 ('one', 0.5324605703353882),
 ('shoes', 0.5167098641395569),
 ('tumbling', 0.5105654001235962),
 ('beds', 0.5061527490615845),
 ('out', 0.5025702714920044)]

In [18]:
# Nearest neighbours of "alice" among the CBOW vectors loaded into `w2v`.
w2v.most_similar(positive=['alice'])

[('please', 0.6076477766036987),
 ('she', 0.6060823202133179),
 ('good', 0.5879305005073547),
 ('‘poison', 0.5714962482452393),
 ('thought', 0.566747784614563),
 ('eat', 0.5651428699493408),
 ('them', 0.5641888380050659),
 ('they', 0.5598827004432678),
 ('which', 0.5543259978294373),
 ('bye', 0.5497626066207886)]