In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import gutenberg
from string import punctuation
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.preprocessing.sequence import skipgrams
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')


In [2]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [3]:
bible = gutenberg.sents('bible-kjv.txt') 

In [4]:
bible = bible[:500]

In [5]:
bible

[['[', 'The', 'King', 'James', 'Bible', ']'], ['The', 'Old', 'Testament', 'of', 'the', 'King', 'James', 'Bible'], ...]

In [6]:
remove_terms = punctuation + '0123456789'

norm_bible = [[word.lower() for word in sent if word not in remove_terms] for sent in bible]
norm_bible = [' '.join(tok_sent) for tok_sent in norm_bible]
norm_bible = filter(None, normalize_corpus(norm_bible))
norm_bible = [tok_sent for tok_sent in norm_bible if len(tok_sent.split()) > 2]

print('Total lines:', len(bible))
print('\nSample line:', bible[10])
print('\nProcessed line:', norm_bible[10])

Total lines: 500

Sample line: ['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']

Processed line: god said let firmament midst waters let divide waters waters


In [7]:
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)
word2id = tokenizer.word_index

word2id['PAD'] = 0
id2word = {v:k for k, v in word2id.items()}
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_bible]

vocab_size = len(word2id)

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 1213
Vocabulary Sample: [('unto', 1), ('said', 2), ('god', 3), ('thou', 4), ('shall', 5), ('lord', 6), ('earth', 7), ('thee', 8), ('thy', 9), ('every', 10)]


In [8]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    for words in corpus:
        sentence_length = len(words)
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = np_utils.to_categorical(label_word, vocab_size)
            yield (x, y)

In [10]:
embed_size = 100
window_size = 2
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['old', 'testament', 'james', 'bible'] -> Target (Y): king
Context (X): ['first', 'book', 'called', 'genesis'] -> Target (Y): moses
Context (X): ['beginning', 'god', 'heaven', 'earth'] -> Target (Y): created
Context (X): ['earth', 'without', 'void', 'darkness'] -> Target (Y): form
Context (X): ['without', 'form', 'darkness', 'upon'] -> Target (Y): void
Context (X): ['form', 'void', 'upon', 'face'] -> Target (Y): darkness
Context (X): ['void', 'darkness', 'face', 'deep'] -> Target (Y): upon
Context (X): ['spirit', 'god', 'upon', 'face'] -> Target (Y): moved
Context (X): ['god', 'moved', 'face', 'waters'] -> Target (Y): upon
Context (X): ['god', 'said', 'light', 'light'] -> Target (Y): let
Context (X): ['god', 'saw', 'good', 'god'] -> Target (Y): light


In [11]:
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda

cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))

cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')
print(cbow.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 100)            121300    
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1213)              122513    
Total params: 243,813
Trainable params: 243,813
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
for epoch in range(1, 6):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)
    print()


Epoch: 1 	Loss: 41279.736198067665

Epoch: 2 	Loss: 38445.25493836403

Epoch: 3 	Loss: 38773.63505089283

Epoch: 4 	Loss: 43138.64250642061

Epoch: 5 	Loss: 47685.427143633366



In [13]:
weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(1212, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
said,-0.128253,0.226968,0.0193,-0.081244,-0.372059,-0.319537,-0.295273,-0.052657,-0.167394,-0.409522,...,-0.319721,-0.143053,-0.229126,0.15705,-0.333871,-0.010607,0.084051,-0.228236,0.141151,-0.190837
god,-0.084887,0.101451,-0.021281,-0.090714,0.224609,-0.089845,-0.06949,0.066486,0.362365,0.015019,...,-0.147641,0.114957,0.054864,0.138212,-0.008728,0.062342,0.114764,-0.041396,-0.039181,0.057258
thou,0.174887,-0.062091,-0.170756,0.229444,-0.242916,-0.134692,0.25831,0.048423,0.134819,-0.012863,...,-0.097976,0.171165,-0.205643,0.610108,0.270939,0.122991,0.111601,0.260416,-0.211995,-0.062347
shall,0.117929,0.396772,-0.307282,-0.276501,0.207303,-0.621406,0.427301,0.458144,-0.015476,-0.683615,...,-0.059427,-0.208191,-0.082685,0.420278,-0.031298,0.021003,-0.279808,0.016808,0.110092,0.156189
lord,-0.118808,0.017322,0.016953,-0.299371,-0.111125,-0.386646,0.004721,-0.002264,0.270221,-0.513287,...,0.111961,-0.208771,0.17502,-0.360232,-0.378339,-0.247434,-0.066559,0.331395,0.051095,-0.131308


In [14]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['god', 'noah', 'egypt', 'moses','famine']}

similar_words

(1212, 1212)


{'god': ['midst', 'female', 'stars', 'water', 'sent'],
 'noah': ['shem', 'ham', 'japheth', 'families', 'builded'],
 'egypt': ['dwelled', 'haran', 'journeyed', 'canaanites', 'ur'],
 'moses': ['tops', 'work', 'lambs', 'saving', 'onyx'],
 'famine': ['herds', 'sidon', 'entreated', 'riphath', 'separated']}

In [16]:
# generate skip-grams
skip_grams = [skipgrams(wid, vocabulary_size=vocab_size, window_size=2) for wid in wids]

# view sample skip-grams
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(10):
    print("({:s} ({:d}), {:s} ({:d})) -> {:d}".format(
          id2word[pairs[i][0]], pairs[i][0], 
          id2word[pairs[i][1]], pairs[i][1], 
          labels[i]))

(james (445), hear (505)) -> 0
(king (45), james (445)) -> 1
(james (445), bible (446)) -> 1
(king (45), hiddekel (698)) -> 0
(james (445), whither (627)) -> 0
(king (45), bible (446)) -> 1
(bible (446), lands (546)) -> 0
(bible (446), feet (635)) -> 0
(king (45), break (1137)) -> 0
(bible (446), james (445)) -> 1
