In [2]:
#Original code contributions by Dipanjan Sarkar
#Follow the link https://www.kdnuggets.com/2018/04/implementing-deep-learning-methods-feature-engineering-text-data-cbow.html

#CBOW model architecture tries to predict current target word ( center word) based on source context words (surrounding words). 

#CONTINUOUS BAG OF WORDS MODEL STEPS

#Build the corpus vocabulary
#Build a CBOW (context, target) generator
#Build the CBOW model architecture
#Train the Model
#Get Word Embeddings

In [3]:
import numpy as np
import pandas as pd
import tensorflow
from tensorflow.keras.preprocessing import text
from tensorflow.keras import utils
from tensorflow.keras.utils import to_categorical
#from tensorflow.keras.utils import np_utils
from tensorflow.keras.preprocessing import sequence
%pprint off

In [4]:
#Laurence Sterne, “The Life and Opinions of Tristram Shandy.” 107 words.
corpus = ["The French are certainly misunderstood:- but whether the fault is theirs, in not sufficiently explaining themselves, or speaking with that exact limitation and precision which one would expect on a point of such importance, and which, moreover, is so likely to be contested by us — or whether the fault may not be altogether on our side, in not understanding their language always so critically as to know “what they would be at” — I shall not decide; but ‘tis evident to me, when they affirm, “That they who have seen Paris, have seen every thing,” they must mean to speak of those who have seen it by day-light."]

tokenizer = text.Tokenizer() #Tokenizer instance

#Fit the tokenizer object on the corpus
tokenizer.fit_on_texts(corpus)

#create a dictionary
word2id = tokenizer.word_index

#Build corpus vocabulary

#The PAD term is to pad context words to a fixed length if needed.
word2id['PAD'] = 0

#Exchange key and values and store in id2word for reverse mapping
id2word = {v:k for k, v in word2id.items()}

#Each word is mapped to a number and stored as a list of list
wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in corpus]
#print(wids)

#Size of the vocabulary
vocab_size = len(word2id)

#Embedding size
embed_size = 100

#context window size
window_size = 2 

print('Vocabulary Size:', vocab_size)
print('Vocabulary Sample:', list(word2id.items())[:10])

Vocabulary Size: 80
Vocabulary Sample: [('not', 1), ('to', 2), ('they', 3), ('the', 4), ('be', 5), ('have', 6), ('seen', 7), ('but', 8), ('whether', 9), ('fault', 10)]


In [7]:
#Build a CBOW (context, target) generator

#Function that accepts the corpus in terms of sequence, size of window and vocabulary size as inputs
#and yields target word with the surrounding context words

def generate_context_word_pairs(corpus, window_size, vocab_size):
    
    #Context length is twice the window size specified by the user
    context_length = window_size*2
    
    #For every word in corpus
    for words in corpus:
        sentence_length = len(words) #Obtain the length of words
        
        #For every word obtain the context_words and its label
        for index, word in enumerate(words):
            context_words = []
            label_word   = []            
            start = index - window_size
            end = index + window_size + 1
            
            context_words.append([words[i] 
                                 for i in range(start, end) 
                                 if 0 <= i < sentence_length 
                                 and i != index])
            label_word.append(word)

            #Pad sequence if required and yield sequence and labels
            x = sequence.pad_sequences(context_words, maxlen=context_length)
            y = tensorflow.keras.utils.to_categorical(label_word, vocab_size)
            yield (x, y)

In [8]:
# Test this out for some samples
i = 0
for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
    if 0 not in x[0]:
        print('Context (X):', [id2word[w] for w in x[0]], '-> Target (Y):', id2word[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['the', 'french', 'certainly', 'misunderstood'] -> Target (Y): are
Context (X): ['french', 'are', 'misunderstood', 'but'] -> Target (Y): certainly
Context (X): ['are', 'certainly', 'but', 'whether'] -> Target (Y): misunderstood
Context (X): ['certainly', 'misunderstood', 'whether', 'the'] -> Target (Y): but
Context (X): ['misunderstood', 'but', 'the', 'fault'] -> Target (Y): whether
Context (X): ['but', 'whether', 'fault', 'is'] -> Target (Y): the
Context (X): ['whether', 'the', 'is', 'theirs'] -> Target (Y): fault
Context (X): ['the', 'fault', 'theirs', 'in'] -> Target (Y): is
Context (X): ['fault', 'is', 'in', 'not'] -> Target (Y): theirs
Context (X): ['is', 'theirs', 'not', 'sufficiently'] -> Target (Y): in
Context (X): ['theirs', 'in', 'sufficiently', 'explaining'] -> Target (Y): not


In [None]:
#Results interpretation (first instance)
#The first four words are the context words. In this, the first two are words before and the last two are words after
#Then the resulting target center word is "are"

In [9]:
#Build the CBOW Model Architecture

#Import necessary libraries
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda

# build CBOW architecture
cbow = Sequential()

#Input context words passed to embedding layer (initialised with random weights)
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))

#Average out the word embeddings in lambda layer
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))

#Averaged context embedding is passed to a dense softmax layer which predicts the target word
cbow.add(Dense(vocab_size, activation='softmax'))

#Compile the model
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

# view model summary (8000 parameters (vocab_size = 80) * (embed_size = 100) are to be trained)
#No parameters are to be trained at the lambda layer
#80 vocab * 100 embed_size = 8000 + 80(bias) = 8080 parmaters are the output 

print(cbow.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 100)            8000      
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 80)                8080      
Total params: 16,080
Trainable params: 16,080
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
#Train the Model for few epochs. train_on_batch - runs a single gradient update on a single batch of data.
for epoch in range(1, 5):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size):
        i += 1
        loss += cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)

Epoch: 1 	Loss: 487.5397434234619
Epoch: 2 	Loss: 483.0509033203125
Epoch: 3 	Loss: 478.81833934783936
Epoch: 4 	Loss: 474.2853865623474


In [11]:
#Get word embeddings for the vocabulary

weights = cbow.get_weights()[0] #Word embedding of PAD
weights = weights[1:] #Exclude word embedding of PAD
print(weights.shape) # 79 (80-1) vocabulary 

#Convert the weights to a dataframe for each of the word
#A single row shows the word embedding done in 100 dimensions by CBOW model 

pd.DataFrame(weights, index=list(id2word.values())[1:]).head()

(79, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
to,0.044938,-0.008269,0.010627,0.026042,0.002331,0.00694,0.078354,0.036937,-0.012378,0.054591,...,-0.014029,-0.040447,0.003625,0.029801,0.027725,-0.008629,0.075807,0.01359,0.0056,-0.037415
they,-0.037934,0.002994,0.049663,0.025464,-0.026644,-0.012186,-0.03213,0.041848,-0.003856,-0.0257,...,-0.013786,-0.020892,0.014673,-0.058641,-0.051232,0.009474,0.062866,-0.010039,-0.020018,0.010626
the,0.025242,-0.01206,0.015147,0.077378,0.024225,0.046751,0.031603,0.021123,-0.045397,0.01939,...,0.069822,-0.039655,-0.030293,-0.019784,-0.057372,-0.028622,-0.035284,-0.095962,0.044647,0.000919
be,0.034076,0.013433,-0.004254,0.015812,-0.060514,-0.046412,0.015892,0.045996,0.020714,0.03949,...,-0.004562,0.058907,-0.00644,0.04787,0.067822,-0.035752,-0.030949,-0.015416,-0.036782,0.093722
have,0.024941,-0.014621,-0.053798,-0.009571,0.015912,0.051762,0.033805,0.017998,0.015519,-0.02038,...,-0.064365,0.02792,0.011062,0.025919,-0.015937,-0.017667,-0.034889,0.004214,0.031744,-0.019541


In [12]:
#Check for context similarity based on euclidean distances

from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape) # (79,79)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['french','sufficiently','paris']}

similar_words

(79, 79)


{'french': ['our', 'altogether', 'misunderstood', 'light', 'but'],
 'sufficiently': ['understanding',
  'theirs',
  'contested',
  'expect',
  'importance'],
 'paris': ['it', 'every', 'point', 'by', 'us']}

In [None]:
#Result interpretation
#Some words are contextually similar while some are not even with limited vocabulary of 79 words

#Try the CBOW model with slightly larger corpus
#Download Alice in Wonderland from Project Gutenberg and store it in current working directory as Alice.txt

In [18]:
#Open the file
f = open('Alice.txt','r', encoding = 'utf-8')

#Read line by line
alice = f.readlines()

#Remove all \n elements 
alRem = list(map(lambda s: s.strip(), alice ))

#Check total number of elements
print("Total number of list elements: ", len(alRem)) #3773 elements

#List comprehension to remove empty strings
alNoEmpStr = [i for i in alRem if i]

#After removing empty strings, length of the list
print("Total length of list after removing empty strings: ", len(alNoEmpStr)) #2815 elements

#First sentence in Chapter One is 34th element and the last sentence is 2508th element
#prepare a corpus based on these element numbers
alCorpus = alNoEmpStr[34:2508]

print("First two elements: ", alCorpus[0:1]) #First two elements in the list
print("Last two elements: ", alCorpus[-2:]) #Last two elements in the list

In [64]:
#Now corpus is ready for CBOW Modeling
tokenizer = text.Tokenizer() #Tokenizer instance

#Fit the tokenizer object on the Alice corpus
tokenizer.fit_on_texts(alCorpus)

#create a dictionary
word2idAl = tokenizer.word_index

#Build corpus vocabulary

#The PAD term is to pad context words to a fixed length if needed.
word2idAl['PAD'] = 0

#Exchange key and values and store in id2word for reverse mapping
id2wordAl = {v:k for k, v in word2idAl.items()}

#Each word is mapped to a number and stored as a list of list
widsAl = [[word2idAl[w] for w in text.text_to_word_sequence(doc)] for doc in alCorpus]
#print(wids)

#Size of the vocabulary
vocab_size_alice = len(word2idAl)

#Embedding size
embed_size = 100

#context window size
window_size = 2 

print('Vocabulary Size:', vocab_size_alice) #3054
print('Vocabulary Sample:', list(word2idAl.items())[:10])

Vocabulary Size: 3054
Vocabulary Sample: [('the', 1), ('”', 2), ('and', 3), ('to', 4), ('a', 5), ('she', 6), ('of', 7), ('it', 8), ('said', 9), ('alice', 10)]


In [65]:
#Call the generate_context_word_pairs function created earlier

# Test this out for some samples
i = 0
for x, y in generate_context_word_pairs(corpus=widsAl, window_size=window_size, vocab_size=vocab_size_alice):
    if 0 not in x[0]:
        print('Context (X):', [id2wordAl[w] for w in x[0]], '-> Target (Y):', id2wordAl[np.argwhere(y[0])[0][0]])
    
        if i == 10:
            break
        i += 1

Context (X): ['alice', 'was', 'to', 'get'] -> Target (Y): beginning
Context (X): ['was', 'beginning', 'get', 'very'] -> Target (Y): to
Context (X): ['beginning', 'to', 'very', 'tired'] -> Target (Y): get
Context (X): ['to', 'get', 'tired', 'of'] -> Target (Y): very
Context (X): ['get', 'very', 'of', 'sitting'] -> Target (Y): tired
Context (X): ['very', 'tired', 'sitting', 'by'] -> Target (Y): of
Context (X): ['tired', 'of', 'by', 'her'] -> Target (Y): sitting
Context (X): ['of', 'sitting', 'her', 'sister'] -> Target (Y): by
Context (X): ['sitting', 'by', 'sister', 'on'] -> Target (Y): her
Context (X): ['by', 'her', 'on', 'the'] -> Target (Y): sister
Context (X): ['bank', 'and', 'having', 'nothing'] -> Target (Y): of


In [66]:
# build CBOW architecture
cbowAl = Sequential()

#Input context words passed to embedding layer (initialised with random weights)
cbowAl.add(Embedding(input_dim=vocab_size_alice, output_dim=embed_size, input_length=window_size*2))

#Average out the word embeddings in lambda layer
cbowAl.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))

#Averaged context embedding is passed to a dense softmax layer which predicts the target word
cbowAl.add(Dense(vocab_size_alice, activation='softmax'))

#Compile the model
cbowAl.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [67]:
print(cbowAl.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 100)            305400    
_________________________________________________________________
lambda_1 (Lambda)            (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3054)              308454    
Total params: 613,854
Trainable params: 613,854
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
#Number of parameters in the embedding layer = 3054 (Vocabulary of alice corpus) * 100 dimensions = 305400
#Number of parameters in the dense output layer = 3054 * 100 (dim) + 3054 (bias) = 308454 parameters
#All 613854 parameters are trainable compared to 16080 parameters in earlier model of vocab 80 words

In [68]:
#Train the model for 10 epochs. Each epoch takes at least 5 minutes. Total training time approximately 50 minutes
for epoch in range(1, 11):
    loss = 0.
    i = 0
    for x, y in generate_context_word_pairs(corpus=widsAl, window_size=window_size, vocab_size=vocab_size_alice):
        i += 1
        loss += cbowAl.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))

    print('Epoch:', epoch, '\tLoss:', loss)

Epoch: 1 	Loss: 186648.92244170606
Epoch: 2 	Loss: 229101.56220288947
Epoch: 3 	Loss: 233897.94276710693
Epoch: 4 	Loss: 235240.0928955432
Epoch: 5 	Loss: 241231.58937511826
Epoch: 6 	Loss: 244479.49974279804
Epoch: 7 	Loss: 246508.15938479546
Epoch: 8 	Loss: 246198.8009637592
Epoch: 9 	Loss: 250720.9576856495
Epoch: 10 	Loss: 252279.28531993495


In [71]:
#Get word embeddings for the vocabulary

weightsAl = cbowAl.get_weights()[0] #Word embedding of PAD
weightsAl = weightsAl[1:] #Exclude word embedding of PAD
print(weightsAl.shape) # 3053 (3054-1) vocabulary 

#Convert the weights to a dataframe for each of the word
#A single row shows the word embedding done in 100 dimensions by CBOW model

#Shape of dataframe (3053,100)

aliceDF = pd.DataFrame(weightsAl, index=list(id2wordAl.values())[1:])
aliceDF.head()

(3053, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
”,1.387312,1.059913,1.399809,-1.705998,-1.438428,1.978453,-1.728255,-1.331693,-1.273764,1.209744,...,1.229479,0.988443,-1.452939,1.412286,1.41016,-1.13922,1.437299,-1.100276,-1.148705,1.783028
and,1.092667,1.113216,1.259892,-1.383837,-0.690532,0.910162,-1.399434,-1.341926,-0.792996,1.580792,...,1.413779,0.857467,-1.270074,1.743577,0.938099,-1.457112,1.127715,-1.331147,-1.084189,1.176422
to,1.298293,1.723328,1.397377,-1.622204,-1.253475,1.313388,-1.614611,-1.379891,-1.160793,1.447603,...,1.393684,1.211561,-1.333187,1.819563,1.432811,-1.1478,1.36622,-1.685795,-1.588862,1.088095
a,1.334192,1.155898,1.031305,-1.776452,-1.64312,1.327358,-1.53812,-1.119446,-0.803385,1.446294,...,1.283289,1.071023,-1.479391,1.71661,0.964271,-1.334887,1.029508,-1.226262,-1.563774,1.358557
she,1.064586,1.454698,1.035282,-1.684532,-1.141873,1.150333,-1.464615,-0.815316,-1.480658,1.208726,...,1.460051,0.836496,-1.743281,1.628383,1.125051,-1.818674,1.212635,-0.905509,-1.077633,1.768557


In [77]:
#Check for context similarity based on euclidean distances

# compute pairwise distance matrix
distance_matrix_alice = euclidean_distances(weightsAl)
print(distance_matrix_alice.shape) # (3053,3053)

# view contextually similar words
similar_words_alice = {search_term: [id2wordAl[idx] for idx in distance_matrix_alice[word2idAl[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['tired','well','bats','alice','croquet',
                                       'cheshire','hatter','down','flamingo','adventures']}

similar_words_alice

(3053, 3053)


{'tired': ['immediately', 'full', '’em', 'free', 'dish'],
 'well': ['soon', 'something', 'heard', '“it', 'half'],
 'bats': ['sending', 'she’ll', 'proceed', 'feelings', 'miles'],
 'alice': ['with', '“i', 'as', 'but', 'his'],
 'croquet': ['play', 'shouting', 'flat', 'trouble', 'also'],
 'cheshire': ['croquet', 'trouble', 'waving', 'altogether', '“sure'],
 'hatter': ['duchess', 'gryphon', 'cat', 'indeed', 'making'],
 'down': ['up', 'off', 'into', 'about', 'an'],
 'flamingo': ['latin', 'yet—oh', 'kissed', 'daughter', 'ordering'],
 'adventures': ['repeating', 'changes', 'neither', 'daughter', 'sister’s']}

In [None]:
#Context similarities can be found for words like play for croquet.
#Preprocessing the text and more training shall yield more contextually relevant results