In [1]:

import string
import re
from os import listdir
from nltk.corpus import stopwords
import tensorflow.keras.backend as K
from tensorflow.keras.preprocessing.text import Tokenizer,text_to_word_sequence
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras import utils
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Lambda
from pandas import DataFrame
from matplotlib import pyplot
from numpy.lib.stride_tricks import sliding_window_view
from tqdm import tqdm


In [2]:
def load_doc(filename):
    file = open(filename, 'r')
    text = file.read()
    file.close()
    return text

def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    tokens = [word for word in tokens if len(word) > 1]
    return tokens

def doc_to_line(filename, vocab):
    doc = load_doc(filename)
    tokens = clean_doc(doc)
    tokens = [w for w in tokens if w in vocab]
    return ' '.join(tokens)

def process_docs(directory, vocab, is_train):
    lines = list()
    for filename in listdir(directory):
        if is_train and filename.startswith('cv9'):
            continue
        if not is_train and not filename.startswith('cv9'):
            continue
        path = directory + '/' + filename
        line = doc_to_line(path, vocab)
        lines.append(line)
    return lines

def load_clean_dataset(vocab, is_train):
    neg = process_docs('txt_sentoken/neg', vocab, is_train)
    pos = process_docs('txt_sentoken/pos', vocab, is_train)
    docs = neg + pos
    labels = [0 for _ in range(len(neg))] + [1 for _ in range(len(pos))]
    return docs, labels
    
def define_model(n_words):
    model = Sequential()
    model.add(Dense(50, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def evaluate_mode(Xtrain, ytrain, Xtest, ytest):
    scores = list()
    n_repeats = 10
    n_words = Xtest.shape[1]
    for i in range(n_repeats):
        model = define_model(n_words)
        model.fit(Xtrain, ytrain, epochs=10, verbose=0)
        _, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
    print('%d accuracy: %s' % ((i+1), acc))
    return scores

def prepare_data(train_docs, test_docs, mode):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_docs)
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest


In [3]:
# get pre-prepared vocab
vocab_filename = 'vocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())


In [4]:
train_docs, ytrain = load_clean_dataset(vocab, True)
#test_docs, ytest = load_clean_dataset(vocab, False)
#import numpy as np
#y= np.array(ytrain)
#yt= np.array(ytest)

In [5]:
# tokenizing vocab
tokenizer = Tokenizer()
tokenizer.fit_on_texts(vocab)
word2id = tokenizer.word_index
word2id = {k:v-1 for k,v in word2id.items()}
id2word = {v:k for k,v in word2id.items()}

In [6]:
vocab_size = len(word2id)
window_size=2

In [7]:
# encoding training docs
wids = [[word2id[w] for w in text_to_word_sequence(doc)] for doc in train_docs]

In [8]:
import random
random.seed(42)

In [9]:
# function to generate training examples
def generate_context_word_pairs(corpus, window_size, vocab_size):
    context_length = window_size*2
    random.shuffle(corpus)
    for words in corpus:
        sentence_length = len(words)
        data = sliding_window_view([0] * window_size + words + [0] * window_size ,context_length+1)
        for line in data:
            if line[window_size]>=vocab_size:
                print(id2word[line[window_size]])
            x = np.delete(line,window_size,0).reshape((1,-1))
            y = utils.to_categorical(line[window_size], vocab_size).reshape((1,-1))
            yield (x, y)

## For 100 Embedding Size

In [14]:
embed_size = 100
cbow = Sequential()
cbow.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow.add(Dense(vocab_size, activation='softmax'))
cbow.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [15]:
cbow.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 4, 100)            1480300   
_________________________________________________________________
lambda (Lambda)              (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 14803)             1495103   
Total params: 2,975,403
Trainable params: 2,975,403
Non-trainable params: 0
_________________________________________________________________


In [16]:
# function to generate multiple exapmles from an iterator
# used to indicate batch size (bs)
def get_iter(z,n):
    xs=[]
    ys=[]
    ne = next(z,False)
    i=0
    while ne:
        x,y=ne
        xs.append(x)
        ys.append(y)
        i+=1
        if i>=n:
            break
        ne = next(z,False)

    return np.concatenate(xs),np.concatenate(ys)

In [17]:
for epoch in range(1, 16):
    losses = []
    i = 0
    # batch_size
    bs=100
    data = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
    for j in tqdm(range(570000//bs)):
        x,y = get_iter(data,bs)
        i += 1*bs
        loss = cbow.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))
    
            losses.append(loss)
    print('Epoch:', epoch, '\tLoss:', losses[-1])
    print()

 18%|█▊        | 1008/5700 [00:22<01:43, 45.28it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2008/5700 [00:44<01:23, 44.44it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3008/5700 [01:07<01:02, 43.23it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4008/5700 [01:29<00:38, 44.38it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5008/5700 [01:52<00:15, 44.04it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:08<00:00, 44.33it/s]


Epoch: 1 	Loss: 8.706259727478027



 18%|█▊        | 1007/5700 [00:23<01:51, 42.16it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2006/5700 [00:47<01:23, 44.02it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3006/5700 [01:10<01:01, 44.02it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4006/5700 [01:33<00:39, 42.96it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5006/5700 [01:56<00:15, 43.60it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:12<00:00, 43.17it/s]


Epoch: 2 	Loss: 8.196671485900879



 18%|█▊        | 1004/5700 [00:23<01:48, 43.09it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2004/5700 [00:46<01:25, 43.18it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3009/5700 [01:09<01:02, 43.37it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4004/5700 [01:32<00:38, 43.88it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5009/5700 [01:55<00:15, 44.58it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:11<00:00, 43.28it/s]


Epoch: 3 	Loss: 10.175945281982422



 18%|█▊        | 1005/5700 [00:23<01:46, 43.91it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:46<01:25, 43.28it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3004/5700 [01:09<01:02, 42.82it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4005/5700 [01:33<00:39, 43.42it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:56<00:15, 43.58it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:12<00:00, 43.01it/s]


Epoch: 4 	Loss: 10.179208755493164



 18%|█▊        | 1005/5700 [00:23<01:50, 42.36it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2004/5700 [00:47<01:45, 35.03it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3008/5700 [01:12<01:01, 43.42it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4008/5700 [01:36<00:39, 43.31it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5008/5700 [01:59<00:15, 43.48it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:15<00:00, 42.09it/s]


Epoch: 5 	Loss: 9.059844970703125



 18%|█▊        | 1006/5700 [00:24<01:48, 43.08it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2008/5700 [00:49<01:22, 44.84it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3008/5700 [01:11<00:58, 46.34it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4008/5700 [01:33<00:36, 46.22it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5008/5700 [01:55<00:15, 46.00it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:10<00:00, 43.77it/s]


Epoch: 6 	Loss: 9.300460815429688



 18%|█▊        | 1005/5700 [00:21<01:40, 46.88it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:42<01:19, 46.69it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:04<00:57, 47.16it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4005/5700 [01:25<00:36, 46.88it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:46<00:14, 46.79it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:01<00:00, 46.93it/s]


Epoch: 7 	Loss: 7.631513595581055



 18%|█▊        | 1005/5700 [00:21<01:39, 46.97it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:42<01:18, 47.10it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:03<00:57, 47.14it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4005/5700 [01:25<00:36, 46.81it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:46<00:14, 47.62it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:01<00:00, 46.95it/s]


Epoch: 8 	Loss: 8.445752143859863



 18%|█▊        | 1010/5700 [00:21<01:38, 47.57it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:42<01:19, 46.36it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:03<00:56, 47.54it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4010/5700 [01:25<00:35, 47.84it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:46<00:15, 46.01it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:01<00:00, 46.95it/s]


Epoch: 9 	Loss: 8.322246551513672



 18%|█▊        | 1005/5700 [00:21<01:38, 47.45it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2010/5700 [00:42<01:16, 47.96it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:03<00:56, 47.51it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4005/5700 [01:24<00:35, 47.17it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:46<00:15, 43.80it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:01<00:00, 46.73it/s]


Epoch: 10 	Loss: 9.21213436126709



 18%|█▊        | 1005/5700 [00:21<01:39, 46.97it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:42<01:18, 46.90it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:03<00:57, 46.65it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4005/5700 [01:25<00:36, 46.40it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:46<00:14, 46.69it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:01<00:00, 46.88it/s]


Epoch: 11 	Loss: 11.280233383178711



 18%|█▊        | 1005/5700 [00:21<01:39, 46.96it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:42<01:18, 46.96it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:04<00:57, 46.63it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4005/5700 [01:25<00:36, 46.58it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:47<00:14, 47.06it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:02<00:00, 46.68it/s]


Epoch: 12 	Loss: 9.165172576904297



 18%|█▊        | 1005/5700 [00:21<01:42, 45.88it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:43<01:19, 46.59it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:04<00:57, 46.66it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4005/5700 [01:25<00:36, 46.61it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:47<00:14, 46.59it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:02<00:00, 46.61it/s]


Epoch: 13 	Loss: 7.757524013519287



 18%|█▊        | 1005/5700 [00:21<01:40, 46.82it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:43<01:18, 46.89it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:04<00:58, 46.32it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4005/5700 [01:26<00:36, 46.11it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:47<00:14, 46.46it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:02<00:00, 46.43it/s]


Epoch: 14 	Loss: 9.55837345123291



 18%|█▊        | 1005/5700 [00:21<01:41, 46.28it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2004/5700 [00:43<01:18, 47.18it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3004/5700 [01:04<00:56, 47.36it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4009/5700 [01:26<00:35, 47.59it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5009/5700 [01:47<00:14, 47.93it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [02:02<00:00, 46.70it/s]

Epoch: 15 	Loss: 8.902464866638184






In [27]:
import pandas as pd

weights = cbow.get_weights()[0]
weights = weights[1:]
print(weights.shape)

cbow_embed = pd.DataFrame(weights, index=list(id2word.values())[1:])

(14802, 100)


In [78]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix = euclidean_distances(weights)
print(distance_matrix.shape)

# view contextually similar words
similar_words = {search_term: [id2word[idx] for idx in distance_matrix[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['scenes', 'movies', 'plot', 'good', 'character', 'director','script','airplane']}

similar_words

(14802, 14802)


{'scenes': ['plot', 'time', 'many', 'back', 'another'],
 'movies': ['years', 'comedy', 'bad', 'great', 'set'],
 'plot': ['back', 'movie', 'story', 'films', 'time'],
 'good': ['great', 'bad', 'end', 'day', 'us'],
 'character': ['like', 'one', 'new', 'even', 'film'],
 'director': ['character', 'like', 'even', 'one', 'new'],
 'script': ['back', 'plot', 'great', 'comedy', 'set'],
 'airplane': ['fires', 'cunning', 'battling', 'kurtz', 'stereo']}

## For 50 Embedding Size

In [23]:
embed_size=50
cbow2 = Sequential()
cbow2.add(Embedding(input_dim=vocab_size, output_dim=embed_size, input_length=window_size*2))
cbow2.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(embed_size,)))
cbow2.add(Dense(vocab_size, activation='softmax'))
cbow2.compile(loss='categorical_crossentropy', optimizer='rmsprop')

In [24]:
cbow2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 4, 50)             740150    
_________________________________________________________________
lambda_1 (Lambda)            (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 14803)             754953    
Total params: 1,495,103
Trainable params: 1,495,103
Non-trainable params: 0
_________________________________________________________________


In [25]:
for epoch in range(1, 16):
    losses2 = []
    i = 0
    bs=100
    data = generate_context_word_pairs(corpus=wids, window_size=window_size, vocab_size=vocab_size)
    for j in tqdm(range(570000//bs)):
        x,y = get_iter(data,bs)
        i += 1*bs
        loss = cbow2.train_on_batch(x, y)
        if i % 100000 == 0:
            print('Processed {} (context, word) pairs'.format(i))
    
            losses2.append(loss)
    print('Epoch:', epoch, '\tLoss:', losses2[-1])
    print()

 18%|█▊        | 1007/5700 [00:20<01:35, 49.05it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2006/5700 [00:41<01:14, 49.81it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3009/5700 [01:01<00:53, 49.93it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4008/5700 [01:21<00:34, 49.35it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:42<00:14, 47.23it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:56<00:00, 49.02it/s]


Epoch: 1 	Loss: 8.271760940551758



 18%|█▊        | 1005/5700 [00:20<01:35, 49.34it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2009/5700 [00:40<01:13, 50.48it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3007/5700 [01:00<00:55, 48.75it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4007/5700 [01:21<00:34, 48.82it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5007/5700 [01:41<00:14, 48.50it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:56<00:00, 49.11it/s]


Epoch: 2 	Loss: 7.924046039581299



 18%|█▊        | 1007/5700 [00:20<01:36, 48.61it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2007/5700 [00:41<01:15, 48.85it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3007/5700 [01:02<00:58, 45.96it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4007/5700 [01:22<00:34, 48.56it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5008/5700 [01:43<00:14, 48.60it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:57<00:00, 48.47it/s]


Epoch: 3 	Loss: 9.143813133239746



 18%|█▊        | 1006/5700 [00:20<01:36, 48.68it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2008/5700 [00:40<01:15, 48.88it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3004/5700 [01:00<00:55, 48.49it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4006/5700 [01:21<00:34, 48.70it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5006/5700 [01:41<00:14, 47.13it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:56<00:00, 49.10it/s]


Epoch: 4 	Loss: 9.262236595153809



 18%|█▊        | 1005/5700 [00:20<01:36, 48.71it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:41<01:16, 48.60it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3007/5700 [01:01<00:55, 48.31it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4007/5700 [01:22<00:34, 48.44it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5008/5700 [01:43<00:14, 48.39it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:57<00:00, 48.54it/s]


Epoch: 5 	Loss: 7.656022548675537



 18%|█▊        | 1006/5700 [00:20<01:36, 48.43it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2008/5700 [00:41<01:15, 48.82it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:01<00:55, 48.89it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4008/5700 [01:22<00:34, 49.64it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5009/5700 [01:42<00:13, 49.56it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:56<00:00, 48.88it/s]


Epoch: 6 	Loss: 9.389405250549316



 18%|█▊        | 1006/5700 [00:20<01:33, 50.23it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2007/5700 [00:40<01:15, 49.22it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3008/5700 [01:00<00:58, 46.16it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4005/5700 [01:20<00:33, 49.97it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5006/5700 [01:40<00:13, 50.03it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:54<00:00, 49.88it/s]


Epoch: 7 	Loss: 9.966471672058105



 18%|█▊        | 1010/5700 [00:20<01:33, 50.25it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2006/5700 [00:40<01:14, 49.43it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:00<00:52, 51.13it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4006/5700 [01:20<00:34, 48.94it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5009/5700 [01:40<00:14, 48.85it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:54<00:00, 49.57it/s]


Epoch: 8 	Loss: 9.061644554138184



 18%|█▊        | 1007/5700 [00:20<01:35, 49.07it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:40<01:15, 49.03it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3008/5700 [01:01<00:54, 49.10it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4006/5700 [01:21<00:34, 49.28it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5009/5700 [01:42<00:14, 49.30it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:56<00:00, 49.07it/s]


Epoch: 9 	Loss: 7.989750385284424



 18%|█▊        | 1004/5700 [00:20<01:39, 47.13it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2006/5700 [00:40<01:16, 48.36it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3008/5700 [01:01<00:54, 49.03it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4006/5700 [01:21<00:34, 48.92it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5009/5700 [01:42<00:14, 48.79it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:56<00:00, 49.01it/s]


Epoch: 10 	Loss: 8.011358261108398



 18%|█▊        | 1006/5700 [00:20<01:35, 49.04it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2006/5700 [00:40<01:14, 49.32it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3008/5700 [01:01<00:54, 49.23it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4007/5700 [01:21<00:34, 49.34it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5007/5700 [01:41<00:14, 48.95it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:55<00:00, 49.15it/s]


Epoch: 11 	Loss: 10.007803916931152



 18%|█▊        | 1006/5700 [00:20<01:35, 49.34it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2009/5700 [00:40<01:15, 48.93it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3006/5700 [01:01<00:54, 49.04it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4007/5700 [01:21<00:34, 49.21it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5005/5700 [01:42<00:14, 47.96it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:56<00:00, 49.09it/s]


Epoch: 12 	Loss: 8.522176742553711



 18%|█▊        | 1006/5700 [00:20<01:35, 49.19it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2008/5700 [00:40<01:14, 49.24it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:01<00:55, 48.68it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4008/5700 [01:21<00:35, 48.02it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5009/5700 [01:42<00:14, 49.21it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:56<00:00, 49.03it/s]


Epoch: 13 	Loss: 9.256138801574707



 18%|█▊        | 1007/5700 [00:20<01:35, 49.03it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2006/5700 [00:40<01:15, 48.95it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:01<00:55, 48.33it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4008/5700 [01:21<00:34, 49.10it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5008/5700 [01:42<00:14, 48.70it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:56<00:00, 49.00it/s]


Epoch: 14 	Loss: 7.708466053009033



 18%|█▊        | 1005/5700 [00:20<01:36, 48.54it/s]

Processed 100000 (context, word) pairs


 35%|███▌      | 2005/5700 [00:41<01:17, 47.92it/s]

Processed 200000 (context, word) pairs


 53%|█████▎    | 3005/5700 [01:01<00:55, 48.51it/s]

Processed 300000 (context, word) pairs


 70%|███████   | 4006/5700 [01:22<00:34, 49.02it/s]

Processed 400000 (context, word) pairs


 88%|████████▊ | 5006/5700 [01:42<00:14, 48.95it/s]

Processed 500000 (context, word) pairs


100%|██████████| 5700/5700 [01:57<00:00, 48.66it/s]

Epoch: 15 	Loss: 8.336174011230469






In [31]:
import pandas as pd

weights2 = cbow2.get_weights()[0]
weights2 = weights2[1:]
print(weights.shape)

# get embedding layer and save it in a data frame
cbow_embed2 = pd.DataFrame(weights2, index=list(id2word.values())[1:])

(14802, 100)


In [83]:
cbow_embed2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
minime,-0.022439,0.002697,-0.052501,0.037626,-0.07775,0.108842,0.010356,0.01604,-0.03556,-0.071554,...,0.054338,-0.006274,-0.086588,-0.068387,0.033215,-0.01709,0.122262,-0.015611,0.045766,-0.07976
development,0.446492,0.098142,-0.837156,0.148432,-0.47301,0.86777,0.147892,0.76512,0.338696,0.414017,...,-0.282149,0.178609,-0.013432,-0.116612,0.281091,-0.678257,-0.422604,-0.287184,0.187136,-0.589217
snails,0.161501,-0.019215,-0.071411,-0.11785,0.087203,-0.014703,-0.19579,0.110234,0.155726,0.05399,...,-0.055832,0.173336,0.102171,0.01777,-0.012717,0.035227,-0.01721,-0.081917,-0.149901,0.06937
experience,0.378994,-0.278708,-0.755604,-0.439698,-0.239931,0.548892,-0.573432,0.750922,-0.609256,0.437183,...,0.77006,0.423539,0.424337,0.634931,0.630823,-0.146818,-0.35239,-0.520596,-0.142981,-0.715027
bosco,0.058635,-0.092159,-0.119827,-0.024501,-0.088382,0.03961,-0.076446,-0.024487,-0.062403,0.102968,...,0.015738,0.105876,-0.023984,0.076409,0.085746,-0.020851,0.078548,-0.080226,-0.026793,0.03981


In [76]:
from sklearn.metrics.pairwise import euclidean_distances

# compute pairwise distance matrix
distance_matrix2 = euclidean_distances(weights2)
print(distance_matrix2.shape)

# view contextually similar words
similar_words2 = {search_term: [id2word[idx] for idx in distance_matrix2[word2id[search_term]-1].argsort()[1:6]+1] 
                   for search_term in ['flight', 'movies', 'plot', 'good', 'character', 'director','script']}

similar_words2

(14802, 14802)


{'flight': ['troops', 'interviews', 'trial', 'newly', 'escapes'],
 'movies': ['great', 'many', 'rather', 'best', 'instead'],
 'plot': ['however', 'seems', 'instead', 'script', 'comedy'],
 'good': ['still', 'great', 'best', 'rather', 'instead'],
 'character': ['like', 'also', 'one', 'director', 'man'],
 'director': ['one', 'character', 'even', 'like', 'life'],
 'script': ['instead', 'however', 'comedy', 'plot', 'course']}

## saving data and embeddings

In [35]:
import pickle

In [39]:
def save_as_pickle(obj,file_name):
    with open(file_name,'wb') as file:
        pickle.dump(obj,file)

In [60]:
# adding pad

# for 100 hidden neuron embedings
cbow_embed = cbow_embed.append([[0]*100])
as_list = cbow_embed.index.tolist()
idx = as_list.index(0)
as_list[idx] = 'pad'
cbow_embed.index = as_list

# for 50 hidden neuron embedings
cbow_embed2 = cbow_embed2.append([[0]*50])
as_list = cbow_embed2.index.tolist()
idx = as_list.index(0)
as_list[idx] = 'pad'
cbow_embed2.index = as_list

  cbow_embed = cbow_embed.append([[0]*100])
  cbow_embed2 = cbow_embed2.append([[0]*100])


In [79]:
cbow_embed2.tail(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
pad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
cbow_embed.tail(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
pad,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
save_as_pickle(word2id,"word2id")
save_as_pickle(id2word,'id2word')
save_as_pickle(wids,'wids')
cbow_embed.to_pickle('embeddings_for_100')
cbow_embed2.to_pickle('embeddings_for_50')