#Setup

##Imports

In [19]:
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
import random
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

##Data

###Load Data

In [2]:
# Gather the sentence column (column index 3) of the three TSV splits into one flat list.
tsv_paths = [
    '/content/in_domain_train.tsv',
    '/content/in_domain_dev.tsv',
    '/content/out_of_domain_dev.tsv',
]
text = []
for tsv_path in tsv_paths:
    text += list(pd.read_csv(tsv_path, sep='\t', header=None)[3])
# Peek at the tail of the corpus (this slice stops one short of the final sentence).
text[-10: -1]

['John thinks it would upset himself to die.',
 'John made Bill mad at himself.',
 'John made Bill master of himself.',
 'The correspondence school made Bill a good typist.',
 'The correspondence school sent Bill a good typist.',
 'John considers Bill silly.',
 'John considers Bill to be silly.',
 'John bought a dog for himself to play with.',
 'John arranged for himself to get the prize.']

In [3]:
# Total number of sentences collected from the three TSV files.
print(len(text))

9594


###Preprocess Text

In [4]:
# Build the vocabulary from the corpus, then turn every sentence into a list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
tokenized_text = tokenizer.texts_to_sequences(text)
# Sanity check: one id sequence per input sentence.
len(tokenized_text)

9594

In [5]:
# Reverse lookup table (word id -> word), built by inverting the tokenizer's word -> id mapping.
idx_word = {word_id: word for word, word_id in tokenizer.word_index.items()}
print(idx_word)

{1: 'the', 2: 'to', 3: 'a', 4: 'that', 5: 'i', 6: 'is', 7: 'john', 8: 'of', 9: 'was', 10: 'and', 11: 'in', 12: 'he', 13: 'it', 14: 'you', 15: 'be', 16: 'who', 17: 'mary', 18: 'on', 19: 'bill', 20: 'for', 21: 'will', 22: 'with', 23: 'did', 24: 'which', 25: 'we', 26: 'have', 27: 'more', 28: 'this', 29: 'his', 30: 'they', 31: 'by', 32: 'book', 33: 'at', 34: 'she', 35: 'him', 36: 'has', 37: 'not', 38: 'her', 39: 'had', 40: 'me', 41: 'from', 42: 'what', 43: 'man', 44: 'would', 45: 'there', 46: 'are', 47: 'as', 48: 'my', 49: 'up', 50: 'than', 51: 'been', 52: 'put', 53: 'any', 54: 'but', 55: 'do', 56: 'all', 57: 'some', 58: 'himself', 59: 'left', 60: 'leave', 61: 'about', 62: 'know', 63: 'gave', 64: 'can', 65: 'every', 66: 'into', 67: 'were', 68: 'people', 69: 'saw', 70: 'eat', 71: 'if', 72: 'one', 73: 'ball', 74: 'student', 75: 'an', 76: 'out', 77: 'go', 78: 'students', 79: 'how', 80: 'said', 81: 'too', 82: 'read', 83: 'like', 84: 'should', 85: 'no', 86: 'kim', 87: 'girl', 88: 'picture', 89:

In [6]:
# Making sure that every element has length > 1: a training sample needs one
# target word plus at least one context word.
# The list is rebuilt with a comprehension instead of calling remove() inside
# the loop, because removing from a list while iterating over it skips the
# element that follows each removal.
short_sequences = [seq for seq in tokenized_text if len(seq) <= 1]
for seq in short_sequences:
    print(seq)
tokenized_text = [seq for seq in tokenized_text if len(seq) > 1]


9594


In [7]:
# Randomly pick the missing word (output) and use the rest of the sentence as the input.
# Each sentence is sampled SAMPLES_PER_SENTENCE times.
SAMPLES_PER_SENTENCE = 3
SAMPLING_SEED = 42
# Dedicated seeded generator so that re-running the notebook yields the same
# training pairs without touching the global `random` state.
rng = random.Random(SAMPLING_SEED)

outputs = []
inputs = []
for sentence in tokenized_text:
    # A sample needs one target word and at least one context word; shorter
    # sentences would produce an empty (all-padding) input.
    if len(sentence) < 2:
        continue
    for _ in range(SAMPLES_PER_SENTENCE):
        # Choose the target by position so that exactly the picked occurrence
        # is removed from the context.
        target_pos = rng.randrange(len(sentence))
        outputs.append(sentence[target_pos])
        inputs.append(sentence[:target_pos] + sentence[target_pos + 1:])

# Show one (context, target) pair as a quick visual check.
idx = 2
print(inputs[idx])
print(outputs[idx])


[204, 289, 232, 163, 28, 777, 778, 1251, 779, 72, 25, 3828]
1


In [8]:
# Pad every context to the length of the longest one so they stack into a single 2-D array.
# pad_sequences is called with its defaults here (Keras documents these as
# pre-padding with the value 0).
inp_padded = pad_sequences(inputs)
# (number of samples, padded sequence length)
inp_padded.shape

(28782, 41)

#Build and Train Model

In [9]:
# One-hot depth: the tokenizer numbers words 1..len(word_index), and id 0 is
# reserved for padding, hence the +1. Using len() avoids materialising every
# index and does not depend on the ordering of the dictionary.
depth = len(tokenizer.word_index) + 1

##Custom Layers

In [10]:
class PreprocessLayer(tf.keras.layers.Layer):
    """One-hot encodes integer token ids and also forwards the ids unchanged.

    Args:
        depth: Size of the one-hot axis handed to ``tf.one_hot``.
    """

    def __init__(self, depth):
        super().__init__()
        self.depth = depth

    def call(self, inputs):
        """Return the pair ``(one_hot(inputs), inputs)``."""
        return tf.one_hot(inputs, depth=self.depth), inputs

class AveragePoolingLayer(tf.keras.layers.Layer):
    """Averages one-hot vectors over the sequence axis, skipping ignored positions.

    Args:
        ignore: Token id whose positions are excluded from the average
            (defaults to 0).
    """

    def __init__(self, ignore=0):
        super().__init__()
        self.ignore = ignore

    def call(self, inp_one_hot, inp_padded):
        """Return the masked mean of ``inp_one_hot`` along axis 1.

        Args:
            inp_one_hot: One-hot encoded tokens; axis 1 is the axis that gets
                reduced.
            inp_padded: Integer token ids matching ``inp_one_hot`` without its
                last axis; positions equal to ``self.ignore`` are masked out.

        Returns:
            The sum of the unmasked one-hot vectors divided by the number of
            unmasked positions. Rows in which every position is masked yield
            zeros rather than NaN.
        """
        # Build the mask in the dtype of the one-hot tensor so the product
        # below needs no type promotion; tf.not_equal works with or without
        # numpy-behaviour being enabled.
        keep_mask = tf.cast(tf.not_equal(inp_padded, self.ignore), inp_one_hot.dtype)
        kept_count = tf.math.reduce_sum(keep_mask, axis=-1, keepdims=True)
        masked_sum = tf.math.reduce_sum(
            inp_one_hot * tf.expand_dims(keep_mask, axis=-1), axis=1
        )
        # divide_no_nan returns 0 where the denominator is 0 (all-padding rows).
        return tf.math.divide_no_nan(masked_sum, kept_count)


##Initialize The Model

In [11]:
# CBOW-style model: one-hot encode the context ids, average them while skipping
# padding, then pass the result through a ReLU hidden layer and a softmax over
# the vocabulary.
# The input length is taken from the padded training data instead of being
# hard-coded, so the model keeps matching the data if the corpus changes.
seq_len = inp_padded.shape[1]

DenseLayer1 = tf.keras.layers.Dense(256, activation='relu')
DenseLayer2 = tf.keras.layers.Dense(depth, activation='softmax')
model_inp = tf.keras.layers.Input(shape=(seq_len,), dtype=tf.int32)
one_hot_inp, padded_inp = PreprocessLayer(depth=depth)(model_inp)
x = AveragePoolingLayer()(one_hot_inp, padded_inp)
x = DenseLayer1(x)
model_out = DenseLayer2(x)
model = tf.keras.Model(inputs=model_inp, outputs=model_out)
# Targets are integer word ids, hence the sparse variant of the cross-entropy loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

##Train The Model

In [12]:
# Train on (padded context, target word id) pairs for 40 epochs.
model.fit(x=inp_padded, y=np.array(outputs), epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f38a1eeff50>

##Predict using a random sentence
This is mainly done to check whether the model works reasonably well on a test example. It is a very simple sanity check.

In [17]:
def predict(sentence):
    """Print the word the model rates most likely given the words of ``sentence``.

    The sentence is tokenized with the fitted ``tokenizer`` and padded to the
    model's input length. Sentences with more tokens than the model accepts
    are truncated by ``pad_sequences``. If none of the words are in the
    vocabulary, an empty string is printed.

    Args:
        sentence: Raw text containing the context words.
    """
    token_ids = tokenizer.texts_to_sequences([sentence])
    if not token_ids[0]:
        # No known words: there is nothing to average over.
        print("")
        return
    # Pad straight to the model's input length rather than appending a dummy
    # all-zero row to force the padded width.
    seq_len = model.input_shape[1]
    padded = pad_sequences(token_ids, maxlen=seq_len)
    probabilities = model.predict(padded)[0]
    best_index = int(np.argmax(probabilities))
    # Index 0 is the padding id and has no word; fall back to "".
    print(idx_word.get(best_index, ""))
predict("the cat sat on the")

table


In [18]:
# Same words as the previous example, in a different order.
predict("on sat the cat the")

table


We can see that the model predicts the same word even though the order of the words is shuffled.

This is because the Continuous Bag of Words (CBOW) model used here averages the one-hot vectors of the context words, so the order of the words has no effect on the prediction.

#Word Embeddings

##Extracting The Embeddings

In [80]:
# Kernel of the hidden layer: one row per vocabulary index.
weight_one = DenseLayer1.get_weights()[0]
# Kernel of the output layer: one column per vocabulary index.
weight_two = DenseLayer2.get_weights()[0]
# Embedding of a word = mean of its hidden-layer row and its (transposed) output-layer column.
embeddings = (weight_one + weight_two.T) / 2
embeddings.shape

(5828, 256)

##Saving The Embeddings

###Saving as a .txt File

In [83]:
# Write one line per word: "<word> <v1> <v2> ... <vN>".
# Each component is written individually (the array repr would add brackets and
# wrap across lines) and every record ends with a newline.
with open("/content/word2vecembed.txt", 'w', encoding='utf-8') as file:
    for word_id, word in idx_word.items():
        vector_text = " ".join(str(value) for value in embeddings[word_id])
        file.write(f"{word} {vector_text}\n")

###Saving as a .tsv file
TSV files are a better choice for saving word embeddings, because TSV is the file type supported by Embedding Projectors.

In [91]:
import io

In [92]:
# Write two aligned files: row k of the vectors file is the embedding of the
# word on row k of the vocabulary file.
# The `with` statement closes both files even if a write fails.
with open('word2vec_embeddings.tsv', 'w', encoding='utf-8') as out_v, \
        open('word2vec_vocab.tsv', 'w', encoding='utf-8') as out_m:
    for word_id, word in idx_word.items():
        # Look the vector up by the word's own id instead of relying on
        # enumeration order.
        vec = embeddings[word_id]
        out_v.write('\t'.join(str(x) for x in vec) + "\n")
        out_m.write(word + "\n")

###Downloading The Files

In [93]:
# Download the two TSV files written by the export cell.
# Only a missing google.colab module (i.e. not running on Colab) is tolerated;
# real download errors such as a missing file are allowed to surface.
try:
  from google.colab import files
except ImportError:
  print("google.colab is not available; skipping download.")
else:
  files.download('word2vec_embeddings.tsv')
  files.download('word2vec_vocab.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [94]:
# Download the plain-text embeddings file.
# Only a missing google.colab module (i.e. not running on Colab) is tolerated;
# real download errors are allowed to surface.
try:
  from google.colab import files
except ImportError:
  print("google.colab is not available; skipping download.")
else:
  files.download('/content/word2vecembed.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>