In [26]:
# Import dependencies
import pandas as pd
import os
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, Dense, Flatten
from keras.callbacks import Callback
import matplotlib.pyplot as plt

In [27]:
# Load pretrained embeddings
# The first 15000
embedding_file = open('./GoogleNews-vectors-negative300_lite.txt')
embedding_file.readline()
embedding_indices = dict()
for line in embedding_file:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype=np.float32)
    embedding_indices[word] = coefs
embedding_file.close()

In [28]:
# Utility for training
# - model generation
# - metrics scraper
def get_model(e):
    model = Sequential()
    model.add(e)
    model.add(Dense(64, activation='relu'))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['acc'])

    print(model.summary())
    return model

class EpochHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []
        self.accs = []

    def on_epoch_end(self, epoch, logs={}):
        self.losses.append(logs.get('loss'))
        self.accs.append(logs.get('acc'))
        return


In [78]:
# variables to access dataset
dataset_base = "lexical_entailment"
datasets = ["baroni2012", "bless2011", "kotlerman2010", "levy2014", "turney2014"]
chosen_dataset = 0
test = "data_lex_test.tsv"
train = "data_lex_train.tsv"
val = "data_lex_val.tsv"


In [79]:
# create the tokenizer
glob = pd.read_csv(
    os.path.join(dataset_base, datasets[chosen_dataset], 'data.tsv')
    , sep='\t', header=None)

glob_text_np = np.array(glob.loc[:, 0:1]).flatten()
glob_text_uniq = list(set(glob_text_np))
tokenizer = Tokenizer()
tokenizer.fit_on_texts(glob_text_uniq)
vocab_size = len(tokenizer.word_index) + 1

In [31]:
# Setting up the embedder matrix for the pretrained embedder
num_words_found = 0
embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = embedding_indices.get(word)
    if embedding_vector is not None:
        num_words_found+=1
        embedding_matrix[i] = embedding_vector

print("%d words found out of %d in the %d item long embedding corpus."%
      (num_words_found, vocab_size, len(embedding_indices)))

4834 words found out of 5623 in the 149999 item long embedding corpus.


In [32]:
# Creating the models
print("Google v2w:")
embedding1 = Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=2, trainable=False)
model1 = get_model(embedding1)
print()
print("Vanila embedding:")
embedding2 = Embedding(vocab_size, 300, input_length=2, trainable=True)
model2 = get_model(embedding2)

models = [model1, model2]

Google v2w:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 2, 300)            1686900   
_________________________________________________________________
dense_5 (Dense)              (None, 2, 64)             19264     
_________________________________________________________________
flatten_3 (Flatten)          (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 1,706,293
Trainable params: 19,393
Non-trainable params: 1,686,900
_________________________________________________________________
None

Vanila embedding:
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 2, 300)            1686900   
_________________________

In [80]:
# Preparing the data
train_glob = pd.read_csv(
    os.path.join(dataset_base, datasets[chosen_dataset], train)
    , sep='\t', header=None)

val_glob = pd.read_csv(
    os.path.join(dataset_base, datasets[chosen_dataset], val)
    , sep='\t', header=None)

test_glob = pd.read_csv(
    os.path.join(dataset_base, datasets[chosen_dataset], test)
    , sep='\t', header=None)

train_X = np.array(train_glob.loc[:, 0:1])
train_X = np.array([[tokenizer.texts_to_sequences([txt])[0][0] for txt in row ]for row in train_X]).squeeze()
train_Y = np.array(train_glob.loc[:, 2])
train_Y = train_Y * 1

val_X = np.array(val_glob.loc[:, 0:1])
val_X = np.array([[tokenizer.texts_to_sequences([txt])[0][0] for txt in row ]for row in val_X]).squeeze()
val_Y = np.array(val_glob.loc[:, 2])
val_Y = val_Y * 1

test_X = np.array(test_glob.loc[:, 0:1])
test_X = np.array([[tokenizer.texts_to_sequences([txt])[0][0] for txt in row ]for row in test_X]).squeeze()
test_Y = np.array(test_glob.loc[:, 2])
test_Y = test_Y * 1

In [None]:
# Training the models
accs = list()
losses = list()

for i, model in enumerate(models):
    history = EpochHistory()
    print("## Training model #%d ##" % i)
    model.fit(train_X, train_Y, epochs=100, validation_data=(val_X, val_Y), callbacks=[history], verbose=0)
    accs.append(history.accs)
    losses.append(history.losses)
    print("## Testing model #%d ##" % i)
    t_loss, t_acc = model.evaluate(test_X, test_Y, verbose=0)
    print("Test Results: loss=%f, acc=%f" % (t_loss, t_acc))


## Training model #0 ##


In [None]:
show_array = accs # losses
show_array = np.array(show_array).T
labels = ['Google w2v', 'Vanilla Embedding']

plt.xlabel(labels[0])
plt.ylabel(labels[1])
plt.plot([show_array.min(), show_array.max()], [show_array.min(), show_array.max()])
plt.plot(show_array[:, 0], show_array[:, 1], 'ro')
plt.show()

In [None]:
show_array = losses
show_array = np.array(show_array).T
labels = ['Google w2v', 'Vanilla Embedding']
plt.xlabel(labels[0])
plt.ylabel(labels[1])
plt.plot([show_array.min(), show_array.max()], [show_array.min(), show_array.max()])
plt.plot(show_array[:, 0], show_array[:, 1], 'ro')
plt.show()

In [81]:
train_X

array([[ 317,   83],
       [ 912, 1108],
       [ 665, 1148],
       ...,
       [ 271, 1028],
       [ 305,  370],
       [1313,  388]])