In [96]:
import numpy as np
import tensorflow as tf
import keras
import keras.layers as layers
import pickle
import wandb
from wandb.keras import WandbCallback
import time

In [35]:
glove_mat = np.load("./data/glove_6B_50d.npy")
words = np.load("./data/words_6B_50d.npy")
with open("./data/word_to_ind_6B_50d.pkl", "rb") as f:
    word_to_ind = pickle.load(f)

In [36]:
X_train = np.load("./data/google_mikolov/X_train.npy")
X_dev = np.load("./data/google_mikolov/X_dev.npy")
X_test = np.load("./data/google_mikolov/X_test.npy")
X_test_words = np.load("./data/google_mikolov/X_test_words.npy")

Y_train = np.load("./data/google_mikolov/Y_train.npy")
Y_dev = np.load("./data/google_mikolov/Y_dev.npy")
Y_test = np.load("./data/google_mikolov/Y_test.npy")
Y_test_words = np.load("./data/google_mikolov/Y_test_words.npy")

In [48]:
def dist(a, b):
    return np.linalg.norm(glove_mat[word_to_ind[a]] - glove_mat[word_to_ind[b]])

def closest_neighbors(embedding, n):
    neighbors = sorted(words, key=lambda word: np.linalg.norm(embedding - glove_mat[word_to_ind[word]]))
    return neighbors[:n]

In [128]:
delta = 1.0
h = 1.0

def mse_large(y_true, y_pred):
    mse = keras.losses.mse(y_true, y_pred)
    # penalty = tf.divide(delta, tf.math.add(h, tf.norm(y_pred))) 
    penalty = tf.maximum(0.0, tf.multiply(-0.5/5.0, tf.subtract(tf.norm(y_pred), 5.0)))
    # print(mse, penalty)
    return tf.math.add(mse, penalty)

print(mse_large(minnesota, bulletinyyy))
print(mse_large(florida, instance))

tf.Tensor(1.0125517529035342, shape=(), dtype=float64)
tf.Tensor(0.5664634466615177, shape=(), dtype=float64)


In [167]:
def mse_cossim(y_true, y_pred):
    mse = keras.losses.mse(y_true, y_pred)
    mse = tf.multiply(mse, 0.2)
    cossin = keras.losses.cosine_similarity(y_true, y_pred)
    penalty = tf.maximum(0.0, tf.subtract(tf.divide(1.0, tf.add(0.3, tf.norm(y_pred))), 0.2))
    # print(mse, cossin)
    return tf.add(tf.add(mse, cossin), penalty)

print(mse_cossim(minnesota, bulletinyyy))
print(mse_cossim(florida, instance))

tf.Tensor(2.894656970745808, shape=(), dtype=float64)
tf.Tensor(-0.36326289742076506, shape=(), dtype=float64)


In [156]:
config = {
    "architecture": "dense",
    "dataset": "google_mikolov",
    "optimizer": "adam",
    "loss": "mse_cossim",
    "metrics": ["mse"],
    "layers": 3,
    "hidden_layers": [300, 300]
}

inputs = layers.Input(shape=(150))
X = inputs

assert(config["layers"] == len(config["hidden_layers"]) + 1)

for size in config["hidden_layers"]:
    X = layers.Dense(size, activation="relu")(X)
    X = layers.Dropout(0.2)(X)

Y = layers.Dense(50, activation="relu")(X)

model = keras.Model(inputs=inputs, outputs=Y, name="analogy_model")
model.summary()


Model: "analogy_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_20 (InputLayer)        [(None, 150)]             0         
_________________________________________________________________
dense_64 (Dense)             (None, 300)               45300     
_________________________________________________________________
dropout_45 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_65 (Dense)             (None, 300)               90300     
_________________________________________________________________
dropout_46 (Dropout)         (None, 300)               0         
_________________________________________________________________
dense_66 (Dense)             (None, 50)                15050     
Total params: 150,650
Trainable params: 150,650
Non-trainable params: 0
_______________________________________________

In [168]:
model.compile(loss=mse_cossim, optimizer=config["optimizer"], metrics=config["metrics"])

run = wandb.init(project="word-analogies", name=config["architecture"] + "-" + str(config["layers"]) + "-" + str(config["hidden_layers"]), config=config, notes="tweaked penalty")

model.fit(X_train, Y_train, batch_size=32, epochs=100, validation_data=(X_dev, Y_dev), callbacks=[WandbCallback()])

if input("Save model? (y/n): ") == "y":
    model.save("models/" + config["architecture"] + "-" + str(config["layers"]) + "-" + str(config["hidden_layers"]) + "-" + str(round(time.time())))

run.finish()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

VBox(children=(Label(value=' 1.76MB of 1.76MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,99.0
loss,-0.66414
mse,0.24831
val_loss,-0.67554
val_mse,0.23601
_step,99.0
_runtime,99.0
_timestamp,1602257071.0
best_val_loss,-0.67564
best_epoch,96.0


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,▆▇███▇▄▇▆▅▆▆▆▅▄▅▅▆▄▅▆▆▅▃▆▃▃▅▆▃▅▅▅▃▃▃▄▂▁▅
mse,▄▇▇▇█▇▃█▆▄▅▆▆▄▄▅▅▅▄▅▆▅▄▁▄▂▃▅▅▃▄▄▅▃▄▂▃▃▁▄
val_loss,▆▅▆█▇▅▆▄▄▆▃▄▄▅▇▄▄▄▅▄▅▄▃▃▆▅▁▆▅▃▄▃▃▂▄▁▅▁▁▃
val_mse,▆▄▇▆▇▇▆▅▃▇▅▅▄▅█▇▆▆▅▅▇▅▄▃▄▅▁▅▃▅▅▄▄▃▄▁▅▃▂▅
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██
_timestamp,▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇██


In [172]:
model.evaluate(X_test, Y_test, verbose=2)
Y_predict = model.predict(X_test)
Y_predict.shape

62/62 - 0s - loss: -6.7752e-01 - mse: 0.2362


(1955, 50)

In [173]:
indices = np.random.randint(Y_predict.shape[0], size=10)
for i, emb in enumerate(Y_predict[indices]):
    print(Y_test_words[indices[i]], closest_neighbors(emb, 5), "Q:", X_test_words[indices[i]])

sharper ['conversely', 'misses', 'importantly', 'bulletinyyy', 'srivalo'] Q: ['young' 'younger' 'sharp']
Macedonian ['bulletinyyy', 'srivalo', 'piyanart', 'besides', 'ooooooooooooooooooooooooooooooooooooooo'] Q: ['Chile' 'Chilean' 'Macedonia']
Gabon ['bulletinyyy', 'gabon', 'piyanart', 'srivalo', 'ooooooooooooooooooooooooooooooooooooooo'] Q: ['Georgetown' 'Guyana' 'Libreville']
onions ['onions', 'cooked', 'broth', 'veggies', 'roasted'] Q: ['pig' 'pigs' 'onion']
Iran ['adding', 'moreover', 'contribute', 'pressing', 'essential'] Q: ['Helsinki' 'Finland' 'Tehran']
krone ['krone', 'oei', 'aust', 'petterson', 'kaylee'] Q: ['Nigeria' 'naira' 'Denmark']
mom ['mom', 'treats', 'loves', 'reminds', 'dad'] Q: ['grandfather' 'grandmother' 'dad']
Russia ['reach', 'besides', 'sets', 'moreover', 'adding'] Q: ['Kiev' 'Ukraine' 'Moscow']
Chile ['chile', 'rica', 'whereas', 'namely', 'likewise'] Q: ['Mogadishu' 'Somalia' 'Santiago']
generating ['generating', 'generates', 'absorbing', 'generate', 'generate

In [108]:
minnesota = glove_mat[word_to_ind["minnesota"]]
bulletinyyy = glove_mat[word_to_ind["bulletinyyy"]]
florida = glove_mat[word_to_ind["florida"]]
instance = glove_mat[word_to_ind["instance"]]

In [175]:
print(np.linalg.norm(glove_mat[word_to_ind["besides"]]))
print(np.linalg.norm(glove_mat[word_to_ind["instance"]]))
print(np.linalg.norm(glove_mat[word_to_ind["whereas"]]))
print(np.linalg.norm(glove_mat[word_to_ind["serves"]]))
print(np.linalg.norm(glove_mat[word_to_ind["additionally"]]))
print(np.linalg.norm(glove_mat[word_to_ind["piyanart"]]))
print(np.linalg.norm(glove_mat[word_to_ind["srivalo"]]))
print(np.linalg.norm(glove_mat[word_to_ind["ooooooooooooooooooooooooooooooooooooooo"]]))
print(np.linalg.norm(glove_mat[word_to_ind["gabon"]]))
print(np.linalg.norm(glove_mat[word_to_ind["bulletinyyy"]]))

3.1332464376507314
3.4258391633512217
3.3534548890073057
4.144025282606665
3.1282103224588176
0.3774952538484451
0.3774078400358829
0.6525149434571366
5.0923254306985335
0.05465219281056251


In [144]:
# average mse across sample of words
mean_squared_error = 0
count = 0
for i in range(40000):
    word_1 = np.random.choice(words)
    word_2 = np.random.choice(words)
    mean_squared_error += np.mean((glove_mat[word_to_ind[word_1]] - glove_mat[word_to_ind[word_2]])**2)
    count += 1
mean_squared_error /= count
print(mean_squared_error)

0.7552636357973507
