In [1]:
import os
# Pin the GPU selection. This is done before torch is imported in the next cell,
# so the settings are already in place when CUDA gets initialised.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",   # enumerate devices in PCI bus order
    "CUDA_VISIBLE_DEVICES": "6",         # expose only the device with index 6
})

In [2]:
import torch 
import numpy as np
import torch.nn as nn
import t3nsor as t3
from torchtext import data
from torchtext import datasets
import torch.optim as optim
import tqdm

from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Corpus loader

In [4]:
from sgns.utils import *
from sgns.models import *

In [5]:
# Load the corpus. `vocab` is later used as a word -> index mapping and `D` as the
# data handed to CorpusDataset; D.shape[0] is taken as the vocabulary size.
corpus_name = "enwik9"
vocab, D = load_corpus(corpus=corpus_name)

In [6]:
# Wrap the loaded corpus data in the project's dataset class. The training loop
# feeds this object to a DataLoader and calls dataset.update_negatives() after
# every epoch.
dataset = CorpusDataset(D)

In [7]:
# Sizes shared by the embedding model and the data loader.
embedding_dim, batch_size = 256, 2000
vocab_size = D.shape[0]  # first dimension of D is used as the number of words

In [8]:
# Tensor-train hyper-parameters of the embedding layer.
# NOTE(review): 180 * 135 = 24300 and 16 * 16 = 256 (= embedding_dim); presumably
# the first factor list must cover vocab_size and the second the embedding
# dimension -- confirm against TTEmbeddings.
tt_shape = [[180, 135], [16, 16]]
tt_rank = 64

emb_model = TTEmbeddings(
    vocab_size, embedding_dim, tt_shape=tt_shape, tt_rank=tt_rank)

In [9]:
# Skip-gram-with-negative-sampling trainer built around the embedding model.
sgns_kwargs = dict(neg_sampling_param=5, learning_rate=5e-4)
model = Word2VecSGNS(emb_model, **sgns_kwargs)

In [None]:
# Training configuration (the values previously hard-coded inside the loop).
n_epochs = 1000
checkpoint_every = 10                          # epochs between model snapshots
checkpoint_dir = "checkpoints/enwik9_256_64"

# Neither np.savez nor torch.save creates missing directories. Without this an
# epoch would run to completion and only then fail on the first save.
os.makedirs(checkpoint_dir, exist_ok=True)

epoch_losses = []   # mean batch loss of every finished epoch
losses = []         # batch losses of the epoch currently running

for epoch in range(n_epochs):
    # A fresh sampler and loader are built for every epoch.
    sampler = CorpusSampler(dataset)
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=False,  # ordering is delegated to the sampler
        num_workers=1,
        pin_memory=torch.cuda.is_available(),
        sampler=sampler)

    losses = []
    # Wrapping the loader itself (instead of enumerate(loader)) lets tqdm read
    # len(loader) when it is available and show a total and an ETA.
    for batch in tqdm.tqdm(loader):
        losses.append(model.train(batch))

    # Record and persist the running loss curve after every epoch.
    avg_loss = np.mean(losses)
    epoch_losses.append(avg_loss)
    print ("Loss: ", avg_loss)
    print ("---------------------")
    np.savez(checkpoint_dir + "/loss.npz", loss=epoch_losses)

    # Snapshot the embedding weights every `checkpoint_every` epochs; the file
    # index is the snapshot number, not the epoch number.
    if epoch % checkpoint_every == 0:
        torch.save(
            model.emb_model.state_dict(),
            checkpoint_dir + "/model_" + str(epoch // checkpoint_every) + ".pth.tar")

    # Refresh the dataset's negative samples before the next epoch.
    dataset.update_negatives()

13308it [03:00, 73.69it/s]


Loss:  27.866587026433827
---------------------


13308it [03:12, 69.14it/s]


Loss:  26.204723789557917
---------------------


In [None]:
# Materialise the dense embedding matrix by embedding every vocabulary index in
# one forward pass.
ws = torch.arange(vocab_size, dtype=torch.long).unsqueeze(0)  # shape (1, vocab_size)

# This is pure inference, so autograd recording is switched off to avoid
# building a graph over the whole vocabulary.
with torch.no_grad():
    # [0] drops the leading batch axis added above.
    # NOTE(review): the result is presumably (vocab_size, embedding_dim) -- confirm.
    emb_matrix = model.emb_model.w_emb(
        ws.to(model._device))[0].detach().cpu().numpy()

In [None]:
class WordVectors:
    """Nearest-neighbour lookups over a word-embedding matrix.

    Parameters
    ----------
    vocabulary : dict
        Maps each word to its index (any value accepted by ``int()``).
    embedding_matrix : array of shape (embedding_dim, n_words)
        One column per word: column ``j`` holds the vector of the word whose
        index is ``j``.
    """

    def __init__(self, vocabulary, embedding_matrix):
        self.vocab = vocabulary
        self.W = embedding_matrix
        # Index -> word. Keys are normalised with int() so they match the
        # int() conversion applied when a word is looked up in word_vector.
        self.inv_vocab = {int(v): k for k, v in self.vocab.items()}

    def word_vector(self, word):
        """Return the embedding column of ``word``.

        Prints a message and returns ``None`` when the word is not in the
        vocabulary.
        """
        if word not in self.vocab:
            print ("No such word in vocabulary.")
            return None

        return self.W[:, int(self.vocab[word])]

    def nearest_words(self, word, top=10, display=False):
        """Return the ``top`` nearest neighbours of ``word`` by cosine similarity.

        Parameters
        ----------
        word : str
            Query word.
        top : int
            Maximum number of neighbours to return. Fewer are returned when
            the vocabulary is smaller; values <= 0 give an empty list.
        display : bool
            When true, also print each neighbour and its similarity.

        Returns
        -------
        list of (word, similarity) tuples, most similar first, with the
        similarity rounded to 3 decimals. The query word itself is never
        included. An empty list is returned for an unknown word.
        """
        vec = self.word_vector(word)
        if vec is None:
            # word_vector has already reported the unknown word.
            return []

        query_idx = int(self.vocab[word])
        cosines = cosine_similarity(self.W.T, vec[None, :])[:, 0]

        # Rank all words by descending similarity, then drop the query word by
        # index (rather than assuming it is ranked first) and cap the count so
        # that `top` may exceed the vocabulary size.
        ranked = np.argsort(cosines)[::-1]
        ranked = ranked[ranked != query_idx][:max(top, 0)]

        nws = []
        for idx in ranked:
            idx = int(idx)
            neighbour = self.inv_vocab[idx]
            score = round(cosines[idx], 3)
            nws.append((neighbour, score))
            if display:
                print (neighbour, score)
        return nws

In [None]:
# WordVectors indexes its matrix column-wise (one column per word), hence the
# transpose of emb_matrix.
word_model = WordVectors(vocabulary=vocab, embedding_matrix=emb_matrix.T)

In [None]:
# Sanity check: ten nearest neighbours of a frequent word.
word_model.nearest_words(word="one", top=10, display=False)

In [None]:
# Sanity check: ten nearest neighbours of a proper noun.
word_model.nearest_words(word="hitler", top=10, display=False)

In [None]:
# Loss curve: one point per finished epoch (mean of that epoch's batch losses).
fig, ax = plt.subplots()
ax.plot(epoch_losses)
ax.set(xlabel="epoch", ylabel="mean batch loss", title="SGNS training loss")
plt.show()