In [1]:
%load_ext autoreload

%autoreload 1

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import yaml

from matrix.signal_matrix_factory import SignalMatrixFactory
from matrix.PIP_loss_calculator import MonteCarloEstimator
from utils.tokenizer import SimpleTokenizer
from utils.reader import ReaderFactory

import numpy

import sys
PATH_TO_REPSEVAL = "../repseval/src"
sys.path.insert(0, PATH_TO_REPSEVAL)
from evaluate import evaluate_embed_matrix
from wordreps import WordReps





In [20]:
# one-block example: run the whole pipeline for a single algorithm

corpus_file = "./data/text8.zip"
settings = [("glove","./config/glove_sample_config.yml"), ("word2vec", "./config/word2vec_sample_config.yml"), ("lsa", "./config/lsa_sample_config.yml")]
algorithm, model_config = settings[2]  # index 2 selects the ("lsa", ...) entry

# safe_load instead of yaml.load(f): calling yaml.load without an explicit
# Loader is deprecated since PyYAML 5.1 and raises TypeError on PyYAML >= 6.
with open(model_config, "r") as f:
    cfg = yaml.safe_load(f)

# The reader is picked from the last three characters of the file name ("zip" here).
reader = ReaderFactory.produce(corpus_file[-3:])
data = reader.read_data(corpus_file)
tokenizer = SimpleTokenizer()
indexed_corpus = tokenizer.do_index_data(data,
    n_words=cfg.get('vocabulary_size'),
    min_count=cfg.get('min_count'))

factory = SignalMatrixFactory(indexed_corpus)

signal_matrix = factory.produce(algorithm)
# Keep the parameter directory: the PIP-loss estimator below is pointed at
# "estimates.yml" inside it.
path = signal_matrix.param_dir
signal_matrix.inject_params(cfg)
signal_matrix.estimate_signal()
signal_matrix.estimate_noise()
signal_matrix.export_estimates()

pip_calculator = MonteCarloEstimator()
pip_calculator.get_param_file(path, "estimates.yml")
pip_calculator.estimate_signal()
pip_calculator.estimate_pip_loss()
pip_calculator.plot_pip_loss()

lmdas = signal_matrix.spectrum 
myus = numpy.array(pip_calculator.estimated_signal) 
r = pip_calculator.rank
k = 300  # number of leading entries of both vectors used in the fit below

# Least-squares scale factor: the scalar c minimising ||lmdas[:k] - c * myus[:k]||.
c = numpy.dot(lmdas[:k], myus[:k]) / numpy.dot(myus[:k], myus[:k])
print(c)
   

vocabulary_size=1909
n=1909, rank=343, sigma=0.3901904566333448
optimal dimensionality is 24
a plot of the loss is saved at params/LSAMatrix/pip_0.5.pdf
1.442398466416934


In [2]:
# split the code into functions

def create_signal_matrix(corpus_fname, model_config, algorithm):
    """Build the signal matrix for one algorithm and export its estimates.

    Args:
        corpus_fname: Path to the corpus file. Its last three characters
            (e.g. "zip") are passed to ``ReaderFactory.produce`` to pick a reader.
        model_config: Path to the YAML configuration file for the algorithm.
        algorithm: Algorithm name handed to ``SignalMatrixFactory.produce``
            (the notebook uses "glove", "word2vec" and "lsa").

    Returns:
        A tuple ``(cfg, path, signal_matrix, tokenizer)``: the parsed config,
        the signal matrix's ``param_dir``, the signal matrix object after its
        estimates were exported, and the tokenizer used to index the corpus.
    """
    # safe_load instead of yaml.load(f): calling yaml.load without an explicit
    # Loader is deprecated since PyYAML 5.1 and raises TypeError on PyYAML >= 6.
    with open(model_config, "r") as f:
        cfg = yaml.safe_load(f)

    reader = ReaderFactory.produce(corpus_fname[-3:])
    data = reader.read_data(corpus_fname)
    tokenizer = SimpleTokenizer()
    indexed_corpus = tokenizer.do_index_data(data,
        n_words=cfg.get('vocabulary_size'),
        min_count=cfg.get('min_count'))

    factory = SignalMatrixFactory(indexed_corpus)

    signal_matrix = factory.produce(algorithm)
    # Read param_dir before injecting params, in the same order as the
    # one-block example.
    path = signal_matrix.param_dir
    signal_matrix.inject_params(cfg)
    signal_matrix.estimate_signal()
    signal_matrix.estimate_noise()
    signal_matrix.export_estimates()
    return cfg, path, signal_matrix, tokenizer


def estimate_pip(path):
    """Run the Monte Carlo PIP-loss estimation for one parameter directory.

    Args:
        path: Directory passed to ``get_param_file`` together with the file
            name "estimates.yml".

    Returns:
        The ``MonteCarloEstimator`` instance after signal estimation,
        PIP-loss estimation and plotting have been invoked on it.
    """
    estimator = MonteCarloEstimator()
    estimator.get_param_file(path, "estimates.yml")
    # Invoke the pipeline stages strictly in this order.
    for stage in ("estimate_signal", "estimate_pip_loss", "plot_pip_loss"):
        getattr(estimator, stage)()
    return estimator
    

In [3]:
# Run the full pipeline for every (algorithm, config) pair and keep the
# results in dictionaries keyed by algorithm name.
corpus_fname = "./data/text8.zip"
settings = [
    ("glove", "./config/glove_sample_config.yml"),
    ("word2vec", "./config/word2vec_sample_config.yml"),
    ("lsa", "./config/lsa_sample_config.yml"),
]

# Five independent dictionaries, one per kind of result.
cfg, path, signal_matrix, pip_calculator, tokenizer = {}, {}, {}, {}, {}

for algorithm, model_config in settings:
    print(algorithm, model_config, corpus_fname)
    (
        cfg[algorithm],
        path[algorithm],
        signal_matrix[algorithm],
        tokenizer[algorithm],
    ) = create_signal_matrix(corpus_fname, model_config, algorithm)
    pip_calculator[algorithm] = estimate_pip(path[algorithm])

glove ./config/glove_sample_config.yml ./data/text8.zip
vocabulary_size=10000
n=10000, rank=2623, sigma=0.1472030442613216
optimal dimensionality is 738
a plot of the loss is saved at params/GloVeMatrix/pip_0.5.pdf
word2vec ./config/word2vec_sample_config.yml ./data/text8.zip
vocabulary_size=10000
n=10000, rank=2280, sigma=0.35662454111971503
optimal dimensionality is 119
a plot of the loss is saved at params/Word2VecMatrix/pip_0.5.pdf
lsa ./config/lsa_sample_config.yml ./data/text8.zip
vocabulary_size=10000
n=10000, rank=2232, sigma=0.3521004885954202
optimal dimensionality is 119
a plot of the loss is saved at params/LSAMatrix/pip_0.5.pdf


In [68]:
# Spot-check that the tokenizers of different embeddings share word ids:
# 'apple' is looked up in both dictionaries and should print the same id.
# The reverse lookups use a different id per tokenizer (9999 and 204), so
# they show individual entries rather than a cross-tokenizer comparison.
for tok_name, probe_id in (("glove", 9999), ("word2vec", 204)):
    tok = tokenizer[tok_name]
    print(tok.dictionary['apple'])
    print(tok.reversed_dictionary[probe_id])
    if tok_name == "glove":
        # keep the blank separator line between the two groups of lookups
        pass



1221
recognise
1221
usually


In [None]:
# Evaluate each embedding on the word embedding benchmarks (repseval).

# Per algorithm: pick the dimensionality, build the embedding matrix, evaluate it.
k = {}
source = {}
for algo, _ in settings:
    print(algo)
    matrix = signal_matrix[algo]

    # Index of the smallest estimated PIP loss, used as the number of columns kept.
    # NOTE(review): argmin is a 0-based index; confirm it matches the intended
    # dimensionality (an index of 0 would give an embedding with no columns).
    dim = numpy.argmin(pip_calculator[algo].estimated_pip_loss)
    k[algo] = dim

    # First `dim` columns of U, each scaled by the matching spectrum entry.
    # NOTE(review): the spectrum is used with exponent 1 here, while the saved
    # plots are named pip_0.5.pdf - confirm which exponent is intended.
    scaling = numpy.diag(matrix.spectrum[:dim])
    source[algo] = matrix.U[:, :dim] @ scaling
    print(source[algo].shape)

    WR = WordReps()
    WR.load_matrix(source[algo], tokenizer[algo].dictionary)
    evaluate_embed_matrix(WR, mode="lex")

