## Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from IPython import display

import numpy as np
import pandas as pd

import os

from word2vec_as_EMF import Word2vecMF
from functions import *

## Load model

In [2]:
%%time
# Create an empty Word2vecMF model and fill it from the saved .npz archive.
# Later cells read model_enwik.D, model_enwik.B and model_enwik.vocab;
# presumably load_matrices is what populates them - confirm in word2vec_as_EMF.
model_enwik = Word2vecMF()
model_enwik.load_matrices(from_file='enwik-200/matrices.npz')

CPU times: user 14.5 s, sys: 4.3 s, total: 18.8 s
Wall time: 18.8 s


## Save some matrices to disk (to use them later as the initialization)

In [3]:
# SVD initialization
#
# SPPMI is the elementwise max(log D - log B, 0) of the model's D and B
# matrices. It is factorized by svds with k=100, and sqrt(diag(s)) is applied
# to each side, so that C_svd.T.dot(W_svd) equals u.dot(diag(s)).dot(vt).
# NOTE(review): svds is not imported explicitly in this notebook; presumably it
# is scipy.sparse.linalg.svds made visible by `from functions import *` - confirm.

SPPMI = np.maximum(
    np.log(model_enwik.D) - np.log(model_enwik.B),
    0
)

u, s, vt = svds(SPPMI, k=100)

# Diagonal matrix of square-rooted singular values, shared by both factors.
sqrt_s = np.sqrt(np.diag(s))
C_svd = u.dot(sqrt_s).T
W_svd = sqrt_s.dot(vt)

In [4]:
# Install the SVD factors as the model's current C and W. Copies are used so
# the model's matrices do not share memory with C_svd / W_svd.
model_enwik.C, model_enwik.W = C_svd.copy(), W_svd.copy()

# Write the model's C and W to disk; the same (path, 0) arguments are given to
# load_CW further down to read them back.
# NOTE(review): the second argument is presumably an iteration index - confirm.
model_enwik.save_CW('enwik-200/initializations/SVD_dim100', 0)

# Launch the experiment

In [6]:
# Name of the folder with the dataset files.
# NOTE(review): not referenced by any other visible cell - confirm it is still needed.
datasets_path = 'datasets'

# Read C and W back from the location they were saved to with save_CW,
# replacing the in-memory C_svd / W_svd.
svd_init_path = 'enwik-200/initializations/SVD_dim100'
C_svd, W_svd = model_enwik.load_CW(svd_init_path, 0)

In [None]:
%%time
# Run the optimization experiment on the loaded model, passing the C_svd / W_svd
# factors as the initial point.
# NOTE(review): opt_experiment presumably comes from `from functions import *`;
# the meaning of each option is defined there and is not visible in this
# notebook. d=100 matches the k=100 used for the SVD initialization.
opt_experiment(
    model_enwik,
    mode='PS',
    d=100,
    eta=1e-6,
    MAX_ITER=100,
    from_iter=0,
    start_from='SVD',
    init=(True, C_svd, W_svd)
)

In [None]:
# NOTE(review): this looks like a leftover scratch cell - consider removing it.
# Written as a call so that it parses on both Python 2 and Python 3.
print(5)

# Indexing word pairs from all datasets

In [8]:
# Look through all datasets (wordsim353, men, mturk etc.) and find all word
# pairs whose two words both occur in model_enwik.vocab. Save their indices
# from model_enwik.vocab to provide easy access to them in experiments.
# USE THIS SCRIPT ONLY IF YOU ADDED SOME NEW DATASETS to datasets/
#
# Every datasets/<name>.csv is read as a ';'-delimited file without a header
# row: columns 0 and 1 hold the two words, column 2 holds a numeric value.
# For each file three arrays are stored in datasets/indices.npz:
#   '0<name>' - vocabulary indices of the first words,
#   '1<name>' - vocabulary indices of the second words,
#   '2<name>' - the column-2 values as float64.
# Words are lower-cased before the lookup; a pair is skipped when either word
# is missing from the vocabulary.

names = []
arrs = []
model_dict = model_enwik.vocab  # does not change between files, so look it up once

for filename in os.listdir('datasets'):

    if not filename.endswith('.csv'):
        continue

    name = filename[:-4]
    # Three keys per dataset, in the same order as the arrays appended below.
    names.extend([str(k) + name for k in range(3)])

    dataset = pd.read_csv('datasets/' + filename, header=None, delimiter=';').values
    ind1 = []
    ind2 = []
    vec2 = []
    for row in dataset:
        word1 = row[0].lower()
        word2 = row[1].lower()
        if word1 in model_dict and word2 in model_dict:
            ind1.append(int(model_dict[word1]))
            ind2.append(int(model_dict[word2]))
            vec2.append(np.float64(row[2]))

    # Explicit dtypes: without them an empty list would become a float64
    # array, which cannot be used as an index array.
    ind1 = np.array(ind1, dtype=int)
    ind2 = np.array(ind2, dtype=int)
    vec2 = np.array(vec2, dtype=np.float64)

    arrs.append(ind1)
    arrs.append(ind2)
    arrs.append(vec2)

# The context manager closes the output file once the archive is written.
with open('datasets/indices.npz', 'wb') as indices_file:
    np.savez(indices_file, **dict(zip(names, arrs)))