Experiments on wiki dataset including:
* Pretrained Glove
* Our TT model with Glove init for different parameters

In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"  
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]
random.seed(1024)

In [4]:
import sys
sys.path.append('/workspace/tt-pytorch')

In [5]:
import t3nsor as t3

In [7]:
USE_CUDA = torch.cuda.is_available()

FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if USE_CUDA else torch.ByteTensor

In [8]:
device = torch.device('cuda' if USE_CUDA else 'cpu')

# Data loading

In [9]:
def prepare_sequence(seq, word2index):
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return LongTensor(idxs)

def prepare_word(word, word2index):
    return LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]])

In [10]:
data = np.loadtxt("/workspace/data/enwik8.txt", dtype=str, delimiter='.')

corpus = [data[i].split() for i in data.nonzero()[0]]

### Exclude sparse words 

In [82]:
word_count = Counter(flatten(corpus))

In [83]:
MIN_COUNT = 100
exclude = []

In [84]:
for w, c in word_count.items():
    if c < MIN_COUNT:
        exclude.append(w)

In [85]:
len(exclude)

203668

### Prepare train data 

In [86]:
vocab = list(set(flatten(corpus)) - set(exclude))

In [87]:
len(vocab)

9603

In [89]:
word2index = {}
for vo in vocab:
    if word2index.get(vo) is None:
        word2index[vo] = len(word2index)
        
index2word = {v:k for k, v in word2index.items()}

In [90]:
BATCH_SIZE = 256

In [91]:
len(corpus)

889156

In [92]:
WINDOW_SIZE = 5

s = WINDOW_SIZE // 2

train_data = []

for tmp, sentence in enumerate(corpus):
    l = len(sentence)
    for i in range(l):
        for j in range(max(0,i-s), min(i+s+1,l)):
            if (i != j and (sentence[i] in word2index) \
                and (sentence[j] in word2index)):
                c = sentence[j]
                w = sentence[i]
                
                train_data.append((word2index[w], word2index[c]))

In [93]:
train_data = np.array(train_data)

In [94]:
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

### Build Unigram Distribution**0.75 

$$P(w)=U(w)^{3/4}/Z$$

In [22]:
Z = 0.001

In [69]:
word_count = Counter(flatten(corpus))
num_total_words = sum([c for w, c in word_count.items() if w not in exclude])

In [None]:
unigram_table = []

for vo in vocab:
    unigram_table.extend([vo] * int(((word_count[vo]/num_total_words)**0.75)/Z))

In [None]:
print(len(vocab), len(unigram_table))

### Negative Sampling 

In [None]:
def negative_sampling(targets, unigram_table, k):
    batch_size = targets.size(0)
    neg_samples = []
    for i in range(batch_size):
        nsample = []
        target_index = targets[i].item()
        while len(nsample) < k: # num of sampling
            neg = random.choice(unigram_table)
            if word2index[neg] == target_index:
                continue
            nsample.append(neg)
        neg_samples.append(prepare_sequence(nsample, word2index).view(1, -1))
    
    return torch.cat(neg_samples)

# Load Glove

In [40]:
import bcolz, pickle

In [32]:
words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'/workspace/glove/6B.50.dat', mode='w')

with open(f'/workspace/glove/glove.6B.50d.txt', 'rb') as f:
    print("opening file..")
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)

In [38]:
vectors = bcolz.carray(vectors[1:].reshape((400001, 50)), rootdir=f'/workspace/glove/6B.50.dat', mode='w')
vectors.flush()

In [41]:
pickle.dump(words, open(f'/workspace/glove/6B.50_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'/workspace/glove/6B.50_idx.pkl', 'wb'))

In [63]:
vectors = bcolz.open(f'/workspace/glove/6B.50.dat')[:]
words = pickle.load(open(f'/workspace/glove/6B.50_words.pkl', 'rb'))
word2idx = pickle.load(open(f'/workspace/glove/6B.50_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

In [65]:
glove['dick']

array([-0.37085 , -0.072561, -0.44149 ,  0.28066 , -0.16122 ,  0.81921 ,
       -1.1913  , -0.14747 , -0.90533 , -0.91049 , -0.80279 ,  0.52766 ,
       -0.71711 , -0.068202,  0.85043 , -0.14513 ,  0.47629 ,  0.57743 ,
       -0.56997 , -0.59035 , -0.26699 , -0.01678 ,  0.29088 , -0.2056  ,
        0.015951, -1.5403  , -0.10613 ,  0.34189 , -0.42333 , -0.094656,
        0.49382 , -0.19415 , -0.67599 , -0.014195, -0.15302 , -0.4968  ,
       -0.1013  ,  0.45619 , -0.28205 , -0.28014 ,  0.17112 ,  1.057   ,
       -0.99511 , -0.58887 ,  0.72242 ,  0.051293, -0.59036 ,  0.29768 ,
       -0.94829 ,  1.9092  ])

### Glove vocab

In [95]:
len(vocab)

9603

In [97]:
matrix_len = len(vocab)
weights_matrix = np.zeros((matrix_len, 50))
words_found = 0

for i, word in enumerate(vocab):
    try: 
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(50, ))

In [102]:
import pickle
import subprocess
import pandas as pd

In [103]:
def evaluate_embed(vocab, embeds):
    if len(vocab) < embeds.shape[0]:
        embeds = embeds[:len(vocab), :]
        
    d = dict(zip(list(vocab.keys()), embeds))
    
    
    fin = '/workspace/tt-pytorch/tmp1.pkl'
    fout = '/workspace/tt-pytorch/tmp1.csv'
    
    print('Starting pickle')
    with open(fin, 'wb') as f:
        pickle.dump(d, f)
        
    print('Finished pickle')
        
    path = '/workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py'
    script_full = 'python {path} --file {fname} --output {outname}'.format(path=path, 
                                                                 fname=fin, 
                                                                 outname=fout)
    
    print(script_full)
    
    subprocess.run([script_full], shell=True)
    df = pd.read_csv(fout)

    return df

In [204]:
df = evaluate_embed(word2index, weights_matrix)

Starting pickle
Finished pickle
python /workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py --file /workspace/tt-pytorch/tmp1.pkl --output /workspace/tt-pytorch/tmp1.csv


In [206]:
df[['MEN', 'MTurk', 'SimLex999']]

Unnamed: 0,MEN,MTurk,SimLex999
0,0.318493,0.507134,0.20228


In [221]:
word2idx_cut = dict(zip(list(word2idx.keys())[:-1], np.arange(40000)))

### TT ranks 

In [177]:
embds = np.zeros((9800, 50))

In [178]:
embds[:9603,:] = weights_matrix

In [209]:
ranks = [16, 32, 64, 128, 256, 512]

In [210]:
results = []

In [211]:
for rank in ranks:
    tt_embds = t3.to_tt_matrix(torch.Tensor(embds).to(device), 
                               shape=[[8,25,49], [5,5,2]], max_tt_rank=rank)
    
    embds_full = tt_embds.full()
    embds_cut = embds_full.detach().cpu().numpy()[:9603, :]
    
    df = evaluate_embed(word2index, embds_cut)
    cur_result = np.array(df[['MEN', 'MTurk', 'SimLex999']])
    results.append(cur_result)
    print(rank, cur_result)

Starting pickle
Finished pickle
python /workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py --file /workspace/tt-pytorch/tmp1.pkl --output /workspace/tt-pytorch/tmp1.csv
2 [[ 0.00129656 -0.111136    0.01444499]]
Starting pickle
Finished pickle
python /workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py --file /workspace/tt-pytorch/tmp1.pkl --output /workspace/tt-pytorch/tmp1.csv
4 [[ 0.01397857 -0.01816318  0.03506379]]
Starting pickle
Finished pickle
python /workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py --file /workspace/tt-pytorch/tmp1.pkl --output /workspace/tt-pytorch/tmp1.csv
8 [[-0.00404109  0.10894443  0.04813412]]
Starting pickle
Finished pickle
python /workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py --file /workspace/tt-pytorch/tmp1.pkl --output /workspace/tt-pytorch/tmp1.csv
16 [[0.00463707 0.05176925 0.07168481]]
Starting pickle
Finished pickle
python /workspace/tt-pytorch/w

In [213]:
tt_embds = t3.to_tt_matrix(torch.Tensor(embds).to(device), 
                               shape=[[8,25,49], [5,5,2]], max_tt_rank=64)

In [214]:
print(tt_embds)

A TT-Matrix of size 9800 x 50, underlying tensorshape: [8, 25, 49] x [5, 5, 2], TT-ranks: [1, 40, 64, 1] 
 on device 'cuda:0' with compression rate 1.49


# full

In [235]:
embds.shape

(400000, 50)

In [230]:
embds = vectors[:-1, :]

In [231]:
ranks = [16, 32, 64, 128, 256, 512]

In [232]:
results = []

In [237]:
4 * 100000 == 4 * 10 * 10 * 10 * 10 * 10 

True

In [242]:
for rank in ranks:
    tt_embds = t3.to_tt_matrix(torch.Tensor(embds).to(device), 
                               shape=[[400, 100, 10], [5, 5, 2]], max_tt_rank=rank)
    
    embds_full = tt_embds.full()
    embds_cut = embds_full.detach().cpu().numpy()
    
    df = evaluate_embed(word2idx_cut, embds_cut)
    cur_result = np.array(df[['MEN', 'MTurk', 'SimLex999']])
    results.append(cur_result)
    print(rank, cur_result)

Starting pickle
Finished pickle
python /workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py --file /workspace/tt-pytorch/tmp1.pkl --output /workspace/tt-pytorch/tmp1.csv
16 [[ 0.08090662  0.11002691 -0.11075207]]
Starting pickle
Finished pickle
python /workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py --file /workspace/tt-pytorch/tmp1.pkl --output /workspace/tt-pytorch/tmp1.csv
32 [[ 0.08557271  0.14092777 -0.10232093]]
Starting pickle
Finished pickle
python /workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py --file /workspace/tt-pytorch/tmp1.pkl --output /workspace/tt-pytorch/tmp1.csv
64 [[ 0.07682954  0.11284035 -0.09828222]]
Starting pickle
Finished pickle
python /workspace/tt-pytorch/word-embeddings-benchmarks/scripts/evaluate_on_all.py --file /workspace/tt-pytorch/tmp1.pkl --output /workspace/tt-pytorch/tmp1.csv
128 [[ 0.09364144  0.14017365 -0.10043336]]
Starting pickle
Finished pickle
python /workspace/tt-py

In [239]:
embds_cut.shape

(400000, 50)

In [241]:
len(word2idx_cut)

40000