# Get InferSent embeddings of the Quora dataset

In [2]:
!git clone https://github.com/facebookresearch/InferSent.git

Cloning into 'InferSent'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 259 (delta 7), reused 13 (delta 4), pack-reused 240[K
Receiving objects: 100% (259/259), 448.95 KiB | 340.00 KiB/s, done.
Resolving deltas: 100% (131/131), done.


In [1]:
!ls

CODE_OF_CONDUCT.md   LICENSE
compute.ipynb	     models.py
CONTRIBUTING.md      __pycache__
demo.ipynb	     quora_text_test_set_infersent.pth
encoder		     quora_text_train_set_infersent.pth
extract_features.py  quora_text_valid_set_infersent.pth
fastText	     README.md
GloVe		     samples.txt


In [2]:
import torch
from models import InferSent
# from utils import QUORA_TEXT_TRAIN_PATH, QUORA_TEXT_VALID_PATH, QUORA_TEXT_TEST_PATH

In [3]:
# InferSent model version; it selects the checkpoint file name and is also
# passed to the model as the 'version' parameter.
V = 2
MODEL_PATH = 'encoder/infersent%s.pkl' % V
# Constructor parameters for the encoder (batch size, word-embedding width,
# LSTM width, pooling type, dropout, model version).
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
infersent.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [4]:
infersent.cuda()

InferSent(
  (enc_lstm): LSTM(300, 2048, bidirectional=True)
)

In [5]:
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [6]:
infersent.build_vocab_k_words(K=100000)

Vocab size : 100000


In [8]:
# Load the train-split sentences.
# NOTE(review): QUORA_TEXT_TRAIN_PATH is not defined anywhere in the visible notebook;
# the only `from utils import ...` line that names it is commented out in the import
# cell, so this cell raises NameError on a fresh kernel unless it is provided elsewhere.
sentences = torch.load('../' + QUORA_TEXT_TRAIN_PATH)

In [8]:
# Load the validation-split sentences (overwrites any previous `sentences`).
# NOTE(review): QUORA_TEXT_VALID_PATH is not defined anywhere in the visible notebook;
# the only `from utils import ...` line that names it is commented out in the import
# cell, so this cell raises NameError on a fresh kernel unless it is provided elsewhere.
sentences = torch.load('../' + QUORA_TEXT_VALID_PATH)

In [7]:
# Load the test-split sentences from a literal path (overwrites any previous
# `sentences`). The recorded outputs that follow (a list of 60000 strings)
# correspond to this load.
sentences = torch.load('../' + 'datasets/quora_paraphrase/quora_text_test.pth')

In [8]:
type(sentences), len(sentences)

(list, 60000)

In [9]:
sentences[:10]

['What is the best English translation of the Bhagavad Gita?',
 'Which is the best English version of Bhagavad-Gita?',
 'Quora kept refreshing on its own. Is this a normal thing or is it just me?',
 'Why does Quora keep refreshing the page?',
 'What is it like to study in McGill University?',
 'What is it like to study at McGill University?',
 'Has Ancient Persia been scientifically tested?',
 'Have Ancient Akkadians been scientifically tested?',
 'How can I see who my boyfriend views on instagram?',
 'Can you see who who viewed your videos on Instagram?']

In [10]:
# Encode every sentence with InferSent (with its tokenization step enabled).
# Per the recorded outputs below, the result is a float32 numpy.ndarray with
# one 4096-wide row per sentence.
embeddings = infersent.encode(sentences, tokenize=True)

In [11]:
type(embeddings)

numpy.ndarray

In [12]:
embeddings.shape, embeddings.dtype

((60000, 4096), dtype('float32'))

In [13]:
device = torch.device('cuda:0')

In [15]:
# Wrap the NumPy embeddings as a tensor and move it to `device` (cuda:0).
t1 = torch.from_numpy(embeddings).to(device)

In [16]:
t1.size(), t1.dtype, t1.device

(torch.Size([60000, 4096]), torch.float32, device(type='cuda', index=0))

In [15]:
# Persist the test-split embeddings.
# NOTE(review): `t1` is on cuda:0 at this point, so the file stores a CUDA tensor;
# loading it on a machine without that device presumably needs
# `torch.load(..., map_location=...)` — consider saving `t1.cpu()` instead.
torch.save(t1, 'quora_text_test_set_infersent.pth')

# Calculate mean and covariance matrices

In [1]:
import torch

In [2]:
# Load the saved train-split embeddings onto the GPU and display shape/dtype.
# NOTE(review): this cell and the two after it all assign `samples`; only the
# last one executed determines which split the statistics cells operate on.
samples = torch.load('quora_text_train_set_infersent.pth').cuda()
samples.size(), samples.dtype

(torch.Size([200000, 4096]), torch.float32)

In [2]:
# Load the saved validation-split embeddings onto the GPU and display shape/dtype.
# Overwrites any previously loaded `samples`.
samples = torch.load('quora_text_valid_set_infersent.pth').cuda()
samples.size(), samples.dtype

(torch.Size([38526, 4096]), torch.float32)

In [2]:
# Load the saved test-split embeddings onto the GPU and display shape/dtype.
# Overwrites any previously loaded `samples`; in top-to-bottom order this is the
# last assignment before the mean/covariance cells.
samples = torch.load('quora_text_test_set_infersent.pth').cuda()
samples.size(), samples.dtype

(torch.Size([60000, 4096]), torch.float32)

In [3]:
# Mean over dim 0 (the row/sample axis) of whichever split is currently in `samples`:
# one value per embedding dimension.
quora_test_mean = samples.mean(dim=0)

In [4]:
quora_test_mean

tensor([ 0.0075, -0.0283,  0.0634,  ...,  0.0150,  0.0167, -0.0117],
       device='cuda:0')

In [5]:
# Center the samples: unsqueeze(0) turns the mean into a single row so it
# broadcasts across every row of `samples`.
# NOTE(review): the recorded output has 200000 rows, which matches the train file's
# size rather than the 60000-row test file, so the `quora_test_*` names appear stale
# for that run — confirm which split was intended.
errors = samples - quora_test_mean.unsqueeze(0)
errors.size(), errors.dtype

(torch.Size([200000, 4096]), torch.float32)

In [6]:
# Covariance of the centered samples: errors^T @ errors divided by the number of
# rows (normalised by N, not N - 1). The second line displays its shape and dtype.
quora_test_covariance = torch.mm(errors.t(), errors) / errors.size(0)
quora_test_covariance.size(), quora_test_covariance.dtype

(torch.Size([4096, 4096]), torch.float32)

In [7]:
# Persist the statistics as a dict with keys 'mean' and 'cov'.
# NOTE(review): the variables are named `quora_test_*` but the file name says
# "train"; which split actually ends up in this file depends on which `samples`
# load cell ran last — confirm and make the names and file name agree.
torch.save(dict(mean=quora_test_mean, cov=quora_test_covariance), 'quora_text_train_set_infersent_mean_and_covariance.pth')

# Testing

In [2]:
import torch

In [4]:
torch.diag(torch.arange(4))

tensor([[0, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 2, 0],
        [0, 0, 0, 3]])

In [5]:
def _symmetric_matrix_square_root(mat, eps=1e-10):
    u, s, v = torch.svd(mat)
    si = torch.where(s < eps, s, torch.sqrt(s))
    return torch.mm(torch.mm(u, torch.diag(si)), v.t())

In [None]:
def trace_sqrt_product(sigma, sigma_v):
    """Trace of the matrix square root of `sqrt(sigma) @ sigma_v @ sqrt(sigma)`.

    Args:
        sigma: 2-D tensor whose square root is taken first via
            `_symmetric_matrix_square_root`.
        sigma_v: 2-D tensor multiplied on both sides by that square root;
            must be shape-compatible with `sigma` for `torch.mm`.

    Returns:
        A scalar tensor: the trace of the square root of the sandwiched product.
    """
    root_sigma = _symmetric_matrix_square_root(sigma)
    # Build sqrt(sigma) @ (sigma_v @ sqrt(sigma)), inner product first.
    right_product = torch.mm(sigma_v, root_sigma)
    sandwiched = torch.mm(root_sigma, right_product)
    return torch.trace(_symmetric_matrix_square_root(sandwiched))