In [None]:
!pip install argparse
!pip install json
!pip install codecs
!pip install logging
!pip install os
!pip install warnings
!pip install pqkmeans

In [None]:
# Folder holding the notebook's input files. Change this single value to point
# the notebook at another location.
# NOTE(review): the default looks like a Colab Google Drive mount - confirm.
DATA_DIR = '/content/drive/My Drive/IR'

# Analogy questions file that compute_accuracy hands to evaluate_word_analogies.
test_path = DATA_DIR + '/questions-words_trimmed.txt'
# Embeddings file loaded with KeyedVectors.load_word2vec_format.
model_path = DATA_DIR + '/crawl-300d-50K.vec'

In [4]:
import argparse
import json
import codecs
import logging
import os
import warnings

from sklearn.decomposition import PCA
from gensim.models import KeyedVectors
import pqkmeans

warnings.simplefilter(action='ignore', category=FutureWarning)


def reduce_dimensions_pca(vectors, dimensions=150, random_state=None):
    """Project ``vectors`` onto ``dimensions`` principal components.

    Args:
        vectors: data passed straight to ``PCA.fit_transform``.
        dimensions: value used for PCA's ``n_components`` (default 150).
        random_state: forwarded to ``PCA(random_state=...)``. ``None`` (the
            default) keeps the previous, unseeded behaviour; pass an int to
            make runs repeatable when PCA picks a randomized solver.

    Returns:
        Whatever ``PCA.fit_transform`` returns for ``vectors``.
    """
    pca = PCA(n_components=dimensions, random_state=random_state)
    return pca.fit_transform(vectors)


def product_quantize(vectors, subdims=30, centres=1000):
    """Fit a pqkmeans ``PQEncoder`` on ``vectors`` and round-trip them through it.

    Args:
        vectors: data used both to fit the encoder and to be encoded.
        subdims: passed to the encoder as ``num_subdim`` (default 30).
        centres: passed to the encoder as ``Ks`` (default 1000).

    Returns:
        A 3-tuple ``(reconstructed, codes, codewords)``: the result of
        decoding the codes again, the encoded vectors, and the fitted
        encoder's ``codewords`` attribute.
    """
    pq = pqkmeans.encoder.PQEncoder(num_subdim=subdims, Ks=centres, iteration=40)
    pq.fit(vectors)
    codes = pq.transform(vectors)
    return pq.inverse_transform(codes), codes, pq.codewords


def compute_accuracy(model, questions_path='questions-words.txt', restrict_vocab=50000):
    """Print the word-analogy accuracy of ``model`` as a percentage.

    Args:
        model: object exposing ``evaluate_word_analogies(path, restrict_vocab=...)``
            that returns a pair whose first item is the accuracy score.
        questions_path: analogy questions file. Defaults to the value that
            used to be hard-coded, ``'questions-words.txt'``.
        restrict_vocab: forwarded unchanged to ``evaluate_word_analogies``.
            Defaults to the value that used to be hard-coded, 50000.

    Returns:
        None. The score is only printed, so a notebook cell ending in this
        call shows no extra output value.
    """
    print("Calculating accuracy...")
    accuracy, _ = model.evaluate_word_analogies(questions_path, restrict_vocab=restrict_vocab)
    print("Accuracy: {:f}%".format(accuracy*100))


def save_matrix(file, matrix):
    """Write ``matrix`` to ``file`` as a JSON object.

    The object has two keys: ``"shape"`` (the matrix shape as a list) and
    ``"vectors"`` (the flattened matrix values as a list).

    Args:
        file: path of the file to create or overwrite.
        matrix: array-like object providing ``shape`` and ``flatten().tolist()``.
    """
    payload = dict(shape=list(matrix.shape), vectors=matrix.flatten().tolist())
    with open(file, 'w') as out:
        print("Saving {:s}".format(file))
        out.write(json.dumps(payload))


def human_format(num):
    """Format ``num`` with a thousands suffix, e.g. ``50000`` -> ``'50K'``.

    The value is divided by 1000 until its magnitude drops below 1000 or the
    largest suffix (``'P'``) is reached, then rendered with no decimal places.
    Values too large for the suffix table are expressed in the largest unit
    (e.g. ``10**18`` -> ``'1000P'``) instead of raising ``IndexError``.

    Args:
        num: the number to format.

    Returns:
        The formatted string.
    """
    # NOTE(review): creating "generated" looks unrelated to number formatting;
    # it is kept so the function's existing side effect does not change.
    if not os.path.exists("generated"):
        os.makedirs("generated")

    suffixes = ['', 'K', 'M', 'G', 'T', 'P']
    magnitude = 0
    # Stop at the last suffix so the index below can never run off the list.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    return '%.0f%s' % (num, suffixes[magnitude])


def save_model(path, embedding_size, word_list, codes, centroids, keyed_vectors=None):
    """Write the embeddings, vocabulary, codes and centroids into ``path``.

    Files written inside ``path`` (the directory is created if missing):
      * ``embeddings-<embedding_size>d-<human_format(len(word_list))>.vec`` via
        the model's ``save_word2vec_format``;
      * ``vocab.json`` with ``word_list`` as a JSON array;
      * ``codes.json`` and ``centroids.json`` via ``save_matrix``.

    Args:
        path: output directory.
        embedding_size: number used only to build the embeddings file name.
        word_list: list of words; its length is used in the file name and its
            contents are written to ``vocab.json``.
        codes: matrix passed to ``save_matrix`` for ``codes.json``.
        centroids: matrix passed to ``save_matrix`` for ``centroids.json``.
        keyed_vectors: object whose ``save_word2vec_format(path)`` is called.
            When omitted, the notebook-global ``model`` is used, which is what
            this function always did before the parameter existed.
    """
    if not os.path.exists(path):
        os.makedirs(path)

    # Prefer the explicitly passed model; the global fallback keeps old call
    # sites working but depends on whichever cell last rebound `model`.
    vectors_to_save = model if keyed_vectors is None else keyed_vectors

    model_name = "embeddings-{}d-{}.vec".format(embedding_size, human_format(len(word_list)))
    vectors_to_save.save_word2vec_format(os.path.join(path, model_name))

    vocab_file_path = os.path.join(path, "vocab.json")
    with codecs.open(vocab_file_path, 'w', 'UTF-8') as f:
        print("Saving {:s}".format(vocab_file_path))
        f.write(json.dumps(word_list))

    save_matrix(os.path.join(path, 'codes.json'), codes)
    save_matrix(os.path.join(path, 'centroids.json'), centroids)





Baseline accuracy is computed in the cell below.

In [7]:
from gensim.models import KeyedVectors

def compute_accuracy(model, questions_path=None, restrict_vocab=50000):
    """Print the word-analogy accuracy of ``model`` as a percentage.

    Args:
        model: object exposing ``evaluate_word_analogies(path, restrict_vocab=...)``
            that returns a pair whose first item is the accuracy score.
        questions_path: analogy questions file. When omitted, the
            notebook-global ``test_path`` is used, as before.
        restrict_vocab: forwarded unchanged to ``evaluate_word_analogies``.
            Defaults to the value that used to be hard-coded, 50000.

    Returns:
        None. The score is only printed, so a notebook cell ending in this
        call shows no extra output value.
    """
    if questions_path is None:
        # Looked up at call time so a later change to test_path is honoured.
        questions_path = test_path
    accuracy, _ = model.evaluate_word_analogies(questions_path, restrict_vocab=restrict_vocab)
    print("Accuracy: {:f}%".format(accuracy*100))

# Load the embeddings stored at model_path; `model` is a notebook-global that
# the following cells read and then rebind.
model = KeyedVectors.load_word2vec_format(model_path)
# Compute baseline accuracy on the unmodified embeddings
compute_accuracy(model)

Accuracy: 98.433048%


PCA dimensionality reduction is performed in the cell below.

In [8]:
def reduce_dimensions_pca(vectors, dimensions=150, random_state=None):
    """Project ``vectors`` onto ``dimensions`` principal components.

    Args:
        vectors: data passed straight to ``PCA.fit_transform``.
        dimensions: value used for PCA's ``n_components`` (default 150).
        random_state: forwarded to ``PCA(random_state=...)``. ``None`` (the
            default) keeps the previous, unseeded behaviour; pass an int to
            make runs repeatable when PCA picks a randomized solver.

    Returns:
        Whatever ``PCA.fit_transform`` returns for ``vectors``.
    """
    pca = PCA(n_components=dimensions, random_state=random_state)
    return pca.fit_transform(vectors)

# Keep a handle on the unreduced vectors; the product-quantization and
# size-reduction cells further down read original_embeddings again.
original_embeddings = model.vectors
reduced_embeddings = reduce_dimensions_pca(original_embeddings)

# Create a new model with the reduced embeddings and calculate the accuracy.
# Order matters: the word list is read from the current `model` before the
# name is rebound to the new, reduced-dimension KeyedVectors on the next line.
words = [model.index2word[idx] for idx in range(len(reduced_embeddings))]
model = KeyedVectors(vector_size=reduced_embeddings.shape[1])
model.add(words, reduced_embeddings, replace=True)
compute_accuracy(model)

Accuracy: 96.866097%


Product quantization is performed in the cell below.

In [13]:
import pqkmeans

def product_quantize(vectors, subdims=100, centres=1000):
    """Fit a pqkmeans ``PQEncoder`` on ``vectors`` and round-trip them through it.

    Args:
        vectors: data used both to fit the encoder and to be encoded.
        subdims: passed to the encoder as ``num_subdim`` (default 100).
        centres: passed to the encoder as ``Ks`` (default 1000).

    Returns:
        A 3-tuple ``(reconstructed, codes, codewords)``: the result of
        decoding the codes again, the encoded vectors, and the fitted
        encoder's ``codewords`` attribute.
    """
    pq = pqkmeans.encoder.PQEncoder(num_subdim=subdims, Ks=centres, iteration=40)
    pq.fit(vectors)
    codes = pq.transform(vectors)
    return pq.inverse_transform(codes), codes, pq.codewords

# Quantize the original embeddings (not the PCA-reduced ones) and keep the
# codes and centroids for the size calculation in the next cell.
reconstructed_embeddings, codes, centroids = product_quantize(original_embeddings)

# Compute accuracy of new model built from the reconstructed vectors.
# Order matters: the word list is read from whichever `model` the previous
# cell left bound, before the name is rebound on the next line.
words = [model.index2word[idx] for idx in range(len(reconstructed_embeddings))]
model = KeyedVectors(vector_size=reconstructed_embeddings.shape[1])
model.add(words, reconstructed_embeddings, replace=True)
compute_accuracy(model)

Accuracy: 97.720798%


The size reduction is computed in the cell below.

In [14]:
# In-memory byte counts of the arrays being compared.
original_size = original_embeddings.nbytes
# Recorded for reference only; it does not enter the percentage below.
reduced_size = reduced_embeddings.nbytes
# The compressed representation is the codes plus the centroids.
new_size = codes.nbytes + centroids.nbytes
bytes_saved = original_size - new_size
print("Size reduction: {:f}%".format(bytes_saved * 100 / original_size))

Size reduction: 79.333333%
