In [1]:
import os
import tqdm
import numpy as np
import gensim

from collections import defaultdict
from gensim.models.utils_any2vec import ft_ngram_hashes  # This function is used to calculate hashes from ngrams to determine position in ngram matrix

In [2]:
# Load the original fastText binary (embeddings from https://fasttext.cc/).
# `load_facebook_model` is the replacement for the deprecated
# `FastText.load_fasttext_format` classmethod, so the cell no longer triggers
# a deprecation warning. It still returns a full FastText model with a `.wv`.
ft = gensim.models.fasttext.load_facebook_model(
    "/home/ifte/resource/wikiextractor/wikidata/eng/_fasttext.model.bin"
)

# Persist only the keyed vectors (`ft.wv`): the training weights are not saved,
# which keeps the file on disk smaller.
ft.wv.save('/home/ifte/resource/wikiextractor/wikidata/eng/fasttext_gensim.model')

  """Entry point for launching an IPython kernel.


In [3]:
# Rebind `ft` to the keyed vectors saved by the previous cell; from here on
# `ft` is the object loaded from disk, not the full model loaded from the .bin file.
ft = gensim.models.KeyedVectors.load("/home/ifte/resource/wikiextractor/wikidata/eng/fasttext_gensim.model")

In [4]:
# Target sizes for the shrunk embeddings
new_vocab_size = 1000000  # upper bound on the number of vocabulary entries kept
new_ngrams_size = 1000000  # bucket count of the new ngram matrix. NOTE(review): the original note said "GCD of initial"; presumably a divisor of the original bucket count (ft.bucket) is meant — confirm

In [5]:
# Order the vocabulary entries from the highest count to the lowest.
sorted_vocab = sorted(
    ft.vocab.items(),
    key=lambda entry: entry[1].count,
    reverse=True,
)

# Keep at most `new_vocab_size` of the most frequent entries.
top_vocab = {word: info for word, info in sorted_vocab[:new_vocab_size]}

# The vector matrix is truncated with a plain slice below, which is only valid
# if every kept entry already has a row index inside that slice, i.e. the
# vocabulary is already stored in frequency order.
top_vector_ids = [info.index for info in top_vocab.values()]
assert max(top_vector_ids) < new_vocab_size

top_vocab_vectors = ft.vectors_vocab[:new_vocab_size]

In [6]:
# new bucket id -> set of original bucket ids that get folded into it
new_to_old_buckets = defaultdict(set)
# original bucket id -> how many times a vocabulary word's ngram hashed to it
old_hash_count = defaultdict(int)

for word in tqdm.tqdm(ft.vocab.keys()):
    # Hash the word's ngrams twice with identical settings: first against the
    # original bucket count, then against the reduced one.
    old_hashes, new_hashes = (
        ft_ngram_hashes(word, ft.min_n, ft.max_n, num_buckets, fb_compatible=ft.compatible_hash)
        for num_buckets in (ft.bucket, new_ngrams_size)
    )

    # Occurrence counts are used later to weight the merged bucket vectors.
    for source_bucket in old_hashes:
        old_hash_count[source_bucket] += 1

    # Pair each original bucket with the reduced bucket at the same position.
    for source_bucket, target_bucket in zip(old_hashes, new_hashes):
        new_to_old_buckets[target_bucket].add(source_bucket)

100%|██████████| 1094611/1094611 [03:03<00:00, 5959.51it/s]


In [7]:
# Create a new FastText keyed-vectors instance that copies the original's
# vector size, ngram length bounds and hashing mode, but uses the reduced
# number of ngram buckets.
new_ft = gensim.models.keyedvectors.FastTextKeyedVectors(
    vector_size=ft.vector_size,
    min_n=ft.min_n,
    max_n=ft.max_n,
    bucket=new_ngrams_size,
    compatible_hash=ft.compatible_hash
)

# Set the shrunk vocab and vocab vectors.
# NOTE(review): `vectors` is pointed at the same array as `vectors_vocab`
# (the slice of the original `ft.vectors_vocab`), not at `ft.vectors` —
# confirm this is the intended source for in-vocabulary lookups.
new_ft.vectors_vocab = top_vocab_vectors
new_ft.vectors = new_ft.vectors_vocab
new_ft.vocab = top_vocab

# Set ngram vectors.
# `init_ngrams_weights` is called after the vocab is assigned and before any
# row of `vectors_ngrams` is written. NOTE(review): it presumably allocates
# `vectors_ngrams` with seeded random values and may reinitialise other
# weights as well — confirm against the installed gensim version.
new_ft.init_ngrams_weights(42)  # Default random seed
for new_hash, old_buckets in new_to_old_buckets.items():
    # Total occurrence count over all original buckets merged into this new bucket.
    total_sum = sum(old_hash_count[old_hash] for old_hash in old_buckets)
    
    # Weighted average of the original bucket vectors: each weight is that
    # bucket's share of the total count, so the weights sum to 1.
    new_vector = np.zeros(ft.vector_size, dtype=np.float32)
    for old_hash in old_buckets:
        weight = old_hash_count[old_hash] / total_sum
        new_vector += ft.vectors_ngrams[old_hash] * weight
    
    # Only buckets present in the mapping are overwritten; every other row
    # keeps whatever `init_ngrams_weights` left there.
    new_ft.vectors_ngrams[new_hash] = new_vector

In [8]:
# Write the shrunk keyed vectors to disk.
new_ft.save('/home/ifte/resource/wikiextractor/wikidata/eng/shrinked_fasttext.model')