# fastText embeddings

Load fastText embeddings using the fastText library

In [None]:
# Setup
import fasttext
import fasttext.util

In [None]:
# Absolute, machine-specific locations of the Norwegian fastText files;
# adjust these to wherever the files live on your system.
bin_file = '/media/fero/Programs/nlp/embeddings/norwegian/cc.no.300.bin'  # binary model
vec_file = '/media/fero/Programs/nlp/embeddings/norwegian/cc.no.300.vec'  # text-format vectors

## Load from .bin

In [None]:
# Load the binary fastText model from `bin_file` and print the dimensionality
# it reports. The value comes from get_dimension(), so despite the "shape"
# label it is presumably a single integer rather than an array shape — confirm.
ft_bin = fasttext.load_model(bin_file)
print(f'Original shape: {ft_bin.get_dimension()}')

In [None]:
# Reduce the model to 100 dimensions. The return value of reduce_model is
# discarded and the dimension is re-read from the same `ft_bin` object, so
# this cell relies on the model being modified in place.
fasttext.util.reduce_model(ft_bin, 100)
print(f'New shape: {ft_bin.get_dimension()}')

## Load from .vec (text format)

In [None]:
# Try to load the text-format vectors from `vec_file` with the same loader
# used for the binary model, then show the reported dimensionality (the bare
# last expression is what the notebook displays as the cell output).
# NOTE(review): fasttext.load_model is documented for binary (.bin) models;
# passing the text .vec file is expected to fail with a file-format error —
# confirm, and if so parse this file as plain text instead.
ft_vec = fasttext.load_model(vec_file)
ft_vec.get_dimension()

In [None]:
# Reduce `ft_vec` to 100 dimensions. As with the binary model, the return
# value of reduce_model is discarded, so the cell relies on `ft_vec` being
# modified in place; the bare last expression displays the new dimension.
fasttext.util.reduce_model(ft_vec, 100)
ft_vec.get_dimension()

## Inspect

In [None]:
# Inspect a single example word using the `ft_vec` model: print the shape of
# its word vector and the nearest neighbours the model returns for it.
print('Word: "fantastisk"')
print(f"Shape: {ft_vec.get_word_vector('fantastisk').shape}")
print(f"Nearest neighbours: {ft_vec.get_nearest_neighbors('fantastisk')}")

Using the binary models, vectors for out-of-vocabulary
words can be obtained with

`$ ./fasttext print-word-vectors cc.no.300.bin < oov_words.txt`

In the text format, the first line is a header giving the number of
words and the vector dimension. Each following line contains a word
followed by its vector; the values are space-separated, and words
are sorted by frequency in descending order.

These text models can easily be loaded in Python using
the following code:

In [None]:
import io

def load_vectors(fname):
    """Load text-format (.vec) word vectors into a dictionary.

    The first line of the file is a header holding two integers (in the
    fastText text format: the number of words and the vector dimension).
    Every following line is a word followed by its space-separated vector
    components.

    Args:
        fname: Path to the text vector file.

    Returns:
        A dict mapping each word to its vector as a list of floats.

    Raises:
        ValueError: If the header is not exactly two integers, or a vector
            component cannot be parsed as a float.
        OSError: If the file cannot be opened.
    """
    data = {}
    # The context manager closes the file even if parsing raises part-way.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # The header is parsed only to validate the file format; the counts
        # are not needed because the dict is built from the remaining lines.
        _count, _dim = map(int, fin.readline().split())
        for line in fin:
            stripped = line.rstrip()
            if not stripped:
                # A blank (e.g. trailing) line would otherwise create a
                # bogus entry keyed by the empty string.
                continue
            tokens = stripped.split(' ')
            # Materialise the floats: a bare map() is a one-shot iterator in
            # Python 3, so a stored vector could be read only once and could
            # not be indexed or measured with len().
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data