In [1]:
import fasttext
import fasttext.util

# Download pre-trained English model (966M tokens)
fasttext.util.download_model('en', if_exists='ignore')
ft_model = fasttext.load_model('cc.en.300.bin')

# Get vector with subword information
word_vector = ft_model.get_word_vector("gonna")
# Also works for OOV words via character n-grams
oov_vector = ft_model.get_word_vector("cooool")  # Handles informal spellings

In [2]:
words = open('reddit_top.txt').readlines()

In [3]:
words = ['<PAD>','<UNK>', '<EOS>'] + words

In [4]:
import orjson

id2word = open('id2word.json', 'wb')

lookup_1 = {str(i): word.strip() for i, word in enumerate(words)}


In [5]:
id2word.write(orjson.dumps(lookup_1))

166636

In [6]:
lookup_2 = {word.strip(): str(i) for i, word in enumerate(words)}

In [7]:
word2id = open('word2id.json', 'wb')
word2id.write(orjson.dumps(lookup_2))

166636

In [8]:
import numpy as np
# Define special tokens and their vectors
SPECIAL_TOKENS = {
    '<PAD>': np.zeros(300, dtype=np.float16),
    '<EOS>': np.random.normal(0, 0.01, 300).astype(np.float16),
    '<UNK>': np.random.normal(0, 0.1, 300).astype(np.float16)
}

word_vecs = []

# Add special tokens in order
for token in ['<PAD>', '<EOS>', '<UNK>']:
    word_vecs.append(SPECIAL_TOKENS[token])

# Add word vectors (skip special tokens already in the list)
for word in words[3:]:
    word_vecs.append(ft_model.get_word_vector(word.strip()).astype(np.float16))

In [9]:
len(word_vecs)

10003

In [10]:
import numpy as np

# Convert to numpy array (all vectors are now consistent)
embedding_matrix = np.array(word_vecs, dtype=np.float16)

# Verify shape and special tokens
print(f"Embedding matrix shape: {embedding_matrix.shape}")
print(f"<PAD> vector (zeros): {embedding_matrix[0][:5]}")
print(f"<EOS> vector (random): {embedding_matrix[1][:5]}")
print(f"<UNK> vector (random): {embedding_matrix[2][:5]}")

Embedding matrix shape: (10003, 300)
<PAD> vector (zeros): [0. 0. 0. 0. 0.]
<EOS> vector (random): [ 0.00362   0.0031   -0.009895 -0.005432  0.012794]
<UNK> vector (random): [-0.1164   -0.1173    0.05743   0.0249   -0.013824]


In [11]:
np.savez_compressed('embedding_matrix.npz', embedding_matrix=embedding_matrix)

In [13]:

sanity_wrods = [
'i',
'and',
'to',
'the',
'a',
'of',
]

for i, word in enumerate(sanity_wrods):
    print(np.sum(np.absolute( np.asarray(ft_model.get_word_vector(word), dtype=np.float16) - embedding_matrix[i+3])))

0.0
0.0
0.0
0.0
0.0
0.0
