In [1]:
import numpy as np
from src.embedding_manager import EmbeddingManager

# Source embeddings file; the relative path is resolved against the current working directory.
EMBEDDINGS_PATH = '../fasttext.wiki-news-300d-1M.vec'

# The cells below read and overwrite `em.vectors` and `em.words`.
em = EmbeddingManager(path=EMBEDDINGS_PATH)

100%|██████████| 999995/999995 [01:27<00:00, 11488.76it/s]


Total embeddings shape: (629762, 300)


# Perform cleaning according to the analysis conducted in embeddings-analysis.ipynb

In [2]:
def filter_values(array, limits, title):
    """Flag the values of ``array`` that lie outside the closed range ``limits``.

    A one-line summary with the number of flagged values is printed.

    Args:
        array: NumPy array of numeric values to check.
        limits: Pair ``(lower, upper)``. Both bounds are inclusive, so a value
            equal to a bound is kept.
        title: Label for the checked statistic; used only in the printed summary.

    Returns:
        Boolean mask with the same shape as ``array``. ``True`` marks a value
        that should be filtered out: below ``lower``, above ``upper``, or NaN.
    """
    lower, upper = limits[0], limits[1]
    # Negate the "inside the range" test instead of OR-ing two "outside" tests:
    # every comparison with NaN is False, so this form flags NaN as out of
    # range rather than silently letting it through.
    filtered_out_idx = ~((array >= lower) & (array <= upper))
    # Count directly on the mask; no need to materialize the filtered values.
    print(f'Filtered out by {title} in {limits}: {np.count_nonzero(filtered_out_idx)}')
    return filtered_out_idx

In [3]:
# Acceptance ranges for statistics computed over each single embedding (row)
mean_limits = (-0.03, 0.03)
median_limits = (-0.04, 0.04)
max_limits = (0, 1)
min_limits = (-1, 0)

# For every statistic: reduce over the components of each vector (axis=1),
# then flag the entries whose statistic falls outside its range.
mean_values = em.vectors.mean(axis=1)
mean_filtered_out_idx = filter_values(mean_values, mean_limits, 'mean')

median_values = np.median(em.vectors, axis=1)
median_filtered_out_idx = filter_values(median_values, median_limits, 'median')

max_values = em.vectors.max(axis=1)
max_filtered_out_idx = filter_values(max_values, max_limits, 'max')

min_values = em.vectors.min(axis=1)
min_filtered_out_idx = filter_values(min_values, min_limits, 'min')

# An entry is dropped as soon as any one of the four checks flags it
total_entry_filter_idx = np.logical_or.reduce((
    mean_filtered_out_idx,
    median_filtered_out_idx,
    max_filtered_out_idx,
    min_filtered_out_idx,
))
print(f'Total filtered out entries count: {np.count_nonzero(total_entry_filter_idx)}')

# Apply one and the same keep-mask to vectors and words so they stay aligned
kept_entry_idx = ~total_entry_filter_idx
em.vectors = em.vectors[kept_entry_idx]
em.words = em.words[kept_entry_idx]

Filtered out by mean in (-0.03, 0.03): 118
Filtered out by median in (-0.04, 0.04): 4
Filtered out by max in (0, 1): 530
Filtered out by min in (-1, 0): 18219
Total filtered out entries count: 18619


In [4]:
# Acceptance ranges for statistics computed over each embedding dimension (column)
mean_dim_limits = (-0.1, 0.1)
median_dim_limits = (-0.1, 0.1)
max_dim_limits = (0, 2)
min_dim_limits = (-2, 0)

# For every statistic: reduce over all entries (axis=0), then flag the
# dimensions whose statistic falls outside its range.
mean_values_per_dim = em.vectors.mean(axis=0)
mean_dim_filtered_out_idx = filter_values(mean_values_per_dim, mean_dim_limits, 'mean per dimension')

median_values_per_dim = np.median(em.vectors, axis=0)
median_dim_filtered_out_idx = filter_values(median_values_per_dim, median_dim_limits, 'median per dimension')

max_values_per_dim = em.vectors.max(axis=0)
max_dim_filtered_out_idx = filter_values(max_values_per_dim, max_dim_limits, 'max per dimension')

min_values_per_dim = em.vectors.min(axis=0)
min_dim_filtered_out_idx = filter_values(min_values_per_dim, min_dim_limits, 'min per dimension')

# A dimension is flagged as soon as any one of the four checks flags it.
# The mask is only computed and reported in this cell, not applied.
total_dimension_filter_idx = np.logical_or.reduce((
    mean_dim_filtered_out_idx,
    median_dim_filtered_out_idx,
    max_dim_filtered_out_idx,
    min_dim_filtered_out_idx,
))
print(f'Total filtered out dimensions count: {np.count_nonzero(total_dimension_filter_idx)}')

Filtered out by mean per dimension in (-0.1, 0.1): 4
Filtered out by median per dimension in (-0.1, 0.1): 3
Filtered out by max per dimension in (0, 2): 1
Filtered out by min per dimension in (-2, 0): 0
Total filtered out dimensions count: 4


In [5]:
# Drop the flagged dimensions: transpose so that dimensions lie on axis 0,
# keep the unflagged ones, then transpose back to (entries, dimensions).
kept_dimension_idx = ~total_dimension_filter_idx
em.vectors = em.vectors.T[kept_dimension_idx].T
print(em.vectors.shape)

# reduce dimensions a bit more
from sklearn.decomposition import PCA
TARGET_DIMENSIONS = 290
pca = PCA(n_components=TARGET_DIMENSIONS)
# Fit on the cleaned vectors and replace them with their PCA projection
em.vectors = pca.fit_transform(em.vectors)
print(em.vectors.shape)

(611143, 296)
(611143, 290)


## Save filtered embeddings

In [6]:
from tqdm import tqdm

# Limit every component to 5 decimal places before serializing
rounded = em.vectors.round(5)
entry_count, dimension_count = em.vectors.shape

with open('../fasttext.wiki-news-cleaned-290d.vec', 'w', encoding='UTF-8') as vec_file:
    # Header line: "<number of entries> <number of dimensions>"
    vec_file.write(f'{entry_count} {dimension_count}\n')
    # One line per entry: the word followed by its space-separated components
    for row_idx, word in enumerate(tqdm(em.words)):
        components = ' '.join(map(str, rounded[row_idx]))
        vec_file.write(f'{word} {components}\n')

100%|██████████| 611143/611143 [01:47<00:00, 5685.48it/s]
