In [1]:
import numpy as np
from src.embedding_manager import EmbeddingManager

# Load the pretrained word embeddings through the project's EmbeddingManager.
# The recorded output below reports the resulting matrix shape as (629762, 300).
# NOTE(review): this input path says 'wiki-new' while the output file written at
# the end of the notebook says 'wiki-news' — confirm the spelling is intended.
em = EmbeddingManager(path='../fasttext.wiki-new-300d-1M.vec')

100%|██████████| 999995/999995 [01:26<00:00, 11521.43it/s]


Total embeddings shape: (629762, 300)


# Perform cleaning according to the analysis conducted in embeddings-analysis.ipynb

In [2]:
def filter_values(array, limits, title):
    """Flag the values of ``array`` that fall outside the interval ``limits``.

    Args:
        array: Values to check; compared element-wise against both bounds.
        limits: Indexable pair ``(lower, upper)``. Values equal to a bound
            are NOT flagged (the comparisons are strict).
        title: Label inserted into the printed summary line.

    Returns:
        Boolean mask with the same shape as ``array``: True where the value
        is below ``limits[0]`` or above ``limits[1]``.

    Side effects:
        Prints one summary line with the number of flagged values.
    """
    lower = limits[0]
    upper = limits[1]

    below_lower = array < lower
    above_upper = array > upper
    outside_mask = below_lower | above_upper

    # Selecting the flagged values and taking the length gives the count.
    outside_count = array[outside_mask].shape[0]
    print(f'Filtered out by {title} in {limits}: {outside_count}')
    return outside_mask

In [3]:
# Per-dimension limits: a dimension (column of em.vectors) is flagged when any
# of its column-wise statistics falls outside the matching interval.
mean_dim_limits = (-0.1, 0.1)
median_dim_limits = (-0.1, 0.1)
max_dim_limits = (0, 2)
min_dim_limits = (-2, 0)

# Column-wise statistics: axis=0 reduces over the rows, leaving one value per column.
mean_values_per_dim = np.mean(em.vectors, axis=0)
median_values_per_dim = np.median(em.vectors, axis=0)
max_values_per_dim = np.max(em.vectors, axis=0)
min_values_per_dim = np.min(em.vectors, axis=0)

# One mask per statistic; each call also prints its own summary line,
# so the call order (mean, median, max, min) fixes the output order.
mean_dim_filtered_out_idx = filter_values(
    mean_values_per_dim, mean_dim_limits, 'mean per dimension')
median_dim_filtered_out_idx = filter_values(
    median_values_per_dim, median_dim_limits, 'median per dimension')
max_dim_filtered_out_idx = filter_values(
    max_values_per_dim, max_dim_limits, 'max per dimension')
min_dim_filtered_out_idx = filter_values(
    min_values_per_dim, min_dim_limits, 'min per dimension')

# A dimension is dropped if ANY of the four checks flagged it.
total_dimension_filter_idx = np.logical_or.reduce([
    mean_dim_filtered_out_idx,
    median_dim_filtered_out_idx,
    max_dim_filtered_out_idx,
    min_dim_filtered_out_idx,
])
print(f'Total filtered out dimensions count: {np.count_nonzero(total_dimension_filter_idx)}')

Filtered out by mean per dimension in (-0.1, 0.1): 4
Filtered out by median per dimension in (-0.1, 0.1): 3
Filtered out by max per dimension in (0, 2): 17
Filtered out by min per dimension in (-2, 0): 21
Total filtered out dimensions count: 35


In [4]:
# Drop the dimensions flagged by the per-dimension filter. A boolean mask on
# the column axis selects the kept columns directly, replacing the former
# swapaxes -> index -> swapaxes round trip.
# NOTE: this cell overwrites em.vectors, so run it once per fresh kernel;
# on a second run the mask length no longer matches the column count.
em.vectors = em.vectors[:, ~total_dimension_filter_idx]
print(em.vectors.shape)

# reduce dimensions a bit more
from sklearn.decomposition import PCA
pca = PCA(n_components=256)
em.vectors = pca.fit_transform(em.vectors)
print(em.vectors.shape)

(629762, 265)
(629762, 256)


In [5]:
# Per-entry limits: an entry (row of em.vectors) is flagged when any of its
# row-wise statistics falls outside the matching interval.
# NOTE(review): these limits are applied to the PCA-transformed vectors
# produced earlier in the notebook — confirm they were derived on the same
# representation.
mean_limits = (-0.03, 0.03)
median_limits = (-0.04, 0.04)
max_limits = (0, 1)
min_limits = (-1, 0)

# Row-wise statistics: axis=1 reduces over the columns, leaving one value per row.
mean_values = np.mean(em.vectors, axis=1)
median_values = np.median(em.vectors, axis=1)
max_values = np.max(em.vectors, axis=1)
min_values = np.min(em.vectors, axis=1)

# One mask per statistic; each call prints its own summary line, so the call
# order (mean, median, max, min) fixes the output order.
mean_filtered_out_idx = filter_values(mean_values, mean_limits, 'mean')
median_filtered_out_idx = filter_values(median_values, median_limits, 'median')
max_filtered_out_idx = filter_values(max_values, max_limits, 'max')
min_filtered_out_idx = filter_values(min_values, min_limits, 'min')

# An entry is dropped if ANY of the four checks flagged it.
total_entry_filter_idx = np.logical_or.reduce([
    mean_filtered_out_idx,
    median_filtered_out_idx,
    max_filtered_out_idx,
    min_filtered_out_idx,
])
print(f'Total filtered out entries count: {np.count_nonzero(total_entry_filter_idx)}')
print(em.words[total_entry_filter_idx])

# Apply the same keep-mask to vectors and words so they stay aligned.
kept_entries_idx = ~total_entry_filter_idx
em.vectors = em.vectors[kept_entries_idx]
em.words = em.words[kept_entries_idx]

Filtered out by mean in (-0.03, 0.03): 201
Filtered out by median in (-0.04, 0.04): 105
Filtered out by max in (0, 1): 67
Filtered out by min in (-1, 0): 38
Total filtered out entries count: 226
['rivera' 'tulsa' 'uniting' 'في' 'docking' 'psychopathic' 'workhouse'
 'rezko' 'olya' 'aacn' 'arpaio' 'micron' 'transducers' 'serait' 'nauvoo'
 'sofla' 'valea' 'roxana' 'taliesin' 'vha' 'gosnell' 'disestablishments'
 'alwyn' 'motorhead' 'dunhill' 'limehouse' 'hoodoo' 'rankersbo'
 'michaelmas' 'epd' 'ihre' 'bruegel' 'seamount' 'keener' 'pratyya'
 'molise' 'rovio' 'atrophied' 'vaslui' 'sogdian' 'mairead' 'bostic'
 'adaag' 'wurzelbacher' 'almshouses' 'ahwaz' 'sobotka' 'jeremias' 'krasny'
 'timurid' 'kamau' 'extemporaneous' 'adlai' 'dawei' 'groveling' 'novák'
 'slatkin' 'rrt' '2606' 'riya' 'rouses' 'monotonically' 'lawfulness'
 'municipio' 'hicham' 'gessler' 'sakari' 'sobered' 'hamley' 'impassively'
 'hưng' 'stian' 'skelos' 'outremont' 'qic' 'arnolfini' 'aruch' 'corradini'
 'mainzer' 'lubomir' 'bla

## Save filtered embeddings

In [11]:
# Write the cleaned embeddings as text: one entry per line, the word followed
# by its space-separated vector components.
# zip() stops at the shorter input, so check alignment first instead of
# silently writing a truncated file.
assert len(em.words) == len(em.vectors), 'words and vectors are out of sync'

# Mode 'w' (not 'a'): re-running this cell must replace the file. In append
# mode every re-run added another full copy of all rows to the output.
with open('../fasttext.wiki-news-cleaned-256d.vec', 'w', encoding='UTF-8') as vec_file:
    for word, vector in zip(em.words, em.vectors):
        row = word + ' ' + ' '.join(map(str, vector))
        vec_file.write(row + '\n')