# filtering the words in Fasttext SK model
* we leave only words which exist in Slovak dictionaries
* 3 lists of SK words are taken with total ~2mil words
* final model has 1043624 words

In [1]:
# morphology database from https://korpus.sk/morphology_database.html
import re

words = set()

with open('/home/dzon/kajo/spacy_multilang/ma-2015-02-05.txt', 'rt', encoding='utf8') as f:
    temp = f.readlines()
    for line in temp:
        line = line.strip()
        lemma,word,tag = line.split("\t")
        word = re.sub("\W","", word)
        words.add(word.strip().lower())
        

print(len(words))
sorted(list(words))[:20]    

1176372


['a',
 'ab',
 'aba',
 'abakus',
 'abakuse',
 'abakusmi',
 'abakusoch',
 'abakusom',
 'abakusov',
 'abakusu',
 'abakusy',
 'abatiša',
 'abatišami',
 'abatiše',
 'abatiši',
 'abatišiach',
 'abatišiam',
 'abatišou',
 'abatišu',
 'abatíš']

In [2]:
# http://www.sk-spell.sk.cx/slovak-wordlist

with open('/home/dzon/kajo/translations/sk_wordlist.txt', 'rt', encoding='utf8') as f:
    temp = f.readlines()
    save_len = len(temp)
    for i in range(len(temp)):
        temp[i] = temp[i].strip().lower()
        temp[i] = re.sub("\W","", temp[i])
    print(sorted(list(temp))[:20])
    temp = set(temp)
    print(len(temp))
words |= temp
print('combined: %d' % (len(words)))

['a', 'a', 'aba', 'abak', 'abaka', 'abakteriálna', 'abakteriálne', 'abakteriálneho', 'abakteriálnej', 'abakteriálnejšej', 'abakteriálnejšia', 'abakteriálnejšie', 'abakteriálnejšieho', 'abakteriálnejšiemu', 'abakteriálnejšiu', 'abakteriálnejšom', 'abakteriálnejšou', 'abakteriálnejší', 'abakteriálnejších', 'abakteriálnejším']
1021980
combined: 1498877


In [3]:
with open('sk_words_skoro_max.txt', 'w') as f:
    for item in sorted(list(words)):
        f.write("%s\n" % item)

In [4]:
# http://p.brm.sk/sk_wordlist/

temp_words = set()

with open('/home/dzon/kajo/spacy_multilang/sk.txt', 'rt', encoding='utf8') as f:
    temp = f.readlines()
    for line in temp:
        line = line.strip()
        #print(line)
        tmp = line.split(" ")
        #print("-",word,"-")
        word = re.sub("\W","", tmp[0])
        temp_words.add(word.strip().lower())
print(sorted(list(temp_words))[:20]  )
print(len(temp_words))
words_max = words
words_max |= temp_words
print('combined: %d' % (len(words_max)))

['', 'a', 'aa', 'aaa', 'aaaa', 'aaaaa', 'aaaaaa', 'aaaaaaa', 'aaaaaaaa', 'aaaaaaaaaa', 'aaaaaaaaaaa', 'aaaaaaaaaaaaaaaaaaaaa', 'aaaaaaaaaaaaaaaah', 'aaaaaaah', 'aaaaaaahh', 'aaaaaaahhhhhhh', 'aaaaaah', 'aaaaah', 'aaaaahhh', 'aaaaahhhhh']
1657175
combined: 2069285


In [5]:
with open('sk_words_max.txt', 'w') as f:
    for item in sorted(list(words_max)):
        f.write("%s\n" % item)

In [6]:
from gensim.models import Word2Vec, KeyedVectors
import time
import numpy as np
import gzip
import os

In [7]:
# load the big model 
start = time.time()
model = KeyedVectors.load_word2vec_format('/mnt/data/data/models-fasttext/cc.sk.300.vec.bin', binary=True)
print('Finished loading original model %.2f min' % ((time.time()-start)/60))

Finished loading original model 0.82 min


In [8]:
print('word2vec total words: %d' % len(model.vocab))
print('non-phrases: %d' % len([w for w in model.vocab.keys() if '_' not in w]))
print('phrases: %d' % len([w for w in model.vocab.keys() if '_' in w]))

word2vec total words: 2000000
non-phrases: 1999999
phrases: 1


In [9]:
max_suffix_len = 2
min_base_len = 8

indices_to_delete = []
j = 0
suffix_grace_words = 0
for i,w in enumerate(model.index2word):
    l = w.strip().lower()
    found = False
    if l in words:
        found = True
    if found:
        model.vocab[w].index = j
        j += 1
    else:
        del model.vocab[w]
        indices_to_delete.append(i)

model.syn0 = np.delete(model.syn0, indices_to_delete, axis=0)
print('slim: %d' % len(model.vocab))
print('suffix grace words: %d' % (suffix_grace_words))



slim: 1043624
suffix grace words: 0


In [10]:
slim_model_name = '/mnt/data/data/models-fasttext/cc.sk.300-slim.vec.bin'
model.save_word2vec_format(slim_model_name, binary=True)
slim_model_name = '/mnt/data/data/models-fasttext/cc.sk.300-slim.vec'
model.save_word2vec_format(slim_model_name, binary=False)