# Create word2vec-style word embeddings from an Elasticsearch index using fastText

Install dependencies

In [None]:
import sys
import re
# Add the parent directory to the module search path (presumably so the
# project-local `nlp_datau` package imported further down resolves — confirm).
sys.path.append("./../")

In [None]:
pip install fasttext

Set variables

In [None]:
# Elasticsearch host and the index whose documents are exported.
es_host = "doccano-dataset-tools-es01"
es_index = "free-text-reports"

# Name of the document field handed to the exporter.
field = "text"

# Plain-text dump of the index; the cleaned copy is derived from this path.
output_txt_file = "/home/jovyan/work/resources/data_ignored/data.txt"

# Destinations for the trained fastText model and its text-format vectors.
output_model_file = "fasttext_rad.bin"
output_vec_file = "fasttext_rad.vec"

Create .txt file of all documents in index

In [None]:
# Export the index to a plain-text file via the project-local IndexToTxt helper.
# NOTE(review): presumably writes the `field` value of one document per line —
# the cleaning step further down reads the file line by line; confirm against
# IndexToTxt.write_index.
from nlp_datau.index_to_txt import IndexToTxt
to_txt = IndexToTxt(es_host=es_host, es_index=es_index)
to_txt.write_index(output_txt_file, field)

Clean text: replace punctuation, split into sentences and lowercase

In [None]:
import spacy

# Load the spaCy pipeline "nl_core_news_sm"; `clean` below uses it to split
# each line into sentences (via `doc.sents`).
nlp = spacy.load("nl_core_news_sm")

In [None]:
from string import punctuation


def replace_punct(string):
    """Replace selected punctuation in *string* with single spaces.

    The substitutions are applied one after another in a fixed order, so a
    later pattern can match text produced by an earlier substitution.  Commas
    and hyphens are only replaced when they are followed by a space, which
    leaves tokens such as ``1,5`` or ``x-ray`` intact.

    Returns an empty string when *string* is ``None``.
    """
    if string is None:
        return ''

    separators = ('&', '#', '?', '!', ':', ';', ', ', '- ', '(', ')', '[', ']')
    for separator in separators:
        if separator not in string:
            continue
        string = string.replace(separator, ' ')
    return string

    
def clean(line):
    """Turn one raw text line into a list of normalised sentences.

    The line is passed through ``replace_punct`` and then through the
    module-level ``nlp`` pipeline, whose ``doc.sents`` provides the sentence
    split.  Each sentence is lowercased, stripped of surrounding whitespace,
    and has trailing ``.`` characters and then trailing ``,`` characters
    removed.

    Returns:
        The sentences that still contain more than two whitespace-separated
        tokens, in document order.
    """
    doc = nlp(replace_punct(line))
    normalised = (
        sent.text.lower().strip().rstrip('.').rstrip(',')
        for sent in doc.sents
    )
    # Requiring more than two tokens also rules out empty strings.
    return [sentence for sentence in normalised if len(sentence.split()) > 2]

# Stream the raw dump through `clean` and write one cleaned sentence per line.
with open(output_txt_file + 'clean.txt', 'w') as out_file, open(output_txt_file) as fp:
    for cnt, line in enumerate(fp):
        cleaned_lines = clean(line)
        # Echo every 1000th input line with its cleaned output as a progress sample.
        if not cnt % 1000:
            print('--')
            print('IN: {}'.format(line))
            print('OUT: {}'.format(cleaned_lines))
        out_file.writelines(cleaned_line + '\n' for cleaned_line in cleaned_lines)
            

Run fasttext

In [None]:
import fasttext

# Train unsupervised skip-gram embeddings on the cleaned sentence file.
model = fasttext.train_unsupervised(output_txt_file + 'clean.txt', model='skipgram')

# Sanity check: print the learned vocabulary and the vector of a sample word.
print(model.words)
print(model['atelectase'])

model.save_model(output_model_file)
# NOTE(review): the fasttext Python model does not appear to provide a
# `save_vectors` method (confirm against the installed version); if so, this
# line raises AttributeError and the .vec file must be written another way.
model.save_vectors(output_vec_file)


In [None]:
import fasttext
from fasttext import load_model

def save_vectors(model, file):
    """Write every word vector of *model* to *file* as plain text.

    Each output line holds a word followed by its vector components, all
    separated by single spaces.  The ``<word count> <dimension>`` summary is
    printed to stdout only; it is not written to *file*.

    Args:
        model: Object providing ``get_words()``, ``get_dimension()`` and
            ``get_word_vector(word)`` (e.g. a loaded fastText model).
        file: Path of the text file to create or overwrite.

    Raises:
        OSError: If writing fails for any reason other than a broken pipe.
    """
    import errno  # imported locally so the function is self-contained

    words = model.get_words()
    print(str(len(words)) + " " + str(model.get_dimension()))
    # UTF-8 keeps non-ASCII vocabulary entries writable regardless of locale.
    with open(file, 'w', encoding='utf-8') as out_file:
        for word in words:
            vector = model.get_word_vector(word)
            row = " ".join([word, *(str(value) for value in vector)])
            try:
                out_file.write(row + '\n')
            except OSError as e:
                # A broken pipe is tolerated; any other I/O failure must surface.
                if e.errno != errno.EPIPE:
                    raise
            
# Reload the saved binary model, export its vectors as text and run a quick
# nearest-neighbour query as a sanity check.
model = fasttext.load_model(output_model_file)
# Use the configured output path instead of repeating the literal file name.
save_vectors(model, output_vec_file)
print(model.get_nearest_neighbors('atelectase'))


In [None]:
# Show the 200 nearest neighbours of a sample query word.
print(model.get_nearest_neighbors('onwaarschijnlijk', k=200))


In [None]:
pip install gensim

In [None]:
from gensim.models.fasttext import load_facebook_model

# Load the saved fastText binary into gensim. Despite its name, `wv` holds the
# object returned by load_facebook_model; the vectors are read via `wv.wv`.
wv = load_facebook_model(output_model_file)


In [None]:
import csv

# Export one row per vocabulary word: the word followed by its vector components.
# newline='' is what the csv module expects of files handed to csv.writer.
with open('wv_embeddings.tsv', 'w', newline='', encoding='utf-8') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    keyed_vectors = wv.wv
    # gensim 4 removed `KeyedVectors.vocab` in favour of `index_to_key`;
    # fall back to `vocab` so older gensim releases keep working.
    words = getattr(keyed_vectors, 'index_to_key', None)
    if words is None:
        words = keyed_vectors.vocab.keys()
    for word in words:
        writer.writerow([word] + keyed_vectors.get_vector(word).tolist())

In [None]:
pip install -q tensorflow


In [None]:
import tensorflow as tf
import datetime, os

logs_base_dir = "./logs"
os.makedirs(logs_base_dir, exist_ok=True)
%tensorboard --logdir {logs_base_dir}