# Create word vectors (word2vec text format) from an Elasticsearch index using fastText

Install dependencies

In [None]:
import sys
import re
# Append the parent directory (relative to the current working directory) to
# the module search path.
# NOTE(review): presumably needed so that the project-local `nlp_datau`
# package imported further down resolves -- confirm.
sys.path.append("./../")

In [None]:
pip install fasttext

Set variables

In [None]:
# Elasticsearch source: host name and index handed to IndexToTxt below.
es_host = "doccano-dataset-tools-es01"
es_index = "free-text-reports"

# Document field that is exported to the text file.
field = "text"

# Raw text dump of the index; the cleaned copy is written to this path with
# 'clean.txt' appended.
output_txt_file = "/home/jovyan/work/resources/data_ignored/data.txt"

# fastText outputs: binary model and textual word vectors.
output_model_file = "fasttext_rad.bin"
output_vec_file = "fasttext_rad.vec"

Create .txt file of all documents in index

In [None]:
from nlp_datau.index_to_txt import IndexToTxt
# Dump the `field` value of the documents in `es_index` into `output_txt_file`.
# NOTE(review): presumably one document per line -- the cleaning cell reads
# the file line by line; confirm against IndexToTxt.write_index.
to_txt = IndexToTxt(es_host=es_host, es_index=es_index)
to_txt.write_index(output_txt_file, field)

Clean text: split each document into lowercased sentences

In [None]:
import spacy

# Load the spaCy pipeline `nl_core_news_sm`; `nlp` is used by `clean` below to
# split each document into sentences.
# NOTE(review): the pipeline package has to be installed separately (it is
# not installed by any cell of this notebook) -- confirm the environment.
nlp = spacy.load("nl_core_news_sm")

In [None]:
from string import punctuation


def replace_punct(string):
    """Replace a fixed set of punctuation marks in *string* with a space.

    Args:
    string: text to process, or None

    Returns:
    '' when *string* is None, otherwise the text with every listed separator
    replaced by a single space. A comma or hyphen is only replaced when it is
    followed by a space, so '1,5' and 'a-b' are left intact.
    """
    if string is None:
        return ''

    # Applied strictly in this order: an earlier replacement can create a new
    # ', ' or '- ' occurrence that a later step then removes.
    separators = ('&', '#', '?', '!', ':', ';', ', ', '- ', '(', ')', '[', ']')
    cleaned = string
    for separator in separators:
        if separator not in cleaned:
            continue
        cleaned = cleaned.replace(separator, ' ')
    return cleaned

    
def clean(line):
    """Split *line* into normalised sentences.

    Punctuation is first replaced via `replace_punct`, then the spaCy pipeline
    `nlp` splits the text into sentences. Each sentence is lowercased,
    stripped of surrounding whitespace, and stripped of trailing '.'
    characters followed by trailing ',' characters.

    Args:
    line: raw text of one document line (may be None)

    Returns:
    list of cleaned sentences; sentences with fewer than three
    whitespace-separated tokens are dropped
    """
    doc = nlp(replace_punct(line))

    lines = []
    for sent in doc.sents:
        # same normalisation order as before: lower, strip, '.' then ','
        line_sentence = sent.text.lower().strip().rstrip('.').rstrip(',')
        # more than two tokens also implies a non-empty sentence
        if len(line_sentence.split()) > 2:
            lines.append(line_sentence)
    return lines

# Stream the raw dump line by line, clean every line and write one cleaned
# sentence per line to '<output_txt_file>clean.txt'.
with open(output_txt_file + 'clean.txt', 'w') as out_file, open(output_txt_file) as fp:
    for cnt, line in enumerate(fp):
        cleaned_lines = clean(line)
        # echo every 1000th input line and its cleaned output as a progress/sanity check
        if not cnt % 1000:
            print('--')
            print('IN: {}'.format(line))
            print('OUT: {}'.format(cleaned_lines))
        out_file.writelines(cleaned_line + '\n' for cleaned_line in cleaned_lines)
            

Run fasttext

In [None]:
import fasttext

# Train unsupervised skip-gram word vectors on the cleaned sentence file.
model = fasttext.train_unsupervised(output_txt_file + 'clean.txt', model='skipgram')

# Quick sanity check: the learned vocabulary and the vector of one known term.
print(model.words)
print(model['atelectase'])

# Persist the binary model.
model.save_model(output_model_file)
# The fastText Python model object has no `save_vectors` method, so the former
# `model.save_vectors(output_vec_file)` call failed with AttributeError.
# The textual vectors are written to `output_vec_file` by the `save_vectors`
# helper in the next cell.


In [None]:
import fasttext
from fasttext import load_model

def save_vectors(model, file):
    """Write every word vector of *model* to *file* in the textual .vec layout.

    The first line of the file is "<number of words> <dimension>"; every
    following line is a word followed by its space-separated vector
    components. The header is also echoed to stdout.

    Args:
    model: object exposing get_words(), get_dimension() and get_word_vector(word)
    file: path of the text file to (over)write

    Returns:
    None

    Raises:
    OSError: when the file cannot be opened or written
    """
    words = model.get_words()
    header = str(len(words)) + " " + str(model.get_dimension())
    print(header)
    # explicit UTF-8 so the output does not depend on the platform default encoding
    with open(file, 'w', encoding='utf-8') as out_file:
        # the header belongs in the file itself, not only on stdout
        out_file.write(header + '\n')
        for w in words:
            vstr = "".join(" " + str(vi) for vi in model.get_word_vector(w))
            # write errors propagate: the former handler referenced an
            # un-imported `errno` and would otherwise have hidden failed writes
            out_file.write(w + vstr + '\n')
            
# Reload the binary model from disk, export its word vectors with the
# `save_vectors` helper defined above, then print the nearest neighbours of a
# known term as a quick sanity check.
model = fasttext.load_model(output_model_file)
save_vectors(model, output_vec_file)
print(model.get_nearest_neighbors('atelectase'))


In [None]:
# Print the 200 nearest neighbours (k=200) of the word 'onwaarschijnlijk'.
print(model.get_nearest_neighbors('onwaarschijnlijk', k=200))


In [None]:
pip install gensim

In [None]:
from gensim.models.fasttext import load_facebook_model

# Load the binary fastText model file into gensim. Despite its name, `wv` is
# the model object: the next cell reaches the word vectors through `wv.wv`.
wv = load_facebook_model(output_model_file)


In [None]:
import csv

# Export every vocabulary word and its vector as one tab-separated row.
keyed_vectors = wv.wv

# gensim 4 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `vocab` so older gensim releases keep working.
try:
    words = list(keyed_vectors.key_to_index)
except AttributeError:
    words = list(keyed_vectors.vocab)

# newline='' is what the csv module expects of files it writes to;
# explicit UTF-8 keeps the output independent of the platform default encoding.
with open('wv_embeddings.tsv', 'w', newline='', encoding='utf-8') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    for word in words:
        vector = keyed_vectors.get_vector(word).tolist()
        writer.writerow([word] + vector)

In [None]:
pip install -q tensorflow

In [None]:
# import statements
# from pathlib import PurePath
import os

import fasttext
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import ops

from tensorboard.plugins import projector
from tensorboard.plugins.projector import ProjectorConfig


In [None]:
# Load the fastText model stored at `output_model_file`; this rebinds the
# notebook-global `model` used by the following cells.
model = fasttext.load_model(output_model_file)

In [None]:
# Preview the first six vocabulary entries (indices 0-5).
# Note: `i` and `w` stay bound in the notebook namespace after the loop ends.
for i, w in enumerate(model.get_words()):
    print(w)
    if i > 4:
        break

In [None]:
#hide_output

# number of words in the dataset
VOCAB_SIZE = len(model.get_words())


# size of the dimension of each word vector; read from the model itself so
# this cell does not depend on a loop variable left over from another cell
EMBEDDING_DIM = model.get_dimension()


# 2D numpy array (one row per word) initialised with zeros; the rows are
# filled with the word vectors afterwards
embed = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
embed.shape

In [None]:
# store the vector representation of each word in the 2D numpy array:
# row i of `embed` receives the vector of the i-th entry of model.get_words()
for i, word in enumerate(model.get_words()):
    embed[i] = model.get_word_vector(word)
embed  # last expression of the cell, so the notebook displays the array

In [None]:
# path to store the words
# NOTE(review): not referenced by any of the visible cells -- the labels are
# written to os.path.join(LOG_DIR, META_DATA_FNAME) instead; confirm whether
# this variable is still needed.
tsv_file_path = "tensorboard/metadata.tsv"

In [None]:
# `ops` is imported from the internal `tensorflow.python.framework` package.
# NOTE(review): the public spelling appears to be
# `tf.compat.v1.reset_default_graph()` -- confirm before relying on this import.
ops.reset_default_graph()  # clearing the default graph stack


def register_embedding(
    embedding_tensor_name: str, meta_data_fname: str, log_dir: str,
) -> None:
    """
    Write the projector configuration that TensorBoard reads from log_dir.

    Args:
    embedding_tensor_name(str): name of the tensor that holds the embeddings
    meta_data_fname(str): name of the metadata (labels) file
    log_dir(str): folder where tensorboard files and the metadata file are saved

    Returns:
    None

    """
    projector_config = projector.ProjectorConfig()

    # one entry per embedding: link the tensor to its label file
    embedding_info = projector_config.embeddings.add()
    embedding_info.metadata_path = meta_data_fname
    embedding_info.tensor_name = embedding_tensor_name

    # store the projector configuration in the folder holding the tensorboard files
    projector.visualize_embeddings(log_dir, projector_config)

In [None]:
def save_labels_tsv(labels: list, filepath: str, log_dir: str,) -> None:

    """
    Store the vocabulary of words in the dataset in a file, one label per line.

    Args:
    labels: vocabulary i.e. words in the dataset
    filepath: metadata file name, relative to log_dir
    log_dir: folder where tensorboard files and projector files are saved

    Returns:
    None

    Raises:
    OSError: when the file cannot be created or written

    """

    # explicit UTF-8 so non-ASCII words are written the same way on every
    # platform instead of depending on the locale's default encoding
    with open(os.path.join(log_dir, filepath), "w", encoding="utf-8") as f:
        f.writelines("{}\n".format(label) for label in labels)

In [None]:
LOG_DIR = "tb2files"  # folder which will contain all the tensorboard log files
os.makedirs(LOG_DIR, exist_ok=True)  # create it up front; no error if it already exists

# Labels i.e. the words in the dataset will be stored in this file
META_DATA_FNAME = "meta.tsv"

# name given to the embeddings tensor; also used below as the stem of the
# checkpoint file name
EMBEDDINGS_TENSOR_NAME = "embeddings"

# path for checkpoint of the saved embeddings
EMBEDDINGS_FPATH = os.path.join(LOG_DIR, EMBEDDINGS_TENSOR_NAME + ".ckpt")
STEP = 0  # passed as global_step when the checkpoint is saved


x = embed  # array containing the embeddings
y = model.get_words()  # list containing the vocabulary
# write the projector configuration first, then the label file it points to
register_embedding(EMBEDDINGS_TENSOR_NAME, META_DATA_FNAME, LOG_DIR)
save_labels_tsv(y, META_DATA_FNAME, LOG_DIR)

In [None]:
# Wrap the embedding matrix in a TensorFlow variable so it can be saved as a checkpoint.
tensor_embeddings = tf.Variable(
    x, name=EMBEDDINGS_TENSOR_NAME
)  # creation of the tensorflow variable, x: array which contains the embeddings,
# name: name of the variable; the same string is registered with the projector
# as the embedding's tensor_name

In [None]:
#hide_output

# Save the embedding variable as a checkpoint at EMBEDDINGS_FPATH.
# NOTE(review): sess=None presumably relies on eager execution (TensorFlow 2.x) -- confirm.
saver = tf.compat.v1.train.Saver(
    [tensor_embeddings]
)  # Tensorflow variable passed as argument for saver object to be initialised
saver.save(
    sess=None, global_step=STEP, save_path=EMBEDDINGS_FPATH
)  # saving the checkpoint for the embedding files

In [None]:
%load_ext tensorboard
%tensorboard --logdir {LOG_DIR} --host 0.0.0.0 --port 6006
# %reload_ext tensorboard 
# %tensorboard --logdir {LOG_DIR} --host 0.0.0.0 --port 6006
