# Imports
Import some basic packages we are going to use

In [3]:
import os
import re
import glob
import nltk
import fileinput
from gensim.models import Word2Vec
from joblib import cpu_count
from pathlib import Path
from nltk import sent_tokenize

from nltk import sent_tokenize
from concurrent.futures import ProcessPoolExecutor
from collections import deque
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Input Helper Functions
Define some helper functions. These are just for input processing, that is, splitting the input files into sentences and tokenizing. 

In [4]:
class InputDirectory(object):
    """
    Provide input for the word2vec model from a directory.

    All files in the directory are expected to contain one sentence per line (see @split_folder_into_sentences).
    The object can be iterated any number of times; every pass lists the directory and reads the files again.
    """

    def __init__(self, dirname):
        """
        :param dirname: The directory to read sentences from
        """
        self.dirname = dirname

    def __iter__(self):
        """
        Yield every line of every file in the directory as a list of whitespace-separated tokens.

        :return: a generator of token lists, one list per line (an empty list for a blank line)
        """
        # Sort the names so that every pass over the corpus sees the sentences in the same order.
        for fname in sorted(os.listdir(self.dirname)):
            # The context manager closes each file once it has been read; a bare open() in the loop
            # header would leave one handle open per file and per pass.
            with open(os.path.join(self.dirname, fname)) as sentence_file:
                for line in sentence_file:
                    yield line.split()


def split_file_into_sentences(filename: str, output_filename: str, language: str = "english"):
    """
    Rewrite a text file so that every sentence ends up on a line of its own.

    :param filename: path of the text file to read
    :param output_filename: path of the file to write
    :param language: language of the text, passed on to the sentence tokenizer
    """
    with open(filename, "r") as source:
        raw_text = source.read()

    # Tokenize before the output file is opened, so a tokenizer failure leaves no empty file behind.
    sentences = sent_tokenize(raw_text, language=language)

    with open(output_filename, "w") as target:
        # A sentence may span several input lines; flatten it so that it occupies exactly one line.
        target.writelines(sentence.replace("\n", " ") + "\n" for sentence in sentences)
            
                
def split_folder_into_sentences(dirname: str, output_dir: str, language: str = "english",
                                max_workers=min(10, cpu_count() // 4)):
    """
    Take a folder of files containing text and convert each of them to a new file with one sentence per
    line. The new file will have the same name as the original file and will be stored in the specified output
    directory. Entries whose name starts with "." are skipped.

    The files are processed one after another in the calling process.

    :param dirname: the folder containing files to convert
    :param output_dir: the name of the output folder; it is created if it does not exist yet
    :param language: the language the text in the files are in
    :param max_workers: currently unused; kept so that existing callers keep working
    :return: None
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)

    # NOTE: no process pool is created here. The files have always been converted serially, and
    # building a ProcessPoolExecutor from the default max_workers raised a ValueError on machines
    # with fewer than four CPUs (cpu_count() // 4 == 0).
    for file in sorted(Path(dirname).iterdir()):
        if file.name.startswith("."):
            continue
        split_file_into_sentences(str(file), str(output_dir.joinpath(file.name)),
                                  language=language)

        
def tokenize_file(filename: str, output_filename: str, tokenizer=nltk.word_tokenize):
    """
    Tokenize a file line by line and write the tokens of every line, joined by single spaces, to a new file.

    :param filename: the file to tokenize; it is processed one line at a time
    :param output_filename: the name of the output file
    :param tokenizer: a callable that takes a line of text and returns its tokens as strings
    """
    with open(filename, "r") as in_file, open(output_filename, "w") as out_file:
        # Stream the input instead of loading every line into memory with readlines().
        for line in in_file:
            out_file.write(" ".join(tokenizer(line)))
            out_file.write("\n")
        
        
def tokenize_folder(dirname: str, output_dir: str, tokenizer=nltk.word_tokenize,
                    max_workers=min(10, cpu_count() // 4)):
    """
    Tokenize every file in a folder (see @tokenize_file). The tokenized file will have the same name as
    the original file and will be stored in the specified output directory. Entries whose name starts
    with "." are skipped.

    The files are processed one after another in the calling process.

    :param dirname: the folder containing the files to tokenize
    :param output_dir: the name of the output folder; it is created if it does not exist yet
    :param tokenizer: a callable that takes a line of text and returns its tokens
    :param max_workers: currently unused; kept so that existing callers keep working
    :return: None
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True)

    # NOTE: no process pool is created here. The files have always been tokenized serially, and
    # building a ProcessPoolExecutor from the default max_workers raised a ValueError on machines
    # with fewer than four CPUs (cpu_count() // 4 == 0).
    for file in sorted(Path(dirname).iterdir()):
        if file.name.startswith("."):
            continue
        tokenize_file(str(file), str(output_dir.joinpath(file.name)),
                      tokenizer=tokenizer)
        
class Temporary:
    """
    Used for FastText: Temporarily create a single file from an input directory.

    Use as a context manager: entering concatenates all ``*.txt`` files of the input directory into
    ``<input_directory>_tmp.txt`` and returns that path; leaving deletes the file again.
    """

    def __init__(self, input_directory):
        """
        :param input_directory: the directory whose ``*.txt`` files are merged
        """
        self.input_dir = input_directory

    def __str__(self):
        """ Return the path of the temporary file. """
        return "%s_tmp.txt" % self.input_dir

    def __enter__(self):
        """
        Write the content of all ``*.txt`` files in the input directory to the temporary file.

        :return: the path of the temporary file
        """
        # Sorted so that the merged file looks the same on every run.
        filenames = sorted(glob.glob("%s/*.txt" % self.input_dir))
        with open(str(self), 'w') as fout:
            # The files are opened one by one rather than through fileinput.input(): fileinput falls
            # back to reading stdin when it is handed an empty list of files, and the nested
            # context manager closes every input even if a write fails.
            for filename in filenames:
                with open(filename) as fin:
                    for line in fin:
                        fout.write(line)
        return str(self)

    def __exit__(self, *args):
        """ Delete the temporary file; an exception raised in the with-body is not suppressed. """
        os.remove(str(self))

# Training Word2Vec
This is where the actual training happens. Define the input folder in the first line and some hyperparameters of the model in the following lines.

Then, all files in the input folder are split into sentences and tokenized. The last two lines train the model using the gensim package and save it to the output directory.

In [None]:
# Folder holding the raw text files of the corpus
input_folder = "hp_en"

# Hyperparameters handed to gensim's Word2Vec further down in this cell
ITERATIONS = 20  # passed as `epochs`
NUM_CPUS = min(cpu_count(), 10)  # passed as `workers` (and as `max_workers` to tokenize_folder); capped at 10
VECTOR_SIZE = 300  # passed as `vector_size`
MINIMUM_TOKEN_OCCURRENCES = 100  # passed as `min_count`

# Matches tokens made up of ASCII letters and underscores only ("$" also tolerates one trailing newline).
WORD_REGEX = re.compile("^[a-zA-Z_]+$")


def filter_words(tokens):
    """
    Lazily drop every token that WORD_REGEX does not match.

    :param tokens: an iterable of token strings
    :return: an iterator over the tokens that matched, in their original order
    """
    is_word = WORD_REGEX.match
    return filter(is_word, tokens)

def TOKENIZER(x: str):
    """
    Lower-case a piece of text and split it into tokens with nltk.word_tokenize.

    :param x: the text to tokenize
    :return: whatever nltk.word_tokenize returns for the lower-cased text
    """
    lowered = x.lower()
    return nltk.word_tokenize(lowered)

# Derived names: the intermediate corpora and the trained model are stored next to the input folder.
input_sentences = f"{input_folder}_sentences"
input_tokenized = f"{input_folder}_tokenized"
model_dir = f"{input_folder}_model"

# Each preprocessing stage is skipped when its output folder is already present.
if not Path(input_sentences).is_dir():
    split_folder_into_sentences(input_folder, input_sentences)

if not Path(input_tokenized).is_dir():
    tokenize_folder(
        input_sentences,
        input_tokenized,
        tokenizer=TOKENIZER,
        max_workers=NUM_CPUS,
    )

# Re-iterable view on the tokenized corpus, one token list per line
tokens = InputDirectory(input_tokenized)

model = Word2Vec(
    tokens,
    vector_size=VECTOR_SIZE,
    workers=NUM_CPUS,
    epochs=ITERATIONS,
    min_count=MINIMUM_TOKEN_OCCURRENCES,
)

model.save(model_dir)

# Evaluating Word2Vec
Now we can do some simple evaluations on the model.
For example, we can find the most similar words to any given word in the corpus. The following examples assume a model that has been trained on the English Harry Potter corpus with lowercasing:

In [4]:
# Print the (word, similarity) pairs of the words most similar to "harry".
print(model.wv.most_similar("harry"))

[('he', 0.6172397136688232), ('cho', 0.4725334048271179), ('neville', 0.47103187441825867), ('krum', 0.42111584544181824), ('hagrid', 0.419950932264328), ('snape', 0.38373202085494995), ('hermione', 0.38220012187957764), ('parvati', 0.38051149249076843), ('she', 0.37862080335617065), ('ron', 0.37831130623817444)]


We see that the most similar words to "Harry" are "he" and some other names. This makes sense, as Harry is the protagonist and very frequently referred to as "he".
Harry Potter nerds might find it interesting to see that Neville is actually one of the characters most similar to Harry ;)

In [5]:
# Print the (word, similarity) pairs of the words most similar to "dobby".
print(model.wv.most_similar("dobby"))

[('sir', 0.5541843771934509), ('winky', 0.5175085067749023), ('kreacher', 0.4601425528526306), ('elf', 0.43752405047416687), ('bagman', 0.3867757022380829), ('riddle', 0.37830692529678345), ('nick', 0.3709332346916199), ('snape', 0.36309048533439636), ('potter', 0.35632288455963135), ('he', 0.34559860825538635)]


The most similar words to "Dobby" are "sir", "Winky", "Kreacher" and "elf". "Sir" is very frequently used by Dobby, while Dobby, Winky and Kreacher are all elves.

Simple calculations also work. For example, we can ask the model which word is to "woman" as "Harry" is to "man":

In [6]:
# Analogy query: "harry" and "woman" contribute positively, "man" negatively.
# Kept as a bare expression so that the notebook displays the result as the cell output.
model.wv.most_similar(positive=("harry", "woman"), negative=("man", ))

[('she', 0.3949149250984192),
 ('hermione', 0.38137519359588623),
 ('cho', 0.37293750047683716),
 ('he', 0.352822482585907),
 ('ginny', 0.3491544723510742),
 ('parvati', 0.33159440755844116),
 ('bellatrix', 0.3270253539085388),
 ('neville', 0.3085140287876129),
 ('her', 0.28927990794181824),
 ('lily', 0.2862216532230377)]

We find that Hermione is basically a female version of Harry, which seems reasonable.


# Visualizing the embeddings
Rather than just looking at individual similarities, it is often interesting to visualize the complete embedding.
Tensorflow provides a nice tool for this under http://projector.tensorflow.org.

In order to use this tool, we need to export our embeddings in the format the projector wants to see, that is, one file containing each embedding vector in a separate line and one file containing the words:

In [7]:
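# NOTE: this export is commented out. `model.wv.vocab` was removed in gensim 4 (the version whose
# `vector_size`/`epochs` arguments are used for training above); `model.wv.index_to_key` lists the
# words instead. Uncomment the lines below to generate the two files.
#with open("%s_vectors.csv" %input_folder, "w") as vectors:
#    with open("%s_words.csv" % input_folder, "w") as words:
#        for word in model.wv.index_to_key:
#            vectors.write("\t".join(map(str, model.wv[word].tolist())))
#            vectors.write("\n")
#            words.write(word+"\n")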

For visualization go to http://projector.tensorflow.org and load the two files generated in the previous step.

The t-SNE visualization is usually more interesting. Some clusters you may find include names, words similar to "said", numbers, etc.


# Training FastText

In [8]:
import fasttext

input_folder = "hp_en"

# Hyperparameters of the FastText model
ITERATIONS = 20
VECTOR_SIZE = 300
MINIMUM_TOKEN_OCCURRENCES = 100
MODEL = "skipgram"  # alternative: "cbow"

# Derived names, following the same scheme as the Word2Vec cell
input_sentences = f"{input_folder}_sentences"
input_tokenized = f"{input_folder}_tokenized"
model_dir = f"{input_folder}_fasttext_model"


# train_unsupervised is given a single file path, so the tokenized folder is merged into a
# temporary file that exists only for the duration of the training.
with Temporary(input_tokenized) as input_file:
    fast_text_model = fasttext.train_unsupervised(
        input_file,
        model=MODEL,
        dim=VECTOR_SIZE,
        minCount=MINIMUM_TOKEN_OCCURRENCES,
        epoch=ITERATIONS,
        verbose=2,
    )

# Visualizing the embeddings

In [None]:
# Write two parallel files: line i of the vectors file holds the tab-separated embedding of the
# word on line i of the words file.
with open(f"{input_folder}_fasttext_vectors.csv", "w") as vectors, \
        open(f"{input_folder}_fasttext_words.csv", "w") as words:
    for word in fast_text_model.get_words():
        vectors.write("\t".join(str(value) for value in fast_text_model.get_word_vector(word)) + "\n")
        words.write(f"{word}\n")
            

# Training GloVe
Note: Installing this is a bit more painful than the other libraries and seems to fail for example on current ARM Macs. The necessary library is called `glove_python`, if you want to try.

In [None]:
# package name glove_python
from glove import Corpus, Glove

input_folder = "hp_en"

# Hyperparameters of the GloVe model
ITERATIONS = 20  # passed as `epochs` to Glove.fit
VECTOR_SIZE = 300  # passed as `no_components`
MINIMUM_TOKEN_OCCURRENCES = 100  # NOTE(review): not used anywhere in this cell

input_sentences = "%s_sentences" % input_folder
input_tokenized = "%s_tokenized" % input_folder
# Was "%s_fasttext_model" (copied from the FastText cell); a GloVe model saved under that name
# would have overwritten the FastText model.
model_dir = "%s_glove_model" % input_folder


# Build the co-occurrence matrix from the tokenized sentences ...
corpus_model = Corpus()
corpus_model.fit(InputDirectory(input_tokenized), window=10)

# ... and train the embeddings on it.
glove_model = Glove(no_components=VECTOR_SIZE)
glove_model.fit(corpus_model.matrix, epochs=ITERATIONS, verbose=True)
# Attach the word dictionary built by the corpus to the trained model.
glove_model.add_dictionary(corpus_model.dictionary)

# Visualizing the embeddings

In [None]:
# Write two parallel files: line i of the vectors file holds the tab-separated embedding of the
# word on line i of the words file.
with open(f"{input_folder}_glove_vectors.csv", "w") as vectors, \
        open(f"{input_folder}_glove_words.csv", "w") as words:
    for word in glove_model.dictionary:
        # The dictionary entry of a word is used as its row index in word_vectors.
        vectors.write("\t".join(str(value) for value in glove_model.word_vectors[glove_model.dictionary[word]]) + "\n")
        words.write(f"{word}\n")
