In [22]:
# Default keyword arguments for every experiment. _resolve_text_specification
# copies this dict, layers the configuration-, repository-, variant- and
# call-specific overrides on top, and _train passes the result to
# FastText(corpus_sentences, **fasttext_parameters).
# NOTE(review): the meaning of each key is defined by the installed gensim
# branch, not by this notebook — confirm against that branch's FastText docs.
FASTTEXT_PARAMETERS = {
    'sg': 0,
    'bucket': 2 * 10**6,
    'negative': 10,
    'alpha': 0.05,
    'min_alpha': 0,
    'sample': 10**-5,
    'min_n': 3,
    'max_n': 6,
    'min_count': 5,
    'workers': 32,
    'epochs': 1,
    'vector_size': 300,
}

In [23]:
# Git repository from which every gensim branch below is installed.
WITIKO_URL = 'https://github.com/Witiko/gensim.git'

# Experiment name -> (git URL, git ref, extra FastText keyword arguments).
# The URL and ref are handed to _reinstall_gensim; the third element is merged
# into the FastText parameters by _resolve_text_specification.
REPOSITORIES = {
    'Branch position-dependent-weighting-vanilla': (WITIKO_URL, 'efe3126', {}),
    'Branch position-dependent-weighting-identity': (WITIKO_URL, '94a57ff', {}),
    'Branch position-dependent-weighting-uniform': (WITIKO_URL, 'fa9dfcf', {}),
    'Branch position-dependent-weighting-square-normal': (WITIKO_URL, '84291c1', {}),
}

# Model configuration name -> FastText keyword-argument overrides.
CONFIGURATIONS = {
    'CBOW+NS': {},
    'CBOW+NS+PDW': {'position_dependent_weights': 1},
}

# Variant name -> FastText keyword-argument overrides (context window size).
VARIANTS = {
    'Window size 5': {'window': 5},
    'Window size 15': {'window': 15},
}

In [24]:
# Record the version of the `python` executable found on the shell PATH.
! python --version

Python 3.7.3


In [25]:
import locale

# Switch every locale category of this process to en_US.UTF-8.
locale.setlocale(locale.LC_ALL, ('en_US', 'UTF-8'))

'en_US.UTF-8'

In [26]:
import numpy as np

# Print NumPy's build configuration (BLAS/LAPACK backends) for the record.
np.show_config()

blas_mkl_info:
  NOT AVAILABLE
blis_info:
  NOT AVAILABLE
openblas_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
blas_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
lapack_mkl_info:
  NOT AVAILABLE
openblas_lapack_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]
lapack_opt_info:
    libraries = ['openblas', 'openblas']
    library_dirs = ['/usr/local/lib']
    language = c
    define_macros = [('HAVE_CBLAS', None)]


In [27]:
# Location of the word analogy questions file; _train_fasttext_model passes it
# to _evaluate as the dataset to score trained models on.
WORD_ANALOGY_DATASET_FILENAME = 'data/word-analogies/questions-words.txt'

In [28]:
%%bash -e -c 'LC_ALL=C bash'
# Fetch the word analogy questions file unless a non-empty copy already exists.
set -e
# Guard on the file itself, not on its directory: a failed or interrupted
# download would otherwise leave the directory behind and never be retried.
if ! [ -s data/word-analogies/questions-words.txt ]
then
    mkdir -p data/word-analogies
    cd data/word-analogies
    # Download under a temporary name so that a partial file is never mistaken
    # for the finished dataset.
    wget -O questions-words.txt.part https://raw.githubusercontent.com/tmikolov/word2vec/master/questions-words.txt
    mv questions-words.txt.part questions-words.txt
fi

In [29]:
from itertools import chain

# Phrasers applied by _read_sentences_helper. CorpusSentenceIterator._read_sentences
# overwrites this global with its own phrasers before it creates its worker pool.
PHRASERS = tuple()

def _read_sentences_helper(article):
    """Tokenize one corpus article into a list of sentences.

    Every section contributes its title followed by each non-blank line of its
    text (whitespace-stripped); each of these is tokenized with gensim's
    simple_preprocess. The module-level PHRASERS are then applied, in order, to
    the sentences of the whole article.

    :param article: mapping with parallel 'section_titles' and 'section_texts' sequences
    :return: list of tokenized sentences
    """
    from gensim.utils import simple_preprocess
    tokenized = []
    for title, text in zip(article['section_titles'], article['section_texts']):
        lines = text.splitlines()
        # The title is tokenized as-is; only body lines are stripped and filtered.
        tokenized.append(simple_preprocess(title))
        for line in lines:
            line = line.strip()
            if line:
                tokenized.append(simple_preprocess(line))
    for phraser in PHRASERS:
        tokenized = phraser[tokenized]
    return list(tokenized)

class CorpusSentenceIterator(object):
    """Restartable iterator over tokenized sentences of a gensim-downloader corpus.

    Articles are read with gensim.downloader, tokenized in a multiprocessing
    pool by _read_sentences_helper, and yielded one sentence at a time. Calling
    iter() on the object resets it, so the corpus can be traversed repeatedly.

    :param phrasers: phrasers applied to each article's sentences, in order
    :param corpus_name: dataset name passed to gensim.downloader.load
    :param percentage: fraction of the corpus articles to read
    """
    def __init__(self, phrasers=tuple(), corpus_name='wiki-english-20171001', percentage=1.0):
        self.phrasers = tuple(phrasers)
        self.corpus_name = corpus_name
        self.percentage = percentage
        self.iterable = None  # sentence generator, created lazily by __next__
        import gensim.downloader
        # The result is discarded; presumably this only makes sure the corpus
        # has been downloaded before iteration starts — TODO confirm.
        gensim.downloader.load(self.corpus_name)

    def __iter__(self):
        """Reset the iterator by re-running __init__ and return self."""
        self.__init__(self.phrasers, self.corpus_name, self.percentage)
        return self

    def _read_sentences(self, corpus_num_articles=4924894):
        """Generate tokenized sentences from the first articles of the corpus.

        :param corpus_num_articles: assumed total number of articles in the corpus;
            multiplied by self.percentage to get the number of articles read
        """
        assert CURRENT_GENSIM_URL_AND_REF is not None  # _reinstall_gensim has been called prior to importing gensim
        import gensim.downloader
        from tqdm import tqdm
        total = int(corpus_num_articles * self.percentage)
        desc = 'Reading articles from the {} corpus'.format(self.corpus_name)
        articles = gensim.downloader.load(self.corpus_name)
        # Pairing with range(total) stops the stream after `total` articles.
        articles = (article for article, _ in zip(articles, range(total)))
        articles = tqdm(articles, desc=desc, total=total)
        # The helper reads its phrasers from the module-level PHRASERS, so the
        # global is set before the pool is created.
        # NOTE(review): presumably relies on workers inheriting module state
        # (fork start method) — confirm on other platforms.
        global PHRASERS
        PHRASERS = self.phrasers
        from multiprocessing import Pool
        with Pool(None) as pool:
            # imap_unordered yields results as workers finish, so article
            # order is not preserved.
            for sentences in pool.imap_unordered(_read_sentences_helper, articles):
                for sentence in sentences:
                    yield sentence            

    def __next__(self):
        """Return the next sentence, starting the reader on first use."""
        if self.iterable is None:
            self.iterable = self._read_sentences()
        corpus_sentence = next(self.iterable)
        return corpus_sentence

In [30]:
# (url, ref) of the gensim build that _reinstall_gensim installed during this
# kernel session; None until _reinstall_gensim has run.
CURRENT_GENSIM_URL_AND_REF = None

from itertools import product

def _reinstall_gensim(url, ref):
    """Install gensim from git repository `url` at revision `ref`.

    Does nothing when (url, ref) equals CURRENT_GENSIM_URL_AND_REF. Otherwise
    uninstalls gensim, clones the repository into /var/tmp, checks out `ref`,
    pip-installs it, builds the extensions, smoke-tests the FastText import,
    removes the clone, and records (url, ref) in CURRENT_GENSIM_URL_AND_REF.

    :param url: git URL to clone
    :param ref: git revision to check out
    :raises RuntimeError: if gensim has already been imported in this kernel
    """
    global CURRENT_GENSIM_URL_AND_REF
    if CURRENT_GENSIM_URL_AND_REF == (url, ref):
        return  # only reinstall when necessary
    from sys import modules
    # Presumably an already-imported gensim would keep running the old build,
    # hence the request to restart the kernel.
    if 'gensim' in modules:
        raise RuntimeError('Restart the kernel and rerun the Jupyter notebook')
    ! pip uninstall gensim -y
    # {url} and {ref} are interpolated into the shell command by IPython; they
    # must come from trusted configuration.
    # NOTE(review): a failing `!` command does not appear to raise here, so
    # CURRENT_GENSIM_URL_AND_REF is updated even if the install failed — confirm.
    ! cd /var/tmp && \
      rm -rf gensim && \
      git clone {url} && \
      cd gensim && \
      git checkout {ref} && \
      TMPDIR=/var/tmp pip install . && \
      python setup.py build_ext --inplace && \
      python -c 'from gensim.models import FastText' && \
      cd .. && \
      rm -rf gensim
    CURRENT_GENSIM_URL_AND_REF = (url, ref)

def _stringify_parameters(parameters):
    parameters = sorted(parameters.items())
    import re
    def stringify(obj): return re.sub('_', '-', str(obj))
    parameters = ('{}={}'.format(stringify(key), stringify(value)) for key, value in parameters if key != 'url')
    parameters = '_'.join(parameters)
    return parameters

def _format_duration(duration):
    hours = int(duration // 3600)
    minutes = int((duration % 3600) // 60)
    seconds = int(round((duration % 60)))
    return '{:2d}h {:02d}m {:02d}s'.format(hours, minutes, seconds)

def _format_accuracy(accuracy):
    return '{:.2f}%'.format(100.0 * accuracy)

def _phrasers(num_iterations=0, starting_threshold=100.0, final_threshold=100.0):
    """Load or train a chain of gensim phrasers.

    Iteration i uses a threshold linearly interpolated between
    `starting_threshold` and `final_threshold`. Each phraser is loaded from
    data/models if present; otherwise it is trained on the corpus with all
    previously built phrasers applied, and both the Phrases model and the
    Phraser are saved.

    :param num_iterations: number of phrasers to build; the default 0 returns []
    :param starting_threshold: threshold for the first iteration
    :param final_threshold: threshold for the last iteration
    :return: list of phrasers in application order
    """
    from gensim.models.phrases import Phraser
    phrasers = []
    phrases_filename_template = 'data/models/phrases_{}'
    phraser_filename_template = 'data/models/phraser_{}'
    for iteration in range(num_iterations):
        # With a single iteration there is nothing to interpolate; use the
        # starting threshold and avoid dividing by zero.
        progress = iteration * 1.0 / (num_iterations - 1) if num_iterations > 1 else 0.0
        threshold = starting_threshold + progress * (final_threshold - starting_threshold)
        parameters = {'iteration': iteration + 1, 'threshold': threshold}
        stringified_parameters = _stringify_parameters(parameters)
        phrases_filename = phrases_filename_template.format(stringified_parameters)
        phraser_filename = phraser_filename_template.format(stringified_parameters)
        try:
            phraser = Phraser.load(phraser_filename)
        except IOError as e:
            # No cached phraser: train one on the corpus as seen through the
            # phrasers built so far.
            from gensim.models.phrases import Phrases
            sentences = CorpusSentenceIterator(phrasers=phrasers)
            phrases_kwargs = {
                'sentences': sentences,
                'max_vocab_size': float('inf'),
                'threshold': threshold,
            }
            phrases = Phrases(**phrases_kwargs)
            phraser = Phraser(phrases)
            ! mkdir -p data/models
            phrases.save(phrases_filename)
            phraser.save(phraser_filename)
        phrasers.append(phraser)
    return phrasers

def _train(fasttext_model_filename, fasttext_parameters, only_load=False):
    """Return the training duration of a model, training it if necessary.

    The duration is cached in '<fasttext_model_filename>.duration'. If that
    file cannot be read, a FastText model is trained on the corpus, saved to
    `fasttext_model_filename`, and the measured duration is written to the
    cache file.

    :param fasttext_model_filename: path of the saved model
    :param fasttext_parameters: keyword arguments for FastText
    :param only_load: if true, never train; re-raise the IOError instead
    :return: training duration in seconds
    :raises IOError: if `only_load` is true and the duration file is unreadable
    """
    training_duration_filename = '{}.duration'.format(fasttext_model_filename)
    try:
        with open(training_duration_filename, 'rt') as f:
            training_duration = float(f.read())
    except IOError as e:
        if only_load:
            raise e
        from datetime import datetime
        assert CURRENT_GENSIM_URL_AND_REF is not None  # _reinstall_gensim has been called prior to importing gensim
        from gensim.models.fasttext import FastText
        # _phrasers() with its defaults returns an empty list, i.e. no phrase detection.
        corpus_sentences = CorpusSentenceIterator(phrasers=_phrasers())
        # The measured interval covers the whole FastText(...) constructor call.
        training_start_time = datetime.now()
        fasttext_model = FastText(corpus_sentences, **fasttext_parameters)
        training_finish_time = datetime.now()
        training_duration = (training_finish_time - training_start_time).total_seconds()
        ! mkdir -p data/models
        fasttext_model.save(fasttext_model_filename)
        # The duration file is written last, so its presence implies the model was saved.
        with open(training_duration_filename, 'wt') as f:
            print(training_duration, file=f)
    return training_duration

def _evaluate(fasttext_model_filename, word_analogy_dataset_filename, only_load=False):
    word_analogy_accuracy_filename = '{}.accuracy'.format(fasttext_model_filename)
    try:
        with open(word_analogy_accuracy_filename, 'rt') as f:
            word_analogy_accuracy = float(f.read())
    except IOError:
        if only_load:
            raise e
        assert CURRENT_GENSIM_URL_AND_REF is not None  # # _reinstall_gensim has been called prior to importing gensim
        from gensim.models.fasttext import FastText
        fasttext_model = FastText.load(fasttext_model_filename)
        def evaluate(*args, **kwargs): return fasttext_model.wv.evaluate_word_analogies(*args, **kwargs)
        word_analogy_accuracy, _ = evaluate(word_analogy_dataset_filename, restrict_vocab=200000)
        with open(word_analogy_accuracy_filename, 'wt') as f:
            print(word_analogy_accuracy, file=f)
    return word_analogy_accuracy

def _train_fasttext_model(install_parameters, fasttext_parameters, fasttext_model_filename):
    """Obtain the training duration and analogy accuracy for one model.

    Cached results are used when available. gensim is (re)installed only when
    something actually has to be trained or evaluated.

    :param install_parameters: keyword arguments for _reinstall_gensim
    :param fasttext_parameters: keyword arguments for FastText
    :param fasttext_model_filename: path of the saved model
    :return: (training duration in seconds, word analogy accuracy)
    """
    dataset_filename = WORD_ANALOGY_DATASET_FILENAME

    # First see whether a finished training run is already on disk.
    try:
        training_duration = _train(fasttext_model_filename, fasttext_parameters, only_load=True)
    except IOError:
        has_trained_model = False
    else:
        has_trained_model = True

    if not has_trained_model:
        # Nothing cached: install the right gensim, train, then evaluate.
        _reinstall_gensim(**install_parameters)
        training_duration = _train(fasttext_model_filename, fasttext_parameters)
        word_analogy_accuracy = _evaluate(fasttext_model_filename, dataset_filename)
        return (training_duration, word_analogy_accuracy)

    # The model exists; only the evaluation may be missing.
    try:
        word_analogy_accuracy = _evaluate(fasttext_model_filename, dataset_filename, only_load=True)
    except IOError:
        _reinstall_gensim(**install_parameters)
        word_analogy_accuracy = _evaluate(fasttext_model_filename, dataset_filename)
    return (training_duration, word_analogy_accuracy)

def _resolve_text_specification(repository, configuration, variant, **call_fasttext_parameters):
    """Translate experiment names into concrete install and training settings.

    FastText parameters are layered, later layers overriding earlier ones:
    FASTTEXT_PARAMETERS, then the configuration, the repository, the variant,
    and finally the keyword arguments of this call.

    :param repository: key of REPOSITORIES
    :param configuration: key of CONFIGURATIONS
    :param variant: key of VARIANTS
    :param call_fasttext_parameters: per-call FastText overrides
    :return: (install parameters, FastText parameters, model filename)
    """
    configuration_overrides = CONFIGURATIONS[configuration]
    url, ref, repository_overrides = REPOSITORIES[repository]
    variant_overrides = VARIANTS[variant]

    fasttext_parameters = dict(FASTTEXT_PARAMETERS)
    layers = (
        configuration_overrides,
        repository_overrides,
        variant_overrides,
        call_fasttext_parameters,
    )
    for overrides in layers:
        fasttext_parameters.update(overrides)

    install_parameters = dict(url=url, ref=ref)

    # The filename encodes both the training parameters and the gensim revision.
    naming_parameters = dict(fasttext_parameters)
    naming_parameters.update(install_parameters)
    fasttext_model_filename = 'data/models/fasttext-model_{}'.format(
        _stringify_parameters(naming_parameters)
    )
    return (install_parameters, fasttext_parameters, fasttext_model_filename)

def train_fasttext_model(*args, **kwargs):
    """Train (or load) and evaluate one model, then print its duration and accuracy.

    All arguments are forwarded to _resolve_text_specification.
    """
    specification = _resolve_text_specification(*args, **kwargs)
    training_duration, word_analogy_accuracy = _train_fasttext_model(*specification)
    duration_text = _format_duration(training_duration)
    print('Training duration: {}'.format(duration_text))
    accuracy_text = _format_accuracy(word_analogy_accuracy)
    print('English word analogy task, total accuracy: {}'.format(accuracy_text))

In [31]:
# Identity branch, 'CBOW+NS' configuration (no overrides), window 5, default epochs.
train_fasttext_model('Branch position-dependent-weighting-identity', 'CBOW+NS', 'Window size 5')

Training duration:  2h 06m 33s
English word analogy task, total accuracy: 65.52%


In [32]:
# Same setup, with epochs=3 overriding the FASTTEXT_PARAMETERS default of 1.
train_fasttext_model('Branch position-dependent-weighting-identity', 'CBOW+NS', 'Window size 5', epochs=3)

Training duration:  4h 41m 17s
English word analogy task, total accuracy: 70.94%


In [33]:
# Vanilla branch with position_dependent_weights=1 and window 15.
train_fasttext_model('Branch position-dependent-weighting-vanilla', 'CBOW+NS+PDW', 'Window size 15')

Training duration:  5h 01m 16s
English word analogy task, total accuracy: 50.96%


In [34]:
# Identity branch with position_dependent_weights=1 and window 15.
train_fasttext_model('Branch position-dependent-weighting-identity', 'CBOW+NS+PDW', 'Window size 15')

Training duration:  4h 59m 27s
English word analogy task, total accuracy: 75.02%


In [35]:
# Uniform branch with position_dependent_weights=1 and window 15.
train_fasttext_model('Branch position-dependent-weighting-uniform', 'CBOW+NS+PDW', 'Window size 15')

Training duration:  4h 57m 25s
English word analogy task, total accuracy: 74.31%


In [36]:
# Square-normal branch with position_dependent_weights=1 and window 15.
train_fasttext_model('Branch position-dependent-weighting-square-normal', 'CBOW+NS+PDW', 'Window size 15')

Training duration:  5h 01m 11s
English word analogy task, total accuracy: 74.95%
