In [2]:
# Runs the download-word-vectors.sh script that sits next to this notebook.
# NOTE(review): presumably this populates the vectors/<language>.vec files that
# accuracy() loads further down -- confirm against the script itself.
! bash download-word-vectors.sh

In [3]:
# Runs the download-word-analogies.sh script that sits next to this notebook.
# NOTE(review): presumably this populates the analogies/<language>.txt files that
# accuracy() evaluates further down -- confirm against the script itself.
! bash download-word-analogies.sh

In [4]:
# Records the interpreter version for reproducibility.
# NOTE(review): this reports the `python` found on the shell PATH, which is not
# necessarily the interpreter the kernel runs on -- confirm they match.
! python --version

Python 3.6.9


In [5]:
! pip install gensim==3.8.3 PyICU==2.6



In [6]:
from itertools import product
import pandas as pd
from tqdm import tqdm


# Language codes of the evaluated word vectors / analogy files.
languages = ('cs', 'de', 'es', 'fi', 'fr', 'hi', 'it', 'pl', 'pt', 'tr', 'zh')
# Vocabulary size limits to evaluate; the first entry is the default used by accuracy().
restrict_vocabs = (200_000, 300_000, 1_000_000)
# (case transformation, use language as ICU locale) pairs; the first entry is the
# default used by accuracy(), the last one disables case transformation altogether.
preprocessings = (
    *product(('upper', 'lower'), (False, True)),
    ('none', False),
)


def evaluate_word_analogies(self, analogies, restrict_vocab, case_transformation, locale, dummy4unknown):
    """Score word vectors on an analogy file using the 3CosAdd (vector offset) method.

    Parameters
    ----------
    self
        Word vectors object providing ``vocab``, ``index2word``, ``most_similar`` and
        ``_log_evaluate_word_analogies`` (this notebook passes a gensim ``KeyedVectors``).
    analogies
        Path of the analogy file. Lines starting with ``': '`` open a new section;
        every other line must hold four whitespace-separated words, otherwise it is skipped.
    restrict_vocab
        Only the first ``restrict_vocab`` entries of ``self.index2word`` are considered,
        both for the vocabulary lookup and for the nearest-neighbour search.
    case_transformation
        ``'upper'``, ``'lower'``, ``'fold'`` or ``None`` (leave words untouched).
    locale
        ICU locale identifier used for the case transformation, or ``None`` to use
        Python's built-in ``str`` methods. PyICU is imported only when a locale is given.
    dummy4unknown
        If true, quadruplets containing out-of-vocabulary words count as incorrect;
        otherwise they are skipped.

    Returns
    -------
    tuple
        ``(score, sections)`` where ``score`` is the value returned by
        ``self._log_evaluate_word_analogies`` for the aggregated section and ``sections``
        is a list of dicts with keys ``'section'``, ``'correct'`` and ``'incorrect'``;
        the last entry is the ``'Total accuracy'`` aggregate.

    Raises
    ------
    ValueError
        If a quadruplet line appears before any section header.
    RuntimeError
        If ``case_transformation`` is not one of the supported values.
    """
    import logging
    from gensim import utils
    from itertools import chain

    # PyICU is needed only for locale-aware transformations; import it once, and
    # build the Locale object once instead of once per transformed word.
    icu = icu_locale = None
    if locale is not None and case_transformation in ('upper', 'lower', 'fold'):
        import icu
        icu_locale = icu.Locale(locale)

    def case_transform(word):
        if case_transformation == 'upper':
            if locale is None:
                return word.upper()
            return str(icu.UnicodeString(word).toUpper(icu_locale))
        elif case_transformation == 'lower':
            if locale is None:
                return word.lower()
            return str(icu.UnicodeString(word).toLower(icu_locale))
        elif case_transformation == 'fold':
            if locale is None:
                return word.casefold()
            if locale == 'tr':
                options = icu.U_FOLD_CASE_EXCLUDE_SPECIAL_I
            else:
                options = icu.U_FOLD_CASE_DEFAULT
            return str(icu.UnicodeString(word).foldCase(options))
        elif case_transformation is None:
            return word
        else:
            raise RuntimeError(f'Unknown case transformation "{case_transformation}"')

    ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]]
    # Iterate in reverse so that, when several words collapse to the same transformed
    # key, the entry that comes first in index2word is the one that is kept.
    ok_vocab = {case_transform(w): v for w, v in reversed(ok_vocab)} if case_transformation else dict(ok_vocab)
    oov = 0
    sections, section = [], None
    quadruplets_no = 0
    with utils.open(analogies, 'rb') as fin:
        for line_no, line in enumerate(fin):
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                    self._log_evaluate_word_analogies(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("Missing section header before line #%i in %s" % (line_no, analogies))
                try:
                    a, b, c, expected = [case_transform(word) for word in line.split()]
                except ValueError:
                    # not exactly four words on the line (this includes blank lines)
                    continue
                quadruplets_no += 1
                if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                    oov += 1
                    if dummy4unknown:
                        section['incorrect'].append((a, b, c, expected))
                    continue
                ignore = {a, b, c}  # input words to be ignored
                predicted = None
                # find the most likely prediction using 3CosAdd (vector offset) method
                # TODO: implement 3CosMul and set-based methods for solving analogies
                # Temporarily swap in the transformed vocabulary so that most_similar()
                # resolves the transformed query words; always restore the original,
                # even if the lookup raises.
                original_vocab = self.vocab
                self.vocab = ok_vocab
                try:
                    sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab)
                finally:
                    self.vocab = original_vocab
                for element in sims:
                    predicted = case_transform(element[0])
                    if predicted in ok_vocab and predicted not in ignore:
                        break
                if predicted == expected:
                    section['correct'].append((a, b, c, expected))
                else:
                    section['incorrect'].append((a, b, c, expected))
    if section:
        # store the last section, too
        sections.append(section)
        self._log_evaluate_word_analogies(section)

    total = {
        'section': 'Total accuracy',
        'correct': list(chain.from_iterable(s['correct'] for s in sections)),
        'incorrect': list(chain.from_iterable(s['incorrect'] for s in sections)),
    }

    # Guard against files without a single valid quadruplet (previously a ZeroDivisionError).
    oov_ratio = float(oov) / quadruplets_no * 100 if quadruplets_no else 0.0
    logging.getLogger(__name__).info('Quadruplets with out-of-vocabulary words: %.1f%%', oov_ratio)
    analogies_score = self._log_evaluate_word_analogies(total)
    sections.append(total)
    # Return the overall score and the full lists of correct and incorrect analogies
    return analogies_score, sections


def accuracy(language, preprocessing=preprocessings[0], restrict_vocab=restrict_vocabs[0]):
    """Return the total analogy score for one language, caching results on disk.

    Results are cached as JSON under ``results/``, keyed by language, case
    transformation, locale flag and vocabulary limit. On a cache miss the vectors in
    ``vectors/{language}.vec`` are loaded and evaluated against
    ``analogies/{language}.txt`` with ``evaluate_word_analogies``.

    Parameters
    ----------
    language
        Language code; also used as the ICU locale when the preprocessing asks for one.
    preprocessing
        ``(case_transformation, set_locale)`` pair; the string ``'none'`` disables
        the case transformation.
    restrict_vocab
        Number of word vectors to load and to consider during evaluation.

    Returns
    -------
    The overall score, i.e. the first element of the cached or freshly computed
    ``(score, sections)`` result.
    """
    import json
    import os
    case_transformation, set_locale = preprocessing
    if case_transformation == 'none':
        case_transformation = None
    results_filename = f'results/{language}-{case_transformation}-{set_locale}-{restrict_vocab}.json'
    try:
        with open(results_filename, 'rt') as f:
            results = json.load(f)
    except IOError:
        from gensim.models import KeyedVectors
        vectors = KeyedVectors.load_word2vec_format(f'vectors/{language}.vec', limit=restrict_vocab)
        options = {
            'case_transformation': case_transformation,
            'locale': language if set_locale else None,
            'dummy4unknown': False,
            'restrict_vocab': restrict_vocab,
        }
        results = evaluate_word_analogies(vectors, f'analogies/{language}.txt', **options)
        # Pure-Python replacement for the `! mkdir -p results` shell escape, so the
        # function also works outside IPython and on systems without `mkdir -p`.
        os.makedirs(os.path.dirname(results_filename), exist_ok=True)
        # Write to a temporary file and move it into place, so that an interrupted
        # run never leaves a truncated cache file that would break later loads.
        temporary_filename = results_filename + '.tmp'
        with open(temporary_filename, 'wt') as f:
            json.dump(results, f, indent=4, sort_keys=True)
        os.replace(temporary_filename, results_filename)
    score, sections = results
    return score


def format_accuracy(accuracy):
    """Render a fractional accuracy as a percentage string with one decimal place."""
    percentage = accuracy * 100.0
    return f'{percentage:.1f}'


# Display every float in rendered DataFrames through format_accuracy.
pd.set_option('display.float_format', format_accuracy)

In [7]:
# Accuracy per language (columns) and vocabulary limit (rows), default preprocessing.
pd.DataFrame({
    language: {
        vocabulary_limit: accuracy(language, restrict_vocab=vocabulary_limit)
        for vocabulary_limit in restrict_vocabs
    }
    for language in tqdm(languages, leave=False)
})



Unnamed: 0,cs,de,es,fi,fr,hi,it,pl,pt,tr,zh
200000,70.7,73.4,65.6,71.2,73.7,32.2,73.0,68.5,67.0,57.0,78.5
300000,68.9,73.3,65.6,68.9,73.2,26.4,72.0,66.3,65.7,55.8,78.5
1000000,65.4,70.4,63.4,60.9,71.9,16.0,68.3,61.1,61.3,51.8,78.3


In [8]:
# Accuracy per language (columns) and preprocessing variant (rows), default vocabulary limit.
pd.DataFrame({
    language: {
        variant: accuracy(language, preprocessing=variant)
        for variant in preprocessings
    }
    for language in tqdm(languages, leave=False)
})



Unnamed: 0,Unnamed: 1,cs,de,es,fi,fr,hi,it,pl,pt,tr,zh
upper,False,70.7,73.4,65.6,71.2,73.7,32.2,73.0,68.5,67.0,57.0,78.5
upper,True,70.7,73.4,65.6,71.2,73.7,32.2,73.0,68.5,67.0,61.0,78.5
lower,False,70.7,73.4,65.6,71.2,73.7,32.2,73.0,68.5,67.0,56.9,78.5
lower,True,70.7,73.4,65.6,71.2,73.7,32.2,73.0,68.5,67.0,61.0,78.5
none,False,69.9,74.9,63.9,53.3,76.7,32.2,71.9,71.4,67.5,58.2,78.5
