In [None]:
import os
from tqdm import tqdm
from fuzzywuzzy import fuzz
from gensim.models import Word2Vec

# Lists for English words:
from nltk.corpus import words, brown
from english_words import english_words_set

In [None]:
# Decade identifier for this run: it selects the word2vec model directory that is
# loaded and is embedded in the name of every output file written by this notebook.
decade = "1860s"

In [None]:
# Load the saved word2vec model that belongs to the selected decade
model_dir = f"w2v_{decade}_news"
path2model = os.path.join("data", model_dir, "w2v.model")
model = Word2Vec.load(path2model)

In [None]:
# Words in the embeddings: copy the keys held in the model's `index_to_key`
# into a plain list, which the matching loop iterates over.
w2v_words = list(model.wv.index_to_key)

In [None]:
# Words in the English language: the union of the `english_words` package set,
# the NLTK `words` corpus and the NLTK Brown corpus.
#
# The reduced package set is bound to its own name instead of overwriting the
# imported `english_words_set`, so re-running this cell never applies the `[0]`
# reduction to an already-reduced set.
# NOTE(review): `entry[0]` keeps only the first element of every entry. If the
# entries of `english_words_set` are plain strings, that is just their first
# character - confirm the shape of the imported set.
english_words_first = {entry[0] for entry in english_words_set}
brown_words = set(brown.words())
english_words = english_words_first | set(words.words()) | brown_words

In [None]:
def obtain_matches(word, sims, vocabulary=None, threshold=50):
    """Split the nearest neighbours of ``word`` into positive and negative matches.

    A neighbour is ignored when it contains ``word`` or is contained in it.
    Otherwise both strings are cut into two halves and the halves are compared
    pairwise with ``fuzz.ratio``, whose scores are on a 0-100 scale:

    * negative match (a different word): the neighbour is in the vocabulary
      and both half scores are below ``threshold``;
    * positive match (an OCR variation): the neighbour is not in the
      vocabulary and both half scores are above ``threshold``.

    Both halves must agree so that two distinct words sharing a long chunk
    (e.g. the suffix of "careless" and "listless") are not judged on that
    shared chunk alone.

    Args:
        word: The query word.
        sims: Iterable of sequences whose first item is the neighbour word;
            only item 0 of each entry is read.
        vocabulary: Container of known words used for the membership test.
            Defaults to the module-level ``english_words``.
        threshold: Cutoff applied to each half score (0-100 scale). Scores
            equal to the cutoff count as neither positive nor negative.

    Returns:
        A ``(positive, negative)`` tuple of lists. ``positive`` always starts
        with ``word`` itself.
    """
    if vocabulary is None:
        vocabulary = english_words

    # The halves of the query word do not change across neighbours.
    half = len(word) // 2
    word_head, word_tail = word[:half], word[half:]

    positive = [word]
    negative = []
    for nn in sims:
        nn_word = nn[0]
        # Skip neighbours where one string is a substring of the other.
        if nn_word in word or word in nn_word:
            continue
        nn_half = len(nn_word) // 2
        nn_head, nn_tail = nn_word[:nn_half], nn_word[nn_half:]
        if nn_word in vocabulary:
            # Known word with two dissimilar halves: a different word.
            if (fuzz.ratio(nn_head, word_head) < threshold
                    and fuzz.ratio(nn_tail, word_tail) < threshold):
                negative.append(nn_word)
        elif (fuzz.ratio(nn_head, word_head) > threshold
                and fuzz.ratio(nn_tail, word_tail) > threshold):
            # Unknown word with two similar halves: an OCR variation.
            positive.append(nn_word)
    return positive, negative

In [None]:
# Collect balanced positive/negative string pairs for the words of the w2v model:
positive_matches = []
negative_matches = []
for word in tqdm(w2v_words):
    # Only words longer than 4 characters that belong to the English
    # vocabulary are used as query words:
    if len(word) <= 4 or word not in english_words:
        continue
    # Top 100 nearest neighbours of the query word
    neighbours = model.wv.most_similar(word, topn=100)
    # Distinguish between positive and negative matches, where
    # * a positive match is an OCR word variation
    # * a negative match is a different word
    positive, negative = obtain_matches(word, neighbours)
    # Keep as many positive matches as negative ones:
    n_pairs = min(len(positive), len(negative))
    negative = negative[:n_pairs]
    positive = positive[:n_pairs]
    # One tab-separated line per pair, ready to be written to file:
    negative_matches.extend(f"{word}\t{other}\tFALSE\n" for other in negative)
    positive_matches.extend(f"{word}\t{other}\tTRUE\n" for other in positive)

# Write string pairs into a file (negative pairs first, then positive pairs).
# The encoding is set explicitly because the default encoding of open() depends
# on the locale, and the tokens may contain non-ASCII characters.
with open("data/w2v_ocr_pairs_" + decade + ".txt", "w", encoding="utf-8") as fw:
    # Every entry already ends with a newline.
    fw.writelines(negative_matches)
    fw.writelines(positive_matches)

In [None]:
# Split every pair line once: the first tab-separated field goes to the
# candidates, the second field goes to the queries.
pair_fields = [line.split("\t") for line in positive_matches + negative_matches]
candidates = [fields[0] for fields in pair_fields]
queries = [fields[1] for fields in pair_fields]

# Write queries and candidates into a file, one unique string per line.
# * The entries are sorted so the files are identical from run to run
#   (iteration order of a set of strings is not stable across runs).
# * The encoding is set explicitly because the default encoding of open()
#   depends on the locale, and the tokens may contain non-ASCII characters.
with open("data/queries_" + decade + ".txt", "w", encoding="utf-8") as fw:
    fw.writelines(q + "\n" for q in sorted(set(queries)))
with open("data/candidates_" + decade + ".txt", "w", encoding="utf-8") as fw:
    # Candidates are the query words of the pairs plus the whole English vocabulary.
    fw.writelines(c + "\n" for c in sorted(set(candidates).union(english_words)))