In [1]:
import ast
import tqdm
import string
import statistics
import pandas as pd
from Levenshtein import distance as levDist

In [2]:
# Load the candidate toponym pairs from the tab-separated input file.
canddf = pd.read_csv(
    "candidates.tsv",
    sep="\t",
)

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
canddf.head()

Unnamed: 0,wiki,gb1900,distance,jaccard_sim,match
0,Abbotsford,Abbotsford,192.360444,1.0,"[('abbotsford', 'abbotsford', 1.0)]"
1,Aberavon,ABERAVON,154.701764,1.0,"[('aberavon', 'aberavon', 1.0)]"
2,Abercarn,Abercarn,446.576164,1.0,"[('abercarn', 'abercarn', 1.0)]"
3,Aberearne,Abercarn,446.576164,1.0,"[('aberearne', 'abercarn', 0.8235294117647058)]"
4,Aberdare,Aberdare U. D.,1439.7546,1.0,"[('aberdare', 'aberdare', 1.0)]"


In [4]:
# Stopword tokens that get_tokens() drops from toponyms.
# Repeated entries (e.g. 'll', 're', 's') are harmless because this is a set.
# NOTE(review): 'n t' contains a space, so it can never equal a token produced
# by splitting on " "; it looks like a contraction fragment whose apostrophe
# was removed -- TODO confirm where this list came from.
stop = {'ever', 'but', 'down', 'namely', 'may', 'make', 'within', 'us', 'm', 'up', 'back', 'their', 'third', 'll', 'these', 'every', 'therein', 'off', 'either', 'behind', 'six', 'because', 'done', 'further', 'could', 'cannot', 'still', 'latterly', 'through', 'much', 'with', 'what', 've', 'nowhere', 'over', 'yourselves', 'below', 'which', 'of', "re", 'quite', 'others', 'front', 'by', "ll", 'toward', 'another', 'whom', 'beyond', 'empty', 'if', 'call', 'me', 'itself', 'most', "s", 'who', 'themselves', 'whatever', 'must', 'again', 'get', 'thereafter', 'meanwhile', 's', 'as', "m", 'somehow', 'above', 'please', 'nevertheless', 'whereupon', 'hereafter', 'any', 'anyway', 'was', 'seemed', 'hence', 'here', 'across', 'really', 'never', 'becomes', 'ours', 'this', 'yet', 'seeming', 'than', 'anywhere', 'other', 'whereafter', 'except', 'else', 'own', 'whither', 'elsewhere', 'noone', 'll', 'bottom', 'once', 'move', 'beforehand', 'we', 'whereby', 'both', 'four', 'are', 'anything', 'formerly', 'himself', 'from', 'sixty', 'were', 'each', 'all', 'neither', 'go', 'sometime', 'have', 'latter', 'n t', 'first', 'into', 'during', 'thus', 'them', 'while', 'something', 'everywhere', 'whence', 'did', 'would', 'when', 're', "n t", 'thereupon', 'used', 'nothing', 'become', 'amount', 'his', 'various', 'without', 'mine', 'thence', 'becoming', 'against', 'around', 'least', 'such', 'take', 'even', 'be', 'at', 'together', 'top', 'upon', 'hereupon', 'is', 'has', 'nobody', 'none', 'ourselves', 'beside', 'twenty', 've', 'regarding', 'that', 'am', 'due', 'always', 'show', 'for', 'among', 'out', 'since', 'five', 'though', 'had', 'sometimes', 'the', 'wherein', 'in', 'per', 'afterwards', 'its', 'i', 'do', 'made', 're', 'same', 're', 'those', 'although', 'former', 'hers', 'eleven', 'd', 'everyone', 's', 'should', 'about', 'next', 'well', 'no', 'alone', 'whenever', 'hereby', 'more', 'also', 'forty', 'and', 'thru', 'herein', 'someone', 'a', 'ca', 'moreover', 'perhaps', 'fifty', 'an', 'everything', 'became', 
'her', 'being', 'now', 'whereas', 'throughout', 'not', 'eight', 'you', 'less', 'only', 'besides', 'she', 'yourself', 'three', 'say', 'name', 'or', 'him', 'therefore', 'onto', 'almost', 'serious', 'after', 'yours', 'see', 'n t', 'just', 'rather', 'might', 'to', 'anyone', 'whoever', 'two', 'there', 'fifteen', 'keep', 'my', 'using', 'anyhow', 'they', 'somewhere', 'will', 'our', 'put', 'few', 'on', "ve", 'twelve', 'part', 'm', 'd', 'often', 'between', 'towards', "d", 'whose', 'many', 'amongst', 'myself', 'been', 'wherever', 'so', 'it', 'hundred', 'your', 'herself', 'mostly', 'last', 'several', 'too', 'full', 'indeed', 'otherwise', 'enough', 'where', 'then', 'he', 'give', 'via', 'already', 'doing', 'along', 'very', 'before', 'until', 'one', 'nine', 'does', 'nor', 'how', 'unless', 'why', 'thereby', 'ten', 'whether', 'can', 'some', 'seem', 'under', 'side', 'however', 'seems', 'whole'}

def get_tokens(toponym):
    """Normalise a toponym into a space-joined string of content tokens.

    The toponym is split on single spaces; each token is lower-cased, has all
    ASCII punctuation removed and surrounding whitespace stripped. Tokens that
    are stopwords (members of the module-level ``stop`` set) or shorter than
    two characters are dropped.

    Args:
        toponym: The raw place name string.

    Returns:
        The surviving tokens joined by a single space (possibly an empty
        string when every token is filtered out).
    """
    # Build the punctuation-removal table once per call instead of once per token.
    strip_punct = str.maketrans("", "", string.punctuation)
    # Strip BEFORE filtering so that stray whitespace (tabs, newlines) cannot
    # let a stopword or a one-character token slip past the checks below.
    cleaned = (
        token.lower().translate(strip_punct).strip()
        for token in toponym.split(" ")
    )
    return " ".join(
        token for token in cleaned if token not in stop and len(token) >= 2
    )

In [5]:
# Add normalised versions of both toponym columns, one get_tokens() call per cell.
canddf["wikiproc"] = canddf["wiki"].map(get_tokens)
canddf["gb1900proc"] = canddf["gb1900"].map(get_tokens)

In [6]:
def get_ngrams(placename, maxngrams,minngrams):
    """Return the character n-grams of ``placename``, longest first.

    N-gram lengths run from ``maxngrams`` down to ``minngrams + 1``; the lower
    bound ``minngrams`` itself is excluded. Lengths below 1 are never
    generated.

    Args:
        placename: String to slice into n-grams.
        maxngrams: Largest n-gram length to generate (inclusive).
        minngrams: Lower bound on the n-gram length (exclusive).

    Returns:
        A list of substrings, grouped by decreasing length and, within a
        length, ordered by start position. Empty when no valid length exists.
    """
    ngrams = []
    # Clamp the exclusive lower bound at 0: a zero or negative length would
    # yield empty strings, and "" is a substring of every string, so callers
    # doing `ngram in candidate` would match everything.
    lower_bound = max(minngrams, 0)
    for nlen in range(maxngrams, lower_bound, -1):
        for start in range(len(placename) - nlen + 1):
            ngrams.append(placename[start:start + nlen])
    return ngrams

In [7]:
def levAvg(tok_matches, wiki_top, gb1900_top):
    """Average the best token-level similarity scores of a candidate pair.

    Args:
        tok_matches: String holding a Python literal list of
            ``(token_a, token_b, score)`` tuples; it is parsed with
            ``ast.literal_eval`` (which only accepts literals).
        wiki_top: Processed (space-joined) Wikipedia toponym.
        gb1900_top: Processed (space-joined) GB1900 toponym.

    Returns:
        The mean of the highest scores, keeping as many scores as the
        (truncated) average number of tokens in the two toponyms. Returns
        ``0.0`` when the match list is empty.
    """
    # Number of scores to keep: truncated mean token count of the two names.
    # str.split(" ") always returns at least one element, so this is >= 1.
    max_items = int((len(wiki_top.split(" ")) + len(gb1900_top.split(" "))) / 2.0)
    matches = ast.literal_eval(tok_matches)

    # statistics.mean raises StatisticsError on empty input; a pair without
    # any token match is treated as having zero similarity instead.
    if not matches:
        return 0.0

    # Third element of each tuple is the score; highest scores first.
    top_scores = sorted((m[2] for m in matches), reverse=True)[:max_items]
    return statistics.mean(top_scores)

# Map every GB1900 toponym to the list of Wikipedia toponyms accepted as
# positive matches for it (duplicates are possible at this stage).
dPosMatches = dict()
for i, row in canddf.iterrows():
    avgLevSim = float(levAvg(row["match"], row["wikiproc"], row["gb1900proc"]))

    # Convert the value first, then compare: the original wrapped the whole
    # comparison in float(), which only worked because float(True) is truthy.
    is_similar = avgLevSim > 0.8 or float(row["jaccard_sim"]) == 1.0
    # Threshold on the "distance" column, in whatever unit the input file uses.
    is_close = float(row["distance"]) < 5000
    # Identical strings are skipped: only name variants are collected.
    is_variant = row["wiki"] != row["gb1900"]

    if is_similar and is_close and is_variant:
        dPosMatches.setdefault(row["gb1900"], []).append(row["wiki"])

# For every GB1900 toponym build:
#   dTrue[name]  -> its positive (matching) candidates, and
#   dFalse[name] -> an equal number of similar-looking but non-matching
#                   GB1900 toponyms to serve as negative examples.
# Rows sharing the same GB1900 toponym overwrite each other, so the entry
# produced by the last such row is the one that is kept.
dTrue = dict()
dFalse = dict()
for i, row in tqdm.tqdm(canddf.iterrows(), total=canddf.shape[0]):
    source = row["gb1900"]

    # =================================================================
    # TRUE MATCHES:
    # =================================================================
    # Deduplicate and sort so the output order does not depend on set
    # iteration order (which varies between interpreter runs for strings).
    final_true_cands = sorted(set(dPosMatches.get(source, [])))
    n_cands = len(final_true_cands)
    dTrue[source] = final_true_cands

    # =================================================================
    # FALSE MATCHES:
    # =================================================================
    # Only n_cands negatives are kept, so with no positives the result is
    # always empty; skip the expensive filtering and ranking in that case.
    if n_cands == 0:
        dFalse[source] = []
        continue

    # Character n-grams of length len-1 and len-2 (the lower bound passed to
    # get_ngrams is exclusive).
    maxcutoff = len(source) - 1
    mincutoff = len(source) - 3
    cand_ngrams = get_ngrams(source, maxcutoff, mincutoff)

    # Exclude every row whose processed names coincide with either processed
    # name of the current row, so near-duplicates of the current pair are not
    # offered as negatives.
    filtered = canddf[(canddf['wikiproc'] != row['wikiproc']) &
                      (canddf['wikiproc'] != row['gb1900proc']) &
                      (canddf['gb1900proc'] != row['wikiproc']) &
                      (canddf['gb1900proc'] != row['gb1900proc'])]
    list_filtered = filtered["gb1900"].to_list()

    # Keep the remaining GB1900 names that share at least one n-gram.
    selected_wrong_cands = set()
    for cand_ngram in cand_ngrams:
        selected_wrong_cands.update(x for x in list_filtered if cand_ngram in x)

    # Fallback when nothing shares an n-gram: take the first tenth of the
    # remaining names.
    if not selected_wrong_cands:
        selected_wrong_cands = set(list_filtered[:len(list_filtered) // 10])

    # Rank by Levenshtein distance so the most similar wrong names come first;
    # the name itself breaks ties, which keeps the selection reproducible.
    rank_wrong_cands = sorted(
        selected_wrong_cands,
        key=lambda x: (levDist(x, source), x),
    )

    # Keep only as many negatives as there are positive candidates.
    dFalse[source] = rank_wrong_cands[:n_cands]

100%|██████████| 79729/79729 [1:46:17<00:00, 12.50it/s]  


Execution time:
```100%|██████████| 79729/79729 [1:46:17<00:00, 12.50it/s]```

In [29]:
# Flatten dTrue / dFalse into three parallel columns: for each source toponym
# its positive targets come first (label "TRUE"), then its negatives ("FALSE").
source_top, target_top, matching = [], [], []
for source, true_targets in dTrue.items():
    labelled_groups = (("TRUE", true_targets), ("FALSE", dFalse[source]))
    for label, targets in labelled_groups:
        for target in targets:
            source_top.append(source)
            target_top.append(target)
            matching.append(label)

gb1900df = pd.DataFrame(
    {
        "source_top": source_top,
        "target_top": target_top,
        "matching": matching,
    }
)

gb1900df.iloc[:50]

Unnamed: 0,source_top,target_top,matching
0,ABERAVON,Aberavon,True
1,ABERAVON,ABERAYRON,False
2,Abercarn,Aberearne,True
3,Abercarn,Abercairny,False
4,Aberdare U. D.,Aberdare,True
5,Aberdare U. D.,Ardsley U. D.,False
6,ABERDARE,Aberdare,True
7,ABERDARE,ABERDARE PARK,False
8,Abergavenny. 2,Abergavenney,True
9,Abergavenny. 2,Abergavenny,True


In [30]:
# Write the dataset as tab-separated values, without index or header row.
gb1900df.to_csv(
    "gb1900dataset.tsv",
    sep="\t",
    index=False,
    header=False,
)