In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline
from utils import ner, linking, evaluate
import json

In [None]:
# Location of the NER model on disk:
ner_model = "./outputs/models/lwm-ner.model"

# Build the "ner" pipeline on top of that model; grouped entities are
# aggregated with the "average" strategy:
ner_pipe = pipeline(
    "ner",
    model=ner_model,
    aggregation_strategy="average",
    use_fast=True,
)

In [None]:
# Path to DeezyMatch model and combined candidate vectors:
dm_path = "./outputs/deezymatch/"
dm_cands = "wkdtalts"
dm_model = "ocr_faiss_cur085_l2"
dm_output = "deezymatch_on_the_fly"

# Load mentions to wikidata dictionary.
# The encoding is passed explicitly so that decoding the JSON files does not
# depend on the platform's locale default:
with open('/resources/wikidata/mentions_to_wikidata_normalized.json', 'r', encoding='utf-8') as f:
    mentions_to_wikidata_normalized = json.load(f)

# Load wikipedia frequency dictionary by wikidata ID
with open('/resources/wikidata/overall_entity_freq_wikidata.json', 'r', encoding='utf-8') as f:
    overall_entity_freq_wikidata = json.load(f)

# Load wikidata gazetteer
gazdf = pd.read_csv("/resources/wikidata/wikidata_gazetteer.csv", low_memory=False)

In [None]:
# Read the tab-separated test data:
df = pd.read_csv(
    "outputs/data/linking_lwm_df_test.tsv",
    sep="\t",
)

# Halve it into a dev and a test portion (fixed random_state, so the split is
# the same on every run). Only dev is used for evaluation for now; test stays
# unseen until the very last experiments:
dev, test = train_test_split(
    df,
    test_size=0.5,
    random_state=42,
)

In [None]:
# Wikidata ID for each place of publication that occurs in the 'place' column:
publicationToWikidata = {"Dorchester": "Q503331",
                         "Ashton-under-Lyne": "Q659803",
                         "Manchester": "Q18125",
                         "Poole": "Q203349"}

# Add the Wikidata ID of the place of publication as a new column.
# `assign` returns a new DataFrame instead of writing into the object returned
# by train_test_split, which can trigger pandas' SettingWithCopyWarning.
# Places that are missing from the mapping end up as NaN.
dev = dev.assign(place_publ_wk=dev['place'].map(publicationToWikidata))

In [None]:
# dev = dev.iloc[:50]

In [None]:
# Preview the first five rows of the dev split:
dev.head(n=5)

In [None]:
# Number of rows in the dev split:
len(dev)

In [None]:
def compare_toponyms_precision(found, given):
    """Count precision-side hits and misses of detected toponyms.

    Both arguments are iterables of dicts with the keys "start", "end",
    "toponym" and "place_class". Entries are matched on their exact
    (start, end) span; the iteration runs over the entries in ``found``.

    Returns a 4-tuple of counts:
        (true_pos_identified, true_pos_classified,
         false_pos_identified, false_pos_classified)
    where "identified" means the span also exists in ``given`` and
    "classified" additionally compares the "place_class" of both entries.
    """
    # Index both sides by character span so they can be compared directly
    predicted = {(ent["start"], ent["end"]): (ent["toponym"], ent["place_class"])
                 for ent in found}
    gold = {(ent["start"], ent["end"]): (ent["toponym"], ent["place_class"])
            for ent in given}

    true_pos_identified = true_pos_classified = 0
    false_pos_identified = false_pos_classified = 0

    for span, (_toponym, place_class) in predicted.items():
        if span not in gold:
            # Predicted span with no gold counterpart
            false_pos_identified += 1
            continue
        true_pos_identified += 1
        if place_class == gold[span][1]:
            true_pos_classified += 1
        else:
            # Span is right, but the class differs
            false_pos_classified += 1

    return true_pos_identified, true_pos_classified, false_pos_identified, false_pos_classified

def compare_toponyms_recall(found, given):
    """Count recall-side hits and misses of detected toponyms.

    Both arguments are iterables of dicts with the keys "start", "end",
    "toponym" and "place_class". Entries are matched on their exact
    (start, end) span; the iteration runs over the entries in ``given``.

    Returns a 4-tuple of counts:
        (true_pos_identified, true_pos_classified,
         false_neg_identified, false_neg_classified)
    where "identified" means the span also exists in ``found`` and
    "classified" additionally compares the "place_class" of both entries.
    """
    # Index both sides by character span so they can be compared directly
    predicted = {(ent["start"], ent["end"]): (ent["toponym"], ent["place_class"])
                 for ent in found}
    gold = {(ent["start"], ent["end"]): (ent["toponym"], ent["place_class"])
            for ent in given}

    true_pos_identified = true_pos_classified = 0
    false_neg_identified = false_neg_classified = 0

    for span, (_toponym, place_class) in gold.items():
        if span not in predicted:
            # Gold span that was not detected at all
            false_neg_identified += 1
            continue
        true_pos_identified += 1
        if predicted[span][1] == place_class:
            true_pos_classified += 1
        else:
            # Span was detected, but with a different class
            false_neg_classified += 1

    return true_pos_identified, true_pos_classified, false_neg_identified, false_neg_classified

In [None]:
# Per-sentence lookups, all keyed by "<article_id>_<sent_id>":
dSentences = dict()    # sentence text
dAnnotations = dict()  # list of gold annotations (one dict per row of dev)
dPublication = dict()  # Wikidata ID of the place of publication

# One row per distinct sentence:
dev_sents = dev.drop_duplicates(subset=["article_id", "sent_id", "current_sentence"])
for i, row in dev_sents.iterrows():
    artid = row["article_id"]
    sentid = row["sent_id"]
    fullid = str(artid) + "_" + str(sentid)
    dSentences[fullid] = row["current_sentence"]
    dPublication[fullid] = row["place_publ_wk"]
    # All rows of dev that belong to this sentence:
    tmpdf = dev[(dev["article_id"] == artid) & (dev["sent_id"] == sentid)]
    for i2, row2 in tmpdf.iterrows():
        # setdefault creates the list on the first annotation of a sentence,
        # so the annotation dict only has to be written once.
        dAnnotations.setdefault(fullid, []).append({"start": row2["start"],
                                                    "end": row2["end"],
                                                    "toponym": row2["mention"],
                                                    "place_class": row2["place_class"],
                                                    "link": row2["place_wqid"]})

In [None]:
%%capture

# Run the pipeline sentence by sentence: NER, candidate selection with
# DeezyMatch, then resolution. Results are stored under the same
# "<article_id>_<sent_id>" keys as dAnnotations.
dResolved = dict()
# One 4-tuple of counts per sentence, as returned by
# compare_toponyms_precision / compare_toponyms_recall; they are summed over
# all sentences when the final scores are computed.
precision = []
recall = []
for sentid in dAnnotations:
    # Run the NER pipeline on the sentence text:
    found_toponyms = ner.find_grouped_entities(dSentences[sentid], ner_pipe)
    # Compare the NER output with the gold annotations of this sentence:
    precision.append(compare_toponyms_precision(found_toponyms, dAnnotations[sentid]))
    recall.append(compare_toponyms_recall(found_toponyms, dAnnotations[sentid]))
    # Use DeezyMatch to find the most similar place name in our gazetteer:
    # NOTE(review): thr/cands/cdiff are forwarded as-is to
    # linking.deezy_on_the_fly; their meaning is defined there, not here.
    candidate_mentions = linking.deezy_on_the_fly(found_toponyms, dm_cands, dm_model,
                                              dm_output, dm_path, thr=10, cands=3,
                                              cdiff=2)
    # Resolve the candidate mentions with the baseline resolver, giving it the
    # lookup dictionaries, the gazetteer and this sentence's place of publication.
    # NOTE(review): max_relv/max_dist/dmthr/max_mentions are defined in
    # linking.resolve_baseline1 — confirm their meaning there before tuning.
    resolved_entities = linking.resolve_baseline1(candidate_mentions,
                                              mentions_to_wikidata_normalized,
                                              overall_entity_freq_wikidata,
                                              gazdf, dPublication[sentid],
                                              max_relv=1000, max_dist=200,
                                              dmthr=10, max_mentions=3)
    
    dResolved[sentid] = resolved_entities

In [None]:
# ### Time NER for 435 sentences
# import time
# print(len(dAnnotations))
# start_time = time.time()
# for sentid in dAnnotations:
#     found_toponyms = ner.find_grouped_entities(dSentences[sentid], ner_pipe)
# print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
def compare_linking(found, given):
    """Count correctly and incorrectly linked mentions for one sentence.

    ``found`` maps a mention to an indexable result; mentions whose result is
    falsy are ignored and only element ``[0]`` of the result is compared.
    ``given`` is an iterable of annotation dicts with the keys "toponym",
    "place_class" and "link"; only annotations whose "place_class" is "LOC"
    are used.

    Only mentions present on both sides are scored. Returns the tuple
    (corr, incorr).
    """
    # First element of every non-empty result, keyed by mention
    predicted = {mention: found[mention][0] for mention in found if found[mention]}
    # Gold link per toponym, restricted to the "LOC" class
    gold = {ann["toponym"]: ann["link"] for ann in given if ann["place_class"] == "LOC"}

    corr = incorr = 0
    for mention, link in predicted.items():
        if mention not in gold:
            # No gold link to compare against: not counted either way
            continue
        if link == gold[mention]:
            corr += 1
        else:
            incorr += 1
    return corr, incorr

In [None]:
# Sum the per-sentence linking outcomes into overall counts:
correct = 0
incorrect = 0
for sentid in dAnnotations:
    reslink = compare_linking(dResolved[sentid], dAnnotations[sentid])
    correct += reslink[0]
    incorrect += reslink[1]

# Share of correct links among the mentions that compare_linking could compare.
# If nothing was comparable, print NaN instead of raising ZeroDivisionError:
compared = correct + incorrect
print(correct / compared if compared else float("nan"))

In [None]:
def _safe_div(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# Sum the per-sentence count tuples collected in `recall` and `precision`
# (tuple layout: see compare_toponyms_recall / compare_toponyms_precision):
recall_true_pos_identified = sum(x[0] for x in recall)
recall_true_pos_classified = sum(x[1] for x in recall)
recall_false_neg_identified = sum(x[2] for x in recall)
recall_false_neg_classified = sum(x[3] for x in recall)

precision_true_pos_identified = sum(x[0] for x in precision)
precision_true_pos_classified = sum(x[1] for x in precision)
precision_false_pos_identified = sum(x[2] for x in precision)
precision_false_pos_classified = sum(x[3] for x in precision)

# Ratios go through _safe_div so that empty or all-zero counts give NaN
# instead of stopping the notebook with a ZeroDivisionError:
precision_identified = _safe_div(precision_true_pos_identified,
                                 precision_true_pos_identified + precision_false_pos_identified)
precision_classified = _safe_div(precision_true_pos_classified,
                                 precision_true_pos_classified + precision_false_pos_classified)
recall_identified = _safe_div(recall_true_pos_identified,
                              recall_true_pos_identified + recall_false_neg_identified)
recall_classified = _safe_div(recall_true_pos_classified,
                              recall_true_pos_classified + recall_false_neg_classified)

print("Precision identified:", precision_identified)
# print(precision_classified)
print("Recall identified:", recall_identified)
# print(recall_classified)

fscore_identified = _safe_div(2 * precision_identified * recall_identified,
                              precision_identified + recall_identified)
fscore_classified = _safe_div(2 * precision_classified * recall_classified,
                              precision_classified + recall_classified)

print("F-score identified:", fscore_identified)
# The "classified" counts on both sides only cover spans that matched exactly,
# split by whether the class agrees, so precision_classified and
# recall_classified measure the same ratio and their F-score is the
# classification accuracy on matched spans — hence the label:
print("Accuracy classified:", fscore_classified)