In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules (e.g. the local `utils` package) are picked up without restarting the kernel:
%load_ext autoreload
%autoreload 2

In [2]:
from utils import ner
from utils import linking
from transformers import pipeline
import pandas as pd
import operator
import json

In [3]:
# Path to NER Model:
ner_model = "./outputs/models/lwm-ner.model"

# Load NER pipeline, aggregate grouped entities with "average":
ner_pipe = pipeline("ner", model=ner_model, aggregation_strategy="average", use_fast=True)

# Path to DeezyMatch model and combined candidate vectors:
dm_path = "./outputs/deezymatch/"
dm_cands = "wkdtalts"
dm_model = "ocr_faiss_cur085_l2"
dm_output = "deezymatch_on_the_fly"

# Load mentions to wikidata dictionary.
# JSON is UTF-8 by specification; the encoding is pinned so that decoding does
# not depend on the platform's locale default (place names may be non-ASCII).
with open('/resources/wikidata/mentions_to_wikidata_normalized.json', 'r', encoding='utf-8') as f:
    mentions_to_wikidata_normalized = json.load(f)

# Load wikipedia frequency dictionary by wikidata ID (same UTF-8 reasoning as above)
with open('/resources/wikidata/overall_entity_freq_wikidata.json', 'r', encoding='utf-8') as f:
    overall_entity_freq_wikidata = json.load(f)

# Load wikidata gazetteer (low_memory=False makes pandas read the file in one pass
# instead of in chunks when inferring column dtypes)
gazdf = pd.read_csv("/resources/wikidata/wikidata_gazetteer.csv", low_memory=False)

### Input the newspaper text and the Wikidata ID of the place of publication of the newspaper

In [4]:
# Wikidata ID of the place where the newspaper is published (Q18125 is Manchester):
place_of_publication = "Q18125"

# Newspaper text in which we want to find and geolocate toponyms.
# The value starts and ends with a newline, exactly as a triple-quoted block would.
text = (
    "\n"
    "Mr. Oldham, ol Cheadlc, in Stafford (hire, to Mils Oldlcaa , Lie of this Town.\n"
)

In [5]:
# Echo the input text so the reader can see exactly what will be processed:
print(text)


Mr. Oldham, ol Cheadlc, in Stafford (hire, to Mils Oldlcaa , Lie of this Town.



In [6]:
# Show the Wikidata page of the place of publication so it can be checked by eye:
print(f"https://www.wikidata.org/wiki/{place_of_publication}")

https://www.wikidata.org/wiki/Q18125


### Find toponyms in text

In [7]:
# Given a sentence and a named entity recogniser, find toponyms in text:
found_toponyms = ner.find_grouped_entities(text, ner_pipe)

In [8]:
# Report how many toponyms were detected, then list them one per line:
print(f"Found toponyms: {len(found_toponyms)}")
for toponym in found_toponyms:
    print(toponym)

Found toponyms: 2
{'score': 0.994, 'toponym': 'Cheadlc', 'place_class': 'LOC'}
{'score': 0.9, 'toponym': 'Stafford (hire', 'place_class': 'LOC'}


### Perform fuzzy string matching

In [9]:
%%capture
# Use DeezyMatch to find the most similar place name in our gazetteer.
# The %%capture magic on the first line of this cell suppresses the cell's
# printed output; the result is inspected in the next cell instead.
# NOTE(review): thr, cands and cdiff look like candidate-ranking settings
# (threshold, number of candidates, ...) — confirm against linking.deezy_on_the_fly.
candidate_mentions = linking.deezy_on_the_fly(found_toponyms, dm_cands, dm_model,
                                              dm_output, dm_path, thr=10, cands=10,
                                              cdiff=2)

In [10]:
# Display the fuzzy-match candidates: each toponym maps to an OrderedDict of
# candidate place name -> score (see the output below).
candidate_mentions

{'Cheadlc': OrderedDict([('Cheadle', 1.3701),
              ('Chee Dale', 3.9895),
              ('Wheald', 4.0008),
              ('Chedan', 4.0618),
              ('Chelad', 4.3829),
              ('Chade', 4.4776),
              ('Charleval', 4.4888),
              ('Beechdale', 4.7057)]),
 'Stafford (hire': OrderedDict([('Staffordshire', 1.1361),
              ('Stafford, Ohio', 4.2757),
              ('Salfordshire', 4.7408)])}

### Resolve location names

In [11]:
# Resolve each toponym's candidate mentions to a Wikidata entry, using the
# mention-to-Wikidata mapping, the entity frequencies, the gazetteer and the
# place of publication.
# NOTE(review): max_relv, max_dist, dmthr and max_mentions are tuning knobs whose
# exact meaning is defined in linking.resolve_baseline1 — confirm units there.
resolved_entities = linking.resolve_baseline1(
    candidate_mentions,
    mentions_to_wikidata_normalized,
    overall_entity_freq_wikidata,
    gazdf,
    place_of_publication,
    max_relv=1000,
    max_dist=200,
    dmthr=10,
    max_mentions=3,
)

In [12]:
# Print one readable record per resolved toponym. Each entry is indexed
# positionally: 0 = Wikidata ID, 1 = place name, 2 = latitude, 3 = longitude,
# 4 = confidence score (rounded to two decimals for display).
for toponym in resolved_entities:
    resolution = resolved_entities[toponym]
    print("Toponym:", toponym)
    print("Wikidata:", "https://www.wikidata.org/wiki/" + resolution[0])
    print("Place name:", resolution[1])
    print("Latitude:", resolution[2])
    print("Longitude:", resolution[3])
    print("Confidence score:", round(resolution[4], 2))
    print()

Toponym: Cheadlc
Wikidata: https://www.wikidata.org/wiki/Q1615894
Place name: Cheadle
Latitude: 53.3933
Longitude: -2.2113
Confidence score: 0.69

Toponym: Stafford (hire
Wikidata: https://www.wikidata.org/wiki/Q23105
Place name: Staffordshire
Latitude: 52.833333
Longitude: -2.0
Confidence score: 0.88

