In [3]:
import os
import sys
from pathlib import Path
sys.path.insert(0, os.path.abspath(os.path.pardir))
from geoparser import pipeline, ranking, linking

In [4]:
# Candidate ranker, configured here to use the "deezymatch" method.
myranker = ranking.Ranker(
    method="deezymatch",
    resources_path="../resources/wikidata/",
    # Both mention <-> Wikidata mappings are handed over empty.
    mentions_to_wikidata={},
    wikidata_to_mentions={},
    wiki_filtering=dict(
        top_mentions=3,  # Keep only the top N mentions
        minimum_relv=0.03,  # Relevance cut-off used when filtering mentions
    ),
    strvar_parameters=dict(
        # Settings used to build the string-pair dataset:
        ocr_threshold=60,
        top_threshold=85,
        min_len=5,
        max_len=15,
    ),
    deezy_parameters=dict(
        # Where the DeezyMatch models and data live, and what they are called.
        # The path is resolved against the current working directory.
        dm_path=str(Path("outputs/deezymatch/").resolve()),
        dm_cands="wkdtalts",
        dm_model="w2v_ocr",
        dm_output="deezymatch_on_the_fly",
        # Ranking measures:
        ranking_metric="faiss",
        selection_threshold=25,
        num_candidates=3,
        search_size=3,
        verbose=False,
        # DeezyMatch training:
        overwrite_training=False,
        w2v_ocr_path=str(Path("outputs/models/").resolve()),
        w2v_ocr_model="w2v_*_news",
        do_test=False,
    ),
)


In [5]:
# Entity linker, configured here to use the "mostpopular" method.
mylinker = linking.Linker(
    method="mostpopular",
    resources_path="../resources/wikidata/",
    linking_resources={},  # handed over empty
    # Base model for vector extraction.
    # NOTE(review): the value looks like a placeholder — confirm whether this
    # argument is still required.
    base_model="to-be-removed",
    rel_params=dict(),
    overwrite_training=False,
)

In [6]:
# Assemble the pipeline from the ranker and linker configured above.
# Per the recorded output, construction also creates/loads the NER pipeline
# and loads the ranker and linking resources.
geoparser = pipeline.Pipeline(
    myranker=myranker,
    mylinker=mylinker,
)


The NER model is already trained!

*** Creating and loading a NER pipeline.
*** Loading the ranker resources.
The DeezyMatch model is already trained!
*** Load linking resources.
  > Loading mentions to wikidata mapping.
  > Loading gazetteer.
*** Linking resources loaded!



In [7]:
# Run the pipeline over a multi-sentence passage and print one result per
# detected mention. The input contains the misspelling "Shefrield"
# (presumably deliberate, to mimic OCR noise — leave as is).
# NOTE(review): place_wqid looks like a Wikidata ID giving context for the
# text — confirm its exact meaning against the Pipeline API.
resolved = geoparser.run_text(
    "A remarkable case of rattening has just occurred in the building trade at Shefrield, but also in Lancaster. Not in Nottingham though. Not in Ashton either, nor in Salop!",
    place_wqid="Q18125",
)

for r in resolved:
    print(r)

                                                 

  0%|          | 0/1 [00:00<?, ?it/s]

{'mention': 'Shefrield', 'pos': 74, 'sent_idx': 0, 'end_pos': 83, 'tag': 'LOC', 'sentence': 'A remarkable case of rattening has just occurred in the building trade at Shefrield, but also in Lancaster.', 'prediction': 'Q42448', 'ed_score': 0.893, 'latlon': [53.3825, -1.471944], 'wkdt_class': 'Q515'}
{'mention': 'Lancaster', 'pos': 97, 'sent_idx': 0, 'end_pos': 106, 'tag': 'LOC', 'sentence': 'A remarkable case of rattening has just occurred in the building trade at Shefrield, but also in Lancaster.', 'prediction': 'Q205905', 'ed_score': 0.407, 'latlon': [54.047, -2.801], 'wkdt_class': 'Q515'}
{'mention': 'Nottingham', 'pos': 7, 'sent_idx': 1, 'end_pos': 17, 'tag': 'LOC', 'sentence': 'Not in Nottingham though.', 'prediction': 'Q41262', 'ed_score': 0.917, 'latlon': [52.955, -1.149167], 'wkdt_class': 'Q515'}
{'mention': 'Ashton', 'pos': 7, 'sent_idx': 2, 'end_pos': 13, 'tag': 'LOC', 'sentence': 'Not in Ashton either, nor in Salop!', 'prediction': 'Q4805980', 'ed_score': 0.096, 'latlon': [39

In [8]:
# Run the pipeline over a single sentence and print one result per detected
# mention.
resolved = geoparser.run_sentence(
    "A remarkable case of rattening has just occurred in the building trade at Sheffield.",
    place_wqid="Q18125",
)
for r in resolved:
    print(r)

{'mention': 'Sheffield', 'pos': 74, 'sent_idx': 0, 'end_pos': 83, 'tag': 'LOC', 'sentence': 'A remarkable case of rattening has just occurred in the building trade at Sheffield.', 'prediction': 'Q42448', 'ed_score': 0.903, 'latlon': [53.3825, -1.471944], 'wkdt_class': 'Q515'}
