In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits made to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys
import pandas as pd

# Put the parent directory ("../", resolved against the current working
# directory) first on sys.path, so that the import below can resolve modules
# located there (presumably the local `geoparser` package).
# NOTE(review): this relies on the notebook being run from a direct
# subdirectory of the directory that contains `geoparser` -- confirm.
sys.path.insert(0, os.path.abspath(os.path.pardir))
from geoparser import preparation, recogniser, ranking, linking

In [3]:
# Dataset the experiments are run on, options are:
# * hipe
# * lwm
dataset = "lwm"

# Candidate selection approach, options are:
# * perfectmatch
# * partialmatch
# * levenshtein
# * deezymatch
cand_select_method = "deezymatch"

# Toponym resolution approach, options are:
# * mostpopular
# * contextualized
# * reldisamb:relcs # REL disambiguation with their candidates (our mentions)
# * reldisamb:lwmcs # REL disambiguation with our candidates (our mentions)
# NOTE(review): the commented-out alternatives below also use "gnn" and
# suffixed variants of "reldisamb:lwmcs" (":relv", ":dist", ":relvdist"),
# which are missing from the list above -- confirm the full set of supported
# values against linking.Linker.
# NOTE(review): the outputs recorded further down in this notebook report
# "Method: gnn" for the linker, whereas the active value here is
# "mostpopular"; a fresh "Restart & Run All" would therefore not reproduce
# the recorded outputs -- confirm which value is intended.
top_res_method = "mostpopular"
# top_res_method = "reldisamb:lwmcs:relv"
# top_res_method = "reldisamb:relcs"
# top_res_method = "reldisamb:lwmcs:dist"
# top_res_method = "reldisamb:lwmcs:relvdist"
# top_res_method = "gnn"

In [4]:
# --------------------------------------
# Build the toponym recogniser (NER component).
myner = recogniser.Recogniser(
    # NER method:
    method="lwm",
    # Prefix of the NER model name (suffixes get appended to it):
    model_name="blb_lwm-ner",
    # Placeholders, to be filled in later with the NER model and pipeline:
    model=None,
    pipe=None,
    # Base model that gets fine-tuned:
    base_model="/resources/models/bert/bert_1760_1900/",
    # Training and test sets (both are parts of the overall training set):
    train_dataset="outputs/data/lwm/ner_df_train.json",
    test_dataset="outputs/data/lwm/ner_df_dev.json",
    # Location where the NER model is, or will be, stored:
    output_model_path="outputs/models/",
    training_args=dict(
        learning_rate=5e-5,
        batch_size=16,
        num_train_epochs=4,
        weight_decay=0.01,
    ),
    # Switch to True to overwrite an already existing model:
    overwrite_training=False,
    # Switch to True to train in test mode:
    do_test=False,
    # Tagset used for training, either "coarse" or "fine":
    training_tagset="fine",
)

In [5]:
# --------------------------------------
# Build the candidate ranker.
myranker = ranking.Ranker(
    method=cand_select_method,
    resources_path="/resources/wikidata/",
    mentions_to_wikidata={},
    deezy_parameters=dict(
        # DeezyMatch models and data (paths and filenames):
        dm_path="/resources/develop/mcollardanuy/toponym-resolution/experiments/outputs/deezymatch/",
        dm_cands="wkdtalts",
        dm_model="ocr_avgpool",
        dm_output="deezymatch_on_the_fly",
        # Measures used for ranking:
        ranking_metric="faiss",
        selection_threshold=10,
        num_candidates=2,
        search_size=2,
        use_predict=False,
        verbose=False,
    ),
)

In [6]:
# --------------------------------------
# Build the entity linker.
mylinker = linking.Linker(
    method=top_res_method,
    resources_path="/resources/wikidata/",
    linking_resources={},
    # Base model used for vector extraction:
    base_model="/resources/models/bert/bert_1760_1900/",
    rel_params=dict(
        base_path="/resources/rel_db/",
        wiki_version="wiki_2019/",
    ),
    gnn_params=dict(
        level="sentence_id",
        max_distance=200,
        similarity_threshold=0.7,
        model_path="/resources/develop/mcollardanuy/toponym-resolution/experiments/outputs/gnn_models/",
    ),
    overwrite_training=False,
)

In [7]:
# --------------------------------------
# Build the experiment out of the recogniser, ranker and linker objects.
experiment = preparation.Experiment(
    dataset=dataset,
    data_path="outputs/data/",
    dataset_df=pd.DataFrame(),
    results_path="outputs/results/",
    myner=myner,
    myranker=myranker,
    mylinker=mylinker,
    # True: do the data processing; False: load the existing processing, if any.
    overwrite_processing=False,
    # Dictionary holding the processed data used by the experiments.
    processed_data={},
    # Use "dev" while experimenting and "test" for the final experiments.
    test_split="test",
    # True to also run the different experiments with REL, False to skip them.
    rel_experiments=False,
)

In [8]:
# Show the textual summary of the experiment and of each pipeline component,
# one after the other.
print(experiment, myner, myranker, mylinker, sep="\n")


Data processing in the LWM dataset.
* Overwrite processing: False
* Experiments run on the >>> test <<< set.


>>> Toponym recogniser:
    * Method: lwm
    * Model name: blb_lwm-ner-fine
    * Base model: /resources/models/bert/bert_1760_1900/
    * Overwrite model if exists: False
    * Train in test mode: False
    * Training args: {'learning_rate': 5e-05, 'batch_size': 16, 'num_train_epochs': 4, 'weight_decay': 0.01}
    * Training tagset: fine

>>> Candidate selection:
	* Method: deezymatch

>>> Entity Linking:
	* Method: gnn
	* Overwrite training: False
	* Linking resources: 



In [9]:
# Load previously processed data, if it exists, and keep it on the experiment
# object.
# NOTE(review): load_data() is not visible from this notebook; what it returns
# when no processed data exists should be confirmed in geoparser.preparation.
experiment.processed_data = experiment.load_data()

In [10]:
# Perform data postprocessing and store the result back on the experiment,
# replacing whatever `experiment.processed_data` held before.
# NOTE(review): prepare_data() presumably reads `experiment.processed_data`;
# if so, the loading step must have been run first -- confirm.
experiment.processed_data = experiment.prepare_data()


Data already postprocessed and loaded!



In [11]:
# Load the resources used by the linker and keep them on the linker object.
# NOTE(review): which resources get loaded presumably depends on the linking
# method configured above -- confirm in geoparser.linking.
mylinker.linking_resources = mylinker.load_resources()

  > Loading wikidata entity and instance embeddings.
  > Mapping wikidata ids to instance ids.


929854it [00:37, 24790.48it/s]


  > Loading mentions to wikidata mapping.
  > Mapping coordinates to wikidata ids.


In [12]:
# Do the linking experiments:
# NOTE(review): the output recorded for this cell shows a model being trained
# and saved to disk, so this call can be slow and can write files -- confirm
# which linking methods trigger training.
experiment.linking_experiments()

  data[key] = torch.tensor(value)
100%|██████████| 1991/1991 [01:53<00:00, 17.56it/s]



Number of training nodes tensor(7904) 
Number of dev nodes tensor(3902) 
Number of test nodes tensor(0)


Number of nodes: 11806
Number of edges: 95980
Average node degree: 8.13
Number of training nodes: 7904
Training node label rate: 0.67
Has isolated nodes: False
Has self-loops: False
Is undirected: True
Saving model in:  /resources/develop/mcollardanuy/toponym-resolution/experiments/outputs/gnn_models/best_model_originalsplit
Epoch 1 Training loss : 0.6933 Validation F1 :,0.0000
Epoch 2 Training loss : 0.6804 Validation F1 :,0.0000
Epoch 3 Training loss : 0.6706 Validation F1 :,0.0000
Epoch 4 Training loss : 0.6594 Validation F1 :,0.0000
Epoch 5 Training loss : 0.6496 Validation F1 :,0.0000
Epoch 6 Training loss : 0.6455 Validation F1 :,0.1147
Saving new best model with f1=0.1147
Epoch 7 Training loss : 0.6306 Validation F1 :,0.4165
Saving new best model with f1=0.4165
Epoch 8 Training loss : 0.6094 Validation F1 :,0.4564
Saving new best model with f1=0.4564
Epoch 9 Training loss :

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["matched"] = test_df.candidates.apply(lambda x: "_".join(x.keys()))
100%|██████████| 710/710 [00:46<00:00, 15.27it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["pred_wqid"] = "NIL"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df["pred_wqid_score"] = 0.0
A value is trying

Devonshire ('Q23156', 0.9233861)
Exeter ('Q134672', 0.900731)
Exeter ('Q134672', 0.9764364)
Plymouth ('Q43382', 0.92992276)
Devon ('Q23156', 0.9398634)
Stoke Damerel ('Q7618427', 0.78744787)
East Stonehouse ('Q7619235', 0.8165967)
Turkey ('Q43', 0.9644211)
Greece ('Q41', 0.9858655)
Greece ('Q41', 0.9919041)
Turkey ('Q43', 0.9548655)
Turkey ('Q43', 0.9644211)
Turkey ('Q43', 0.9644211)
Greyhound Inn ('Q5608428', 0.7557585)
BLandford_Blandford ('Q644530', 0.98753697)
Blandford ('Q644530', 0.97914356)
BLandford_Blandford ('Q644530', 0.9836013)
Shaftesbury ('Q631444', 0.97051024)
Whitehall ('Q214820', 0.6147343)
United Kingdom ('Q145', 0.9179733)
Canterbury ('Q29303', 0.92260337)
York ('Q42462', 0.97859246)
United Kingdom ('Q145', 0.886701)
York ('Q42462', 0.96209145)
Canterbury ('Q29303', 0.9260722)
York ('Q42462', 0.9605149)
United Kingdom ('Q145', 0.892299)
Caerleon ('Q1009261', 0.9583394)
Park Farm ('Q23041561', 0.5113827)
Berlin ('Q64', 0.9783839)
FRANCE ('Q142', 0.96727365)
Paris ('Q9