# Entity resolution

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
from pathlib import Path
from tools import eval_methods, resolution_methods
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
import random

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
gazetteer_df = pd.read_csv("../processed/wikidata/british_isles_gazetteer.csv", header=0, index_col=0, low_memory=False)

In [4]:
import pickle
with open("/resources/wikipedia/extractedResources/overall_entity_freq.pickle", 'rb') as fp:
    wikipedia_entity_overall_dict = pickle.load(fp)

### Feature selection

In [5]:
# ------------------
# Feature selection for dev and test
settings = ["test", "dev"]
candrank_approaches = ["perfect_match", "partial_match", "deezy_match"]
for candrank in candrank_approaches:
    for setting in settings:
        features_file = "features_" + setting + "_" + candrank + ".tsv"
        if not Path(features_file).is_file():
            df = pd.read_pickle("candranking_" + setting + ".pkl")
            exp_df = resolution_methods.feature_selection(candrank, df, gazetteer_df, wikipedia_entity_overall_dict)
            exp_df.drop_duplicates(subset=['Query','Candidate'], inplace=True)
            exp_df.to_csv(features_file, sep="\t")
        print(candrank + " " + setting + " done!")

features_dev = pd.read_csv("features_dev_" + candrank + ".tsv",sep='\t', index_col=0)
features_test = pd.read_csv("features_test_" + candrank + ".tsv",sep='\t', index_col=0)

perfect_match test done!
perfect_match dev done!
partial_match test done!
partial_match dev done!
deezy_match test done!
deezy_match dev done!


### Training classifiers

In [6]:
candrank_method = "partial_match"

In [7]:
# -------------------------------
# Predict railway stations
# -------------------------------
# f0 (SubStFormatted-candidate DeezyMatch confidence)
# f1 (MainStation-candidate DeezyMatch confidence)
# f2 (Altname-candidate DeezyMatch confidence)
# f3 (String similarity)
# f4 (Wikidata class: is it an instance of railway station)
# f5 (Wikidata class: is it an instance of populated place)
# f6 (Closest place confidence)
# f7 (Closest subst/altname confidence)
# f8 (Relevance through wikipedia inlinks)

dev_df = pd.read_csv("features_dev_" + candrank_method + ".tsv",sep='\t', index_col=0) # development set feature vectors
use_cols_stations = ['f_0','f_1','f_2','f_3','f_4','f_5','f_6','f_7','f_8'] # features to use
run = "dev" # whether we're using for developing ("dev": we return a classifier trained
            # on the train portion of the dev set) or testing ("test": we train a classifier
            # trained on the full dev set).
        
# The following returns a classifier and the test set portion of the development set
# (independently from the content of the run variable):
clf_stations = resolution_methods.train_classifier(dev_df, use_cols_stations, run)

# -------------------------------
# Predict places
# -------------------------------
# f1 (MainStation-candidate DeezyMatch confidence)
# f3 (String similarity)
# f4 (Wikidata class: is it an instance of railway station)
# f5 (Wikidata class: is it an instance of populated place)
# f7 (Closest subst/altname confidence)
# f8 (Relevance through wikipedia inlinks)

dev_df = pd.read_csv("features_dev_" + candrank_method + ".tsv",sep='\t', index_col=0) # development set feature vectors
df_inexact = dev_df[dev_df["Exact"] == 0]
use_cols_places = ['f_1','f_3','f_4','f_5','f_7','f_8'] # features to use
run = "dev" # whether we're using for developing ("dev": we return a classifier trained
            # on the train portion of the dev set) or testing ("test": we train a classifier
            # trained on the full dev set).
        
# The following returns a classifier and the test set portion of the development set
# (independently from the content of the run variable):
clf_places = resolution_methods.train_classifier(df_inexact, use_cols_places, run)

# -------------------------------
# Finding best stations threshold
# -------------------------------

minthr = 0
maxthr = 1
stepthr = 0.05
mindist = 5
resolution_methods.find_thresholds(dev_df, clf_stations, use_cols_stations, gazetteer_df, minthr, maxthr, stepthr, mindist)

Queries in train and test: 96 95
Instances in train and test: 26886 10631
{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
Classification report on dev test set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     10576
           1       0.55      0.80      0.65        55

    accuracy                           1.00     10631
   macro avg       0.77      0.90      0.82     10631
weighted avg       1.00      1.00      1.00     10631

Queries in train and test: 21 20
Instances in train and test: 2807 3399


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(subset=['Query','Candidate'], inplace=True)


{'C': 1000, 'kernel': 'linear'}
Classification report on dev test set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3396
           1       0.19      1.00      0.32         3

    accuracy                           1.00      3399
   macro avg       0.59      1.00      0.66      3399
weighted avg       1.00      1.00      1.00      3399

Stations threshold: 0.0 0.0
Stations threshold: 0.05 0.402
Stations threshold: 0.1 0.402
Stations threshold: 0.15 0.402
Stations threshold: 0.2 0.402
Stations threshold: 0.25 0.402
Stations threshold: 0.3 0.402
Stations threshold: 0.35 0.402
Stations threshold: 0.4 0.402
Stations threshold: 0.45 0.402
Stations threshold: 0.5 0.402
Stations threshold: 0.55 0.402
Stations threshold: 0.6 0.402
Stations threshold: 0.65 0.8858
Stations threshold: 0.7 0.8858
Stations threshold: 0.75 0.8858
Stations threshold: 0.8 0.8858
Stations threshold: 0.85 0.8828
Stations threshold: 0.9 0.8828
Stations threshold:

### Method and baselines

In [8]:
# We will store the results of all methods/baselines as columns in the original structured dataframe:
results_test_df = pd.read_pickle("../processed/quicks/quicks_test.pkl")

In [10]:
# -------------------------------
# Apply our classification methods (column "our_method")
# -------------------------------
optimal_threshold = 0.5
features_test_df = pd.read_csv("features_test_" + candrank_method + ".tsv",sep='\t', index_col=0) # test set feature vectors

results_test_df = resolution_methods.our_method(features_test_df, clf_stations, use_cols_stations, clf_places, use_cols_places, gazetteer_df, optimal_threshold, results_test_df)

In [11]:
# -------------------------------
# Baseline: Apply candrank-most-confident baseline
# -------------------------------
results_test_df = resolution_methods.candrank_most_confident(features_test_df, results_test_df)

In [12]:
# -------------------------------
# Baseline: Apply wikipedia-most-relevant baseline
# -------------------------------
results_test_df = resolution_methods.wikipedia_most_relevant(features_test_df, results_test_df)

In [13]:
# -------------------------------
# Baseline: Apply semantically_most_similar baseline
# -------------------------------
results_test_df = resolution_methods.semantically_most_similar(features_test_df, results_test_df)

In [14]:
results_test_df.head()

Unnamed: 0,MainId,SubId,MainStation,SubStation,SubStFormatted,Description,Final Wikidata ID,DevTest,Disambiguator,Companies,...,LocsMapsDescr,Altnames,Referenced,FirstOpening,LastClosing,Interrupted,our_method,candrank_most_confident,wikipedia_most_relevant,semantically_most_similar
0,763,1001,BERE FERRERS,BERE FERRERS,BERE FERRERS,[LSW] op 1 June 1890 (W D Merc 2 nd ) as BEER ...,Q2550845,Test,[],[LSW],...,,[BEER FERRIS],[],1 June 1890,31 December 2001,False,Q2550845,Q2550845,Q1996192,Q4891152
1,7665,10050,SELBY,S BRAYTON GATES,SELBY BRAYTON GATES,op 16 February 1898 ([Yorkshire] Evening Pres...,Q48791901,Test,[],[],...,,[],[],16 February 1898,,False,Q527846,Q27087578,Q527846,Q7447561
2,775,1014,BERVIE,BERVIE,BERVIE,– see INVERBERVIE.,Q58407002,Test,[],[],...,,[],[INVERBERVIE],,,False,Q58407002,Q58407002,Q568713,Q4784967
3,7746,10161,SHEPTON MALLET,SHEPTON MALLET,SHEPTON MALLET,Brad (index only) had S MALLETT at least 1883 ...,Q7494575,Test,[],[],...,,[SHEPTON MALLETT],[],,,False,Q1236661,Q7494574,Q1236661,Q7494575
4,7748,10165,SHERBURN,SHERBURN,SHERBURN,– see WEAVERTHORPE.,Q92796796,Test,[],[],...,,[],[WEAVERTHORPE],,,False,Q7978392,Q1881964,Q7494758,Q7978392


### Evaluate entity resolution

In [15]:
for topres_approach in ["our_method", "candrank_most_confident", "wikipedia_most_relevant", "semantically_most_similar"]:
    print(topres_approach, candrank_method)
    print("-------------------------")
    eval_methods.topres_exactmetrics(results_test_df, topres_approach)
    eval_methods.topres_distancemetrics(gazetteer_df, results_test_df, topres_approach)
    print()

our_method partial_match
-------------------------
Hamming Loss: 0.4479638009049774
Accuracy Score: 0.5520361990950227
Jaccard Score: 0.4053156146179402
Accuracy at 1: 0.6742081447963801
Accuracy at 5: 0.7285067873303167
Accuracy at 10: 0.7330316742081447

candrank_most_confident partial_match
-------------------------
Hamming Loss: 0.4117647058823529
Accuracy Score: 0.5882352941176471
Jaccard Score: 0.44501718213058417
Accuracy at 1: 0.7058823529411765
Accuracy at 5: 0.751131221719457
Accuracy at 10: 0.7601809954751131

wikipedia_most_relevant partial_match
-------------------------
Hamming Loss: 0.8144796380090498
Accuracy Score: 0.18552036199095023
Jaccard Score: 0.10732984293193717
Accuracy at 1: 0.5339366515837104
Accuracy at 5: 0.6877828054298643
Accuracy at 10: 0.6968325791855203

semantically_most_similar partial_match
-------------------------
Hamming Loss: 0.7782805429864253
Accuracy Score: 0.22171945701357465
Jaccard Score: 0.13101604278074866
Accuracy at 1: 0.48868778280542