In [1]:
%load_ext autoreload
%autoreload 2

# Entity resolution

In [2]:
import pandas as pd
from pathlib import Path
from tools import eval_methods, resolution_methods
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
import random

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Load the British Isles gazetteer CSV; the first row is the header and the
# first column becomes the DataFrame index.
gazetteer_df = pd.read_csv(
    "../processed/wikidata/british_isles_gazetteer.csv",
    header=0,
    index_col=0,
    low_memory=False,
)

In [4]:
import pickle

# Unpickle the Wikipedia resource that is later handed to feature selection
# (presumably overall entity frequencies, going by the file name).
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("/resources/wikipedia/extractedResources/overall_entity_freq.pickle", "rb") as freq_file:
    wikipedia_entity_overall_dict = pickle.load(freq_file)

### Feature selection

In [5]:
# ------------------
# Feature selection for dev and test
#
# For every (number of candidates, split, candidate-ranking approach)
# combination, compute the feature file once and cache it as a TSV; an
# existing file is reused as-is and never recomputed.
num_candidates_list = [5, 3, 1]
settings = ["test", "dev"]
candrank_approaches = ["deezy_match"]
resolution_dir = "../processed/resolution/"

for num_candidates in num_candidates_list:
    for setting in settings:
        for candrank in candrank_approaches:
            run_id = f"{candrank}_{setting}{num_candidates}"
            features_file = f"{resolution_dir}features_{run_id}.tsv"
            if not Path(features_file).is_file():
                df = pd.read_pickle(f"{resolution_dir}candranking_{run_id}.pkl")
                exp_df = resolution_methods.feature_selection(candrank, df, gazetteer_df, wikipedia_entity_overall_dict)
                # Keep a single row per (query, candidate) pair.
                exp_df = exp_df.drop_duplicates(subset=["Query", "Candidate"])
                exp_df.to_csv(features_file, sep="\t")
            print(candrank + " " + setting + " " + str(num_candidates) + " done!")

# Load the features of the last configured combination. The choice is made
# explicit here instead of relying on the loop variables still being bound
# after the loops have finished.
loaded_candrank = candrank_approaches[-1]
loaded_num_candidates = num_candidates_list[-1]
features_dev = pd.read_csv(
    f"{resolution_dir}features_{loaded_candrank}_dev{loaded_num_candidates}.tsv",
    sep="\t",
    index_col=0,
)
features_test = pd.read_csv(
    f"{resolution_dir}features_{loaded_candrank}_test{loaded_num_candidates}.tsv",
    sep="\t",
    index_col=0,
)

deezy_match test 5 done!
deezy_match dev 5 done!
deezy_match test 3 done!
deezy_match dev 3 done!
deezy_match test 1 done!
deezy_match dev 1 done!


### Training classifiers

In [6]:
# Experiment configuration: number of candidates per query and the
# candidate-ranking method whose feature files are loaded.
num_candidates = 1
candrank_method = "deezy_match"

# Build the paths from `candrank_method`, which is defined in this cell, so the
# cell does not depend on a variable left over from an earlier loop.
features_dev = pd.read_csv(
    f"../processed/resolution/features_{candrank_method}_dev{num_candidates}.tsv",
    sep="\t",
    index_col=0,
)
features_test = pd.read_csv(
    f"../processed/resolution/features_{candrank_method}_test{num_candidates}.tsv",
    sep="\t",
    index_col=0,
)

In [7]:
# -------------------------------
# Predict railway stations
# -------------------------------
# Feature columns used (all nine):
#   f_0  SubStFormatted-candidate DeezyMatch confidence
#   f_1  MainStation-candidate DeezyMatch confidence
#   f_2  Altname-candidate DeezyMatch confidence
#   f_3  String similarity
#   f_4  Wikidata class: is it an instance of railway station
#   f_5  Wikidata class: is it an instance of populated place
#   f_6  Closest place confidence
#   f_7  Closest subst/altname confidence
#   f_8  Relevance through wikipedia inlinks

dev_df = features_dev  # development set feature vectors
df_exact = dev_df[dev_df["Exact"].eq(1)]  # rows flagged Exact == 1
use_cols_stations = [f"f_{idx}" for idx in range(9)]  # features to use

# Train the station classifier on the Exact == 1 rows of the dev set.
# NOTE(review): an earlier comment said this call also returns the test split
# of the dev set, but only one value is bound here; confirm against
# resolution_methods.train_classifier.
clf_stations = resolution_methods.train_classifier(df_exact, use_cols_stations)

# Persist the trained classifier.
with open("../processed/resolution/clf_stations.pkl", "wb") as clf_file:
    pickle.dump(clf_stations, clf_file)

# -------------------------------
# Predict places
# -------------------------------
# Feature columns used (a subset of the nine available):
#   f_1  MainStation-candidate DeezyMatch confidence
#   f_3  String similarity
#   f_4  Wikidata class: is it an instance of railway station
#   f_5  Wikidata class: is it an instance of populated place
#   f_7  Closest subst/altname confidence
#   f_8  Relevance through wikipedia inlinks

dev_df = features_dev  # development set feature vectors
df_inexact = dev_df[dev_df["Exact"].eq(0)]  # rows flagged Exact == 0
use_cols_places = [f"f_{idx}" for idx in (1, 3, 4, 5, 7, 8)]  # features to use

# Train the place classifier on the Exact == 0 rows of the dev set.
# NOTE(review): an earlier comment said this call also returns the test split
# of the dev set, but only one value is bound here; confirm against
# resolution_methods.train_classifier.
clf_places = resolution_methods.train_classifier(df_inexact, use_cols_places)

# Persist the trained classifier.
with open("../processed/resolution/clf_places.pkl", "wb") as clf_file:
    pickle.dump(clf_places, clf_file)

Queries in train and test: 82 81
Instances in train and test: 336 4555
{'C': 0.1, 'kernel': 'linear'}
Classification report on the test split of the dev set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4467
           1       0.97      0.77      0.86        88

    accuracy                           1.00      4555
   macro avg       0.98      0.89      0.93      4555
weighted avg       1.00      1.00      0.99      4555

[[ 1.51967785e+00 -5.58781562e-02 -5.63567062e-04  8.28458809e-06
   2.74448717e-02  0.00000000e+00  5.35876468e-01  0.00000000e+00
  -1.81169897e-03]]
Queries in train and test: 24 23
Instances in train and test: 216 115
{'C': 1000, 'kernel': 'linear'}
Classification report on the test split of the dev set:
              precision    recall  f1-score   support

           0       0.96      0.91      0.93       109
           1       0.17      0.33      0.22         6

    accuracy                           0.

### Find best threshold

In [8]:
# Find the optimal threshold for stations/places
# -----------------------------------------------
# Sweep thresholds from 0.0 up to (but excluding) 1.0 in steps of 0.05 and keep
# the one with the highest exact accuracy on the dev set. Because the
# comparison is inclusive, ties are resolved in favour of the larger threshold.
optimal_threshold = 0.0
best_acc = 0.0
for raw_th in np.arange(0, 1, 0.05):
    th = round(raw_th, 2)  # strip floating-point noise from the arange step
    # The dev results frame is re-read from disk for every threshold
    # (presumably so each run starts from an unmodified frame; TODO confirm).
    results_dev_df = resolution_methods.our_method(
        features_dev,
        clf_stations,
        use_cols_stations,
        clf_places,
        use_cols_places,
        gazetteer_df,
        th,
        pd.read_pickle("../processed/quicks/quicks_dev.pkl"),
    )
    dev_acc = eval_methods.topres_exactmetrics(results_dev_df, "our_method")
    if dev_acc >= best_acc:
        optimal_threshold, best_acc = th, dev_acc

### Method and baselines

In [9]:
# Start from the original structured test dataframe; each method/baseline
# below stores its results in it as an additional column.
results_test_df = pd.read_pickle(
    "../processed/quicks/quicks_test.pkl"
)

In [10]:
# -------------------------------
# Our classification method (results column "our_method")
# -------------------------------
# Run both classifiers on the test features with the threshold selected on the
# development set.
features_test_df = features_test  # test set feature vectors
results_test_df = resolution_methods.our_method(
    features_test_df,
    clf_stations,
    use_cols_stations,
    clf_places,
    use_cols_places,
    gazetteer_df,
    optimal_threshold,
    results_test_df,
)

In [11]:
# -------------------------------
# Baseline: candrank-most-confident
# -------------------------------
# Results are presumably stored under a "candrank_most_confident" column, which
# is the name the evaluation cell looks up; confirm in resolution_methods.
results_test_df = resolution_methods.candrank_most_confident(
    features_test_df,
    results_test_df,
)

In [12]:
# -------------------------------
# Baseline: wikipedia-most-relevant
# -------------------------------
# Results are presumably stored under a "wikipedia_most_relevant" column, which
# is the name the evaluation cell looks up; confirm in resolution_methods.
results_test_df = resolution_methods.wikipedia_most_relevant(
    features_test_df,
    results_test_df,
)

In [13]:
# -------------------------------
# Baseline: semantically_most_similar
# -------------------------------
# Results are presumably stored under a "semantically_most_similar" column,
# which is the name the evaluation cell looks up; confirm in resolution_methods.
results_test_df = resolution_methods.semantically_most_similar(
    features_test_df,
    results_test_df,
)

### Evaluate entity resolution

In [14]:
# Report exact-match accuracy and the three distance-based accuracies for our
# method and every baseline on the test set.
# NOTE(review): the distance cut-offs behind Acc@1/5/10 are defined in
# eval_methods.topres_distancemetrics; units are not visible here.
topres_approaches = (
    "our_method",
    "candrank_most_confident",
    "wikipedia_most_relevant",
    "semantically_most_similar",
)
for topres_approach in topres_approaches:
    print(topres_approach, candrank_method)
    print("-------------------------")
    exact_acc = eval_methods.topres_exactmetrics(results_test_df, topres_approach)
    acc_at_1, acc_at_5, acc_at_10 = eval_methods.topres_distancemetrics(gazetteer_df, results_test_df, topres_approach)
    report_rows = (
        ("Exact:", exact_acc),
        ("Acc@1:", acc_at_1),
        ("Acc@5:", acc_at_5),
        ("Acc@10:", acc_at_10),
    )
    for label, score in report_rows:
        print(label, score)
    print()

our_method deezy_match
-------------------------
Exact: 0.7031963470319634
Acc@1: 0.7671232876712328
Acc@5: 0.821917808219178
Acc@10: 0.8264840182648402

candrank_most_confident deezy_match
-------------------------
Exact: 0.634703196347032
Acc@1: 0.684931506849315
Acc@5: 0.7534246575342466
Acc@10: 0.7579908675799086

wikipedia_most_relevant deezy_match
-------------------------
Exact: 0.1187214611872146
Acc@1: 0.4611872146118721
Acc@5: 0.7123287671232876
Acc@10: 0.7214611872146118

semantically_most_similar deezy_match
-------------------------
Exact: 0.3287671232876712
Acc@1: 0.5205479452054794
Acc@5: 0.6986301369863014
Acc@10: 0.7077625570776256

