In [1]:
%load_ext autoreload
%autoreload 2

# Entity resolution

In [2]:
import pandas as pd
from pathlib import Path
from tools import eval_methods, resolution_methods
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
import random
import itertools
from tqdm import tqdm

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Load the British Isles gazetteer CSV: the first row is the header and the first column becomes the index.
# low_memory=False makes pandas parse the whole file at once instead of in chunks, avoiding mixed-dtype inference.
gazetteer_df = pd.read_csv("../processed/wikidata/british_isles_gazetteer.csv", header=0, index_col=0, low_memory=False)

In [4]:
# Load the precomputed Wikipedia overall entity-frequency resource (a dict, judging by the variable name — confirm).
# NOTE(review): the path is absolute, so the file must exist at exactly this location on the machine running the notebook.
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load this file if it comes from a trusted source.
import pickle
with open("/resources/wikipedia/extractedResources/overall_entity_freq.pickle", 'rb') as fp:
    wikipedia_entity_overall_dict = pickle.load(fp)

### Feature selection

In [5]:
# ------------------
# Build (or reuse) one feature file per candidate-count / split / ranker combination
num_candidates_list = [5, 3, 1]
settings = ["test", "dev"]
candrank_approaches = ["deezy_match"]

# NOTE: the loop targets must keep the names `num_candidates`, `setting` and `candrank`:
# the two loads after the loop read the final values of `candrank` and `num_candidates`.
for num_candidates, setting, candrank in itertools.product(num_candidates_list, settings, candrank_approaches):
    features_file = f"../processed/resolution/features_{candrank}_{setting}{num_candidates}.tsv"
    # Feature extraction only runs when the TSV is not already on disk.
    if not Path(features_file).is_file():
        df = pd.read_pickle(f"../processed/resolution/candranking_{candrank}_{setting}{num_candidates}.pkl")
        exp_df = resolution_methods.feature_selection(candrank, df, gazetteer_df, wikipedia_entity_overall_dict)
        # Keep a single row per (Query, Candidate) pair before writing.
        exp_df = exp_df.drop_duplicates(subset=['Query','Candidate'])
        exp_df.to_csv(features_file, sep="\t")
    print(f"{candrank} {setting} {num_candidates} done!")

# Load the dev and test feature files for the last combination visited by the loop.
features_dev = pd.read_csv(f"../processed/resolution/features_{candrank}_dev{num_candidates}.tsv", sep='\t', index_col=0)
features_test = pd.read_csv(f"../processed/resolution/features_{candrank}_test{num_candidates}.tsv", sep='\t', index_col=0)

deezy_match test 5 done!
deezy_match dev 5 done!
deezy_match test 3 done!
deezy_match dev 3 done!
deezy_match test 1 done!
deezy_match dev 1 done!


### Settings

In [6]:
# Experiment settings read by the cells below: number of candidates kept per query
# and the candidate-ranking method.
num_candidates = 1
candrank_method = "deezy_match"

# Development-set feature vectors for the selected configuration. The file name is built from
# `candrank_method` so that the settings in this cell, not a leftover loop variable, decide which file is read.
features_dev = pd.read_csv("../processed/resolution/features_" + candrank_method + "_dev" + str(num_candidates) + ".tsv",sep='\t', index_col=0)

### Method and baselines

In [7]:
# We will store the results of all methods/baselines as columns in the original structured dataframe:
results_test_df = pd.read_pickle("../processed/quicks/quicks_test.pkl")
# Test-set feature vectors for the configuration chosen in the settings cell (`candrank_method`, `num_candidates`),
# rather than for whatever value the feature-selection loop variable `candrank` last held.
features_test = pd.read_csv("../processed/resolution/features_" + candrank_method + "_test" + str(num_candidates) + ".tsv",sep='\t', index_col=0)

In [8]:
# -------------------------------
# Our method with a single classifier (results column "our_method_all")
# -------------------------------

# Feature columns f_0 ... f_8 handed to the classifier.
use_cols_all = [f"f_{i}" for i in range(9)]

# Aliases for the feature frames; later cells refer to these names.
dev_df = features_dev # development set feature vectors
features_test_df = features_test # test set feature vectors

# ------------------------------
# Fit on the development features with all the feature columns ...
clf_all = resolution_methods.train_classifier(dev_df, use_cols_all)

# ------------------------------
# ... then apply the fitted classifier to the test features.
results_test_df = resolution_methods.our_method_simple(
    features_test_df,
    clf_all,
    use_cols_all,
    gazetteer_df,
    results_test_df,
)

Queries in train and test: 105 105
Instances in train and test: 746 4476
{'C': 0.1, 'kernel': 'linear'}
Classification report on the test split of the dev set:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4373
           1       0.86      0.69      0.76       103

    accuracy                           0.99      4476
   macro avg       0.92      0.84      0.88      4476
weighted avg       0.99      0.99      0.99      4476

[[ 1.42426317e+00 -5.70736045e-01  9.66669200e-05  2.33409328e-02
  -4.06976539e-01  3.63993217e-01  8.16300491e-01  1.56249737e-04
   1.39545877e-02]]


In [9]:
# -------------------------------
# Baseline: Apply candrank-most-confident baseline
# -------------------------------
# Passes the test-set feature vectors and the running results frame to the helper and rebinds
# `results_test_df` to its return value (presumably the same frame plus a "candrank_most_confident"
# column, which the evaluation cell reads — confirm in resolution_methods).
results_test_df = resolution_methods.candrank_most_confident(features_test_df, results_test_df)

In [10]:
# -------------------------------
# Baseline: Apply wikipedia-most-relevant baseline
# -------------------------------
# Passes the test-set feature vectors and the running results frame to the helper and rebinds
# `results_test_df` to its return value (presumably the same frame plus a "wikipedia_most_relevant"
# column, which the evaluation cell reads — confirm in resolution_methods).
results_test_df = resolution_methods.wikipedia_most_relevant(features_test_df, results_test_df)

In [11]:
# -------------------------------
# Baseline: Apply semantically_most_similar baseline
# -------------------------------
# Passes the test-set feature vectors and the running results frame to the helper and rebinds
# `results_test_df` to its return value (presumably the same frame plus a "semantically_most_similar"
# column, which the evaluation cell reads — confirm in resolution_methods).
results_test_df = resolution_methods.semantically_most_similar(features_test_df, results_test_df)

In [12]:
# -------------------------------
# RankLib: Apply learning to rank
# -------------------------------

# Arguments forwarded to the RankLib wrapper.
cross_val = False
feature_combination = "allfeatures" # the all-features combination
# NOTE(review): `filter` shadows the Python builtin for the rest of the session; the name is
# kept as-is because cells outside this view may read it.
filter="all"
# Absolute path two directory levels above the working directory, with a trailing slash.
code_folder = f'{Path("../../").resolve()}/'

# Apply all features combination to test set:
results_test_df = resolution_methods.ranklib(
    features_dev,
    features_test_df,
    filter,
    code_folder,
    cross_val,
    results_test_df,
    feature_combination,
    num_candidates,
)

P@1 on training data: 0.681 P@1 on test data: 0.6869
feature used: 1 2 3 4 5 6 7 8 9


In [13]:
# Preview the first rows of the results frame to inspect the prediction columns collected so far.
results_test_df.head()

Unnamed: 0,MainId,SubId,MainStation,SubStation,SubStFormatted,Description,Final Wikidata ID,Disambiguator,Companies,FirstCompanyWkdt,...,Altnames,Referenced,FirstOpening,LastClosing,Interrupted,our_method_all,candrank_most_confident,wikipedia_most_relevant,semantically_most_similar,ranklib
0,4212,5467,HEREFORD,H BARTON,HEREFORD BARTON,[GW] op 2 January 1854 (T 29 December) ; clo ...,Q29379045,[],[GW],Q843251,...,[],[],2 January 1854,2 January 1893,False,Q29379045,Q29379045,Q23129,Q29379045,Q29379045
1,2075,2743,COED ELY,COED ELY,COED ELY,[GW] op 13 July 1925 (Cardiff Divisional Repor...,Q5140512,[],[GW],Q843251,...,[],[],13 July 1925,9 June 1958,False,Q5140512,Q5140512,Q5140496,Q5140496,Q5140512
2,6719,8798,PENYFFORDD,PENYFFORDD,PENYFFORDD,[GC] op 1 May 1866 (T 7 th ) ; still open. Ope...,Q3401808,[],[GC],Q688684,...,"[LEESWOOD, HOPE, HOPE JUNCTION]",[],1 May 1866,31 December 2001,False,Q3401808,Q3401808,Q2752670,Q28970438,Q3401808
3,2622,3409,DERBY,DERBY,DERBY,[Birmingham & Derby Junction] temporary op 12 ...,Q3398679,[],[Birmingham & Derby Junction],Q3700357,...,"[DERBY MIDLAND, DERBY STATION STREET]",[],11 May 1840,31 December 2001,True,Q3398679,Q3398679,Q43475,Q3398679,Q3398679
4,4593,5966,INVERUGLAS,INVERUGLAS,INVERUGLAS,"[LNE] (non-tt): workmen, Loch Sloy HEP; op 29 ...",Q48807380,[],[LNE],Q1092839,...,[],[],29 October 1945,1940,False,Q48807380,Q48807380,Q16892730,Q799488,Q48807380


In [14]:
# -------------------------------
# Skyline: Best possible result given candidates
# -------------------------------
# Rebinds `results_test_df` to the helper's return value; the evaluation cell later reads a
# "skyline" column from it (presumably added here — confirm in resolution_methods).
results_test_df = resolution_methods.skyline(features_test_df, results_test_df)

In [15]:
# -------------------------------
# Our method comb: Combine stations and places classifiers
# -------------------------------

# Feature columns f_0 ... f_8; both classifiers share this one list object.
use_cols_all = [f"f_{i}" for i in range(9)]
use_cols_stations = use_cols_all
use_cols_places = use_cols_all

dev_df = features_dev # development set feature vectors

# Railway stations classifier: trained on the rows where "Exact" is 1
# --------------------------------------------
df_exact = dev_df[dev_df["Exact"] == 1]
clf_stations = resolution_methods.train_classifier(df_exact, use_cols_all)

# Places classifier: trained on the rows where "Exact" is 0
# --------------------------------------------
df_inexact = dev_df[dev_df["Exact"] == 0]
clf_places = resolution_methods.train_classifier(df_inexact, use_cols_all)

# Find optimal threshold for stations/places
# --------------------------------------------
# Sweep thresholds 0.00, 0.05, ..., 0.95 on the development set. Because the comparison is `>=`,
# the highest threshold among equally accurate ones is the one retained.
optimal_threshold, keep_acc = 0.0, 0.0
for th in (round(step, 2) for step in np.arange(0, 1, 0.05)):
    # The dev results frame is re-read from disk on every iteration so each threshold starts from a fresh copy.
    results_dev_df = resolution_methods.our_method_comb(
        features_dev,
        clf_stations,
        use_cols_stations,
        clf_places,
        use_cols_places,
        gazetteer_df,
        th,
        pd.read_pickle("../processed/quicks/quicks_dev.pkl"),
    )
    acc = eval_methods.topres_exactmetrics(results_dev_df, "our_method_comb", False)
    if acc >= keep_acc:
        optimal_threshold, keep_acc = th, acc

print(optimal_threshold, keep_acc)

# Apply the combined classifiers to the test set with the selected threshold
# (evaluated through the "our_method_comb" column)
# -------------------------------
features_test_df = features_test # test set feature vectors
results_test_df = resolution_methods.our_method_comb(
    features_test_df,
    clf_stations,
    use_cols_stations,
    clf_places,
    use_cols_places,
    gazetteer_df,
    optimal_threshold,
    results_test_df,
)

Queries in train and test: 82 81
Instances in train and test: 336 4555
{'C': 0.1, 'kernel': 'linear'}
Classification report on the test split of the dev set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4467
           1       0.97      0.77      0.86        88

    accuracy                           1.00      4555
   macro avg       0.98      0.89      0.93      4555
weighted avg       1.00      1.00      0.99      4555

[[ 1.51967785e+00 -5.58781562e-02 -5.63567062e-04  8.28458809e-06
   2.74448717e-02  0.00000000e+00  5.35876468e-01  0.00000000e+00
  -1.81169897e-03]]
Queries in train and test: 24 23
Instances in train and test: 216 115
{'C': 1000, 'kernel': 'linear'}
Classification report on the test split of the dev set:
              precision    recall  f1-score   support

           0       0.96      0.91      0.93       109
           1       0.17      0.33      0.22         6

    accuracy                           0.

### Evaluate entity resolution

In [17]:
# Evaluate every resolution approach on the test set and collect one table row per approach.
eval_results = []
eval_strict = True
# Label written to the "Eval" column; it depends only on `eval_strict`, so it is computed once up front.
eval_mode = "appr" if not eval_strict else "strict"
# Maps each results column to the display name used in the output table.
dApproachNames = {"candrank_most_confident":"string confidence", "wikipedia_most_relevant":"wikipedia relevance", "semantically_most_similar":"semantic coherence", "our_method_all":"SVM simple", "our_method_comb":"SVM combined", "skyline": "skyline","ranklib":"ranklib all features"}
for topres_approach in ["skyline", "candrank_most_confident", "wikipedia_most_relevant", "semantically_most_similar","ranklib", "our_method_all", "our_method_comb"]:
    print(topres_approach, candrank_method, "(strict: " + str(eval_strict) + ")", "(numCands: " + str(num_candidates) + ")")
    exact_acc = eval_methods.topres_exactmetrics(results_test_df, topres_approach, eval_strict)
    # Distance-based accuracies are not computed for the skyline; its columns keep the "-" placeholders.
    acc_at = ("-", "-", "-")
    if topres_approach != "skyline":
        acc_at = eval_methods.topres_distancemetrics(gazetteer_df, results_test_df, topres_approach, eval_strict)

    eval_results.append([eval_mode, dApproachNames[topres_approach], exact_acc, acc_at[0], acc_at[1], acc_at[2]])

tr_eval_df = pd.DataFrame(eval_results, columns = ["Eval", "Approach", "Prec", "Acc@1km", "Acc@5km", "Acc@10km"])
# Emit the table, rounded to two decimals, as LaTeX.
print()
print(tr_eval_df.round(2).to_latex(index=False))

skyline deezy_match (strict: True) (numCands: 1)
candrank_most_confident deezy_match (strict: True) (numCands: 1)
wikipedia_most_relevant deezy_match (strict: True) (numCands: 1)
semantically_most_similar deezy_match (strict: True) (numCands: 1)
ranklib deezy_match (strict: True) (numCands: 1)
our_method_all deezy_match (strict: True) (numCands: 1)
our_method_comb deezy_match (strict: True) (numCands: 1)

\begin{tabular}{llrlll}
\toprule
  Eval &             Approach &  Prec & Acc@1km & Acc@5km & Acc@10km \\
\midrule
strict &              skyline &  0.70 &       - &       - &        - \\
strict &    string confidence &  0.63 &    0.67 &    0.72 &     0.72 \\
strict &  wikipedia relevance &  0.07 &    0.41 &    0.65 &     0.66 \\
strict &   semantic coherence &  0.32 &    0.48 &    0.63 &     0.63 \\
strict & ranklib all features &  0.65 &    0.68 &    0.74 &     0.74 \\
strict &           SVM simple &  0.67 &     0.7 &    0.73 &     0.73 \\
strict &         SVM combined &  0.68 &    0.

### Feature analysis

In [18]:
# ------------------------------
# Inspect best features in ranklib
code_folder = str(Path("../../").resolve()) + "/"
features_folder = str(Path("supervised_ranking/features/").resolve()) + "/"
Path(features_folder).mkdir(parents=True, exist_ok=True)
Path("supervised_ranking/feature_combs/").mkdir(parents=True, exist_ok=True)
cross_val = True
for filt in ["exact", "notexact"]:
    if not Path("supervised_ranking/feature_combs/" + filt + str(num_candidates) + ".txt").is_file():
        feature_combination = resolution_methods.find_feature_comb(features_folder, filt, cross_val, code_folder, features_dev, features_test_df, results_test_df, num_candidates)