In [None]:
%load_ext autoreload
%autoreload 2

# Entity resolution

In [None]:
import pandas as pd
from pathlib import Path
from tools import eval_methods, resolution_methods
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
import random

In [None]:
# Gazetteer table read from the processed Wikidata directory. The first CSV
# row is the header and the first column becomes the DataFrame index.
gazetteer_df = pd.read_csv(
    "../processed/wikidata/british_isles_gazetteer.csv",
    header=0,
    index_col=0,
    low_memory=False,
)

In [None]:
import pickle

# Load the pickled object stored in overall_entity_freq.pickle. It is passed
# to resolution_methods.feature_selection further down.
# NOTE(review): this is an absolute path, so the notebook only runs on
# machines where /resources is laid out the same way.
# NOTE(review): pickle.load can execute arbitrary code; only use it on files
# from a trusted source.
with open("/resources/wikipedia/extractedResources/overall_entity_freq.pickle", "rb") as pickle_file:
    wikipedia_entity_overall_dict = pickle.load(pickle_file)

### Feature selection

In [None]:
# ------------------
# Feature selection for dev and test
#
# For every (number of candidates, split, candidate-ranking approach)
# combination, build the feature table from the pickled candidate-ranking
# output and cache it as a TSV. A combination whose TSV already exists on
# disk is not recomputed.
num_candidates_list = [5, 3, 1]
settings = ["test", "dev"]
candrank_approaches = ["deezy_match"]
features_template = "../processed/resolution/features_{}_{}{}.tsv"
candranking_template = "../processed/resolution/candranking_{}_{}{}.pkl"
for num_candidates in num_candidates_list:
    for setting in settings:
        for candrank in candrank_approaches:
            features_file = features_template.format(candrank, setting, num_candidates)
            if not Path(features_file).is_file():
                df = pd.read_pickle(candranking_template.format(candrank, setting, num_candidates))
                exp_df = resolution_methods.feature_selection(candrank, df, gazetteer_df, wikipedia_entity_overall_dict)
                # Keep a single row per (Query, Candidate) pair.
                exp_df.drop_duplicates(subset=["Query", "Candidate"], inplace=True)
                exp_df.to_csv(features_file, sep="\t")
            print("{} {} {} done!".format(candrank, setting, num_candidates))

# `candrank` and `num_candidates` still hold their last loop values here
# ("deezy_match" and 1), so these two reads load the 1-candidate tables.
features_dev = pd.read_csv(features_template.format(candrank, "dev", num_candidates), sep="\t", index_col=0)
features_test = pd.read_csv(features_template.format(candrank, "test", num_candidates), sep="\t", index_col=0)

### Training classifiers

In [None]:
# Configuration of the run that the classifiers below are trained on:
# number of candidates per query and the candidate-ranking method.
num_candidates = 1
candrank_method = "deezy_match"

# Build the path from `candrank_method` (set just above) rather than from
# `candrank`, which only exists as a leftover of the feature-selection loop.
# This way the cell depends only on its own configuration.
features_dev = pd.read_csv("../processed/resolution/features_" + candrank_method + "_dev" + str(num_candidates) + ".tsv", sep='\t', index_col=0)

In [None]:
# -------------------------------
# Predict railway stations
# -------------------------------
# Features used (columns f_0 ... f_8 of the feature table):
#   f0  SubStFormatted-candidate DeezyMatch confidence
#   f1  MainStation-candidate DeezyMatch confidence
#   f2  Altname-candidate DeezyMatch confidence
#   f3  String similarity
#   f4  Wikidata class: is it an instance of railway station
#   f5  Wikidata class: is it an instance of populated place
#   f6  Closest place confidence
#   f7  Closest subst/altname confidence
#   f8  Relevance through wikipedia inlinks

dev_df = features_dev  # development set feature vectors
# This classifier is fitted only on the rows flagged Exact == 1.
is_exact = dev_df["Exact"] == 1
df_exact = dev_df[is_exact]
use_cols_stations = ["f_{}".format(i) for i in range(9)]  # f_0 ... f_8

# NOTE(review): an earlier comment said train_classifier also returns the test
# portion of the development set. Here its whole return value is bound to
# clf_stations and pickled -- confirm against tools.resolution_methods.
clf_stations = resolution_methods.train_classifier(df_exact, use_cols_stations)

# Persist the trained object next to the other resolution artefacts.
with open('../processed/resolution/clf_stations.pkl', 'wb') as clf_file:
    pickle.dump(clf_stations, clf_file)

# -------------------------------
# Predict places
# -------------------------------
# Features used (a subset of the feature-table columns):
#   f1  MainStation-candidate DeezyMatch confidence
#   f3  String similarity
#   f4  Wikidata class: is it an instance of railway station
#   f5  Wikidata class: is it an instance of populated place
#   f7  Closest subst/altname confidence
#   f8  Relevance through wikipedia inlinks

dev_df = features_dev  # development set feature vectors
# This classifier is fitted only on the rows flagged Exact == 0.
is_inexact = dev_df["Exact"] == 0
df_inexact = dev_df[is_inexact]
use_cols_places = ["f_{}".format(i) for i in (1, 3, 4, 5, 7, 8)]

# NOTE(review): an earlier comment said train_classifier also returns the test
# portion of the development set. Here its whole return value is bound to
# clf_places and pickled -- confirm against tools.resolution_methods.
clf_places = resolution_methods.train_classifier(df_inexact, use_cols_places)

# Persist the trained object next to the other resolution artefacts.
with open('../processed/resolution/clf_places.pkl', 'wb') as clf_file:
    pickle.dump(clf_places, clf_file)

### Find best threshold

In [None]:
# Find optimal threshold for stations/places
# --------------------------------------------
# Sweep thresholds from 0 up to (not including) 1 in steps of 0.05 on the
# development set. Keep the threshold with the highest score returned by
# eval_methods.topres_exactmetrics. The comparison is `>=`, so when two
# thresholds tie the larger one wins.
optimal_threshold = 0.0
best_dev_acc = 0.0
candidate_thresholds = [round(raw_th, 2) for raw_th in np.arange(0, 1, 0.05)]
for th in candidate_thresholds:
    # The dev results are re-read from disk for every threshold, so each call
    # to our_method starts from the same stored dataframe.
    results_dev_df = pd.read_pickle("../processed/quicks/quicks_dev.pkl")
    results_dev_df = resolution_methods.our_method(
        features_dev,
        clf_stations,
        use_cols_stations,
        clf_places,
        use_cols_places,
        gazetteer_df,
        th,
        results_dev_df,
    )
    dev_acc = eval_methods.topres_exactmetrics(results_dev_df, "our_method", False)
    if dev_acc >= best_dev_acc:
        optimal_threshold, best_dev_acc = th, dev_acc

print(optimal_threshold)

### Method and baselines

In [None]:
# We will store the results of all methods/baselines as columns in the original structured dataframe:
results_test_df = pd.read_pickle("../processed/quicks/quicks_test.pkl")
# Build the path from `candrank_method` and `num_candidates` (the explicit
# configuration set in the training section) rather than from `candrank`,
# which only exists as a leftover of the feature-selection loop.
features_test = pd.read_csv("../processed/resolution/features_" + candrank_method + "_test" + str(num_candidates) + ".tsv", sep='\t', index_col=0)

In [None]:
# -------------------------------
# Apply our classification methods (column "our_method")
# -------------------------------
# Runs both trained classifiers on the test-set features, using the threshold
# selected on the development set.
features_test_df = features_test  # test set feature vectors
results_test_df = resolution_methods.our_method(
    features_test_df,
    clf_stations,
    use_cols_stations,
    clf_places,
    use_cols_places,
    gazetteer_df,
    optimal_threshold,
    results_test_df,
)

In [None]:
# -------------------------------
# Baseline: candrank-most-confident
# -------------------------------
# Evaluated later under the name "candrank_most_confident".
results_test_df = resolution_methods.candrank_most_confident(
    features_test_df,
    results_test_df,
)

In [None]:
# -------------------------------
# Baseline: wikipedia-most-relevant
# -------------------------------
# Evaluated later under the name "wikipedia_most_relevant".
results_test_df = resolution_methods.wikipedia_most_relevant(
    features_test_df,
    results_test_df,
)

In [None]:
# -------------------------------
# Baseline: semantically_most_similar
# -------------------------------
# Evaluated later under the name "semantically_most_similar".
results_test_df = resolution_methods.semantically_most_similar(
    features_test_df,
    results_test_df,
)

In [None]:
# -------------------------------
# Skyline: best possible result given the candidates
# -------------------------------
# Evaluated later under the name "skyline".
results_test_df = resolution_methods.skyline(
    features_test_df,
    results_test_df,
)

### Evaluate entity resolution

In [None]:
# Evaluate the skyline, the three baselines and our method on the test set,
# then print the summary table as LaTeX.
eval_results = []
eval_strict = False
# Label for the "Eval" column. It depends only on eval_strict, so it is
# computed once instead of on every loop iteration.
eval_mode = "strict" if eval_strict else "appr"
dApproachNames = {"candrank_most_confident":"string confidence", "wikipedia_most_relevant":"wikipedia relevance", "semantically_most_similar":"semantic coherence", "our_method":"our method", "skyline": "skyline"}
for topres_approach in ["skyline", "candrank_most_confident", "wikipedia_most_relevant", "semantically_most_similar", "our_method"]:
    print(topres_approach, candrank_method, "(strict: " + str(eval_strict) + ")", "(numCands: " + str(num_candidates) + ")")
    exact_acc = eval_methods.topres_exactmetrics(results_test_df, topres_approach, eval_strict)
    if topres_approach == "skyline":
        # No distance-based metrics are computed for the skyline; the table
        # shows "-" placeholders instead.
        acc_at = ("-", "-", "-")
    else:
        acc_at = eval_methods.topres_distancemetrics(gazetteer_df, results_test_df, topres_approach, eval_strict)

    eval_results.append([eval_mode, dApproachNames[topres_approach], exact_acc, acc_at[0], acc_at[1], acc_at[2]])

tr_eval_df = pd.DataFrame(eval_results, columns = ["Eval", "Approach", "Prec", "Acc@1km", "Acc@5km", "Acc@10km"])
print()
# The skyline row puts "-" strings into the Acc@ columns, so those columns have
# object dtype and DataFrame.round() leaves them untouched. Formatting each
# float cell at print time gives every metric two decimals.
print(tr_eval_df.to_latex(index=False, float_format="%.2f"))