# Entity resolution

In [None]:
import pandas as pd
from pathlib import Path
from tools import eval_methods, resolution_methods
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score
import random

# import pickle

In [None]:
# Load the gazetteer CSV: the first row holds the column names and the first
# column is used as the index. low_memory=False makes pandas parse the file in
# one pass instead of in chunks.
gazetteer_df = pd.read_csv(
    "../processed/wikidata/british_isles_gazetteer.csv",
    header=0,
    index_col=0,
    low_memory=False,
)

In [None]:
import pickle

# Load the pickled Wikipedia resource that is later handed to feature selection.
# Presumably a mapping of entity -> overall frequency, judging by the names;
# confirm against the code that produced the file.
# NOTE: unpickling can execute arbitrary code, so only load trusted files here.
with open(
    "/resources/wikipedia/extractedResources/overall_entity_freq.pickle",
    mode="rb",
) as fp:
    wikipedia_entity_overall_dict = pickle.load(fp)

### Feature selection

In [None]:
# ------------------
# Build the feature tables for the test and dev splits.
# A table is only computed when its TSV is not on disk yet; an existing file is
# reused untouched, so delete it to force a recomputation.
settings = ["test", "dev"]
candrank_approaches = ["deezy_match"]
for candrank in candrank_approaches:
    for setting in settings:
        features_file = f"features_{setting}_{candrank}.tsv"
        if not Path(features_file).is_file():
            df = pd.read_pickle(f"candranking_{setting}.pkl")
            exp_df = resolution_methods.feature_selection(
                candrank, df, gazetteer_df, wikipedia_entity_overall_dict
            )
            # Keep a single row per (Query, Candidate) pair before saving.
            exp_df = exp_df.drop_duplicates(subset=['Query', 'Candidate'])
            exp_df.to_csv(features_file, sep="\t")
        print(f"{candrank} {setting} done!")

# `candrank` still holds the last approach iterated above, so these are the
# feature tables of that approach.
features_dev = pd.read_csv(f"features_dev_{candrank}.tsv", sep='\t', index_col=0)
features_test = pd.read_csv(f"features_test_{candrank}.tsv", sep='\t', index_col=0)

### Training classifiers

In [None]:
# Candidate ranking approach; it selects which "features_<split>_<approach>.tsv"
# files the cells below read.
candrank_method = "deezy_match"

In [None]:
# -------------------------------
# Train the two classifiers on the development set
# -------------------------------
# The development feature vectors are read once and split on the "Exact"
# column: rows with Exact == 1 train the stations classifier, rows with
# Exact == 0 train the places classifier. Boolean-mask selection returns new
# dataframes, so both subsets can be taken from the same `dev_df`.
dev_df = pd.read_csv("features_dev_" + candrank_method + ".tsv",sep='\t', index_col=0) # development set feature vectors

# -------------------------------
# Predict railway stations
# -------------------------------
# f0 (SubStFormatted-candidate DeezyMatch confidence)
# f1 (MainStation-candidate DeezyMatch confidence)
# f2 (Altname-candidate DeezyMatch confidence)
# f3 (String similarity)
# f4 (Wikidata class: is it an instance of railway station)
# f5 (Wikidata class: is it an instance of populated place)
# f6 (Closest place confidence)
# f7 (Closest subst/altname confidence)
# f8 (Relevance through wikipedia inlinks)

df_exact = dev_df[dev_df["Exact"] == 1]
use_cols_stations = ['f_0','f_1','f_2','f_3','f_4','f_5','f_6','f_7','f_8'] # features to use

# NOTE(review): an earlier comment said train_classifier also returns the test
# portion of the development set; only a single value is bound here -- confirm
# against resolution_methods.train_classifier.
clf_stations = resolution_methods.train_classifier(df_exact, use_cols_stations)

# Persist the stations classifier so it can be reused without retraining.
with open('clf_stations.pkl', 'wb') as f:
    pickle.dump(clf_stations, f)

# -------------------------------
# Predict places
# -------------------------------
# f1 (MainStation-candidate DeezyMatch confidence)
# f3 (String similarity)
# f4 (Wikidata class: is it an instance of railway station)
# f5 (Wikidata class: is it an instance of populated place)
# f7 (Closest subst/altname confidence)
# f8 (Relevance through wikipedia inlinks)

df_inexact = dev_df[dev_df["Exact"] == 0]
use_cols_places = ['f_1','f_3','f_4','f_5','f_7','f_8'] # features to use

# The places classifier is kept in memory only (it is not written to disk).
clf_places = resolution_methods.train_classifier(df_inexact, use_cols_places)

### Find best threshold

In [None]:
# Find optimal threshold for stations/places
# --------------------------------------------
# Grid search over the thresholds 0.00, 0.05, ..., 0.95: run `our_method` on the
# development set with each value and keep the one with the highest score
# returned by eval_methods.topres_exactmetrics.
features_dev_df = pd.read_csv("features_dev_" + candrank_method + ".tsv",sep='\t', index_col=0) # development set feature vectors

# The threshold must be tuned on the development split. The previous version
# loaded "quicks_test.pkl" here, which paired dev features with the test
# dataframe and tuned the threshold on the data used for the final evaluation.
# NOTE(review): assumes "quicks_dev.pkl" sits next to "quicks_test.pkl" -- confirm.
gold_dev_df = pd.read_pickle("../processed/quicks/quicks_dev.pkl")

optimal_threshold = 0.0
keep_acc = 0.0
for th in np.arange(0, 1, 0.05):
    th = round(th, 2)  # remove floating point noise from np.arange (e.g. 0.15000000000000002)
    # Both files are read once above; fresh copies are passed on every
    # iteration in case our_method modifies its inputs in place.
    results_dev_df = resolution_methods.our_method(features_dev_df.copy(), clf_stations, use_cols_stations, clf_places, use_cols_places, gazetteer_df, th, gold_dev_df.copy())
    acc = eval_methods.topres_exactmetrics(results_dev_df, "our_method")
    # ">=" means that ties are resolved in favour of the larger threshold.
    if acc >= keep_acc:
        optimal_threshold = th
        keep_acc = acc

### Method and baselines

In [None]:
# Load the structured test dataframe. The cells below pass it through our method
# and through each baseline, reassigning `results_test_df` to what they return,
# so that the results of all methods/baselines end up in this one dataframe.
results_test_df = pd.read_pickle("../processed/quicks/quicks_test.pkl")

In [None]:
# -------------------------------
# Our method (column "our_method"): run both classifiers on the test set
# feature vectors, using the threshold selected in the previous section.
# -------------------------------
features_test_df = pd.read_csv(
    f"features_test_{candrank_method}.tsv",
    sep='\t',
    index_col=0,
)  # test set feature vectors
results_test_df = resolution_methods.our_method(
    features_test_df,
    clf_stations,
    use_cols_stations,
    clf_places,
    use_cols_places,
    gazetteer_df,
    optimal_threshold,
    results_test_df,
)

In [None]:
# -------------------------------
# Baseline: Apply candrank-most-confident baseline
# -------------------------------
# Uses the test set feature vectors loaded in the "our_method" cell.
# Presumably stores its predictions under "candrank_most_confident", the name
# the evaluation cell uses for this baseline -- confirm in resolution_methods.
results_test_df = resolution_methods.candrank_most_confident(features_test_df, results_test_df)

In [None]:
# -------------------------------
# Baseline: Apply wikipedia-most-relevant baseline
# -------------------------------
# Uses the test set feature vectors loaded in the "our_method" cell.
# Presumably stores its predictions under "wikipedia_most_relevant", the name
# the evaluation cell uses for this baseline -- confirm in resolution_methods.
results_test_df = resolution_methods.wikipedia_most_relevant(features_test_df, results_test_df)

In [None]:
# -------------------------------
# Baseline: Apply semantically_most_similar baseline
# -------------------------------
# Uses the test set feature vectors loaded in the "our_method" cell.
# Presumably stores its predictions under "semantically_most_similar", the name
# the evaluation cell uses for this baseline -- confirm in resolution_methods.
results_test_df = resolution_methods.semantically_most_similar(features_test_df, results_test_df)

### Evaluate entity resolution

In [None]:
# Report, for our method and for every baseline, the exact-match score and the
# three distance-based scores (Acc@1, Acc@5, Acc@10) on the test set.
for topres_approach in (
    "our_method",
    "candrank_most_confident",
    "wikipedia_most_relevant",
    "semantically_most_similar",
):
    # Header: which resolution approach and which candidate ranker.
    print(topres_approach, candrank_method)
    print("-------------------------")
    exact_acc = eval_methods.topres_exactmetrics(results_test_df, topres_approach)
    acc_at_1, acc_at_5, acc_at_10 = eval_methods.topres_distancemetrics(
        gazetteer_df, results_test_df, topres_approach
    )
    labelled_scores = (
        ("Exact:", exact_acc),
        ("Acc@1:", acc_at_1),
        ("Acc@5:", acc_at_5),
        ("Acc@10:", acc_at_10),
    )
    for label, score in labelled_scores:
        print(label, score)
    print()