# Candidate Ranking experiments

In [1]:
from pathlib import Path
from shutil import copyfile
import pandas as pd
import numpy as np
import unicodedata
from haversine import haversine
import time
import ast
from sklearn.metrics import average_precision_score

from find_deezymatch_candidates import find_deezymatch_candidates
from find_levdam_candidates import find_levdam_candidates
from evaluation_functions import evaluate_ranking
from evaluation_functions import map_score

## Evaluation of Candidate Selection methods on different datasets (table 6)

### 1. ArgManuscrita

In [6]:
# Experiment configuration: gazetteer, evaluation dataset and DeezyMatch model.
gazetteer_name = "wikigaz_hgisindias_es"
candrank_dataset = "argentina_manuscrita"
deezymatch_model = "wikigaz_es_001"

# Candidate retrieval, first with DeezyMatch and then with
# Levenshtein-Damerau (the latter does not take a DeezyMatch model).
find_deezymatch_candidates(gazetteer_name, candrank_dataset, deezymatch_model)
find_levdam_candidates(gazetteer_name, candrank_dataset)

# Evaluate the rankings, then report the scores once per candidate cutoff.
evaluate_ranking(gazetteer_name, candrank_dataset, deezymatch_model)
for _top_k in (1, 5, 10, 20):
    # Header line is singular for one candidate and plural otherwise.
    print(f"* {_top_k} candidate{'' if _top_k == 1 else 's'}")
    map_score(gazetteer_name, candrank_dataset, deezymatch_model, _top_k)

* 1 candidate
EXACT P@1 0.69
DM P@1 0.78
LD P@1 0.78

* 5 candidates
DM MAP 0.78
LD MAP 0.77

* 10 candidates
DM MAP 0.76
LD MAP 0.72

* 20 candidates
DM MAP 0.74
LD MAP 0.7



### 2. WOTR

In [5]:
# Experiment configuration: gazetteer, evaluation dataset and DeezyMatch model.
gazetteer_name = "wikigaz_en"
candrank_dataset = "wotr_test"
deezymatch_model = "wikigaz_en_001"

# Candidate retrieval, first with DeezyMatch and then with
# Levenshtein-Damerau (the latter does not take a DeezyMatch model).
find_deezymatch_candidates(gazetteer_name, candrank_dataset, deezymatch_model)
find_levdam_candidates(gazetteer_name, candrank_dataset)

# Evaluate the rankings, then report the scores once per candidate cutoff.
evaluate_ranking(gazetteer_name, candrank_dataset, deezymatch_model)
for _top_k in (1, 5, 10, 20):
    # Header line is singular for one candidate and plural otherwise.
    print(f"* {_top_k} candidate{'' if _top_k == 1 else 's'}")
    map_score(gazetteer_name, candrank_dataset, deezymatch_model, _top_k)

* 1 candidate
EXACT P@1 0.86
DM P@1 0.93
LD P@1 0.92

* 5 candidates
DM MAP 0.92
LD MAP 0.89

* 10 candidates
DM MAP 0.9
LD MAP 0.84

* 20 candidates
DM MAP 0.87
LD MAP 0.8



### 3. FMP

In [4]:
# Experiment configuration: gazetteer, evaluation dataset and DeezyMatch model.
gazetteer_name = "wikigaz_en"
candrank_dataset = "fmp"
deezymatch_model = "wikigaz_en_001"

# Candidate retrieval, first with DeezyMatch and then with
# Levenshtein-Damerau (the latter does not take a DeezyMatch model).
find_deezymatch_candidates(gazetteer_name, candrank_dataset, deezymatch_model)
find_levdam_candidates(gazetteer_name, candrank_dataset)

# Evaluate the rankings, then report the scores once per candidate cutoff.
evaluate_ranking(gazetteer_name, candrank_dataset, deezymatch_model)
for _top_k in (1, 5, 10, 20):
    # Header line is singular for one candidate and plural otherwise.
    print(f"* {_top_k} candidate{'' if _top_k == 1 else 's'}")
    map_score(gazetteer_name, candrank_dataset, deezymatch_model, _top_k)

* 1 candidate
EXACT P@1 0.77
DM P@1 0.85
LD P@1 0.92

* 5 candidates
DM MAP 0.85
LD MAP 0.88

* 10 candidates
DM MAP 0.82
LD MAP 0.82

* 20 candidates
DM MAP 0.78
LD MAP 0.76



In [3]:
# Experiment configuration: same gazetteer and dataset as the previous FMP
# cell, but using the "ocr_001" DeezyMatch model.
gazetteer_name = "wikigaz_en"
candrank_dataset = "fmp"
deezymatch_model = "ocr_001"

# Candidate retrieval, first with DeezyMatch and then with
# Levenshtein-Damerau (the latter does not take a DeezyMatch model).
find_deezymatch_candidates(gazetteer_name, candrank_dataset, deezymatch_model)
find_levdam_candidates(gazetteer_name, candrank_dataset)

# Evaluate the rankings, then report the scores once per candidate cutoff.
evaluate_ranking(gazetteer_name, candrank_dataset, deezymatch_model)
for _top_k in (1, 5, 10, 20):
    # Header line is singular for one candidate and plural otherwise.
    print(f"* {_top_k} candidate{'' if _top_k == 1 else 's'}")
    map_score(gazetteer_name, candrank_dataset, deezymatch_model, _top_k)

* 1 candidate
EXACT P@1 0.77
DM P@1 0.83
LD P@1 0.92

* 5 candidates
DM MAP 0.83
LD MAP 0.88

* 10 candidates
DM MAP 0.82
LD MAP 0.82

* 20 candidates
DM MAP 0.8
LD MAP 0.76

