# Find candidates with DeezyMatch

In [None]:
from DeezyMatch import inference as dm_inference
from DeezyMatch import combine_vecs
from DeezyMatch import candidate_ranker

from pathlib import Path
import pandas as pd
import time

In [None]:
def findcandidates(candidates, queries, dm_model, inputfile, overwrite=False):
    """Vectorise candidates and queries with DeezyMatch, then rank candidates.

    The pipeline has three stages, each skipped when its output already
    exists on disk unless ``overwrite`` is set:

    1. Generate vectors (``inference_mode="vect"``) for the candidates read
       from ``./gazetteers/<candidates>.txt`` and for the queries read from
       ``./toponyms/<queries>.txt``.
    2. Combine the forward/backward vectors of each scenario into
       ``combined/<name>_<dm_model>``.
    3. Rank candidates for every query with the ``faiss`` metric and write
       the result to ``ranker_results/<queries>_<candidates>_<dm_model>``
       (the skip check looks for that path with a ``.pkl`` suffix).

    Args:
        candidates: Base name (no extension) of the candidates file in
            ``./gazetteers/``.
        queries: Base name (no extension) of the queries file in
            ``./toponyms/``.
        dm_model: Name of the DeezyMatch model; ``./models/<dm_model>/`` is
            expected to hold ``<dm_model>.model`` and ``<dm_model>.vocab``.
        inputfile: Base name (no extension) of the YAML input file in
            ``./models/<dm_model>/``.
        overwrite: When true, rerun every stage even if its output exists.

    Returns:
        None. Results are written to disk; elapsed times (in seconds) are
        printed for each stage that actually runs.
    """
    model_dir = "./models/" + dm_model + "/"
    input_file_path = model_dir + inputfile + ".yaml"
    model_path = model_dir + dm_model + ".model"
    vocab_path = model_dir + dm_model + ".vocab"

    # (scenario directory, dataset name, dataset directory, label for logs).
    # Candidates are processed first, then queries.
    # NOTE: both stages write to combined/<name>_<dm_model>, so candidates and
    # queries sharing the same name would also share a combined directory.
    stages = (
        ("candidates", candidates, "./gazetteers/", "candidate"),
        ("queries", queries, "./toponyms/", "query"),
    )

    for scenario_dir, name, dataset_dir, label in stages:
        scenario = scenario_dir + "/" + name + "_" + dm_model
        combined = "combined/" + name + "_" + dm_model

        # --------------------------------------
        # GENERATE VECTORS
        # Generate vectors for the dataset using the pretrained model/vocab.
        if overwrite or not Path("./" + scenario + "/embeddings/").is_dir():
            start_time = time.time()
            dm_inference(input_file_path=input_file_path,
                         dataset_path=dataset_dir + name + ".txt",
                         pretrained_model_path=model_path,
                         pretrained_vocab_path=vocab_path,
                         inference_mode="vect",
                         scenario=scenario)
            elapsed = time.time() - start_time
            print("Generate %s vectors: %s" % (label, elapsed))

        # --------------------------------------
        # COMBINE VECTORS
        # Combine the vectors stored in the scenario and save them in combined/.
        if overwrite or not Path("./" + combined).is_dir():
            start_time = time.time()
            combine_vecs(rnn_passes=["fwd", "bwd"],
                         input_scenario=scenario,
                         output_scenario=combined,
                         print_every=100)
            elapsed = time.time() - start_time
            print("Combine %s vectors: %s" % (label, elapsed))

    # --------------------------------------
    # RANK CANDIDATES
    # Select candidates based on L2-norm distance (aka faiss distance):
    # find candidates from candidate_scenario for queries in query_scenario.
    ranker_output = "ranker_results/" + queries + "_" + candidates + "_" + dm_model
    if overwrite or not Path(ranker_output + ".pkl").is_file():
        start_time = time.time()
        candidate_ranker(query_scenario="./combined/" + queries + "_" + dm_model,
                         candidate_scenario="./combined/" + candidates + "_" + dm_model,
                         ranking_metric="faiss",
                         selection_threshold=100.,
                         num_candidates=20,
                         search_size=20,
                         output_path=ranker_output,
                         pretrained_model_path=model_path,
                         pretrained_vocab_path=vocab_path)
        elapsed = time.time() - start_time
        print("Rank candidates: %s" % elapsed)

In [None]:
# Names of the candidate and query datasets to match against each other.
candidates, queries = "britwikidata_candidates", "bho_queries"
# DeezyMatch model and the base name of its YAML input file.
dm_model, inputfile = "wikigaz_en_001", "input_dfm_001"
# Rerun every pipeline stage even when earlier outputs exist on disk.
overwrite = True

findcandidates(
    candidates=candidates,
    queries=queries,
    dm_model=dm_model,
    inputfile=inputfile,
    overwrite=overwrite,
)

In [None]:
# Load the ranker output for this query/candidate/model combination.
ranker_results_path = "ranker_results/{}_{}_{}.pkl".format(queries, candidates, dm_model)
df = pd.read_pickle(ranker_results_path)

In [None]:
# Preview a slice of the loaded ranker results (row positions 350 to 399).
df.iloc[350:400]