# Tutorial: OCR example

In [None]:
import os

In [None]:
# Name of the w2v model from which the string-pairs dataset was created.
# The cells below interpolate it into the dataset file names under "data"
# and into the model directory and file names under "models".
model_name = "w2v_1760_1900"

## Train a new model

In [None]:
from DeezyMatch import train as dm_train

# YAML input file and the string-pairs dataset for this w2v model.
train_input_path = os.path.join("inputs", "input_dfm.yaml")
train_pairs_path = os.path.join("data", f"w2v_ocr_pairs_{model_name}.txt")

# Train a new model named `model_name`.
dm_train(
    input_file_path=train_input_path,
    dataset_path=train_pairs_path,
    model_name=model_name,
)

In [None]:
from DeezyMatch import plot_log

# Plot the log file found in the model's directory.
log_path = os.path.join("models", f"{model_name}", "log.txt")
plot_log(path2log=log_path, output_name=f"log_{model_name}")

## Model inference

In [None]:
from DeezyMatch import inference as dm_inference

# Pretrained model and vocabulary files live in models/<model_name>.
model_dir = os.path.join("models", f"{model_name}")
pretrained_model = os.path.join(model_dir, f"{model_name}.model")
pretrained_vocab = os.path.join(model_dir, f"{model_name}.vocab")

# Run model inference on the string-pairs dataset with the pretrained model.
dm_inference(
    input_file_path=os.path.join("inputs", "input_dfm.yaml"),
    dataset_path=os.path.join("data", f"w2v_ocr_pairs_{model_name}.txt"),
    pretrained_model_path=pretrained_model,
    pretrained_vocab_path=pretrained_vocab,
)

## Generate query vectors

In [None]:
from DeezyMatch import inference as dm_inference

# Pretrained model and vocabulary files live in models/<model_name>.
model_dir = os.path.join("models", f"{model_name}")
pretrained_model = os.path.join(model_dir, f"{model_name}.model")
pretrained_vocab = os.path.join(model_dir, f"{model_name}.vocab")

# Generate vectors for the queries listed in the dataset file, using the
# pretrained model; the run is labelled with the "queries/test" scenario.
dm_inference(
    input_file_path=os.path.join("inputs", "input_dfm.yaml"),
    dataset_path=os.path.join("data", f"queries_{model_name}.txt"),
    pretrained_model_path=pretrained_model,
    pretrained_vocab_path=pretrained_vocab,
    inference_mode="vect",
    scenario="queries/test",
)

## Generate candidate vectors

In [None]:
from DeezyMatch import inference as dm_inference

# Pretrained model and vocabulary files live in models/<model_name>.
model_dir = os.path.join("models", f"{model_name}")
pretrained_model = os.path.join(model_dir, f"{model_name}.model")
pretrained_vocab = os.path.join(model_dir, f"{model_name}.vocab")

# Generate vectors for the candidates listed in the dataset file, using the
# pretrained model; the run is labelled with the "candidates/test" scenario.
dm_inference(
    input_file_path=os.path.join("inputs", "input_dfm.yaml"),
    dataset_path=os.path.join("data", f"candidates_{model_name}.txt"),
    pretrained_model_path=pretrained_model,
    pretrained_vocab_path=pretrained_vocab,
    inference_mode="vect",
    scenario="candidates/test",
)

## Assembling query vector representations

In [None]:
from DeezyMatch import combine_vecs

# Combine the query vectors stored in queries/test and save the result
# in combined/queries_test.
queries_in = os.path.join('queries', 'test')
queries_out = os.path.join('combined', 'queries_test')

combine_vecs(
    rnn_passes=['fwd', 'bwd'],
    input_scenario=queries_in,
    output_scenario=queries_out,
    print_every=10,
)

## Assembling candidate vector representations

In [None]:
from DeezyMatch import combine_vecs

# Combine the candidate vectors stored in candidates/test and save the
# result in combined/candidates_test.
candidates_in = os.path.join('candidates', 'test')
candidates_out = os.path.join('combined', 'candidates_test')

combine_vecs(
    rnn_passes=['fwd', 'bwd'],
    input_scenario=candidates_in,
    output_scenario=candidates_out,
    print_every=10,
)

## Candidate Ranker

In [None]:
from DeezyMatch import candidate_ranker

# Scenarios holding the combined query and candidate vectors.
query_vectors = os.path.join("combined", "queries_test")
candidate_vectors = os.path.join("combined", "candidates_test")

# Pretrained model and vocabulary files live in models/<model_name>.
model_dir = os.path.join("models", f"{model_name}")
pretrained_model = os.path.join(model_dir, f"{model_name}.model")
pretrained_vocab = os.path.join(model_dir, f"{model_name}.vocab")

# For every query in the query scenario, select candidates from the
# candidate scenario based on L2-norm distance (aka faiss distance).
candidates_pd = candidate_ranker(
    query_scenario=query_vectors,
    candidate_scenario=candidate_vectors,
    ranking_metric="faiss",
    selection_threshold=50.0,
    num_candidates=2,
    search_size=2,
    verbose=False,
    use_predict=False,
    output_path=os.path.join("ranker_results", "test_candidates_deezymatch"),
    pretrained_model_path=pretrained_model,
    pretrained_vocab_path=pretrained_vocab,
    number_test_rows=200,
)

In [None]:
# Display the ranking results returned by candidate_ranker.
candidates_pd

In [None]:
from DeezyMatch import candidate_ranker

# Pretrained model and vocabulary files live in models/<model_name>.
model_dir = os.path.join("models", f"{model_name}")
pretrained_model = os.path.join(model_dir, f"{model_name}.model")
pretrained_vocab = os.path.join(model_dir, f"{model_name}.vocab")

# Rank candidates for a query supplied directly as a string ("vvater")
# rather than through a query scenario. Candidates are still taken from
# candidate_scenario and selected based on L2-norm distance (aka faiss
# distance).
# NOTE(review): output_path is the same as in the scenario-based ranking
# cell, so that cell's saved results are presumably overwritten — confirm
# this is intended.
candidates_pd = candidate_ranker(
    query=["vvater"],
    candidate_scenario=os.path.join("combined", "candidates_test"),
    ranking_metric="faiss",
    selection_threshold=50.0,
    num_candidates=5,
    search_size=5,
    verbose=False,
    use_predict=False,
    output_path=os.path.join("ranker_results", "test_candidates_deezymatch"),
    pretrained_model_path=pretrained_model,
    pretrained_vocab_path=pretrained_vocab,
)

# Inspect the faiss distance entry of the first row of the results.
candidates_pd.iloc[0].faiss_distance