# Tutorial: OCR example

## Train a new model

In [None]:
from DeezyMatch import train as dm_train

# Fit a fresh model named "ocr002": settings are read from the YAML input file
# and the training pairs from dataset_path.
dm_train(
    input_file_path="inputs/input_dfm.yaml",
    dataset_path="data/w2v_ocr_pairs_1860s.txt",
    model_name="ocr002",
)

In [None]:
from DeezyMatch import plot_log

# Plot the training log written for model "ocr002"; the result is saved
# under the name given in output_name.
plot_log(
    path2log="./models/ocr002/log.txt",
    output_name="log_ocr002",
)

## Model inference

In [None]:
# NOTE: every line of this cell is commented out, so running it does nothing.
# Remove the leading "# " from each line below to run model inference on the
# same dataset file that was used for training (data/w2v_ocr_pairs_1860s.txt).
# from DeezyMatch import inference as dm_inference

# # model inference using a model stored at pretrained_model_path and pretrained_vocab_path 
# dm_inference(input_file_path="inputs/input_dfm.yaml",
#              dataset_path="data/w2v_ocr_pairs_1860s.txt", 
#              pretrained_model_path="./models/ocr002/ocr002.model", 
#              pretrained_vocab_path="./models/ocr002/ocr002.vocab")

## Generate query vectors

In [None]:
from DeezyMatch import inference as dm_inference

# Vectorize the queries listed in dataset_path with the trained "ocr002"
# model and vocabulary (inference_mode="vect"); the vectors are stored
# under the "queries/test" scenario.
dm_inference(
    input_file_path="inputs/input_dfm.yaml",
    dataset_path="data/queries_1870s.txt",
    pretrained_model_path="./models/ocr002/ocr002.model",
    pretrained_vocab_path="./models/ocr002/ocr002.vocab",
    inference_mode="vect",
    scenario="queries/test",
)

## Generate candidate vectors

In [None]:
from DeezyMatch import inference as dm_inference

# Vectorize the candidates listed in dataset_path with the trained "ocr002"
# model and vocabulary (inference_mode="vect"); the vectors are stored
# under the "candidates/test" scenario.
dm_inference(
    input_file_path="inputs/input_dfm.yaml",
    dataset_path="data/candidates_1870s.txt",
    pretrained_model_path="./models/ocr002/ocr002.model",
    pretrained_vocab_path="./models/ocr002/ocr002.vocab",
    inference_mode="vect",
    scenario="candidates/test",
)

## Assembling query vector representations

In [None]:
from DeezyMatch import combine_vecs

# Merge the "fwd" and "bwd" RNN-pass vectors found in queries/test and
# write the combined result to combined/queries_test.
combine_vecs(
    rnn_passes=["fwd", "bwd"],
    input_scenario="queries/test",
    output_scenario="combined/queries_test",
    print_every=10,
)

## Assembling candidate vector representations

In [None]:
from DeezyMatch import combine_vecs

# Merge the "fwd" and "bwd" RNN-pass vectors found in candidates/test and
# write the combined result to combined/candidates_test.
combine_vecs(
    rnn_passes=["fwd", "bwd"],
    input_scenario="candidates/test",
    output_scenario="combined/candidates_test",
    print_every=10,
)

## Candidate Ranker

In [None]:
from DeezyMatch import candidate_ranker

# Rank candidates by L2-norm distance (a.k.a. faiss distance): for every
# query in query_scenario, pick candidates from candidate_scenario.
# The result is kept in candidates_pd for inspection in the next cells.
candidates_pd = candidate_ranker(
    query_scenario="./combined/queries_test",
    candidate_scenario="./combined/candidates_test",
    ranking_metric="faiss",
    selection_threshold=50.0,
    num_candidates=2,
    search_size=2,
    verbose=False,
    use_predict=False,
    output_path="ranker_results/test_candidates_deezymatch",
    pretrained_model_path="./models/ocr002/ocr002.model",
    pretrained_vocab_path="./models/ocr002/ocr002.vocab",
    # number_test_rows=200,  # left disabled here; uncomment to pass it through
)

In [None]:
# display the object returned by candidate_ranker (assigned to candidates_pd in the previous cell)
candidates_pd

In [None]:
# Keep only the rows whose stringified "faiss_distance" value contains the
# literal text "machine" (plain substring match, not a regex), then show them.
# NOTE(review): presumably each "faiss_distance" entry embeds the candidate
# strings, which is why a text search on it is meaningful - confirm.
candidates_pd_tmp = candidates_pd[
    candidates_pd["faiss_distance"]
    .astype(str)
    .str.contains("machine", regex=False)
]
candidates_pd_tmp