# BOW Evaluation 

In [1]:
import json
import timeit
from datetime import datetime

import duckdb
import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import polars as pl
import scipy
import seaborn as sns

In [2]:
import os
import sys
from dotenv import load_dotenv
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../..'))
sys.path.insert(0, os.path.abspath('../../..'))
from app.model_functions import run_BOW_on_line
from app.evaluation import reciprocal_rank, rank_BOW_results, precision_at_k
from app.data_handling.BOW import transform_ds_to_BOW
load_dotenv()

True

# Set up definitions
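Datasets (set `dataset_name` in the next cell to one of these):
 - 1k_processed
 - 10k
 - 100k
 - full (listed here, but the dataset-loading cell below has no branch for it yet)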

In [3]:
# Name of the evaluation corpus to rank against (see the dataset list above).
dataset_name = '100k'
# Prefix of the results file; derived from dataset_name so the two cannot drift apart.
run_name = f'BOW_{dataset_name}'

In [4]:
# Query sets used by the evaluation tasks below:
#   query_ds       - read from processed_data.json
#   human_query_ds - read from human_annotations.json
with open("../../../data/eval_data/processed_data.json", "r") as processed_file:
    query_ds = pl.read_json(processed_file)

with open("../../../data/eval_data/human_annotations.json", "r") as annotations_file:
    human_query_ds = pl.read_json(annotations_file)

In [5]:
# Choose the corpus the queries are ranked against.
# '1k_processed' reuses the already loaded query set; the other supported
# names are read from their own JSON files.
eval_data_files = {
    '10k': "../../../data/eval_data/eval_10k.json",
    '100k': "../../../data/eval_data/eval_100k.json",
}

if dataset_name == '1k_processed':
    eval_ds = query_ds
elif dataset_name in eval_data_files:
    with open(eval_data_files[dataset_name], "r") as f:
        eval_ds = pl.read_json(f)
else:
    # Fail here rather than leaving eval_ds undefined (or stale from an
    # earlier run of this cell) and crashing later in the notebook.
    supported = ['1k_processed', *eval_data_files]
    raise ValueError(f"Unsupported dataset_name {dataset_name!r}; expected one of {supported}")

In [6]:
# Column layout of the results table: one row is appended per evaluation task.
results_schema = {
    "Model": pl.String,
    "Eval_Task": pl.String,
    "Dataset": pl.String,
    "MRR": pl.Float64,
    "TKA": pl.Float64,
    "Time_Tot": pl.Float64,
    "Time_Query": pl.Float64,
    "Error_Count": pl.Int32,
}

# Start from an empty, fully typed frame.
results_df = pl.DataFrame(schema=results_schema)
results_df

Model,Eval_Task,Dataset,MRR,TKA,Time_Tot,Time_Query,Error_Count
str,str,str,f64,f64,f64,f64,i32


## Running code

In [7]:
# Point MLflow at the tracking server named by the MLFLOW_TRACKING_URL
# environment variable (load_dotenv() was called in the import cell).
# os.getenv returns None when the variable is unset.
# NOTE(review): MLflow presumably falls back to its default tracking location in that case - confirm.
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URL"))

In [8]:
# Model URI of the BOW model; it is also passed to run_BOW_on_line for every query below.
model_uri = 'models:/recipe_BOW@prod'
# Load the model once; it is used to transform the evaluation corpus.
BOW_model = mlflow.sklearn.load_model(model_uri)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

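### Disabled cell: title-augmented dataset transformation

This code is kept as markdown so that it does not run. Convert it back to a code cell to build `eval_ds_title`, which the disabled `bow_title` evaluation calls further down expect. Note that the next code cell reassigns `eval_ds` to its transformed version.

```python
start = timeit.default_timer()
eval_ds_title = transform_ds_to_BOW(eval_ds, BOW_model, use_title=True)
time = timeit.default_timer() - start
print(f"Dataset transformation took {time}s, or {time/eval_ds.shape[0]}s/row")
```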

In [9]:
# Transform the evaluation corpus with the BOW model and report the wall-clock cost.
# eval_ds is reassigned to the transformed frame, so re-running this cell
# would pass the already transformed frame back into transform_ds_to_BOW.
transform_started = timeit.default_timer()
eval_ds = transform_ds_to_BOW(eval_ds, BOW_model)
transform_seconds = timeit.default_timer() - transform_started

n_rows = eval_ds.shape[0]
print(f"Dataset transformation took {transform_seconds}s, or {transform_seconds/n_rows}s/row")

Dataset transformation took 790.39919418504s, or 0.007903991941850399s/row


In [10]:
# Warm-up call: run one query now so that the model artifact download happens here.
# NOTE(review): presumably this keeps the one-off download out of the per-query
# timings measured below - confirm that run_BOW_on_line caches the model.
emb_query = run_BOW_on_line("pecans", model_uri)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
def evaluation_task_BOW(eval_ds: pl.DataFrame, query_ds: pl.DataFrame, query_field: str, results_df: pl.DataFrame,
                        task_name: str, use_title: bool = False, model_name: str = "bow") -> pl.DataFrame:
    """Run one retrieval-evaluation task and append its summary row to ``results_df``.

    For every row of ``query_ds`` a query string is built from ``query_field``,
    embedded with ``run_BOW_on_line``, and used to rank ``eval_ds`` with
    ``rank_BOW_results``. Reciprocal rank and precision-at-k are accumulated
    and averaged over all queries.

    Reads the notebook-level globals ``dataset_name`` (for logging and the
    ``Dataset`` column) and ``model_uri`` (passed to ``run_BOW_on_line``).

    Args:
        eval_ds: Corpus to rank, as returned by ``transform_ds_to_BOW``.
        query_ds: Frame with the columns ``index``, ``title`` and ``query_field``.
        query_field: Column holding the query terms. Its values are joined with
            newlines; for ``'google_search_query_processed'`` the terms are read
            from the value's ``'ingredients'`` entry instead.
        results_df: Results table accumulated so far; it is not modified.
        task_name: Label stored in the ``Eval_Task`` column.
        use_title: If true, prefix each query with the row's ``title``.
        model_name: Label stored in the ``Model`` column.

    Returns:
        A new frame: ``results_df`` with one extra row for this task.

    Raises:
        ValueError: If ``query_ds`` has no rows, because the averages would be undefined.
    """
    # Tasks whose metrics are reported as 1 - value.
    # NOTE(review): presumably because retrieving the original recipe is the
    # undesired outcome for these adversarial queries - confirm.
    inverted_tasks = (
        'gold_adversarial_ingredients',
        'gold_adversarial_subset_ingredients',
        'added_prep_suffixes',
    )

    if query_ds.is_empty():
        raise ValueError(f"Cannot run {task_name}: query_ds contains no rows")

    print(f"--- Running {task_name} on {dataset_name} using {model_name} ---")
    tot_rr = 0
    tot_prec_at_3 = 0
    tot_evals = 0
    # No per-query error handling exists yet, so this is always reported as 0.
    errors = 0
    start = timeit.default_timer()
    for index, title, q in query_ds.select('index', 'title', query_field).iter_rows():
        # The google-search field nests its terms under 'ingredients'.
        terms = q['ingredients'] if query_field == 'google_search_query_processed' else q
        query = "\n".join(terms)
        if use_title:
            query = title + " " + query
        emb_query = run_BOW_on_line(query, model_uri)
        ranked_ds = rank_BOW_results(eval_ds, emb_query)
        tot_rr += reciprocal_rank(ranked_ds, index)
        # k is left at precision_at_k's default (presumably 3, given the variable name - TODO confirm).
        tot_prec_at_3 += precision_at_k(ranked_ds, index)
        tot_evals += 1
    elapsed = timeit.default_timer() - start

    mrr = tot_rr / tot_evals
    mprec3 = tot_prec_at_3 / tot_evals
    if task_name in inverted_tasks:
        mrr = 1 - mrr
        mprec3 = 1 - mprec3

    # Build the row with the accumulator's own schema so that vstack never
    # depends on polars' dtype inference (e.g. Error_Count must be Int32).
    result_df = pl.DataFrame(
        {
            "Model": [model_name],
            'Eval_Task': [task_name],
            'Dataset': [dataset_name],
            "MRR": [mrr],
            "TKA": [mprec3],
            "Time_Tot": [elapsed],
            "Time_Query": [elapsed / tot_evals],
            "Error_Count": [errors],
        },
        schema=results_df.schema,
    )
    results_df = results_df.vstack(result_df)
    print(f"--- COMPLETED {task_name} in {elapsed}s ---")
    return results_df

In [12]:
# Tasks run against query_ds, as (query column, task label) pairs, in execution order.
generated_query_tasks = [
    ('ingredients', 'identity'),
    ('main_food_items_and_preparations', 'llm1'),
    ('important_ingredients_for_search', 'llm2'),
    ('google_search_query_processed', 'llm3'),
]

for query_field, task_name in generated_query_tasks:
    results_df = evaluation_task_BOW(eval_ds, query_ds, query_field, results_df, task_name)

--- Running identity on 100k using bow ---
--- COMPLETED identity in 1936.1246248109965s ---
--- Running llm1 on 100k using bow ---
--- COMPLETED llm1 in 1897.3799467729405s ---
--- Running llm2 on 100k using bow ---
--- COMPLETED llm2 in 1622.3781642259564s ---
--- Running llm3 on 100k using bow ---
--- COMPLETED llm3 in 1243.842302201083s ---


In [13]:
# Tasks run against the human-annotated query set, as (query column, task label) pairs.
gold_tasks = [
    ('human_ingredients', 'gold_ingredients'),
    ('human_key_ingredients', 'gold_key_ingredients'),
]

for query_field, task_name in gold_tasks:
    results_df = evaluation_task_BOW(eval_ds, human_query_ds, query_field, results_df, task_name)

--- Running gold_ingredients on 100k using bow ---
--- COMPLETED gold_ingredients in 80.75183058995754s ---
--- Running gold_key_ingredients on 100k using bow ---
--- COMPLETED gold_key_ingredients in 80.91088773403317s ---


In [14]:
# Tasks whose scores evaluation_task_BOW reports as 1 - value,
# as (query set, query column, task label) triples, in execution order.
inverted_score_tasks = [
    (query_ds, 'added_preps', 'added_prep_suffixes'),
    (human_query_ds, 'human_adv_ingredients', 'gold_adversarial_ingredients'),
    (human_query_ds, 'human_adv_subset_ingredients', 'gold_adversarial_subset_ingredients'),
]

for task_queries, query_field, task_name in inverted_score_tasks:
    results_df = evaluation_task_BOW(eval_ds, task_queries, query_field, results_df, task_name)

--- Running added_prep_suffixes on 100k using bow ---
--- COMPLETED added_prep_suffixes in 1993.680767904967s ---
--- Running gold_adversarial_ingredients on 100k using bow ---
--- COMPLETED gold_adversarial_ingredients in 85.70343147404492s ---
--- Running gold_adversarial_subset_ingredients on 100k using bow ---
--- COMPLETED gold_adversarial_subset_ingredients in 90.85712778591551s ---


### Disabled cell: `bow_title` evaluation (identity and llm1–llm3 tasks)

Kept as markdown so that it does not run; it needs `eval_ds_title` from the disabled transformation cell above. Note that only the `identity` call passes `use_title=True`.

```python
results_df = evaluation_task_BOW(eval_ds_title, query_ds, 'ingredients', results_df, 'identity', use_title=True, model_name='bow_title')
results_df = evaluation_task_BOW(eval_ds_title, query_ds, 'main_food_items_and_preparations', results_df, 'llm1', model_name='bow_title')
results_df = evaluation_task_BOW(eval_ds_title, query_ds, 'important_ingredients_for_search', results_df, 'llm2', model_name='bow_title')
results_df = evaluation_task_BOW(eval_ds_title, query_ds, 'google_search_query_processed', results_df, 'llm3', model_name='bow_title')
```

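### Disabled cell: `bow_title` evaluation (gold tasks)

Kept as markdown so that it does not run; it needs `eval_ds_title` from the disabled transformation cell above.

```python
results_df = evaluation_task_BOW(eval_ds_title, human_query_ds, 'human_ingredients', results_df, 'gold_ingredients', model_name='bow_title')
results_df = evaluation_task_BOW(eval_ds_title, human_query_ds, 'human_key_ingredients', results_df, 'gold_key_ingredients', model_name='bow_title')
```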

In [15]:
# Display the accumulated metrics: one row per evaluation task run above.
results_df

Model,Eval_Task,Dataset,MRR,TKA,Time_Tot,Time_Query,Error_Count
str,str,str,f64,f64,f64,f64,i32
"""bow""","""identity""","""100k""",0.999,1.0,1936.124625,1.936125,0
"""bow""","""llm1""","""100k""",0.929226,0.949,1897.379947,1.89738,0
"""bow""","""llm2""","""100k""",0.421733,0.473,1622.378164,1.622378,0
"""bow""","""llm3""","""100k""",0.149401,0.154,1243.842302,1.243842,0
"""bow""","""gold_ingredients""","""100k""",0.707309,0.8,80.751831,1.615037,0
"""bow""","""gold_key_ingredients""","""100k""",0.63576,0.7,80.910888,1.618218,0
"""bow""","""added_prep_suffixes""","""100k""",0.001,0.0,1993.680768,1.993681,0
"""bow""","""gold_adversarial_ingredients""","""100k""",0.430641,0.36,85.703431,1.714069,0
"""bow""","""gold_adversarial_subset_ingred…","""100k""",0.951375,0.98,90.857128,1.817143,0


In [16]:
# Timestamp (minute resolution) that is embedded in the results file name.
# `datetime` is imported in the notebook's top import cell.
date_time = datetime.now().strftime("%Y_%m_%d__%H_%M")

In [17]:
# Show the timestamp that becomes part of the results file name.
date_time

'2025_06_12__20_25'

In [18]:
# Persist the results table. The output directory is created first so that a
# missing folder cannot make the write fail after the long evaluation run.
results_path = f"../../../data/results/BOW/{run_name}_{date_time}.parquet"
os.makedirs(os.path.dirname(results_path), exist_ok=True)
results_df.write_parquet(results_path)

In [19]:
# Read the file back and display it to confirm that the results were written.
pl.read_parquet(f"../../../data/results/BOW/{run_name}_{date_time}.parquet")

Model,Eval_Task,Dataset,MRR,TKA,Time_Tot,Time_Query,Error_Count
str,str,str,f64,f64,f64,f64,i32
"""bow""","""identity""","""100k""",0.999,1.0,1936.124625,1.936125,0
"""bow""","""llm1""","""100k""",0.929226,0.949,1897.379947,1.89738,0
"""bow""","""llm2""","""100k""",0.421733,0.473,1622.378164,1.622378,0
"""bow""","""llm3""","""100k""",0.149401,0.154,1243.842302,1.243842,0
"""bow""","""gold_ingredients""","""100k""",0.707309,0.8,80.751831,1.615037,0
"""bow""","""gold_key_ingredients""","""100k""",0.63576,0.7,80.910888,1.618218,0
"""bow""","""added_prep_suffixes""","""100k""",0.001,0.0,1993.680768,1.993681,0
"""bow""","""gold_adversarial_ingredients""","""100k""",0.430641,0.36,85.703431,1.714069,0
"""bow""","""gold_adversarial_subset_ingred…","""100k""",0.951375,0.98,90.857128,1.817143,0
