# Bag-of-Words (BOW) Evaluation

In [22]:
import mlflow
import mlflow.sklearn
import polars as pl
import seaborn as sns
import scipy
import duckdb
import numpy as np
import json
import timeit
import matplotlib.pyplot as plt

In [23]:
import os
import sys
from dotenv import load_dotenv
# Put the parent, grandparent and great-grandparent directories on the import
# path. This must happen before the `app.*` imports below.
# NOTE(review): presumably one of these is the project root that holds `app/` — confirm.
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../..'))
sys.path.insert(0, os.path.abspath('../../..'))
from app.model_functions import run_BOW_on_line
from app.evaluation import reciprocal_rank, rank_BOW_results, precision_at_k
from app.data_handling.BOW import transform_ds_to_BOW
# Load variables from a .env file into the environment; a later cell reads
# MLFLOW_TRACKING_URL via os.getenv.
load_dotenv()

True

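# Set up definitions
Available datasets (values for `dataset_name`):
 - 1k_processed
 - 10k
 - 100k
 - full (listed here, but the dataset-selection cell below has no branch that loads it)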

In [3]:
# Name of the evaluation corpus; it drives the dataset-selection cell and is
# recorded in the "Dataset" column of the results.
dataset_name = '1k_processed'
# Prefix of the parquet file that the results are written to at the end.
run_name = 'first_BOW'

In [4]:
# Queries derived from the processed data (identity and LLM-generated fields).
with open("../../../data/eval_data/processed_data.json", "r") as processed_file:
    query_ds = pl.read_json(processed_file)

# Human-annotated ("gold") queries.
with open("../../../data/eval_data/human_annotations.json", "r") as annotations_file:
    human_query_ds = pl.read_json(annotations_file)

In [5]:
# Pick the corpus that queries are ranked against.
# '1k_processed' reuses the already-loaded query set; the larger corpora are
# read from their own JSON files.
if dataset_name == '1k_processed':
    eval_ds = query_ds
elif dataset_name == '10k':
    with open("../../../data/eval_data/eval_10k.json", "r") as f:
        eval_ds = pl.read_json(f)
elif dataset_name == '100k':
    with open("../../../data/eval_data/eval_100k.json", "r") as f:
        eval_ds = pl.read_json(f)
else:
    # Fail fast: without this branch an unknown name (including the documented
    # but not yet wired-up 'full') would leave `eval_ds` undefined, or silently
    # reuse a stale value from an earlier run of this cell.
    raise ValueError(
        f"Unsupported dataset_name {dataset_name!r}; "
        "expected one of '1k_processed', '10k', '100k'"
    )

In [6]:
# Column layout of the accumulator that every evaluation task appends one row to.
results_schema = {
    "Model": pl.String,
    "Eval_Task": pl.String,
    "Dataset": pl.String,
    "MRR": pl.Float64,
    "P@3": pl.Float64,
    "Time_Tot": pl.Float64,
    "Time_Query": pl.Float64,
    "Error_Count": pl.Int32,
}

# Start with zero rows; only the schema is fixed up front.
results_df = pl.DataFrame(schema=results_schema)
results_df

Model,Eval_Task,Dataset,MRR,P@3,Time_Tot,Time_Query,Error_Count
str,str,str,f64,f64,f64,f64,i32


## Running code

In [7]:
# Point MLflow at the tracking server configured in the environment.
# NOTE(review): os.getenv returns None when MLFLOW_TRACKING_URL is unset —
# confirm how mlflow behaves in that case.
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URL"))

In [8]:
# Registry URI of the BOW model; the query-embedding calls further down
# reuse this same `model_uri`.
model_uri = 'models:/recipe_BOW@prod'
# Load the sklearn-flavoured model used to transform the evaluation corpus.
BOW_model = mlflow.sklearn.load_model(model_uri)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
# Build the corpus variant transformed with use_title=True and time the transformation.
# This must run before the next cell, which rebinds `eval_ds` itself.
start = timeit.default_timer()
eval_ds_title = transform_ds_to_BOW(eval_ds, BOW_model, use_title=True)
time = timeit.default_timer() - start
print(f"Dataset transformation took {time}s, or {time/eval_ds.shape[0]}s/row")

Dataset transformation took 6.538566679984797s, or 0.006538566679984797s/row


In [10]:
# Build the default (use_title not set) corpus variant and time the transformation.
# NOTE(review): this rebinds `eval_ds` to the transformed frame, so the cell is
# not idempotent — re-running it (or the title cell above) feeds an already
# transformed frame back into transform_ds_to_BOW. Restart the kernel before re-running.
start = timeit.default_timer()
eval_ds = transform_ds_to_BOW(eval_ds, BOW_model)
time = timeit.default_timer() - start
print(f"Dataset transformation took {time}s, or {time/eval_ds.shape[0]}s/row")

Dataset transformation took 5.008462523983326s, or 0.0050084625239833255s/row


In [11]:
# Run once to download model, to put that time into the time measurement
# Warm-up call: triggers the model artifact download once here, before the
# timed evaluation runs below.
# NOTE(review): presumably run_BOW_on_line caches the model afterwards — confirm.
emb_query = run_BOW_on_line("pecans", model_uri)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
def evaluation_task_BOW(eval_ds: pl.DataFrame, query_ds: pl.DataFrame, query_field: str, results_df: pl.DataFrame,
                        task_name: str, use_title: bool = False, model_name: str = "bow") -> pl.DataFrame:
    """Run one retrieval task against a BOW-transformed corpus and append its metrics.

    For every row of ``query_ds`` a query string is built from ``query_field``
    (optionally prefixed with the row's title), embedded with
    ``run_BOW_on_line`` and used to rank ``eval_ds`` with ``rank_BOW_results``.
    The reciprocal rank and precision-at-k of the row's own ``index`` are
    averaged over all queries and appended to ``results_df`` as one row.

    Relies on the notebook-level globals ``model_uri`` (model used to embed the
    queries) and ``dataset_name`` (recorded in the "Dataset" column).

    Args:
        eval_ds: Corpus to rank, as produced by ``transform_ds_to_BOW``.
        query_ds: Frame with ``index``, ``title`` and ``query_field`` columns.
        query_field: Column holding the query terms. Values are joined with
            newlines; for ``'google_search_query_processed'`` the terms are read
            from the value's ``'ingredients'`` entry instead.
        results_df: Accumulator frame the new row is stacked onto.
        task_name: Label stored in the "Eval_Task" column.
        use_title: When true, prefix each query with the row's title and a space.
        model_name: Label stored in the "Model" column.

    Returns:
        The frame returned by ``results_df.vstack`` with the new metrics row appended.

    Raises:
        ValueError: If ``query_ds`` has no rows (the means would be undefined).
    """
    # Guard the averages below: with zero queries the original code ended in a
    # bare ZeroDivisionError after doing no work.
    if query_ds.shape[0] == 0:
        raise ValueError(f"query_ds is empty; cannot evaluate task {task_name!r}")

    # Loop-invariant: only this field nests its terms under an 'ingredients' key.
    nested_ingredients = query_field == 'google_search_query_processed'

    tot_rr = 0
    tot_prec_at_3 = 0
    tot_evals = 0
    # NOTE(review): nothing increments this counter yet, so "Error_Count" is always 0.
    errors = 0
    start = timeit.default_timer()
    for index, title, q in query_ds.select('index', 'title', query_field).iter_rows():
        terms = q['ingredients'] if nested_ingredients else q
        query = (title + " " if use_title else "") + "\n".join(terms)
        emb_query = run_BOW_on_line(query, model_uri)
        ranked_ds = rank_BOW_results(eval_ds, emb_query)
        tot_rr += reciprocal_rank(ranked_ds, index)
        # NOTE(review): k is left at precision_at_k's default; the column is
        # labelled "P@3", so presumably the default is 3 — confirm.
        tot_prec_at_3 += precision_at_k(ranked_ds, index)
        tot_evals += 1
    elapsed = timeit.default_timer() - start

    mrr = tot_rr / tot_evals
    mprec3 = tot_prec_at_3 / tot_evals
    # NOTE(review): the list/scalar mix is kept exactly as before. vstack needs
    # the row's dtypes to match the accumulator schema (e.g. Int32 for
    # "Error_Count"), and wrapping the scalars in lists may change the inferred
    # dtypes — confirm before "tidying" this.
    result_data = {
        "Model": [model_name],
        'Eval_Task': [task_name],
        'Dataset': dataset_name,
        "MRR": [mrr],
        "P@3": [mprec3],
        "Time_Tot": elapsed,
        "Time_Query": elapsed / tot_evals,
        "Error_Count": errors
    }
    result_df = pl.DataFrame(result_data)
    return results_df.vstack(result_df)

In [13]:
# Identity and LLM-derived query fields, ranked against the corpus transformed
# without use_title. Each entry is (query column, task label), in run order.
for _field, _task in (
    ('ingredients', 'identity'),
    ('main_food_items_and_preparations', 'llm1'),
    ('important_ingredients_for_search', 'llm2'),
    ('google_search_query_processed', 'llm3'),
):
    results_df = evaluation_task_BOW(eval_ds, query_ds, _field, results_df, _task)

In [14]:
# Human-annotated query fields, ranked against the corpus transformed without
# use_title. Each entry is (query column, task label), in run order.
for _field, _task in (
    ('human_ingredients', 'gold_ingredients'),
    ('human_key_ingredients', 'gold_key_ingredients'),
):
    results_df = evaluation_task_BOW(eval_ds, human_query_ds, _field, results_df, _task)

In [15]:
# Same generated-query tasks, ranked against the corpus transformed with use_title=True.
# Each entry is (query column, task label, prefix the query with the title?).
# NOTE(review): only the identity task prepends the title to the query —
# confirm that leaving it off for the llm* tasks is intentional.
for _field, _task, _with_title in (
    ('ingredients', 'identity', True),
    ('main_food_items_and_preparations', 'llm1', False),
    ('important_ingredients_for_search', 'llm2', False),
    ('google_search_query_processed', 'llm3', False),
):
    results_df = evaluation_task_BOW(
        eval_ds_title, query_ds, _field, results_df, _task,
        use_title=_with_title, model_name='bow_title',
    )

In [16]:
# Human-annotated query fields, ranked against the corpus transformed with
# use_title=True. The queries themselves are not prefixed with the title here.
for _field, _task in (
    ('human_ingredients', 'gold_ingredients'),
    ('human_key_ingredients', 'gold_key_ingredients'),
):
    results_df = evaluation_task_BOW(
        eval_ds_title, human_query_ds, _field, results_df, _task, model_name='bow_title'
    )

In [17]:
# Display the accumulated metrics: one row per (model, task) evaluation run above.
results_df

Model,Eval_Task,Dataset,MRR,P@3,Time_Tot,Time_Query,Error_Count
str,str,str,f64,f64,f64,f64,i32
"""bow""","""identity""","""1k_processed""",1.0,1.0,26.316563,0.026317,0
"""bow""","""llm1""","""1k_processed""",0.99385,0.998,27.107453,0.027107,0
"""bow""","""llm2""","""1k_processed""",0.887626,0.938,23.876507,0.023877,0
"""bow""","""llm3""","""1k_processed""",0.620577,0.686,20.794182,0.020794,0
"""bow""","""gold_ingredients""","""1k_processed""",0.985,0.98,1.205586,0.024112,0
…,…,…,…,…,…,…,…
"""bow_title""","""llm1""","""1k_processed""",0.994071,0.999,31.860938,0.031861,0
"""bow_title""","""llm2""","""1k_processed""",0.901975,0.954,20.554046,0.020554,0
"""bow_title""","""llm3""","""1k_processed""",0.62323,0.708,19.460755,0.019461,0
"""bow_title""","""gold_ingredients""","""1k_processed""",1.0,1.0,1.653312,0.033066,0


In [18]:
from datetime import datetime

# Minute-resolution timestamp used to make the results file name unique per run.
date_time = f"{datetime.now():%Y_%m_%d__%H_%M}"

In [19]:
# Echo the timestamp that will be embedded in the results file name.
date_time

'2025_06_11__14_20'

In [20]:
# Persist the metrics table, named after the run and its timestamp.
# NOTE(review): assumes ../../../data/results/BOW/ already exists — confirm.
results_df.write_parquet(f"../../../data/results/BOW/{run_name}_{date_time}.parquet")

In [21]:
# Read the file back and display it as a check on what was just written.
pl.read_parquet(f"../../../data/results/BOW/{run_name}_{date_time}.parquet")

Model,Eval_Task,Dataset,MRR,P@3,Time_Tot,Time_Query,Error_Count
str,str,str,f64,f64,f64,f64,i32
"""bow""","""identity""","""1k_processed""",1.0,1.0,26.316563,0.026317,0
"""bow""","""llm1""","""1k_processed""",0.99385,0.998,27.107453,0.027107,0
"""bow""","""llm2""","""1k_processed""",0.887626,0.938,23.876507,0.023877,0
"""bow""","""llm3""","""1k_processed""",0.620577,0.686,20.794182,0.020794,0
"""bow""","""gold_ingredients""","""1k_processed""",0.985,0.98,1.205586,0.024112,0
…,…,…,…,…,…,…,…
"""bow_title""","""llm1""","""1k_processed""",0.994071,0.999,31.860938,0.031861,0
"""bow_title""","""llm2""","""1k_processed""",0.901975,0.954,20.554046,0.020554,0
"""bow_title""","""llm3""","""1k_processed""",0.62323,0.708,19.460755,0.019461,0
"""bow_title""","""gold_ingredients""","""1k_processed""",1.0,1.0,1.653312,0.033066,0
