# NER + Parquet/DuckDB Evaluation

In [1]:
import spacy
import mlflow
import mlflow.spacy
import polars as pl
import scipy
import duckdb
import numpy as np
import json
import timeit
import matplotlib.pyplot as plt
from rank_bm25 import BM25Okapi

In [2]:
import os
import sys
from dotenv import load_dotenv
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../..'))
sys.path.insert(0, os.path.abspath('../../..'))
from app.evaluation import reciprocal_rank, rank_BOW_results, precision_at_k
from app.data_handling.NER_parquet import (
process_ingredient,
transform_ingredients_to_tokens,
transform_data_to_tokens,
construct_ingredient_query,
ParquetDefinition,
create_parquet_file
)
load_dotenv()

True

# Set up definitions
Datasets:
 - 1k_processed
 - 10k
 - 100k
 - full

In [3]:
# Notebook-wide configuration used by the cells below.
dataset_name = '1k_processed'  # selects which evaluation dataset branch is loaded
run_name = 'line_only_pre_lemma_v6_extra'
model_name = run_name  # label written to the "Model" column of the results table
force_overwrite_pq = False  # NOTE(review): not referenced elsewhere in the visible cells
model_uri = 'models:/recipe_NER@prod'  # MLflow URI passed to mlflow.spacy.load_model
#model_uri = 'runs:/e00edb7abff845f3913810918de00fe8/model'

In [4]:
# Load the two query sets used for evaluation: the processed queries and the
# human-annotated ("gold") queries.
# The files are opened as UTF-8 explicitly so that non-ASCII text is decoded
# the same way regardless of the platform's default text encoding.
with open("../../../data/eval_data/processed_data.json", "r", encoding="utf-8") as f:
    query_ds = pl.read_json(f)
with open("../../../data/eval_data/human_annotations.json", "r", encoding="utf-8") as f:
    human_query_ds = pl.read_json(f)

In [5]:
# Pick the evaluation dataset that matches `dataset_name`.
# '1k_processed' reuses the already-loaded query set; '10k' and '100k' are read
# from their own JSON files (as UTF-8, independent of the platform default).
# NOTE(review): no branch matches any other name (e.g. 'full'), so `eval_ds`
# is left unassigned by this cell in that case.
if dataset_name == '1k_processed':
    eval_ds = query_ds
elif dataset_name == '10k':
    with open("../../../data/eval_data/eval_10k.json", "r", encoding="utf-8") as f:
        eval_ds = pl.read_json(f)
elif dataset_name == '100k':
    with open("../../../data/eval_data/eval_100k.json", "r", encoding="utf-8") as f:
        eval_ds = pl.read_json(f)

In [6]:
# Empty, explicitly typed results table; evaluation_task_NER appends one row
# per evaluation task to a frame with this layout.
results_df = pl.DataFrame(
    schema={
        'Model': pl.String,
        'Eval_Task': pl.String,
        "Dataset": pl.String,
        "MRR": pl.Float64,  # filled from the averaged reciprocal_rank values
        "TKA": pl.Float64,  # filled from the averaged precision_at_k values
        "Time_Tot": pl.Float64,  # wall-clock time of the whole task (timeit.default_timer)
        "Time_Query": pl.Float64,  # Time_Tot divided by the number of scored queries
        "Error_Count": pl.Int32  # queries whose execution/scoring raised
    })
# Bare expression so the notebook displays the (empty) frame.
results_df

Model,Eval_Task,Dataset,MRR,TKA,Time_Tot,Time_Query,Error_Count
str,str,str,f64,f64,f64,f64,i32


## Running code

In [7]:
# Point MLflow at the tracking server named by the MLFLOW_TRACKING_URL
# environment variable (os.getenv yields None when the variable is unset).
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URL"))

In [8]:
#model_uri = 'models:/recipe_NER@prod'
# Load the spaCy NER pipeline from MLflow; evaluation_task_NER reads this
# notebook-level `NER_model` when tokenising queries.
NER_model = mlflow.spacy.load_model(model_uri)

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

In [9]:
# Default parquet file name derived from the dataset name (relative path).
# The evaluation loop further down reassigns it for the '10k' and '100k' datasets.
pq_filen = f"{dataset_name}.parquet"

In [10]:
def rank_results(df: pl.DataFrame, query: list[str]):
    """Order candidate rows by their BM25 score against ``query``.

    An empty frame is handed back untouched (no ``rank`` column is added).
    Otherwise the ``tokens`` column serves as the BM25 corpus, each row's
    score is stored in a new ``rank`` column, and the rows are returned
    sorted from highest to lowest score.
    """
    row_count = df.shape[0]
    if row_count == 0:
        return df

    documents = list(df['tokens'])
    scorer = BM25Okapi(documents)
    # Despite its name, the 'rank' column holds the raw BM25 score.
    score_column = pl.Series(name='rank', values=scorer.get_scores(query))
    scored = df.with_columns(score_column)
    return scored.sort('rank', descending=True)

In [11]:
def evaluation_task_NER(parquet_path:str, query_ds: pl.DataFrame, query_field: str, results_df: pl.DataFrame, task_name: str, debug = False, model_name='ner',
                       use_var=False, use_alt=False):
    """Run one retrieval evaluation task and append its metrics to ``results_df``.

    For every row of ``query_ds`` the value of ``query_field`` is tokenised with
    the notebook-level ``NER_model``, turned into SQL by
    ``construct_ingredient_query``, executed through DuckDB, re-ranked with
    ``rank_results`` and scored with ``reciprocal_rank`` and ``precision_at_k``.
    Progress banners also read the notebook-level ``dataset_name``.

    Args:
        parquet_path: Path handed to ``construct_ingredient_query``.
        query_ds: Frame with an ``index`` column and the ``query_field`` column.
        query_field: Column holding the query. For
            ``'google_search_query_processed'`` the cell value is indexed with
            ``['ingredients']`` before tokenising.
        results_df: Accumulated results table; the new row is built with its schema.
        task_name: Label stored in ``Eval_Task``. For the adversarial /
            added-prep tasks both metrics are reported as ``1 - value``.
        debug: Currently unused; kept for interface compatibility.
        model_name: Label stored in the ``Model`` column.
        use_var: Forwarded as ``include_variety``; when true the ``vars`` tokens
            are also passed to ``construct_ingredient_query``.
        use_alt: Forwarded to ``construct_ingredient_query``.

    Returns:
        ``results_df`` with one additional row for this task. When no query
        could be scored, ``MRR``, ``TKA`` and ``Time_Query`` are NaN instead of
        raising ``ZeroDivisionError``.
    """
    print(f"--- Running {task_name} on {dataset_name} using {model_name} ---")
    # Tasks whose metrics are reported as the complement (1 - value).
    inverted_tasks = {
        'gold_adversarial_ingredients',
        'gold_adversarial_subset_ingredients',
        'added_prep_suffixes',
    }
    tot_rr = 0
    tot_prec_at_3 = 0
    tot_evals = 0
    errors = 0
    start = timeit.default_timer()
    for index, query in query_ds.select('index', query_field).iter_rows():
        ingredients = query['ingredients'] if query_field == 'google_search_query_processed' else query
        query_obj = transform_ingredients_to_tokens(ingredients, NER_model, include_variety=use_var)
        if use_var:
            sql = construct_ingredient_query(parquet_path, query_obj['foods'], query_obj['preps'], query_obj['vars'], use_alt=use_alt)
        else:
            sql = construct_ingredient_query(parquet_path, query_obj['foods'], query_obj['preps'], use_alt=use_alt)
        try:
            results = duckdb.sql(sql).pl()
            results = rank_results(results, query_obj['foods'])
            rr = reciprocal_rank(results, index)
            prec = precision_at_k(results, index)
        except Exception as e:
            # Best effort: a failing query is counted and reported, not fatal.
            errors += 1
            print(e)
            continue
        # Accumulate only after both metrics succeeded, so a failure in the
        # second metric cannot leave a half-counted query in the totals.
        tot_rr += rr
        tot_prec_at_3 += prec
        tot_evals += 1
    elapsed = timeit.default_timer() - start
    if tot_evals:
        mrr = tot_rr / tot_evals
        mprec3 = tot_prec_at_3 / tot_evals
        time_per_query = elapsed / tot_evals
    else:
        # Nothing was scored (empty query set or every query failed):
        # the averages are undefined, so report NaN rather than crash.
        mrr = mprec3 = time_per_query = float("nan")
    if task_name in inverted_tasks:
        mrr = 1 - mrr
        mprec3 = 1 - mprec3
    result_data = {
        "Model": [model_name],
        'Eval_Task': [task_name],
        'Dataset': [dataset_name],
        "MRR": [mrr],
        "TKA": [mprec3],
        "Time_Tot": [elapsed],
        "Time_Query": [time_per_query],
        "Error_Count": [errors]
    }
    # Build the row with the accumulator's schema so dtypes (e.g. Int32 for
    # Error_Count) match exactly when stacking.
    result_df = pl.DataFrame(result_data, schema=results_df.schema)
    results_df = results_df.vstack(result_df)
    print(f"--- COMPLETED {task_name} in {elapsed}s ---")
    return results_df

In [12]:
from datetime import datetime
# Settings for the evaluation run below.
datasets = ['100k']  # dataset names iterated by the evaluation loop
model_name = "prod_prep_alt"  # replaces the model_name assigned in the configuration cell
date_time = datetime.now().strftime("%Y_%m_%d__%H_%M")  # timestamp suffix for the results file name
use_var = False  # forwarded to evaluation_task_NER(use_var=...)
use_alt = True  # forwarded to evaluation_task_NER(use_alt=...)

In [13]:
# Evaluate each selected dataset on the human-annotated ("gold") query tasks
# and write one timestamped results parquet per dataset.
parquet_paths = {
    '10k': "../../../data/databases/10k.parquet",
    '100k': "../../../data/databases/100k.parquet",
}
for dataset_name in datasets:
    # Start every dataset from an empty, explicitly typed results table.
    results_df = pl.DataFrame(
        schema={
            'Model': pl.String,
            'Eval_Task': pl.String,
            "Dataset": pl.String,
            "MRR": pl.Float64,
            "TKA": pl.Float64,
            "Time_Tot": pl.Float64,
            "Time_Query": pl.Float64,
            "Error_Count": pl.Int32
        }
    )
    run_name = f"{model_name}_{dataset_name}"
    # Datasets without an entry keep whatever pq_filen was assigned earlier.
    if dataset_name in parquet_paths:
        pq_filen = parquet_paths[dataset_name]
    #pq_filen='1k_processed.parquet'

    # (query frame, query column, task label); commented entries are disabled tasks.
    eval_tasks = [
        #(query_ds, 'ingredients', 'identity'),
        #(query_ds, 'main_food_items_and_preparations', 'llm1'),
        #(query_ds, 'important_ingredients_for_search', 'llm2'),
        #(query_ds, 'google_search_query_processed', 'llm3'),
        #(query_ds, 'added_preps', 'added_prep_suffixes'),
        (human_query_ds, 'human_ingredients', 'gold_ingredients'),
        (human_query_ds, 'human_key_ingredients', 'gold_key_ingredients'),
        (human_query_ds, 'human_adv_ingredients', 'gold_adversarial_ingredients'),
        (human_query_ds, 'human_adv_subset_ingredients', 'gold_adversarial_subset_ingredients'),
    ]
    for task_queries, task_field, task_label in eval_tasks:
        results_df = evaluation_task_NER(
            pq_filen, task_queries, task_field, results_df, task_label,
            model_name=model_name, use_var=use_var, use_alt=use_alt,
        )

    results_df.write_parquet(f"../../../data/results/NER/{run_name}_{date_time}.parquet")

--- Running gold_ingredients on 100k using prod_prep_alt ---
--- COMPLETED gold_ingredients in 35.728461399907246s ---
--- Running gold_key_ingredients on 100k using prod_prep_alt ---
--- COMPLETED gold_key_ingredients in 27.233477127039805s ---
--- Running gold_adversarial_ingredients on 100k using prod_prep_alt ---
--- COMPLETED gold_adversarial_ingredients in 17.88959748391062s ---
--- Running gold_adversarial_subset_ingredients on 100k using prod_prep_alt ---
--- COMPLETED gold_adversarial_subset_ingredients in 17.30488138704095s ---


In [14]:
results_df

Model,Eval_Task,Dataset,MRR,TKA,Time_Tot,Time_Query,Error_Count
str,str,str,f64,f64,f64,f64,i32
"""prod_prep_alt""","""gold_ingredients""","""100k""",0.37,0.42,35.728461,0.714569,0
"""prod_prep_alt""","""gold_key_ingredients""","""100k""",0.511952,0.54,27.233477,0.54467,0
"""prod_prep_alt""","""gold_adversarial_ingredients""","""100k""",0.95,0.94,17.889597,0.357792,0
"""prod_prep_alt""","""gold_adversarial_subset_ingred…","""100k""",0.98942,0.98,17.304881,0.346098,0


### results_df.write_parquet(f"../../../data/results/NER/{run_name}_{date_time}.parquet")

### pl.read_parquet(f"../../../data/results/NER/{run_name}_{date_time}.parquet")