In [98]:
import duckdb
import mlflow
import mlflow.sklearn
import mlflow.spacy
import numpy as np
import polars as pl
import scipy
import scipy.sparse
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity

In [54]:
import os
import sys
from dotenv import load_dotenv
# Put the parent and grandparent directories at the front of the import path,
# presumably so that the `app` package imported below resolves from this notebook.
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../..'))
from app.data_handling.preprocessing import process_ingredient, transform_ingredients_to_tokens
from app.data_handling.model_functions import run_BOW_on_line

# Load variables from a .env file into the process environment; a later cell
# reads MLFLOW_TRACKING_URL through os.getenv.
load_dotenv()

True

In [33]:
# Point the MLflow client at the tracking server named by the
# MLFLOW_TRACKING_URL environment variable.
mlflow.set_tracking_uri(uri=os.getenv("MLFLOW_TRACKING_URL"))

In [5]:
# Read the evaluation set straight from its path. Handing polars the path
# (instead of a text-mode file handle opened without an explicit encoding)
# keeps decoding independent of the platform's default locale encoding.
eval_ds = pl.read_json("../../data/eval_data/processed_data.json")

In [20]:
# Ad-hoc lookup: number the rows from 0 in an "id" column and keep only the
# row(s) whose "index" value is 7006.
filtered_ds = (
    eval_ds
    .with_row_index("id")
    .filter(pl.col("index").eq(7006))
)

In [130]:
def reciprocal_rank(ranked_ds: pl.dataframe.frame.DataFrame, correct_idx: int):
    """Return the reciprocal rank of ``correct_idx`` within ``ranked_ds``.

    Rows are numbered from 1 in their current order, so ``ranked_ds`` is
    expected to be sorted best-first already. The rank is the position of the
    first row whose ``index`` column equals ``correct_idx``.

    Args:
        ranked_ds: Ranked results containing an ``index`` column.
        correct_idx: The ``index`` value of the expected result.

    Returns:
        ``1 / position`` of the first matching row, or ``0`` when no row
        matches.
    """
    numbered = ranked_ds.with_row_index("id", offset=1)
    hits = numbered.filter(pl.col("index") == correct_idx)
    if hits.height > 0:
        return 1 / hits["id"][0]
    # Expected index is absent from the results: reciprocal rank is 0.
    return 0

In [34]:
# Location of the logged model inside a specific MLflow run; the same URI is
# handed to run_BOW_on_line in the evaluation loop further down.
model_uri = "runs:/083d261eafb942a2878f1dab36056631/model"
# Fetch the model through MLflow's scikit-learn flavour.
BOW_model = mlflow.sklearn.load_model(model_uri=model_uri)

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.90it/s]


In [117]:
def transform_ds_to_BOW(ds: pl.DataFrame, model):
    """Return ``ds`` with an added ``tfidf`` column of per-row model outputs.

    For every row, the entries of the ``ingredients`` column are joined with
    newlines into a single document, which is passed to ``model.transform`` as
    a one-element list. The result of each call is stored as that row's
    ``tfidf`` value.

    Args:
        ds: Frame with an ``ingredients`` column of string sequences.
        model: Object exposing ``transform(list_of_documents)`` (presumably a
            fitted scikit-learn vectorizer or pipeline -- confirm).

    Returns:
        A new frame equal to ``ds`` plus the ``tfidf`` column.
    """
    row_vectors = [
        model.transform(["\n".join(ingredient_lines)])
        for (ingredient_lines,) in ds.select('ingredients').iter_rows()
    ]
    tfidf_column = pl.Series(name="tfidf", values=row_vectors)
    return ds.with_columns(tfidf_column)

In [121]:
def rank_results(ds: pl.DataFrame, embedding: scipy.sparse._csr.csr_matrix, embedding_col='tfidf'):
    similarities = cosine_similarity(emb_query, scipy.sparse.vstack(eval_ds_tfidf['tfidf']))
    ranked_ds = eval_ds_tfidf.with_columns(pl.Series(name='rank', values=similarities[0])).filter(pl.col('rank') > 0).sort('rank', descending=True)
    return ranked_ds

In [133]:
tot_rr = 0
tot_evals = 0
                                    
for index, ingredient_list in eval_ds.select('index', 'ingredients').iter_rows():
    emb_query = run_BOW_on_line("\n".join(ingredient_list), model_uri)
    ranked_ds = rank_results(eval_ds, emb_query)
    tot_rr += reciprocal_rank(ranked_ds, index)
    tot_evals += 1

In [134]:
# Summed reciprocal rank, then the number of evaluated queries, one per line.
print(tot_rr, tot_evals, sep="\n")

996.8928571428571
1000


In [135]:
# Mean reciprocal rank over all evaluated queries. With no evaluations the
# mean is undefined, so report NaN instead of raising ZeroDivisionError.
mrr = tot_rr / tot_evals if tot_evals else float("nan")
mrr