# NER + parquet/duckdb Evaluation

In [27]:
import spacy
import mlflow
import mlflow.spacy
import polars as pl
import scipy
import duckdb
import numpy as np
import json
import timeit
import matplotlib.pyplot as plt
from rank_bm25 import BM25Okapi

In [28]:
import os
import sys
from dotenv import load_dotenv

# Prepend the three nearest ancestor directories to the import path so the
# `app` imports below can resolve. Which of the three actually holds the
# package is not visible from this notebook. The list is written highest
# priority first, matching what three successive insert(0, ...) calls yield.
sys.path[:0] = [os.path.abspath(rel) for rel in ('../../..', '../..', '..')]

from app.evaluation import precision_at_k, rank_BOW_results, reciprocal_rank
from app.data_handling.NER_parquet import (
    ParquetDefinition,
    construct_ingredient_query,
    create_parquet_file,
    process_ingredient,
    transform_data_to_tokens,
    transform_ingredients_to_tokens,
)

# Pull variables from a .env file into the process environment; a later cell
# reads MLFLOW_TRACKING_URL from it.
load_dotenv()

True

# Set up definitions
Datasets:
 - 1k_processed
 - 10k
 - 100k
 - full

In [29]:
# Handed to create_parquet_file as `force_overwrite` in the cells below.
force_overwrite_pq = True

# One of the dataset names listed above.
dataset_name = '1k_processed'

# The run name and the model name are deliberately the same string.
run_name = model_name = 'line_only_pre_lemma_v6'

# Earlier run-based model URI, kept for reference:
#model_uri = 'runs:/e00edb7abff845f3913810918de00fe8/model'

In [30]:
# Load both evaluation sets up front; ds_10k and ds_100k are used by the
# sections below.
#
# The paths are handed straight to polars, which reads the files itself.
# Opening them first with open(..., "r") and no `encoding` argument would
# decode the JSON using the platform's default encoding, which is not UTF-8
# on every system.
ds_10k = pl.read_json("../../../data/eval_data/eval_10k.json")
ds_100k = pl.read_json("../../../data/eval_data/eval_100k.json")

## Code

### Normal 10k

In [5]:
# Point MLflow at the server named by the MLFLOW_TRACKING_URL environment
# variable. If the variable is unset, None is passed through unchanged.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URL"))

In [6]:
# URI handed to mlflow.spacy.load_model in the next cell. It names the
# "recipe_NER" model with the "prod" suffix after "@"
# (presumably a model-registry alias -- confirm against the MLflow registry).
model_uri = 'models:/recipe_NER@prod'

In [7]:
# Load the model stored at model_uri through MLflow's spaCy flavor. The result
# is passed to transform_data_to_tokens below.
NER_model = mlflow.spacy.load_model(model_uri)

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

In [8]:
# Output path handed to create_parquet_file for the 10k dataset.
pq_filen = "../../../data/databases/10k.parquet"

In [9]:
# Start the timer; it is read again in a later cell, after the parquet file
# has been written.
start = timeit.default_timer()

# Run the NER model over the 10k ingredient column.
# NOTE(review): the three positional True flags are defined by
# transform_data_to_tokens in app.data_handling.NER_parquet -- confirm their
# meaning there before changing them.
(
    tokens,
    preps,
    optionals,
    varieties,
    brands,
    alt_foods,
) = transform_data_to_tokens(ds_10k['ingredients'], NER_model, True, True, True)

In [10]:
# One ParquetDefinition per extracted column. preps, varieties and brands are
# flagged is_map=True; the others leave is_map at its default.
inp = [
    ParquetDefinition(name="tokens", data=tokens),
    ParquetDefinition(name="preps", data=preps, is_map=True),
    ParquetDefinition(name="optionals", data=optionals),
    ParquetDefinition(name="varieties", data=varieties, is_map=True),
    ParquetDefinition(name="brands", data=brands, is_map=True),
    ParquetDefinition(name="alt_foods", data=alt_foods),
]
create_parquet_file(
    pq_filen,
    ds_10k,
    inp,
    force_overwrite=force_overwrite_pq,
)

# Time elapsed since `start` was set in the earlier cell. It covers token
# extraction, the parquet write, and any pause between running the cells.
time = timeit.default_timer() - start

In [12]:
# Report the total elapsed time and the average per row of ds_10k.
print(
    f"Dataset transformation took {time}s, "
    f"or {time / ds_10k.shape[0]}s/row"
)

Dataset transformation took 485.9048548399005s, or 0.04859048548399005s/row


In [13]:
# Unbind the per-column results and the definition list; the next section
# assigns these names again.
del tokens, preps, optionals, varieties, brands, alt_foods, inp

### Additional preprocessed data model 10k

In [14]:
# Switch to the "prod_extra" variant of the "recipe_NER" model for this section
# (the "@" suffix is presumably a model-registry alias -- confirm in MLflow).
model_uri = 'models:/recipe_NER@prod_extra'

In [15]:
# Replace NER_model with the model stored at the new model_uri, loaded through
# MLflow's spaCy flavor.
NER_model = mlflow.spacy.load_model(model_uri)

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

In [16]:
# Output path handed to create_parquet_file for the 10k dataset processed
# with the "prod_extra" model.
pq_filen = "../../../data/databases/10k_extra.parquet"

In [17]:
# Start the timer; it is read again in a later cell, after the parquet file
# has been written.
start = timeit.default_timer()

# Run the currently loaded NER model over the 10k ingredient column.
# NOTE(review): the three positional True flags are defined by
# transform_data_to_tokens in app.data_handling.NER_parquet -- confirm their
# meaning there before changing them.
(
    tokens,
    preps,
    optionals,
    varieties,
    brands,
    alt_foods,
) = transform_data_to_tokens(ds_10k['ingredients'], NER_model, True, True, True)

In [18]:
# One ParquetDefinition per extracted column. preps, varieties and brands are
# flagged is_map=True; the others leave is_map at its default.
inp = [
    ParquetDefinition(name="tokens", data=tokens),
    ParquetDefinition(name="preps", data=preps, is_map=True),
    ParquetDefinition(name="optionals", data=optionals),
    ParquetDefinition(name="varieties", data=varieties, is_map=True),
    ParquetDefinition(name="brands", data=brands, is_map=True),
    ParquetDefinition(name="alt_foods", data=alt_foods),
]
create_parquet_file(
    pq_filen,
    ds_10k,
    inp,
    force_overwrite=force_overwrite_pq,
)

# Time elapsed since `start` was set in the earlier cell. It covers token
# extraction, the parquet write, and any pause between running the cells.
time = timeit.default_timer() - start

In [19]:
# Report the total elapsed time and the average per row of ds_10k.
print(
    f"Dataset transformation took {time}s, "
    f"or {time / ds_10k.shape[0]}s/row"
)

Dataset transformation took 507.80475788994227s, or 0.05078047578899423s/row


In [20]:
# Unbind the per-column results and the definition list; the next section
# assigns these names again.
del tokens, preps, optionals, varieties, brands, alt_foods, inp

### Normal 100k

In [31]:
# Back to the "prod" variant of the "recipe_NER" model for the 100k run
# (the "@" suffix is presumably a model-registry alias -- confirm in MLflow).
model_uri = 'models:/recipe_NER@prod'

In [32]:
# Load the model stored at model_uri through MLflow's spaCy flavor. The result
# is passed to transform_data_to_tokens below.
NER_model = mlflow.spacy.load_model(model_uri)

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

In [33]:
# Output path handed to create_parquet_file for the 100k dataset.
pq_filen = "../../../data/databases/100k.parquet"

In [34]:
# Start the timer; it is read again in a later cell, after the parquet file
# has been written.
start = timeit.default_timer()

# Run the NER model over the 100k ingredient column.
# NOTE(review): the three positional True flags are defined by
# transform_data_to_tokens in app.data_handling.NER_parquet -- confirm their
# meaning there before changing them.
(
    tokens,
    preps,
    optionals,
    varieties,
    brands,
    alt_foods,
) = transform_data_to_tokens(ds_100k['ingredients'], NER_model, True, True, True)

In [35]:
# One ParquetDefinition per extracted column. preps, varieties and brands are
# flagged is_map=True; the others leave is_map at its default.
inp = [
    ParquetDefinition(name="tokens", data=tokens),
    ParquetDefinition(name="preps", data=preps, is_map=True),
    ParquetDefinition(name="optionals", data=optionals),
    ParquetDefinition(name="varieties", data=varieties, is_map=True),
    ParquetDefinition(name="brands", data=brands, is_map=True),
    ParquetDefinition(name="alt_foods", data=alt_foods),
]
create_parquet_file(
    pq_filen,
    ds_100k,
    inp,
    force_overwrite=force_overwrite_pq,
)

# Time elapsed since `start` was set in the earlier cell. It covers token
# extraction, the parquet write, and any pause between running the cells.
time = timeit.default_timer() - start

In [36]:
# Report the total elapsed time and the average per row of ds_100k.
print(
    f"Dataset transformation took {time}s, "
    f"or {time / ds_100k.shape[0]}s/row"
)

Dataset transformation took 5761.082738347002s, or 0.05761082738347002s/row
