# NER + parquet/duckdbEvaluation 

In [1]:
import spacy
import mlflow
import mlflow.spacy
import polars as pl
import scipy
import duckdb
import numpy as np
import json
import timeit
import matplotlib.pyplot as plt
from rank_bm25 import BM25Okapi

In [2]:
import os
import sys
from dotenv import load_dotenv
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../..'))
sys.path.insert(0, os.path.abspath('../../..'))
from app.evaluation import reciprocal_rank, rank_BOW_results, precision_at_k
from app.data_handling.NER_parquet import (
process_ingredient,
transform_ingredients_to_tokens,
transform_data_to_tokens,
construct_ingredient_query,
ParquetDefinition,
create_parquet_file
)
load_dotenv()

True

# Set up definitions
Datasets:
 - 1k_processed
 - 10k
 - 100k
 - full

In [3]:
dataset_name = '1k_processed'
run_name = 'line_only_pre_lemma_v6'
model_name = run_name
force_overwrite_pq = True
#model_uri = 'runs:/e00edb7abff845f3913810918de00fe8/model'

In [4]:
nlg_ds = pl.read_csv('../../../data/recipenlg/RecipeNLG_dataset.csv')#,  n_rows=1000)
nlg_ds = nlg_ds.with_columns(pl.col("ingredients").str.json_decode())
nlg_ds = nlg_ds.with_columns(pl.col("directions").str.json_decode())
nlg_ds = nlg_ds.with_columns(pl.arange(pl.len()).alias("index"))

In [5]:
eval_ds = nlg_ds

## Running code

### Normal model

In [6]:
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URL"))

In [7]:
model_uri = 'models:/recipe_NER@prod'

In [8]:
NER_model = mlflow.spacy.load_model(model_uri)

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

In [9]:
pq_filen = "../../../data/databases/full.parquet"

In [10]:
start = timeit.default_timer()
tokens, preps, optionals, varieties, brands, alt_foods = transform_data_to_tokens(
    eval_ds['ingredients'], NER_model,
    True, True, True
)

In [11]:
inp = [
    ParquetDefinition(data=tokens, name="tokens"),
    ParquetDefinition(data=preps, name="preps", is_map=True),
    ParquetDefinition(data=optionals, name="optionals"),
    ParquetDefinition(data=varieties, name="varieties", is_map=True),
    ParquetDefinition(data=brands, name="brands", is_map=True),
    ParquetDefinition(data=alt_foods, name="alt_foods"),
]
create_parquet_file(pq_filen, eval_ds, inp, force_overwrite=force_overwrite_pq)
time = timeit.default_timer() - start

In [12]:
print(f"Dataset transformation took {time}s, or {time/eval_ds.shape[0]}s/row")

Dataset transformation took 0.23875798098742962s, or 0.047751596197485924s/row


In [14]:
del tokens, preps, optionals, varieties, brands, alt_food, inp

NameError: name 'tokens' is not defined

### Additional preprocessed data model

In [15]:
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URL"))

In [16]:
model_uri = 'models:/recipe_NER@prod_extra'

In [17]:
NER_model = mlflow.spacy.load_model(model_uri)

Downloading artifacts:   0%|          | 0/17 [00:00<?, ?it/s]

In [32]:
pq_filen = "../../../data/databases/full_extra.parquet"

In [33]:
start = timeit.default_timer()
tokens, preps, optionals, varieties, brands, alt_foods = transform_data_to_tokens(
    eval_ds['ingredients'], NER_model,
    True, True, True
)

In [34]:
inp = [
    ParquetDefinition(data=tokens, name="tokens"),
    ParquetDefinition(data=preps, name="preps", is_map=True),
    ParquetDefinition(data=optionals, name="optionals"),
    ParquetDefinition(data=varieties, name="varieties", is_map=True),
    ParquetDefinition(data=brands, name="brands", is_map=True),
    ParquetDefinition(data=alt_foods, name="alt_foods"),
]
create_parquet_file(pq_filen, eval_ds.select(pl.exclude('tfidf')), inp, force_overwrite=force_overwrite_pq)
time = timeit.default_timer() - start

In [35]:
print(f"Dataset transformation took {time}s, or {time/eval_ds.shape[0]}s/row")

Dataset transformation took 0.36488505703164265s, or 0.07297701140632853s/row


In [37]:
duckdb.sql(f"SELECT * FROM read_parquet('{pq_filen}')")

┌───────┬───────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────────┬──────────┬───────────────────────────────────────────────────────────────────────────────────────────┬───────┬───────────────────────────────────────────────────────────────────────┬────────────────────────────────────────────┬──────────────────────────────────────────────────────────────────────────┬─────────┬────────────────────┬─────────────────────