In [1]:
import os
import spacy
import polars as pl
import mlflow
import mlflow.spacy
from pathlib import Path
import json
from spacy.util import minibatch, compounding
from spacy.cli.train import train
from dotenv import load_dotenv
import sys
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../..'))
from app.data_handling.NER_parquet import process_ingredient, transform_ingredients_to_tokens, transform_ds_to_tokens, construct_ingredient_query

# Create a DataFrame and save it as Parquet

In [2]:
# Load variables from a .env file into the process environment so the
# os.getenv("MLFLOW_TRACKING_URL") lookup in the next cell can see them.
load_dotenv()

True

In [3]:
# Fail fast when the tracking URL is missing: os.getenv() would return None and
# MLflow would then not be pointed at the intended tracking server, which only
# surfaces later as a confusing error when the registered model is loaded.
tracking_url = os.getenv("MLFLOW_TRACKING_URL")
if not tracking_url:
    raise RuntimeError(
        "MLFLOW_TRACKING_URL is not set; define it in the environment or in the .env file."
    )
mlflow.set_tracking_uri(tracking_url)

In [4]:
# Load the spaCy pipeline registered in MLflow as `recipe_NER` under the `prod` alias.
model_uri = "models:/recipe_NER@prod"
ner = mlflow.spacy.load_model(model_uri=model_uri)


  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|████████████████████████████████████████████████████████| 17/17 [00:03<00:00,  4.89it/s]


In [5]:
# Check which spaCy language class the loaded pipeline is.
type(ner)

spacy.lang.en.English

In [6]:
# Read the first 1000 RecipeNLG rows, decode the JSON-encoded `ingredients` and
# `directions` strings into lists, then append a running row number as `index`.
nlg_ds = (
    pl.read_csv('../../data/recipenlg/RecipeNLG_dataset.csv', n_rows=1000)
    .with_columns(
        pl.col("ingredients").str.json_decode(),
        pl.col("directions").str.json_decode(),
    )
    .with_columns(pl.arange(pl.len()).alias("index"))
)

In [7]:
# Preview the first rows to confirm `ingredients` and `directions` were decoded into lists.
nlg_ds.head()

Unnamed: 0_level_0,title,ingredients,directions,link,source,NER,index
i64,str,list[str],list[str],str,str,str,i64
0,"""No-Bake Nut Cookies""","[""1 c. firmly packed brown sugar"", ""1/2 c. evaporated milk"", … ""3 1/2 c. bite size shredded rice biscuits""]","[""In a heavy 2-quart saucepan, mix brown sugar, nuts, evaporated milk and butter or margarine."", ""Stir over medium heat until mixture bubbles all over top."", … ""Let stand until firm, about 30 minutes.""]","""www.cookbooks.com/Recipe-Detai…","""Gathered""","""[""brown sugar"", ""milk"", ""vanil…",0
1,"""Jewell Ball'S Chicken""","[""1 small jar chipped beef, cut up"", ""4 boned chicken breasts"", … ""1 carton sour cream""]","[""Place chipped beef on bottom of baking dish."", ""Place chicken on top of beef."", ""Mix soup and cream together; pour over chicken. Bake, uncovered, at 275° for 3 hours.""]","""www.cookbooks.com/Recipe-Detai…","""Gathered""","""[""beef"", ""chicken breasts"", ""c…",1
2,"""Creamy Corn""","[""2 (16 oz.) pkg. frozen corn"", ""1 (8 oz.) pkg. cream cheese, cubed"", … ""1/4 tsp. pepper""]","[""In a slow cooker, combine all ingredients. Cover and cook on low for 4 hours or until heated through and cheese is melted. Stir well before serving. Yields 6 servings.""]","""www.cookbooks.com/Recipe-Detai…","""Gathered""","""[""frozen corn"", ""cream cheese""…",2
3,"""Chicken Funny""","[""1 large whole chicken"", ""2 (10 1/2 oz.) cans chicken gravy"", … ""4 oz. shredded cheese""]","[""Boil and debone chicken."", ""Put bite size pieces in average size square casserole dish."", … ""Sprinkle shredded cheese on top and bake at 350° for approximately 20 minutes or until golden and bubbly.""]","""www.cookbooks.com/Recipe-Detai…","""Gathered""","""[""chicken"", ""chicken gravy"", ""…",3
4,"""Reeses Cups(Candy) ""","[""1 c. peanut butter"", ""3/4 c. graham cracker crumbs"", … ""1 large pkg. chocolate chips""]","[""Combine first four ingredients and press in 13 x 9-inch ungreased pan."", ""Melt chocolate chips and spread over mixture. Refrigerate for about 20 minutes and cut into pieces before chocolate gets hard."", ""Keep in refrigerator.""]","""www.cookbooks.com/Recipe-Detai…","""Gathered""","""[""peanut butter"", ""graham crac…",4


In [8]:
# (rows, columns) of the loaded sample.
nlg_ds.shape

(1000, 8)

### `transform_ds_to_tokens` — reference copy of the helper imported from `app.data_handling.NER_parquet`

NOTE: this copy's signature omits `ner`, which the body reads and which the call below passes as the second argument; the copy may be out of date relative to the imported module.

```python
def transform_ds_to_tokens(ds: pl.dataframe.frame.DataFrame, create_vocab=False):  # Assume recipeNLG dataset, where each ingredient line is a list item
    tokens = []
    preps = []
    optionals = []
    for i, ingredients in enumerate(ds['ingredients']):
        datapoint_obj = transform_ingredients_to_tokens(ingredients, ner, create_vocab)
        tokens.append(datapoint_obj['foods'])
        preps.append(json.dumps(datapoint_obj['preps']))
        optionals.append(datapoint_obj['optionals'])
    new_ds = ds.with_columns(
        pl.Series(name='tokens', values=tokens),
        pl.Series(name='preps', values=preps),
        pl.Series(name='optionals', values=optionals)
    )
    return new_ds
```

In [None]:
# Run the NER pipeline over every recipe's ingredient lines via the imported helper.
# NOTE(review): presumably this adds the `tokens`, `preps` and `optionals` columns that the
# DuckDB queries below read — confirm against app.data_handling.NER_parquet.
nlg_ds = transform_ds_to_tokens(nlg_ds, ner, create_vocab=True)

In [None]:
# Persist the processed frame to a Parquet file in the working directory so DuckDB can query it.
nlg_ds.write_parquet('test.parquet')

# Test some queries using DuckDB and Parquet

In [None]:
import duckdb

In [None]:
# Sanity check: every recipe whose `tokens` list contains 'egg', returned as a Polars DataFrame.
duckdb.sql("SELECT * FROM 'test.parquet' AS tbl WHERE list_contains(tbl.tokens, 'egg')").pl()

In [None]:
# Rewrite test.parquet with `preps` cast to DuckDB's JSON type (it becomes the first column);
# all other columns are carried over unchanged.
# NOTE(review): this statement reads from and overwrites the same file — confirm that is safe
# with the installed DuckDB version, or write to a separate path.
duckdb.sql("COPY(SELECT preps::JSON AS preps, * EXCLUDE (preps) FROM 'test.parquet') TO 'test.parquet' (FORMAT PARQUET, OVERWRITE TRUE)")

In [None]:
# Example inputs for construct_ingredient_query:
#   ingredients - tokens that must all be present in a recipe's `tokens` list
#   preps       - per-ingredient lists of preparations that must all be present
ingredients = ['butter']
preps = {'butter': ['cube'], 'cream_cheese': ['cube']}

In [None]:
# The ingredient names that carry a preparation constraint.
list(preps.keys())

### `construct_ingredient_query` — reference copy of the helper imported from `app.data_handling.NER_parquet`

```python
def construct_ingredient_query(pq_path: str, ingredients: list, preps: dict):
    prep_keys = list(preps.keys())
    base_sql = (f"SELECT *, json_extract(preps, {prep_keys}) AS prep_lists"
                f" FROM '{pq_path}'"
                " AS tbl"
                f" WHERE list_has_all(tbl.tokens, {ingredients})"
               )
    prep_filter = ""
    for i, (key, value) in enumerate(preps.items()):
        prep_filter += (f" AND (prep_lists->>{i}) IS NOT NULL"
                        f" AND list_has_all(from_json(prep_lists->>{i}, '[\"VARCHAR\"]'), {value})"
                       )
    sql = base_sql + prep_filter
    return sql
```

In [None]:
# Build the SQL for the example inputs above and run it against the Parquet file.
duckdb.sql(construct_ingredient_query('test.parquet', ingredients, preps)).pl()

In [None]:
def _print_recipes(banner_lines, recipes):
    """Print a banner followed by each recipe's title and ingredient lines.

    Args:
        banner_lines: Strings printed verbatim, one per line, before the recipes.
        recipes: Polars DataFrame with a `title` column and an `ingredients`
            column holding a list of strings per row.
    """
    for line in banner_lines:
        print(line)
    # select() makes it explicit that two columns are being picked (not a row/column pair).
    for title, ingr in recipes.select('title', 'ingredients').iter_rows():
        print()
        print(f"---- {title} ----")
        print("\n".join(ingr))


# Same ingredient queried twice: first with no preparation constraint, then
# restricted to recipes where the almond is chopped, to compare the result sets.
ingredients = ['almond']

preps = {}
sql = construct_ingredient_query('test.parquet', ingredients, preps)
results = duckdb.sql(sql).pl()
_print_recipes(
    [
        "###############################",
        "####### WITHOUT PREPS #########",
        "###############################",
    ],
    results,
)

preps = {'almond': ['chop']}
sql = construct_ingredient_query('test.parquet', ingredients, preps)
results = duckdb.sql(sql).pl()
_print_recipes(
    [
        "##############################",
        "######## WITH PREPS ##########",
        "##############################",
    ],
    results,
)