In [27]:
import polars as pl
import scipy
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from dotenv import load_dotenv
import os
import sys
sys.path.insert(0, os.path.abspath('..'))
sys.path.insert(0, os.path.abspath('../..'))
from app.data_handling.preprocessing import lemmatize_line
from app.data_handling.model_functions import run_BOW_on_line

load_dotenv()

True

In [2]:
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URL"))
mlflow.set_experiment("recipe_BOW")
mlflow.sklearn.autolog()

In [3]:
#nlg_ds = pd.read_csv('../../data/recipenlg/RecipeNLG_dataset.csv', converters={'ingredients': pd.eval})

In [7]:
nlg_ds = pl.read_csv('../../data/recipenlg/RecipeNLG_dataset.csv',  n_rows=10)
nlg_ds = nlg_ds.with_columns(pl.col("ingredients").str.json_decode())
nlg_ds = nlg_ds.with_columns(pl.col("directions").str.json_decode())
nlg_ds = nlg_ds.with_columns(pl.arange(pl.len()).alias("index"))

In [8]:
for t, i in nlg_ds.select('title','ingredients').iter_rows():
    print(t, i)
    break

No-Bake Nut Cookies ['1 c. firmly packed brown sugar', '1/2 c. evaporated milk', '1/2 tsp. vanilla', '1/2 c. broken nuts (pecans)', '2 Tbsp. butter or margarine', '3 1/2 c. bite size shredded rice biscuits']


In [9]:
def ingredient_iter(ds):
    for title, ingredient_list in nlg_ds.select('title','ingredients').iter_rows():
        yield lemmatize_line(title)
        for ingredient_line in ingredient_list:
            yield lemmatize_line(ingredient_line)

In [10]:
vectorizer = TfidfVectorizer(stop_words='english')
with mlflow.start_run() as run:
    vectorizer.fit(ingredient_iter(nlg_ds))
    mlflow.sklearn.log_model(sk_model=vectorizer, artifact_path="model")



🏃 View run shivering-midge-633 at: https://mlflow.local.agefvert.com/#/experiments/995147294083552734/runs/dbf1d0ee3ad74736b27056624469ed9a
🧪 View experiment at: https://mlflow.local.agefvert.com/#/experiments/995147294083552734


In [11]:
model_uri = f"runs:/{run.info.run_id}/model"

In [12]:
sk_model = mlflow.sklearn.load_model(model_uri)

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 14.99it/s]


In [18]:
model_info = mlflow.models.get_model_info(model_uri=model_uri)

Downloading artifacts: 100%|██████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  7.60it/s]


In [21]:
model_info.model_uri

'runs:/dbf1d0ee3ad74736b27056624469ed9a/model'

In [22]:
model_uri

'runs:/dbf1d0ee3ad74736b27056624469ed9a/model'

In [23]:
"\n".join(nlg_ds['ingredients'][0])

'1 c. firmly packed brown sugar\n1/2 c. evaporated milk\n1/2 tsp. vanilla\n1/2 c. broken nuts (pecans)\n2 Tbsp. butter or margarine\n3 1/2 c. bite size shredded rice biscuits'

In [24]:
sk_model.transform(["\n".join(nlg_ds['ingredients'][0])])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 14 stored elements and shape (1, 123)>

In [25]:
sk_model.transform(["\n".join(nlg_ds['ingredients'][0])])

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 14 stored elements and shape (1, 123)>

In [29]:
run_BOW_on_line([
    "\n".join(nlg_ds['ingredients'][0]),
    "\n".join(nlg_ds['ingredients'][1])
    ], model_uri)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 32 stored elements and shape (2, 123)>

In [26]:
def transform_ds_to_BOW(ds: pd.core.frame.DataFrame, model):
    ds['tfidf'] = ""
    ds['tfidf'] = ds['tfidf'].astype('object')
    for i, ingredients in enumerate(ds['ingredients']):
        datapoint_tfidf = model.transform(["\n".join(ingredients)])
        ds.at[i, 'tfidf'] = datapoint_tfidf

NameError: name 'pd' is not defined

In [19]:
transform_ds_to_BOW(nlg_ds, sk_model)

In [20]:
nlg_ds['tfidf'][0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 20 stored elements and shape (1, 2964)>

In [21]:
nlg_ds

Unnamed: 0.1,Unnamed: 0,title,ingredients,directions,link,source,NER,tfidf
0,0,No-Bake Nut Cookies,"[1 c. firmly packed brown sugar, 1/2 c. evapor...","[""In a heavy 2-quart saucepan, mix brown sugar...",www.cookbooks.com/Recipe-Details.aspx?id=44874,Gathered,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...",<Compressed Sparse Row sparse matrix of dtype ...
1,1,Jewell Ball'S Chicken,"[1 small jar chipped beef, cut up, 4 boned chi...","[""Place chipped beef on bottom of baking dish....",www.cookbooks.com/Recipe-Details.aspx?id=699419,Gathered,"[""beef"", ""chicken breasts"", ""cream of mushroom...",<Compressed Sparse Row sparse matrix of dtype ...
2,2,Creamy Corn,"[2 (16 oz.) pkg. frozen corn, 1 (8 oz.) pkg. c...","[""In a slow cooker, combine all ingredients. C...",www.cookbooks.com/Recipe-Details.aspx?id=10570,Gathered,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...",<Compressed Sparse Row sparse matrix of dtype ...
3,3,Chicken Funny,"[1 large whole chicken, 2 (10 1/2 oz.) cans ch...","[""Boil and debone chicken."", ""Put bite size pi...",www.cookbooks.com/Recipe-Details.aspx?id=897570,Gathered,"[""chicken"", ""chicken gravy"", ""cream of mushroo...",<Compressed Sparse Row sparse matrix of dtype ...
4,4,Reeses Cups(Candy),"[1 c. peanut butter, 3/4 c. graham cracker cru...","[""Combine first four ingredients and press in ...",www.cookbooks.com/Recipe-Details.aspx?id=659239,Gathered,"[""peanut butter"", ""graham cracker crumbs"", ""bu...",<Compressed Sparse Row sparse matrix of dtype ...
...,...,...,...,...,...,...,...,...
9995,9995,Pink Fruit Salad,"[1 can cherry pie filling, 1 can sweetened con...","[""Combine all ingredients and chill.""]",www.cookbooks.com/Recipe-Details.aspx?id=183170,Gathered,"[""cherry pie filling"", ""condensed milk"", ""pine...",<Compressed Sparse Row sparse matrix of dtype ...
9996,9996,Peppered Steak,"[1 lb. round steak, 1 bell pepper, 1 onion, 1 ...","[""Cut steaks into strips; brown in cooking oil...",www.cookbooks.com/Recipe-Details.aspx?id=462037,Gathered,"[""bell pepper"", ""onion"", ""tomatoes"", ""salt"", ""...",<Compressed Sparse Row sparse matrix of dtype ...
9997,9997,Chicken Casserole,"[3 lb. fryer, 1 large onion, 1 large green pep...","[""Stew and bone fryer."", ""Saute in small amoun...",www.cookbooks.com/Recipe-Details.aspx?id=292083,Gathered,"[""fryer"", ""onion"", ""green pepper"", ""celery"", ""...",<Compressed Sparse Row sparse matrix of dtype ...
9998,9998,Sweet Potatoes Casserole,"[1 large can yams, 1 c. sugar, 1 egg, 1/3 c. m...","[""Mix together for 2 or 3 minutes. Put into gr...",www.cookbooks.com/Recipe-Details.aspx?id=56276,Gathered,"[""yams"", ""sugar"", ""egg"", ""milk"", ""vanilla flav...",<Compressed Sparse Row sparse matrix of dtype ...


In [27]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
q_emb = sk_model.transform(["tomato, bell pepper, cream"])

In [30]:
# Calculate cosine similarity between query and each embedding
nlg_ds['similarity'] = nlg_ds['tfidf'].apply(lambda x: cosine_similarity(q_emb, x))

In [32]:
# Sort by similarity score
df_sorted = nlg_ds.sort_values('similarity', ascending=False)