In [None]:
pip install transformers sentence-transformers umap-learn hdbscan torch tqdm rapidfuzz

In [3]:
import re

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import umap.umap_ as umap
from sentence_transformers import SentenceTransformer
import nltk

from rapidfuzz import process
from tqdm import tqdm

# Importing the datasets


In [4]:
def _parse_list_cell(cell):
    """Turn a stringified list such as "['a', 'b']" or "[a, b]" back into a list of strings.

    ``pd.read_csv`` returns list-valued columns as plain strings. Values that
    are not strings, or that are not wrapped in square brackets, are returned
    unchanged, so already-clean data is left alone.
    """
    import ast  # local import: only this loader helper needs it

    if not isinstance(cell, str):
        return cell
    text = cell.strip()
    if not (text.startswith("[") and text.endswith("]")):
        return cell
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        cleaned = (str(item).strip() for item in parsed)
        return [item for item in cleaned if item]
    # Not a valid Python literal (e.g. unquoted items): split the bracket content by hand.
    cleaned = (part.strip().strip("'\"").strip() for part in text[1:-1].split(","))
    return [item for item in cleaned if item]


full_dataset = pd.read_csv('../datasets/ds_verbs.csv')

# The cells below test these columns with isinstance(..., (list, np.ndarray)) and
# otherwise split/tokenise on commas, which would leave the surrounding brackets
# (and any quotes) glued to the first and last token of every row. Restoring real
# lists here keeps the token vocabulary clean for all three pipelines.
for _list_col in ("NER_clean", "set_verbs", "pairs_set"):
    if _list_col in full_dataset.columns:
        full_dataset[_list_col] = full_dataset[_list_col].apply(_parse_list_cell)

In [5]:
# Sanity check after loading: report (rows, columns), then show the first
# five recipes (five is the default for head()).
print(f'Shape: {full_dataset.shape}')
full_dataset.head()

Shape: (25758, 18)


Unnamed: 0,title,ingredients,directions,link,source,NER,NER_list,NER_len,directions_len,title_lower,NER_clean,verbs,lemmed_verbs,filtered_verbs,set_verbs,clean_direction,pairs,pairs_set
0,""" Bar"" Cheese","[""32 ounces Velveeta cheese (regular)"", ""1 1/2...","[""In a large sauce pan over low heat, melt the...",www.food.com/recipe/bar-cheese-42151,Gathered,"[""Velveeta cheese"", ""mayonnaise"", ""horseradish...","[velveeta cheese, mayonnaise, horseradish, tab...",5,368,""" bar"" cheese","[velveeta cheese, mayonnaise, horseradish, tab...","[pan, add, stirring, add, stirring, cool, refr...","[pan, add, stir, add, stir, cool, refrigerate]","[pan, add, stir, add, stir, cool, refrigerate]","[refrigerate, cool, pan, stir, add]",large sauce pan low heat melt velveeta add jar...,"[add horseradish, add mayo, add worcestershire...","[refrigerate choice, add horseradish, add worc..."
1,""" Barber's"" Chewy Vanilla & Salted Butter Toffee","[""2 14 cups icing sugar"", ""12 cup milk"", ""14 c...","[""Butter a loaf pan."", ""In a pot,combine icing...",www.food.com/recipe/barbers-chewy-vanilla-salt...,Recipes1M,"[""icing sugar"", ""milk"", ""corn syrup"", ""vanilla...","[icing sugar, milk, corn syrup, vanilla bean, ...",5,557,""" barber's"" chewy vanilla & salted butter toffee","[icing sugar, milk, corn syrup, vanilla bean, ...","[pot, stirring, reduce, remove, stirring, reac...","[pot, stir, reduce, remove, stir, reach, pour,...","[pot, stir, reduce, remove, stir, reach, pour,...","[cut, remove, reach, stir, reduce, pot, pour]",butter loaf pan pot combine icing sugar milk c...,"[remove vanilla, stir butter]","[stir butter, remove vanilla]"
2,""" Denauseating"" With Ginger Tea","[""1 1/2 cups water"", ""1/2 teaspoon of crushed ...","[""Bring water to boil with crushed ginger in i...",www.food.com/recipe/denauseating-with-ginger-t...,Gathered,"[""water"", ""ginger"", ""honey"", ""milk""]","[water, ginger, honey, milk]",4,239,""" denauseating"" with ginger tea","[water, ginger, honey, milk]","[crushed, let, medium, strain, add, add, want]","[crush, let, medium, strain, add, add, want]","[crush, let, medium, strain, add, add, want]","[let, strain, want, crush, add, medium]",bring water boil crushed ginger let simmer min...,"[crush ginger, add tea, add milk]","[crush ginger, add milk, add tea]"
3,""" Dunkin Donuts "" Oreo Coffee Coolatta","[""16 Oreo's - crushed with few ones set sdide""...","[""Add all your ice cubes into the blender."", ""...",cookpad.com/us/recipes/483003-dunkin-donuts-or...,Recipes1M,"[""coffee"", ""heavy cream"", ""chocolate syrup"", ""...","[coffee, heavy cream, chocolate syrup, sugar]",4,503,""" dunkin donuts "" oreo coffee coolatta","[coffee, heavy cream, chocolate syrup, sugar]","[add, blender, give, using, blend, give, want,...","[add, blender, give, use, blend, give, want, g...","[add, blender, give, use, blend, give, want, g...","[get, give, blend, use, make, want, blender, add]",add ice cubes blender pour cup coffee give liq...,"[add cubes, give liquid, want coffee, blend co...","[want coffee, blend coffee, add coffee, give l..."
4,""" German"" Barbecued Carrots","[""1 lb carrot, peeled and sliced"", ""6 slices b...","[""Preheat oven to 350F."", ""Cook bacon until cr...",www.food.com/recipe/german-barbecued-carrots-1...,Recipes1M,"[""carrot"", ""bacon"", ""tomato soup"", ""sugar""]","[carrot, bacon, tomato soup, sugar]",4,256,""" german"" barbecued carrots","[carrot, bacon, tomato soup, sugar]","[cook, peel, place, baking, add, bake, eat]","[cook, peel, place, bake, add, bake, eat]","[cook, peel, place, bake, add, bake, eat]","[eat, place, bake, peel, cook, add]",preheat oven cook bacon crisp drain grease cru...,"[cook bacon, peel carrots, add tomato]","[peel carrots, add tomato, cook bacon]"


# Preprocessing ingredients

Using RapidFuzz’s `extractOne` fuzzy-matching function with a similarity threshold of 90 (on RapidFuzz's 0–100 score scale), we build a mapping from each raw ingredient token to its chosen canonical representative.

In [None]:
# Collect every distinct, non-empty ingredient string found in NER_clean.
all_cleaned_ings = set()
for entry in tqdm(full_dataset["NER_clean"], desc="Gathering unique ingredients"):
    if isinstance(entry, (list, np.ndarray)):
        parts = entry
    elif isinstance(entry, str):
        parts = entry.split(",")
    else:
        # Neither a sequence nor a string (e.g. a missing value): nothing to collect.
        continue

    trimmed = (part.strip() for part in parts)
    all_cleaned_ings.update(ing for ing in trimmed if ing)

# Sort instead of list(set): the iteration order of a set of strings changes
# between Python sessions (string hash randomisation), and the greedy fuzzy
# canonicalisation that consumes this list depends on the order it sees the
# ingredients in. A sorted list makes the resulting mapping reproducible.
all_cleaned_ings = sorted(all_cleaned_ings)
print(f"Total unique cleaned ingredients: {len(all_cleaned_ings)}")

In [None]:
# 2. Build the canonical mapping using fuzzy matching:
def build_canonical_dict(all_ingredients, threshold=90):
    """Greedily group near-duplicate ingredient strings under one canonical spelling.

    Ingredients are processed in the order given. Each one is compared with the
    canonical names chosen so far: if the best RapidFuzz match scores at least
    ``threshold`` the ingredient is mapped to that name, otherwise it becomes a
    new canonical name itself. The result therefore depends on the input order.

    Args:
        all_ingredients: iterable of ingredient strings.
        threshold: minimum score (RapidFuzz default scorer) for merging two strings.

    Returns:
        ``(ing_to_canonical, canonical_set)``: a dict mapping every input
        ingredient to its canonical name, and the set of canonical names.
    """
    # Insertion-ordered list of canonical names. Keeping it as a list avoids
    # rebuilding list(canonical_set) on every iteration and gives extractOne a
    # stable scan order for its choices.
    canonical_list = []
    ing_to_canonical = {}
    for ing in tqdm(all_ingredients, desc="Building canonical dictionary"):
        match = None
        if canonical_list:
            # score_cutoff makes extractOne return None when no candidate reaches
            # the threshold, and lets RapidFuzz skip hopeless candidates early.
            match = process.extractOne(ing, canonical_list, score_cutoff=threshold)
        if match is None:
            canonical_list.append(ing)
            ing_to_canonical[ing] = ing
        else:
            ing_to_canonical[ing] = match[0]
    return ing_to_canonical, set(canonical_list)

ing_to_canonical, canonical_set = build_canonical_dict(all_cleaned_ings, threshold=90)
print(f"Total unique canonical ingredients: {len(canonical_set)}")

In [None]:
def unify_ingredient_entry(entry, ing_map):
    """Replace each ingredient of one recipe entry by its canonical name.

    Args:
        entry: a list/ndarray of ingredient strings, or a single string that is
            split on commas. Any other value is returned untouched.
        ing_map: dict mapping a raw ingredient to its canonical name; names
            missing from the dict are kept as they are.

    Returns:
        A list of canonical names (whitespace-trimmed, empty items dropped), or
        ``entry`` itself when it is neither a sequence nor a string.
    """
    if isinstance(entry, str):
        raw_parts = entry.split(",")
    elif isinstance(entry, (list, np.ndarray)):
        raw_parts = entry
    else:
        return entry

    trimmed = (raw.strip() for raw in raw_parts)
    return [ing_map.get(name, name) for name in trimmed if name]

# `progress_apply` is not a built-in pandas method: tqdm only attaches it to
# Series/DataFrame once tqdm.pandas() has been called. Without this call the
# line below raises AttributeError on a fresh kernel.
tqdm.pandas(desc="Unifying ingredients")

# Canonicalise every recipe's ingredient list with the fuzzy-matching map.
full_dataset["NER_unified"] = full_dataset["NER_clean"].progress_apply(
    lambda e: unify_ingredient_entry(e, ing_to_canonical)
)

In [None]:
# Invert the raw -> canonical map into canonical -> [raw spellings merged into it].
inverted_dict = {}
for raw_ing, canonical_ing in ing_to_canonical.items():
    inverted_dict.setdefault(canonical_ing, []).append(raw_ing)

# Print every group so the merges can be inspected by eye.
for canonical, raw_list in inverted_dict.items():
    print(f"{canonical}: {raw_list}")

In [None]:
# Persist the dataset (now including NER_unified) without the index column.
ingr_preprocessed_csv = '/content/drive/MyDrive/NLP_Fanta_Projactt/RecipeNLG/DATASET/ingr_preprocessed.csv'
full_dataset.to_csv(ingr_preprocessed_csv, index=False)

# Embeddings

## Embeddings Ingredients

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

In [None]:
def _ingredients_to_doc(value):
    """Turn one recipe's unified ingredients into a comma-separated document string.

    Lists/arrays are joined with ", "; strings are kept as they are. Anything
    else (e.g. a missing value that passed through the unification step
    untouched) becomes an empty document, because TfidfVectorizer rejects
    non-string documents such as NaN.
    """
    if isinstance(value, (list, np.ndarray)):
        return ", ".join(value)
    if isinstance(value, str):
        return value
    return ""


full_dataset["ingredient_str"] = full_dataset["NER_unified"].apply(_ingredients_to_doc)

# One document per recipe, in dataset row order (row i of the TF-IDF matrix).
ingredient_docs = full_dataset["ingredient_str"].tolist()

TF-IDF assigns low weights to common ingredients (salt, water) and higher weights to rarer ones (saffron, miso). Next, we load the all-MiniLM-L6-v2 SentenceTransformer and compute an embedding for each ingredient token that survives the TF-IDF vocabulary filter (`min_df` / `max_df`). For every recipe we look up the TF-IDF score of each of its tokens, retrieve that token's sentence embedding, and take the TF-IDF-weighted average; a recipe with no in-vocabulary token falls back to the embedding of its space-joined ingredient string. The result is a single 384-dimensional vector per recipe (`ingredient_embeddings`) that captures both the semantic meaning of each ingredient and its relative distinctiveness within the corpus.








### TF_IDF

In [None]:
def comma_tokenizer(text):
    """Split ``text`` on commas and return the trimmed, non-empty pieces in order."""
    pieces = (chunk.strip() for chunk in text.split(","))
    return [piece for piece in pieces if piece]

In [None]:
# TF-IDF over whole ingredient names: each comma-separated item is one term.
vectorizer = TfidfVectorizer(
    tokenizer=comma_tokenizer,
    # token_pattern is ignored when a custom tokenizer is given; passing None
    # avoids scikit-learn's "token_pattern will not be used" UserWarning.
    token_pattern=None,
    min_df=2,           # drop terms that appear in fewer than 2 recipes
    max_df=0.85,        # drop terms that appear in more than 85% of recipes
    ngram_range=(1, 1),
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False
)

tfidf_matrix = vectorizer.fit_transform(ingredient_docs)
vocab = vectorizer.get_feature_names_out()  # column index -> term
vocab_set = set(vocab)                      # for O(1) membership tests



### BERT

In [None]:
# Sentence encoder shared by the ingredient, verb, pair and title embeddings.
sentence_model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(sentence_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import weakref

# Memo of single-token embeddings: model -> {token: vector}. Weak keys let a
# discarded model (and the vectors cached for it) be garbage-collected.
_TOKEN_EMBEDDING_CACHE = weakref.WeakKeyDictionary()


def _encode_token_cached(token, model):
    """Return ``model.encode(token)``, encoding each (model, token) pair only once."""
    try:
        per_model = _TOKEN_EMBEDDING_CACHE.setdefault(model, {})
    except TypeError:
        # Model objects that are unhashable or cannot be weakly referenced
        # cannot serve as cache keys; fall back to encoding every time.
        return model.encode(token)
    if token not in per_model:
        per_model[token] = model.encode(token)
    return per_model[token]


def get_weighted_embedding(tokens, tfidf_scores, model, vocab_set):
    """
    Return the TF-IDF-weighted average embedding of one recipe's tokens.

    The same ingredient occurs in many recipes, so per-token embeddings are
    memoised instead of calling ``model.encode`` once per token per recipe.

    tokens: list of ingredient tokens (already cleaned)
    tfidf_scores: dictionary mapping token to its TF-IDF weight for one recipe
    model: pre-trained SentenceTransformer for token embeddings
    vocab_set: set containing tokens in the vocabulary (for quick check)

    Returns the weighted mean of the in-vocabulary tokens' embeddings. When no
    token is in the vocabulary, or all weights are zero, returns the embedding
    of the space-joined tokens instead.
    """
    token_embeddings = []
    weights = []
    for token in tokens:
        if token in vocab_set:
            token_embeddings.append(_encode_token_cached(token, model))
            # Tokens without a TF-IDF entry for this recipe contribute weight 0.
            weights.append(tfidf_scores.get(token, 0.0))
    if token_embeddings and np.sum(weights) > 0:
        token_embeddings = np.array(token_embeddings)
        weights = np.array(weights).reshape(-1, 1)  # column vector, broadcasts over the embedding axis
        return np.sum(token_embeddings * weights, axis=0) / np.sum(weights)
    else:
        return model.encode(" ".join(tokens))

In [None]:
ingredient_embeddings = []

for idx, doc in enumerate(ingredient_docs):
    tokens = comma_tokenizer(doc)
    # Row idx of the TF-IDF matrix belongs to recipe idx (same order as ingredient_docs).
    row = tfidf_matrix[idx]
    # Read the row's stored (column, weight) pairs in one pass instead of
    # indexing the sparse matrix once per term, which is slow on sparse data.
    # Assumes the CSR layout that TfidfVectorizer produces.
    row_tfidf = dict(zip(vocab[row.indices], row.data))
    emb = get_weighted_embedding(tokens, row_tfidf, model, vocab_set)
    ingredient_embeddings.append(emb)

# Stack into one (n_recipes, embedding_dim) array.
ingredient_embeddings = np.array(ingredient_embeddings)
print("Shape of ingredient embeddings:", ingredient_embeddings.shape)

Shape of ingredient embeddings: (25758, 384)


In [None]:
titles = full_dataset["title"].tolist()

# The archive is keyed by recipe title and dict keys are unique, so when several
# recipes share a title only the last one's embedding survives. Report that
# instead of dropping embeddings silently.
n_overwritten = len(titles) - len(set(titles))
if n_overwritten:
    print(f"Warning: {n_overwritten} ingredient embeddings are overwritten "
          "because their recipe title is duplicated.")

embedding_dict = {t: e for t, e in zip(titles, ingredient_embeddings)}

# Each title becomes the name of one array inside the .npz archive.
np.savez('/content/drive/MyDrive/NLP_Fanta_Projactt/RecipeNLG/DATASET/ingredient_embeddings_by_title.npz',
         **embedding_dict)

## Embeddings Directions

### TF_IDF

In [None]:
# One comma-separated string of verbs per recipe: sequences are joined with
# ", ", anything else is converted with str().
verbs_str_values = [
    ", ".join(verbs) if isinstance(verbs, (list, np.ndarray)) else str(verbs)
    for verbs in full_dataset["set_verbs"]
]
full_dataset["verbs_str"] = verbs_str_values

# Documents in dataset row order; row i of the verb TF-IDF matrix is recipe i.
verbs_docs = full_dataset["verbs_str"].tolist()

In [None]:
def comma_tokenizer(text):
    """Split ``text`` on commas and return the trimmed, non-empty tokens."""
    return [t.strip() for t in text.split(",") if t.strip()]

# TF-IDF over the verbs of each recipe: every comma-separated verb is one term.
verb_vectorizer = TfidfVectorizer(
    tokenizer=comma_tokenizer,
    # token_pattern is ignored when a custom tokenizer is given; passing None
    # avoids scikit-learn's "token_pattern will not be used" UserWarning.
    token_pattern=None,
    min_df=2,       # drop verbs that appear in fewer than 2 recipes
    max_df=0.85,    # drop verbs that appear in more than 85% of recipes
    ngram_range=(1,1),
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False
)

verb_tfidf = verb_vectorizer.fit_transform(verbs_docs)
verb_vocab  = verb_vectorizer.get_feature_names_out()  # column index -> verb
verb_vocab_set = set(verb_vocab)




### Bert

In [None]:
# Encode every vocabulary verb once (batched), then index the vectors by verb
# so the per-recipe step is a dictionary lookup.
token_embeds = model.encode(verb_vocab, batch_size=64, show_progress_bar=True)
embed_map = {verb: vector for verb, vector in zip(verb_vocab, token_embeds)}

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
def weighted_verb_embedding(doc_idx, doc_text):
    """Return the TF-IDF-weighted mean embedding of one document's tokens.

    Reads the notebook globals ``verb_tfidf``, ``verb_vocab``, ``embed_map``,
    ``comma_tokenizer`` and ``model`` as they are bound at call time.

    Args:
        doc_idx: row of ``verb_tfidf`` that belongs to this document.
        doc_text: the document's comma-separated token string.

    Returns:
        The weighted average of the tokens' precomputed embeddings. If no token
        has an embedding, or all weights are zero, the embedding of the
        space-joined tokens is returned instead.
    """
    tokens = comma_tokenizer(doc_text)
    row = verb_tfidf[doc_idx]
    # Read the row's stored (column, weight) pairs in one pass instead of
    # indexing the sparse matrix once per term. Assumes the CSR layout that
    # TfidfVectorizer produces.
    tfidf_scores = dict(zip(verb_vocab[row.indices], row.data))
    emb_list, w_list = [], []
    for tok in tokens:
        if tok in embed_map:
            emb_list.append(embed_map[tok])
            # Tokens without a TF-IDF entry for this row get weight 0.
            w_list.append(tfidf_scores.get(tok, 0.0))
    if emb_list and np.sum(w_list) > 0:
        embs = np.vstack(emb_list)
        ws   = np.array(w_list)[:, None]  # column vector, broadcasts over the embedding axis
        return (embs * ws).sum(axis=0) / ws.sum()
    return model.encode(" ".join(tokens))

In [None]:
# One weighted embedding per recipe, stacked into an (n_recipes, dim) array.
verb_embeddings = np.vstack(
    [weighted_verb_embedding(row_idx, verbs_doc) for row_idx, verbs_doc in enumerate(verbs_docs)]
)
print("Shape of verb embeddings:", verb_embeddings.shape)

Shape of verb embeddings: (25758, 384)


In [None]:
titles = full_dataset["title"].tolist()

# Keys of the archive are recipe titles; a dict keeps one value per key, so
# recipes with a repeated title would lose all but their last embedding.
# Make that visible rather than silent.
n_overwritten = len(titles) - len(set(titles))
if n_overwritten:
    print(f"Warning: {n_overwritten} verb embeddings are overwritten "
          "because their recipe title is duplicated.")

embedding_dict = {t: e for t, e in zip(titles, verb_embeddings)}

np.savez('/content/drive/MyDrive/NLP_Fanta_Projactt/RecipeNLG/DATASET/verb_embeddings_by_title.npz',
         **embedding_dict)

## Embeddings Set Pairs

### TF_IDF

In [None]:
def _pairs_to_doc(value):
    """Join a sequence of verb-object pairs with ", "; convert anything else with str()."""
    if isinstance(value, (list, np.ndarray)):
        return ", ".join(value)
    return str(value)


full_dataset["pairs_str"] = full_dataset["pairs_set"].apply(_pairs_to_doc)

# NOTE: this rebinds verbs_docs, which from here on holds the pair documents.
verbs_docs = full_dataset["pairs_str"].tolist()

In [None]:
def comma_tokenizer(text):
    """Split ``text`` on commas and return the trimmed, non-empty tokens."""
    return [token.strip() for token in text.split(",") if token.strip()]

# TF-IDF over the verb-object pairs of each recipe. The variable names are
# carried over from the verbs section, but the terms here are whole pairs.
verb_vectorizer = TfidfVectorizer(
    tokenizer=comma_tokenizer,
    # token_pattern is ignored when a custom tokenizer is given; passing None
    # avoids scikit-learn's "token_pattern will not be used" UserWarning.
    token_pattern=None,
    min_df=2,       # drop pairs that appear in fewer than 2 recipes
    max_df=0.85,    # drop pairs that appear in more than 85% of recipes
    ngram_range=(1,1), # one term per comma-separated token, i.e. a whole pair such as "add horseradish"
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False
)

verb_tfidf = verb_vectorizer.fit_transform(verbs_docs)
verb_vocab  = verb_vectorizer.get_feature_names_out()  # column index -> pair
verb_vocab_set = set(verb_vocab)



### Bert

In [None]:
# Encode every vocabulary pair once (batched) and index the vectors by pair.
token_embeds = model.encode(verb_vocab, batch_size=64, show_progress_bar=True)
embed_map = {pair: vector for pair, vector in zip(verb_vocab, token_embeds)}

def weighted_verb_embedding(doc_idx, doc_text):
    """Return the TF-IDF-weighted mean embedding of one document's tokens.

    Reads the notebook globals ``verb_tfidf``, ``verb_vocab``, ``embed_map``,
    ``comma_tokenizer`` and ``model`` as they are bound at call time.

    Args:
        doc_idx: row of ``verb_tfidf`` that belongs to this document.
        doc_text: the document's comma-separated token string.

    Returns:
        The weighted average of the tokens' precomputed embeddings, or, when no
        token has an embedding or every weight is zero, the embedding of the
        space-joined tokens.
    """
    # get tokens
    tokens = comma_tokenizer(doc_text)
    # get tfidf weights for this row: take the stored (column, weight) pairs of
    # the sparse row directly rather than indexing the matrix once per term
    # (assumes the CSR layout that TfidfVectorizer produces)
    row = verb_tfidf[doc_idx]
    tfidf_scores = dict(zip(verb_vocab[row.indices], row.data))
    # accumulate embeddings together with their weights (0 when a token has no entry)
    emb_list, w_list = [], []
    for tok in tokens:
        if tok in embed_map:
            emb_list.append(embed_map[tok])
            w_list.append(tfidf_scores.get(tok, 0.0))
    if emb_list and np.sum(w_list) > 0:
        embs = np.vstack(emb_list)
        ws   = np.array(w_list)[:, None]  # column vector, broadcasts over the embedding axis
        return (embs * ws).sum(axis=0) / ws.sum()
    return model.encode(" ".join(tokens))

Batches:   0%|          | 0/120 [00:00<?, ?it/s]

In [None]:
# One weighted embedding per recipe for the pair documents, stacked row-wise.
# The names verb_embeddings / verbs_docs are reused from the verbs section.
per_recipe_vectors = [
    weighted_verb_embedding(row_idx, pairs_doc)
    for row_idx, pairs_doc in enumerate(verbs_docs)
]
verb_embeddings = np.vstack(per_recipe_vectors)
print("Shape of verb embeddings:", verb_embeddings.shape)

Shape of verb embeddings: (25758, 384)


In [None]:
titles = full_dataset["title"].tolist()

# Keys of the archive are recipe titles; a dict keeps one value per key, so
# recipes with a repeated title would lose all but their last embedding.
# Make that visible rather than silent.
n_overwritten = len(titles) - len(set(titles))
if n_overwritten:
    print(f"Warning: {n_overwritten} pair embeddings are overwritten "
          "because their recipe title is duplicated.")

# verb_embeddings holds the pair embeddings at this point in the notebook.
embedding_dict = {t: e for t, e in zip(titles, verb_embeddings)}

np.savez('/content/drive/MyDrive/NLP_Fanta_Projactt/RecipeNLG/DATASET/pairs_embeddings_by_title.npz',
         **embedding_dict)

## Preprocessing/Embeddings of titles

In [None]:
def clean_title(text):
    """Normalise a recipe title for embedding.

    The value is converted with ``str()``, lower-cased, stripped of every
    character that is not an ASCII lowercase letter, a digit or whitespace,
    and finally trimmed of leading/trailing whitespace.
    """
    lowered = str(text).lower()
    return re.sub(r'[^a-z0-9\s]', '', lowered).strip()

# Element-wise normalisation of every title into a new title_clean column.
full_dataset["title_clean"] = full_dataset["title"].map(clean_title)

In [None]:
# Encode the cleaned titles in batches of 128; row i is the embedding of recipe i.
title_texts = list(full_dataset["title_clean"])
title_embeddings = model.encode(title_texts, batch_size=128, show_progress_bar=True)

Batches:   0%|          | 0/202 [00:00<?, ?it/s]

In [None]:
titles = full_dataset["title"].tolist()

# Keys of the archive are the original (uncleaned) recipe titles; a dict keeps
# one value per key, so recipes with a repeated title would lose all but their
# last embedding. Make that visible rather than silent.
n_overwritten = len(titles) - len(set(titles))
if n_overwritten:
    print(f"Warning: {n_overwritten} title embeddings are overwritten "
          "because their recipe title is duplicated.")

embedding_dict = {t: e for t, e in zip(titles, title_embeddings)}

np.savez('/content/drive/MyDrive/NLP_Fanta_Projactt/RecipeNLG/DATASET/title_embeddings_by_title.npz',
         **embedding_dict)