In [12]:
import numpy as np
import pandas as pd

# Columns present in the pickled frame (as reported by its `.columns`):
#   id, title, ingredients_raw, instructions_full, ingredients_clean,
#   title_nutrition, url_nutrition, instructions, ingredients_structured,
#   nutrition_per_100g, fsa_lights_per_100g, clean_title, clean_instructions,
#   text_for_embedding, nutrition_dict, nutrition_features, fsa_dict,
#   fsa_features, nutrition_energy, nutrition_fat, nutrition_protein,
#   nutrition_salt, nutrition_saturates, nutrition_sugars, nutrition_vector,
#   fsa_fat, fsa_saturates, fsa_sugars, fsa_salt, fsa_vector,
#   title_instruction_embedding, enhanced_nutrition_embedding,
#   enhanced_fsa_embedding, partial_combined_embedding,
#   ingredients_embedding, full_embedding

# Load the precomputed embeddings and keep only the five columns this
# notebook works with; the selection yields a new, narrower frame.
all_embeddings = pd.read_pickle("embeddings/full_recipe_embeddings.pkl").loc[
    :, ['id', 'title', 'instructions_full', 'ingredients_clean', 'full_embedding']
]
all_embeddings.head(1)

Unnamed: 0,id,title,instructions_full,ingredients_clean,full_embedding
0,000095fc1d,Yogurt Parfaits,Layer all ingredients in a serving dish.,"[non - fat vanilla yogurt, strawberries, low -...","[0.007353680208325386, -0.1631745994091034, -0..."


In [13]:
def create_query_embedding(self, ingredients, dietary_preference, nutrition_goals=None):
    """
    Create a query embedding of length 1177 to match the recipe full_embedding.

    The vector is assembled from four segments whose widths add up to 1177:
    dietary-preference text (768), nutrition (15), FSA placeholder (9) and
    ingredients (385).

    Parameters:
    - ingredients: list of strings. Not referenced in the visible part of the
      body; presumably consumed by the ingredient step that is cut off below.
    - dietary_preference: string, embedded via self.create_text_embeddings.
    - nutrition_goals: optional dict. Recognised keys are
        'energy', 'fat', 'saturates', 'carbohydrate', 'sugars',
        'fibre', 'protein', 'salt'
      Any key that is missing is filled with 0. When the argument is None the
      whole nutrition segment is a zero vector.

    Returns:
    - query_embedding: np.ndarray of shape (1177,)
      NOTE(review): the return statement is not visible in this cell; the
      shape is taken from the original docstring -- confirm against the
      full implementation.
    """
    logger.info("Creating query embedding...")

    # --- Step 1: Dietary preference embedding (768)
    # The helper takes a batch of texts; a one-element batch is passed and
    # its single result is taken.
    preference_emb = self.create_text_embeddings([dietary_preference])[0]

    # --- Step 2: Nutrition embedding (15)
    if nutrition_goals is not None:
        # Fixed feature order; goals the user did not specify default to 0.
        keys = ['energy', 'fat', 'saturates', 'carbohydrate', 'sugars', 'fibre', 'protein', 'salt']
        raw_nutrition = [nutrition_goals.get(k, 0) for k in keys]
        # Eight raw values go in; the helper is expected to return the
        # 15-wide representation -- TODO confirm in RecipeEmbedder.
        derived_nutrition = self.create_nutrition_embeddings([raw_nutrition])[0]
    else:
        # No goals supplied: neutral all-zero nutrition segment.
        derived_nutrition = np.zeros(15)

    # --- Step 3: FSA (keep dummy unless explicitly provided later) (9)
    dummy_fsa = np.zeros(9)

    # --- Step 4: Ingredient embedding (385)
    ingredient_text = " "._
    # NOTE(review): the cell text is cut off mid-expression on the line above.
    # The rest of the ingredient step, the combination of the four segments
    # and the return statement are not visible here.


In [14]:
import faiss
import numpy as np

def recommend_recipes(
    ingredients,
    dietary_preference,
    embedder,
    df,
    faiss_index_path="recipe_faiss_index.idx",
    nutrition_goals=None,
    top_k=10
):
    """
    Search for recipes matching user query.

    The query is embedded with ``embedder.create_query_embedding``, looked up
    in the FAISS index stored at ``faiss_index_path``, and the hits are mapped
    back to rows of ``df`` by position.

    Parameters:
    - ingredients (list of str): List of ingredients user has
    - dietary_preference (str): e.g., 'low carb'
    - embedder (RecipeEmbedder): Initialized embedder class; only its
      ``create_query_embedding`` method is used here
    - df (pd.DataFrame): DataFrame with full embeddings. Rows are selected
      with ``iloc``, so row *position* i must correspond to vector i of the
      index (the DataFrame's own index labels are irrelevant)
    - faiss_index_path (str): Path to FAISS index file
    - nutrition_goals (dict): Optional nutrition goal inputs
    - top_k (int): Number of results to request from the index

    Returns:
    - pd.DataFrame: a copy of the matching rows of ``df`` in the order the
      index returned them, with an added ``similarity_distance`` column
      holding the values returned by ``index.search``. It contains fewer
      than ``top_k`` rows when the index cannot supply that many neighbours.

    Raises:
    - ValueError: if the query vector length differs from the index dimension.
    """
    # Step 1: Create query embedding.
    # FAISS expects a C-contiguous float32 matrix of shape (n_queries, dim);
    # converting up front also accepts embedders that return lists.
    query_vector = np.ascontiguousarray(
        embedder.create_query_embedding(
            ingredients=ingredients,
            dietary_preference=dietary_preference,
            nutrition_goals=nutrition_goals,
        ),
        dtype="float32",
    ).reshape(1, -1)

    # Step 2: Load FAISS index
    index = faiss.read_index(faiss_index_path)

    # Fail with a readable message instead of an opaque FAISS assertion when
    # the embedder and the stored index were built with different widths.
    if query_vector.shape[1] != index.d:
        raise ValueError(
            f"Query embedding has dimension {query_vector.shape[1]}, "
            f"but the FAISS index expects {index.d}"
        )

    # Step 3: Perform similarity search
    distances, positions = index.search(query_vector, top_k)
    distances, positions = distances[0], positions[0]

    # FAISS pads the result with label -1 when it finds fewer than top_k
    # neighbours. Passing -1 to iloc would silently return the LAST row of
    # df as a bogus hit, so those slots are dropped.
    found = positions >= 0

    # Step 4: Retrieve matching recipes from df
    results = df.iloc[positions[found]].copy()
    results["similarity_distance"] = distances[found]

    return results


In [15]:
print('Adding directory')
import os
import sys
import logging
import torch

# RecipeEmbedder lives in a sibling folder, so that folder has to be on
# sys.path before the import below; it is appended only once.
module_path = os.path.abspath(os.path.join('..', '99_6.C01'))
if module_path not in sys.path:
    sys.path.append(module_path)

from RecipeEmbedder import RecipeEmbedder

# Notebook-wide logging: timestamped INFO-level messages.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
logger.info(f"Using device: {device}")

# Run one example query against the prebuilt FAISS index. `all_embeddings`
# and `recommend_recipes` come from earlier cells.
results_df = recommend_recipes(
    ingredients=["chicken", "olive oil"],
    dietary_preference="chicken, olive oil, mean high protein",
    nutrition_goals={"protein": 30, "fat": 10},
    top_k=5,
    embedder=RecipeEmbedder(logger = logger),
    df=all_embeddings,
    faiss_index_path="recipe_faiss_index.idx",
)

# Show only the human-readable columns of the top results.
results_df.loc[:, ["title", "instructions_full", "similarity_distance"]]


2025-04-24 00:28:00,842 - INFO - Using device: cpu
2025-04-24 00:28:01,008 - INFO - Creating query embedding...
2025-04-24 00:28:01,009 - INFO - Creating embeddings for 1 texts with batch size 32...


Adding directory
device: cpu


100%|██████████| 1/1 [00:00<00:00, 11.14it/s]
2025-04-24 00:28:01,101 - INFO - Creating enhanced nutrition embeddings...
2025-04-24 00:28:01,102 - INFO - Creating embeddings for 1 texts with batch size 32...
100%|██████████| 1/1 [00:00<00:00, 14.59it/s]


preference_emb: (768,)
derived_nutrition: (15,)
dummy_fsa: (9,)
ingredient_emb: (385,)
Total combined: (1177,)


Unnamed: 0,title,instructions_full,similarity_distance
18175,Apple Harvest Rice Recipe,"Cook carrots in margarine til tender-crisp, ab...",26.995068
11323,Low Carb Chicken Salad,Combine all ingredients and serve on whole gra...,27.027374
34035,"Olive, Thyme, and Orange Chicken","mix olives, thyme, orange, and red pepper. let...",27.203611
10746,High Protein Diet Bread,Follow manufacturer's directions for order of ...,27.27479
26230,Easy Flavorful Chicken Recipe,Heat Olive Oil in a large skillet over medium ...,27.280203


In [7]:
def filter_by_available_ingredients(df, allowed_ingredients, ingredient_col="ingredients_clean"):
    """
    Keep only the rows whose ingredient list mentions every given term.

    A row is kept when, for each term in ``allowed_ingredients``, at least one
    of the row's ingredient strings contains that term as a substring. The
    comparison is case-insensitive on both sides.

    Parameters:
    - df (pd.DataFrame): frame to filter.
    - allowed_ingredients (iterable of str): terms that must all be present.
      An empty iterable keeps every row.
    - ingredient_col (str): column holding each row's ingredients, normally a
      list of strings. A bare string is treated as a single ingredient; a
      missing or non-iterable cell (None, NaN) never matches.

    Returns:
    - pd.DataFrame: the matching rows of ``df``, in their original order.
    """
    # Normalise the search terms once instead of once per row.
    needles = [str(term).lower() for term in allowed_ingredients]

    def is_valid(row_ingredients):
        if not needles:
            # Nothing is required, so every row qualifies.
            return True
        if isinstance(row_ingredients, str):
            # Iterating a string would compare single characters.
            row_ingredients = [row_ingredients]
        try:
            haystack = [str(item).lower() for item in row_ingredients]
        except TypeError:
            # None / NaN / other non-iterable cell: no ingredients known.
            return False
        return all(any(needle in item for item in haystack) for needle in needles)

    # Force a boolean mask: on an empty frame `apply` yields an object-dtype
    # Series, which must not be mistaken for a list of column labels.
    mask = df[ingredient_col].apply(is_valid).astype(bool)
    return df[mask]

# Narrow the recommendations to recipes that mention both ingredients.
# `results_df` is created by the cell that calls recommend_recipes, so that
# cell has to run first.
filtered_results = filter_by_available_ingredients(
    df=results_df,
    allowed_ingredients=["chicken", "olive oil"],
)
filtered_results.head()

NameError: name 'results_df' is not defined

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

# Initialize the embedder with the specified model.
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'device: {device}')

# Hub identifier of the sentence-transformers model; a local directory such
# as "all-mpnet-base-v2" can be used instead.
model_path = "sentence-transformers/all-mpnet-base-v2"

# To run without network access, set HF_HUB_OFFLINE=1 and
# TRANSFORMERS_OFFLINE=1 (optionally TRANSFORMERS_NO_ADVISORY_LOCKING=1) in
# os.environ and pass local_files_only=True to both from_pretrained calls.
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)
model = model.to(device)

# Smoke test: tokenize a single word and display the resulting ids.
tokenizer(["hello"])

device: cpu


{'input_ids': [[0, 7596, 2]], 'attention_mask': [[1, 1, 1]]}