In [1]:
import numpy as np
import pandas as pd
import faiss
from typing import List, Dict, Any, Optional
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA
from transformers import pipeline
from langchain_community.vectorstores import FAISS
from langchain.chains.retrieval import create_retrieval_chain
from dotenv import load_dotenv
import os
from RecipeEmbedder import RecipeEmbedder
import logging
# Configure root logging once for the notebook: INFO and above, with a
# "timestamp - level - message" layout.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger; AdvancedRecipeRecommender hands it to RecipeEmbedder.
logger = logging.getLogger(__name__)

RuntimeError: Failed to import transformers.pipelines because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
All ufuncs must have type `numpy.ufunc`. Received (<ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>)

In [3]:
class AdvancedRecipeRecommender:
    """Recipe recommender built from four stages.

    1. FAISS nearest-neighbour retrieval over pre-computed recipe embeddings.
    2. Rule-based filtering on available ingredients and dietary preferences.
    3. Cross-encoder reranking of the surviving candidates.
    4. A short LLM-written explanation for every recommendation.
    """

    # Columns that must exist in the recipe pickle (a missing one raises KeyError).
    REQUIRED_COLUMNS = ('id', 'title', 'instructions_full', 'ingredients_clean', 'full_embedding')
    # Columns the filters and prompts read when the pickle provides them.
    OPTIONAL_COLUMNS = ('instructions', 'nutrition_per_100g')

    # (preference flag, nutrient key, value assumed when the nutrient is missing,
    #  name of the limit in ``preferences``, default limit,
    #  True when the nutrient must stay *below* the limit / False for *above*)
    _DIETARY_RULES = (
        ('low_carb', 'carbohydrate', 100, 'max_carbs', 15, True),
        ('high_protein', 'protein', 0, 'min_protein', 20, False),
        ('low_fat', 'fat', 100, 'max_fat', 10, True),
    )

    # (preference flag, phrase looked for in the free-text dietary preference)
    _PREFERENCE_PHRASES = (
        ('low_carb', 'low carb'),
        ('high_protein', 'high protein'),
        ('low_fat', 'low fat'),
    )

    def __init__(self, recipe_data_path: str, faiss_index_path: str):
        """Load the recipe table, the FAISS index and all models.

        Args:
            recipe_data_path: Path to a pickled DataFrame holding at least
                ``REQUIRED_COLUMNS``.
            faiss_index_path: Path to the serialized FAISS index.

        Raises:
            KeyError: If a required column is missing from the pickle.
        """
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        recipes = pd.read_pickle(recipe_data_path)
        # Keep the optional columns too: the dietary filter, the reranker and the
        # explanation prompt read them, and dropping them made those steps fail
        # with KeyError.
        optional = [col for col in self.OPTIONAL_COLUMNS if col in recipes.columns]
        self.df = recipes[list(self.REQUIRED_COLUMNS) + optional]
        # Rows of self.df are looked up by position, so the index is presumably
        # built in the same row order as the pickle - TODO confirm.
        self.faiss_index = faiss.read_index(faiss_index_path)

        load_dotenv()  # this loads variables from .env into the environment
        api_key = os.getenv("OPENAI_API_KEY")

        # Initialize LLM
        # NOTE(review): this is the completion-style OpenAI wrapper pointed at the
        # DeepSeek endpoint with no explicit model name; confirm the default model
        # is actually served there (a chat wrapper may be required).
        self.llm = OpenAI(openai_api_key=api_key,base_url="https://api.deepseek.com",temperature=0.2)

        # Initialize reranker (GPU availability is inferred from FAISS's GPU count)
        self.reranker = pipeline(
            "text-classification",
            model="cross-encoder/ms-marco-MiniLM-L-6-v2",
            device=0 if faiss.get_num_gpus() > 0 else -1
        )
        self.recipe_embedder = RecipeEmbedder(logger=logger)

    @staticmethod
    def _instructions_text(recipe) -> str:
        """Return the recipe's instruction text, or '' when none is available."""
        for column in ('instructions', 'instructions_full'):
            text = recipe.get(column)
            if isinstance(text, str):
                return text
        return ''

    def _describe_recipe(self, recipe) -> str:
        """Build the text shown to the cross-encoder for one recipe row."""
        return (
            f"Title: {recipe['title']}\n"
            f"Ingredients: {', '.join(recipe['ingredients_clean'])}\n"
            f"Instructions: {self._instructions_text(recipe)[:200]}...\n"
            f"Nutrition: {recipe.get('nutrition_per_100g', 'N/A')}"
        )

    def create_query_embedding(self,
                              ingredients: List[str],
                              dietary_preference: str,
                              nutrition_goals: Optional[Dict[str, float]] = None) -> np.ndarray:
        """Embed the user request by delegating to ``RecipeEmbedder``."""
        return self.recipe_embedder.create_query_embedding(
            ingredients=ingredients,
            dietary_preference=dietary_preference,
            nutrition_goals=nutrition_goals
        )

    def retrieve_candidate_recipes(self,
                                  query_vector: np.ndarray,
                                  top_k: int = 50) -> pd.DataFrame:
        """Retrieve initial candidates using FAISS.

        Args:
            query_vector: Query embedding, either 1-D ``(dim,)`` or 2-D ``(1, dim)``.
            top_k: Number of neighbours requested from the index.

        Returns:
            Copy of the matching rows of ``self.df`` with a ``similarity_score``
            column holding the scores returned by the index.
        """
        # FAISS expects a contiguous float32 matrix with one query per row.
        query = np.ascontiguousarray(np.asarray(query_vector, dtype="float32"))
        if query.ndim == 1:
            query = query.reshape(1, -1)

        scores, ids = self.faiss_index.search(query, top_k)

        # The index pads with -1 when it holds fewer than top_k vectors; passing
        # -1 to iloc would silently return the last recipe instead.
        found = ids[0] >= 0
        results = self.df.iloc[ids[0][found]].copy()
        results["similarity_score"] = scores[0][found]
        return results

    def filter_by_ingredients(self,
                             candidates: pd.DataFrame,
                             available_ingredients: List[str],
                             max_missing: int = 2) -> pd.DataFrame:
        """Filter recipes by available ingredients.

        A recipe ingredient counts as matched when any available ingredient is a
        substring of it (case-insensitive).

        Args:
            candidates: Frame with ``ingredients_clean`` and ``similarity_score``.
            available_ingredients: Ingredients the user has; blanks are ignored.
            max_missing: Maximum number of unmatched recipe ingredients allowed.

        Returns:
            New frame (the input is left untouched) with ``ingredient_match``,
            ``missing_count`` and ``combined_score`` columns, sorted by
            ``combined_score`` descending.
        """
        # Normalise once; an empty string would be a substring of everything.
        available = {
            item.strip().lower()
            for item in available_ingredients
            if item and item.strip()
        }

        def match_stats(recipe_ingredients):
            recipe_set = {item.lower() for item in recipe_ingredients}
            matched = sum(
                1 for wanted in recipe_set if any(have in wanted for have in available)
            )
            match_percent = matched / len(recipe_set) if recipe_set else 0
            return match_percent, len(recipe_set) - matched

        scored = candidates.copy()
        stats = [match_stats(items) for items in scored['ingredients_clean']]
        # Explicit dtypes keep the columns well-typed even when there are no rows.
        scored['ingredient_match'] = np.array([pct for pct, _ in stats], dtype=float)
        scored['missing_count'] = np.array([miss for _, miss in stats], dtype=int)

        filtered = scored[scored['missing_count'] <= max_missing].copy()

        # NOTE(review): assumes a higher similarity_score is better (e.g. an
        # inner-product index); an L2 index returns distances - confirm.
        filtered['combined_score'] = (
            filtered['similarity_score'] * 0.6 +
            filtered['ingredient_match'] * 0.4
        )
        return filtered.sort_values('combined_score', ascending=False)

    def filter_by_dietary_preferences(self,
                                     candidates: pd.DataFrame,
                                     preferences: Dict[str, Any]) -> pd.DataFrame:
        """Apply dietary filters based on preferences.

        Args:
            candidates: Candidate recipes; ``nutrition_per_100g`` is expected to
                hold a dict per row.
            preferences: Flags ``low_carb`` / ``high_protein`` / ``low_fat`` and
                optional limits ``max_carbs`` (15), ``min_protein`` (20),
                ``max_fat`` (10).

        Returns:
            Filtered copy of ``candidates``. Rows whose nutrition entry is not a
            dict, or lacks the nutrient, fall back to the excluding default. When
            the nutrition column is absent the filters are skipped with a warning.
        """
        filtered = candidates.copy()
        active_rules = [rule for rule in self._DIETARY_RULES if preferences.get(rule[0])]
        if not active_rules:
            return filtered

        if 'nutrition_per_100g' not in filtered.columns:
            logging.getLogger(__name__).warning(
                "No 'nutrition_per_100g' column available; dietary filters skipped"
            )
            return filtered

        for _flag, nutrient, fallback, limit_key, default_limit, below in active_rules:
            limit = preferences.get(limit_key, default_limit)

            def passes(nutrition):
                value = nutrition.get(nutrient, fallback) if isinstance(nutrition, dict) else fallback
                return value < limit if below else value > limit

            keep = np.array(
                [passes(nutrition) for nutrition in filtered['nutrition_per_100g']],
                dtype=bool,
            )
            filtered = filtered[keep]

        return filtered

    def rerank_with_llm(self,
                       candidates: pd.DataFrame,
                       user_query: str,
                       top_k: int = 10,
                       pool_size: int = 50) -> pd.DataFrame:
        """Rerank top candidates using a cross-encoder.

        Args:
            candidates: Candidates, already ordered by the earlier stages.
            user_query: Natural-language description of the request.
            top_k: Number of recipes to return.
            pool_size: How many leading candidates are scored by the reranker.

        Returns:
            ``candidates`` unchanged when it has at most ``top_k`` rows, otherwise
            a copy of the ``top_k`` best-scored rows of the pool.
        """
        if len(candidates) <= top_k:
            return candidates

        pool = candidates.head(pool_size)

        # Text pairs must be passed as {"text", "text_pair"} dicts; a bare tuple
        # is not interpreted as a (query, document) pair by the pipeline.
        pairs = [
            {"text": user_query, "text_pair": self._describe_recipe(recipe)}
            for _, recipe in pool.iterrows()
        ]
        scores = self.reranker(pairs)

        best_first = sorted(
            range(len(scores)),
            key=lambda pos: scores[pos]['score'],
            reverse=True
        )[:top_k]
        return pool.iloc[best_first].copy()

    def generate_recipe_explanation(self,
                                   recipe: pd.Series,
                                   user_query: str) -> str:
        """Generate an explanation for why this recipe was recommended.

        Args:
            recipe: One recommendation row (needs ``title`` and
                ``ingredients_clean``; nutrition and instructions are optional).
            user_query: Natural-language description of the request.

        Returns:
            Whatever the configured LLM returns for the prompt.
        """
        instructions = self._instructions_text(recipe)
        nutrition = recipe.get('nutrition_per_100g', 'N/A')
        prompt = f"""
        User is looking for: {user_query}
        
        You recommended the recipe "{recipe['title']}" which has these characteristics:
        - Ingredients: {', '.join(recipe['ingredients_clean'])}
        - Nutrition: {nutrition}
        - Instructions: {instructions[:100]}...
        
        Write a brief explanation (2-3 sentences) for why this recipe matches their needs:
        """

        # Prefer the Runnable API; calling the LLM object directly is deprecated
        # in recent LangChain releases. Fall back for objects without invoke().
        invoke = getattr(self.llm, "invoke", None)
        if callable(invoke):
            return invoke(prompt)
        return self.llm(prompt)

    def recommend(self,
                 ingredients: List[str],
                 dietary_preference: str,
                 nutrition_goals: Optional[Dict[str, float]] = None,
                 max_missing: int = 2,
                 top_k: int = 5) -> Dict[str, Any]:
        """End-to-end recommendation pipeline.

        Args:
            ingredients: Ingredients the user has available.
            dietary_preference: Free text; the phrases "low carb", "high protein"
                and "low fat" switch on the matching dietary filters.
            nutrition_goals: Optional targets forwarded to the query embedding.
            max_missing: Maximum number of recipe ingredients the user may lack.
            top_k: Number of recommendations to return.

        Returns:
            Dict with ``query``, ``total_candidates``, ``filtered_count`` and
            ``recommendations`` (list of record dicts, each with an
            ``explanation``).
        """
        # 1. Create query embedding
        query_vector = self.create_query_embedding(
            ingredients, dietary_preference, nutrition_goals
        )

        # 2. Retrieve initial candidates
        candidates = self.retrieve_candidate_recipes(query_vector, top_k=100)

        # 3. Apply ingredient filter
        filtered = self.filter_by_ingredients(
            candidates, ingredients, max_missing=max_missing
        )

        # 4. Apply dietary preferences filter
        preference_text = dietary_preference.lower()
        preferences = {
            flag: True
            for flag, phrase in self._PREFERENCE_PHRASES
            if phrase in preference_text
        }
        filtered = self.filter_by_dietary_preferences(filtered, preferences)

        # 5. Rerank with the cross-encoder
        user_query = f"I have {', '.join(ingredients)} and I want {dietary_preference} recipes"
        reranked = self.rerank_with_llm(filtered, user_query, top_k=top_k)

        # 6. Add explanations (on a copy: reranked may be the same object as filtered)
        if not reranked.empty:
            reranked = reranked.copy()
            reranked['explanation'] = [
                self.generate_recipe_explanation(row, user_query)
                for _, row in reranked.iterrows()
            ]

        return {
            "query": user_query,
            "total_candidates": len(candidates),
            "filtered_count": len(filtered),
            "recommendations": reranked.to_dict('records') if not reranked.empty else []
        }

In [4]:
# Example usage
def format_recommendation(rank: int, recipe: Dict[str, Any]) -> str:
    """Render one recommendation record as the multi-line text shown to the user.

    Args:
        rank: 1-based position of the recipe in the result list.
        recipe: Record dict from ``recommend()``; must contain ``title``,
            ``explanation`` and ``ingredients_clean``. Instructions are read from
            ``instructions`` or ``instructions_full`` and nutrition from
            ``nutrition_per_100g`` when present.

    Returns:
        The formatted block, starting with a blank line.
    """
    # The recommender's table stores the text under 'instructions_full';
    # accept either column name instead of failing with KeyError.
    instructions = recipe.get('instructions') or recipe.get('instructions_full') or ''
    if not isinstance(instructions, str):
        instructions = ''

    # Nutrition may be absent from the records; fall back to "N/A" values.
    nutrition = recipe.get('nutrition_per_100g')
    if not isinstance(nutrition, dict):
        nutrition = {}

    return "\n".join([
        f"\n{rank}. {recipe['title']}",
        f"Explanation: {recipe['explanation']}",
        f"Ingredients: {', '.join(recipe['ingredients_clean'][:5])}...",
        f"Instructions: {instructions[:100]}...",
        f"Nutrition (per 100g): protein={nutrition.get('protein', 'N/A')}g, carbs={nutrition.get('carbohydrate', 'N/A')}g",
    ])


if __name__ == "__main__":
    recommender = AdvancedRecipeRecommender(
        recipe_data_path="embeddings/full_recipe_embeddings.pkl",
        faiss_index_path="recipe_faiss_index.idx"
    )

    results = recommender.recommend(
        ingredients=["chicken", "olive oil"],
        dietary_preference="high protein low carb",
        nutrition_goals={"protein": 30, "fat": 15},
        max_missing=2,
        top_k=5
    )

    print(f"Found {len(results['recommendations'])} recommendations")
    for rank, recipe in enumerate(results['recommendations'], start=1):
        print(format_recommendation(rank, recipe))

  self.llm = OpenAI(openai_api_key=api_key,base_url="https://api.deepseek.com",temperature=0.2)
Device set to use cpu
2025-04-24 01:01:19,188 - INFO - Creating query embedding...
2025-04-24 01:01:19,188 - INFO - Creating embeddings for 1 texts with batch size 32...


device: cpu


  0%|          | 0/1 [00:00<?, ?it/s]

: 

In [1]:
import os

# Print 'True' when TRANSFORMERS_OFFLINE is defined in the environment
# (with any value, including an empty one); print nothing otherwise.
if os.getenv("TRANSFORMERS_OFFLINE") is not None:
    print('True')

In [4]:
from transformers import AutoTokenizer

# Smoke test: check that the "bert-base-uncased" tokenizer can be loaded through
# transformers. Only the tokenizer is requested here, although the message below
# says "Model".
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print("Model downloaded successfully!")

Model downloaded successfully!
