vector database

In [1]:
!pip install pandas chromadb sentence-transformers tqdm


Collecting chromadb
  Using cached chromadb-1.1.1-cp39-abi3-win_amd64.whl.metadata (7.4 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.3.0-py3-none-any.whl.metadata (5.6 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-win_amd64.whl.metadata (9.0 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Using cached posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.23.1-cp312-cp312-win_amd64.whl.metadata (5.2 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Using cached opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Using cached opentelemetry_exporter_otlp_proto_grpc-1.38.0-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chrom

In [None]:
# Optional Hugging Face authentication.
# The token is taken from the HF_TOKEN environment variable. Nothing in this
# cell loads a .env file, so the variable has to be present in the kernel's
# environment already. When it is unset or empty, the login step is skipped.
import os
from huggingface_hub import login

if (hf_token := os.getenv('HF_TOKEN')):
    login(token=hf_token)

In [2]:
import chromadb
from chromadb.utils import embedding_functions
from tqdm import tqdm

# On-disk Chroma store; the path is relative to the current working directory.
chroma_client = chromadb.PersistentClient(path="vector_new_db/recipes")

# Sentence-transformers wrapper around BAAI/bge-m3. It is called directly by
# the cells below to embed both documents and queries.
embedder = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="BAAI/bge-m3")

# The collection is created without an embedding_function, so every upsert
# and query in this notebook passes precomputed embeddings explicitly.
collection_settings = {"hnsw:space": "cosine"}
collection = chroma_client.get_or_create_collection(
    name="recipes",
    metadata=collection_settings,
)


In [4]:
import pandas as pd
import re

# Load the recipe table; the CSV path is relative to the working directory.
df = pd.read_csv("recipes-final.csv")

def clean_text(x):
    """Return ``x`` as a string with normalised whitespace.

    Values that ``pd.isna`` reports as missing become the empty string.
    Anything else is converted with ``str``; every run of whitespace is
    collapsed to one space and leading/trailing whitespace is removed.
    """
    if pd.isna(x):
        return ""
    collapsed = re.sub(r"\s+", " ", str(x))
    return collapsed.strip()

# Normalise every column to whitespace-cleaned strings (missing values -> "").
for column_name in df.columns:
    cleaned_column = df[column_name].map(clean_text)
    df[column_name] = cleaned_column

def build_doc(row):
    """Render the text that is embedded for one recipe.

    ``row`` is indexed by the keys ``Recipes_name``, ``Ingredients`` and
    ``Flavor``. The result is three ``Label: value`` lines in that order,
    each terminated by a newline.
    """
    labelled_fields = (
        ("Recipe Name", "Recipes_name"),
        ("Ingredients", "Ingredients"),
        ("Flavor", "Flavor"),
    )
    return "".join(f"{label}: {row[key]}\n" for label, key in labelled_fields)

# Text to embed for each recipe, plus an id derived from the row position.
df["doc_text"] = df.apply(build_doc, axis=1)
df["doc_id"] = [f"recipe_{position}" for position in range(len(df))]

BATCH_SIZE = 100
for start in tqdm(range(0, len(df), BATCH_SIZE)):
    chunk = df.iloc[start:start + BATCH_SIZE]
    chunk_docs = chunk["doc_text"].tolist()
    chunk_ids = chunk["doc_id"].tolist()

    # Metadata: not used for embedding, but available for query filtering
    # or for display.
    metadata_columns = [
        "Recipes_name",
        "Ingredients",
        "Flavor",
        "Difficulty",
        "Estimated Cooking Time",
        "Recipes",
    ]
    chunk_metas = chunk[metadata_columns].to_dict(orient="records")

    # Embeddings are computed here and handed to Chroma explicitly.
    chunk_vectors = embedder(chunk_docs)

    # upsert: rows whose id already exists are overwritten rather than duplicated.
    collection.upsert(
        ids=chunk_ids,
        documents=chunk_docs,
        embeddings=chunk_vectors,
        metadatas=chunk_metas,
    )

100%|██████████| 40/40 [12:31<00:00, 18.79s/it]
100%|██████████| 40/40 [12:31<00:00, 18.79s/it]


In [4]:
def search_recipes(query: str, top_k: int = 3):
    """Print the recipes in the collection that are closest to ``query``.

    The query is embedded with the module-level ``embedder`` and looked up in
    the module-level ``collection``. For every hit the recipe name, flavor,
    difficulty and cooking time are printed, followed by the first 120
    characters of the stored document. Nothing is returned.

    Args:
        query: Free-text user request.
        top_k: Maximum number of hits to request and print.
    """
    query_emb = embedder([query])[0]

    results = collection.query(
        query_embeddings=[query_emb],
        n_results=top_k
    )

    # A single query embedding was sent, so its hits are read from index 0.
    metadatas = results["metadatas"][0]
    documents = results["documents"][0]

    print(f"\n🔎 Query: {query}\n")
    # Iterate over the hits that actually came back: the store can return
    # fewer than top_k items, and indexing by range(top_k) would then raise
    # IndexError.
    for rank, (meta, doc_text) in enumerate(zip(metadatas, documents), start=1):
        name = meta["Recipes_name"]
        flavor = meta["Flavor"]
        diff = meta["Difficulty"]
        cook_time = meta["Estimated Cooking Time"]

        print(f"🍳 {rank}. {name} ({flavor}, {diff}, {cook_time})")
        print(f"    → {doc_text[:120]}...")
    print("\n")

# Try the retrieval helper on a free-text request (default top_k of 3).
search_recipes("I want an easy spicy dish with pork")



🔎 Query: I want an easy spicy dish with pork

🍳 1. Stir-Fried Pork with Onion (Spicy and savory, Easy, 20 mins)
    → Recipe Name: Stir-Fried Pork with Onion
Ingredients: Pork; Onion; Soy sauces; Chili paste
Flavor: Spicy and savory
Diffi...
🍳 2. Minced Pork with Tiger Peppers (Spicy and umami, Medium, 30 mins)
    → Recipe Name: Minced Pork with Tiger Peppers
Ingredients: Pork; Salt; Scallion; Green chili; Garlic; Soy sauces; Vinegar;...
🍳 3. Stir-Fried Pork with Chili (Spicy & Hearty, Simple, 25 min)
    → Recipe Name: Stir-Fried Pork with Chili
Ingredients: Pork; Green chili; Garlic; Ginger; Soy sauce; Oyster sauce
Flavor: ...




Set up the LLM

In [5]:
import os

import google.generativeai as genai

# The API key is read from the environment so that no credential is stored in
# the notebook. Fail early with a clear message instead of sending requests
# with an empty key.
gemini_api_key = os.getenv("GOOGLE_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this cell."
    )
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel("gemini-2.5-flash-lite")


In [6]:
# Display the model object's repr to check how it was configured.
model


genai.GenerativeModel(
    model_name='models/gemini-2.5-flash-lite',
    generation_config={},
    safety_settings={},
    tools=None,
    system_instruction=None,
    cached_content=None
)

Pass the database search results to Gemini

In [8]:
def search_recipes_rag(query: str, top_k: int = 3):
    """Answer ``query`` with Gemini, using retrieved recipes as context.

    The query is embedded with the module-level ``embedder`` and the nearest
    recipes are fetched from the module-level ``collection``. Their details
    are joined into a context string for the prompt. Two prompt modes exist:

    * Detail mode: the lower-cased name of a retrieved recipe occurs in the
      lower-cased query, so Gemini is asked for that recipe in full.
    * Recommendation mode: no name matches, so Gemini is asked to recommend
      dishes from the retrieved set.

    The generated text is printed; nothing is returned.

    Args:
        query: Free-text user request.
        top_k: Maximum number of recipes to retrieve as context.
    """
    query_emb = embedder([query])[0]
    results = collection.query(
        query_embeddings=[query_emb],
        n_results=top_k
    )

    retrieved_docs = []
    recipe_names = []
    # A single query embedding was sent, so its hits are read from index 0.
    # Iterate over the hits that actually came back; the store can return
    # fewer than top_k items and indexing by range(top_k) would then fail.
    hits = zip(results["metadatas"][0], results["documents"][0])
    for rank, (meta, doc_text) in enumerate(hits, start=1):
        name = meta["Recipes_name"]
        flavor = meta["Flavor"]
        diff = meta["Difficulty"]
        cook_time = meta["Estimated Cooking Time"]

        recipe_names.append(name.lower())
        retrieved_docs.append(
            f"Recipe {rank}: {name}\nFlavor: {flavor}, Difficulty: {diff}, Time: {cook_time}\n{doc_text}"
        )

    context = "\n\n".join(retrieved_docs)

    query_lower = query.lower()
    # First retrieved recipe whose name appears in the query. Empty names are
    # skipped because "" is a substring of every string and would always match.
    matched_recipe = next(
        (name for name in recipe_names if name and name in query_lower),
        None,
    )

    if matched_recipe:
        # Detail mode: the user named a specific recipe.
        prompt = f"""
You are a professional chef assistant.

User query:
{query}

You have found a recipe called "{matched_recipe}" in the database.

Here is the relevant recipe information:
{context}

Please provide the *complete detailed recipe* for "{matched_recipe}" —
including ingredients, step-by-step instructions, cooking tips,
and any flavor profile descriptions in natural English.
If the user mentioned preferences (like less spicy or easier), adjust the recipe accordingly.
"""
    else:
        # Recommendation mode: the user is describing what they want and is
        # looking for suggestions.
        prompt = f"""
You are a professional chef assistant.

User query:
{query}

Here are some related recipes from the database:
{context}

Please recommend 2–3 dishes that best match the user's request.
Explain why you chose them (consider ingredients, flavor, difficulty, and cooking time).
Output in natural English sentences.
"""

    # Let Gemini generate the answer.
    response = model.generate_content(contents=prompt)
    print("\n💬 Gemini Output:\n")
    print(response.text)


In [9]:
# Example request that names no specific recipe (default top_k of 3).
search_recipes_rag("I have pork and I want some sweet food.")


💬 Gemini Output:

Based on your request for a sweet pork dish, I have two excellent recommendations for you:

First, **Sweet and Sour Pork** is a classic choice. It directly fulfills your craving for sweetness with its sugar, ketchup, and pineapple-based sauce. The recipe is of medium difficulty and takes about 40 minutes, making it a manageable option for a flavorful meal.

Secondly, **Sweet and Sour Crispy Pork (Guo Bao Rou)** is another fantastic option. This dish also delivers on the sweet and sour profile you're looking for, with the added bonus of crispy texture from the double-frying technique. Like the first option, it's a medium difficulty recipe with an estimated cooking time of 40 minutes.

If you're looking for something a bit quicker and simpler, **Light Sweet Lychee Pork** would be a great choice. It offers a sweet and tender flavor profile, is rated as simple difficulty, and has the shortest cooking time at just 25 minutes.


In [10]:
# Environment diagnostics: which interpreter is running and where it looks
# for modules.
import importlib
import os
import sys

print("\n💡 Diagnostic info for environment\n")
for label, value in (
    ('sys.executable', sys.executable),
    ('sys.version', sys.version),
    ('sys.path (first 10)', sys.path[:10]),
):
    print(f'{label} =', value)

def show_pkg(name):
    """Import module ``name`` and print where it was loaded from and its version.

    Prints ``<name> -> <__file__>`` (or ``built-in or namespace`` when the
    module has no ``__file__``) followed by ``  version = <__version__>``
    (or ``unknown``). Any exception raised along the way is caught and
    reported as ``<name> import failed: <error>``.
    """
    try:
        module = importlib.import_module(name)
        location = getattr(module, '__file__', 'built-in or namespace')
        print(f'{name} ->', location)
        version = getattr(module, '__version__', 'unknown')
        print('  version =', version)
    except Exception as err:
        print(f'{name} import failed:', err)

import subprocess

# Report import location and version for the packages under investigation.
for package in ('typing_extensions', 'pydantic', 'pydantic_core', 'chromadb'):
    show_pkg(package)

# Guarded so that a missing typing_extensions is reported instead of aborting
# the rest of the diagnostics.
try:
    import typing_extensions as te
    print('typing_extensions.__file__ =', getattr(te, '__file__', None))
except ImportError as e:
    print('typing_extensions import failed:', e)


def _pip_show(package):
    """Print the output of ``pip show <package>`` for the running interpreter.

    pip is invoked through ``sys.executable -m pip``. Any failure, including
    a non-zero exit status from pip, is caught and printed instead of raised.
    """
    try:
        out = subprocess.check_output(
            [sys.executable, '-m', 'pip', 'show', package],
            text=True,
        )
        print(f'\npip show {package}:\n', out)
    except Exception as e:
        print(f'pip show {package} failed:', e)


for package in ('typing_extensions', 'pydantic'):
    _pip_show(package)


💡 Diagnostic info for environment

sys.executable = d:\Anaconda3\envs\lab-mushroom-chatbot\python.exe
sys.version = 3.12.11 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 12:58:53) [MSC v.1929 64 bit (AMD64)]
sys.path (first 10) = ['d:\\Anaconda3\\envs\\lab-mushroom-chatbot\\python312.zip', 'd:\\Anaconda3\\envs\\lab-mushroom-chatbot\\DLLs', 'd:\\Anaconda3\\envs\\lab-mushroom-chatbot\\Lib', 'd:\\Anaconda3\\envs\\lab-mushroom-chatbot', '', 'd:\\Anaconda3\\envs\\lab-mushroom-chatbot\\Lib\\site-packages', 'd:\\Anaconda3\\envs\\lab-mushroom-chatbot\\Lib\\site-packages\\win32', 'd:\\Anaconda3\\envs\\lab-mushroom-chatbot\\Lib\\site-packages\\win32\\lib', 'd:\\Anaconda3\\envs\\lab-mushroom-chatbot\\Lib\\site-packages\\Pythonwin']
typing_extensions -> d:\Anaconda3\envs\lab-mushroom-chatbot\Lib\site-packages\typing_extensions.py
  version = unknown
pydantic -> d:\Anaconda3\envs\lab-mushroom-chatbot\Lib\site-packages\pydantic\__init__.py
  version = 2.11.7
pydantic_core -> d:\Anaconda3\envs\