In [None]:
import ast

import pandas as pd
from tqdm.auto import tqdm

## Data ingestion

In [None]:
# Load the recipe table; the path is relative to the notebook's working directory.
df_recipes = pd.read_csv("../data/recipes.csv")  # source data

In [None]:
# format documents for Qdrant vector DB
# Each CSV row becomes a dict, and a single "text" field is added to it that
# concatenates every recipe attribute; that field is what gets embedded later.
recipes_raw = df_recipes.to_dict(orient="records")
for recipe in tqdm(recipes_raw):
    description_stripped = recipe["recipe_description"].strip()
    # "directions" and "ingredients" are expected to hold Python list literals
    # serialised as strings. ast.literal_eval parses literals only, so unlike
    # eval() it cannot execute code embedded in the data file.
    directions_joined = " ".join(ast.literal_eval(recipe["directions"]))
    ingredients_joined = "; ".join(ast.literal_eval(recipe["ingredients"]))

    text = f"Recipe: {recipe['recipe_name'].strip()} | Description: {description_stripped} | Ratings: {recipe['ratings'].strip()} | Ready in: {recipe['ready-in'].strip()} | Directions: {directions_joined.strip()} | Ingredients: {ingredients_joined.strip()}"

    recipe["text"] = text

  0%|          | 0/88 [00:00<?, ?it/s]

In [None]:
# Qdrant
from qdrant_client import QdrantClient, models

# Client for the Qdrant HTTP endpoint; a server must already be reachable at this URL.
qdrant_client = QdrantClient("http://localhost:6333")

# Vector size passed to the dense collection below and the embedding model used
# for both documents and queries.
# NOTE(review): 512 is presumably the output size of model_handle; keep the two in sync.
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [None]:
# Dense-vector collection. Any existing collection with this name is dropped
# first, so re-running the cell always starts from an empty collection.
collection_name = "recipe-rag"

if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

# Single unnamed dense vector per point, compared with cosine distance.
qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY, distance=models.Distance.COSINE
    ),
)

True

In [None]:
# construct points
# One point per recipe: the point id is the recipe_id, the vector is derived
# from the concatenated "text" field, and the payload carries the raw fields
# so search results can be shown without going back to the CSV.
points = []

for recipe in tqdm(recipes_raw):
    point = models.PointStruct(
        id=recipe["recipe_id"],
        vector=models.Document(
            text=recipe["text"], model=model_handle
        ),  # embedding the text
        payload={
            "recipe_id": recipe["recipe_id"],
            "text": recipe["text"],
            "recipe_name": recipe["recipe_name"],
            "recipe_link": recipe["recipe_link"],
            "recipe_description": recipe["recipe_description"],
            "ratings": recipe["ratings"],
            "ready-in": recipe["ready-in"],
            "directions": recipe["directions"],
            "ingredients": recipe["ingredients"],
        },
    )
    points.append(point)

# upsert into DB
# All points are sent in one call into whatever `collection_name` currently holds.
qdrant_client.upsert(collection_name=collection_name, points=points)

  0%|          | 0/88 [00:00<?, ?it/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

## RAG pipeline

In [None]:
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used below; confirm.
load_dotenv()

True

In [None]:
from openai import OpenAI

# Constructed without explicit arguments, so no credentials are hardcoded here.
# NOTE(review): the client is expected to pick them up from the environment; confirm.
openai_client = OpenAI()

In [None]:
# search
def vector_search(collection_name, query, limit=1):
    """Return the payloads of the points closest to ``query``.

    Args:
        collection_name: Qdrant collection to search.
        query: Free-text query; it is wrapped in a ``models.Document`` so it is
            embedded with the notebook-level ``model_handle``.
        limit: Maximum number of matches to return.

    Returns:
        A list with the payload of each matching point, in the order Qdrant
        returned them.
    """
    embedded_query = models.Document(text=query, model=model_handle)
    response = qdrant_client.query_points(
        collection_name=collection_name,
        query=embedded_query,
        limit=limit,
        with_payload=True,  # payloads are what callers consume
    )
    return [hit.payload for hit in response.points]

In [None]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved documents.

    Args:
        query: The user's question.
        search_results: Iterable of dicts; only each dict's ``"text"`` entry is
            used.

    Returns:
        The prompt string: fixed instructions, the question, then every
        document text separated by blank lines, with surrounding whitespace
        stripped.
    """
    template = (
        "You're a cooking assistant. Answer the QUESTION based on the CONTEXT from the recipe database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # Every document is followed by a blank line; the final strip() removes the
    # trailing one.
    joined_docs = "".join(f"{doc['text']}\n\n" for doc in search_results)

    return template.format(question=query, context=joined_docs).strip()

In [None]:
def llm(openai_client, llm_model, prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        openai_client: Client exposing ``chat.completions.create``.
        llm_model: Model name forwarded to the API.
        prompt: Full prompt text.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = openai_client.chat.completions.create(
        model=llm_model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [12]:
def rag(query, openai_client, collection_name, llm_model, limit=5):
    """Answer ``query`` by retrieving recipes and asking the LLM about them.

    Args:
        query: The user's question.
        openai_client: Client handed to ``llm``.
        collection_name: Qdrant collection searched by ``vector_search``.
        llm_model: Model name handed to ``llm``.
        limit: Number of documents to retrieve as context.

    Returns:
        The LLM's answer text.
    """
    retrieved_docs = vector_search(collection_name, query, limit)
    grounded_prompt = build_prompt(query, retrieved_docs)
    return llm(openai_client, llm_model, grounded_prompt)

In [None]:
# End-to-end smoke test of the pipeline. `limit` is not passed, so rag() uses
# its default of 5 retrieved documents. `collection_name` is the notebook-level
# variable as set by the most recently executed cell that assigns it.
query = "Are there any recommended fish recipes? Can you give me the details about the recipes as well?"
answer = rag(
    query=query,
    openai_client=openai_client,
    collection_name=collection_name,
    llm_model="gpt-4o-mini",
)
print(answer)

Yes, there are several recommended fish recipes. Here are the details about each recipe:

1. **Easy Oven-Baked Cod**
   - **Description:** Amazingly quick, fresh, and light, this recipe can be made with any type of fish fillet.
   - **Ratings:** 4.6 stars (60 ratings)
   - **Ready in:** 25 minutes
   - **Directions:** Preheat oven to 400°F. Line your pan with aluminum foil and lightly brush with olive oil. Combine parsley, garlic, lemon zest, and coarse salt; chop finely and mix with breadcrumbs. Brush cod fillets with olive oil, press into breadcrumb mixture, and place crust-side up in a baking dish. Bake for 12-15 minutes. Serve with lemon wedges.
   - **Ingredients:** 11⁄2 cups plain breadcrumbs, 1⁄2 cup fresh parsley, 2-3 garlic cloves, 1 lemon (zest), 3⁄4 teaspoon coarse salt, 4 (6-8 ounce) cod fillets, olive oil.

2. **Lemon Baked Cod**
   - **Description:** A great baked cod recipe.
   - **Ratings:** 4.35 stars (209 ratings)
   - **Ready in:** 35 minutes
   - **Directions:** If 

## Retrieval evaluation

In [None]:
# Ground-truth questions for retrieval evaluation (the file name is spelled as it is on disk).
df_question = pd.read_csv("../data/ground-truth-retrievel.csv")

In [15]:
# Preview the table; the rendered output shows an `id` and a `question` column.
df_question

Unnamed: 0,id,question
0,0,What are the main ingredients for the Ground B...
1,0,How long does it take to cook the Ground Beef ...
2,0,What kind of sauce is recommended with the Gro...
3,0,Can turkey be used instead of beef in Ground B...
4,0,How is the Ground Beef Gyros rated by those wh...
...,...,...
435,87,What is the rating for the Yakisoba recipe?
436,87,How long does it take to cook Yakisoba?
437,87,What type of meat is used in the Yakisoba recipe?
438,87,Does the Yakisoba recipe include soy sauce?


In [None]:
# One dict per row; evaluate() reads "id" and the search lambdas read "question".
ground_truth = df_question.to_dict(orient="records")

### Metric calculation and evaluation

In [None]:
# metric calculation
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains a relevant document.

    Args:
        relevance_total: One sequence of relevance flags per query, ordered by
            rank (``True`` where the result is the expected document).

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``relevance_total`` is empty, so
        an empty evaluation set does not raise ``ZeroDivisionError``.
    """
    if not relevance_total:
        return 0.0

    # any() uses truthiness, the same test mrr() applies to each flag.
    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)


def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One sequence of relevance flags per query, ordered by
            rank (index 0 is rank 1).

    Returns:
        The mean over queries of ``1 / rank`` of the first relevant result,
        counting ``0`` for queries with no relevant result. Returns ``0.0``
        when ``relevance_total`` is empty.
    """
    if not relevance_total:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this a query with
                # several relevant results would contribute more than 1.
                break

    return total_score / len(relevance_total)


def evaluate(ground_truth, search_function):
    """Run ``search_function`` on every ground-truth record and score it.

    Args:
        ground_truth: Iterable of records, each with an ``"id"`` naming the
            expected document.
        search_function: Callable taking one record and returning result
            dicts that each have a ``"recipe_id"``.

    Returns:
        A dict with the ``"hit_rate"`` and ``"mrr"`` over all records.
    """

    def _relevance_flags(record):
        # Read the expected id before searching, then flag each result by rank.
        expected_id = record["id"]
        return [hit["recipe_id"] == expected_id for hit in search_function(record)]

    relevance_total = [_relevance_flags(record) for record in tqdm(ground_truth)]

    scores = {}
    scores["hit_rate"] = hit_rate(relevance_total)
    scores["mrr"] = mrr(relevance_total)
    return scores

### Qdrant sparse vector search

In [None]:
# qdrant sparse vector search
# Rebinds the notebook-level `collection_name`; later cells that read it now
# target the sparse collection. The collection is dropped and recreated so the
# cell can be re-run.
collection_name = "recipe-rag-sparse"
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

# Only a named sparse vector ("bm25") is configured, with the IDF modifier
# enabled on it; there is no dense vector in this collection.
qdrant_client.create_collection(
    collection_name=collection_name,
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    },
)

True

In [None]:
# construct points
# Same ids and payload as the dense collection; the only difference is that
# the text is stored under the named "bm25" vector using the "Qdrant/bm25" model.
points = []

for recipe in tqdm(recipes_raw):
    point = models.PointStruct(
        id=recipe["recipe_id"],
        vector={
            "bm25": models.Document(
                text=recipe["text"],
                model="Qdrant/bm25",
            ),
        },
        payload={
            "recipe_id": recipe["recipe_id"],
            "text": recipe["text"],
            "recipe_name": recipe["recipe_name"],
            "recipe_link": recipe["recipe_link"],
            "recipe_description": recipe["recipe_description"],
            "ratings": recipe["ratings"],
            "ready-in": recipe["ready-in"],
            "directions": recipe["directions"],
            "ingredients": recipe["ingredients"],
        },
    )
    points.append(point)

# upsert into DB
qdrant_client.upsert(collection_name=collection_name, points=points)

  0%|          | 0/88 [00:00<?, ?it/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
def qdrant_sparse_search(query, collection_name, limit=5) -> list[dict]:
    """Search the named ``"bm25"`` sparse vector and return matching payloads.

    Args:
        query: Free-text query, wrapped in a ``models.Document`` for the
            ``"Qdrant/bm25"`` model.
        collection_name: Collection that has a ``"bm25"`` sparse vector.
        limit: Maximum number of matches to return.

    Returns:
        The payload dict of each matching point, in the order Qdrant returned
        them (payloads, not ``ScoredPoint`` objects).
    """
    query_points = qdrant_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model="Qdrant/bm25",
        ),
        using="bm25",  # must match the sparse vector name used at indexing time
        limit=limit,
        with_payload=True,
    )

    return [point.payload for point in query_points.points]

In [None]:
# Score BM25 retrieval on the ground-truth questions, using the top 10 hits per question.
evaluate(
    ground_truth,
    lambda q: qdrant_sparse_search(
        q["question"], collection_name="recipe-rag-sparse", limit=10
    ),
)

  0%|          | 0/440 [00:00<?, ?it/s]

{'hit_rate': 0.9954545454545455, 'mrr': 0.9595129870129869}

### Qdrant vector search

In [None]:
def qdrant_search(query, collection_name, limit=5):
    """Dense-vector search returning the payloads of the closest points.

    Args:
        query: Free-text query; wrapped in a ``models.Document`` so it is
            embedded with the notebook-level ``model_handle``.
        collection_name: Qdrant collection to search.
        limit: Maximum number of matches to return.

    Returns:
        A list with the payload of each matching point, in the order Qdrant
        returned them.
    """
    embedded_query = models.Document(text=query, model=model_handle)
    response = qdrant_client.query_points(
        collection_name=collection_name,
        query=embedded_query,
        limit=limit,
        with_payload=True,  # payloads are what the evaluation reads
    )
    return [hit.payload for hit in response.points]

In [None]:
# Score dense-vector retrieval on the same questions, again over the top 10 hits.
evaluate(
    ground_truth,
    lambda q: qdrant_search(q["question"], collection_name="recipe-rag", limit=10),
)

  0%|          | 0/440 [00:00<?, ?it/s]

{'hit_rate': 0.9886363636363636, 'mrr': 0.9756439393939393}

### Qdrant hybrid search

In [None]:
# hybrid search with Qdrant
# Rebinds the notebook-level `collection_name` to the hybrid collection, then
# drops and recreates it so the cell can be re-run.
collection_name = "recipe-rag-hybrid"
if qdrant_client.collection_exists(collection_name=collection_name):
    qdrant_client.delete_collection(collection_name=collection_name)

qdrant_client.create_collection(
    collection_name=collection_name,
    vectors_config={
        # Named dense vector for jinaai/jina-embeddings-v2-small-en
        "jina-small": models.VectorParams(
            # Reuse the notebook-level constant instead of repeating the literal,
            # so the dense and hybrid collections cannot drift apart.
            size=EMBEDDING_DIMENSIONALITY,
            distance=models.Distance.COSINE,
        ),
    },
    sparse_vectors_config={
        # Named sparse vector with the IDF modifier, as in the sparse-only collection.
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    },
)

True

In [None]:
# construct points
# Each point stores the same text under two named vectors: "jina-small" (dense)
# and "bm25" (sparse). The names must match those configured on the collection
# and the `using=` values in qdrant_rrf_search.
points = []

for recipe in tqdm(recipes_raw):
    point = models.PointStruct(
        id=recipe["recipe_id"],
        vector={
            "jina-small": models.Document(
                text=recipe["text"],
                model="jinaai/jina-embeddings-v2-small-en",
            ),
            "bm25": models.Document(
                text=recipe["text"],
                model="Qdrant/bm25",
            ),
        },
        payload={
            "recipe_id": recipe["recipe_id"],
            "text": recipe["text"],
            "recipe_name": recipe["recipe_name"],
            "recipe_link": recipe["recipe_link"],
            "recipe_description": recipe["recipe_description"],
            "ratings": recipe["ratings"],
            "ready-in": recipe["ready-in"],
            "directions": recipe["directions"],
            "ingredients": recipe["ingredients"],
        },
    )
    points.append(point)

# upsert into DB
qdrant_client.upsert(collection_name=collection_name, points=points)

  0%|          | 0/88 [00:00<?, ?it/s]

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [57]:
def qdrant_rrf_search(query, collection_name, limit=5) -> list[dict]:
    """Hybrid search: fuse dense and BM25 candidates with reciprocal rank fusion.

    Args:
        query: Free-text query, used for both the dense and the sparse prefetch.
        collection_name: Collection with ``"jina-small"`` and ``"bm25"`` named
            vectors.
        limit: Maximum number of fused results to return. Each prefetch pulls
            ``5 * limit`` candidates so the fusion has a wider pool to rank.

    Returns:
        The payload dict of each fused result, in the order Qdrant returned
        them (payloads, not ``ScoredPoint`` objects).
    """
    candidates_per_branch = 5 * limit

    query_points = qdrant_client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=candidates_per_branch,
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=candidates_per_branch,
            ),
        ],
        # Fusion query enables fusion on the prefetched results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        # Without this the client's default result count is used and the
        # caller's `limit` only affects the prefetch sizes.
        limit=limit,
        with_payload=True,
    )

    return [point.payload for point in query_points.points]

In [None]:
# Score hybrid (dense + BM25, RRF-fused) retrieval on the same questions with limit=10.
evaluate(
    ground_truth,
    lambda q: qdrant_rrf_search(
        q["question"], collection_name="recipe-rag-hybrid", limit=10
    ),
)

  0%|          | 0/440 [00:00<?, ?it/s]

{'hit_rate': 0.9931818181818182, 'mrr': 0.9742514430014431}