In [59]:
import json
import os
import re

import openai
from langsmith import Client
from qdrant_client import QdrantClient
from qdrant_client.models import Filter, FieldCondition, MatchValue

## Download the data from Qdrant

In [60]:
# Client for the Qdrant server at localhost:6333; later cells query through it.
QDRANT_URL = "http://localhost:6333"
qdrant_client = QdrantClient(url=QDRANT_URL)

In [61]:
# Scroll the product collection with no offset, asking for at most 100 points.
# Payloads are returned, vectors are not. Later cells index the result with [0]
# to get at the records themselves.
all_points = qdrant_client.scroll(
    collection_name="Amazon-items-collection-00",
    with_vectors=False,
    with_payload=True,
    offset=None,
    limit=100,
)

In [62]:
# Inspect the raw scroll result; the output below shows a tuple whose first
# element is a list of Record objects carrying the product payloads.
all_points

([Record(id=0, payload={'parent_asin': 'B0BF9GZFBQ', 'description': "Moyabo Women's Knit Ribbed Maternity Dress Long Sleeve Bodycon Dress for Daily Wearing or Baby Shower ‚úø 34%Cotton+62%Polyester+4%Spandex. Imported Pull On closure Hand Wash Only ‚úø The maternity dress is super soft, stretchy, comfortable, skin-friendly, breathable, high-quality. The womens dresses provide plenty of room for your growing pregnant belly and available in a vatiety of colors.You can wear them throughout your pregnancy and beyond. ‚úø Classic Elegant Simple Style-This bodycon pregnancy dress is features with turtle neck, long sleeve, knee length, midi bodycon maternity dress. It's super versatile, easy to style, great with coats, jacket, trench coat, leggings, boots, very charming and feminine. ‚úø Show Off Your Bump-The knit ribbed maternity dress flatters the body so beautiful it accentuates your curves show your great slim fit and good temperament through all stages of your pregnancy. ‚úø Maternity D

In [63]:
# Reduce every scrolled record to the three payload fields that are passed on
# to the LLM prompt: the product ASIN (as "id"), its description and its image.
all_context = [
    {
        "id": record.payload["parent_asin"],
        "text": record.payload["description"],
        "image": record.payload["image"],
    }
    for record in all_points[0]
]
all_context


[{'id': 'B0BF9GZFBQ',
  'text': "Moyabo Women's Knit Ribbed Maternity Dress Long Sleeve Bodycon Dress for Daily Wearing or Baby Shower ‚úø 34%Cotton+62%Polyester+4%Spandex. Imported Pull On closure Hand Wash Only ‚úø The maternity dress is super soft, stretchy, comfortable, skin-friendly, breathable, high-quality. The womens dresses provide plenty of room for your growing pregnant belly and available in a vatiety of colors.You can wear them throughout your pregnancy and beyond. ‚úø Classic Elegant Simple Style-This bodycon pregnancy dress is features with turtle neck, long sleeve, knee length, midi bodycon maternity dress. It's super versatile, easy to style, great with coats, jacket, trench coat, leggings, boots, very charming and feminine. ‚úø Show Off Your Bump-The knit ribbed maternity dress flatters the body so beautiful it accentuates your curves show your great slim fit and good temperament through all stages of your pregnancy. ‚úø Maternity Dress can be used pre and post pregna

## Prompt to generate synthetic Eval dataset

In [64]:
import json

def _string_field(description: str) -> dict:
    """Build a JSON-schema entry for a plain string with the given description."""
    return {"type": "string", "description": description}


# JSON schema describing the expected LLM output: an array of objects, each
# with a question, the chunk IDs that answer it, an example answer and the
# reasoning. It is serialized into the system prompt, so key order is kept.
output_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "question": _string_field("Suggested question."),
            "chunk_ids": {
                "type": "array",
                "items": _string_field(
                    "ID of the chunk that could be used to answer the question."
                ),
            },
            "answer_example": _string_field(
                "Suggested answer grounded in the context."
            ),
            "reasoning": _string_field(
                "Reasoning why the question could be answered with the chunks."
            ),
        },
    },
}


# System prompt for the synthetic-dataset generation call. It asks for 30
# questions (10 multi-chunk, 15 single-chunk, 5 unanswerable) and embeds the
# output schema as pretty-printed JSON. The chunk count is taken from
# `all_context` so the prompt always matches the data that is actually sent.
SYSTEM_PROMPT = f"""
I am building a RAG application. I have a collection of {len(all_context)} chunks of text.
The RAG application will act as a shopping assistant that can answer questions about the stock of the products we have available.
I will provide all of the available products to you with IDs of each chunk.
I want you to come up with 30 questions to which the answers could be grounded in the chunk context.
As an output I need you to provide me the list of questions and the IDs of the chunks that could be used to answer them.
Also, provide an example answer to the question given the context of the chunks.
Also, provide the reason why you chose the chunks to answer the questions.
Construct 10 that could use multiple chunks in the answer.
Construct 15 questions that could use single chunk in the answer.
Also, include 5 questions that can't be answered with the available chunks.

<OUTPUT JSON SCHEMA>
{json.dumps(output_schema, indent=2)}
</OUTPUT JSON SCHEMA>

I need to be able to parse the json output.
"""

# User prompt carrying the product chunks themselves (the string form of the
# `all_context` list is interpolated as-is).
USER_PROMPT = f"""
Here is the list of chunks, each list element is a dictionary with id and text:
{all_context}
"""

In [65]:
# Ask the model to generate the synthetic evaluation questions: the system
# message carries the instructions and schema, the user message the chunks.
chat_messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_PROMPT},
]
response = openai.chat.completions.create(model="gpt-4.1", messages=chat_messages)

# Show the raw reply text before it is parsed in the next cell.
print(response.choices[0].message.content)

```json
[
  {
    "question": "Which products do you have that are suitable for maternity or baby showers?",
    "chunk_ids": ["B0BF9GZFBQ"],
    "answer_example": "We have the Moyabo Women's Knit Ribbed Maternity Dress, which is comfortable and suitable for daily wear or baby showers.",
    "reasoning": "Chunk B0BF9GZFBQ specifically mentions maternity and baby shower use."
  },
  {
    "question": "Do you have any men's rain boots available for outdoor work?",
    "chunk_ids": ["B0BN7ZX6PZ"],
    "answer_example": "Yes, we offer Rain Boots for Men that are waterproof and designed for comfort and durability during outdoor work or rainy days.",
    "reasoning": "Chunk B0BN7ZX6PZ describes men's rain boots suitable for yard, farm, and outdoor work."
  },
  {
    "question": "Which of your products are suitable for plus-size women looking for activewear?",
    "chunk_ids": ["B09TXDJ2X1"],
    "answer_example": "The WUSENST Women Plus Size Summer 2 Pieces Short Set is a great casual activ

In [66]:
def _parse_llm_json(raw_reply: str):
    """Extract and decode the JSON payload from an LLM chat reply.

    The reply may be wrapped in a Markdown code fence (```json ... ```) and may
    contain full-line ``//`` comments between items; ``json.loads`` accepts
    neither, so both are removed before decoding.

    Args:
        raw_reply: Text content of the chat completion message.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the cleaned text is still not valid JSON.
    """
    # Take what is inside the first code fence; a missing closing fence
    # (e.g. a truncated reply) is tolerated by also stopping at end of text.
    fenced = re.search(
        r"```(?:json)?\s*(.*?)(?:```|\Z)",
        raw_reply,
        flags=re.DOTALL | re.IGNORECASE,
    )
    payload = fenced.group(1) if fenced else raw_reply

    # Drop every line that is only a // comment, whatever its wording.
    without_comments = "\n".join(
        line for line in payload.splitlines() if not line.lstrip().startswith("//")
    )
    return json.loads(without_comments)


json_output = _parse_llm_json(response.choices[0].message.content)

In [67]:
# Number of generated items; the system prompt asks for 30, so a different
# count means the model did not follow the requested split.
len(json_output)

35

## Get description of relevant chunks based on chunk IDs

In [68]:
# Spot-check the payload filter: fetch the points whose parent_asin payload
# field equals one known product ID.
asin_filter = Filter(
    must=[
        FieldCondition(key="parent_asin", match=MatchValue(value="B0BXS2YXYH")),
    ]
)
scroll_result = qdrant_client.scroll(
    collection_name="Amazon-items-collection-00",
    scroll_filter=asin_filter,
    limit=100,
    with_payload=True,
    with_vectors=False,
)
# The first element of the scroll result holds the matching records.
points = scroll_result[0]

In [69]:
# Confirm that the first filtered record carries the parent_asin that was requested.
points[0].payload["parent_asin"]

'B0BXS2YXYH'

In [75]:
def get_description(
    parent_asin: str,
    collection_name: str = "Amazon-items-collection-00",
) -> str:
    """Return the stored product description for one ``parent_asin``.

    Args:
        parent_asin: Product identifier matched against the ``parent_asin``
            payload field.
        collection_name: Qdrant collection to search. Defaults to the
            collection used throughout this notebook.

    Returns:
        The ``description`` payload value of the first matching point.

    Raises:
        IndexError: If no point has this ``parent_asin`` (for example when the
            LLM returned a chunk ID that does not exist in the collection).
    """
    matches = qdrant_client.scroll(
        collection_name=collection_name,
        scroll_filter=Filter(
            must=[
                FieldCondition(
                    key="parent_asin",
                    match=MatchValue(value=parent_asin),
                )
            ]
        ),
        limit=1,  # only the first match is used, so do not fetch more
        with_payload=True,
        with_vectors=False,
    )[0]

    if not matches:
        # Same exception type as the bare index lookup would raise, but with a
        # message that names the offending ID.
        raise IndexError(
            f"No point with parent_asin={parent_asin!r} found in collection {collection_name!r}"
        )

    return matches[0].payload["description"]

## Create Eval Dataset in LangSmith

In [71]:
# LangSmith client; the API key is read from the LANGCHAIN_API_KEY environment variable.
client = Client(api_key=os.environ.get("LANGCHAIN_API_KEY"))

In [73]:
# Create the LangSmith dataset that will receive the synthetic examples.
dataset_name = "rag-evaluation-dataset"
dataset_description = "Dataset for evaluating RAG pipeline"
dataset = client.create_dataset(dataset_name=dataset_name, description=dataset_description)

In [76]:
# Upload each generated question as one dataset example. The outputs hold the
# example answer, the referenced chunk IDs and the descriptions looked up for
# those IDs.
for item in json_output:
    chunk_ids = item["chunk_ids"]
    print(chunk_ids)

    example_inputs = {"question": item["question"]}
    example_outputs = {
        "ground_truth": item["answer_example"],
        "reference_context_ids": chunk_ids,
        "reference_descriptions": [get_description(asin) for asin in chunk_ids],
    }
    client.create_example(
        dataset_id=dataset.id,
        inputs=example_inputs,
        outputs=example_outputs,
    )

['B0BF9GZFBQ']
['B0BN7ZX6PZ']
['B09TXDJ2X1']
['B0BBVS49J9']
['B0C4YDNNK7']
['B0C9Q7M2H3']
['B09Q381NT1']
['B0BFR4X9TG']
['B09S8J4QC5']
['B0B2WLNFCQ', 'B0BXL5D67L', 'B0B6WLWTGP']
['B0BXL5D67L', 'B0C52QKLF7', 'B0B9QVWWM4', 'B0BZHTMJXR', 'B0BG68XDCN']
['B09PYHQVFR', 'B0BYRQ4J3B']
['B0B2WLNFCQ', 'B0BXL5D67L', 'B0C52QKLF7', 'B09X7BVK19', 'B0BPJ83VK1']
['B0BT12HDJ8']
['B0BQYWDFFR']
['B0B6WLWTGP']
['B0B3DDVWTH', 'B0BZHTMJXR']
['B09T6R7HCJ']
['B09VN35T1M', 'B09XJ28ZS5', 'B09TK64B1L', 'B0BPJ83VK1']
['B0B69PM6DL', 'B0BG62148B', 'B0C57GCZTJ']
['B0C3H5B7LB']
['B0BB9FBXKP', 'B0BXC6R6L5', 'B0BQTXN8VQ']
['B0BZSFTPDN', 'B09TK64B1L']
['B0B5G897MR']
['B0BG68XDCN', 'B0BLRGWXQL']
['B0B21S83Y2']
['B0C649RHHN', 'B09QM776N9']
['B0BGQL667R']
['B0BHT7D5WZ']
['B0C27YBLX9']
[]
[]
[]
[]
[]
