In [24]:
from pinecone import Pinecone, ServerlessSpec
import pandas as pd
from groq import Groq
import os
import dotenv
import time 
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

dotenv.load_dotenv()

True

In [2]:
model_name = os.getenv("MODEL_NAME")
pinecone_api_key = os.getenv("PINECONE_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
index_name = os.getenv("PINECONE_INDEX_NAME")

In [3]:
client = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={
        'normalize_embeddings': True
    }
)

  client = HuggingFaceBgeEmbeddings(


In [4]:
pc = Pinecone(api_key=pinecone_api_key)

# Try out embeddings

In [5]:
test_query = "What is coffee?"
query_embedding = client.embed_query(test_query)

print(f"✅ Query: '{test_query}'")
print(f"✅ Embedding dimension: {len(query_embedding)}")
print(f"✅ First 5 values: {query_embedding[:5]}")
print(f"✅ Vector type: {type(query_embedding)}\n")

✅ Query: 'What is coffee?'
✅ Embedding dimension: 384
✅ First 5 values: [0.0368453748524189, 0.012061568908393383, -0.025216102600097656, 0.029770467430353165, 0.04896198958158493]
✅ Vector type: <class 'list'>



In [6]:
output = client.embed_query('hello world')
print(output)

[-0.02657674252986908, -0.016637271270155907, -0.006781684700399637, -0.030367190018296242, 0.013455442152917385, -0.017892757430672646, -0.01098969392478466, 0.04435940459370613, 0.02644093707203865, -0.020191442221403122, -0.003610821906477213, 0.014323770068585873, 0.036119669675827026, 0.043115563690662384, 0.045061152428388596, 0.010315672494471073, 0.011875133961439133, -0.010059352032840252, -0.08524903655052185, -0.008190781809389591, 0.09314368665218353, 0.060102708637714386, 0.014295180328190327, -0.05056868493556976, -0.02880781702697277, 0.00024380248214583844, 0.026618143543601036, 0.015001095831394196, 0.0006413300870917737, -0.10944204777479172, -0.05080209672451019, -0.012296824716031551, 0.017423462122678757, 0.014841594733297825, 0.03755756467580795, 0.0062120999209582806, 0.04210670664906502, 0.010007616132497787, -0.007931591011583805, 0.011070908047258854, 0.03834463283419609, -0.030788596719503403, 0.03215635567903519, 0.015154127962887287, 0.03439926356077194, -0

In [7]:
len(output)

384

# Wrangle Dataset

In [8]:
df = pd.read_json("products/products.jsonl", lines=True)

In [9]:
df.head(2)

Unnamed: 0,name,category,description,ingredients,price,rating,image_path
0,Cappuccino,Coffee,A rich and creamy cappuccino made with freshly...,"[Espresso, Steamed Milk, Milk Foam]",4.5,4.7,cappuccino.jpg
1,Jumbo Savory Scone,Bakery,"Deliciously flaky and buttery, this jumbo savo...","[Flour, Butter, Cheese, Herbs, Baking Powder, ...",3.25,4.3,SavoryScone.webp


In [10]:
df['text'] = df['name'] + " : " + df['description'] + \
    " -- Ingredients: " + df['ingredients'].astype(str) +\
    " -- Price: " + df['price'].astype(str) +\
    " -- rating: " + df['rating'].astype(str)

In [11]:
df['text'].head()

0    Cappuccino : A rich and creamy cappuccino made...
1    Jumbo Savory Scone : Deliciously flaky and but...
2    Latte : Smooth and creamy, our latte combines ...
3    Chocolate Chip Biscotti : Crunchy and delightf...
4    Espresso shot : A bold shot of rich espresso, ...
Name: text, dtype: object

In [12]:
texts = df['text'].tolist()

In [13]:
texts[:2]

["Cappuccino : A rich and creamy cappuccino made with freshly brewed espresso, steamed milk, and a frothy milk cap. This delightful drink offers a perfect balance of bold coffee flavor and smooth milk, making it an ideal companion for relaxing mornings or lively conversations. -- Ingredients: ['Espresso', 'Steamed Milk', 'Milk Foam'] -- Price: 4.5 -- rating: 4.7",
 "Jumbo Savory Scone : Deliciously flaky and buttery, this jumbo savory scone is filled with herbs and cheese, creating a mouthwatering experience. Perfect for a hearty snack or a light lunch, it pairs beautifully with your favorite coffee or tea. -- Ingredients: ['Flour', 'Butter', 'Cheese', 'Herbs', 'Baking Powder', 'Salt'] -- Price: 3.25 -- rating: 4.3"]

In [14]:
with open("products/Merry's_way_about_us.txt") as f:
    Merry_way_about_section = f.read()

Merry_way_about_section = "Coffee shop Merry's Way about section: " +  Merry_way_about_section
texts.append(Merry_way_about_section)

In [15]:
Merry_way_about_section

"Coffee shop Merry's Way about section: Welcome to Merry's Way Coffee, your neighborhood coffee shop located in the heart of Greenwich Village, New York City. At Merry's Way, we believe that coffee is more than just a drink—it’s an experience, a moment of joy, and a way to connect with others.\n\nOur Story\nFounded in 2015, Merry’s Way started as a small family-owned café with one mission: to share the love of quality, ethically-sourced coffee with our community.\n\nMerry's passion for travel and coffee led her on a journey across South America, where she handpicked partnerships with small farms and cooperatives. We ensure that every cup we brew tells a story of dedication and care, from farm to table. Our beans are roasted in-house to bring out unique flavors that reflect the regions where they were grown.\n\nDelivery & Locations Served\nIn addition to offering a cozy place to enjoy coffee in our café, we proudly deliver to Greenwich Village, SoHo, West Village, and Lower Manhattan. W

In [16]:
with open("products/menu_items_text.txt") as f:
    menu_items_text = f.read()

menu_items_text = "Menu Items:" +  menu_items_text
texts.append(menu_items_text)

In [17]:
menu_items_text

'Menu Items:Menu Items\n\nCappuccino - $4.50\nJumbo Savory Scone - $3.25\nLatte - $4.75\nChocolate Chip Biscotti - $2.50\nEspresso shot - $2.00\nHazelnut Biscotti - $2.75\nChocolate Croissant - $3.75\nDark chocolate (Drinking Chocolate) - $5.00\nCranberry Scone - $3.50\nCroissant - $3.25\nAlmond Croissant - $4.00\nGinger Biscotti - $2.50\nOatmeal Scone - $3.25\nGinger Scone - $3.50\nChocolate syrup - $1.50\nHazelnut syrup - $1.50\nCarmel syrup - $1.50\nSugar Free Vanilla syrup - $1.50\nDark chocolate (Packaged Chocolate) - $3.00'

# Generatine Embeddings

In [18]:
embeddings = client.embed_documents(texts)

In [19]:
len(texts)

20

In [20]:
len(embeddings[0])

384

In [21]:
print(output)

[-0.02657674252986908, -0.016637271270155907, -0.006781684700399637, -0.030367190018296242, 0.013455442152917385, -0.017892757430672646, -0.01098969392478466, 0.04435940459370613, 0.02644093707203865, -0.020191442221403122, -0.003610821906477213, 0.014323770068585873, 0.036119669675827026, 0.043115563690662384, 0.045061152428388596, 0.010315672494471073, 0.011875133961439133, -0.010059352032840252, -0.08524903655052185, -0.008190781809389591, 0.09314368665218353, 0.060102708637714386, 0.014295180328190327, -0.05056868493556976, -0.02880781702697277, 0.00024380248214583844, 0.026618143543601036, 0.015001095831394196, 0.0006413300870917737, -0.10944204777479172, -0.05080209672451019, -0.012296824716031551, 0.017423462122678757, 0.014841594733297825, 0.03755756467580795, 0.0062120999209582806, 0.04210670664906502, 0.010007616132497787, -0.007931591011583805, 0.011070908047258854, 0.03834463283419609, -0.030788596719503403, 0.03215635567903519, 0.015154127962887287, 0.03439926356077194, -0

# Push data to Pinecone

In [None]:
# Already created index
# pc.create_index(
#     name=index_name,
#     dimension=384,
#     metric="cosine",
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1",
#     )
# )

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2025-04', 'x-cloud-trace-context': 'dab5cd02fb8eea7dfbc0ffc0df0e2bad', 'date': 'Fri, 10 Oct 2025 14:40:53 GMT', 'server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [23]:
index_name

'coffeeshop'

In [27]:
# wait for the index to be ready
while not pc.describe_index(index_name).status.ready:
    time.sleep(1)

index = pc.Index(index_name)

vectors = []
for text,e in zip(texts, embeddings):
    entry_id = text.split(":")[0]
    vectors.append({
        "id": entry_id,
        "values": e,
        "metadata": {"text": text}
    })

index.upsert(
    vectors=vectors,
    namespace="ns1"
)

{'upserted_count': 20}

# Get Closest Documents

In [32]:
embedding = client.embed_query("Is Cappuccino lactose-free?")
print(embedding)

[-0.010232332162559032, -0.07079202681779861, 0.013101769611239433, 0.022865671664476395, -0.007587413769215345, -0.09336430579423904, 0.028398215770721436, 0.020006418228149414, -0.06876154989004135, -0.07542302459478378, -0.02959904819726944, 0.009782595559954643, -0.012115735560655594, 0.02086402103304863, 0.051913827657699585, -0.019710736349225044, 0.054689984768629074, -0.03429940342903137, -0.05258718878030777, 0.032577358186244965, 0.0355956070125103, -0.029410868883132935, -0.06972324103116989, -0.05228806659579277, 0.08192303031682968, -0.03983461856842041, 0.09830321371555328, 0.0056315939873456955, -0.060353100299835205, -0.15804611146450043, -0.03744832053780556, -0.11788293719291687, -0.049299612641334534, 0.016216572374105453, -0.03635699674487114, 0.03685462102293968, 0.03033679910004139, -0.07127507030963898, 0.03538621962070465, -0.021950263530015945, 0.005022057332098484, 0.029132850468158722, -0.026964591816067696, 0.05379251763224602, -0.02294052392244339, 0.040055

In [33]:
results = index.query(
    namespace="ns1",
    vector=embedding,
    top_k=3,
    include_values=False,
    include_metadata=True
)

In [34]:
results

{'matches': [{'id': 'Cappuccino ',
              'metadata': {'text': 'Cappuccino : A rich and creamy cappuccino '
                                   'made with freshly brewed espresso, steamed '
                                   'milk, and a frothy milk cap. This '
                                   'delightful drink offers a perfect balance '
                                   'of bold coffee flavor and smooth milk, '
                                   'making it an ideal companion for relaxing '
                                   'mornings or lively conversations. -- '
                                   "Ingredients: ['Espresso', 'Steamed Milk', "
                                   "'Milk Foam'] -- Price: 4.5 -- rating: 4.7"},
              'score': 0.729644775,
              'values': []},
             {'id': 'Sugar Free Vanilla syrup ',
              'metadata': {'text': 'Sugar Free Vanilla syrup : Enjoy the sweet '
                                   'flavor of vanilla without th