In [22]:
import pandas as pd
import openai
import langchain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import pinecone
from langchain.llms import openai 

In [2]:
# to load api keys
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# read the docum
def read_doc(directory):
    fileloader=PyPDFDirectoryLoader(directory)
    document=fileloader.load()
    return document

In [4]:
doc=read_doc('document/')
len(doc)

8

In [17]:
doc

[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'document\\FRG-Nutrition-Tips.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='Nutrition Tips\nCalories are a measurement of how much energy food gives us. Our bodies use energy\nevery second, whether we are moving, resting, or sleeping...so we need to eat a certain\nnumber of calories each day to function. \nPracticing healthy eating habits during your isolation or quarantine\n(safe separation) period for COVID-19 can be a great way to spend\nyour time.\nThis booklet will cover four tips for healthy eating that can be\npracticed in isolation or quarantine: \nNote: Those living with diabetes and/or obesity should follow a doctor’s recommendation\nfor a healthy meal plan. \nThese tips can help you find a simple plan for balancing nutrients and calories. \nEating a variety of nutrients within a healthy amount of calories is a great step toward a\nhealthier life. \nNutrients are 

In [6]:
def chunks(doc,chunk_size=500,chunk_overlap=50):
    textsplitter=RecursiveCharacterTextSplitter(chunk_size=chunk_size,chunk_overlap=chunk_overlap)
    docs=textsplitter.split_documents(doc)
    return docs


In [7]:
d=chunks(doc=doc)
d

[Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'document\\FRG-Nutrition-Tips.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='Nutrition Tips\nCalories are a measurement of how much energy food gives us. Our bodies use energy\nevery second, whether we are moving, resting, or sleeping...so we need to eat a certain\nnumber of calories each day to function. \nPracticing healthy eating habits during your isolation or quarantine\n(safe separation) period for COVID-19 can be a great way to spend\nyour time.\nThis booklet will cover four tips for healthy eating that can be\npracticed in isolation or quarantine:'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'document\\FRG-Nutrition-Tips.pdf', 'total_pages': 8, 'page': 0, 'page_label': '1'}, page_content='practiced in isolation or quarantine: \nNote: Those living with diabetes and/or obesity should follow a doctor’s recommendation\nfor

In [8]:
import os

In [9]:
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
# configure the API key
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
# import pinecone-client
response=genai.embed_content(model="models/embedding-001", content="Hello world",task_type="retrieval_query")

In [20]:
len(response['embedding'])

768

In [34]:
embeddings=response['embedding']

In [28]:
from pinecone import Pinecone, ServerlessSpec


In [None]:
Pinecone(
    api_key='your api key')
# Index details
index_name = "pinecone-1"

In [40]:
from pinecone import (
    Pinecone,
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    VectorType
)

In [None]:
# 1. Instantiate the Pinecone client
pc = Pinecone(api_key='your api key')

# 2. Create an index
# index_config = pc.create_index(
#     name="pinecone-1",
#     dimension=768,
#     spec=ServerlessSpec(
#         cloud=CloudProvider.AWS,
#         region=AwsRegion.US_EAST_1
#     ),
#     vector_type=VectorType.DENSE
# )

In [46]:
index_config=pc.describe_index(name="pinecone-1")
index_config

{
    "name": "pinecone-1",
    "metric": "cosine",
    "host": "pinecone-1-cqp5qb2.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 768,
    "deletion_protection": "disabled",
    "tags": null
}

In [49]:
index=pc.Index(host=index_config.host)
index

<pinecone.db_data.index.Index at 0x206677a5510>

In [51]:
fileloader=PyPDFDirectoryLoader('document/')
pages=fileloader.load()

In [52]:
pdf_content = " ".join([page.page_content for page in pages])

In [78]:
pdf_content

'Nutrition Tips\nCalories are a measurement of how much energy food gives us. Our bodies use energy\nevery second, whether we are moving, resting, or sleeping...so we need to eat a certain\nnumber of calories each day to function. \nPracticing healthy eating habits during your isolation or quarantine\n(safe separation) period for COVID-19 can be a great way to spend\nyour time.\nThis booklet will cover four tips for healthy eating that can be\npracticed in isolation or quarantine: \nNote: Those living with diabetes and/or obesity should follow a doctor’s recommendation\nfor a healthy meal plan. \nThese tips can help you find a simple plan for balancing nutrients and calories. \nEating a variety of nutrients within a healthy amount of calories is a great step toward a\nhealthier life. \nNutrients are chemicals that help our bodies grow and function. We get them through the\nfood we eat. \nEat a Variety of Food and Drink Plenty of Water\nFollow Portion Recommendations\nUse Nutrition Labe

In [54]:
len(pdf_content)

7748

In [55]:
pdf_embeddings = genai.embed_content(
    model="models/embedding-001",content=pdf_content,task_type="retrieval_document")

In [56]:
pdf_embeddings

{'embedding': [-0.06413752,
  -0.01333827,
  -0.009375478,
  -0.012806482,
  0.046987254,
  0.043460514,
  0.00045337356,
  -0.03177753,
  0.024927456,
  0.03262803,
  0.026471386,
  0.0315664,
  0.022919359,
  -0.009012579,
  -0.006097734,
  -0.036479045,
  0.033625994,
  0.031211523,
  -0.007957947,
  -0.02215466,
  -0.028220031,
  -0.0038316878,
  -0.0032738051,
  -0.028270306,
  0.038000725,
  -0.02391081,
  0.032508627,
  -0.08248835,
  -0.049062602,
  0.0077904765,
  -0.044107806,
  0.033873398,
  -0.0331948,
  -0.007865641,
  0.02558204,
  -0.06620086,
  0.00073785154,
  -0.028075982,
  -0.030218158,
  0.04795404,
  0.01667722,
  -0.036980394,
  -0.052970834,
  -0.0024140545,
  -0.019461267,
  -0.019160932,
  0.0056727077,
  -0.012913992,
  -0.01179307,
  -0.08341724,
  -0.05239321,
  -0.013589654,
  0.07946139,
  -0.02032827,
  -0.024383282,
  -0.011933306,
  0.029028576,
  -0.0027925752,
  -0.005494137,
  0.02150491,
  -0.034408458,
  0.051199693,
  0.009010255,
  0.0012105071

In [57]:
# Extract the actual vector
pdf_vector = pdf_embeddings['embedding']

In [58]:
index.upsert(
    vectors=[('pdf-doc-1',pdf_vector,{'category': 'pdf', 'source': 'nutrition.pdf'})],
    namespace='pdf-ns'
)

{'upserted_count': 1}

In [59]:
query_embedding = genai.embed_content(
    model="models/embedding-001",
    content="What is the document about?",
    task_type="retrieval_query"
)['embedding']


In [79]:
query_embedding

[0.0024116896,
 -0.053263173,
 -0.04109813,
 0.000596933,
 0.026214909,
 0.00990018,
 0.017615667,
 -0.01500826,
 -0.0028524215,
 0.03913288,
 0.03067052,
 0.04489722,
 -0.010789816,
 -0.014715588,
 0.021496782,
 -0.019746985,
 0.02932996,
 0.023880618,
 -0.020894276,
 -0.027779268,
 0.032714877,
 0.035867043,
 -0.029296722,
 -0.011039544,
 -0.011349986,
 -0.011402191,
 0.0060603986,
 -0.10301671,
 -0.047633342,
 0.032086063,
 -0.08127465,
 0.03270212,
 -0.053610153,
 0.023685765,
 0.018022606,
 -0.06609352,
 0.0056463634,
 -0.01194635,
 -0.03888464,
 0.017856162,
 -0.0031921067,
 -0.021560539,
 -0.029423501,
 -0.008233013,
 0.028948948,
 -0.020985294,
 0.017326225,
 0.038838062,
 0.022781257,
 -0.06191751,
 0.00032920248,
 0.030459136,
 0.017110964,
 -0.039430868,
 -0.024102315,
 -0.008177515,
 0.022510972,
 0.034372676,
 -0.01611428,
 -0.004811478,
 -0.0072205095,
 0.017423574,
 -0.03391297,
 0.056906845,
 -0.017903633,
 -0.05365642,
 -0.015560612,
 0.010452689,
 0.07737923,
 -0.0371

In [81]:
response = index.query(
    vector=query_embedding,
    top_k=3,  # number of similar results to return
    namespace="pdf-ns",
    include_metadata=True
)


In [82]:
response

{'matches': [{'id': 'pdf-doc-1',
              'metadata': {'category': 'pdf', 'source': 'nutrition.pdf'},
              'score': 0.568464458,
              'values': []}],
 'namespace': 'pdf-ns',
 'usage': {}}

In [61]:
for match in response['matches']:
    print(f"Score: {match['score']}")
    print(f"Metadata: {match['metadata']}")


Score: 0.568464458
Metadata: {'category': 'pdf', 'source': 'nutrition.pdf'}


In [83]:
context = "\n".join([match['metadata']['source'] for match in response['matches']])
context

'nutrition.pdf'

In [90]:
# 5. Search for similar records
from pinecone import SearchQuery, SearchRerank, RerankModel

response = index.search_records(
    namespace="pdf-ns",
    query=SearchQuery(
        inputs={
            "text": "Apple corporation",
        },
        top_k=3
    ),
    rerank=SearchRerank(
        model=RerankModel.Bge_Reranker_V2_M3,
        rank_fields=["category"],
        top_n=3,
    ),
)


PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'Date': 'Mon, 21 Jul 2025 12:13:03 GMT', 'Content-Type': 'text/plain; charset=utf-8', 'Content-Length': '116', 'Connection': 'keep-alive', 'x-envoy-upstream-service-time': '37', 'server': 'envoy'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Integrated inference is not configured for this index"},"status":400}


In [75]:
model = genai.GenerativeModel('models/gemini-1.5-flash')
response = model.generate_content('Tell me a story about a magic backpack')
response.text



In [87]:
response = model.generate_content(
    f"Answer this using the context below:\n{pdf_content}\n\nWhat is the document about?"
)

print(response.text)

This document provides nutrition tips for healthy eating, particularly during isolation or quarantine.  It covers four key tips: eating a variety of foods and drinking plenty of water; planning meals; using nutrition labels to make informed choices; and keeping the plan simple and sustainable.  The tips emphasize incorporating the five food groups (grains, vegetables, fruits, protein, and dairy) in recommended portions, limiting unhealthy fats and sodium, and understanding nutrition labels to make healthier food choices.

