In [1]:
# %pip install -r requirements.txt

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

mistral_key = os.getenv("Mistral")
pinecone_key = os.getenv("PINECONE_API_KEY")
os.environ["PINECONE_API_KEY"] = pinecone_key
os.environ["HF_TOKEN"]=os.getenv("HuggingFace")

In [2]:
from langchain_mistralai.embeddings import MistralAIEmbeddings

embeddings = MistralAIEmbeddings(
    model="mistral-embed",
    api_key=mistral_key,
    max_retries=5,
    # request_timeout=60,
)

embedding_vector = embeddings.embed_query("hello world!")
embedding_dimension = len(embedding_vector)
embedding_dimension 


  from .autonotebook import tqdm as notebook_tqdm


1024

In [3]:
import pypdf
from langchain_experimental.text_splitter import SemanticChunker

def convert_pdf_documents(path,embeddings):
    reader = pypdf.PdfReader(path)
    content = ""
    for page_num in range(len(reader.pages)):
        page = reader.pages[page_num]
        content += page.extract_text()

    text_splitter = SemanticChunker(embeddings,
                                breakpoint_threshold_type="gradient",
                                )
    docs = text_splitter.create_documents([content])
    
    return docs


In [4]:
path="resources\jk_agri.pdf"
docs = convert_pdf_documents(path,embeddings)
print(len(docs),type(docs))

90 <class 'list'>


In [6]:
## docs = dos[:50]
# docs = docs[50:] # at a time an't take more than 50 docs, so split
# len(docs)

In [7]:
# from pinecone import ServerlessSpec
# from pinecone import Pinecone

# pc = Pinecone(
#     api_key=pinecone_key)

# index_name = "demo"
# demo_namespace = "example"

# if index_name not in pc.list_indexes().names():
#     pc.create_index(
#       name=index_name,
#       metric="cosine",
#       dimension=embedding_dimension,
#       spec=ServerlessSpec(cloud="aws",region="us-east-1")
# )

# index = pc.Index(index_name)


# from langchain.docstore.document import Document
# from langchain_pinecone import Pinecone

# # If id is not specified, LangChain will assign a random UUID for each document.
# ids = ["1", "2", "3", "4"]
# texts = ["foo", "bar", "world", "hello"]

# meta_key_1 = "meta_1"
# meta_key_2 = "meta_2"
# metadatas = [{meta_key_1: "a1", meta_key_2: "a2"}, {meta_key_1: "b1", meta_key_2: "b2"}, {meta_key_1: "c1", meta_key_2: "c2"}, {meta_key_1: "d1", meta_key_2: "d2"}]

# documents = []
# for i in range(len(ids)):
#   documents.append(Document(ids=ids[i], page_content=texts[i], metadata=metadatas[i]))

# # Existing documents persist and aren't replaced.
# docsearch = Pinecone.from_documents(documents, embeddings, index_name=index_name, namespace=demo_namespace, ids=ids)

# docsearch.add_texts(texts=texts, metadatas=metadatas, ids=ids, namespace=demo_namespace)



#### ===================== Semantic Search ================== #####


# query = "foo2 bar"
# top_k = 2
# docs = docsearch.similarity_search_with_score(query, namespace=demo_namespace, k=top_k)
# print(docs)

In [5]:
from pinecone import ServerlessSpec
from pinecone import Pinecone as PineconeClient

pc = PineconeClient(api_key=os.getenv("PINECONE_API_KEY"))

doc_index_name = "finance-pdf-index"
doc_namespace = "pdf-semantic-search"

if doc_index_name not in pc.list_indexes().names():
    pc.create_index(
      name=doc_index_name,
      metric="cosine",
      dimension=embedding_dimension,
      spec=ServerlessSpec(cloud="aws",region="us-east-1")
)

index = pc.Index(doc_index_name)

In [6]:
from langchain_pinecone import PineconeVectorStore
import time


def add_documents_in_batches(docs, embeddings, batch_size=5):
    total_docs = len(docs)
    
    for i in range(0, total_docs, batch_size):
        batch = docs[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(total_docs + batch_size - 1)//batch_size}")
        try:
            if i == 0:
                # Create the vector store with first batch
                vectorstore = PineconeVectorStore.from_documents(
                    batch,
                    embeddings,
                    index_name=doc_index_name,
                    namespace=doc_namespace,
                )
            else:
                # Add subsequent batches
                vectorstore.add_documents(batch, namespace=doc_namespace)
            
            time.sleep(2)
            
        except Exception as e:
            print(f"Error processing batch {i//batch_size + 1}: {e}")
            # Exponential backoff
            time.sleep(5 * (2 ** (i//batch_size)))
            continue
    
    return vectorstore


def add_remaining_documents_in_batches(docs, embeddings, batch_size=5):
    total_docs = len(docs)
    
    for i in range(0, total_docs, batch_size):
        batch = docs[i:i + batch_size]
        print(f"Processing batch {i//batch_size + 1}/{(total_docs + batch_size - 1)//batch_size}")
        try:
            vectorstore = PineconeVectorStore(index, embeddings,
                                  "text", namespace=doc_namespace)
            vectorstore.add_documents(batch, namespace=doc_namespace)
            
            time.sleep(2)
            
        except Exception as e:
            print(f"Error processing batch {i//batch_size + 1}: {e}")
            # Exponential backoff
            time.sleep(5 * (2 ** (i//batch_size)))
            continue
    
    return vectorstore


##### Ingest documents 

In [10]:
# doc_search = add_documents_in_batches(docs, embeddings,batch_size=10)
# doc_search = add_remaining_documents_in_batches(docs, embeddings,
#                                                 batch_size=10)



In [7]:
stats = index.describe_index_stats()
print(stats)

{'dimension': 1024,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'pdf-semantic-search': {'vector_count': 178}},
 'total_vector_count': 178,
 'vector_type': 'dense'}


In [None]:
from langchain_pinecone import PineconeVectorStore

query = "what is the profit after tax ? how much did it change ?"
top_k = 2
vectorstore = PineconeVectorStore(
    index, embeddings, "text", namespace=doc_namespace
)
result_docs = vectorstore.similarity_search_with_score(query, 
                                               namespace=doc_namespace, 
                                               k=top_k)
len(result_docs)

2

In [10]:
# for result_doc in result_docs:
#     print(result_doc[0])
#     print("=========================")

Extract context using SelfQueryRetriever : Retriever that uses a vector store and an LLM to generate the vector store queries

details : https://python.langchain.com/api_reference/langchain/retrievers.html

In [None]:
from langchain_mistralai.chat_models import ChatMistralAI

llm = ChatMistralAI(
    model="mistral-large-latest",  
    mistral_api_key=mistral_key,
    temperature=0
)

In [17]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_pinecone import PineconeVectorStore


vectorstore = PineconeVectorStore(
    index, embeddings, "text", namespace=doc_namespace
)

metadata_field_info = []
document_content_description = "Annual report of J.K Agri 2025"

chunks_query_retriever = SelfQueryRetriever.from_llm(
    llm, 
    vectorstore, 
    document_content_description, 
    metadata_field_info,  # Empty list
    verbose=True,
    enable_limit=True,
    search_kwargs={"k": 2, "namespace": doc_namespace}
)

chunks_query_retriever

SelfQueryRetriever(vectorstore=<langchain_pinecone.vectorstores.PineconeVectorStore object at 0x000002B8A473E010>, query_constructor=RunnableBinding(bound=FewShotPromptTemplate(input_variables=['query'], input_types={}, partial_variables={}, examples=[{'i': 1, 'data_source': '```json\n{{\n    "content": "Lyrics of a song",\n    "attributes": {{\n        "artist": {{\n            "type": "string",\n            "description": "Name of the song artist"\n        }},\n        "length": {{\n            "type": "integer",\n            "description": "Length of the song in seconds"\n        }},\n        "genre": {{\n            "type": "string",\n            "description": "The song genre, one of "pop", "rock" or "rap""\n        }}\n    }}\n}}\n```', 'user_query': 'What are songs by Taylor Swift or Katy Perry about teenage romance under 3 minutes long in the dance pop genre', 'structured_request': '```json\n{{\n    "query": "teenager love",\n    "filter": "and(or(eq(\\"artist\\", \\"Taylor Swi

In [18]:
query = "what is the profit after tax ? how much did it change ?"

relevant_docs = chunks_query_retriever.invoke(query)
context = [doc.page_content for doc in relevant_docs]

for i, c in enumerate(context):
        print(f"Context {i + 1}:")
        print(c)
        print("\n================\n")

Context 1:
d. Long-term Employee Benefit. Compensated absences which are not expected to occur within twelve months after the end of the period 
in which the employee renders the related services are recognized as a liability at the present value of 
the defined benefit obligation at the balance sheet date. Annual leaves can either be availed or encashed subject to restriction on the maximum accumulation of 
leaves. (viii) Taxes on Income
 a. Current tax: 
  Tax on income for the current period is determined on the basis of estimated taxable income and tax 
credits computed in accordance with the provisions of the relevant tax laws and based on the expected 
outcome of assessments / appeals. Current income tax relating to items recognized directly in equity is recognized in equity and not in the 
statement of profit and loss.


Context 2:
51
Income Tax Expense. i. Amount recognized in statement of profit and loss :- (` In Lacs)
Particulars 2024-25
A) Current Income Tax
Current Year  - 

In [19]:
# Check what metadata your documents actually have
sample_docs = vectorstore.similarity_search("section_type", k=2, 
                                            namespace=doc_namespace)
if sample_docs:
    print("Sample document metadata:", sample_docs[0].metadata)

Sample document metadata: {}


### Using Pincone native model to create index & search on it

In [23]:
docs[0]

Document(metadata={}, page_content='JKAGL: SECTL: SE: 2025  Date: 11th August 2025 \nBSE Ltd. Department of Corporate Services   \n25th Floor, Phiroze Jeejeebhoy Towers \nDalal Street, Mumbai-400001 \nScrip Code: 536493 \nThrough: BSE Listing Centre \nDear Sir/ Madam, \nRe: Regulation 34 of SEBI (Listing Obligations and Disclosure Requirements), \nRegulations, 2015 \n- Annual Report for the financial year 2024-25\n- Notice of 25th Annual General Meeting\n ----------------------------------------------------------------------------------------------------------------- \nPursuant to Regulation 34 of SEBI (Listing Obligations and Disclosure \nRequirements), Regulations, 2015, we enclosed herewith Annual Report of the \nCompany for the financial year 2024-2025 and the Notice of 25 th Annual General \nMeeting of the Company scheduled to be held on Friday, 5 th September 2025 at 3:00 \nP.M.')

In [None]:
import time

class PineConeNative:
    def __init__(self,index_name,namespace,docs) -> None:
        self.index_name = index_name
        self.namespace = namespace
        self.docs = docs
        if not pc.has_index(index_name):
            pc.create_index_for_model(
                name=index_name,
                cloud="aws",
                region="us-east-1",
                embed={
                    "model":"llama-text-embed-v2",
                    "field_map":{"text": "chunk_text"}
                }
            )
        self.dense_index = pc.Index(index_name)

    def summarize_and_categorize(self,text_content):
        llm = ChatMistralAI(
            model="ministral-8b-2410",
            mistral_api_key=mistral_key,
            temperature=0
        )
        output_schema = {
            "name": "SummarizeAndCategorize",
            "description": "Summarizes given content and suggests suitable & very specific category.",
            "parameters": {
                "type": "object",
                "properties": {
                    "text_response": {"type": "string"},
                    "category": {"type": "string"}
                },
                "required": ["text_response", "category"]
            }
        }

        # Create a structured instance of the LLM
        structured_llm = llm.with_structured_output(output_schema)
        prompt = (
            "Summarize the following text in one paragraph and provide a suitable category for it.\n"
            "Return your answer as a dictionary with keys 'text_response' for the summary and 'category' for the category.\n\n"
            f"Text:\n{text_content}\n"
        )        
        return structured_llm.invoke(prompt)

    def transform_records(self,threshold_token_size=1024):
        transformed_docs = []
        for idx,doc in enumerate(self.docs):
            # Need to apply summarization strategy both query + document
            summarized_docs = self.summarize_and_categorize(doc.page_content)
            tmp_docs = {"_id":str(idx),
                        "chunk_text":summarized_docs['text_response'] if len(summarized_docs['text_response'])>0 else doc.page_content[:1000],
                        "category":summarized_docs['category'] if len(summarized_docs['category'])>0 else "General"}
            transformed_docs.append(tmp_docs)
        print(len(transformed_docs),type(transformed_docs[0]),transformed_docs[0])
        return transformed_docs


    def add_documents_in_batches(self,batch_size=5):
        self.docs = self.transform_records()
        total_docs = len(self.docs)
    
        for i in range(0, total_docs, batch_size):
            batch = self.docs[i:i + batch_size]
            print(f"Processing batch {i//batch_size + 1}/{(total_docs + batch_size - 1)//batch_size}")
            try:
                self.dense_index.upsert_records(self.namespace, batch)
                time.sleep(2)
            except Exception as e:
                print(f"Error processing batch {i//batch_size + 1}: {e}")
                # Exponential backoff
                time.sleep(5 * (2 ** (i//batch_size)))
                continue
        
        return self.dense_index


In [38]:
native_index_name="finance-pdf-index-native"
doc_namespace_native = "pdf-semantic-search-native"
pinecone_obj = PineConeNative(native_index_name,doc_namespace_native,docs)
dense_index = pinecone_obj.add_documents_in_batches(batch_size=10)

90 <class 'dict'> {'_id': '0', 'chunk_text': 'BSE Ltd. has submitted its annual report for the financial year 2024-25 and a notice for its 25th Annual General Meeting, scheduled for September 5, 2025.', 'category': 'Corporate Announcements'}
Processing batch 1/9
Processing batch 2/9
Processing batch 3/9
Processing batch 4/9
Processing batch 5/9
Processing batch 6/9
Processing batch 7/9
Processing batch 8/9
Processing batch 9/9


#### Update + Filter

In [None]:
# Define the query
query = "what is the profit after tax ? how much did it change ?"

dense_index.update(id="46", 
             set_metadata={"category": "Director's report",}, 
             namespace=doc_namespace_native)


{}

In [52]:
filtering_conditions = {
        "$or": [
            {"category": {"$in": ["Financial Statements","Director's report", "Accounting","Financial Report"]}},
            {"score": {"$gt":0.20}},
        ]
    }

filtered_results = dense_index.search(
    namespace=doc_namespace_native, 
    query={
        "inputs": {"text":query}, 
        "top_k": 5,
        "filter": filtering_conditions,
    },
    fields=["category", "score"]
)

print(filtered_results)

{'result': {'hits': [{'_id': '72',
                      '_score': 0.2803533375263214,
                      'fields': {'category': 'Accounting'}},
                     {'_id': '73',
                      '_score': 0.254242479801178,
                      'fields': {'category': 'Accounting'}},
                     {'_id': '89',
                      '_score': 0.22509200870990753,
                      'fields': {'category': 'Financial Report'}},
                     {'_id': '54',
                      '_score': 0.22414502501487732,
                      'fields': {'category': 'Financial Report'}},
                     {'_id': '46',
                      '_score': 0.2227933555841446,
                      'fields': {'category': "Director's report"}}]},
 'usage': {'embed_total_tokens': 16, 'read_units': 1}}


In [54]:
# Delete by filtering metadata
# dense_index.delete(
#     filter={
#         "category": {"$eq": "Financial Report"},
#     }
# )

In [None]:

# Search the dense index
results = dense_index.search(
    namespace=doc_namespace_native,
    query={
        "top_k": 10,
        "inputs": {
            'text': query
        }
    }
)

# Print the results
for hit in results['result']['hits']:
        print(f"id: {hit['_id']:<5} | score: {round(hit['_score'], 2):<5} | category: {hit['fields']['category']:<10} | text: {hit['fields']['chunk_text']:<50}")

id: 60    | score: 0.3   | category: Finance    | text: The text discusses various financial aspects of the company's operations, including income tax expenses, changes in liabilities, impairment testing of intangible assets, exceptional items, and revenue recognition under contracts. It provides detailed information on the company's financial performance and the factors affecting it.
id: 83    | score: 0.28  | category: Financial Statements, Accounting | text: The text provides a detailed breakdown of the income tax expense, including current income tax, deferred tax, and reconciliation of effective tax rate. It also includes information on deferred tax assets, changes in liabilities arising from financing activities, impairment testing of intangible assets, exceptional items, and revenue recognized under contracts. The text is categorized under 'Financial Statements' and 'Accounting' categories.
id: 72    | score: 0.28  | category: Accounting | text: Long-term Employee Benefit: Compe

##### re-rank search

In [41]:
# Search the dense index and rerank results
reranked_results = dense_index.search(
    namespace=doc_namespace_native,
    query={
        "top_k": 10,
        "inputs": {
            'text': query
        }
    },
    rerank={
        "model": "bge-reranker-v2-m3",
        "top_n": 10,
        "rank_fields": ["chunk_text"]
    }   
)

# Print the reranked results
for hit in reranked_results['result']['hits']:
    print(f"id: {hit['_id']}, score: {round(hit['_score'], 2)}, text: {hit['fields']['chunk_text']}, ")

id: 54, score: 0.01, text: The Company is engaged in Agri & Allied products. It has one business segment, 'Agri & Allied products'. The company reported a loss of 250.44 lacs in 2024-25 and 2,082.08 lacs in 2023-24. The earnings per equity share were -5.40 in 2024-25 and -44.90 in 2023-24. No dividends were declared. The company adopted Ind AS 116 for leases, resulting in recognizing right-of-use assets and corresponding lease liabilities. The carrying value of right-of-use assets changed from 425.62 lacs to 100.71 lacs. The company spent 5.06 lacs on Corporate Social Responsibility activities in 2024-25., 
id: 73, score: 0.0, text: Management periodically evaluates positions taken in the tax returns with respect to situations in which applicable tax regulations are subject to interpretation and establishes provisions where appropriate. Deferred tax is provided using the balance sheet approach on temporary differences at the reporting date between the tax bases of assets and liabilitie

HyDe retriever 