In [1]:
import nest_asyncio
nest_asyncio.apply()

In [2]:
from llama_index.core.schema import MetadataMode

## Extract

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_cloud_services import LlamaParse

parser = LlamaParse(result_type="markdown") # type: ignore

file_extractor = {".pdf": parser}
# documents = SimpleDirectoryReader("./test_docs2").load_data()
documents = SimpleDirectoryReader(
    "./test_docs2",
    file_extractor=file_extractor # type: ignore
).load_data()

Started parsing the file under job_id e72195c8-9ddd-4afe-b941-850ead68c4e3


## Transform

In [4]:
for doc in documents:
    doc.text_template = "Metadata:\n{metadata_str}\n---\nContent:\n{content}"

    if "page_label" not in doc.excluded_embed_metadata_keys:
        doc.excluded_embed_metadata_keys.append("page_label")

## Metadata Extraction

In [7]:
# from langchain_openai import OpenAI
# from langchain_ollama import OllamaLLM
# from llama_index.llms.groq import Groq
from llama_index.llms.ollama import Ollama
import os
from dotenv import load_dotenv

load_dotenv()

llm = Ollama(
    model="mistral",
    request_timeout=300
)

In [None]:
from llama_index.core.extractors import TitleExtractor, QuestionsAnsweredExtractor
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Document
import asyncio

text_splitter = SentenceSplitter(
    separator=" ", chunk_size=1024, chunk_overlap=128
)

title_extractor = TitleExtractor(llm=llm, nodes=5)
qa_extractor = QuestionsAnsweredExtractor(llm=llm, questions=3)

from llama_index.core.ingestion import IngestionPipeline

pipeline = IngestionPipeline(
    transformations=[
        text_splitter,
        title_extractor,
        qa_extractor
    ]
)


nodes = await pipeline.arun(
    documents=documents,
    in_place=True,
    show_progress=True,
    # num_workers=16
)
# nodes = pipeline.run(
#     documents=[Document.example()],
#     in_place=True,
#     show_progress=True,
# )

Parsing nodes:   0%|          | 0/39 [00:00<?, ?it/s]

100%|██████████| 39/39 [06:41<00:00, 10.30s/it]
100%|██████████| 81/81 [28:27<00:00, 21.08s/it]


In [9]:
nodes[0].model_dump()
# print(nodes[0].get_content(metadata_mode=MetadataMode.EMBED))

{'id_': 'bae5d984-9b34-41bf-a3b3-2a58cea6e51f',
 'embedding': None,
 'metadata': {'file_path': '/home/ishant-gupta/Desktop/HackRx-Project/test_docs2/1_rag_doc.pdf',
  'file_name': '1_rag_doc.pdf',
  'file_type': 'application/pdf',
  'file_size': 1329008,
  'creation_date': '2025-08-03',
  'last_modified_date': '2025-08-01',
  'document_title': ' "Regulations for AYUSH Day Care Centers with Cashless Facility under HDFC ERGO General Insurance Policy"',
  'questions_this_excerpt_can_answer': "1. What is the insurance policy that covers all insured persons and what are the terms, conditions, and exclusions governing this policy?\n2. What defines an AYUSH Hospital according to the document? What criteria must an AYUSH Hospital comply with in order to be eligible for this insurance coverage?\n3. How does the policy define 'any one illness' and 'accident' in relation to its terms, conditions, and exclusions? Additionally, what does the policy mean by a 'continuous Period of illness'?"},
 'exc

## Generate embeddings

In [2]:
from llama_index.embeddings.ollama import OllamaEmbedding

embeddings_model = OllamaEmbedding(
    model_name="bge-m3"
)

test_embed = embeddings_model.get_text_embedding("Hello World")
print(test_embed)

[-1.505329966545105, 0.7557322978973389, -0.4548438489437103, 0.5922243595123291, -0.8631601333618164, -1.1510695219039917, -1.483644723892212, -1.0629169940948486, -0.10884019732475281, 0.03572480380535126, -0.07141947001218796, 0.28156158328056335, 0.6840705871582031, -0.22174079716205597, 0.42930278182029724, -0.21458585560321808, 0.5189207196235657, -0.7113049030303955, -1.0071320533752441, -0.6988881230354309, -1.1455819606781006, -0.1445048302412033, 0.7888111472129822, -0.5613049864768982, 0.32356154918670654, 1.3794094324111938, -0.8641065359115601, -0.02112520858645439, -0.023562710732221603, -1.5674747228622437, 1.1761631965637207, 2.094935417175293, -0.2038499116897583, -1.3954715728759766, -0.7186660766601562, -1.054764747619629, 0.07953530550003052, -0.40492767095565796, -1.8114339113235474, 0.52800053358078, 0.10926657915115356, 0.5004262328147888, 0.35009267926216125, -1.6471726894378662, 0.47572755813598633, -1.3718055486679077, -1.2184059619903564, -0.06294786185026169

## Creating vector store index and persisting in chromadb 

In [11]:
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
import chromadb

chroma_client = chromadb.PersistentClient(path="./chroma_db")
chroma_collection = chroma_client.get_or_create_collection(name="rag_embeddings")

vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex(
    nodes=nodes,
    embed_model=embeddings_model,
    storage_context=storage_context
)