In [None]:
# Create a Vectore store for Niva bupa documents

In [1]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders.parsers import RapidOCRBlobParser
from langchain_community.document_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import PyMuPDFParser

file_path = "Files/goactive-brochure.pdf"
loader =  GenericLoader(
    blob_loader=FileSystemBlobLoader(
    path = "./Files/",
    glob="*.pdf",
    show_progress = True
    ),
    blob_parser=PyMuPDFParser(
        mode="page",
        extract_tables="markdown",
    ),
)


In [2]:
import re
def clean_text(text):
    """Cleans text by removing unwanted characters and excessive whitespace."""
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.,;!?-]', '', text)
    return text.strip()

In [10]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

def setup_vectorstore():
    """Creates vector store from processed PDF content."""
    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en", model_kwargs = {"device":"cuda"},encode_kwargs={'normalize_embeddings': True})
    vector_store = Chroma(
    collection_name="Insuarance",
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)
    return vector_store
vector_store = setup_vectorstore()

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
pages = []


for i, doc in enumerate(loader.lazy_load()):
    print("---------processing:",doc.metadata["source"])
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 200, chunk_overlap=30)
    chunks = text_splitter.transform_documents([doc])
    for chunk in chunks:
        chunk.page_content = clean_text(chunk.page_content)
        vector_store.add_documents(documents=[chunk], ids=[f"{i}"])
        pages.append(chunk)
        
    

len(pages)

  0%|          | 0/24 [00:00<?, ?it/s]

---------processing: Files/GoActive-Proposal-Form.pdf
---------processing: Files/GoActive-Proposal-Form.pdf
---------processing: Files/GoActive-Proposal-Form.pdf
---------processing: Files/GoActive-Proposal-Form.pdf
---------processing: Files/GoActive-Proposal-Form.pdf
---------processing: Files/GoActive-Proposal-Form.pdf


  4%|▍         | 1/24 [01:09<26:35, 69.37s/it]

---------processing: Files/heartbeat-claim-form.pdf
---------processing: Files/heartbeat-claim-form.pdf
---------processing: Files/heartbeat-claim-form.pdf
---------processing: Files/heartbeat-claim-form.pdf
---------processing: Files/heartbeat-claim-form.pdf
---------processing: Files/heartbeat-claim-form.pdf
---------processing: Files/heartbeat-claim-form.pdf
---------processing: Files/heartbeat-claim-form.pdf


  8%|▊         | 2/24 [02:20<25:49, 70.45s/it]

---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf
---------processing: Files/healthpremia brochure.pdf


 12%|█▎        | 3/24 [02:31<15:12, 43.47s/it]

---------processing: Files/healthpremia brochure.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/policy wording.pdf
---------processing: Files/

 17%|█▋        | 4/24 [06:00<36:14, 108.73s/it]

---------processing: Files/combo_hc_hr_brochure.pdf


 21%|██        | 5/24 [06:02<22:11, 70.10s/it] 

---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-health-recharge-t-and-c.pdf
---------processing: Files/nivabupa-heal

 25%|██▌       | 6/24 [07:22<22:05, 73.66s/it]

---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-document.pdf
---------processing: Files/goactive-policy-docum

 29%|██▉       | 7/24 [09:20<24:55, 87.95s/it]

---------processing: Files/NivaBupa-pre-auth-claim-form (1).pdf
---------processing: Files/NivaBupa-pre-auth-claim-form (1).pdf
---------processing: Files/NivaBupa-pre-auth-claim-form (1).pdf
---------processing: Files/NivaBupa-pre-auth-claim-form (1).pdf
---------processing: Files/NivaBupa-pre-auth-claim-form (1).pdf


 33%|███▎      | 8/24 [09:38<17:29, 65.60s/it]

---------processing: Files/health companion proposal form.pdf
---------processing: Files/health companion proposal form.pdf
---------processing: Files/health companion proposal form.pdf
---------processing: Files/health companion proposal form.pdf
---------processing: Files/health companion proposal form.pdf
---------processing: Files/health companion proposal form.pdf
---------processing: Files/health companion proposal form.pdf
---------processing: Files/health companion proposal form.pdf
---------processing: Files/health companion proposal form.pdf


 38%|███▊      | 9/24 [10:12<13:56, 55.78s/it]

---------processing: Files/health-companion-claim-form.pdf
---------processing: Files/health-companion-claim-form.pdf
---------processing: Files/health-companion-claim-form.pdf
---------processing: Files/health-companion-claim-form.pdf
---------processing: Files/health-companion-claim-form.pdf
---------processing: Files/health-companion-claim-form.pdf
---------processing: Files/health-companion-claim-form.pdf
---------processing: Files/health-companion-claim-form.pdf


 42%|████▏     | 10/24 [10:41<11:07, 47.71s/it]

---------processing: Files/hp-single-sheeter.pdf
---------processing: Files/hp-single-sheeter.pdf


 46%|████▌     | 11/24 [10:45<07:26, 34.35s/it]

---------processing: Files/goactive-brochure.pdf
---------processing: Files/goactive-brochure.pdf


 50%|█████     | 12/24 [11:03<05:51, 29.32s/it]

---------processing: Files/health-companion-claim-form (1).pdf
---------processing: Files/health-companion-claim-form (1).pdf
---------processing: Files/health-companion-claim-form (1).pdf
---------processing: Files/health-companion-claim-form (1).pdf
---------processing: Files/health-companion-claim-form (1).pdf
---------processing: Files/health-companion-claim-form (1).pdf
---------processing: Files/health-companion-claim-form (1).pdf
---------processing: Files/health-companion-claim-form (1).pdf


 54%|█████▍    | 13/24 [11:32<05:21, 29.20s/it]

---------processing: Files/hr-single-sheeter.pdf
---------processing: Files/hr-single-sheeter.pdf


 58%|█████▊    | 14/24 [11:35<03:32, 21.29s/it]

---------processing: Files/heartbeat-proposal-form.pdf
---------processing: Files/heartbeat-proposal-form.pdf
---------processing: Files/heartbeat-proposal-form.pdf
---------processing: Files/heartbeat-proposal-form.pdf
---------processing: Files/heartbeat-proposal-form.pdf
---------processing: Files/heartbeat-proposal-form.pdf
---------processing: Files/heartbeat-proposal-form.pdf
---------processing: Files/heartbeat-proposal-form.pdf
---------processing: Files/heartbeat-proposal-form.pdf
---------processing: Files/heartbeat-proposal-form.pdf


 62%|██████▎   | 15/24 [12:12<03:54, 26.10s/it]

---------processing: Files/mbhr-proporsal-form.pdf
---------processing: Files/mbhr-proporsal-form.pdf
---------processing: Files/mbhr-proporsal-form.pdf
---------processing: Files/mbhr-proporsal-form.pdf
---------processing: Files/mbhr-proporsal-form.pdf
---------processing: Files/mbhr-proporsal-form.pdf
---------processing: Files/mbhr-proporsal-form.pdf
---------processing: Files/mbhr-proporsal-form.pdf


 67%|██████▋   | 16/24 [12:43<03:39, 27.42s/it]

---------processing: Files/NivaBupa-pre-auth-claim-form.pdf
---------processing: Files/NivaBupa-pre-auth-claim-form.pdf
---------processing: Files/NivaBupa-pre-auth-claim-form.pdf
---------processing: Files/NivaBupa-pre-auth-claim-form.pdf
---------processing: Files/NivaBupa-pre-auth-claim-form.pdf


 71%|███████   | 17/24 [13:01<02:51, 24.56s/it]

---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Fil

 75%|███████▌  | 18/24 [15:10<05:35, 55.96s/it]

---------processing: Files/nivabupa-health-pulse-t-and-c.pdf
---------processing: Files/HPR-RETAIL-AppForm.pdf
---------processing: Files/HPR-RETAIL-AppForm.pdf
---------processing: Files/HPR-RETAIL-AppForm.pdf
---------processing: Files/HPR-RETAIL-AppForm.pdf
---------processing: Files/HPR-RETAIL-AppForm.pdf
---------processing: Files/HPR-RETAIL-AppForm.pdf
---------processing: Files/HPR-RETAIL-AppForm.pdf
---------processing: Files/HPR-RETAIL-AppForm.pdf


 79%|███████▉  | 19/24 [15:37<03:57, 47.44s/it]

---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-wording.pdf
---------processing: Files/health-companion-policy-word

 83%|████████▎ | 20/24 [16:22<03:06, 46.71s/it]

---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia Policy Wording.pdf
---------processing: Files/Health Premia

 88%|████████▊ | 21/24 [19:12<04:10, 83.58s/it]

---------processing: Files/HP-Retail-AppForm.pdf
---------processing: Files/HP-Retail-AppForm.pdf
---------processing: Files/HP-Retail-AppForm.pdf
---------processing: Files/HP-Retail-AppForm.pdf
---------processing: Files/HP-Retail-AppForm.pdf
---------processing: Files/HP-Retail-AppForm.pdf


 92%|█████████▏| 22/24 [19:40<02:13, 66.88s/it]

---------processing: Files/healthcompanionbrochure.pdf
---------processing: Files/healthcompanionbrochure.pdf


 96%|█████████▌| 23/24 [19:43<00:47, 47.71s/it]

---------processing: Files/heartbeat brochure.pdf
---------processing: Files/heartbeat brochure.pdf


100%|██████████| 24/24 [19:59<00:00, 49.98s/it]


12467