In [4]:
!pip install langchain langchain-openai langchain-huggingface "unstructured[pdf,tesseract,easyocr]" chromadb tiktoken langchain_community

Collecting langchain-openai
  Downloading langchain_openai-0.3.21-py3-none-any.whl.metadata (2.3 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.2.0-py3-none-any.whl.metadata (941 bytes)
Collecting chromadb
  Downloading chromadb-1.0.12-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting langchain_community
  Downloading langchain_community-0.3.24-py3-none-any.whl.metadata (2.5 kB)
Collecting unstructured[easyocr,pdf,tesseract]
  Downloading unstructured-0.17.2-py3-none-any.whl.metadata (24 kB)
Collecting langchain-core<1.0.0,>=0.3.58 (from langchain)
  Downloading langchain_core-0.3.64-py3-none-any.whl.metadata (5.8 kB)
Collecting filetype (from unstructured[easyocr,pdf,tesseract])
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured[easyocr,pdf,tesseract])
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured[eas

In [8]:
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 35 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.8 [186 kB]
Fetched 186 kB in 1s (142 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126111 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.8_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.8) ...
Setting up poppler-utils (22.02.0-2ubuntu0.8) ...
Processing triggers for man-db (2.10.2-1) ...


In [16]:
from unstructured.partition.pdf import partition_pdf
from bs4 import BeautifulSoup
import re
from langchain.schema import Document
from langchain_community.vectorstores import Chroma
from langchain.schema.runnable import RunnablePassthrough
from langchain_huggingface import HuggingFaceEmbeddings

class GenerateDocs:
    def __init__(self, pdfPath):
        self.path = pdfPath
        self.name = pdfPath

    def ParsePDF(self):

        self.chunks = partition_pdf(
            filename=self.path,
            infer_table_structure=True,            # extract tables
            strategy="hi_res",                     # mandatory to infer tables
            extract_image_block_types=["Image", "Table"],   # Add 'Table' to list to extract image of tables
            # image_output_dir_path=output_path,   # if None, images and tables will saved in base64
            extract_image_block_to_payload=True,   # if true, will extract base64 for API usage
            chunking_strategy="by_title",          # or 'basic', by_title
            max_characters=10000,                  # defaults to 500
            combine_text_under_n_chars=2000,       # defaults to 0
            new_after_n_chars=6000,
        )

    def GetItems(self):
        self.resChunks = []
        self.resElms = []

        for chunk in self.chunks:
            elements = chunk.metadata.orig_elements
            resList=[]
            for elm in elements:
                # print(type(elm))
                # continue
                if 'Table' not in str(type(elm)):
                    #print(elm.to_dict())
                    elmDict = elm.to_dict()
                    text = [elmDict['text'] if 'text' in elmDict else '']
                    links = [elmDict['metadata']['links'] if 'links' in elmDict['metadata'] else []]
                    pageNumber = [elmDict['metadata']['page_number'] if 'page_number' in elmDict['metadata'] else []]
                    item = {"text": text, "links" : links, "pageNumber" : pageNumber}
                    resList.append(item)
                    self.resElms.append(item)
                else:
                    elmDict = elm.to_dict()
                    text = [elmDict['text'] if 'text' in elmDict else '']
                    tableHtml = [elmDict['metadata']['text_as_html'] if 'text_as_html' in elmDict['metadata'] else '']
                    links = [elmDict['metadata']['links'] if 'links' in elmDict['metadata'] else []]
                    pageNumber = [elmDict['metadata']['page_number'] if 'page_number' in elmDict['metadata'] else []]
                    item = {"tableHtml": tableHtml, "links" : links, "pageNumber" : pageNumber}
                    resList.append(item)
                    self.resElms.append(item)
            self.resChunks.append(resList)

        return self.resElms

    def clean_text(self, text):
        # Normalize whitespace and remove unwanted characters
        return re.sub(r'\s+', ' ', text.strip().replace('\xa0', ' '))

    def parse_table_to_paragraph(self, table_html):
        soup = BeautifulSoup(table_html, "html.parser")
        rows = soup.find_all("tr")
        lines = []

        for tr in rows:
            cols = tr.find_all(["td", "th"])
            col_texts = [self.clean_text(col.get_text()) for col in cols]

            if col_texts:
                line = " | ".join(col_texts)
                lines.append(line)

        paragraph = "\n".join(lines)
        return paragraph

    def GetDocs(self):
        self.ParsePDF()
        resElms = self.GetItems()
        completeDoc = ''
        docs = []
        smallDocs = []

        for item in resElms:
            if 'text' in item:
                completeDoc += item['text'][0]+ '\n'
            else:
                doc = Document(
                  page_content=completeDoc,
                  metadata = {"Policy": self.name}
                )
                docs.append(doc)
                #Extract Table data
                table = self.parse_table_to_paragraph(item['tableHtml'][0])
                nColumns = len(table.split('\n')[0].split('|'))
                columns = ['Important Questions', 'Answers', 'Why This Matters']

                columns4 = ['Common Medical Event' , 'Services You May Need' , 'Member out of pocket, Limitations, Exceptions', 'Other Important Information']

                rows = table.split('\n')

                for row in rows:
                    rowColsData = row.split('|')
                    if len(rowColsData) == 4:
                        metadata = {"event": rowColsData[0], "services": rowColsData[1]}
                        data = {columns4[0]: columns4[0], columns4[1]: rowColsData[1], columns4[2]: rowColsData[2], columns4[3]: rowColsData[3]}
                        pageContent = "\n".join(f"{key}: {value}" for key, value in data.items())
                    elif len(rowColsData) == 3:
                        metadata = {"event": rowColsData[0]}
                        data = {columns[0]: rowColsData[0], columns[1]: rowColsData[1], columns[2]: rowColsData[2]}
                        pageContent = "\n".join(f"{key}: {value}" for key, value in data.items())
                    else :
                        print(rowColsData)
                        #Examples data
                        if len(rowColsData) == 1 and rowColsData[0] == "Cost Sharing" :
                            #start new Doc
                            doc = Document(
                                page_content = '\n'.join(smallDocs),
                                metadata = {"Policy": self.name}
                            )
                            docs.append(doc)
                            continue
                        else:
                            x = rowColsData[0] + ':' +(rowColsData[1] if len(rowColsData) > 1 else '')
                            smallDocs.append(x)
                            continue

                    metadata['Policy'] = self.name
                    doc = Document(
                        page_content=pageContent,
                        metadata=metadata
                    )
                    docs.append(doc)
                    completeDoc = ''
        return docs


In [10]:
import os
directory_path = "Data"  # change this to your target folder

files = [f for f in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, f))]
print(files)

["America's_Choice_7350_Copper_SOB.pdf", "America's_Choice_2500_Gold_SOB.pdf", "America's_Choice_5000_HSA_SOB.pdf", "America's_Choice_5000_Bronze_SOB.pdf"]


In [11]:
from langchain.schema import Document
import json

def save_documents_to_file(doc_list, file_path):
    with open(file_path, 'w', encoding='utf-8') as f:
        for doc in doc_list:
            json.dump({
                "page_content": doc.page_content,
                "metadata": doc.metadata
            }, f)
            f.write('\n')  # JSON Lines format

In [17]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:

allDocs = []
for filePath in files:

  genDocs = GenerateDocs("Data/" + filePath)
  # Usage
  docs = genDocs.GetDocs()
  save_documents_to_file(docs, filePath +'.json')

  allDocs.append(docs)
  DB_PATH = "chroma_db"
  vectorstore = Chroma.from_documents(
          documents=docs,
          embedding=embeddings,
          persist_directory=DB_PATH
  )
  print("---------------------------------------")
  print(f"Ingestion complete. Vector store created at: {DB_PATH}")
  print(f"Total chunks stored: {vectorstore._collection.count()}")
  print("---------------------------------------")




['Preferred brand drugs ', ' America’s Pharmacy Source']
['Non-preferred brand drugs ', ' Not covered']
['']
['Cost Sharing']
['Deductibles ', ' $3,500']
['Copayments ', ' $100']
['Coinsurance ', ' $0']
['What isn’t covered']
['Limits or exclusions ', ' $0']
['The total Peg would pay is ', ' $3,600']
['Cost Sharing']
['Deductibles ', ' $3,500']
['Copayments ', ' $0']
['Coinsurance ', ' $0']
['What isn’t covered']
['Limits or exclusions ', ' $0']
['The total Mia would pay is ', ' $3,500']
---------------------------------------
Ingestion complete. Vector store created at: chroma_db
Total chunks stored: 40
---------------------------------------
['Cost Sharing']
['Deductibles ', ' $2,500']
['Copayments ', ' $40']
['Coinsurance ', ' $0']
['What isn’t covered']
['Limits or exclusions ', ' $0']
['The total Peg would pay is ', ' $2,540']
['Cost Sharing ', ' ']
['Deductibles ', ' $2,500']
['Copayments ', ' $40']
['Coinsurance ', ' $0']
['What isn’t covered']
['Limits or exclusions ', ' $0']
[

**Retrieval Code**

In [19]:
DB_PATH = "chroma_db"

In [20]:
# 1. Load the existing vector store
vectorstore = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})

  vectorstore = Chroma(persist_directory=DB_PATH, embedding_function=embeddings)


In [82]:
from langchain.prompts import ChatPromptTemplate

# 2. Prompt for answering the question using retrieved context
ANSWER_PROMPT_TEMPLATE = """
You are a helpful customer support assistant. Use the following context to answer the question as accurately as possible.

If you cannot find a clear answer, say: "I'm sorry, I don't have that information in my documentation."

Be concise and do not make up information.

Context:
{context}

Question:
{question}

Answer:
"""
ANSWER_PROMPT = ChatPromptTemplate.from_template(ANSWER_PROMPT_TEMPLATE)

In [83]:
!pip install langchain_groq



In [84]:
#3. LLM
from langchain_groq import ChatGroq

GROQ_API_KEY = "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"
llm = ChatGroq(temperature=0, groq_api_key= GROQ_API_KEY, model_name="llama-3.1-8b-instant")


In [85]:
llm.invoke("Hi")

AIMessage(content="It's nice to meet you. Is there something I can help you with or would you like to chat?", additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 23, 'prompt_tokens': 36, 'total_tokens': 59, 'completion_time': 0.030666667, 'prompt_time': 0.001674674, 'queue_time': 0.214963797, 'total_time': 0.032341341}, 'model_name': 'llama-3.1-8b-instant', 'system_fingerprint': 'fp_55d70a61e4', 'finish_reason': 'stop', 'logprobs': None}, id='run--d8826859-c8dc-4c2e-b488-c48e8e1e1b75-0', usage_metadata={'input_tokens': 36, 'output_tokens': 23, 'total_tokens': 59})

In [86]:
from langchain.schema.output_parser import StrOutputParser

answer_chain = (
    RunnablePassthrough.assign(
        context=lambda x: retriever.get_relevant_documents(x["question"])
    )
    | ANSWER_PROMPT
    | llm
    | StrOutputParser()
)

In [87]:
 question = "When does copayment or coinsurance apply?"

 streaming_response = answer_chain.invoke({"question": question})

In [88]:
print(streaming_response)

According to the provided documents, a copayment or coinsurance may apply even if you haven't yet met the deductible amount.


In [93]:
response = answer_chain.invoke({"question": "What is meant by the deductible in the policy?"})
print(response)

The deductible in the policy refers to the amount that must be paid out of pocket by the policyholder before the insurance plan starts covering certain costs or services.


In [90]:
retriever.get_relevant_documents("What is the deductible in this policy?")

[Document(metadata={'Policy': "Data/America's_Choice_7350_Copper_SOB.pdf"}, page_content='This plan covers some items and services even if you haven’t yet met the deductible amount.\nBut a copayment or coinsurance may apply. For example, this plan covers certain preventive\n1 of 6\n'),
 Document(metadata={'Policy': "Data/America's_Choice_2500_Gold_SOB.pdf"}, page_content='This plan covers some items and services even if you haven’t yet met the deductible amount.\nBut a copayment or coinsurance may apply. For example, this plan covers certain preventive\n1 of 6\n'),
 Document(metadata={'Policy': "Data/America's_Choice_5000_HSA_SOB.pdf"}, page_content='This plan covers some items and services even if you haven’t yet met the deductible amount.\nBut a copayment or coinsurance may apply. For example, this plan covers certain preventive\n1 of 6\n'),
 Document(metadata={'Policy': "Data/America's_Choice_5000_Bronze_SOB.pdf"}, page_content='This plan covers some items and services even if you h

In [91]:
response = answer_chain.invoke({"question": "What is the overall deductible?"})
print(response)

I can help you find the answer to that question. 

Based on the provided documents, the overall deductible varies depending on the policy. Here are the deductibles for each policy:

- America's Choice 7350 Copper SOB: $7,350/individual or $14,700/family
- America's Choice 2500 Gold SOB: $2,500/individual or $5,000/family
- America's Choice 5000 HSA SOB: $5,000/individual or $10,000/family
- America's Choice 5000 Bronze SOB: $5,000/individual or $10,000/family

Please note that these deductibles are specific to each policy and may not be applicable to other policies. If you have any further questions or would like more information, please let me know.


In [92]:
response = answer_chain.invoke({"question": "What is Indian policy?"})
print(response)

I'm sorry, I don't have that information in my documentation.


In [94]:
response = answer_chain.invoke({"question": "What is Indian policy?"})
print(response)

I'm sorry, I don't have that information in my documentation.


In [95]:
response = answer_chain.invoke({"question": "Give Details on Prescription Drug Coverage"})
print(response)

Based on the provided context, I can give you some details on prescription drug coverage.

For specialty drugs, it is mentioned that they are not covered, with 0% of the plan allowable and deductible.

For generic drugs, it is mentioned that more information about prescription drug coverage is available at www.myfreepharmacy.com. Additionally, it is stated that America's Pharmacy Source is a covered option for generic drugs.

It's worth noting that there is no clear information on the overall prescription drug coverage, such as copays, coinsurance, or deductibles, for all types of prescription drugs.

If you need more specific information, I recommend checking the provided website (www.myfreepharmacy.com) or contacting the plan administrator directly.


In [96]:
!zip -r chroma_db.zip chroma_db/

  adding: chroma_db/ (stored 0%)
  adding: chroma_db/chroma.sqlite3 (deflated 61%)
  adding: chroma_db/bac2c2c1-2be6-4548-a9b5-943bc0ea113a/ (stored 0%)
  adding: chroma_db/bac2c2c1-2be6-4548-a9b5-943bc0ea113a/length.bin (deflated 100%)
  adding: chroma_db/bac2c2c1-2be6-4548-a9b5-943bc0ea113a/header.bin (deflated 61%)
  adding: chroma_db/bac2c2c1-2be6-4548-a9b5-943bc0ea113a/data_level0.bin (deflated 100%)
  adding: chroma_db/bac2c2c1-2be6-4548-a9b5-943bc0ea113a/link_lists.bin (stored 0%)
