In [2]:
!pip install langchain
!pip install pypdf
!pip install unstructured
!pip install -U langchain-community
!pip install faiss-cpu

Collecting pypdf
  Downloading pypdf-5.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.3.1-py3-none-any.whl (302 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.3.1
Collecting unstructured
  Downloading unstructured-0.17.0-py3-none-any.whl.metadata (24 kB)
Collecting filetype (from unstructured)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting python-magic (from unstructured)
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting emoji (from unstructured)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting dataclasses-json (from unstructured)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting python-iso639 (from unstructured)
  Downloading python_iso639-2025.2.18-py3-none-any.whl.metadata (14 kB)
Collecting langdetect 

In [195]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.schema import BaseRetriever
from langchain import PromptTemplate

class CustomQA():

    """
    Custom QA chain.
    Checks if the retriever is able to match any docs.
    If so, the LLM is called with those docs as context; otherwise no LLM
    call is made and a fixed fallback message is returned.
    """
    # The annotation below is quoted so it is not evaluated when the class is
    # defined; only the name needs to exist for type checkers.
    def __init__(self, llm_model, retriever: "BaseRetriever"):
        """
        Builds the prompt and the stuff-documents chain.
        input:
          llm_model: model object handed to create_stuff_documents_chain.
          retriever: retriever queried for context documents in run().
        """

        # NOTE: this text is sent to the model as written, including its
        # wording and indentation.
        template = """
          System: You are a intelligent question answering bot, that provides short answers to questions.

          Instructions:
          1. Make Use the following context to to answer the user query.
          2. Make sure the answer is short and brief.
          3. Use the context to generate the answer.

          Context: {context}

          Human: {inputs}

          Assistant:"""

        prompt_template = PromptTemplate(
              input_variables=["inputs", "context"],
              template=template,
        )

        self.qa_chain = create_stuff_documents_chain(llm=llm_model, prompt=prompt_template)
        self.retriever = retriever
        self.llm_model = llm_model

    def run(self, inputs):
        """
        Answers a single question.
        input:
          inputs: dict = must contain the question under the "query" key.
        output:
          dict with "result" (the chain's answer, or the fallback message when
          nothing was retrieved) and "source_documents" (the retrieved docs).
        raises:
          KeyError: if inputs has no "query" key.
        """

        question = inputs["query"]
        # Retrievers are Runnables: `invoke` replaces the deprecated
        # `get_relevant_documents`.
        retrieved_docs = self.retriever.invoke(question)

        if not retrieved_docs:
            # Nothing to ground an answer on, so do not call the LLM.
            print("No matching documents found. Skipping LLM call.")
            return {"result": "I couldn't find any information about that.", "source_documents": []}

        answer = self.qa_chain.invoke({
            "inputs": question,
            "context": retrieved_docs,
        })
        return {"result": answer, "source_documents": retrieved_docs}



In [196]:
import getpass
import json
import os

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader, UnstructuredMarkdownLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI

# Ask for the key only when the environment does not already provide one, so a
# valid OPENAI_API_KEY is never overwritten and no secret is stored in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")


class RAG:
    """
    Retrieval-augmented QA pipeline over a single Markdown document.

    On construction the document is loaded, split into chunks, embedded into a
    FAISS vector store and wrapped in a score-thresholded retriever. `run`
    then answers every query from the query file and stores the results as JSON.
    """

    def __init__(self, doc_file, query_file, score_threshold: float = 0.2, model_name: str = "gpt-4o"):
        """
        input:
          doc_file: string = path to the Markdown document to index.
          query_file: string = path to the JSON file holding the queries.
          score_threshold: float = "score_threshold" given to the retriever's search_kwargs.
          model_name: string = model name given to ChatOpenAI.
        raises:
          FileNotFoundError: if doc_file or query_file does not exist.
        """
        if not self.check_file_exists(doc_file):
            raise FileNotFoundError(f"File not found: {doc_file}")

        if not self.check_file_exists(query_file):
            raise FileNotFoundError(f"File not found: {query_file}")

        self.document = UnstructuredMarkdownLoader(doc_file).load()
        self.chunks = self.split_document(self.document)
        self.vector_store = self.create_vector_store(self.chunks)
        # load_json_file is a method of this class; calling it as a bare name
        # only works if a global of the same name happens to exist.
        self.queries = self.load_json_file(query_file)
        self.results = []
        self.retriever = self.vector_store.as_retriever(
            search_type="similarity_score_threshold",
            search_kwargs={"score_threshold": score_threshold},
        )
        self.llm = ChatOpenAI(model=model_name)
        self.qa = CustomQA(llm_model=self.llm, retriever=self.retriever)

    def check_file_exists(self, file_path):
        """
        output:
          bool = True when file_path exists on disk.
        """
        return os.path.exists(file_path)

    def split_document(self, document, chunk_size: int = 500, chunk_overlap: int = 0):
        """
        takes in the loaded document(s) and splits them into chunks of text.
        input:
          document: list = documents as returned by the loader's load().
          chunk_size: integer = Number of chars in a single chunk.
          chunk_overlap: integer = Number of chars that are borrowed from the previous chunk.
        output:
          texts: list = list of chunks of text.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        return text_splitter.split_documents(document)

    def create_vector_store(self, chunks):
        """
        takes in the chunks of text and creates a FAISS vector store.
        The embedding model used is all-MiniLM-L6-v2.
        input:
          chunks: list = list of chunks of text.
        output:
          db: FAISS = vector store.
        """
        embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
        return FAISS.from_documents(chunks, embeddings)

    def get_answer(self, query):
        """
        takes in a query and returns the answer.
        input:
          query: string = query to be answered.
        output:
          result: dict = answer under "result" and retrieved docs under "source_documents".
        """
        return self.qa.run({"query": query})

    def load_json_file(self, file_path):
        """
        reads and parses a JSON file.
        input:
          file_path: string = path to the JSON file.
        output:
          the parsed data, or None (after printing an error) when the file is
          missing or is not valid JSON.
        """
        try:
            # Explicit encoding: do not depend on the platform's default.
            with open(file_path, 'r', encoding="utf-8") as file:
                return json.load(file)
        except FileNotFoundError:
            print(f"Error: File '{file_path}' not found.")
            return None
        except json.JSONDecodeError:
            print(f"Error: Invalid JSON format in '{file_path}'.")
            return None

    def store_json_file(self, output_path="results.json"):
        """
        stores the results (a list of dictionaries) in a JSON file.
        input:
          output_path: string = destination file, "results.json" by default.
        """
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(self.results, f, indent=4)

    def run(self):
        """
        runs the RAG pipeline on the list of queries.
        stores the results in a JSON file.
        output:
          list of dicts = each query extended with "answer" and
          "source_documents" (id and content of every retrieved chunk).
        """
        results = []
        # load_json_file returns None on failure; treat that as "no queries".
        for query in self.queries or []:
            result = self.get_answer(query["text"])
            # Build a new record so the loaded queries are left untouched and
            # re-running stays idempotent.
            results.append({
                **query,
                "answer": result["result"],
                "source_documents": [
                    {"id": doc.id, "content": doc.page_content}
                    for doc in result["source_documents"]
                ],
            })

        self.results = results
        self.store_json_file()
        return self.results




In [197]:
# Build the pipeline from the Markdown document and the JSON query file.
rag_agent = RAG(doc_file="medicare_comparison.md", query_file="queries.json")


In [198]:
# Answer every loaded query; run() also writes the results to results.json.
res = rag_agent.run()

In [199]:
# Print each answered query followed by the source chunks returned with it.
for entry in res:
    print(f"question: {entry['text']}")
    print(f"answer: {entry['answer']}")
    print("**********************************")
    print("Source documents:")
    for source in entry['source_documents']:
        print("---------------------------------------------")
        print(f"documentID: {source['id']}")
        print(f"page content: {source['content']}")
        print("\n")

    print("==============================================\n")

quesiton: Can I see any doctor with Original Medicare?
answer: Yes, you can see any doctor or hospital that accepts Medicare anywhere in the U.S.
**********************************
Source documents:
---------------------------------------------
documentID: 3c5f0cf1-58e4-4f0b-b425-8843032dbde8
page content: Compare Original Medicare & Medicare Advantage

Doctor & hospital choice

Original Medicare

You can go to any doctor or hospital that takes Medicare, anywhere in the U.S.

In most cases you don’t need a referral to see a specialist.

Medicare Advantage

In many cases, you can only use doctors and other providers who are in the plan’s network and service area (for non-emergency care).

You may need to get a referral to see a specialist.

Cost

Original Medicare


---------------------------------------------
documentID: c1e7696b-f595-49c4-97d1-a7944684877a
page content: Original Medicare covers most medically necessary services and supplies in hospitals, doctors’ offices, and other h