In [1]:
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install langchain pypdf

In [None]:
!pip install -U langchain-community

In [6]:
import os
from langchain.document_loaders import PyPDFLoader

In [7]:
# Drive folder that is scanned for the source PDF files.
pdf_folder='/content/drive/MyDrive/RCM_chatbot/Knowledge Source (RCM)'

In [8]:
# `exists` comes from the public os.path API rather than the internal
# `genericpath` module; on POSIX (Colab) os.path.exists is the same function,
# so the name stays available with identical behaviour.
from os.path import exists

# Drive folder that receives one .txt file per extracted PDF page.
output_folder = '/content/drive/MyDrive/RCM_chatbot/processed_texts'
os.makedirs(output_folder, exist_ok=True)  # no error if the folder already exists

**Step 1: Process the PDFs**

In [9]:
# Write the text of every Document returned by the PDF loader to its own
# UTF-8 .txt file, named "<pdf name>_page_<n>.txt" with n starting at 1.
# sorted() gives a reproducible processing order (os.listdir order is arbitrary).
for filename in sorted(os.listdir(pdf_folder)):
    # Case-insensitive match so files such as "Manual.PDF" are not silently skipped.
    if not filename.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(pdf_folder, filename)
    # Ignore directories whose name happens to end in ".pdf".
    if not os.path.isfile(file_path):
        continue

    loader = PyPDFLoader(file_path)
    documents = loader.load()
    # Loop-invariant for all pages of this PDF, so compute it once.
    base_name = os.path.splitext(filename)[0]

    for page_number, doc in enumerate(documents, start=1):
        output_filename = f"{base_name}_page_{page_number}.txt"
        output_path = os.path.join(output_folder, output_filename)

        with open(output_path, "w", encoding="utf-8") as f:
            f.write(doc.page_content)

print(" All PDF pages processed and saved as .txt files.")



 All PDF pages processed and saved as .txt files.


**Step 2: Chunk the text**

In [11]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

In [12]:
# Read every extracted page back from disk as a Document (tagged with its
# source file name), then split the pages into overlapping chunks.
documents = []

# sorted() makes the document order, and therefore the chunk and vector order,
# reproducible across runs; os.listdir returns entries in arbitrary order.
for filename in sorted(os.listdir(output_folder)):
    if filename.endswith(".txt"):
        file_path = os.path.join(output_folder, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        # The file is already closed here; only the text is needed.
        documents.append(Document(page_content=text, metadata={"source": filename}))

print(f"Loaded {len(documents)} documents (pages).")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,    # upper bound on chunk length
    chunk_overlap=50,  # amount shared between neighbouring chunks
)

chunks = text_splitter.split_documents(documents)

print(f"Total chunks created: {len(chunks)}")
# Guard the preview: with no input text `chunks` is empty and chunks[0]
# would raise IndexError.
if chunks:
    print("\n Example chunk:")
    print(chunks[0].page_content[:500])

Loaded 375 documents (pages).
Total chunks created: 1610

 Example chunk:
RS
 & F Healthcare Advisors 130 ADMIRAL COCHRANE DRIVE  SUITE 102  ANNAPOLIS, MARYLAND 21401  PHONE 410.897.9888  FAX 410.897.9889
MARYLAND LOCAL HEALTH DEPARTMENT – BILLING MANUAL 2019 
Revenue Cycle Management Process (RCM) 
The RCM process begins with the first patient contact and ends once their services have been paid. All 
administrative and clinical functions contribute to the capture, management, and collection of revenue.


In [None]:
!pip install -U sentence-transformers

In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [15]:
# Load the sentence-embedding model used to encode every chunk.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Collect the raw text of each chunk, in chunk order.
texts = []
for chunk in chunks:
    texts.append(chunk.page_content)

# One embedding per entry of `texts`, in the same order.
embeddings = model.encode(texts, show_progress_bar=True)

print(f"Generated {len(embeddings)} embeddings.")
print(f"Each embedding is a vector of size: {len(embeddings[0])}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

Generated 1610 embeddings.
Each embedding is a vector of size: 384


In [16]:
# Sanity check: show the start of the first chunk next to its embedding vector.
sample_text = texts[0][:300]
sample_vector = embeddings[0]
print("Sample Chunk:\n", sample_text)
print("\nCorresponding Embedding Vector:\n", sample_vector)

Sample Chunk:
 RS
 & F Healthcare Advisors 130 ADMIRAL COCHRANE DRIVE  SUITE 102  ANNAPOLIS, MARYLAND 21401  PHONE 410.897.9888  FAX 410.897.9889
MARYLAND LOCAL HEALTH DEPARTMENT – BILLING MANUAL 2019 
Revenue Cycle Management Process (RCM) 
The RCM process begins with the first patient contact and ends once 

Corresponding Embedding Vector:
 [-4.77940999e-02  1.53839318e-02 -7.93074444e-02 -2.94060148e-02
  3.83993238e-02  4.84619439e-02 -6.98309988e-02  1.08991466e-01
 -4.04133834e-03  6.35655746e-02 -1.48862954e-02  3.82212587e-02
  3.86398882e-02  1.21291913e-02  6.35735691e-03  4.49669780e-03
 -2.23755054e-02 -3.77841778e-02  7.17569739e-02  4.89624292e-02
 -2.59813666e-02  7.99802393e-02  6.56910287e-03 -5.00023402e-02
 -4.69716042e-02  1.86322480e-02 -2.46111974e-02  8.41809623e-03
  2.60479655e-02  4.67024781e-02  1.40784755e-01  2.18367856e-02
  1.87280551e-02  2.78528724e-02  2.76996661e-02  3.27492133e-02
 -6.74936995e-02  1.10143898e-02  2.38514990e-02  1.08918034e-01


In [19]:
!pip install faiss-cpu



In [20]:
import faiss
# Confirm that faiss imports and show which version is in use.
print(faiss.__version__)

1.11.0


In [21]:
import faiss
import pickle

# Build an L2-distance FAISS index over the chunk embeddings and save it to
# Drive together with a parallel metadata list (text + source file per chunk).

# Convert once to a C-contiguous float32 matrix (the layout FAISS expects);
# np.array(...).astype(...) would copy the data twice.
embeddings_np = np.ascontiguousarray(embeddings, dtype="float32")
if embeddings_np.ndim != 2 or embeddings_np.shape[0] == 0:
    raise ValueError(
        f"Expected a non-empty 2-D embedding matrix, got shape {embeddings_np.shape}"
    )
embedding_dim = embeddings_np.shape[1]

index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings_np)

# metadata[i] describes the vector stored at row i of the index.
metadata = [{"text": chunk.page_content, "source": chunk.metadata["source"]} for chunk in chunks]
# Check the alignment before anything is written, so a stale `embeddings` or
# `chunks` variable cannot produce a saved index that disagrees with its metadata.
if index.ntotal != len(metadata):
    raise ValueError(
        f"Index holds {index.ntotal} vectors but there are {len(metadata)} chunks"
    )

save_dir = "/content/drive/MyDrive/RCM_chatbot/faiss_manual_index"
os.makedirs(save_dir, exist_ok=True)

faiss.write_index(index, os.path.join(save_dir, "index.faiss"))

# NOTE: pickle files can execute arbitrary code when loaded; only load
# metadata.pkl from a location you trust.
with open(os.path.join(save_dir, "metadata.pkl"), "wb") as f:
    pickle.dump(metadata, f)

print(f"FAISS index and metadata saved in folder: {save_dir}")

FAISS index and metadata saved in folder: /content/drive/MyDrive/RCM_chatbot/faiss_manual_index


In [22]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.13-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.0 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-6.0.1-py3-none-any.whl.metadata (6.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-api>=1.2.0 (from chromadb)
  Downloading opentelemetry_api-1.34.1-py3-none-any.whl.metadata (1.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.34.1-py3-none-any.whl.metadata (2.4 kB)
Collecting opentelemetry-sdk>=1.2.0 (from chromadb)
  Downloading opentelemetry_sdk-1.34.1-py3-none-any.whl.metadata (1.6 kB)
Coll