# Import Required Libraries
This section installs and imports the libraries needed for PDF processing, semantic search, and the rest of the machine learning workflow. If you are running in Kaggle, some of these packages may already be installed.

In [1]:
# Install required packages (uncomment if running in a fresh environment)
!pip install fastapi uvicorn pdfplumber faiss-cpu sentence-transformers pydantic pytest fastapi uvicorn

Collecting fastapi
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.34.3-py3-none-any.whl.metadata (6.5 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.7-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting starlette<0.47.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.46.2-py3-none-any.whl.metadata (6.2 kB)
Collecting pdfminer.six==20250506 (from pdfplumber)
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m

In [2]:
import os
import numpy as np
import pandas as pd
import pdfplumber
import faiss
import pickle
import warnings

# Ignore TensorFlow INFO and WARNING logs:
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # 0 = all messages, 1 = INFO, 2 = WARNING, 3 = ERROR
# Suppress the oneDNN custom operations message
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

from sentence_transformers import SentenceTransformer

# Ignore deprecation warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

E0000 00:00:1750100274.867573      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750100274.922457      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load and Explore the Dataset
In this section, we will load a sample PDF document, extract its text, and perform basic exploration. For demonstration, you can upload your own PDF file to the notebook environment.

In [3]:
# Location of the PDF to process. This cell does not upload anything itself:
# the file must already exist at the path assigned below.
from pathlib import Path

# If running in Kaggle, place your PDF in the working directory or use the file browser to upload it,
# then point pdf_path at that file.
pdf_path = '/kaggle/input/assignment-document-file/Evidence of Coverage Document assignment.pdf'  # Change this if needed

def extract_text_from_pdf(pdf_path):
    """Read a PDF and collect the text of every page that has any.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A list of dicts in page order, one per page whose extracted text is
        non-empty. Each dict holds "filename" (base name of ``pdf_path``),
        "page" (1-based page number) and "text" (the page text with leading and
        trailing whitespace removed). When the file does not exist an error
        line is printed and an empty list is returned instead of raising.
    """
    # Guard clause: report a missing file and hand back an empty result.
    if not os.path.exists(pdf_path):
        print(f"Error: File {pdf_path} does not exist.")
        return []

    source_name = os.path.basename(pdf_path)
    with pdfplumber.open(pdf_path) as pdf:
        # Pair each page's text with its 1-based page number.
        numbered_text = (
            (number, page.extract_text())
            for number, page in enumerate(pdf.pages, start=1)
        )
        # Pages for which extract_text() returned nothing are skipped.
        return [
            {"filename": source_name, "page": number, "text": raw.strip()}
            for number, raw in numbered_text
            if raw
        ]

# `documents` holds one record per page that yielded text. Pages with no
# extractable text are skipped by extract_text_from_pdf, so the count printed
# here can be lower than the number of pages in the PDF.
documents = extract_text_from_pdf(pdf_path)
print(f"Extracted: {len(documents)} pages from {pdf_path}")
# Uncomment to preview the first 1000 characters of the first extracted page:
# if documents:
#     print(documents[0]["text"][:1000])

Extracted: 134 pages from /kaggle/input/assignment-document-file/Evidence of Coverage Document assignment.pdf


# Data Preprocessing
Now, we will split the extracted text into manageable chunks for embedding and search. This step is crucial for semantic search over long documents.

In [4]:
def split_text_into_chunks(text, max_length=500):
    """Group the sentences of ``text`` into chunks of roughly ``max_length`` characters.

    The text is split on the literal delimiter ". " and the fragments are packed
    greedily: a fragment joins the current chunk while the chunk's length plus
    the fragment's length stays below ``max_length``; otherwise the current
    chunk is closed and a new one is started.

    Args:
        text: The text to split. May be empty or ``None``.
        max_length: Character budget used when deciding whether another
            fragment still fits into the current chunk.

    Returns:
        A list of chunk strings with surrounding whitespace stripped. Empty or
        whitespace-only input yields an empty list. A single fragment that is
        longer than ``max_length`` is not broken up; it becomes a chunk of its
        own that exceeds the budget.
    """
    # Nothing to embed for empty pages; previously this produced the bogus chunk ".".
    if not text or not text.strip():
        return []

    sentences = text.split('. ')
    last_position = len(sentences) - 1
    chunks = []
    current_chunk = ""
    for position, sentence in enumerate(sentences):
        # split() removed a ". " after every fragment except the last one, so
        # only those fragments get the delimiter back. Re-adding it to the last
        # fragment would invent punctuation (e.g. "A. B." -> "A. B..").
        piece = sentence + ". " if position < last_position else sentence
        if not piece:
            # Text ended with ". ", leaving an empty trailing fragment.
            continue
        if len(current_chunk) + len(sentence) < max_length:
            current_chunk += piece
        else:
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = piece
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

# Flatten every extracted page into chunk records; each record keeps the source
# file name and page number next to its piece of text.
chunks = [
    {
        "filename": page_record["filename"],
        "page": page_record["page"],
        "text": piece,
    }
    for page_record in documents
    for piece in split_text_into_chunks(page_record["text"])
]
print(f"Split into [{len(chunks)}] chunks")
# Uncomment to preview the first chunk:
# if chunks:
#     print(chunks[0]["text"][:500])

Split into [1003] chunks


# Model Building
We will use a pre-trained SentenceTransformer model to generate embeddings for each text chunk. These embeddings will be used for semantic search.

In [5]:
# Load the pre-trained sentence-embedding model by name.
model = SentenceTransformer('all-roberta-large-v1')
# Only the chunk text is embedded; filename/page stay in `chunks`.
# perform_search maps FAISS row numbers back to `chunks` by position, so the
# order of `texts` must stay aligned with the order of `chunks`.
texts = [chunk["text"] for chunk in chunks]
embeddings = model.encode(texts, show_progress_bar=True)
# Convert to a float32 NumPy array before the vectors are added to the FAISS index.
embeddings = np.array(embeddings).astype('float32')
print(f"Generated embeddings with shape: {embeddings.shape}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.68k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/328 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Generated embeddings with shape: (1003, 1024)


# Model Training
For semantic search, we do not need to train a model from scratch. Instead, we build a FAISS index from the generated embeddings for fast similarity search.

In [6]:
# The index dimensionality is the embedding width (second axis of `embeddings`).
dimension = embeddings.shape[1]
# IndexFlatL2 compares vectors by L2 distance; vectors are added in chunk order,
# so FAISS row i corresponds to chunks[i].
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"FAISS index built with {index.ntotal} vectors.")
# Persist the index and the chunk records for later use.
# NOTE: this always runs and overwrites "faiss.index" / "chunks.pkl" in the
# working directory; comment these lines out if saving is not wanted.
faiss.write_index(index, "faiss.index")

# chunks.pkl is a pickle file: only load it back from a source you trust.
with open("chunks.pkl", "wb") as f:
    pickle.dump(chunks, f)
print("Index and chunks saved to disk.")

FAISS index built with 1003 vectors.
Index and chunks saved to disk.


# Model Evaluation
We will now perform semantic search using the FAISS index and evaluate the results for a sample query.

In [7]:
def perform_search(query, top_k=3):
    """Return the chunk records whose embeddings are closest to ``query``.

    Relies on the notebook-level ``model`` (to embed the query), ``index`` (the
    FAISS index built from the chunk embeddings) and ``chunks`` (the records the
    index rows correspond to, by position).

    Args:
        query: The search text.
        top_k: Maximum number of chunks to return.

    Returns:
        A list of at most ``top_k`` chunk dicts, in the order reported by
        ``index.search``. The list is shorter when the index holds fewer than
        ``top_k`` vectors and empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        return []

    # encode() on a one-element list yields a single row; FAISS expects a
    # 2-D float32 matrix of query vectors.
    query_matrix = np.asarray(model.encode([query]), dtype='float32')
    _, indices = index.search(query_matrix, top_k)

    # FAISS pads the result with -1 when it finds fewer than top_k neighbours.
    # A plain `idx < len(chunks)` check lets -1 through and chunks[-1] would
    # silently return the last chunk as a false hit, so require idx >= 0 too.
    return [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]

# Run one sample query through the search function and print what it found:
# first the matching excerpts, then the document/page each excerpt came from.
query = "Diabetes services"
top_k = 3
results = perform_search(query, top_k)

print("Response Text (Excerpts):")
print("-" * 50)
for result in results:
    # Excerpt followed by a separator line.
    print(result["text"], "-" * 50, sep="\n")

print("\nDocuments:")
print("-" * 50)
for result in results:
    # Source reference followed by a separator line.
    print(f"Document: {result['filename']}, Page: {result['page']}", "-" * 50, sep="\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Response Text (Excerpts):
--------------------------------------------------
Covered services $0 copayment for preferred brand
include: Medicare-covered diabetic monitoring
· Supplies to monitor your blood glucose: Blood glucose monitor, blood glucose test supplies.
--------------------------------------------------
Examples of urgently needed
services are unforeseen medical illnesses and injuries or unexpected flare-ups of existing conditions. However, medically
necessary routine provider visits, such as annual checkups, are not considered urgently needed even if you are outside the
service area of the plan or the plan network is temporarily unavailable.
· Kidney dialysis services that you get at a Medicare-certified dialysis facility when you are temporarily outside the plan’s
service area.
--------------------------------------------------
A physician must prescribe these services
and renew their order yearly if your treatment is needed into the next calendar year.
Medicare Diabetes

In [8]:
# Try your own query here: edit user_query and re-run the cell.
user_query = "What are the coverage details for emergency services?"
# top_k=1 asks for the single closest chunk only.
user_results = perform_search(user_query, top_k=1)
for result in user_results:
    # Show the page number and at most the first 500 characters of the chunk.
    print(f"Page: {result['page']}\n \nExcerpt:\n{result['text'][:500]}\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Page: 60
 
Excerpt:
Your plan must cover urgently needed services and only charge you $50,000 (USD) combined limit per year
in-network cost sharing. Examples of urgently needed services are unforeseen medical for emergency and urgent care
illnesses and injuries, or unexpected flare-ups of existing conditions. However, medically services provided outside the U.S.

