In [1]:
# For reading PDF Files
from PyPDF2 import PdfReader

#For reading Word document
from docx import Document

#For creating embeddings
from sentence_transformers import SentenceTransformer

#For using OpenAI AI Models
import openai

#For Vector DB
import faiss

#Helper libraries
import numpy as np
import os
from pathlib import Path
import re

print("Libraries imported successfully")

Libraries imported successfully


In [2]:
#Create a folder to store uploaded documents
document_path = 'QnADocuments'
if not os.path.exists(document_path):
    os.makedirs('QnADocuments')
    print("Document folder created")
else:
    print("Document folder exists")

print(f"Place document in {document_path} folder") 


Document folder exists
Place document in QnADocuments folder


In [3]:
def load_document(filename):   
    file_path = os.path.join(document_path, filename)
    
    if not os.path.exists(file_path):
        print(f"Error: File '{filename}' not found in '{DOCUMENTS_FOLDER}' folder")
        return None, None
    
    file_extension = filename.lower().split('.')[-1]
    
    if file_extension == 'pdf':
        file_type = 'pdf'
        print(f"Found PDF file: {filename}")
    elif file_extension == 'docx':
        file_type = 'docx'
        print(f"Found Word document: {filename}")
    elif file_extension == 'txt':
        file_type = 'txt'
        print(f"Found text file: {filename}")
    else:
        print(f"Error: Unsupported file type '.{file_extension}'")
        return None, None
    
    file_size = os.path.getsize(file_path)
    file_size_kb = file_size / 1024
    print(f"File size: {file_size_kb:.2f} KB")
    
    return file_path, file_type

In [4]:
file_path, file_type = load_document("insurance_faq_knowledge_base.pdf")

Found PDF file: insurance_faq_knowledge_base.pdf
File size: 3.13 KB


In [5]:
def extract_text_from_document(file_path, file_type):
    extracted_text = ""

    if file_type == 'pdf':
        try:
            pdf_reader = PdfReader(file_path)
            total_pages = len(pdf_reader.pages)
            print(f"Total pages in PDF: {total_pages}")
            
            for page_num in range(total_pages):
                page = pdf_reader.pages[page_num]
                page_text = page.extract_text()
                extracted_text += page_text + "\n"
            print("Successfully extracted text from PDF")            
        except Exception as e:
            print(f"Error reading PDF: {e}")
            return None
    elif file_type == 'docx':
        try:
            doc = Document(file_path)
            total_paragraphs = len(doc.paragraphs)
            print(f"Total paragraphs in document: {total_paragraphs}")
            
            for i, paragraph in enumerate(doc.paragraphs):
                extracted_text += paragraph.text + "\n"

            print("Successfully extracted text from Word document")           
        except Exception as e:
            print(f"Error reading Word document: {e}")
            return None
    elif file_type == 'txt':
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                extracted_text = file.read()
            
            print("Successfully extracted text from TXT file")           
        except Exception as e:
            print(f"Error reading text file: {e}")
            return None    
    else:
        print(f"Unsupported file type: {file_type}")
        return None

    
    if len(extracted_text.strip()) == 0:
        print("Warning: No text was extracted from the document")
        return None
        
    # Show statistics about extracted text
    print("-" * 50)
    print(f"Extraction Statistics:")
    print(f"   â€¢ Total characters: {len(extracted_text):,}")
    print(f"   â€¢ Total words (approx): {len(extracted_text.split()):,}")
    print(f"   â€¢ Total lines: {len(extracted_text.splitlines()):,}")    
    return extracted_text

In [6]:
extracted_text = extract_text_from_document(file_path, file_type)

Total pages in PDF: 2
Successfully extracted text from PDF
--------------------------------------------------
Extraction Statistics:
   â€¢ Total characters: 1,330
   â€¢ Total words (approx): 220
   â€¢ Total lines: 25


In [7]:
import tiktoken

def chunk_text(text, chunk_tokens=450, overlap_tokens=80) -> List[str]:
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    total_tokens = len(tokens)
    print(f"ðŸ“Š Total tokens in document: {total_tokens}")
    chunks = []
    start = 0

    while start < len(tokens):
        end = start + chunk_tokens
        chunk_tokens_slice = tokens[start:end]
        chunk = enc.decode(chunk_tokens_slice)
        chunks.append(chunk)
        start = end - overlap_tokens
        if start <= 0:
            start = end
    
    return chunks

In [8]:
text_chunks = chunk_text(extracted_text, chunk_tokens=150, overlap_tokens=80)
print(f"Statistics:")
print(f"   â€¢ Total chunks created: {len(text_chunks)}")
print(f"   â€¢ Average chunk size: {sum(len(c) for c in text_chunks) // len(text_chunks)} characters")
print("-" * 50)


print("Sample chunk:")
    
for i in range(min(4, len(text_chunks))):
    print(f"\nChunk {i+1}:")
    print(f"{text_chunks[i]}")  # Show first 200 characters
    
print("-" * 50)

ðŸ“Š Total tokens in document: 277
Statistics:
   â€¢ Total chunks created: 4
   â€¢ Average chunk size: 608 characters
--------------------------------------------------
Sample chunk:

Chunk 1:
Insurance Agency â€“ Customer Knowledge Base
Q: How do I file an insurance claim?
You can file a claim by calling our 24/7 claims support line or submitting a claim through our online
customer portal. Please keep photos, receipts, and incident details ready.
Q: What is a deductible?
A deductible is the amount you pay out of pocket before your insurance coverage starts paying for a
claim.
Q: How long does claim processing take?
Most claims are processed within 7â€“10 business days once all required documents are received.
Q: How do I add a driver to my auto insurance policy?
You need to provide the driverâ€™s full name, date of birth, license number, and the effective date you
want coverage to begin

Chunk 2:
 pocket before your insurance coverage starts paying for a
claim.
Q: How long does clai

In [9]:
from sentence_transformers import SentenceTransformer
import numpy as np

embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for all chunks
embeddings = embedding_model.encode(
    text_chunks,
    show_progress_bar=True,
    convert_to_numpy=True
)

# Display results
print(f"\nEmbeddings created successfully!")
print(f"   â€¢ Shape: {embeddings.shape}")
print(f"   â€¢ Each chunk is now a {embeddings.shape[1]}-dimensional vector")
print(f"   â€¢ Memory used: {embeddings.nbytes / 1024:.2f} KB")

# Show sample
print(f"\nSample - First 10 dimensions of chunk 1:")
print(embeddings[0][:10])

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Embeddings created successfully!
   â€¢ Shape: (4, 384)
   â€¢ Each chunk is now a 384-dimensional vector
   â€¢ Memory used: 6.00 KB

Sample - First 10 dimensions of chunk 1:
[-0.05833827  0.01332624  0.02325287  0.01146739  0.04456546  0.02528911
  0.03604465  0.09132441 -0.04313792  0.05709969]


In [10]:
import faiss
import numpy as np

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)

index.add(embeddings)

In [11]:
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1
qa_model = pipeline(
    "text2text-generation",
    model="google/flan-t5-large",  # 780M params - better quality
    device=device
)

Device set to use cpu


In [12]:
def retrieve(query, index, chunks, k=4):
    qvec = embedding_model.encode([query], convert_to_numpy=True)
    scores, ids = index.search(qvec, k)
    results = []
    for i in ids[0]:
        if i == -1:
            continue
        results.append(chunks[i])
    return results

In [13]:
def generate_answer(user_question, retrieved_chunks):
    context = "\n\n".join(retrieved_chunks)
    
    if len(context) > 800:
        context = context[:800] + "..."
    
    prompt = f"""You are a helpful assistant. Based on the context provided, answer the user's question clearly and completely. Use specific details from the context.

    Context: {context}
    
    Question: {user_question}
    
    Provide a clear, detailed answer:"""
    
    result = qa_model(
        prompt,
        max_length=300,
        min_length=30,
        do_sample=True,
        temperature=0.8,
        top_p=0.95,
        repetition_penalty=1.2
    )
    
    answer = result[0]['generated_text']
    return answer

In [14]:
def ask_question(question):
    relevant_chunks = retrieve(question, index, text_chunks, k=4)
    for i, chunk in enumerate(relevant_chunks, 1):
        preview = chunk[:150].replace('\n', ' ')
    
    answer = generate_answer(question, relevant_chunks)
    
    return answer

In [15]:
ask_question("What is a deductible?")

Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'A deductible is the amount you pay out of pocket before your insurance coverage starts paying for a claim. It is the amount you pay out of pocket before your insurance coverage starts paying for a claim.'

In [16]:
ask_question("How long does claim processing take?")

Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'Most claims are processed within 7â€“10 business days once all required documents are received. Typically required documents include ID proof, address proof, vehicle or property details, and prior insurance history.'

In [17]:
ask_question("How do I file an insurance claim?")

Both `max_new_tokens` (=256) and `max_length`(=300) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


'You can file a claim by calling our 24/7 claims support line or submitting a claim through our online customer portal. Please keep photos, receipts, and incident details ready.'