In [2]:
# =========================
# Cell 1: Imports
# =========================
import os
import tempfile
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

from pdf2image import convert_from_path
import pytesseract


In [3]:
# =========================
# Cell 2: Paths
# =========================
DATA_DIR = "./data/docs"
OUTPUT_DIR = "./vectorstore"


In [4]:
# =========================
# Cell 3: OCR fallback function
# =========================
def extract_text_ocr(pdf_path):
    text = ""
    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(pdf_path, dpi=200, output_folder=path)
        for i, img in enumerate(images):
            text += f"\n--- Page {i+1} ---\n" + pytesseract.image_to_string(img)
    return text


In [5]:
# =========================
# Cell 4: Load PDFs
# =========================
def load_pdfs():
    docs = []
    for file in os.listdir(DATA_DIR):
        if not file.endswith(".pdf"):
            continue
        pdf_path = os.path.join(DATA_DIR, file)
        try:
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
            text = " ".join([p.page_content for p in pages])
            if len(text.strip()) < 50:  # too little text, use OCR
                text = extract_text_ocr(pdf_path)
            docs.append(Document(page_content=text, metadata={"source": file}))
        except Exception:
            text = extract_text_ocr(pdf_path)
            docs.append(Document(page_content=text, metadata={"source": file}))
    return docs


In [6]:
# =========================
# Cell 5: Build vectorstore
# =========================
def build_vectorstore():
    docs = load_pdfs()
    print(f"✅ Loaded {len(docs)} PDFs")
    
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)
    print(f"✅ Total chunks created: {len(chunks)}")
    
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(chunks, embedder)
    
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    vectorstore.save_local(OUTPUT_DIR)
    print(f"✅ Vectorstore saved at {OUTPUT_DIR}")
    
    return vectorstore, chunks


In [7]:
# =========================
# Cell 6: Run build
# =========================
vectorstore, chunks = build_vectorstore()


✅ Loaded 5 PDFs
✅ Total chunks created: 181


  embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


✅ Vectorstore saved at ./vectorstore


In [8]:
# =========================
# Cell 7: Interactive chunk inspection
# =========================
print("Sample chunks and their source PDFs:\n")
for i, chunk in enumerate(chunks[:10]):  # first 10 chunks
    print(f"Chunk {i+1} | Source: {chunk.metadata['source']}")
    print(chunk.page_content[:200], "\n---\n")  # first 200 characters

print(f"Total chunks loaded: {len(chunks)}")


Sample chunks and their source PDFs:

Chunk 1 | Source: gsh-13_v1_panic_january24_final.pdf
Panic
A self-help guide
Workbook 13
Revised  January 2024 v1 2
 Introduction……………………………………………………3
 Panic attacks………………………………………………….4
 Stress and hyperventilation……………………………….....6
 Panic attack v 
---

Chunk 2 | Source: gsh-13_v1_panic_january24_final.pdf
 Facing your fears…………………………………………....19
 Graded exposure…………………………………………….21
 My graded hierarchy………………………………………...24
 My exposure diary……………………………………..........25
 Tips for dealing with panic……… 
---

Chunk 3 | Source: gsh-13_v1_panic_january24_final.pdf
Contents of 
       this booklet 3
Most people will feel anxious at some point in their 
lives, but anxiety becomes a problem when it starts to 
negatively impact your day-to-day life.
This workbook w 
---

Chunk 4 | Source: gsh-13_v1_panic_january24_final.pdf
threatening and we don’t feel we have the skills we need to cope 
with it.
If something causes us to feel afraid or scared, t

In [9]:
# =========================
# Cell 8: Similarity search example
# =========================
query = "Example question about PDFs"
results = vectorstore.similarity_search(query, k=3)

print(f"Top 3 results for query: '{query}'\n")
for r in results:
    print(f"Source: {r.metadata['source']}")
    print(r.page_content[:300], "\n---\n")  # first 300 characters


Top 3 results for query: 'Example question about PDFs'

Source: Panic-ER-final-2022.pdf
Review date 2025 
ISBN:978-1-909664-34-0
 
Follow us on Twitter @cntwnhs and Facebook CNTWNHS 
---

Source: Panic-ER-final-2022.pdf
this leaflet please get in touch.  
This information can be made available in a range of formats on 
request (eg Braille, audio, larger print, BSL or other languages). 
Please contact the Patient Information Centre Tel: 0191 246 7288 
Published by the Patient Information Centre 
2022 Copyright, Cumb 
---

Source: gsh-13_v1_panic_january24_final.pdf
– either by leaving or using a safety behaviour. 
Each time we face that same situation our anxiety is equally 
high the next time round:
Facing your fears 20
What would actually happen?
This graph shows what would actually happen if 
we were to face an anxiety provoking situation 
without the suppo 
---



In [11]:
# =========================
# Complete RAG Preprocessing Script
# =========================
import os
import tempfile
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from pdf2image import convert_from_path
import pytesseract

# -------------------------
# Paths
# -------------------------
DATA_DIR = "./data/docs"
OUTPUT_DIR = "./vectorstore"

# -------------------------
# OCR fallback function
# -------------------------
def extract_text_ocr(pdf_path):
    text = ""
    with tempfile.TemporaryDirectory() as path:
        images = convert_from_path(pdf_path, dpi=200, output_folder=path)
        for i, img in enumerate(images):
            text += f"\n--- Page {i+1} ---\n" + pytesseract.image_to_string(img)
    return text

# -------------------------
# Load PDFs
# -------------------------
def load_pdfs():
    docs = []
    for file in os.listdir(DATA_DIR):
        if not file.endswith(".pdf"):
            continue
        pdf_path = os.path.join(DATA_DIR, file)
        try:
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
            text = " ".join([p.page_content for p in pages])
            if len(text.strip()) < 50:  # too little text, use OCR
                text = extract_text_ocr(pdf_path)
            docs.append(Document(page_content=text, metadata={"source": file}))
        except Exception:
            text = extract_text_ocr(pdf_path)
            docs.append(Document(page_content=text, metadata={"source": file}))
    return docs

# -------------------------
# Build Vectorstore
# -------------------------
def build_vectorstore():
    docs = load_pdfs()
    print(f"✅ Loaded {len(docs)} PDFs")
    
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_documents(docs)
    print(f"✅ Total chunks created: {len(chunks)}")
    
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(chunks, embedder)
    
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    vectorstore.save_local(OUTPUT_DIR)
    print(f"✅ Vectorstore saved at {OUTPUT_DIR}")
    
    # -------------------------
    # Inspect Chunks
    # -------------------------
    df_chunks = pd.DataFrame({
        "source_pdf": [c.metadata["source"] for c in chunks],
        "chunk_text": [c.page_content for c in chunks],
        "chunk_length": [len(c.page_content) for c in chunks]
    })
    
    print("\n=== Sample chunks (first 5) ===")
    print(df_chunks.head(5).to_string(index=False))
    
    stats = df_chunks.groupby("source_pdf").agg(
        total_chunks=("chunk_text", "count"),
        avg_chunk_length=("chunk_length", "mean")
    ).sort_values(by="total_chunks", ascending=False)
    
    print("\n=== Chunk stats per PDF ===")
    print(stats)
    
    # -------------------------
    # Similarity search example
    # -------------------------
    query = "Example question about PDFs"
    results = vectorstore.similarity_search(query, k=3)
    print(f"\n=== Top 3 results for query: '{query}' ===")
    for r in results:
        print(f"\nSource: {r.metadata['source']}")
        print(r.page_content[:300], "\n---\n")
    
    return vectorstore, chunks, df_chunks

# -------------------------
# Run
# -------------------------
vectorstore, chunks, df_chunks = build_vectorstore()


✅ Loaded 5 PDFs
✅ Total chunks created: 181
✅ Vectorstore saved at ./vectorstore

=== Sample chunks (first 5) ===
                         source_pdf                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      chunk_text  chunk_length
gsh-13_v1_panic_january24_final.pdf Panic\nA self-help guide\nWorkbook 13\nRevised  January 2024 v1 2\n Introduction……………………………………………………3\n Panic attacks………………………………………………….4\n Stress and hyperventilation……………………………….....6\n Panic attack vs heart attack…............................................8\n What keeps panic 

In [12]:
# =========================
# Smart RAG with Auto-OCR Detection
# =========================
import os
import re
import tempfile
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from pdf2image import convert_from_path
import pytesseract

# -------------------------
# Configuration
# -------------------------
DATA_DIR = "./data/docs"
OUTPUT_DIR = "./vectorstore"
MIN_TEXT_THRESHOLD = 100  # If extracted text < this, use OCR
OCR_DPI = 200  # Higher = better quality but slower

# -------------------------
# Text cleaning helper
# -------------------------
def clean_ocr_text(text):
    """Remove OCR artifacts, excess whitespace, and common noise"""
    # Remove excessive newlines
    text = re.sub(r'\n{3,}', '\n\n', text)
    # Remove page markers if they're noise
    text = re.sub(r'\n--- Page \d+ ---\n', '\n', text)
    # Remove lines with only special chars/numbers (often headers/footers)
    lines = [line for line in text.split('\n') 
             if len(line.strip()) > 3 and not re.match(r'^[\d\s\-_|]+$', line.strip())]
    text = '\n'.join(lines)
    # Normalize whitespace
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n ', '\n', text)
    return text.strip()

# -------------------------
# OCR extraction with cleaning
# -------------------------
def extract_text_ocr(pdf_path, clean=True):
    """Extract text via OCR with optional cleaning"""
    text = ""
    try:
        with tempfile.TemporaryDirectory() as path:
            images = convert_from_path(pdf_path, dpi=OCR_DPI, output_folder=path)
            for i, img in enumerate(images):
                page_text = pytesseract.image_to_string(img, lang='eng')
                text += f"\n--- Page {i+1} ---\n{page_text}"
        
        if clean:
            text = clean_ocr_text(text)
        
        return text
    except Exception as e:
        print(f"⚠️  OCR failed for {os.path.basename(pdf_path)}: {e}")
        return ""

# -------------------------
# Smart PDF loading with auto-OCR detection
# -------------------------
def load_pdfs_smart():
    """Load PDFs with intelligent OCR fallback"""
    docs = []
    processing_log = []
    
    pdf_files = [f for f in os.listdir(DATA_DIR) if f.endswith(".pdf")]
    
    print(f"📂 Found {len(pdf_files)} PDF files\n")
    
    for file in pdf_files:
        pdf_path = os.path.join(DATA_DIR, file)
        method_used = "PyPDFLoader"
        text = ""
        
        try:
            # Step 1: Try standard PDF extraction
            loader = PyPDFLoader(pdf_path)
            pages = loader.load()
            text = " ".join([p.page_content for p in pages])
            
            # Step 2: Check if extraction was successful
            text_length = len(text.strip())
            
            if text_length < MIN_TEXT_THRESHOLD:
                # Text too short → likely image-based PDF, use OCR
                print(f"🔍 {file}: Extracted only {text_length} chars, switching to OCR...")
                text = extract_text_ocr(pdf_path)
                method_used = "OCR (low text)"
            else:
                # Check text quality (ratio of alphanumeric to total)
                alphanumeric = sum(c.isalnum() or c.isspace() for c in text)
                quality_ratio = alphanumeric / len(text) if len(text) > 0 else 0
                
                if quality_ratio < 0.5:
                    # Mostly garbage characters → use OCR
                    print(f"🔍 {file}: Poor text quality ({quality_ratio:.2%}), switching to OCR...")
                    text = extract_text_ocr(pdf_path)
                    method_used = "OCR (poor quality)"
                else:
                    print(f"✅ {file}: Standard extraction OK ({text_length} chars)")
                    
        except Exception as e:
            # Step 3: Fallback to OCR on any error
            print(f"⚠️  {file}: PyPDFLoader failed ({e}), using OCR...")
            text = extract_text_ocr(pdf_path)
            method_used = "OCR (extraction error)"
        
        # Store document
        if text.strip():
            docs.append(Document(
                page_content=text,
                metadata={"source": file, "extraction_method": method_used}
            ))
            processing_log.append({
                "file": file,
                "method": method_used,
                "text_length": len(text)
            })
        else:
            print(f"❌ {file}: No text extracted (skipping)")
    
    return docs, pd.DataFrame(processing_log)

# -------------------------
# Build Vectorstore
# -------------------------
def build_vectorstore():
    """Build FAISS vectorstore with diagnostics"""
    
    print("=" * 60)
    print("STEP 1: Loading PDFs")
    print("=" * 60)
    
    docs, log_df = load_pdfs_smart()
    
    if not docs:
        print("\n❌ No documents loaded! Check your PDF directory.")
        return None, None, None
    
    print(f"\n✅ Successfully loaded {len(docs)} PDFs\n")
    
    # Show processing summary
    print("=" * 60)
    print("Processing Summary")
    print("=" * 60)
    print(log_df.to_string(index=False))
    print(f"\nOCR Usage: {log_df['method'].str.contains('OCR').sum()} / {len(log_df)} PDFs")
    
    # -------------------------
    # Step 2: Chunk documents
    # -------------------------
    print("\n" + "=" * 60)
    print("STEP 2: Chunking Documents")
    print("=" * 60)
    
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    chunks = splitter.split_documents(docs)
    print(f"✅ Created {len(chunks)} chunks\n")
    
    # -------------------------
    # Step 3: Create embeddings
    # -------------------------
    print("=" * 60)
    print("STEP 3: Creating Vector Embeddings")
    print("=" * 60)
    
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(chunks, embedder)
    
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    vectorstore.save_local(OUTPUT_DIR)
    print(f"✅ Vectorstore saved at {OUTPUT_DIR}\n")
    
    # -------------------------
    # Step 4: Inspect chunks
    # -------------------------
    print("=" * 60)
    print("STEP 4: Chunk Analysis")
    print("=" * 60)
    
    df_chunks = pd.DataFrame({
        "source_pdf": [c.metadata["source"] for c in chunks],
        "extraction_method": [c.metadata.get("extraction_method", "unknown") for c in chunks],
        "chunk_text": [c.page_content for c in chunks],
        "chunk_length": [len(c.page_content) for c in chunks]
    })
    
    print("\n📄 Sample chunks (first 3):")
    print("-" * 60)
    for i, row in df_chunks.head(3).iterrows():
        print(f"\nSource: {row['source_pdf']} ({row['extraction_method']})")
        print(f"Text: {row['chunk_text'][:200]}...")
        print("-" * 60)
    
    stats = df_chunks.groupby("source_pdf").agg(
        total_chunks=("chunk_text", "count"),
        avg_chunk_length=("chunk_length", "mean"),
        method=("extraction_method", "first")
    ).sort_values(by="total_chunks", ascending=False)
    
    print("\n📊 Chunk Statistics by PDF:")
    print(stats.to_string())
    
    # -------------------------
    # Step 5: Test retrieval
    # -------------------------
    print("\n" + "=" * 60)
    print("STEP 5: Testing Retrieval")
    print("=" * 60)
    
    test_query = "What information is available in these documents?"
    results = vectorstore.similarity_search(test_query, k=3)
    
    print(f"\n🔎 Query: '{test_query}'")
    print(f"📌 Top 3 results:\n")
    
    for i, r in enumerate(results, 1):
        print(f"{i}. Source: {r.metadata['source']}")
        print(f"   Method: {r.metadata.get('extraction_method', 'unknown')}")
        print(f"   Text: {r.page_content[:250]}...")
        print()
    
    return vectorstore, chunks, df_chunks

# -------------------------
# Run the pipeline
# -------------------------
if __name__ == "__main__":
    vectorstore, chunks, df_chunks = build_vectorstore()

STEP 1: Loading PDFs
📂 Found 5 PDF files

✅ gsh-13_v1_panic_january24_final.pdf: Standard extraction OK (23841 chars)
✅ How-can-I-help-someone-having-a-panic-attack-Accessible.pdf: Standard extraction OK (2751 chars)
✅ Panic Attacks & Panic Disorder_ Causes, Symptoms & Treatment.pdf: Standard extraction OK (13366 chars)
✅ panic-a-self-help-guide.pdf: Standard extraction OK (34936 chars)
✅ Panic-ER-final-2022.pdf: Standard extraction OK (6360 chars)

✅ Successfully loaded 5 PDFs

Processing Summary
                                                            file      method  text_length
                             gsh-13_v1_panic_january24_final.pdf PyPDFLoader        23842
     How-can-I-help-someone-having-a-panic-attack-Accessible.pdf PyPDFLoader         2751
Panic Attacks & Panic Disorder_ Causes, Symptoms & Treatment.pdf PyPDFLoader        13366
                                     panic-a-self-help-guide.pdf PyPDFLoader        34936
                                         Panic-

In [None]:
# =========================
# Smart RAG with Hybrid OCR Detection + Chunk Efficiency Check + Sample Display
# =========================
import os
import re
import tempfile
import pandas as pd
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from pdf2image import convert_from_path
import pytesseract
from PyPDF2 import PdfReader
from pdfminer.high_level import extract_text as pdfminer_extract_text

# -------------------------
# Configuration
# -------------------------
DATA_DIR = "./data/docs"
OUTPUT_DIR = "./vectorstore"
MIN_TEXT_THRESHOLD = 100
OCR_DPI = 200
TARGET_CHUNK_SIZE = 500  # for efficiency comparison

# -------------------------
# Text cleaning helper
# -------------------------
def clean_ocr_text(text):
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'\n--- Page \d+ ---\n', '\n', text)
    lines = [
        line for line in text.split('\n')
        if len(line.strip()) > 3 and not re.match(r'^[\d\s\-_|]+$', line.strip())
    ]
    text = '\n'.join(lines)
    text = re.sub(r'[ \t]+', ' ', text)
    text = re.sub(r'\n ', '\n', text)
    return text.strip()

# -------------------------
# OCR extraction with cleaning
# -------------------------
def extract_text_ocr(pdf_path, clean=True):
    text = ""
    try:
        with tempfile.TemporaryDirectory() as path:
            images = convert_from_path(pdf_path, dpi=OCR_DPI, output_folder=path)
            for i, img in enumerate(images):
                page_text = pytesseract.image_to_string(img, lang='eng')
                text += f"\n--- Page {i+1} ---\n{page_text}"
        if clean:
            text = clean_ocr_text(text)
        return text
    except Exception as e:
        print(f"⚠️  OCR failed for {os.path.basename(pdf_path)}: {e}")
        return ""

# -------------------------
# Helper: Check if a PDF has a text layer
# -------------------------
def has_text_layer(pdf_path):
    try:
        text = pdfminer_extract_text(pdf_path)
        return len(text.strip()) > 200
    except Exception:
        return False

# -------------------------
# Helper: Detect image-heavy PDFs
# -------------------------
def is_image_based(pdf_path):
    try:
        reader = PdfReader(pdf_path)
        image_pages = 0
        for page in reader.pages:
            if "/XObject" in page.get("/Resources", {}):
                xObject = page["/Resources"]["/XObject"].get_object()
                for obj in xObject:
                    if xObject[obj].get("/Subtype") == "/Image":
                        image_pages += 1
        return image_pages / max(len(reader.pages), 1) > 0.7
    except Exception:
        return False

# -------------------------
# Helper: Detect low-quality text
# -------------------------
def is_low_quality_text(text):
    if len(text.strip()) < 100:
        return True
    alpha_ratio = sum(c.isalpha() for c in text) / max(len(text), 1)
    if alpha_ratio < 0.2:
        return True
    words = text.split()
    avg_word_len = sum(len(w) for w in words) / max(len(words), 1)
    if avg_word_len < 3:
        return True
    non_ascii_ratio = sum(ord(c) > 127 for c in text) / len(text)
    if non_ascii_ratio > 0.2:
        return True
    return False

# -------------------------
# Smart PDF loading with auto-OCR detection
# -------------------------
def load_pdfs_smart():
    docs = []
    processing_log = []
    pdf_files = [f for f in os.listdir(DATA_DIR) if f.endswith(".pdf")]

    print(f"📂 Found {len(pdf_files)} PDF files\n")

    for file in pdf_files:
        pdf_path = os.path.join(DATA_DIR, file)
        method_used = "PyPDFLoader"
        text = ""

        try:
            # Step 1: Quick pre-check before extraction
            if is_image_based(pdf_path):
                print(f"🖼️  {file}: Detected image-heavy PDF → using OCR directly.")
                text = extract_text_ocr(pdf_path)
                method_used = "OCR (image-based)"
            elif not has_text_layer(pdf_path):
                print(f"🔍 {file}: No text layer detected → using OCR.")
                text = extract_text_ocr(pdf_path)
                method_used = "OCR (no text layer)"
            else:
                # Step 2: Try standard text extraction
                loader = PyPDFLoader(pdf_path)
                pages = loader.load()
                text = " ".join([p.page_content for p in pages])
                text_length = len(text.strip())
                if text_length < MIN_TEXT_THRESHOLD or is_low_quality_text(text):
                    print(f"🔍 {file}: Weak or low-quality text → using OCR.")
                    text = extract_text_ocr(pdf_path)
                    method_used = "OCR (low quality)"
                else:
                    print(f"✅ {file}: Standard extraction OK ({text_length} chars)")
        except Exception as e:
            print(f"⚠️  {file}: Extraction error ({e}) → using OCR.")
            text = extract_text_ocr(pdf_path)
            method_used = "OCR (error fallback)"

        if text.strip():
            docs.append(Document(
                page_content=text,
                metadata={"source": file, "extraction_method": method_used}
            ))
            processing_log.append({
                "file": file,
                "method": method_used,
                "text_length": len(text)
            })
        else:
            print(f"❌ {file}: No text extracted (skipping)")

    return docs, pd.DataFrame(processing_log)

# -------------------------
# Build Vectorstore
# -------------------------
def build_vectorstore():
    print("=" * 60)
    print("STEP 1: Loading PDFs")
    print("=" * 60)

    docs, log_df = load_pdfs_smart()
    if not docs:
        print("\n❌ No documents loaded! Check your PDF directory.")
        return None, None, None

    print(f"\n✅ Successfully loaded {len(docs)} PDFs\n")
    print("=" * 60)
    print("Processing Summary")
    print("=" * 60)
    print(log_df.to_string(index=False))
    print(f"\nOCR Usage: {log_df['method'].str.contains('OCR').sum()} / {len(log_df)} PDFs")

    # Step 2: Chunking
    print("\n" + "=" * 60)
    print("STEP 2: Chunking Documents")
    print("=" * 60)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=TARGET_CHUNK_SIZE,
        chunk_overlap=50,
        separators=["\n\n", "\n", ". ", " ", ""]
    )
    chunks = splitter.split_documents(docs)
    print(f"✅ Created {len(chunks)} chunks\n")

    # Step 3: Embeddings
    print("=" * 60)
    print("STEP 3: Creating Vector Embeddings")
    print("=" * 60)
    embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    vectorstore = FAISS.from_documents(chunks, embedder)
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    vectorstore.save_local(OUTPUT_DIR)
    print(f"✅ Vectorstore saved at {OUTPUT_DIR}\n")

    # Step 4: Chunk Analysis
    print("=" * 60)
    print("STEP 4: Chunk Analysis")
    print("=" * 60)
    df_chunks = pd.DataFrame({
        "source_pdf": [c.metadata["source"] for c in chunks],
        "extraction_method": [c.metadata.get("extraction_method", "unknown") for c in chunks],
        "chunk_text": [c.page_content for c in chunks],
        "chunk_length": [len(c.page_content) for c in chunks]
    })

    # === Chunking Quality Check ===
    print("\n📊 CHUNKING EFFICIENCY REPORT")
    print("-" * 60)
    for pdf, group in df_chunks.groupby("source_pdf"):
        avg_len = group["chunk_length"].mean()
        std_len = group["chunk_length"].std()
        num_chunks = len(group)
        efficiency = (avg_len / TARGET_CHUNK_SIZE) * 100
        print(f"📘 {pdf}: {num_chunks} chunks | Avg {avg_len:.1f} chars | StdDev {std_len:.1f} | Efficiency {efficiency:.1f}%")
        if efficiency < 60:
            print("  ⚠️  Too small chunks → consider increasing chunk_size.")
        elif efficiency > 120:
            print("  ⚠️  Chunks too large → may affect retrieval speed.")
        else:
            print("  ✅ Balanced chunking.")

    # Step 4.5: Show Random Samples per PDF
    print("\n" + "=" * 60)
    print("STEP 4.5: Sample Texts from Each PDF")
    print("=" * 60)

    for pdf, group in df_chunks.groupby("source_pdf"):
        print(f"\n📘 {pdf} — showing 5 random chunks:\n" + "-" * 60)
        samples = group.sample(min(5, len(group)), random_state=42)
        for i, (_, row) in enumerate(samples.iterrows(), 1):
            preview = row["chunk_text"][:400].replace("\n", " ")
            print(f"{i}. [{row['extraction_method']}] {preview}...\n")

    # Step 5: Test retrieval
    print("\n" + "=" * 60)
    print("STEP 5: Testing Retrieval")
    print("=" * 60)
    test_query = "What information is available in these documents?"
    results = vectorstore.similarity_search(test_query, k=3)
    print(f"\n🔎 Query: '{test_query}'\n📌 Top 3 results:\n")
    for i, r in enumerate(results, 1):
        print(f"{i}. Source: {r.metadata['source']}")
        print(f"   Method: {r.metadata.get('extraction_method', 'unknown')}")
        print(f"   Text: {r.page_content[:250]}...\n")

    return vectorstore, chunks, df_chunks

# -------------------------
# Run the pipeline
# -------------------------
if __name__ == "__main__":
    vectorstore, chunks, df_chunks = build_vectorstore()


STEP 1: Loading PDFs
📂 Found 5 PDF files

✅ gsh-13_v1_panic_january24_final.pdf: Standard extraction OK (23841 chars)
🖼️  How-can-I-help-someone-having-a-panic-attack-Accessible.pdf: Detected image-heavy PDF → using OCR directly.


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


⚠️  OCR failed for How-can-I-help-someone-having-a-panic-attack-Accessible.pdf: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\lenovo\\AppData\\Local\\Temp\\tmphh3kl5hi\\bd2897ef-219d-423e-90ba-1b7b6d386cd2-2.ppm'
❌ How-can-I-help-someone-having-a-panic-attack-Accessible.pdf: No text extracted (skipping)


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Cannot set gray stroke color because /'P19' is an invalid float value
Cannot set gray non-stroke color because /'P19' is an invalid float value
Cannot set gray stroke color because /'P24' is an invalid float value
Cannot set gray non-stroke color because /'P24' is an invalid float value
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None can

✅ Panic Attacks & Panic Disorder_ Causes, Symptoms & Treatment.pdf: Standard extraction OK (13366 chars)


The PDF <_io.BufferedReader name='./data/docs\\Panic-ER-final-2022.pdf'> contains a metadata field indicating that it should not allow text extraction. Ignoring this field and proceeding. Use the check_extractable if you want to raise an error in this case


✅ panic-a-self-help-guide.pdf: Standard extraction OK (34936 chars)
✅ Panic-ER-final-2022.pdf: Standard extraction OK (6360 chars)

✅ Successfully loaded 4 PDFs

Processing Summary
                                                            file      method  text_length
                             gsh-13_v1_panic_january24_final.pdf PyPDFLoader        23842
Panic Attacks & Panic Disorder_ Causes, Symptoms & Treatment.pdf PyPDFLoader        13366
                                     panic-a-self-help-guide.pdf PyPDFLoader        34936
                                         Panic-ER-final-2022.pdf PyPDFLoader         6360

OCR Usage: 0 / 4 PDFs

STEP 2: Chunking Documents
✅ Created 175 chunks

STEP 3: Creating Vector Embeddings


  embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")


✅ Vectorstore saved at ./vectorstore

STEP 4: Chunk Analysis

📊 CHUNKING EFFICIENCY REPORT
------------------------------------------------------------
📘 Panic Attacks & Panic Disorder_ Causes, Symptoms & Treatment.pdf: 30 chunks | Avg 454.9 chars | StdDev 63.2 | Efficiency 91.0%
  ✅ Balanced chunking.
📘 Panic-ER-final-2022.pdf: 15 chunks | Avg 449.9 chars | StdDev 99.6 | Efficiency 90.0%
  ✅ Balanced chunking.
📘 gsh-13_v1_panic_january24_final.pdf: 53 chunks | Avg 467.6 chars | StdDev 43.1 | Efficiency 93.5%
  ✅ Balanced chunking.
📘 panic-a-self-help-guide.pdf: 77 chunks | Avg 461.0 chars | StdDev 38.5 | Efficiency 92.2%
  ✅ Balanced chunking.

STEP 4.5: Sample Texts from Each PDF

📘 Panic Attacks & Panic Disorder_ Causes, Symptoms & Treatment.pdf — showing 5 random chunks:
------------------------------------------------------------
1. [PyPDFLoader] Medically Reviewed Last reviewed on 02/12/2023. Learn more about the Health Library and our editorial process. References Appointments86

In [8]:
# --- Gemini RAG Integration in Jupyter ---

# Install dependencies (run once per environment)
#ip install --quiet google-genai langchain_community sentence-transformers faiss-cpu

# --- Imports ---
import os
from google import genai
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import SentenceTransformerEmbeddings

# --- SETUP ---
GOOGLE_API_KEY = "AIzaSyBF0VcET3kuCYmrdixWz3oz1nHzoC8wfoI"
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

# Initialize Gemini client
client = genai.Client(api_key=GOOGLE_API_KEY)

# Load your vectorstore (adjust path if needed)
VECTORSTORE_PATH = "./vectorstore"
embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.load_local(VECTORSTORE_PATH, embedder, allow_dangerous_deserialization=True)

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# --- PROMPT TEMPLATE ---
SYSTEM_PROMPT = (
    "You are an empathetic assistant helping users manage anxiety and panic.\n"
    "Provide calm, practical, and kind guidance based on the provided context.\n"
    "If a user mentions crisis or self-harm, encourage professional help or helplines."
)

def build_prompt(question, context):
    return f"{SYSTEM_PROMPT}\n\nContext:\n{context}\n\nUser question:\n{question}\n\nAnswer:"

# --- GEMINI CALL FUNCTION ---
from google.genai import types

def call_gemini(prompt):
    config = types.GenerateContentConfig(
        system_instruction=SYSTEM_PROMPT,
        max_output_tokens=400,
        temperature=0.2,
        response_mime_type="application/json"
    )
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=prompt,
        config=config
    )
    return response.text



# --- MAIN QA FUNCTION ---
def ask(question):
    docs = retriever.get_relevant_documents(question)
    context = "\n\n".join([d.page_content for d in docs])
    prompt = build_prompt(question, context)
    answer = call_gemini(prompt)
    return answer

# --- TEST RUN ---
question = "What are some grounding techniques to stop a panic attack?"
print("🧩 Question:", question)
print("\n💬 Answer:\n", ask(question))


🧩 Question: What are some grounding techniques to stop a panic attack?

💬 Answer:
 {
  "response": "While it's difficult to immediately stop a panic attack once it begins, there are techniques to help you feel safer and manage the experience. Some suggestions include: \n\n*   **Avoid caffeine, alcohol, and smoking:** These substances can worsen panic attacks.\n*   **Make lifestyle changes:** Incorporate regular exercise to manage stress, relieve tension, and boost your mood. Eating regular, healthy meals and avoiding processed foods and drinks can also help stabilize blood sugar levels.\n*   **Challenge your thoughts:** Avoid scanning your body for evidence of something being wrong. Instead, focus on something to make yourself feel safe, such as sitting down if you feel faint or lying down if you think you're having a heart attack.\n\nIf panic attacks are a recurring issue, consider exploring resources like the Centre for Clinical Interventions (CCI) workbooks for additional support: h

In [10]:

# --- SETUP ---
GOOGLE_API_KEY = "AIzaSyBF0VcET3kuCYmrdixWz3oz1nHzoC8wfoI"
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

In [12]:
# --- Gemini RAG Integration in Jupyter ---

# Install dependencies (run once per environment)
# !pip install --quiet google-genai langchain_community sentence-transformers faiss-cpu

# --- Imports ---
import os
from google import genai
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import SentenceTransformerEmbeddings
from google.genai import types

# Initialize Gemini client
client = genai.Client(api_key=GOOGLE_API_KEY)

# Load your vectorstore (adjust path if needed)
VECTORSTORE_PATH = "./vectorstore"  # path where your FAISS vectorstore is saved
embedder = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
vectorstore = FAISS.load_local(VECTORSTORE_PATH, embedder, allow_dangerous_deserialization=True)

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# --- PROMPT TEMPLATE ---
SYSTEM_PROMPT = (
    "You are an empathetic assistant helping users manage anxiety and panic.\n"
    "Provide calm, practical, and kind guidance based on the provided context.\n"
    "If a user mentions crisis or self-harm, encourage professional help or helplines."
)

# --- GEMINI CALL FUNCTION (flash model, system prompt merged) ---
def call_gemini_with_context(context, question):
    # Merge system prompt, context, and user question into a single string
    full_prompt = f"{SYSTEM_PROMPT}\n\nContext:\n{context}\n\nUser question:\n{question}\n\nAnswer:"

    # Generate response using flash model
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[full_prompt],
        config=types.GenerateContentConfig(
            temperature=0.2,
            max_output_tokens=400
        )
    )
    return response.text

# --- MAIN QA FUNCTION ---
def ask(question):
    # Step 1: Retrieve relevant PDF chunks
    docs = retriever.get_relevant_documents(question)
    context = "\n\n".join([d.page_content for d in docs])
    
    # Step 2: Call Gemini with context + question
    answer = call_gemini_with_context(context, question)
    return answer

# --- TEST RUN ---
question = "What are some grounding techniques to stop a panic attack?"
print("🧩 Question:", question)
print("\n💬 Answer:\n", ask(question))


🧩 Question: What are some grounding techniques to stop a panic attack?

💬 Answer:
 It sounds like you're looking for ways to manage panic attacks, and grounding techniques can be really helpful. It's great that you're being proactive about this.

Here are a few ideas based on the information you shared, keeping in mind that everyone is different, and it might take some experimenting to find what works best for you:

*   **Focus on your senses:** The goal is to bring you back to the present moment. You can try the 5-4-3-2-1 method. Name 5 things you can see, 4 things you can touch, 3 things you can hear, 2 things you can smell, and 1 thing you can taste.

*   **Safe actions:** Is there something you can do to make yourself feel safer in the moment? The information mentions that some people try gulping air if they feel like they are suffocating, sitting down if they feel faint, or lying down if they think they are having a heart attack.

It's important to remember that panic attacks can 