In [8]:
# This utility opens the Leafra SQLite DB, dumps the chunks of each document from the DB (chunks can overlap), and merges them.
# It then compares the merged text with the original document.
# It uses the schema defined in bool SQLiteDatabase::createRAGTables() in leafra_sqlite.cpp.

import os
import sqlite3

# Location of the Leafra SQLite database. Set the LEAFRA_DB_PATH environment
# variable to point this utility at another database without editing the cell.
leafradbpath = os.environ.get(
    "LEAFRA_DB_PATH",
    "/Users/arifdikici/Library/Application Support/LeafraSDK/leafra.db",
)

# sqlite3.connect() succeeds even when the file does not exist (it works on a
# new, empty database), which would only surface further down as a confusing
# failure on the schema lookup. Fail early with a clear message instead.
if not os.path.isfile(leafradbpath):
    raise FileNotFoundError(f"Database file not found: {leafradbpath}")

# Open the SQLite database
try:
    conn = sqlite3.connect(leafradbpath)
    cursor = conn.cursor()
    print(f"Successfully opened database at {leafradbpath}")
except sqlite3.Error as e:
    print(f"Error opening database: {e}")
    raise

# Look up the CREATE TABLE statements of the two tables this utility reads
# (see SQLiteDatabase::createRAGTables() in leafra_sqlite.cpp).
cursor.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='docs'")
docs_schema = cursor.fetchone()
cursor.execute("SELECT sql FROM sqlite_master WHERE type='table' AND name='chunks'")
chunks_schema = cursor.fetchone()

# fetchone() returns None when a table is absent; report that explicitly
# rather than crashing on None[0].
missing_tables = [
    table_name
    for table_name, schema_row in (("docs", docs_schema), ("chunks", chunks_schema))
    if schema_row is None
]
if missing_tables:
    raise RuntimeError(
        f"Expected table(s) not found in {leafradbpath}: {', '.join(missing_tables)}"
    )

print("\nDocs table schema:")
print(docs_schema[0])
print("\nChunks table schema:")
print(chunks_schema[0])

# Query all documents
cursor.execute("SELECT id, filename, url, creation_date, size FROM docs")
documents = cursor.fetchall()

print(f"\nFound {len(documents)} documents")

# NOTE: doc_chunks is rebuilt for every document, so once the loop finishes it
# holds the chunks of the LAST document only. It stays a plain list because
# code further down in this cell reads it in that shape.
doc_chunks = []

for doc in documents:
    doc_id, filename, url, creation_date, size = doc
    print(f"\nDocument {doc_id}:")
    print(f"Filename: {filename}")
    print(f"URL: {url}")
    print(f"Created: {creation_date}")
    print(f"Size: {size}")

    # Get all chunks for this document, ordered by chunk number (include page numbers)
    cursor.execute("""
        SELECT chunk_no, chunk_text, chunk_token_size, chunk_size, chunk_page_number 
        FROM chunks 
        WHERE doc_id = ? 
        ORDER BY chunk_no
    """, (doc_id,))

    chunks = cursor.fetchall()
    print(f"Number of chunks: {len(chunks)}")

    # Store chunks in memory as dicts keyed by field name
    doc_chunks = []
    for chunk in chunks:
        chunk_no, chunk_text, token_size, size, page_number = chunk
        doc_chunks.append({
            'chunk_no': chunk_no,
            'text': chunk_text,
            'token_size': token_size,
            'size': size,
            'page_number': page_number
        })
        print(f"Chunk {chunk_no}: {token_size} tokens, {size} bytes, page {page_number}")

# Helper function to find which chunk contains a specific position
def find_chunk_for_position(chunks, position, merged_text):
    """Estimate which chunk covers a given character offset.

    The chunks are treated as if laid out back to back in list order, each
    one followed by a single separator character. Overlap between
    neighbouring chunks is not taken into account, so the answer is only an
    approximation. ``merged_text`` is accepted but not consulted.

    Args:
        chunks: sequence of dicts with at least 'chunk_no' and 'text';
            'page_number' is optional.
        position: character offset to look up.
        merged_text: unused.

    Returns:
        A dict with 'chunk_no', 'page' ('unknown' when the chunk has no
        'page_number'), 'start_pos' and 'end_pos' (exclusive) for the first
        chunk whose span contains ``position``, or None if no span does.
    """
    span_start = 0

    for entry in chunks:
        span_end = span_start + len(entry['text'])

        if not (span_start <= position < span_end):
            # Skip this chunk plus the one-character separator after it.
            span_start = span_end + 1
            continue

        return {
            'chunk_no': entry['chunk_no'],
            'page': entry.get('page_number', 'unknown'),
            'start_pos': span_start,
            'end_pos': span_end,
        }

    return None

# For each document: merge its chunks (which may overlap) without duplicating
# text, then compare the merged text with the original document whose location
# is stored in the docs table. Every document gets its own chunks and its own
# comparison; nothing is carried over from loops that ran earlier in the cell.
import os
from difflib import SequenceMatcher
from io import BytesIO
from itertools import accumulate
from urllib.parse import urlparse
from urllib.request import url2pathname


def _load_doc_chunks(cursor, doc_id):
    """Return the chunks of one document, ordered by chunk_no, as dicts.

    Each dict has the keys 'chunk_no', 'text', 'token_size', 'size' and
    'page_number', filled from the chunks table columns of the same meaning.
    """
    cursor.execute("""
        SELECT chunk_no, chunk_text, chunk_token_size, chunk_size, chunk_page_number
        FROM chunks
        WHERE doc_id = ?
        ORDER BY chunk_no
    """, (doc_id,))
    return [
        {
            'chunk_no': chunk_no,
            'text': chunk_text,
            'token_size': token_size,
            'size': size,
            'page_number': page_number,
        }
        for chunk_no, chunk_text, token_size, size, page_number in cursor.fetchall()
    ]


def _merge_overlapping_chunks(chunks):
    """Merge chunk texts into one string, dropping word-level overlaps.

    For each chunk after the first, the longest run of words that both ends
    the text merged so far and starts the chunk is treated as overlap and is
    appended only once. A warning is printed when two neighbouring chunks
    share no words at all, in which case they are joined with a space.

    Args:
        chunks: non-empty list of chunk dicts with 'text' and 'chunk_no'.

    Returns:
        The merged text.
    """
    merged_text = chunks[0]['text']
    # Words of merged_text, maintained incrementally so the whole merged text
    # does not have to be re-split for every chunk.
    merged_words = merged_text.split()

    for i in range(1, len(chunks)):
        current_chunk = chunks[i]['text']
        curr_words = current_chunk.split()

        # Try the longest candidate overlap first so the maximal one wins.
        overlap_size = 0
        for overlap_len in range(min(len(merged_words), len(curr_words)), 0, -1):
            if merged_words[-overlap_len:] == curr_words[:overlap_len]:
                overlap_size = overlap_len
                break

        if overlap_size == 0:
            # Name the real neighbouring chunk numbers; they need not be consecutive.
            print(f"WARNING: No word overlap found between chunks {chunks[i - 1]['chunk_no']} and {chunks[i]['chunk_no']}")
            # Just concatenate with space
            merged_text += " " + current_chunk
            merged_words.extend(curr_words)
        else:
            # Merge by removing overlap from current chunk
            new_words = curr_words[overlap_size:]
            merged_text += " " + " ".join(new_words)
            merged_words.extend(new_words)

    return merged_text


def _normalize_text(text):
    """Collapse every run of whitespace into a single space; case is preserved."""
    return " ".join(text.split())


def _extract_pdf_text(doc_url):
    """Return the text of the PDF at doc_url, or None if a local file is missing.

    http/https URLs are downloaded; file:// URLs and plain paths are read from
    disk. PyPDF2 (and requests, for web URLs) are imported lazily so that a
    missing package is reported by the caller as a per-document error.

    Raises:
        Whatever PyPDF2 / requests raise for unreadable or unreachable input.
    """
    import PyPDF2

    parsed_url = urlparse(doc_url)

    if parsed_url.scheme in ('http', 'https'):
        # It's a web URL - download it
        import requests
        # Bound the wait and reject HTTP error pages instead of parsing them as PDF.
        response = requests.get(doc_url, timeout=30)
        response.raise_for_status()
        pdf_reader = PyPDF2.PdfReader(BytesIO(response.content))
    else:
        # It's a local file, given either as a file:// URL or as a plain path
        local_path = url2pathname(parsed_url.path) if parsed_url.scheme == 'file' else doc_url
        if not os.path.exists(local_path):
            print(f"Error: Local file not found: {doc_url}")
            return None
        pdf_reader = PyPDF2.PdfReader(local_path)

    # Join pages with a newline so the last word of one page is not fused with
    # the first word of the next; tolerate pages that yield no text.
    return "\n".join((page.extract_text() or "") for page in pdf_reader.pages)


def _report_differences(doc_id, original_text, merged_text, chunks):
    """Print a comparison of the original document text and the merged chunks.

    Both texts are whitespace-normalized first. If they differ, prints a
    character-level similarity percentage, every word-level difference (with a
    best-effort guess of the chunk involved) and the first differing character
    position with surrounding context.
    """
    original_normalized = _normalize_text(original_text)
    merged_normalized = _normalize_text(merged_text)

    if original_normalized == merged_normalized:
        print(f"Document {doc_id}: Merged chunks match original document exactly!")
        return

    print(f"Document {doc_id}: Differences found between merged chunks and original")

    # Calculate similarity percentage
    similarity = SequenceMatcher(None, original_normalized, merged_normalized).ratio() * 100
    print(f"Text similarity: {similarity:.2f}%")

    if len(original_normalized) != len(merged_normalized):
        print(f"Length difference: Original={len(original_normalized)}, Merged={len(merged_normalized)}")

    # Split into words for better difference detection
    original_words = original_normalized.split()
    merged_words = merged_normalized.split()

    print(f"\nDetailed difference analysis:")
    print(f"Original word count: {len(original_words)}")
    print(f"Merged word count: {len(merged_words)}")

    # chars_before[k] = total characters in the first k original words (no
    # separators); lets each difference be located without re-joining the prefix.
    chars_before = [0, *accumulate(len(word) for word in original_words)]

    matcher = SequenceMatcher(None, original_words, merged_words)
    differences_found = 0

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            continue

        differences_found += 1
        print(f"\nDifference #{differences_found} - {tag.upper()}:")

        if tag == 'delete':
            print(f"  Missing in merged (words {i1}-{i2}): {' '.join(original_words[i1:i2])}")
        elif tag == 'insert':
            print(f"  Extra in merged (words {j1}-{j2}): {' '.join(merged_words[j1:j2])}")
        elif tag == 'replace':
            print(f"  Original (words {i1}-{i2}): {' '.join(original_words[i1:i2])}")
            print(f"  Merged (words {j1}-{j2}): {' '.join(merged_words[j1:j2])}")

        # Try to map differences back to chunks
        if tag in ['delete', 'replace'] and i1 < len(original_words):
            # Same value as len(' '.join(original_words[:i1])): the characters
            # of the preceding words plus the single spaces between them.
            word_position = chars_before[i1] + max(i1 - 1, 0)
            chunk_info = find_chunk_for_position(chunks, word_position, merged_text)
            if chunk_info:
                print(f"  -> Likely in chunk {chunk_info['chunk_no']} (page {chunk_info.get('page', 'unknown')})")

    if differences_found == 0:
        print("No word-level differences found (this shouldn't happen if texts don't match)")
    else:
        print(f"\nTotal differences found: {differences_found}")

    # Show character-level first difference for debugging
    print(f"\nFirst character difference:")
    shared_len = min(len(original_normalized), len(merged_normalized))
    first_diff = next(
        (k for k in range(shared_len) if original_normalized[k] != merged_normalized[k]),
        None,
    )
    if first_diff is None and len(original_normalized) != len(merged_normalized):
        # One text is a strict prefix of the other: they diverge where the shorter ends.
        first_diff = shared_len
    if first_diff is not None:
        context = 100  # More context for debugging
        start = max(0, first_diff - context)
        end = first_diff + context  # slicing clamps to each text's own length
        print(f"Position {first_diff}:")
        print(f"Original: ...{original_normalized[start:end]}...")
        print(f"Merged:   ...{merged_normalized[start:end]}...")


print("\nMerging chunks and verifying overlaps...")

# Merged text of every processed document, keyed by document id.
merged_texts_by_doc = {}

for doc in documents:
    # documents rows are (id, filename, url, creation_date, size)
    doc_id, filename, doc_url = doc[0], doc[1], doc[2]
    print(f"\nProcessing document {doc_id}: {filename}")

    # Load this document's own chunks rather than reusing a list left behind
    # by an earlier loop, which would belong to whichever document came last.
    chunks = _load_doc_chunks(cursor, doc_id)

    if not chunks:
        print("No chunks found for document")
        continue

    merged_text = _merge_overlapping_chunks(chunks)

    print(f"Original total chunks length: {sum(len(c['text']) for c in chunks)}")
    print(f"Merged text length: {len(merged_text)}")

    # Store merged text for later use (doc_merged_texts keeps its original
    # shape and describes the most recently processed document).
    doc_merged_texts = {
        'doc_id': doc_id,
        'merged_text': merged_text
    }
    merged_texts_by_doc[doc_id] = merged_text

    if not doc_url:
        print(f"Warning: No URL found for document {doc_id}")
        # Skip the comparison since there is no original to compare against
        continue

    # Best effort per document: a failure on one original (unreadable PDF,
    # network error, missing package) must not stop the remaining documents.
    try:
        original_text = _extract_pdf_text(doc_url)
        if original_text is not None:
            _report_differences(doc_id, original_text, merged_text, chunks)
    except Exception as e:
        print(f"Error processing document {doc_id}: {str(e)}")











Successfully opened database at /Users/arifdikici/Library/Application Support/LeafraSDK/leafra.db

Docs table schema:
CREATE TABLE docs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            filename TEXT NOT NULL,
            url TEXT,
            creation_date DATETIME DEFAULT CURRENT_TIMESTAMP,
            size INTEGER NOT NULL
        )

Chunks table schema:
CREATE TABLE chunks (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            doc_id INTEGER NOT NULL,
            chunk_page_number INTEGER NOT NULL,
            chunk_faiss_id INTEGER,
            chunk_no INTEGER NOT NULL,
            chunk_token_size INTEGER NOT NULL,
            chunk_size INTEGER NOT NULL,
            chunk_text TEXT NOT NULL,
            chunk_embedding BLOB,
            FOREIGN KEY (doc_id) REFERENCES docs(id) ON DELETE CASCADE
        )

Found 1 documents

Document 1:
Filename: cs229_lecture_notes.pdf
URL: /Users/arifdikici/Documents/Squirrel/LeafraSDK/example/example_files/cs229_lecture