# Library Manager Test - With Progress Tracking

This notebook tests the LibraryManager with better progress visibility.

## Setup

In [1]:
import sys
from pathlib import Path

# Walk three directory levels up from the working directory to reach the
# repository root, then make it importable so the `src.*` packages resolve.
repo_dir = Path.cwd()
for _ in range(3):
    repo_dir = repo_dir.parent
project_root = str(repo_dir)

already_importable = project_root in sys.path
if not already_importable:
    sys.path.insert(0, project_root)

print(f"‚úÖ Project root: {project_root}")

‚úÖ Project root: /home/moad/desktop/open-books


## Import Configuration (LibraryManager is defined in the next cell)

In [2]:
# `settings` and `LibreryConfig` are needed by the cells below:
# `settings.vector_store` and `settings.librery` are read there, and
# `LibreryConfig` is the type hint of LibraryManager.__init__.
# NOTE(review): "Librery" is presumably the project's own spelling — confirm
# against src.utils.config before "fixing" it here.
from src.utils.config import settings, LibreryConfig

print("‚úÖ Imports successful")

‚úÖ Imports successful


In [6]:
"""Library management for syncing PDFs to vector store."""

import hashlib
import json
from pathlib import Path
from typing import Dict

from src.ingestion.chunking.get_chunker import get_chunker
from src.ingestion.embedding.get_embbedder import get_embedder
from src.ingestion.parsers.get_parser import get_parser
from src.ingestion.vector_store.stores import get_store, ChromaStore
from src.utils.logger import logger


class LibraryManager:
    """Keeps a vector store in sync with a directory of PDF files.

    A JSON manifest maps each indexed PDF file name to the SHA256 hex digest
    of the content that was indexed. ``sync`` compares that manifest with the
    ``*.pdf`` files currently in the books directory to decide which files to
    skip, (re)index, or remove from the store.

    Invariant maintained by this class: a file name is present in the
    manifest only while its chunks are believed to be in the store. Every
    path that deletes chunks also drops the manifest entry, so a failed
    re-index is retried on the next sync instead of being skipped.
    """

    def __init__(self, config: "LibreryConfig") -> None:
        """Create the store, parser and chunker, then load the manifest.

        Args:
            config: Library configuration; ``books_paths`` and
                ``manifest_path`` are read from it.
        """
        self.books_dir = Path(config.books_paths)
        self.manifest_path = Path(config.manifest_path)

        logger.info("Initializing LibraryManager...")

        # Components are created once and reused for every file.
        # NOTE(review): no embedder is created here; the run log suggests
        # ChromaStore obtains its own — confirm in stores.py.
        self.store = ChromaStore(settings.vector_store)
        self.parser = get_parser()
        self.chunker = get_chunker()

        # Load manifest
        self.manifest = self._load_manifest()
        logger.info(f"Loaded manifest with {len(self.manifest)} entries")

    def _load_manifest(self) -> Dict[str, str]:
        """Read the manifest from disk.

        Returns:
            The stored mapping, or an empty dict when the file is missing,
            unreadable, not valid JSON, or not a JSON object.
        """
        if not self.manifest_path.exists():
            return {}

        try:
            data = json.loads(self.manifest_path.read_text(encoding="utf-8"))
        except Exception as e:
            logger.warning(f"Failed to load manifest: {e}. Starting fresh.")
            return {}

        # Valid JSON that is not an object (e.g. a list) would break the
        # ``.get`` / ``in`` lookups used throughout the class.
        if not isinstance(data, dict):
            logger.warning(
                "Failed to load manifest: expected a JSON object, "
                f"got {type(data).__name__}. Starting fresh."
            )
            return {}
        return data

    def _save_manifest(self) -> None:
        """Write the manifest to disk; failures are logged, not raised."""
        try:
            self.manifest_path.parent.mkdir(parents=True, exist_ok=True)
            self.manifest_path.write_text(
                json.dumps(self.manifest, indent=2), encoding="utf-8"
            )
            logger.debug("Manifest saved")
        except Exception as e:
            logger.error(f"Failed to save manifest: {e}")

    def _calculate_hash(self, file_path: Path) -> str:
        """Return the SHA256 hex digest of ``file_path``, read in 8 KiB chunks."""
        sha256 = hashlib.sha256()
        with open(file_path, "rb") as f:
            while chunk := f.read(8192):
                sha256.update(chunk)
        return sha256.hexdigest()

    def sync(self) -> None:
        """Bring the vector store in line with the books directory.

        Unchanged files are skipped, new and changed files are indexed, and
        files that are in the manifest but no longer on disk are removed from
        the store. A missing books directory aborts the sync without touching
        the index, so a misconfigured path cannot wipe it.
        """
        logger.info(f"Starting sync from: {self.books_dir}")

        if not self.books_dir.is_dir():
            logger.error(f"Books directory not found: {self.books_dir}")
            return

        # Only regular files: a directory named "x.pdf" cannot be hashed.
        current_files = [p for p in self.books_dir.glob("*.pdf") if p.is_file()]
        logger.info(f"Found {len(current_files)} PDF files")

        if not current_files:
            logger.warning("No PDF files found in books directory")
            if not self.manifest:
                return
            # Otherwise fall through: every manifest entry is now a deleted
            # file and must still be purged from the store.

        found_filenames = {f.name for f in current_files}

        self._cleanup_deleted_files(found_filenames)

        self._process_files(current_files)

        self._save_manifest()

        logger.success(f"Sync complete! Vector store has {self.store.count()} chunks")

    def _cleanup_deleted_files(self, found_filenames: set) -> None:
        """Remove manifest entries (and their chunks) for files no longer on disk.

        Args:
            found_filenames: Names of the PDF files currently present.
        """
        # Iterate over a snapshot because entries are deleted inside the loop.
        for filename in list(self.manifest.keys()):
            if filename not in found_filenames:
                logger.info(f"File removed: {filename}")
                try:
                    self.store.delete_by_filename(filename)
                    del self.manifest[filename]
                    logger.info(f"Cleaned up {filename} from index")
                except Exception as e:
                    # The entry stays in the manifest, so the cleanup is
                    # attempted again on the next sync.
                    logger.error(f"Failed to clean up {filename}: {e}")

    def _process_files(self, current_files: list) -> None:
        """Index every new or changed file; errors are logged per file.

        Args:
            current_files: Paths of the PDF files to examine.
        """
        total = len(current_files)

        for idx, file_path in enumerate(current_files, 1):
            name = file_path.name
            logger.info(f"\n[{idx}/{total}] Processing: {name}")

            try:
                # Calculate current hash
                logger.debug(f"Calculating hash for {name}...")
                current_hash = self._calculate_hash(file_path)

                # Check if file changed
                if self.manifest.get(name) == current_hash:
                    logger.info(f"Skipping {name} (unchanged)")
                    continue

                # File is new or changed
                if name in self.manifest:
                    logger.info(f"Content changed: {name}")
                    self.store.delete_by_filename(name)
                    # The old chunks are gone, so the old hash must go too.
                    # Otherwise a failed re-index followed by reverting the
                    # file would be skipped as "unchanged" while missing
                    # from the store.
                    del self.manifest[name]
                else:
                    logger.info(f"New file: {name}")

                # Process file
                self._index_file(file_path, name, current_hash)

                # Persist progress per file so an interrupted sync does not
                # re-ingest files that are already stored.
                self._save_manifest()

            except Exception as e:
                logger.error(f"Failed to process {name}: {e}")
                logger.exception("Full traceback:")
                # Continue with next file instead of crashing

    def _index_file(self, file_path: Path, name: str, file_hash: str) -> None:
        """Parse, chunk and ingest one file, then record its hash.

        Args:
            file_path: Location of the PDF.
            name: Manifest key for the file.
            file_hash: Digest stored in the manifest once ingestion succeeds.

        Any exception from the parser, chunker or store propagates, leaving
        the manifest without an entry for ``name``.
        """
        # Parse
        logger.info(f"  Parsing {name}...")
        parsed_doc = self.parser.parse(file_path)
        logger.info(f"  Parsed {parsed_doc.metadata.nbr_pages} pages")

        # Chunk
        logger.info(f"  Chunking {name}...")
        chunked_doc = self.chunker.chunk(parsed_doc)
        logger.info(f"  Created {len(chunked_doc)} chunks")

        # Ingest
        logger.info(f"  Storing {name}...")
        self.store.ingest(chunks=chunked_doc)
        logger.info("  Stored in vector DB")

        # Update manifest only after the store accepted the chunks.
        self.manifest[name] = file_hash
        logger.success(f"Successfully indexed: {name}")

    def get_stats(self) -> Dict[str, int]:
        """Return the manifest entry count and the store's chunk count."""
        return {
            "indexed_files": len(self.manifest),
            "total_chunks": self.store.count(),
        }

    def force_reindex(self, filename: str) -> None:
        """Drop and rebuild the index for one file regardless of its hash.

        Args:
            filename: Name of a file inside the books directory. A missing
                file is logged and nothing is changed.
        """
        logger.info(f"Force reindexing: {filename}")

        file_path = self.books_dir / filename
        if not file_path.exists():
            logger.error(f"File not found: {filename}")
            return

        # Remove from index
        if filename in self.manifest:
            self.store.delete_by_filename(filename)
            del self.manifest[filename]
            # Persist the removal now: if indexing below raises, the on-disk
            # manifest must not keep claiming the file is indexed.
            self._save_manifest()

        # Reindex
        current_hash = self._calculate_hash(file_path)
        self._index_file(file_path, filename, current_hash)
        self._save_manifest()

    def clear_all(self) -> None:
        """Clear the store and reset the manifest to empty (in memory and on disk)."""
        logger.warning("Clearing all indexed data...")
        self.store.clear()
        self.manifest = {}
        self._save_manifest()
        logger.success("All data cleared")

## Check Configuration

In [7]:
# Pull the library section out of the global settings.
config = settings.librery

print(f"üìÅ Books directory: {config.books_paths}")
print(f"üìÑ Manifest path: {config.manifest_path}")

# Report which PDFs are on disk before the manager is constructed.
books_path = Path(config.books_paths)
if not books_path.exists():
    print(f"‚ùå Directory not found: {books_path}")
else:
    pdf_files = [*books_path.glob("*.pdf")]
    print(f"‚úÖ Found {len(pdf_files)} PDF files:")
    for pdf in pdf_files:
        print(f"   - {pdf.name}")

üìÅ Books directory: /home/moad/desktop/open-books/books
üìÑ Manifest path: /home/moad/desktop/open-books/config/manifest.json
‚úÖ Found 2 PDF files:
   - 1912_lora_low_rank_adaptation_of_la.pdf
   - Adam A Method for Stochastic Optimization.pdf


## Initialize LibraryManager

In [8]:
# Constructing the manager creates the ChromaStore, parser and chunker and
# loads the manifest from `config.manifest_path` (see LibraryManager.__init__),
# which is why the log output of this cell comes from the store as well.
print("Initializing LibraryManager...\n")

manager = LibraryManager(config)

print("\n‚úÖ LibraryManager initialized!")

[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m22[0m - [1mInitializing LibraryManager...[0m
[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m25[0m - [1mcreating or getting the collection[0m
[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m31[0m - [1mgetting the embedder[0m
[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36m__main__[0m:[36m__init__[0m:[36m31[0m - [1mLoaded manifest with 2 entries[0m


Initializing LibraryManager...


‚úÖ LibraryManager initialized!


## Check Current Stats

In [9]:
# Snapshot of the index before syncing: manifest entries vs. stored chunks.
stats = manager.get_stats()

print("üìä Current Stats:")
for label, key in (("Indexed files", "indexed_files"), ("Total chunks", "total_chunks")):
    print(f"   {label}: {stats[key]}")

üìä Current Stats:
   Indexed files: 2
   Total chunks: 304


## Option 1: Sync Library (Incremental)

This will:
- Skip unchanged files
- Index new files
- Re-index changed files
- Remove deleted files

In [10]:
# Visual separator printed before and after the sync call.
divider = "=" * 60

print("Starting incremental sync...\n")
print(divider)

manager.sync()

print(divider)
print("\n‚úÖ Sync complete!")

# Re-read the stats so the effect of the sync is visible.
stats = manager.get_stats()
print("\nüìä Updated Stats:")
for label, key in (("Indexed files", "indexed_files"), ("Total chunks", "total_chunks")):
    print(f"   {label}: {stats[key]}")

[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36m__main__[0m:[36msync[0m:[36m62[0m - [1mStarting sync from: /home/moad/desktop/open-books/books[0m
[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36m__main__[0m:[36msync[0m:[36m66[0m - [1mFound 2 PDF files[0m
[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_files[0m:[36m98[0m - [1m
[1/2] Processing: 1912_lora_low_rank_adaptation_of_la.pdf[0m
[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_files[0m:[36m107[0m - [1mSkipping 1912_lora_low_rank_adaptation_of_la.pdf (unchanged)[0m
[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_files[0m:[36m98[0m - [1m
[2/2] Processing: Adam A Method for Stochastic Optimization.pdf[0m
[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36m__main__[0m:[36m_process_files[0m:[36m107[0m - [1mSkipping Adam A Method for Stochastic Optimization.pdf (unchanged)[0m
[32m202

Starting incremental sync...


‚úÖ Sync complete!

üìä Updated Stats:
   Indexed files: 2
   Total chunks: 304


## Option 2: Force Reindex Single File

In [11]:
# Uncomment to force reindex a specific file (the name must match a PDF in the books directory)
# manager.force_reindex("Word2Vec.pdf")

## Option 3: Clear All Data (⚠️ Destructive)

In [12]:
# Uncomment to clear all indexed data
# manager.clear_all()

## Test Search

In [13]:
# Smoke-test retrieval: run one query against the store, requesting 3 results.
test_query = "word embeddings"
results = manager.store.query([test_query], n_result=3)

print(f"üîç Search results for: '{test_query}'\n")
rank = 0
for hit in results:
    rank += 1
    preview = hit.content[:150]  # first 150 characters of the chunk text
    print(f"[{rank}] Score: {hit.score:.4f}")
    print(f"    File: {hit.metadata.source_doc_title}")
    print(f"    Preview: {preview}...")
    print()

[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery[0m:[36m60[0m - [1mquerying the results[0m
[32m2026-01-31 20:35:33[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mquery[0m:[36m96[0m - [1mfinished the querying - found 3 unique results[0m


üîç Search results for: 'word embeddings'

[1] Score: 0.7569
    File: BERT Pre-training of Deep Bidirectional Transformers for Language Understanding
    Preview: ## 2.1 Unsupervised Feature-based Approaches

Learning widely applicable representations of words has been an active area of research for decades, inc...

[2] Score: 0.8421
    File: BERT Pre-training of Deep Bidirectional Transformers for Language Understanding
    Preview: arned in translation: Contextualized word vectors. In NIPS .
- Oren Melamud, Jacob Goldberger, and Ido Dagan. 2016. context2vec: Learning generic cont...

[3] Score: 1.0083
    File: BERT Pre-training of Deep Bidirectional Transformers for Language Understanding
    Preview: ## A.2 Pre-training Procedure

To generate each training input sequence, we sample two spans of text from the corpus, which we refer to as 'sentences'...



## Debug: View Manifest

In [14]:
import json

# Dump the in-memory manifest (file name -> SHA256 digest) as indented JSON.
manifest_json = json.dumps(manager.manifest, indent=2)

print("üìã Current Manifest:")
print(manifest_json)

üìã Current Manifest:
{
  "1912_lora_low_rank_adaptation_of_la.pdf": "6154f901b7873cfc910e5d57156907f7b776f83f0c5f532a6b721bc614f0be35",
  "Adam A Method for Stochastic Optimization.pdf": "eab9c73ae2ceda884b94830bda99312254bac4806f6c9f045cbab90721ecda31"
}
