In [1]:
import sys
from pathlib import Path

# Put the directory three levels above the current working directory at the
# front of the import path; the later `from src...` imports rely on it.
project_root = str(Path.cwd().parent.parent.parent)
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [2]:
from src.ingestion.vector_store.stores import ChromaStore 

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from src.utils.config import settings, LibreryConfig

In [4]:
# Instantiate ChromaStore from the `vector_store` section of the settings.
# Per the log output of this cell, construction also loads the
# SentenceTransformer embedding model, which takes a few seconds on CPU.
vector_store = ChromaStore(settings.vector_store)


[32m2026-01-31 18:56:00[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m24[0m - [1mcreating or getting the collection[0m
[32m2026-01-31 18:56:00[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m30[0m - [1mgetting the embedder[0m
[32m2026-01-31 18:56:00[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m19[0m - [1mLoading SentenceTransformer model: all-MiniLM-L6-v2[0m
[32m2026-01-31 18:56:04[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m27[0m - [1mModel loaded: 384d on cpu[0m


In [5]:
# Show what the store reports as its count before the delete in the next cell.
vector_store.count()

0

In [6]:
# Ask the store to drop whatever it holds under the filename "Word2Vec"
# (the store logs this as "Deleting all chunks for: Word2Vec").
vector_store.delete_by_filename(filename="Word2Vec")

[32m2026-01-31 18:56:04[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36mdelete_by_filename[0m:[36m116[0m - [1mDeleting all chunks for: Word2Vec[0m


In [7]:
# Re-check the count after the delete in the previous cell.
vector_store.count()

0

In [8]:
from src.ingestion.vector_store.stores import get_store 
from src.ingestion.parsers.get_parser import get_parser
from src.ingestion.embedding.get_embbedder import get_embedder
from src.ingestion.chunking.get_chunker import get_chunker

In [11]:
import hashlib
import json
from pathlib import Path
from src.utils.logger import logger

class LibreryManager:
    """Keep the vector store in step with the PDF files in a books directory.

    A JSON manifest mapping ``file name -> SHA-256 hex digest`` records which
    files have been indexed and with what content. ``sync()`` compares the
    manifest with the directory and removes, re-indexes or indexes files
    accordingly.
    """

    def __init__(self, config: "LibreryConfig") -> None:
        """Create the collaborators and load the manifest from disk.

        Args:
            config: Library settings. ``books_paths`` and ``manifest_path`` are
                read from it and used as ``pathlib.Path``-like objects.
        """
        self.books_dir = config.books_paths
        self.manifest_path = config.manifest_path
        # get_store() returns a pair; only its first element is kept.
        # NOTE(review): the notebook log shows the SentenceTransformer model
        # being loaded twice while this constructor runs. Presumably the store
        # builds its own embedder and get_embedder() builds another; confirm
        # and consider sharing a single instance.
        self.store, _ = get_store()
        self.parser = get_parser()
        self.chunker = get_chunker()
        self.embedder = get_embedder()
        self.manifest = self._load_manifest()

    def _load_manifest(self) -> dict:
        """Read the manifest file.

        Returns:
            The parsed manifest, or an empty dict when the file does not exist.

        Raises:
            json.JSONDecodeError: If the file is not valid JSON.
            ValueError: If the file holds valid JSON that is not an object.
        """
        logger.info("loading the mainfest")
        manifest: dict = {}
        if self.manifest_path.exists():
            manifest = json.loads(self.manifest_path.read_text(encoding="utf-8"))
            if not isinstance(manifest, dict):
                raise ValueError(
                    f"Manifest at {self.manifest_path} must contain a JSON object"
                )
        # Logged only once the read has actually completed.
        logger.info("finished loading the mainfest")
        return manifest

    def _save_manifest(self) -> None:
        """Write the in-memory manifest to ``manifest_path`` as indented JSON."""
        # A first run may point at a directory that does not exist yet.
        self.manifest_path.parent.mkdir(parents=True, exist_ok=True)
        self.manifest_path.write_text(
            json.dumps(self.manifest, indent=4), encoding="utf-8"
        )

    def _calculate_hash(self, file_path: Path) -> str:
        """Return the SHA-256 hex digest of a file, read in 8 KiB blocks."""
        sha256 = hashlib.sha256()
        with open(file_path, "rb") as f:
            while chunk := f.read(8192):
                sha256.update(chunk)
        return sha256.hexdigest()

    def sync(self) -> None:
        """Bring the store and the manifest in line with ``books_dir``.

        Steps:
            1. Files listed in the manifest but no longer on disk are deleted
               from the store and dropped from the manifest.
            2. Each ``*.pdf`` file whose hash differs from the manifest (or
               that is not listed at all) is parsed, chunked, embedded and
               ingested; a previously indexed file has its old chunks deleted
               first.
            3. The manifest is written to disk, also when a step above raised,
               so the file always describes what the store currently holds.

        Raises:
            Any exception raised by the parser, chunker, embedder, store or
            file system is propagated after the manifest has been saved.
        """
        # Only regular files, in a stable order; a directory named "x.pdf"
        # cannot be hashed and would otherwise abort the run.
        current_files = sorted(
            path for path in self.books_dir.glob("*.pdf") if path.is_file()
        )
        found_filenames = {f.name for f in current_files}

        # NOTE(review): entries are keyed by Path.name (extension included) and
        # that same string is passed to store.delete_by_filename; confirm this
        # matches how the store labels chunks.
        try:
            for filename in list(self.manifest):
                if filename not in found_filenames:
                    self.store.delete_by_filename(filename)
                    del self.manifest[filename]
                    logger.info(f"Cleaned up {filename} from index.")

            for file_path in current_files:
                name = file_path.name
                current_hash = self._calculate_hash(file_path=file_path)
                if self.manifest.get(name, "") == current_hash:
                    continue

                if name in self.manifest:
                    logger.info(f"Content change detected in {name}. Re-indexing...")
                    self.store.delete_by_filename(name)
                    # The old chunks are gone, so the old hash must not survive
                    # if indexing fails further down.
                    del self.manifest[name]

                parsed_doc = self.parser.parse(file_path)
                chunked_doc = self.chunker.chunk(parsed_doc)
                embeddings = self.embedder.embed_chunk(chunks=chunked_doc)
                self.store.ingest(embch=embeddings)
                self.manifest[name] = current_hash
                logger.success(f"Indexed {name}")
        finally:
            # Persist progress even on failure; otherwise the next run would
            # re-ingest files that are already in the store.
            self._save_manifest()

In [None]:
# Build the manager from the `librery` section of the settings.
# Per the log output of this cell, construction loads the SentenceTransformer
# model twice, so expect several seconds of start-up time on CPU.
manager = LibreryManager(settings.librery)

[32m2026-01-31 18:58:33[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m24[0m - [1mcreating or getting the collection[0m
[32m2026-01-31 18:58:33[0m | [1mINFO    [0m | [36msrc.ingestion.vector_store.stores[0m:[36m__init__[0m:[36m30[0m - [1mgetting the embedder[0m
[32m2026-01-31 18:58:33[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m19[0m - [1mLoading SentenceTransformer model: all-MiniLM-L6-v2[0m
[32m2026-01-31 18:58:36[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m27[0m - [1mModel loaded: 384d on cpu[0m
[32m2026-01-31 18:58:36[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m19[0m - [1mLoading SentenceTransformer model: all-MiniLM-L6-v2[0m
[32m2026-01-31 18:58:40[0m | [1mINFO    [0m | [36msrc.ingestion.embedding.embedder[0m:[36m__init__[0m:[36m27[0m - [1mModel loaded: 384d on cp