Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion graphgen/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,5 +33,5 @@
from .searcher.web.bing_search import BingSearch
from .searcher.web.google_search import GoogleSearch
from .splitter import ChineseRecursiveTextSplitter, RecursiveCharacterSplitter
from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage
from .storage import JsonKVStorage, JsonListStorage, NetworkXStorage, RocksDBCache
from .tokenizer import Tokenizer
1 change: 1 addition & 0 deletions graphgen/models/storage/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
from .json_storage import JsonKVStorage, JsonListStorage
from .networkx_storage import NetworkXStorage
from .rocksdb_cache import RocksDBCache
43 changes: 43 additions & 0 deletions graphgen/models/storage/rocksdb_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from pathlib import Path
from typing import Any, Iterator, Optional
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

To support a more flexible type hint in __init__, please import Union from the typing module.

Suggested change
from typing import Any, Iterator, Optional
from typing import Any, Iterator, Optional, Union


# rocksdict is a lightweight C wrapper around RocksDB for Python, pylint may not recognize it
# pylint: disable=no-name-in-module
from rocksdict import Rdict


class RocksDBCache:
def __init__(self, cache_dir: str):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For improved flexibility and to align with modern Python practices using pathlib, it's better to accept both string paths and Path objects for cache_dir. Path(cache_dir) already handles both types correctly.

Suggested change
def __init__(self, cache_dir: str):
def __init__(self, cache_dir: Union[str, Path]):

self.db_path = Path(cache_dir)
self.db = Rdict(str(self.db_path))

def get(self, key: str) -> Optional[Any]:
return self.db.get(key)

def set(self, key: str, value: Any):
self.db[key] = value

def delete(self, key: str):
try:
del self.db[key]
except KeyError:
# If the key does not exist, do nothing (deletion is idempotent for caches)
pass

def close(self):
if hasattr(self, "db") and self.db is not None:
self.db.close()
self.db = None
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

For better resource management and to make RocksDBCache more robust and reusable, consider implementing the context manager protocol (__enter__ and __exit__). This ensures that the database connection is always closed, even if errors occur.

This would allow using RocksDBCache directly with a with statement, which is idiomatic Python for managing resources.

Suggested change
self.db = None
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
self.close()


def __del__(self):
# Ensure the database is closed when the object is destroyed
self.close()

def __enter__(self):
return self

def __exit__(self, exc_type, exc_val, exc_tb):
self.close()

def __iter__(self) -> Iterator[str]:
return iter(self.db.keys())
5 changes: 2 additions & 3 deletions graphgen/operators/read/parallel_file_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,15 @@
from pathlib import Path
from typing import Any, Dict, List, Set, Union

from diskcache import Cache

from graphgen.models import RocksDBCache
from graphgen.utils import logger


class ParallelFileScanner:
def __init__(
self, cache_dir: str, allowed_suffix, rescan: bool = False, max_workers: int = 4
):
self.cache = Cache(cache_dir)
self.cache = RocksDBCache(os.path.join(cache_dir, "file_paths_cache"))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Since pathlib is already used in this file, it would be more consistent to use it for path manipulation here as well, instead of os.path.join. This improves readability and maintains a consistent style.

To make this even cleaner, you could consider updating RocksDBCache.__init__ to accept pathlib.Path objects directly, which would remove the need for str() conversion here.

Suggested change
self.cache = RocksDBCache(os.path.join(cache_dir, "file_paths_cache"))
self.cache = RocksDBCache(str(Path(cache_dir) / "file_paths_cache"))

self.allowed_suffix = set(allowed_suffix) if allowed_suffix else None
self.rescan = rescan
self.max_workers = max_workers
Expand Down
4 changes: 3 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,15 @@ requests
fastapi
trafilatura
aiohttp
diskcache
socksio

leidenalg
igraph
python-louvain

# storage
rocksdict

# KG
rdflib

Expand Down