Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 18 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Icebug is a standardized graph format designed for efficient graph data intercha
| **icebug-disk** | Parquet files | Object storage, persistence |
| **icebug-memory** | Apache Arrow tables | In-process, zero-copy access |

Both represent graphs in [CSR (Compressed Sparse Row)](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) format, which enables fast adjacency-list traversal.
Both represent *directed* graphs in [CSR (Compressed Sparse Row)](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) format, which enables fast adjacency-list traversal.

---

Expand Down Expand Up @@ -74,13 +74,20 @@ Table: follows (FROM user TO user)
Convert Arrow tables directly into an in-memory CSR graph

```python
from icebug_format import IcebugMemGraph, convert_arrow_tables_to_csr
from icebug_format import IcebugMemGraph

graph: IcebugMemGraph = convert_arrow_tables_to_csr(
# Directed heterogeneous graph (different node types on each end)
graph: IcebugMemGraph = IcebugMemGraph.from_arrow_tables(
from_node_arrow_table=users, # pa.Table, first column is the primary key
to_node_arrow_table=cities, # pa.Table, first column is the primary key
rel_arrow_table=livesin, # pa.Table with 'source' and 'target' columns
directed=True,
to_node_arrow_table=cities, # pa.Table, first column is the primary key
)

# Directed or undirected homogeneous graph (same node type on both ends)
graph: IcebugMemGraph = IcebugMemGraph.from_arrow_tables(
from_node_arrow_table=users, # pa.Table, first column is the primary key
rel_arrow_table=follows, # pa.Table with 'source' and 'target' columns
undirected=True, # undirected=True for undirected (to_node_arrow_table must be omitted)
)

# Node tables are passed through unchanged
Expand All @@ -101,7 +108,12 @@ The `rel_arrow_table` source and target columns are resolved by name in priority

Any remaining columns are preserved as edge properties in `graph.indices`.

Set `directed=False` to automatically add reverse edges (undirected graph).
Set `undirected=True` to automatically add reverse edges (undirected graph). For undirected graphs, `to_node_arrow_table` must be omitted; the same node table is used for both sides of every edge.

## Caveats

- icebug-format will always output a directed graph
- If you want an undirected graph to be converted, pass undirected=True to the CLI or Python API, and the reverse edges will be added automatically. But do note that undirected graphs are supported for rel tables with same node type on both ends only

---

Expand Down
4 changes: 2 additions & 2 deletions icebug_format/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from icebug_format.cli import main
from icebug_format.memory import IcebugMemGraph, convert_arrow_tables_to_csr
from icebug_format.memory import IcebugMemGraph

__all__ = ["main", "IcebugMemGraph", "convert_arrow_tables_to_csr"]
__all__ = ["main", "IcebugMemGraph"]
46 changes: 31 additions & 15 deletions icebug_format/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ def create_csr_graph_to_duckdb(
source_db_path: str,
output_db_path: str,
limit_rels: int | None = None,
directed: bool = False,
undirected: bool = False,
csr_table_name: str = "csr_graph",
node_table: str | None = None,
edge_table: str | None = None,
Expand All @@ -400,7 +400,7 @@ def create_csr_graph_to_duckdb(
source_db_path: Path to source DuckDB with edges table
output_db_path: Path to output DuckDB for CSR data
limit_rels: Limit number of relationships for testing
directed: Whether graph is directed
undirected: Whether graph is undirected
csr_table_name: Name of table to store CSR data
node_table: Specific node table to use (default: auto-discover)
edge_table: Specific edge table to use (default: auto-discover)
Expand Down Expand Up @@ -508,6 +508,17 @@ def create_csr_graph_to_duckdb(
src_csr_table = f"{csr_table_name}_{src_table}"
dst_csr_table = f"{csr_table_name}_{dst_table}"

# For undirected graphs, from and to node tables must be the same.
if undirected:
if src_table != dst_table:
raise ValueError(
f"Undirected graphs require the same node table on both sides of an "
f"edge, but edge table '{et}' connects '{src_table}' -> '{dst_table}'. "
f"Use --undirected for homogeneous edge tables."
)
dst_pk = src_pk
dst_csr_table = src_csr_table

# Inline id→csr_index mapping as CTEs — no separate mapping tables needed
map_cte = f"""
src_map AS (
Expand Down Expand Up @@ -536,21 +547,24 @@ def create_csr_graph_to_duckdb(
if edge_cols:
reverse_cols += ", " + ", ".join(edge_cols)

# Self-loops are not filtered from directed graphs.
# For undirected graphs, the reverse UNION excludes self-loops so
# each self-loop appears exactly once (forward only).
join_clause = f"""
FROM orig.{et} e
JOIN src_map m1 ON e.source = m1.original_node_id
JOIN dst_map m2 ON e.target = m2.original_node_id
WHERE e.source != e.target"""
JOIN dst_map m2 ON e.target = m2.original_node_id"""

if limit_rels:
limit_per_table = limit_rels // len(edge_tables)
if directed:
if not undirected:
rel_query = f"""
WITH {map_cte}
SELECT {select_cols} {join_clause}
LIMIT {limit_per_table}
"""
else:
# Reverse self-loops using CSR indices (already mapped)
rel_query = f"""
WITH {map_cte},
limited AS (
Expand All @@ -560,9 +574,10 @@ def create_csr_graph_to_duckdb(
SELECT * FROM limited
UNION ALL
SELECT {reverse_cols} FROM limited
WHERE csr_source != csr_target
"""
else:
if directed:
if not undirected:
rel_query = f"""
WITH {map_cte}
SELECT {select_cols} {join_clause}
Expand All @@ -573,6 +588,7 @@ def create_csr_graph_to_duckdb(
SELECT {select_cols} {join_clause}
UNION ALL
SELECT {reverse_select_cols} {join_clause}
WHERE e.source != e.target
"""

con.execute(f"CREATE TABLE relations_{edge_name} AS {rel_query};")
Expand Down Expand Up @@ -609,9 +625,9 @@ def create_csr_graph_to_duckdb(
# Recreate with leading zero
con.execute(f"""
CREATE OR REPLACE TABLE {indptr_table} AS
SELECT 0::BIGINT AS ptr
SELECT 0::UBIGINT AS ptr
UNION ALL
SELECT ptr::int64 FROM {indptr_table}
SELECT ptr::UBIGINT FROM {indptr_table}
ORDER BY ptr;
""")

Expand All @@ -623,7 +639,7 @@ def create_csr_graph_to_duckdb(
indices_table = f"{csr_table_name}_indices_{edge_name}"
con.execute(f"""
CREATE TABLE {indices_table} AS
SELECT csr_target AS target{', ' + ', '.join(edge_cols) if edge_cols else ''}
SELECT csr_target::UBIGINT AS target{', ' + ', '.join(edge_cols) if edge_cols else ''}
FROM relations_{edge_name}
ORDER BY csr_source, csr_target;
""")
Expand Down Expand Up @@ -714,9 +730,9 @@ def main():
help="Number of edges to use in test mode (default: 50000)",
)
parser.add_argument(
"--directed",
"--undirected",
action="store_true",
help="Treat graph as directed (default: undirected)",
help="Treat graph as undirected (default: directed)",
)
parser.add_argument(
"--storage",
Expand Down Expand Up @@ -757,7 +773,7 @@ def main():
print(f"GraphAr directory: {args.graphar}")
print(f"CSR output database: {args.output_db}")
print(f"CSR table prefix: {args.csr_table}")
print(f"Directed: {args.directed}")
print(f"Undirected: {args.undirected}")
print(f"DuckDB memory limit: {args.memory_limit}")

try:
Expand All @@ -772,7 +788,7 @@ def main():
graphar_dir=args.graphar,
output_db_path=args.output_db,
csr_table_name=args.csr_table,
directed=args.directed,
undirected=args.undirected,
memory_limit=args.memory_limit,
)

Expand All @@ -796,7 +812,7 @@ def main():
print(f"Source database: {source_db_path}")
print(f"CSR output database: {args.output_db}")
print(f"CSR table prefix: {args.csr_table}")
print(f"Directed: {args.directed}")
print(f"Undirected: {args.undirected}")
print(f"DuckDB memory limit: {args.memory_limit}")

# Compute default storage path from output_db if not specified
Expand All @@ -816,7 +832,7 @@ def main():
source_db_path=source_db_path,
output_db_path=args.output_db,
limit_rels=test_limit,
directed=args.directed,
undirected=args.undirected,
csr_table_name=args.csr_table,
node_table=args.node_table,
edge_table=args.edge_table,
Expand Down
22 changes: 11 additions & 11 deletions icebug_format/graphar.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def convert_graphar_to_graph_std(
graphar_dir: str,
output_db_path: str,
csr_table_name: str = "graph",
directed: bool = False,
undirected: bool = False,
memory_limit: str = "80%",
) -> None:
"""
Expand All @@ -167,7 +167,7 @@ def convert_graphar_to_graph_std(
graphar_dir: Path to directory with GraphAr data
output_db_path: Path to output DuckDB database
csr_table_name: Name prefix for CSR tables
directed: Whether graph is directed
undirected: Whether graph is undirected
memory_limit: DuckDB memory limit setting
"""
print("\n=== Converting GraphAr to Graph-Std Format ===")
Expand Down Expand Up @@ -398,22 +398,22 @@ def convert_graphar_to_graph_std(
# Add leading zero
temp_table = f"{indptr_table}_temp"
con.execute(f"DROP TABLE IF EXISTS {temp_table}")
con.execute(f"CREATE TABLE {temp_table} (ptr BIGINT)")
con.execute(f"INSERT INTO {temp_table} VALUES (CAST(0 AS BIGINT))")
con.execute(f"INSERT INTO {temp_table} SELECT ptr FROM {indptr_table}")
con.execute(f"CREATE TABLE {temp_table} (ptr UBIGINT)")
con.execute(f"INSERT INTO {temp_table} VALUES (CAST(0 AS UBIGINT))")
con.execute(f"INSERT INTO {temp_table} SELECT CAST(ptr AS UBIGINT) FROM {indptr_table}")
con.execute(f"DROP TABLE {indptr_table}")
con.execute(f"ALTER TABLE {temp_table} RENAME TO {indptr_table}")

# Build CSR indices
indices_table = f"{csr_table_name}_indices_{edge_type}"

col_defs = "target BIGINT"
col_defs = "target UBIGINT"
for prop in prop_cols:
col_defs += f", {prop} BIGINT"

con.execute(f"""
CREATE TABLE {indices_table} AS
SELECT csr_target AS target{', ' + ', '.join(prop_cols) if prop_cols else ''}
SELECT CAST(csr_target AS UBIGINT) AS target{', ' + ', '.join(prop_cols) if prop_cols else ''}
FROM {rel_table_name}
ORDER BY csr_source, csr_target
""")
Expand Down Expand Up @@ -569,9 +569,9 @@ def main():
help="Table name prefix for CSR data (default: graph)",
)
parser.add_argument(
"--directed",
"--undirected",
action="store_true",
help="Treat graph as directed (default: undirected)",
help="Treat graph as undirected (default: directed)",
)

args = parser.parse_args()
Expand All @@ -580,13 +580,13 @@ def main():
print(f"GraphAr directory: {args.graphar_dir}")
print(f"CSR output database: {args.output_db}")
print(f"CSR table prefix: {args.csr_table}")
print(f"Directed: {args.directed}")
print(f"Undirected: {args.undirected}")

convert_graphar_to_graph_std(
graphar_dir=args.graphar_dir,
output_db_path=args.output_db,
csr_table_name=args.csr_table,
directed=args.directed,
undirected=args.undirected,
)

print("\n=== Conversion Completed Successfully! ===")
Expand Down
Loading
Loading