diff --git a/sqlit/domains/connections/providers/duckdb/adapter.py b/sqlit/domains/connections/providers/duckdb/adapter.py index e70fbaa7..0bc97462 100644 --- a/sqlit/domains/connections/providers/duckdb/adapter.py +++ b/sqlit/domains/connections/providers/duckdb/adapter.py @@ -2,6 +2,7 @@ from __future__ import annotations +from pathlib import Path from typing import TYPE_CHECKING, Any from sqlit.domains.connections.providers.adapters.base import ( @@ -13,6 +14,11 @@ TriggerInfo, resolve_file_path, ) +from sqlit.domains.connections.providers.duckdb.data_files import ( + get_read_function, + sidecar_path_for, + table_name_for, +) if TYPE_CHECKING: from sqlit.domains.connections.domain.config import ConnectionConfig @@ -85,8 +91,45 @@ def connect(self, config: ConnectionConfig) -> Any: duckdb_any: Any = duckdb connect_args: dict[str, Any] = {} connect_args.update(config.extra_options) + + read_fn = get_read_function(file_path) + if read_fn is not None: + return self._connect_data_file( + duckdb_any, file_path, read_fn, connect_args + ) + return duckdb_any.connect(str(file_path), **connect_args) + def _connect_data_file( + self, + duckdb_any: Any, + file_path: Path, + read_fn: str, + connect_args: dict[str, Any], + ) -> Any: + """Connect to a per-process sidecar `.duckdb` backed by a data file. + + On first connect within a sqlit process the source file is loaded + into a real table so the user can run UPDATE/INSERT/DELETE against + it. Subsequent connects in the same process reuse the sidecar so + in-session edits persist across query Runs. The sidecar lives under + a PID-scoped temp dir; a new process gets a fresh load from source. + Writing back to the source is explicit: `COPY TO ''`. + """ + sidecar = sidecar_path_for(file_path) + sidecar.parent.mkdir(parents=True, exist_ok=True) + needs_load = not sidecar.exists() + + conn = duckdb_any.connect(str(sidecar), **connect_args) + if needs_load: + table = table_name_for(file_path) + path_literal = str(file_path).replace("'", "''") + conn.execute( + f'CREATE TABLE "{table}" AS ' + f"SELECT * FROM {read_fn}('{path_literal}')" + ) + return conn + def get_databases(self, conn: Any) -> list[str]: """DuckDB doesn't support multiple databases - return empty list.""" return [] diff --git a/sqlit/domains/connections/providers/duckdb/data_files.py b/sqlit/domains/connections/providers/duckdb/data_files.py new file mode 100644 index 00000000..beedc268 --- /dev/null +++ b/sqlit/domains/connections/providers/duckdb/data_files.py @@ -0,0 +1,106 @@ +"""Detection of data files queryable directly by DuckDB. + +DuckDB's `read_csv_auto`, `read_parquet`, `read_json_auto` etc. let you +query a raw data file as if it were a table. When a sqlit DuckDB connection +points at one of these files instead of a `.duckdb` database, the adapter: + +1. Picks a per-process sidecar `.duckdb` file in the OS temp dir. +2. Loads the source file into a real TABLE in the sidecar on first connect. +3. Lets the user CRUD the table freely; edits persist for the lifetime of + the sqlit process and are wiped on restart. +4. Writing back to the source file is explicit (`COPY
TO ''`). +""" + +from __future__ import annotations + +import hashlib +import os +import re +import tempfile +from pathlib import Path + + +# File extension -> DuckDB table function that can read it. +_READ_FUNCTIONS: dict[str, str] = { + ".csv": "read_csv_auto", + ".tsv": "read_csv_auto", + ".parquet": "read_parquet", + ".pq": "read_parquet", + ".json": "read_json_auto", + ".jsonl": "read_json_auto", + ".ndjson": "read_json_auto", +} + +# Allowed compression suffixes that wrap the data extensions above. DuckDB's +# auto-readers transparently decompress these. +_COMPRESSION_SUFFIXES: frozenset[str] = frozenset({".gz", ".zst", ".bz2"}) + + +def get_read_function(path: Path) -> str | None: + """Return the DuckDB table function for this file, or None if not a known + data file extension. + + Handles compressed forms like `.csv.gz` by looking past the compression + suffix. + """ + suffixes = [s.lower() for s in path.suffixes] + if not suffixes: + return None + + last = suffixes[-1] + if last in _COMPRESSION_SUFFIXES: + if len(suffixes) >= 2: + return _READ_FUNCTIONS.get(suffixes[-2]) + return None + return _READ_FUNCTIONS.get(last) + + +def is_data_file(path: Path) -> bool: + """True if the file extension is one DuckDB can query directly.""" + return get_read_function(path) is not None + + +def table_name_for(path: Path) -> str: + """Build a SQL-safe table name from a file path basename. + + Strips the data and (optional) compression extension, then sanitizes + non-identifier characters to underscores. + + Examples: + sales.csv -> sales + sales-2024.csv -> sales_2024 + events.json.gz -> events + 123-data.parquet -> _123_data + """ + stem = path.name + # Strip compression suffix if present. + lower = stem.lower() + for comp in _COMPRESSION_SUFFIXES: + if lower.endswith(comp): + stem = stem[: -len(comp)] + break + # Strip data extension. + dot = stem.rfind(".") + if dot > 0: + stem = stem[:dot] + + sanitized = re.sub(r"[^A-Za-z0-9_]+", "_", stem).strip("_") + if not sanitized: + sanitized = "data" + if sanitized[0].isdigit(): + sanitized = "_" + sanitized + return sanitized.lower() + + +def sidecar_path_for(source_path: Path) -> Path: + """Per-process scratch `.duckdb` path for a data-file source. + + Each sqlit process gets its own directory under the OS temp dir. The + sidecar persists for the lifetime of the process so edits within a + sqlit session survive across query Runs. A fresh process gets a fresh + sidecar, so source-file changes are picked up on restart and unsaved + edits are wiped (the user opted into "re-load from source each time"). + """ + digest = hashlib.sha1(str(source_path.resolve()).encode()).hexdigest()[:16] + base = Path(tempfile.gettempdir()) / f"sqlit-{os.getpid()}" / "data-files" + return base / f"{digest}.duckdb" diff --git a/tests/unit/test_duckdb_data_files.py b/tests/unit/test_duckdb_data_files.py new file mode 100644 index 00000000..64e05640 --- /dev/null +++ b/tests/unit/test_duckdb_data_files.py @@ -0,0 +1,257 @@ +"""Tests for DuckDB direct querying of data files (CSV, Parquet, JSON, ...). + +Pointing a DuckDB connection at a data file loads it into a real TABLE +inside a per-process sidecar `.duckdb`. The user can then run full CRUD +against that table; writing back to the source file is explicit via +`COPY
TO ''`. +""" + +from __future__ import annotations + +import json +import os +from pathlib import Path + +import pytest + +from sqlit.domains.connections.domain.config import ConnectionConfig +from sqlit.domains.connections.providers.duckdb.adapter import DuckDBAdapter +from sqlit.domains.connections.providers.duckdb.data_files import ( + get_read_function, + is_data_file, + sidecar_path_for, + table_name_for, +) + + +class TestGetReadFunction: + """Map file extension → DuckDB table function.""" + + @pytest.mark.parametrize( + "filename, expected", + [ + ("sales.csv", "read_csv_auto"), + ("sales.CSV", "read_csv_auto"), + ("data.tsv", "read_csv_auto"), + ("events.parquet", "read_parquet"), + ("events.pq", "read_parquet"), + ("users.json", "read_json_auto"), + ("events.jsonl", "read_json_auto"), + ("events.ndjson", "read_json_auto"), + ("sales.csv.gz", "read_csv_auto"), + ("events.json.gz", "read_json_auto"), + ("data.csv.zst", "read_csv_auto"), + ], + ) + def test_recognized_extensions(self, filename: str, expected: str): + assert get_read_function(Path(filename)) == expected + + @pytest.mark.parametrize( + "filename", + [ + "data.duckdb", + "data.db", + "data.sqlite", + "data", + "data.unknown", + "archive.tar.gz", + ], + ) + def test_unrecognized_extensions(self, filename: str): + assert get_read_function(Path(filename)) is None + assert is_data_file(Path(filename)) is False + + +class TestTableNameFor: + @pytest.mark.parametrize( + "filename, expected", + [ + ("sales.csv", "sales"), + ("Sales.CSV", "sales"), + ("sales-2024.csv", "sales_2024"), + ("events.json.gz", "events"), + ("123-data.parquet", "_123_data"), + ("weird name!.csv", "weird_name"), + ("/abs/path/to/orders.parquet", "orders"), + ], + ) + def test_table_name(self, filename: str, expected: str): + assert table_name_for(Path(filename)) == expected + + def test_empty_or_pure_punctuation_falls_back(self): + assert table_name_for(Path("---.csv")) == "data" + + +class TestSidecarPath: + def test_path_is_under_per_process_dir(self, tmp_path): + sc = sidecar_path_for(tmp_path / "sales.csv") + assert sc.suffix == ".duckdb" + # The PID is in the path so different processes don't collide. + assert f"sqlit-{os.getpid()}" in sc.parts + + def test_different_sources_have_different_sidecars(self, tmp_path): + a = sidecar_path_for(tmp_path / "a.csv") + b = sidecar_path_for(tmp_path / "b.csv") + assert a != b + + def test_same_source_is_stable_within_process(self, tmp_path): + src = tmp_path / "sales.csv" + src.write_text("a,b\n1,2\n") + assert sidecar_path_for(src) == sidecar_path_for(src) + + +class TestAdapterWithDataFile: + """End-to-end against a real DuckDB.""" + + @pytest.fixture(autouse=True) + def _require_duckdb(self): + pytest.importorskip("duckdb") + + def _make_config(self, file_path: Path) -> ConnectionConfig: + return ConnectionConfig.from_dict({ + "name": "test", + "db_type": "duckdb", + "file_path": str(file_path), + }) + + def _cleanup_sidecar(self, file_path: Path) -> None: + sc = sidecar_path_for(file_path) + if sc.exists(): + sc.unlink() + + def test_csv_file_loaded_as_queryable_table(self, tmp_path): + csv_file = tmp_path / "sales.csv" + csv_file.write_text("region,quantity\nnorth,3\nsouth,7\neast,2\n") + self._cleanup_sidecar(csv_file) + + conn = DuckDBAdapter().connect(self._make_config(csv_file)) + rows = conn.execute("SELECT region, quantity FROM sales ORDER BY region").fetchall() + assert rows == [("east", 2), ("north", 3), ("south", 7)] + + def test_table_appears_in_schema_listing(self, tmp_path): + csv_file = tmp_path / "events.csv" + csv_file.write_text("id,name\n1,alpha\n2,beta\n") + self._cleanup_sidecar(csv_file) + + adapter = DuckDBAdapter() + conn = adapter.connect(self._make_config(csv_file)) + tables = adapter.get_tables(conn) + table_names = [name for _schema, name in tables] + assert "events" in table_names + + def test_json_file_loaded_as_table(self, tmp_path): + json_file = tmp_path / "users.json" + json_file.write_text(json.dumps([ + {"id": 1, "name": "Alice"}, + {"id": 2, "name": "Bob"}, + ])) + self._cleanup_sidecar(json_file) + + conn = DuckDBAdapter().connect(self._make_config(json_file)) + rows = conn.execute("SELECT id, name FROM users ORDER BY id").fetchall() + assert rows == [(1, "Alice"), (2, "Bob")] + + def test_filename_with_dashes_becomes_underscored_table(self, tmp_path): + csv_file = tmp_path / "monthly-sales.csv" + csv_file.write_text("month,total\n2024-01,100\n2024-02,200\n") + self._cleanup_sidecar(csv_file) + + conn = DuckDBAdapter().connect(self._make_config(csv_file)) + rows = conn.execute("SELECT total FROM monthly_sales ORDER BY month").fetchall() + assert rows == [(100,), (200,)] + + def test_duckdb_file_unchanged_behavior(self, tmp_path): + """A `.duckdb` file path should still produce a normal DuckDB + connection with no auto-loaded data-file table.""" + import duckdb + + db_file = tmp_path / "scratch.duckdb" + duckdb.connect(str(db_file)).close() + + conn = DuckDBAdapter().connect(self._make_config(db_file)) + rows = conn.execute( + "SELECT count(*) FROM information_schema.tables " + "WHERE table_name = 'scratch' AND table_type = 'BASE TABLE'" + ).fetchall() + assert rows == [(0,)] + + +class TestCRUDAgainstDataFile: + """CRUD (UPDATE/INSERT/DELETE) must work against the loaded table, and + edits must persist across consecutive connections in the same process.""" + + @pytest.fixture(autouse=True) + def _require_duckdb(self): + pytest.importorskip("duckdb") + + def _make_config(self, file_path: Path) -> ConnectionConfig: + return ConnectionConfig.from_dict({ + "name": "test", + "db_type": "duckdb", + "file_path": str(file_path), + }) + + def _cleanup_sidecar(self, file_path: Path) -> None: + sc = sidecar_path_for(file_path) + if sc.exists(): + sc.unlink() + + def test_updates_persist_across_reconnects_in_same_process(self, tmp_path): + csv_file = tmp_path / "sales.csv" + csv_file.write_text("region,amount\nnorth,100\nsouth,200\n") + self._cleanup_sidecar(csv_file) + + adapter = DuckDBAdapter() + cfg = self._make_config(csv_file) + + # First connect loads the file; we modify the table. + conn1 = adapter.connect(cfg) + conn1.execute("UPDATE sales SET amount = amount * 2 WHERE region = 'north'") + conn1.execute("INSERT INTO sales VALUES ('west', 999)") + conn1.close() + + # Second connect to the same source must see the modifications, + # because the sidecar persists for this process's lifetime. + conn2 = adapter.connect(cfg) + rows = conn2.execute("SELECT region, amount FROM sales ORDER BY region").fetchall() + assert rows == [("north", 200), ("south", 200), ("west", 999)] + + def test_fresh_sidecar_reloads_from_source(self, tmp_path): + """Simulates a new sqlit process by deleting the sidecar — the + adapter should re-read the source file and the user's previous + edits should be gone.""" + csv_file = tmp_path / "sales.csv" + csv_file.write_text("region,amount\nnorth,100\n") + self._cleanup_sidecar(csv_file) + + adapter = DuckDBAdapter() + cfg = self._make_config(csv_file) + + conn = adapter.connect(cfg) + conn.execute("UPDATE sales SET amount = 9999") + conn.close() + + # Simulate process restart. + sidecar = sidecar_path_for(csv_file) + sidecar.unlink() + + conn = adapter.connect(cfg) + rows = conn.execute("SELECT region, amount FROM sales").fetchall() + assert rows == [("north", 100)] + + def test_explicit_copy_writes_back_to_source(self, tmp_path): + csv_file = tmp_path / "sales.csv" + csv_file.write_text("region,amount\nnorth,100\nsouth,200\n") + self._cleanup_sidecar(csv_file) + + conn = DuckDBAdapter().connect(self._make_config(csv_file)) + conn.execute("UPDATE sales SET amount = amount + 1") + # Path is fine to interpolate here since it came from tmp_path. + conn.execute( + f"COPY sales TO '{csv_file}' (FORMAT CSV, HEADER)" + ) + + # Source file on disk now reflects the modification. + contents = csv_file.read_text().strip().splitlines() + assert contents[0] == "region,amount" + assert sorted(contents[1:]) == ["north,101", "south,201"]