diff --git a/sqlit/domains/connections/providers/duckdb/adapter.py b/sqlit/domains/connections/providers/duckdb/adapter.py
index e70fbaa7..0bc97462 100644
--- a/sqlit/domains/connections/providers/duckdb/adapter.py
+++ b/sqlit/domains/connections/providers/duckdb/adapter.py
@@ -2,6 +2,7 @@
from __future__ import annotations
+from pathlib import Path
from typing import TYPE_CHECKING, Any
from sqlit.domains.connections.providers.adapters.base import (
@@ -13,6 +14,11 @@
TriggerInfo,
resolve_file_path,
)
+from sqlit.domains.connections.providers.duckdb.data_files import (
+ get_read_function,
+ sidecar_path_for,
+ table_name_for,
+)
if TYPE_CHECKING:
from sqlit.domains.connections.domain.config import ConnectionConfig
@@ -85,8 +91,45 @@ def connect(self, config: ConnectionConfig) -> Any:
duckdb_any: Any = duckdb
connect_args: dict[str, Any] = {}
connect_args.update(config.extra_options)
+
+ read_fn = get_read_function(file_path)
+ if read_fn is not None:
+ return self._connect_data_file(
+ duckdb_any, file_path, read_fn, connect_args
+ )
+
return duckdb_any.connect(str(file_path), **connect_args)
+ def _connect_data_file(
+ self,
+ duckdb_any: Any,
+ file_path: Path,
+ read_fn: str,
+ connect_args: dict[str, Any],
+ ) -> Any:
+ """Connect to a per-process sidecar `.duckdb` backed by a data file.
+
+ On first connect within a sqlit process the source file is loaded
+ into a real table so the user can run UPDATE/INSERT/DELETE against
+ it. Subsequent connects in the same process reuse the sidecar so
+ in-session edits persist across query Runs. The sidecar lives under
+ a PID-scoped temp dir; a new process gets a fresh load from source.
+ Writing back to the source is explicit: `COPY
TO ''`.
+ """
+ sidecar = sidecar_path_for(file_path)
+ sidecar.parent.mkdir(parents=True, exist_ok=True)
+ needs_load = not sidecar.exists()
+
+ conn = duckdb_any.connect(str(sidecar), **connect_args)
+ if needs_load:
+ table = table_name_for(file_path)
+ path_literal = str(file_path).replace("'", "''")
+ conn.execute(
+ f'CREATE TABLE "{table}" AS '
+ f"SELECT * FROM {read_fn}('{path_literal}')"
+ )
+ return conn
+
def get_databases(self, conn: Any) -> list[str]:
"""DuckDB doesn't support multiple databases - return empty list."""
return []
diff --git a/sqlit/domains/connections/providers/duckdb/data_files.py b/sqlit/domains/connections/providers/duckdb/data_files.py
new file mode 100644
index 00000000..beedc268
--- /dev/null
+++ b/sqlit/domains/connections/providers/duckdb/data_files.py
@@ -0,0 +1,106 @@
+"""Detection of data files queryable directly by DuckDB.
+
+DuckDB's `read_csv_auto`, `read_parquet`, `read_json_auto` etc. let you
+query a raw data file as if it were a table. When a sqlit DuckDB connection
+points at one of these files instead of a `.duckdb` database, the adapter:
+
+1. Picks a per-process sidecar `.duckdb` file in the OS temp dir.
+2. Loads the source file into a real TABLE in the sidecar on first connect.
+3. Lets the user CRUD the table freely; edits persist for the lifetime of
+ the sqlit process and are wiped on restart.
+4. Writing back to the source file is explicit (`COPY TO ''`).
+"""
+
+from __future__ import annotations
+
+import hashlib
+import os
+import re
+import tempfile
+from pathlib import Path
+
+
+# File extension -> DuckDB table function that can read it.
+_READ_FUNCTIONS: dict[str, str] = {
+ ".csv": "read_csv_auto",
+ ".tsv": "read_csv_auto",
+ ".parquet": "read_parquet",
+ ".pq": "read_parquet",
+ ".json": "read_json_auto",
+ ".jsonl": "read_json_auto",
+ ".ndjson": "read_json_auto",
+}
+
+# Allowed compression suffixes that wrap the data extensions above. DuckDB's
+# auto-readers transparently decompress these.
+_COMPRESSION_SUFFIXES: frozenset[str] = frozenset({".gz", ".zst", ".bz2"})
+
+
+def get_read_function(path: Path) -> str | None:
+ """Return the DuckDB table function for this file, or None if not a known
+ data file extension.
+
+ Handles compressed forms like `.csv.gz` by looking past the compression
+ suffix.
+ """
+ suffixes = [s.lower() for s in path.suffixes]
+ if not suffixes:
+ return None
+
+ last = suffixes[-1]
+ if last in _COMPRESSION_SUFFIXES:
+ if len(suffixes) >= 2:
+ return _READ_FUNCTIONS.get(suffixes[-2])
+ return None
+ return _READ_FUNCTIONS.get(last)
+
+
+def is_data_file(path: Path) -> bool:
+ """True if the file extension is one DuckDB can query directly."""
+ return get_read_function(path) is not None
+
+
+def table_name_for(path: Path) -> str:
+ """Build a SQL-safe table name from a file path basename.
+
+ Strips the data and (optional) compression extension, then sanitizes
+ non-identifier characters to underscores.
+
+ Examples:
+ sales.csv -> sales
+ sales-2024.csv -> sales_2024
+ events.json.gz -> events
+ 123-data.parquet -> _123_data
+ """
+ stem = path.name
+ # Strip compression suffix if present.
+ lower = stem.lower()
+ for comp in _COMPRESSION_SUFFIXES:
+ if lower.endswith(comp):
+ stem = stem[: -len(comp)]
+ break
+ # Strip data extension.
+ dot = stem.rfind(".")
+ if dot > 0:
+ stem = stem[:dot]
+
+ sanitized = re.sub(r"[^A-Za-z0-9_]+", "_", stem).strip("_")
+ if not sanitized:
+ sanitized = "data"
+ if sanitized[0].isdigit():
+ sanitized = "_" + sanitized
+ return sanitized.lower()
+
+
+def sidecar_path_for(source_path: Path) -> Path:
+ """Per-process scratch `.duckdb` path for a data-file source.
+
+ Each sqlit process gets its own directory under the OS temp dir. The
+ sidecar persists for the lifetime of the process so edits within a
+ sqlit session survive across query Runs. A fresh process gets a fresh
+ sidecar, so source-file changes are picked up on restart and unsaved
+ edits are wiped (the user opted into "re-load from source each time").
+ """
+ digest = hashlib.sha1(str(source_path.resolve()).encode()).hexdigest()[:16]
+ base = Path(tempfile.gettempdir()) / f"sqlit-{os.getpid()}" / "data-files"
+ return base / f"{digest}.duckdb"
diff --git a/tests/unit/test_duckdb_data_files.py b/tests/unit/test_duckdb_data_files.py
new file mode 100644
index 00000000..64e05640
--- /dev/null
+++ b/tests/unit/test_duckdb_data_files.py
@@ -0,0 +1,257 @@
+"""Tests for DuckDB direct querying of data files (CSV, Parquet, JSON, ...).
+
+Pointing a DuckDB connection at a data file loads it into a real TABLE
+inside a per-process sidecar `.duckdb`. The user can then run full CRUD
+against that table; writing back to the source file is explicit via
+`COPY TO ''`.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from pathlib import Path
+
+import pytest
+
+from sqlit.domains.connections.domain.config import ConnectionConfig
+from sqlit.domains.connections.providers.duckdb.adapter import DuckDBAdapter
+from sqlit.domains.connections.providers.duckdb.data_files import (
+ get_read_function,
+ is_data_file,
+ sidecar_path_for,
+ table_name_for,
+)
+
+
+class TestGetReadFunction:
+ """Map file extension → DuckDB table function."""
+
+ @pytest.mark.parametrize(
+ "filename, expected",
+ [
+ ("sales.csv", "read_csv_auto"),
+ ("sales.CSV", "read_csv_auto"),
+ ("data.tsv", "read_csv_auto"),
+ ("events.parquet", "read_parquet"),
+ ("events.pq", "read_parquet"),
+ ("users.json", "read_json_auto"),
+ ("events.jsonl", "read_json_auto"),
+ ("events.ndjson", "read_json_auto"),
+ ("sales.csv.gz", "read_csv_auto"),
+ ("events.json.gz", "read_json_auto"),
+ ("data.csv.zst", "read_csv_auto"),
+ ],
+ )
+ def test_recognized_extensions(self, filename: str, expected: str):
+ assert get_read_function(Path(filename)) == expected
+
+ @pytest.mark.parametrize(
+ "filename",
+ [
+ "data.duckdb",
+ "data.db",
+ "data.sqlite",
+ "data",
+ "data.unknown",
+ "archive.tar.gz",
+ ],
+ )
+ def test_unrecognized_extensions(self, filename: str):
+ assert get_read_function(Path(filename)) is None
+ assert is_data_file(Path(filename)) is False
+
+
+class TestTableNameFor:
+ @pytest.mark.parametrize(
+ "filename, expected",
+ [
+ ("sales.csv", "sales"),
+ ("Sales.CSV", "sales"),
+ ("sales-2024.csv", "sales_2024"),
+ ("events.json.gz", "events"),
+ ("123-data.parquet", "_123_data"),
+ ("weird name!.csv", "weird_name"),
+ ("/abs/path/to/orders.parquet", "orders"),
+ ],
+ )
+ def test_table_name(self, filename: str, expected: str):
+ assert table_name_for(Path(filename)) == expected
+
+ def test_empty_or_pure_punctuation_falls_back(self):
+ assert table_name_for(Path("---.csv")) == "data"
+
+
+class TestSidecarPath:
+ def test_path_is_under_per_process_dir(self, tmp_path):
+ sc = sidecar_path_for(tmp_path / "sales.csv")
+ assert sc.suffix == ".duckdb"
+ # The PID is in the path so different processes don't collide.
+ assert f"sqlit-{os.getpid()}" in sc.parts
+
+ def test_different_sources_have_different_sidecars(self, tmp_path):
+ a = sidecar_path_for(tmp_path / "a.csv")
+ b = sidecar_path_for(tmp_path / "b.csv")
+ assert a != b
+
+ def test_same_source_is_stable_within_process(self, tmp_path):
+ src = tmp_path / "sales.csv"
+ src.write_text("a,b\n1,2\n")
+ assert sidecar_path_for(src) == sidecar_path_for(src)
+
+
+class TestAdapterWithDataFile:
+ """End-to-end against a real DuckDB."""
+
+ @pytest.fixture(autouse=True)
+ def _require_duckdb(self):
+ pytest.importorskip("duckdb")
+
+ def _make_config(self, file_path: Path) -> ConnectionConfig:
+ return ConnectionConfig.from_dict({
+ "name": "test",
+ "db_type": "duckdb",
+ "file_path": str(file_path),
+ })
+
+ def _cleanup_sidecar(self, file_path: Path) -> None:
+ sc = sidecar_path_for(file_path)
+ if sc.exists():
+ sc.unlink()
+
+ def test_csv_file_loaded_as_queryable_table(self, tmp_path):
+ csv_file = tmp_path / "sales.csv"
+ csv_file.write_text("region,quantity\nnorth,3\nsouth,7\neast,2\n")
+ self._cleanup_sidecar(csv_file)
+
+ conn = DuckDBAdapter().connect(self._make_config(csv_file))
+ rows = conn.execute("SELECT region, quantity FROM sales ORDER BY region").fetchall()
+ assert rows == [("east", 2), ("north", 3), ("south", 7)]
+
+ def test_table_appears_in_schema_listing(self, tmp_path):
+ csv_file = tmp_path / "events.csv"
+ csv_file.write_text("id,name\n1,alpha\n2,beta\n")
+ self._cleanup_sidecar(csv_file)
+
+ adapter = DuckDBAdapter()
+ conn = adapter.connect(self._make_config(csv_file))
+ tables = adapter.get_tables(conn)
+ table_names = [name for _schema, name in tables]
+ assert "events" in table_names
+
+ def test_json_file_loaded_as_table(self, tmp_path):
+ json_file = tmp_path / "users.json"
+ json_file.write_text(json.dumps([
+ {"id": 1, "name": "Alice"},
+ {"id": 2, "name": "Bob"},
+ ]))
+ self._cleanup_sidecar(json_file)
+
+ conn = DuckDBAdapter().connect(self._make_config(json_file))
+ rows = conn.execute("SELECT id, name FROM users ORDER BY id").fetchall()
+ assert rows == [(1, "Alice"), (2, "Bob")]
+
+ def test_filename_with_dashes_becomes_underscored_table(self, tmp_path):
+ csv_file = tmp_path / "monthly-sales.csv"
+ csv_file.write_text("month,total\n2024-01,100\n2024-02,200\n")
+ self._cleanup_sidecar(csv_file)
+
+ conn = DuckDBAdapter().connect(self._make_config(csv_file))
+ rows = conn.execute("SELECT total FROM monthly_sales ORDER BY month").fetchall()
+ assert rows == [(100,), (200,)]
+
+ def test_duckdb_file_unchanged_behavior(self, tmp_path):
+ """A `.duckdb` file path should still produce a normal DuckDB
+ connection with no auto-loaded data-file table."""
+ import duckdb
+
+ db_file = tmp_path / "scratch.duckdb"
+ duckdb.connect(str(db_file)).close()
+
+ conn = DuckDBAdapter().connect(self._make_config(db_file))
+ rows = conn.execute(
+ "SELECT count(*) FROM information_schema.tables "
+ "WHERE table_name = 'scratch' AND table_type = 'BASE TABLE'"
+ ).fetchall()
+ assert rows == [(0,)]
+
+
+class TestCRUDAgainstDataFile:
+ """CRUD (UPDATE/INSERT/DELETE) must work against the loaded table, and
+ edits must persist across consecutive connections in the same process."""
+
+ @pytest.fixture(autouse=True)
+ def _require_duckdb(self):
+ pytest.importorskip("duckdb")
+
+ def _make_config(self, file_path: Path) -> ConnectionConfig:
+ return ConnectionConfig.from_dict({
+ "name": "test",
+ "db_type": "duckdb",
+ "file_path": str(file_path),
+ })
+
+ def _cleanup_sidecar(self, file_path: Path) -> None:
+ sc = sidecar_path_for(file_path)
+ if sc.exists():
+ sc.unlink()
+
+ def test_updates_persist_across_reconnects_in_same_process(self, tmp_path):
+ csv_file = tmp_path / "sales.csv"
+ csv_file.write_text("region,amount\nnorth,100\nsouth,200\n")
+ self._cleanup_sidecar(csv_file)
+
+ adapter = DuckDBAdapter()
+ cfg = self._make_config(csv_file)
+
+ # First connect loads the file; we modify the table.
+ conn1 = adapter.connect(cfg)
+ conn1.execute("UPDATE sales SET amount = amount * 2 WHERE region = 'north'")
+ conn1.execute("INSERT INTO sales VALUES ('west', 999)")
+ conn1.close()
+
+ # Second connect to the same source must see the modifications,
+ # because the sidecar persists for this process's lifetime.
+ conn2 = adapter.connect(cfg)
+ rows = conn2.execute("SELECT region, amount FROM sales ORDER BY region").fetchall()
+ assert rows == [("north", 200), ("south", 200), ("west", 999)]
+
+ def test_fresh_sidecar_reloads_from_source(self, tmp_path):
+ """Simulates a new sqlit process by deleting the sidecar — the
+ adapter should re-read the source file and the user's previous
+ edits should be gone."""
+ csv_file = tmp_path / "sales.csv"
+ csv_file.write_text("region,amount\nnorth,100\n")
+ self._cleanup_sidecar(csv_file)
+
+ adapter = DuckDBAdapter()
+ cfg = self._make_config(csv_file)
+
+ conn = adapter.connect(cfg)
+ conn.execute("UPDATE sales SET amount = 9999")
+ conn.close()
+
+ # Simulate process restart.
+ sidecar = sidecar_path_for(csv_file)
+ sidecar.unlink()
+
+ conn = adapter.connect(cfg)
+ rows = conn.execute("SELECT region, amount FROM sales").fetchall()
+ assert rows == [("north", 100)]
+
+ def test_explicit_copy_writes_back_to_source(self, tmp_path):
+ csv_file = tmp_path / "sales.csv"
+ csv_file.write_text("region,amount\nnorth,100\nsouth,200\n")
+ self._cleanup_sidecar(csv_file)
+
+ conn = DuckDBAdapter().connect(self._make_config(csv_file))
+ conn.execute("UPDATE sales SET amount = amount + 1")
+ # Path is fine to interpolate here since it came from tmp_path.
+ conn.execute(
+ f"COPY sales TO '{csv_file}' (FORMAT CSV, HEADER)"
+ )
+
+ # Source file on disk now reflects the modification.
+ contents = csv_file.read_text().strip().splitlines()
+ assert contents[0] == "region,amount"
+ assert sorted(contents[1:]) == ["north,101", "south,201"]