From d7bbfc0a3f8348d6253843ed62bfed8efc8a3961 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Tue, 15 Jul 2025 13:59:29 +0200 Subject: [PATCH 01/36] Introduce file diff table Expand diff json column into separate table --- server/mergin/sync/files.py | 4 +- server/mergin/sync/models.py | 181 ++++++++++++++---- server/mergin/sync/public_api_controller.py | 1 + server/mergin/sync/tasks.py | 2 +- server/mergin/sync/utils.py | 5 +- .../mergin/tests/test_project_controller.py | 12 +- server/mergin/tests/test_workspace.py | 6 +- server/mergin/tests/utils.py | 10 +- 8 files changed, 174 insertions(+), 47 deletions(-) diff --git a/server/mergin/sync/files.py b/server/mergin/sync/files.py index 12b30afe..5015e626 100644 --- a/server/mergin/sync/files.py +++ b/server/mergin/sync/files.py @@ -116,7 +116,9 @@ def create_obj(self, data, **kwargs): class ProjectFileSchema(FileSchema): mtime = DateTimeWithZ() - diff = fields.Nested(FileSchema()) + diff = fields.Nested( + FileSchema(), + ) @post_dump def patch_field(self, data, **kwargs): diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 3854e4d2..8c859994 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -164,14 +164,18 @@ def files(self) -> List[ProjectFile]: SELECT fp.path, fh.size, - fh.diff, fh.location, fh.checksum, - pv.created AS mtime + pv.created AS mtime, + fd.path as diff_path, + fd.size as diff_size, + fd.checksum as diff_checksum, + fd.location as diff_location FROM files_ids LEFT OUTER JOIN file_history fh ON fh.id = files_ids.fh_id LEFT OUTER JOIN project_file_path fp ON fp.id = fh.file_path_id - LEFT OUTER JOIN project_version pv ON pv.id = fh.version_id; + LEFT OUTER JOIN project_version pv ON pv.id = fh.version_id + LEFT OUTER JOIN file_diff fd ON fd.file_path_id = fh.file_path_id AND fd.version = fh.project_version_name and fd.rank = 0; """ params = {"project_id": self.id} files = [ @@ -181,7 +185,16 @@ def files(self) -> List[ProjectFile]: checksum=row.checksum, location=row.location, mtime=row.mtime, - diff=File(**row.diff) if row.diff else None, + diff=( + File( + path=row.diff_path, + size=row.diff_size, + checksum=row.diff_checksum, + location=row.diff_location, + ) + if row.diff_path + else None + ), ) for row in db.session.execute(query, params).fetchall() ] @@ -436,7 +449,6 @@ class FileHistory(db.Model): location = db.Column(db.String) size = db.Column(db.BigInteger, nullable=False) checksum = db.Column(db.String, nullable=False) - diff = db.Column(JSONB) change = db.Column( ENUM( *PushChangeType.values(), @@ -470,17 +482,6 @@ class FileHistory(db.Model): file_path_id, project_version_name.desc(), ), - db.CheckConstraint( - text( - """ - CASE - WHEN (change = 'update_diff') THEN diff IS NOT NULL - ELSE diff IS NULL - END - """ - ), - name="changes_with_diff", - ), ) def __init__( @@ -491,22 +492,55 @@ def __init__( location: str, change: PushChangeType, diff: dict = None, + version_name: int = None, ): self.file = file self.size = size self.checksum = checksum self.location = location - self.diff = diff if diff is not None else null() self.change = change.value + self.project_version_name = version_name + + if diff is not None: + basefile = FileHistory.get_basefile(file.id, version_name) + diff_file = FileDiff( + basefile, + diff.get("path"), + diff.get("size"), + diff.get("checksum"), + rank=0, + version=version_name, + ) + db.session.add(diff_file) @property def path(self) -> str: return self.file.path + @property + def diff(self) -> Optional[FileDiff]: + """Diff 
file pushed with UPDATE_DIFF change type. + + In FileDiff table it is defined as diff related to file, saved for the same project version with rank 0 (elementar diff) + """ + if self.change != PushChangeType.UPDATE_DIFF.value: + return + + return FileDiff.query.filter_by( + file_path_id=self.file_path_id, version=self.project_version_name, rank=0 + ).first() + @property def diff_file(self) -> Optional[File]: - if self.diff: - return File(**self.diff) + if not self.diff: + return + + return File( + path=self.diff.path, + size=self.diff.size, + checksum=self.diff.checksum, + location=self.diff.location, + ) @property def mtime(self) -> datetime: @@ -518,7 +552,7 @@ def abs_path(self) -> str: @property def expiration(self) -> Optional[datetime]: - if not self.diff: + if not self.diff_file: return if os.path.exists(self.abs_path): @@ -564,11 +598,7 @@ def changes( break # if we are interested only in 'diffable' history (not broken with forced update) - if ( - diffable - and item.change == PushChangeType.UPDATE.value - and not item.diff - ): + if diffable and item.change == PushChangeType.UPDATE.value: break return history @@ -615,7 +645,7 @@ def diffs_chain( if history: first_change = history[-1] # we have either full history of changes or v_x = v_x+n => no basefile in way, it is 'diffable' from the end - if first_change.diff: + if first_change.change == PushChangeType.UPDATE_DIFF.value: # omit diff for target version as it would lead to previous version if reconstructed backward diffs = [ value.diff_file @@ -665,6 +695,75 @@ def diffs_chain( return basefile, diffs + @classmethod + def get_basefile(cls, file_path_id: int, version: int) -> Optional[FileHistory]: + """Get basefile (start of file diffable history) for diff file change at some version""" + return ( + FileHistory.query.filter_by(file_path_id=file_path_id) + .filter( + FileHistory.project_version_name < version, + FileHistory.change.in_( + [PushChangeType.CREATE.value, PushChangeType.UPDATE.value] + ), + ) + .order_by(desc(FileHistory.project_version_name)) + .first() + ) + + +class FileDiff(db.Model): + """File diffs related to versioned files, also contain higher order (rank) merged diffs""" + + id = db.Column(db.BigInteger, primary_key=True, autoincrement=True) + file_path_id = db.Column( + db.BigInteger, + db.ForeignKey("project_file_path.id", ondelete="CASCADE"), + nullable=False, + ) + # reference to actual full gpkg file + basefile_id = db.Column( + db.BigInteger, + db.ForeignKey("file_history.id", ondelete="CASCADE"), + index=True, + nullable=False, + ) + path = db.Column(db.String, nullable=False, index=True) + # exponential order of merged diff, 0 is a source diff file uploaded by user, > 0 is merged diff + rank = db.Column(db.Integer, nullable=False, index=True) + # to which project version is this linked + version = db.Column(db.Integer, nullable=False, index=True) + # path on FS relative to project directory + location = db.Column(db.String) + size = db.Column(db.BigInteger, nullable=False) + checksum = db.Column(db.String, nullable=False) + + __table_args__ = ( + db.UniqueConstraint("file_path_id", "rank", "version", name="unique_diff"), + db.Index("ix_file_diff_file_path_id_version_rank", file_path_id, version, rank), + ) + + def __init__( + self, + basefile: FileHistory, + path: str, + size: int, + checksum: str, + rank: int, + version: int, + ): + self.basefile_id = basefile.id + self.file_path_id = basefile.file_path_id + self.path = path + self.size = size + self.checksum = checksum + self.rank = rank + 
self.version = version + + if rank > 0: + self.location = f"diffs/{path}" + else: + self.location = f"v{version}/{path}" + class ProjectVersion(db.Model): id = db.Column(db.Integer, primary_key=True, autoincrement=True) @@ -760,11 +859,12 @@ def __init__( diff=( asdict(upload_file.diff) if (is_diff_change and upload_file.diff) - else null() + else None ), change=( PushChangeType.UPDATE_DIFF if is_diff_change else change_type ), + version_name=self.name, ) fh.version = self fh.project_version_name = self.name @@ -822,14 +922,18 @@ def _files_from_start(self): SELECT fp.path, fh.size, - fh.diff, fh.location, fh.checksum, - pv.created AS mtime + pv.created AS mtime, + fd.path as diff_path, + fd.size as diff_size, + fd.checksum as diff_checksum, + fd.location as diff_location FROM latest_changes ch LEFT OUTER JOIN file_history fh ON (fh.file_path_id = ch.id AND fh.project_version_name = ch.version) LEFT OUTER JOIN project_file_path fp ON fp.id = fh.file_path_id LEFT OUTER JOIN project_version pv ON pv.id = fh.version_id + LEFT OUTER JOIN file_diff fd ON fd.file_path_id = fh.file_path_id AND fd.version = fh.project_version_name and fd.rank = 0 WHERE fh.change != 'delete'; """ params = {"project_id": self.project_id, "version": self.name} @@ -878,14 +982,18 @@ def _files_from_end(self): SELECT fp.path, fh.size, - fh.diff, fh.location, fh.checksum, - pv.created AS mtime + pv.created AS mtime, + fd.path as diff_path, + fd.size as diff_size, + fd.checksum as diff_checksum, + fd.location as diff_location FROM files_changes_before_version ch INNER JOIN file_history fh ON (fh.file_path_id = ch.file_id AND fh.project_version_name = ch.version) INNER JOIN project_file_path fp ON fp.id = fh.file_path_id INNER JOIN project_version pv ON pv.id = fh.version_id + LEFT OUTER JOIN file_diff fd ON fd.file_path_id = fh.file_path_id AND fd.version = fh.project_version_name and fd.rank = 0 WHERE fh.change != 'delete' ORDER BY fp.path; """ @@ -909,7 +1017,16 @@ def files(self) -> List[ProjectFile]: checksum=row.checksum, location=row.location, mtime=row.mtime, - diff=File(**row.diff) if row.diff else None, + diff=( + File( + path=row.diff_path, + checksum=row.diff_checksum, + size=row.diff_size, + location=row.diff_location, + ) + if row.diff_path + else None + ), ) for row in result ] diff --git a/server/mergin/sync/public_api_controller.py b/server/mergin/sync/public_api_controller.py index 9fd229a1..b3fb3c1b 100644 --- a/server/mergin/sync/public_api_controller.py +++ b/server/mergin/sync/public_api_controller.py @@ -40,6 +40,7 @@ from ..auth import auth_required from ..auth.models import User from .models import ( + FileDiff, Project, ProjectVersion, Upload, diff --git a/server/mergin/sync/tasks.py b/server/mergin/sync/tasks.py index f56fb273..f96ebaa2 100644 --- a/server/mergin/sync/tasks.py +++ b/server/mergin/sync/tasks.py @@ -89,7 +89,7 @@ def optimize_storage(project_id): for item in f_history: # no diffs, it is a basefile for geodiff - if not item.diff: + if not item.diff_file: continue # skip the latest file version (high chance of being used) diff --git a/server/mergin/sync/utils.py b/server/mergin/sync/utils.py index c4d5fa16..3235b97f 100644 --- a/server/mergin/sync/utils.py +++ b/server/mergin/sync/utils.py @@ -345,9 +345,8 @@ def files_size(): WHERE change = 'create'::push_change_type OR change = 'update'::push_change_type UNION SELECT - SUM(COALESCE((diff ->> 'size')::bigint, 0)) - FROM file_history - WHERE change = 'update_diff'::push_change_type + SUM(size) + FROM file_diff UNION SELECT SUM(size) 
diff --git a/server/mergin/tests/test_project_controller.py b/server/mergin/tests/test_project_controller.py index b1f60a8f..a5c22a5b 100644 --- a/server/mergin/tests/test_project_controller.py +++ b/server/mergin/tests/test_project_controller.py @@ -25,6 +25,7 @@ from sqlalchemy import desc from ..app import db from ..sync.models import ( + FileDiff, Project, Upload, ProjectVersion, @@ -443,7 +444,7 @@ def test_add_project(client, app, data, expected): assert not any(file.diff for file in proj_files) assert not any(file.diff for file in pv.files) assert all( - item.change == PushChangeType.CREATE.value and not item.diff + item.change == PushChangeType.CREATE.value and not item.diff_file for item in pv.changes.all() ) # cleanup @@ -1597,7 +1598,7 @@ def test_push_no_diff_finish(client): file_meta = latest_version.changes.filter( FileHistory.change == PushChangeType.UPDATE_DIFF.value ).first() - assert file_meta.diff is not None + assert file_meta.diff_file is not None assert os.path.exists( os.path.join(upload.project.storage.project_dir, file_meta.diff_file.location) ) @@ -2370,7 +2371,8 @@ def test_version_files(client, diff_project): x.checksum == y.checksum and x.path == y.path and x.location == y.location - and x.diff == y.diff + and x.diff_path == y.diff_path + and x.diff_checksum == y.diff_checksum for x, y in zip( sorted(forward_search, key=lambda f: f.path), sorted(backward_search, key=lambda f: f.path), @@ -2397,7 +2399,7 @@ def test_delete_diff_file(client): project_version_name=upload.project.latest_version, change=PushChangeType.UPDATE_DIFF.value, ).first() - assert fh.diff is not None + assert fh.diff_file is not None # delete file diff_change = next( @@ -2424,7 +2426,7 @@ def test_delete_diff_file(client): project_version_name=upload.project.latest_version, change=PushChangeType.DELETE.value, ).first() - assert fh.path == "base.gpkg" and fh.diff is None + assert fh.path == "base.gpkg" and fh.diff_file is None def test_cache_files_ids(client): diff --git a/server/mergin/tests/test_workspace.py b/server/mergin/tests/test_workspace.py index 2aafc268..e25f4365 100644 --- a/server/mergin/tests/test_workspace.py +++ b/server/mergin/tests/test_workspace.py @@ -52,6 +52,7 @@ def test_workspace_implementation(client): Configuration.GLOBAL_ADMIN = True # create project with dummy file to count for workspace usage project = create_project("test_permissions", ws, user) + latest_version = project.get_latest_version() file = ProjectFilePath(project.id, path="some_file.txt") file_history = FileHistory( file, @@ -60,12 +61,11 @@ def test_workspace_implementation(client): ), checksum="89469a6482267de394c7c7270cb7ffafe694ea76", size=1024, - diff=null(), + diff=None, change=PushChangeType.CREATE, + version_name=latest_version.name, ) - latest_version = project.get_latest_version() file_history.version = latest_version - file_history.project_version_name = file_history.version.name default_project_usage = ws.disk_usage() db.session.add(file_history) project.disk_usage = 1024 diff --git a/server/mergin/tests/utils.py b/server/mergin/tests/utils.py index 94fc033f..a0bbf3fb 100644 --- a/server/mergin/tests/utils.py +++ b/server/mergin/tests/utils.py @@ -19,7 +19,13 @@ from ..auth.models import User, UserProfile from ..sync.utils import generate_location, generate_checksum -from ..sync.models import Project, ProjectVersion, FileHistory, ProjectRole +from ..sync.models import ( + Project, + ProjectVersion, + FileHistory, + ProjectRole, + PushChangeType, +) from ..sync.files import UploadChanges, 
ChangesSchema from ..sync.workspace import GlobalWorkspace from ..app import db @@ -186,7 +192,7 @@ def initialize(): # make sure for history without diff there is a proper Null in database jsonb column assert FileHistory.query.filter_by(version_id=pv.id).filter( - FileHistory.diff.is_(None) + FileHistory.changes != PushChangeType.UPDATE_DIFF.value ).count() == len(project_files) # mimic files were uploaded From f50eb9684f327975cc71f3aab9d2682581506e6f Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Thu, 17 Jul 2025 14:39:29 +0200 Subject: [PATCH 02/36] Add alembic migration for file diff table --- .../bd1ec73db389_create_file_diff_table.py | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 server/migrations/community/bd1ec73db389_create_file_diff_table.py diff --git a/server/migrations/community/bd1ec73db389_create_file_diff_table.py b/server/migrations/community/bd1ec73db389_create_file_diff_table.py new file mode 100644 index 00000000..78ce673c --- /dev/null +++ b/server/migrations/community/bd1ec73db389_create_file_diff_table.py @@ -0,0 +1,140 @@ +"""create file diff table + +Revision ID: bd1ec73db389 +Revises: 6cb54659c1de +Create Date: 2025-07-17 14:17:02.373645 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = "bd1ec73db389" +down_revision = "6cb54659c1de" +branch_labels = None +depends_on = None + + +def upgrade(): + op.create_table( + "file_diff", + sa.Column("id", sa.BigInteger(), autoincrement=True, nullable=False), + sa.Column("file_path_id", sa.BigInteger(), nullable=False), + sa.Column("basefile_id", sa.BigInteger(), nullable=False), + sa.Column("path", sa.String(), nullable=False), + sa.Column("rank", sa.Integer(), nullable=False), + sa.Column("version", sa.Integer(), nullable=False), + sa.Column("location", sa.String(), nullable=True), + sa.Column("size", sa.BigInteger(), nullable=False), + sa.Column("checksum", sa.String(), nullable=False), + sa.ForeignKeyConstraint( + ["basefile_id"], + ["file_history.id"], + name=op.f("fk_file_diff_basefile_id_file_history"), + ondelete="CASCADE", + ), + sa.ForeignKeyConstraint( + ["file_path_id"], + ["project_file_path.id"], + name=op.f("fk_file_diff_file_path_id_project_file_path"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_file_diff")), + sa.UniqueConstraint("file_path_id", "rank", "version", name="unique_diff"), + ) + op.create_index( + op.f("ix_file_diff_basefile_id"), "file_diff", ["basefile_id"], unique=False + ) + op.create_index( + "ix_file_diff_file_path_id_version_rank", + "file_diff", + ["file_path_id", "version", "rank"], + unique=False, + ) + op.create_index(op.f("ix_file_diff_path"), "file_diff", ["path"], unique=False) + op.create_index(op.f("ix_file_diff_rank"), "file_diff", ["rank"], unique=False) + op.create_index( + op.f("ix_file_diff_version"), "file_diff", ["version"], unique=False + ) + + # migrate data + conn = op.get_bind() + conn.execute( + """ + WITH diffs AS ( + SELECT * + FROM file_history + WHERE diff IS NOT NULL + ), + basefiles AS ( + SELECT DISTINCT + fh.id AS basefile_id, + fh.file_path_id, + fh.project_version_name AS basefile_version + FROM diffs d + LEFT OUTER JOIN file_history fh ON fh.file_path_id = d.file_path_id + WHERE + fh.change = ANY(ARRAY['create'::push_change_type, 'update'::push_change_type]) + ), + relevant_basefiles AS ( + SELECT + d.id, + d.project_version_name, + b.basefile_id, + b.basefile_version + FROM diffs d + LEFT 
OUTER JOIN basefiles b ON b.file_path_id = d.file_path_id AND b.basefile_version < d.project_version_name + ) + INSERT INTO file_diff (file_path_id, basefile_id, rank, version, path, size, checksum, location) + SELECT DISTINCT + d.file_path_id, + FIRST_VALUE(rb.basefile_id) OVER (PARTITION BY rb.id ORDER BY rb.basefile_version DESC) as basefile_id, + 0 AS rank, + d.project_version_name AS version, + (d.diff ->> 'path') AS path, + (d.diff ->> 'size')::bigint AS size, + d.diff ->> 'checksum' AS checksum, + d.diff ->> 'location' AS location + FROM diffs d + LEFT OUTER JOIN relevant_basefiles rb ON rb.id = d.id; + """ + ) + + op.drop_column("file_history", "diff") + + +def downgrade(): + op.add_column( + "file_history", + sa.Column( + "diff", + postgresql.JSONB(astext_type=sa.Text()), + autoincrement=False, + nullable=True, + ), + ) + + # migrate data + conn = op.get_bind() + conn.execute( + """ + UPDATE file_history fh + SET diff = jsonb_build_object( + 'path', fd.path, + 'size', fd.size, + 'checksum', fd.checksum, + 'location', fd.location + ) + FROM file_diff fd + WHERE fh.file_path_id = fd.file_path_id AND fh.project_version_name = fd.version AND fd.rank = 0; + """ + ) + + op.drop_index(op.f("ix_file_diff_version"), table_name="file_diff") + op.drop_index(op.f("ix_file_diff_rank"), table_name="file_diff") + op.drop_index(op.f("ix_file_diff_path"), table_name="file_diff") + op.drop_index("ix_file_diff_file_path_id_version_rank", table_name="file_diff") + op.drop_index(op.f("ix_file_diff_basefile_id"), table_name="file_diff") + op.drop_table("file_diff") From 8c18891246cad0b14d39741b3b37ff04a142ff63 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Fri, 29 Aug 2025 11:11:16 +0200 Subject: [PATCH 03/36] Utils for generating cached version levels from versions --- server/mergin/sync/utils.py | 78 ++++++++++++++++++++++++++++++- server/mergin/tests/test_utils.py | 50 ++++++++++++++++++++ 2 files changed, 127 insertions(+), 1 deletion(-) diff --git a/server/mergin/sync/utils.py b/server/mergin/sync/utils.py index 3235b97f..595a4b79 100644 --- a/server/mergin/sync/utils.py +++ b/server/mergin/sync/utils.py @@ -7,13 +7,14 @@ import hashlib import re import secrets +from dataclasses import dataclass from threading import Timer from uuid import UUID from shapely import wkb from shapely.errors import ShapelyError from gevent import sleep from flask import Request -from typing import Optional +from typing import List, Optional from sqlalchemy import text from pathvalidate import ( validate_filename, @@ -577,3 +578,78 @@ def get_x_accel_uri(*url_parts): url = url.lstrip(os.path.sep) result = os.path.join(download_accell_uri, url) return result + + +LOG_BASE = 4 + + +@dataclass +class CachedLevel: + """ + Cached level of version tree. 
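+    With LOG_BASE = 4, a level of rank r and index i covers versions
+    1 + 4**r * (i - 1) through 4**r * i, e.g. rank=1, index=5 covers
+    v17-v20 and rank=2, index=1 covers v1-v16.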
+ Used as a checkpoint to merge individual versions / diff files into bigger chunks + """ + + rank: int # power of base + index: int # index of level - multiplyer of rank + + def __post_init__(self): + if type(self.rank) is not int or type(self.index) is not int: + raise ValueError("rank and index must be integers") + + if self.rank < 0 or self.index < 1: + raise ValueError("rank must be positive and index starts from 1") + + @property + def start(self) -> int: + """Start of the range covered by this level""" + return 1 + (LOG_BASE**self.rank * (self.index - 1)) + + @property + def end(self) -> int: + """End of the range covered by this level""" + return LOG_BASE**self.rank * self.index + + def __str__(self) -> str: + return f"CachedLevel(rank={self.rank}, index={self.index}, versions=v{self.start}-v{self.end})" + + +def get_cached_levels(version: int) -> List[CachedLevel]: + """ + Return the most right part of version tree as other nodes are already cached. + Version must divisible by BASE, and then we calculate all cached levels related to it. + """ + levels = [] + rank_max = math.floor((math.log(version) / math.log(LOG_BASE))) + + for rank in range(1, rank_max + 1): + if version % LOG_BASE**rank: + continue + + index = version // LOG_BASE**rank + levels.append(CachedLevel(rank=rank, index=index)) + + return levels + + +def get_merged_versions(start: int, end: int) -> List[CachedLevel]: + """ + Get all (merged) versions between start version and end version while respecting cached levels. + This basically provide the list of smaller versions (checkpoints) to be merged in order to get the final version. + """ + levels = [] + while start <= end: + if start == end: + rank_max = 0 + else: + rank_max = math.floor((math.log(end - start + 1) / math.log(LOG_BASE))) + for rank in reversed(range(0, rank_max + 1)): + if (start - 1) % LOG_BASE**rank: + continue + + index = (start - 1) // LOG_BASE**rank + 1 + levels.append(CachedLevel(rank=rank, index=index)) + start = start + LOG_BASE**rank + break + + return levels diff --git a/server/mergin/tests/test_utils.py b/server/mergin/tests/test_utils.py index a9670a2f..23bb2aee 100644 --- a/server/mergin/tests/test_utils.py +++ b/server/mergin/tests/test_utils.py @@ -16,6 +16,8 @@ from ..utils import save_diagnostic_log_file from ..sync.utils import ( + get_cached_levels, + get_merged_versions, parse_gpkgb_header_size, gpkg_wkb_to_wkt, is_reserved_word, @@ -280,3 +282,51 @@ def test_save_diagnostic_log_file(client, app): with open(saved_file_path, "r") as f: content = f.read() assert content == body.decode("utf-8") + + +def test_checkpoint_utils(): + """Test util functions to construct merged versions of higher ranks (checkpoints)""" + + # all cached versions ending with 64 would be v61-v64 (4) v49-v64 (16) and v1-v64 (64) + cached_levels = get_cached_levels(64) + assert len(cached_levels) == 3 + assert cached_levels[0].rank == 1 + assert cached_levels[0].index == 16 + assert cached_levels[1].rank == 2 + assert cached_levels[1].index == 4 + assert cached_levels[2].rank == 3 + assert cached_levels[2].index == 1 + + # there would not be any cached versions ending with 65 + cached_levels = get_cached_levels(65) + assert len(cached_levels) == 0 + + # exact match to single rank + versions = get_merged_versions(1, 64) + assert len(versions) == 1 + assert versions[0].rank == 3 + assert versions[0].index == 1 + + # v21 would be created from v1-16, v17-20 and v21 + versions = get_merged_versions(1, 21) + assert len(versions) == 3 + assert versions[0].rank == 2 + 
assert versions[0].index == 1 + assert versions[1].rank == 1 + assert versions[1].index == 5 + assert versions[2].rank == 0 + assert versions[2].index == 21 + + # no cached versions at all, only basic levels v1-v3 + versions = get_merged_versions(1, 3) + assert len(versions) == 3 + assert versions[0].rank == 0 + assert versions[0].index == 1 + assert versions[1].rank == 0 + assert versions[1].index == 2 + assert versions[2].rank == 0 + assert versions[2].index == 3 + + # dummy request + versions = get_merged_versions(2, 1) + assert len(versions) == 0 From afb42e57fe73ad31012fba5843d76adb7a7effa3 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Mon, 1 Sep 2025 08:33:04 +0200 Subject: [PATCH 04/36] Add db hook to trigger caching on project version created --- server/mergin/sync/db_events.py | 54 +++++++++++++++++++ server/mergin/sync/tasks.py | 10 ++++ server/mergin/tests/fixtures.py | 4 +- server/mergin/tests/test_db_hooks.py | 40 ++++++++++++++ .../mergin/tests/test_project_controller.py | 5 +- 5 files changed, 110 insertions(+), 3 deletions(-) diff --git a/server/mergin/sync/db_events.py b/server/mergin/sync/db_events.py index 18d1ce60..c0bb96fd 100644 --- a/server/mergin/sync/db_events.py +++ b/server/mergin/sync/db_events.py @@ -5,8 +5,13 @@ import os from flask import current_app, abort from sqlalchemy import event +from sqlalchemy.sql import text from ..app import db +from .models import ProjectVersion +from .public_api_controller import project_version_created +from .tasks import create_diff_checkpoint +from .utils import get_cached_levels def check(session): @@ -14,9 +19,58 @@ def check(session): abort(503, "Service unavailable due to maintenance, please try later") +def create_checkpoints(project_version: ProjectVersion): + """ + Create version checkpoints related to new project version + """ + # for initial versions there is nothing to do + if project_version.name in (0, 1): + return + + cache_levels = get_cached_levels(project_version.name) + if not cache_levels: + return + + # get all diff-modified gpkg files + query = text( + """ + WITH gpkg_files AS ( + SELECT id + FROM project_file_path + WHERE + project_id = :project_id + AND lower(path) LIKE '%.gpkg' + ), + latest_updates AS ( + SELECT DISTINCT + gf.id, + max(fh.project_version_name) AS latest_version + FROM gpkg_files gf + INNER JOIN file_history fh ON fh.file_path_id = gf.id + GROUP BY gf.id + ) + SELECT + lu.id + FROM latest_updates lu + LEFT OUTER JOIN file_history fh ON lu.id = fh.file_path_id AND lu.latest_version = fh.project_version_name + WHERE fh.change = 'update_diff'; + """ + ) + result = db.session.execute( + query, {"project_id": project_version.project_id} + ).fetchall() + + # create batch of caching jobs + for row in result: + for level in cache_levels: + create_diff_checkpoint.delay(row.id, level.start, level.end) + + def register_events(): event.listen(db.session, "before_commit", check) + project_version_created.connect(create_checkpoints) def remove_events(): event.remove(db.session, "before_commit", check) + project_version_created.disconnect(create_checkpoints) diff --git a/server/mergin/sync/tasks.py b/server/mergin/sync/tasks.py index 1a84afe2..682c6347 100644 --- a/server/mergin/sync/tasks.py +++ b/server/mergin/sync/tasks.py @@ -163,3 +163,13 @@ def remove_projects_archives(): os.remove(path) except OSError as e: logging.error(f"Unable to remove {path}: {str(e)}") + + +@celery.task +def create_diff_checkpoint(file_id: int, start: int, end: int): + """Create a diff file checkpoint (aka. 
merged diff). + Find all smaller diffs which are needed to create the final diff file and merge them. + In case of missing some lower rank checkpoint, use individual diffs instead. + """ + db.session.info = {"msg": "create_diff_checkpoint"} + pass diff --git a/server/mergin/tests/fixtures.py b/server/mergin/tests/fixtures.py index 368eba6b..7b676f80 100644 --- a/server/mergin/tests/fixtures.py +++ b/server/mergin/tests/fixtures.py @@ -104,9 +104,9 @@ def diff_project(app): Following changes are applied to base.gpkg in tests project (v1): v2: removed file -> previous version is lost (unless requested explicitly) - v3: uploaded again + v3: uploaded again -> new basefile v4: patched with changes from inserted_1_A.gpkg (1 inserted feature) - v5: replaced with original file base.gpkg (mimic of force update) + v5: replaced with original file base.gpkg (mimic of force update) -> new basefile again v6: patched with changes from modified_1_geom.gpkg (translated feature) v7: patched with changes from inserted_1_B.gpkg (1 inserted feature), final state is modified_1_geom.gpkg + inserted_1_B.gpkg v8: nothing happened, just to ensure last diff is not last version of project file diff --git a/server/mergin/tests/test_db_hooks.py b/server/mergin/tests/test_db_hooks.py index e7f9e270..f67992bc 100644 --- a/server/mergin/tests/test_db_hooks.py +++ b/server/mergin/tests/test_db_hooks.py @@ -4,6 +4,9 @@ import os from pathlib import Path +from unittest.mock import patch + +import pytest from ..sync.models import ( Project, @@ -19,6 +22,7 @@ ProjectUser, ) from ..sync.files import UploadChanges +from ..sync.public_api_controller import project_version_created from ..auth.models import User from ..app import db from . import DEFAULT_USER @@ -167,3 +171,39 @@ def test_remove_project(client, diff_project): # try to remove the deleted project assert diff_project.delete() is None + + +test_caching_call_data = [ + (4, True), # success + (8, True), # success + (5, False), # call not divisible by 4 + (4, False), # fake last change to be a breaking change +] + + +@pytest.mark.parametrize("version,called", test_caching_call_data) +@patch("mergin.sync.tasks.create_diff_checkpoint.delay") +def test_trigger_diff_caching(checkpoint_mock, diff_project, version, called): + # make target version the latest version + ProjectVersion.query.filter_by(project_id=diff_project.id).filter( + ProjectVersion.name > version + ).delete() + db.session.commit() + + pv = ProjectVersion.query.filter_by( + project_id=diff_project.id, name=version + ).first() + # modify the last change to be a breaking change + if not called and version == 4: + fh = FileHistory.query.filter_by(version_id=pv.id, change="update_diff").first() + fh.change = "delete" + db.session.commit() + + project_version_created.send(pv) + assert checkpoint_mock.called == called + + if called: + # we asked for to cache first level, e.g. 
with versions 1..4 + _, start, end = checkpoint_mock.call_args[0] + assert start == version - 3 + assert end == version diff --git a/server/mergin/tests/test_project_controller.py b/server/mergin/tests/test_project_controller.py index a5c22a5b..a284e569 100644 --- a/server/mergin/tests/test_project_controller.py +++ b/server/mergin/tests/test_project_controller.py @@ -1087,7 +1087,8 @@ def test_push_project_start(client, data, expected): assert failure.error_type == "push_start" -def test_push_to_new_project(client): +@patch("mergin.sync.tasks.create_diff_checkpoint.delay") +def test_push_to_new_project(checkpoint_mock, client): # create blank project p = Project.query.filter_by( name=test_project, workspace_id=test_workspace_id @@ -1107,6 +1108,8 @@ def test_push_to_new_project(client): headers=json_headers, ) assert resp.status_code == 200 + # nothing to cache in new project + assert not checkpoint_mock.called upload_id = resp.json["transaction"] upload = Upload.query.filter_by(id=upload_id).first() From b14e55b1d43a54713ca69e9b0e1f7be6e4e05a95 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Thu, 4 Sep 2025 16:14:52 +0200 Subject: [PATCH 05/36] create celery task to generate diff checkpoint --- server/mergin/sync/models.py | 30 +++++-- server/mergin/sync/storages/disk.py | 1 + server/mergin/sync/tasks.py | 109 ++++++++++++++++++++++++- server/mergin/sync/utils.py | 3 + server/mergin/tests/test_celery.py | 121 +++++++++++++++++++++++++++- server/mergin/tests/utils.py | 11 +++ 6 files changed, 262 insertions(+), 13 deletions(-) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 8c859994..6f37c2c1 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -402,6 +402,11 @@ class ProjectFilePath(db.Model): ), ) + project = db.relationship( + "Project", + uselist=False, + ) + def __init__(self, project_id, path): self.project_id = project_id self.path = path @@ -699,14 +704,14 @@ def diffs_chain( def get_basefile(cls, file_path_id: int, version: int) -> Optional[FileHistory]: """Get basefile (start of file diffable history) for diff file change at some version""" return ( - FileHistory.query.filter_by(file_path_id=file_path_id) + cls.query.filter_by(file_path_id=file_path_id) .filter( - FileHistory.project_version_name < version, - FileHistory.change.in_( + cls.project_version_name < version, + cls.change.in_( [PushChangeType.CREATE.value, PushChangeType.UPDATE.value] ), ) - .order_by(desc(FileHistory.project_version_name)) + .order_by(desc(cls.project_version_name)) .first() ) @@ -742,6 +747,8 @@ class FileDiff(db.Model): db.Index("ix_file_diff_file_path_id_version_rank", file_path_id, version, rank), ) + file = db.relationship("ProjectFilePath", uselist=False) + def __init__( self, basefile: FileHistory, @@ -758,11 +765,18 @@ def __init__( self.checksum = checksum self.rank = rank self.version = version + self.location = ( + os.path.join("diffs", path) + if rank > 0 + else os.path.join(f"v{version}", path) + ) - if rank > 0: - self.location = f"diffs/{path}" - else: - self.location = f"v{version}/{path}" + @property + def abs_path(self) -> str: + """ + Return absolute path of the diff file on the file system. 
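+        Rank 0 diffs live in the version folder (v<version>/<path>),
+        merged diffs of rank > 0 in the project diffs/ directory.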
+ """ + return os.path.join(self.file.project.storage.project_dir, self.location) class ProjectVersion(db.Model): diff --git a/server/mergin/sync/storages/disk.py b/server/mergin/sync/storages/disk.py index 4debb255..f97d9884 100644 --- a/server/mergin/sync/storages/disk.py +++ b/server/mergin/sync/storages/disk.py @@ -134,6 +134,7 @@ def __init__(self, project): str(uuid.uuid4()), ) ) + self.diffs_dir = os.path.join(self.project_dir, "diffs") def _logger_callback(level, text_bytes): text = text_bytes.decode() diff --git a/server/mergin/sync/tasks.py b/server/mergin/sync/tasks.py index 682c6347..8a5f53f7 100644 --- a/server/mergin/sync/tasks.py +++ b/server/mergin/sync/tasks.py @@ -3,16 +3,21 @@ # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-MerginMaps-Commercial import logging +import math import shutil import os import time from datetime import datetime, timedelta, timezone +import uuid from zipfile import ZIP_DEFLATED, ZipFile from flask import current_app +from pygeodiff import GeoDiffLibError +from pygeodiff.geodifflib import GeoDiffLibConflictError -from .models import Project, ProjectVersion, FileHistory +from .models import FileDiff, Project, ProjectVersion, FileHistory from .storages.disk import move_to_tmp from .config import Configuration +from .utils import LOG_BASE, generate_checksum, get_merged_versions from ..celery import celery from ..app import db @@ -172,4 +177,104 @@ def create_diff_checkpoint(file_id: int, start: int, end: int): In case of missing some lower rank checkpoint, use individual diffs instead. """ db.session.info = {"msg": "create_diff_checkpoint"} - pass + diff_range = end - start + 1 + + # invalid request as there would not be a checkpoint with this range + if end % LOG_BASE or diff_range % LOG_BASE: + return + + rank = math.log(diff_range) / math.log(LOG_BASE) + if not rank.is_integer(): + return + + # checkpoint already exists + file_diff = FileDiff.query.filter_by( + file_path_id=file_id, version=end, rank=rank + ).first() + if file_diff and os.path.exists(file_diff.abs_path): + return + + basefile = FileHistory.get_basefile(file_id, end) + if not basefile: + logging.error(f"Unable to find basefile for file {file_id}") + return + + if basefile.project_version_name > start: + logging.error( + f"Basefile version {basefile.project_version_name} is higher than start version {start} - broken history" + ) + return + + diffs_paths = [] + file_diffs = ( + FileDiff.query.filter( + FileDiff.basefile_id == basefile.id, + FileDiff.version >= start, + FileDiff.version <= end, + ) + .order_by(FileDiff.rank, FileDiff.version) + .all() + ) + + # we apply latest change (if any) on previous version + end_diff = next((d for d in file_diffs if d.version == end and d.rank == 0), None) + # let's confirm we have all intermediate diffs needed, if not, we need to use individual diffs instead + for merged_diff in get_merged_versions(start, end - 1): + # basefile is a start of the diff chain + if merged_diff.start <= basefile.project_version_name: + continue + + # find diff in table and on disk + diff = next( + ( + d + for d in file_diffs + if d.version == merged_diff.end and d.rank == merged_diff.rank + ), + None, + ) + if diff and os.path.exists(diff.abs_path): + diffs_paths.append(diff.abs_path) + else: + individual_diffs = [ + d.abs_path + for d in file_diffs + if d.rank == 0 + and d.version >= merged_diff.start + and d.version <= merged_diff.end + ] + if individual_diffs: + diffs_paths.extend(individual_diffs) + else: + logging.error( + f"Unable to find diffs for 
{merged_diff} for file {file_id}" + ) + return + + if end_diff: + diffs_paths.append(end_diff.abs_path) + + if not diffs_paths: + logging.warning(f"No diffs for next checkpoint for file {file_id}") + return + + project: Project = basefile.file.project + checkpoint_path = f"diff-{uuid.uuid4()}" + os.makedirs(project.storage.diffs_dir, exist_ok=True) + checkpoint_file = os.path.join(project.storage.diffs_dir, checkpoint_path) + try: + project.storage.geodiff.concat_changes(diffs_paths, checkpoint_file) + except (GeoDiffLibError, GeoDiffLibConflictError): + logging.error(f"Geodiff: Failed to merge diffs for file {file_id}") + return + + checkpoint = FileDiff( + basefile=basefile, + path=checkpoint_path, + size=os.path.getsize(checkpoint_file), + checksum=generate_checksum(checkpoint_file), + rank=rank, + version=end, + ) + db.session.add(checkpoint) + db.session.commit() diff --git a/server/mergin/sync/utils.py b/server/mergin/sync/utils.py index 595a4b79..c4497e13 100644 --- a/server/mergin/sync/utils.py +++ b/server/mergin/sync/utils.py @@ -613,6 +613,9 @@ def end(self) -> int: def __str__(self) -> str: return f"CachedLevel(rank={self.rank}, index={self.index}, versions=v{self.start}-v{self.end})" + def __repr__(self) -> str: + return str(self) + def get_cached_levels(version: int) -> List[CachedLevel]: """ diff --git a/server/mergin/tests/test_celery.py b/server/mergin/tests/test_celery.py index 50420cf0..091e5787 100644 --- a/server/mergin/tests/test_celery.py +++ b/server/mergin/tests/test_celery.py @@ -5,16 +5,25 @@ import os from datetime import datetime, timedelta from pathlib import Path - +import shutil from flask import current_app from flask_mail import Mail +from pygeodiff import GeoDiffLibError from unittest.mock import patch from ..app import db from ..config import Configuration -from ..sync.models import Project, AccessRequest, ProjectRole, ProjectVersion +from ..sync.models import ( + FileDiff, + Project, + AccessRequest, + ProjectFilePath, + ProjectRole, + ProjectVersion, +) from ..celery import send_email_async from ..sync.tasks import ( + create_diff_checkpoint, remove_temp_files, remove_projects_backups, create_project_version_zip, @@ -22,7 +31,17 @@ ) from ..sync.storages.disk import move_to_tmp from . import test_project, test_workspace_name, test_workspace_id -from .utils import add_user, create_workspace, create_project, login, modify_file_times +from .utils import ( + add_user, + create_workspace, + create_project, + diffs_are_equal, + execute_query, + gpkgs_are_equal, + login, + modify_file_times, + push_change, +) from ..auth.models import User from . 
import json_headers @@ -187,3 +206,99 @@ def test_create_project_version_zip(diff_project): modify_file_times(latest_version.zip_path, new_time) remove_projects_archives() # zip has expired -> remove assert not os.path.exists(latest_version.zip_path) + + +def test_create_diff_checkpoint(diff_project): + # add changes v11-v32 where v9 is a basefile + file_path_id = ( + ProjectFilePath.query.filter_by(project_id=diff_project.id, path="test.gpkg") + .first() + .id + ) + + basefile = os.path.join(diff_project.storage.project_dir, "test.gpkg") + shutil.copy( + os.path.join(diff_project.storage.project_dir, "v9", "test.gpkg"), basefile + ) + for i in range(22): + sql = f"UPDATE simple SET rating={i}" + execute_query(basefile, sql) + pv = push_change( + diff_project, "updated", "test.gpkg", diff_project.storage.project_dir + ) + assert diff_project.latest_version == pv.name == (11 + i) + file_diff = FileDiff.query.filter_by( + file_path_id=file_path_id, version=pv.name, rank=0 + ).first() + assert file_diff and os.path.exists(file_diff.abs_path) + + # diff for v17-v20 from individual diffs + create_diff_checkpoint(file_path_id, 17, 20) + diff = FileDiff.query.filter_by( + file_path_id=file_path_id, version=20, rank=1 + ).first() + assert os.path.exists(diff.abs_path) + + # repeat - nothing to do + mtime = os.path.getmtime(diff.abs_path) + create_diff_checkpoint(file_path_id, 17, 20) + assert mtime == os.path.getmtime(diff.abs_path) + + # diff for v17-v32 with merged diffs (using one above) + create_diff_checkpoint(file_path_id, 17, 32) + diff = FileDiff.query.filter_by( + file_path_id=file_path_id, version=32, rank=2 + ).first() + assert os.path.exists(diff.abs_path) + # assert gpkg diff is the same as it would be from merging all individual diffs + individual_diffs = ( + FileDiff.query.filter_by(file_path_id=file_path_id, rank=0) + .filter(FileDiff.version.between(17, 32)) + .all() + ) + merged_diff = os.path.join(diff_project.storage.diffs_dir, "merged-diff") + diff_project.storage.geodiff.concat_changes( + [d.abs_path for d in individual_diffs], merged_diff + ) + assert diffs_are_equal(diff.abs_path, merged_diff) + + # test various failures + with patch.object(diff_project.storage.geodiff, "concat_changes") as mock: + # diff for not existing version + create_diff_checkpoint(file_path_id, 33, 36) + assert not FileDiff.query.filter_by( + file_path_id=file_path_id, version=36 + ).count() + + # diff for invalid range + create_diff_checkpoint(file_path_id, 17, 31) + assert not FileDiff.query.filter_by( + file_path_id=file_path_id, version=31, rank=1 + ).count() + + create_diff_checkpoint(file_path_id, 27, 32) + assert not FileDiff.query.filter_by( + file_path_id=file_path_id, version=32, rank=1 + ).count() + + # diff with broken history at v9 + create_diff_checkpoint(file_path_id, 5, 20) + assert not FileDiff.query.filter_by( + file_path_id=file_path_id, version=20, rank=2 + ).count() + + # diff for missing basefile (e.g. 
deleted file or not-existing file) + create_diff_checkpoint(file_path_id, 5, 8) + assert not FileDiff.query.filter_by( + file_path_id=file_path_id, version=8, rank=1 + ).count() + + assert not mock.called + + # geodiff failure + mock.side_effect = GeoDiffLibError + create_diff_checkpoint(file_path_id, 13, 16) + assert mock.called + assert not FileDiff.query.filter_by( + file_path_id=file_path_id, version=16, rank=1 + ).count() diff --git a/server/mergin/tests/utils.py b/server/mergin/tests/utils.py index a0bbf3fb..9af69eb9 100644 --- a/server/mergin/tests/utils.py +++ b/server/mergin/tests/utils.py @@ -393,3 +393,14 @@ def modify_file_times(path, time: datetime, accessed=True, modified=True): mtime = epoch_time if modified else file_stat.st_mtime os.utime(path, (atime, mtime)) + + +def diffs_are_equal(diff1, diff2): + """Compare changes of two geodiff files""" + changes1 = os.path.join(TMP_DIR, "changeset" + str(uuid.uuid4())) + changes2 = os.path.join(TMP_DIR, "changeset" + str(uuid.uuid4())) + geodiff = GeoDiff() + geodiff.list_changes_summary(diff1, changes1) + geodiff.list_changes_summary(diff2, changes2) + with open(changes1) as f, open(changes2) as f2: + return json.load(f) == json.load(f2) From 4461ec9d33c336852c22606ea3c77de4b7654360 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Fri, 5 Sep 2025 13:33:18 +0200 Subject: [PATCH 06/36] Use diff checkpoint in gpkg restore function --- server/mergin/sync/models.py | 148 ++++++++---------- server/mergin/sync/storages/disk.py | 38 ++--- server/mergin/sync/tasks.py | 58 ++++--- server/mergin/tests/test_celery.py | 18 ++- server/mergin/tests/test_file_restore.py | 26 +-- .../mergin/tests/test_project_controller.py | 69 +++----- 6 files changed, 162 insertions(+), 195 deletions(-) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 6f37c2c1..46236fa4 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -14,7 +14,7 @@ from blinker import signal from flask_login import current_user from pygeodiff import GeoDiff -from sqlalchemy import text, null, desc, nullslast +from sqlalchemy import text, null, desc, nullslast, tuple_ from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, UUID, JSONB, ENUM from sqlalchemy.types import String from sqlalchemy.ext.hybrid import hybrid_property @@ -31,7 +31,7 @@ from .storages.disk import move_to_tmp from ..app import db from .storages import DiskStorage -from .utils import is_versioned_file, is_qgis +from .utils import get_merged_versions, is_versioned_file, is_qgis Storages = {"local": DiskStorage} project_deleted = signal("project_deleted") @@ -610,93 +610,81 @@ def changes( @classmethod def diffs_chain( - cls, project: Project, file: str, version: int - ) -> Tuple[Optional[FileHistory], List[Optional[File]]]: - """Find chain of diffs from the closest basefile that leads to a given file at certain project version. + cls, file_id: int, version: int + ) -> Tuple[Optional[FileHistory], List[Optional[FileDiff]]]: + """Find chain of diffs from the basefile that leads to a given file at certain project version. Returns basefile and list of diffs for gpkg that needs to be applied to reconstruct file. List of diffs can be empty if basefile was eventually asked. Basefile can be empty if file cannot be reconstructed (removed/renamed). 
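        Merged diff checkpoints are preferred; any missing checkpoint is
        replaced by the individual rank 0 diffs covering the same range.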
""" + latest_change = ( + cls.query.filter_by(file_path_id=file_id) + .filter(cls.project_version_name <= version) + .order_by(desc(cls.project_version_name)) + .first() + ) + # file never existed prior that version + if not latest_change: + return None, [] + + # the last update to file was a delete + if latest_change.change == PushChangeType.DELETE.value: + return None, [] + + # the last update to file was a create / force update + if latest_change.change in ( + PushChangeType.CREATE.value, + PushChangeType.UPDATE.value, + ): + return latest_change, [] + + basefile = cls.get_basefile(file_id, version) + if not basefile: + return None, [] + diffs = [] - basefile = None - v_x = version # the version of interest - v_last = project.latest_version - - # we ask for the latest version which is always a basefile if the file has not been removed - if v_x == v_last: - latest_change = ( - FileHistory.query.join(ProjectFilePath) - .join(FileHistory.version) - .filter( - ProjectFilePath.path == file, - ProjectVersion.project_id == project.id, + cached_items = get_merged_versions(basefile.project_version_name, version) + expected_diffs = ( + FileDiff.query.filter_by( + basefile_id=basefile.id, + ) + .filter( + tuple_(FileDiff.rank, FileDiff.version).in_( + [(item.rank, item.end) for item in cached_items] ) - .order_by(desc(ProjectVersion.created)) - .first() ) - if latest_change.change != PushChangeType.DELETE.value: - return latest_change, [] + .all() + ) + + for item in cached_items: + diff = next( + ( + d + for d in expected_diffs + if d.rank == item.rank and d.version == item.end + ), + None, + ) + if diff: + diffs.append(diff) + elif item.rank > 0: + # fallback if checkpoint does not exist: replace merged diff with individual diffs + individual_diffs = ( + FileDiff.query.filter_by( + basefile_id=basefile.id, + rank=0, + ) + .filter( + FileDiff.version >= item.start, FileDiff.version <= item.end + ) + .order_by(FileDiff.version) + .all() + ) + diffs.extend(individual_diffs) else: - # file is actually not in the latest project version - return None, [] - - # check if it would not be faster to look up from the latest version - backward = (v_last - v_x) < v_x - - if backward: - # get list of file history changes starting with the latest version (v_last, ..., v_x+n, (..., v_x)) - history = FileHistory.changes(project.id, file, v_x, v_last, diffable=True) - if history: - first_change = history[-1] - # we have either full history of changes or v_x = v_x+n => no basefile in way, it is 'diffable' from the end - if first_change.change == PushChangeType.UPDATE_DIFF.value: - # omit diff for target version as it would lead to previous version if reconstructed backward - diffs = [ - value.diff_file - for value in reversed(history) - if value.version.name != v_x - ] - basefile = history[0] - return basefile, diffs - # there was either breaking change or v_x is a basefile itself - else: - # we asked for basefile - if v_x == first_change.version.name and first_change.change in [ - PushChangeType.CREATE.value, - PushChangeType.UPDATE.value, - ]: - return first_change, [] - # file was removed (or renamed for backward compatibility) - elif v_x == first_change.version.name: - return basefile, diffs - # there was a breaking change in v_x+n, and we need to search from start - else: - pass - - # we haven't found anything so far, search from v1 - if not (basefile and diffs): - # get ordered dict of file history starting with version of interest (v_x, ..., v_x-n, (..., v_1)) - history = FileHistory.changes(project.id, 
file, 1, v_x, diffable=True) - if history: - first_change = history[-1] - # we found basefile - if first_change.change in [ - PushChangeType.CREATE.value, - PushChangeType.UPDATE.value, - ]: - basefile = first_change - if v_x == first_change.version.name: - # we asked for basefile - diffs = [] - else: - # basefile has no diff - diffs = [ - value.diff_file for value in list(reversed(history))[1:] - ] - # file was removed (or renamed for backward compatibility) - else: - pass + # we asked for individual diff but there is no such diff as there was not change at that version + continue return basefile, diffs diff --git a/server/mergin/sync/storages/disk.py b/server/mergin/sync/storages/disk.py index f97d9884..5c27e9ea 100644 --- a/server/mergin/sync/storages/disk.py +++ b/server/mergin/sync/storages/disk.py @@ -372,7 +372,12 @@ def restore_versioned_file(self, file: str, version: int): :param file: path of file in project to recover :param version: project version (e.g. 2) """ - from ..models import GeodiffActionHistory, ProjectVersion, FileHistory + from ..models import ( + GeodiffActionHistory, + ProjectVersion, + FileHistory, + ProjectFilePath, + ) if not is_versioned_file(file): return @@ -393,7 +398,13 @@ def restore_versioned_file(self, file: str, version: int): ): return - base_meta, diffs = FileHistory.diffs_chain(self.project, file, version) + file_id = ( + ProjectFilePath.query.filter_by(path=file, project_id=self.project.id) + .first() + .id + ) + + base_meta, diffs = FileHistory.diffs_chain(file_id, version) if not (base_meta and diffs): return @@ -412,31 +423,17 @@ def restore_versioned_file(self, file: str, version: int): ) if len(diffs) > 1: # concatenate multiple diffs into single one - partials = [ - os.path.join(self.project_dir, d.location) for d in diffs - ] + partials = [d.abs_path for d in diffs] self.geodiff.concat_changes(partials, changeset) else: - copy_file( - os.path.join(self.project_dir, diffs[0].location), changeset - ) + copy_file(diffs[0].abs_path, changeset) logging.info( f"Geodiff: apply changeset {changeset} of size {os.path.getsize(changeset)}" ) - # if we are going backwards we need to reverse changeset! 
- if base_meta.version.name > version: - logging.info(f"Geodiff: inverting changeset") - changes = os.path.join( - self.geodiff_working_dir, - os.path.basename(base_meta.abs_path) + "-diff-inv", - ) - self.geodiff.invert_changeset(changeset, changes) - else: - changes = changeset start = time.time() - self.geodiff.apply_changeset(restored_file, changes) + self.geodiff.apply_changeset(restored_file, changeset) # track geodiff event for performance analysis gh = GeodiffActionHistory( self.project.id, @@ -445,7 +442,7 @@ def restore_versioned_file(self, file: str, version: int): base_meta.size, ProjectVersion.to_v_name(project_version.name), "restore_file", - changes, + changeset, ) apply_time = time.time() - start gh.geodiff_time = apply_time @@ -457,7 +454,6 @@ def restore_versioned_file(self, file: str, version: int): ) return finally: - move_to_tmp(changes) move_to_tmp(changeset) # move final restored file to place where it is expected (only after it is successfully created) logging.info( diff --git a/server/mergin/sync/tasks.py b/server/mergin/sync/tasks.py index 8a5f53f7..d3442efb 100644 --- a/server/mergin/sync/tasks.py +++ b/server/mergin/sync/tasks.py @@ -13,6 +13,7 @@ from flask import current_app from pygeodiff import GeoDiffLibError from pygeodiff.geodifflib import GeoDiffLibConflictError +from sqlalchemy import tuple_ from .models import FileDiff, Project, ProjectVersion, FileHistory from .storages.disk import move_to_tmp @@ -206,51 +207,60 @@ def create_diff_checkpoint(file_id: int, start: int, end: int): return diffs_paths = [] - file_diffs = ( - FileDiff.query.filter( - FileDiff.basefile_id == basefile.id, - FileDiff.version >= start, - FileDiff.version <= end, + + # let's confirm we have all intermediate diffs needed, if not, we need to use individual diffs instead + cached_items = get_merged_versions(start, end - 1) + expected_diffs = ( + FileDiff.query.filter_by( + basefile_id=basefile.id, + ) + .filter( + tuple_(FileDiff.rank, FileDiff.version).in_( + [(item.rank, item.end) for item in cached_items] + ) ) - .order_by(FileDiff.rank, FileDiff.version) .all() ) - # we apply latest change (if any) on previous version - end_diff = next((d for d in file_diffs if d.version == end and d.rank == 0), None) - # let's confirm we have all intermediate diffs needed, if not, we need to use individual diffs instead - for merged_diff in get_merged_versions(start, end - 1): + for item in cached_items: # basefile is a start of the diff chain - if merged_diff.start <= basefile.project_version_name: + if item.start <= basefile.project_version_name: continue # find diff in table and on disk diff = next( ( d - for d in file_diffs - if d.version == merged_diff.end and d.rank == merged_diff.rank + for d in expected_diffs + if d.rank == item.rank and d.version == item.end ), None, ) if diff and os.path.exists(diff.abs_path): diffs_paths.append(diff.abs_path) else: - individual_diffs = [ - d.abs_path - for d in file_diffs - if d.rank == 0 - and d.version >= merged_diff.start - and d.version <= merged_diff.end - ] + individual_diffs = ( + FileDiff.query.filter_by( + basefile_id=basefile.id, + rank=0, + ) + .filter(FileDiff.version >= item.start, FileDiff.version <= item.end) + .order_by(FileDiff.version) + .all() + ) if individual_diffs: - diffs_paths.extend(individual_diffs) + diffs_paths.extend([i.abs_path for i in individual_diffs]) else: - logging.error( - f"Unable to find diffs for {merged_diff} for file {file_id}" - ) + logging.error(f"Unable to find diffs for {item} for file {file_id}") return 
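+    # ranges above were computed only up to end - 1 on purpose: applying the
+    # rank 0 diff of version `end` on top of them yields the target version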
+ # we apply latest change (if any) on previous version + end_diff = FileDiff.query.filter_by( + basefile_id=basefile.id, + rank=0, + version=end, + ).first() + if end_diff: diffs_paths.append(end_diff.abs_path) diff --git a/server/mergin/tests/test_celery.py b/server/mergin/tests/test_celery.py index 091e5787..364a4970 100644 --- a/server/mergin/tests/test_celery.py +++ b/server/mergin/tests/test_celery.py @@ -15,6 +15,7 @@ from ..config import Configuration from ..sync.models import ( FileDiff, + FileHistory, Project, AccessRequest, ProjectFilePath, @@ -216,13 +217,13 @@ def test_create_diff_checkpoint(diff_project): .id ) - basefile = os.path.join(diff_project.storage.project_dir, "test.gpkg") + base_gpkg = os.path.join(diff_project.storage.project_dir, "test.gpkg") shutil.copy( - os.path.join(diff_project.storage.project_dir, "v9", "test.gpkg"), basefile + os.path.join(diff_project.storage.project_dir, "v9", "test.gpkg"), base_gpkg ) for i in range(22): sql = f"UPDATE simple SET rating={i}" - execute_query(basefile, sql) + execute_query(base_gpkg, sql) pv = push_change( diff_project, "updated", "test.gpkg", diff_project.storage.project_dir ) @@ -232,6 +233,11 @@ def test_create_diff_checkpoint(diff_project): ).first() assert file_diff and os.path.exists(file_diff.abs_path) + basefile, diffs = FileHistory.diffs_chain(file_path_id, 32) + assert basefile.project_version_name == 9 + # so far we only have individual diffs + assert len(diffs) == 22 + # diff for v17-v20 from individual diffs create_diff_checkpoint(file_path_id, 17, 20) diff = FileDiff.query.filter_by( @@ -239,6 +245,12 @@ def test_create_diff_checkpoint(diff_project): ).first() assert os.path.exists(diff.abs_path) + basefile, diffs = FileHistory.diffs_chain(file_path_id, 20) + assert basefile.project_version_name == 9 + # 6 individual diffs (v11-v16) + merged diff (v17-v20) as the last one + assert len(diffs) == 7 + assert diffs[-1] == diff + # repeat - nothing to do mtime = os.path.getmtime(diff.abs_path) create_diff_checkpoint(file_path_id, 17, 20) diff --git a/server/mergin/tests/test_file_restore.py b/server/mergin/tests/test_file_restore.py index 278837d3..19e02ce2 100644 --- a/server/mergin/tests/test_file_restore.py +++ b/server/mergin/tests/test_file_restore.py @@ -112,17 +112,12 @@ def test_crud_in_version_file_restore(app, forward_check): assert gpkgs_are_equal(test_file, test_file + "_backup") -@pytest.mark.parametrize("forward_check", [True, False]) -def test_version_file_restore_with_no_changes(app, forward_check): +def test_version_file_restore_with_no_changes(app): """Test to restore gpkg file from diffs where history contains some blank versions (no changes).""" working_dir = os.path.join(TMP_DIR, "restore_from_diffs_with_gaps") basefile = os.path.join(working_dir, "base.gpkg") p = _prepare_restore_project(working_dir) - if not forward_check: - for _ in range(6): - create_blank_version(p) - base_version = p.get_latest_version().name for i in range(3): sql = "INSERT INTO simple (geometry, name) VALUES (GeomFromText('POINT(24.5, 38.2)', 4326), 'insert_test')" @@ -183,25 +178,8 @@ def test_version_file_restore(diff_project): diff_project.storage.restore_versioned_file("base.gpkg", 6) assert os.path.exists(test_file) assert gpkgs_are_equal(test_file, test_file + "_backup") - - # remove v9 and v10 to mimic that project history end with existing file - pv_8 = ProjectVersion.query.filter_by(project_id=diff_project.id, name=8).first() - pv_9 = ProjectVersion.query.filter_by(project_id=diff_project.id, name=9).first() 
- pv_10 = ProjectVersion.query.filter_by(project_id=diff_project.id, name=10).first() - diff_project.latest_version = 8 - db.session.delete(pv_9) - db.session.delete(pv_10) - db.session.commit() - diff_project.cache_latest_files() - # restore v6 backward, from the latest file (v7=v8) - test_file = os.path.join(diff_project.storage.project_dir, "v6", "base.gpkg") - if os.path.exists(test_file): - os.remove(test_file) - diff_project.storage.restore_versioned_file("base.gpkg", 6) - assert os.path.exists(test_file) - assert gpkgs_are_equal(test_file, test_file + "_backup") gh = GeodiffActionHistory.query.filter_by( - project_id=diff_project.id, base_version="v7", target_version="v6" + project_id=diff_project.id, base_version="v5", target_version="v6" ).first() assert gh.geodiff_time assert gh.copy_time diff --git a/server/mergin/tests/test_project_controller.py b/server/mergin/tests/test_project_controller.py index a284e569..22a45c6f 100644 --- a/server/mergin/tests/test_project_controller.py +++ b/server/mergin/tests/test_project_controller.py @@ -1832,90 +1832,73 @@ def test_optimize_storage(app, client, diff_project): def test_file_diffs_chain(diff_project): # file test.gpkg was added only in v9, and then left intact + file_id = ( + ProjectFilePath.query.filter_by(project_id=diff_project.id, path="test.gpkg") + .first() + .id + ) + # direct search - basefile, diffs = FileHistory.diffs_chain(diff_project, "test.gpkg", 2) - assert not basefile - assert not diffs - # reverse search - basefile, diffs = FileHistory.diffs_chain(diff_project, "test.gpkg", 8) + basefile, diffs = FileHistory.diffs_chain(file_id, 2) assert not basefile assert not diffs # ask for basefile - basefile, diffs = FileHistory.diffs_chain(diff_project, "test.gpkg", 9) + basefile, diffs = FileHistory.diffs_chain(file_id, 9) assert basefile.version.name == 9 assert basefile.change == "create" assert not diffs + file_id = ( + ProjectFilePath.query.filter_by(project_id=diff_project.id, path="base.gpkg") + .first() + .id + ) + # version history has been broken by removal of file in v2 - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 2) + basefile, diffs = FileHistory.diffs_chain(file_id, 2) assert not basefile assert not diffs # file was re-added in v3 - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 3) + basefile, diffs = FileHistory.diffs_chain(file_id, 3) assert basefile.version.name == 3 assert basefile.change == "create" assert not diffs # diff was used in v4, direct search - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 4) + basefile, diffs = FileHistory.diffs_chain(file_id, 4) assert basefile.version.name == 3 assert len(diffs) == 1 - assert "v4" in diffs[0].location + assert diffs[0].version == 4 # file was overwritten in v5 - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 5) + basefile, diffs = FileHistory.diffs_chain(file_id, 5) assert basefile.version.name == 5 assert basefile.change == "update" assert not diffs - # diff was used in v6, reverse search followed by direct search - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 6) + # diff was used in v6 + basefile, diffs = FileHistory.diffs_chain(file_id, 6) assert basefile.version.name == 5 assert len(diffs) == 1 - assert "v6" in diffs[0].location + assert diffs[0].version == 6 - # diff was used in v7, nothing happened in v8 (=v7), reverse search followed by direct search - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 8) + # diff was 
used in v7, nothing happened in v8 (=v7) + basefile, diffs = FileHistory.diffs_chain(file_id, 8) assert basefile.version.name == 5 assert len(diffs) == 2 # file was removed in v9 - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 9) + basefile, diffs = FileHistory.diffs_chain(file_id, 9) assert not basefile assert not diffs # ask for latest version, but file is already gone - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 10) + basefile, diffs = FileHistory.diffs_chain(file_id, 10) assert not basefile assert not diffs - # remove v9 and v10 to mimic that project history end with existing file - pv_8 = ProjectVersion.query.filter_by(project_id=diff_project.id, name=8).first() - pv_9 = ProjectVersion.query.filter_by(project_id=diff_project.id, name=9).first() - pv_10 = ProjectVersion.query.filter_by(project_id=diff_project.id, name=10).first() - diff_project.latest_version = 8 - db.session.delete(pv_9) - db.session.delete(pv_10) - db.session.commit() - - # diff was used in v6, v7, nothing happened in v8 => v7 = v8, reverse search - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 6) - assert basefile.version.name == 7 - assert len(diffs) == 1 - assert "v7" in diffs[0].location - - # we asked for last existing file version - basefile - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 7) - assert basefile.version.name == 7 - assert not diffs - - # we asked for last project version - basefile, diffs = FileHistory.diffs_chain(diff_project, "base.gpkg", 8) - assert basefile.version.name == 7 - assert not diffs - changeset_data = [ ("v1", "test.gpkg", 404), From dd6463e46b8c7a271a69a4aafa8cf10546f1d1d9 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Thu, 25 Sep 2025 14:22:55 +0200 Subject: [PATCH 07/36] API: Add new v2 endpoint to download diff file --- server/mergin/sync/models.py | 150 ++++++++++++++++-- server/mergin/sync/private_api_controller.py | 31 ++-- server/mergin/sync/public_api_controller.py | 34 +--- server/mergin/sync/public_api_v2.yaml | 32 ++++ .../mergin/sync/public_api_v2_controller.py | 24 ++- server/mergin/sync/utils.py | 28 +++- server/mergin/tests/test_public_api_v2.py | 145 ++++++++++++++++- .../bd1ec73db389_create_file_diff_table.py | 4 +- 8 files changed, 376 insertions(+), 72 deletions(-) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 46236fa4..b0f2795f 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-MerginMaps-Commercial from __future__ import annotations import json +import logging import os import time import uuid @@ -18,7 +19,7 @@ from sqlalchemy.dialects.postgresql import ARRAY, BIGINT, UUID, JSONB, ENUM from sqlalchemy.types import String from sqlalchemy.ext.hybrid import hybrid_property -from pygeodiff.geodifflib import GeoDiffLibError +from pygeodiff.geodifflib import GeoDiffLibError, GeoDiffLibConflictError from flask import current_app from .files import ( @@ -31,7 +32,14 @@ from .storages.disk import move_to_tmp from ..app import db from .storages import DiskStorage -from .utils import get_merged_versions, is_versioned_file, is_qgis +from .utils import ( + LOG_BASE, + CachedLevel, + generate_checksum, + get_merged_versions, + is_versioned_file, + is_qgis, +) Storages = {"local": DiskStorage} project_deleted = signal("project_deleted") @@ -509,12 +517,12 @@ def __init__( if diff is not None: basefile = FileHistory.get_basefile(file.id, version_name) 
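# Aside: a minimal, float-free sketch of the contract the FileHistory.get_basefile()
# helper used on the line above fulfils: resolve the nearest CREATE/UPDATE change
# strictly below the given version, i.e. the start of the diffable chain.
# `basefile_version` and the tuple-based history are illustrative stand-ins,
# not the real ORM query.
def basefile_version(history, version):
    """history: (project_version_name, change) pairs; latest create/update < version."""
    candidates = [
        v for v, change in history if v < version and change in ("create", "update")
    ]
    return max(candidates, default=None)

# base.gpkg's history as exercised by the test suite: created in v3, diff-updated
# in v4, force-overwritten in v5, diff-updated again in v6 and v7:
history = [(3, "create"), (4, "update_diff"), (5, "update"), (6, "update_diff"), (7, "update_diff")]
assert basefile_version(history, 4) == 3  # the v4 diff hangs off the v3 create
assert basefile_version(history, 6) == 5  # the v5 overwrite starts a new chain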
diff_file = FileDiff( - basefile, - diff.get("path"), - diff.get("size"), - diff.get("checksum"), + basefile=basefile, + path=diff.get("path"), rank=0, version=version_name, + size=diff.get("size"), + checksum=diff.get("checksum"), ) db.session.add(diff_file) @@ -727,8 +735,8 @@ class FileDiff(db.Model): version = db.Column(db.Integer, nullable=False, index=True) # path on FS relative to project directory location = db.Column(db.String) - size = db.Column(db.BigInteger, nullable=False) - checksum = db.Column(db.String, nullable=False) + size = db.Column(db.BigInteger, nullable=True) + checksum = db.Column(db.String, nullable=True) __table_args__ = ( db.UniqueConstraint("file_path_id", "rank", "version", name="unique_diff"), @@ -741,10 +749,10 @@ def __init__( self, basefile: FileHistory, path: str, - size: int, - checksum: str, rank: int, version: int, + size: int = None, + checksum: str = None, ): self.basefile_id = basefile.id self.file_path_id = basefile.file_path_id @@ -766,6 +774,128 @@ def abs_path(self) -> str: """ return os.path.join(self.file.project.storage.project_dir, self.location) + @property + def cache_level(self) -> Optional[CachedLevel]: + """ + Return cache level representation for diff file + """ + # individual diff for any version + if self.rank == 0: + return CachedLevel(rank=self.rank, index=self.version) + + # merged diffs can only be created for certain versions + if self.version % LOG_BASE: + return + + index = self.version // LOG_BASE**self.rank + # some invalid record + if index < 1 or self.rank < 0: + return + + return CachedLevel(rank=self.rank, index=index) + + def construct_checkpoint(self) -> None: + """Create a diff file checkpoint (aka. merged diff). + Find all smaller diffs which are needed to create the final diff file and merge them. + In case of missing some lower rank checkpoint, use individual diffs instead. 
+ """ + if os.path.exists(self.abs_path): + return + + basefile = FileHistory.get_basefile(self.file_path_id, self.cache_level.end) + if not basefile: + logging.error(f"Unable to find basefile for file {self.file_path_id}") + return + + if basefile.project_version_name > self.cache_level.start: + logging.error( + f"Basefile version {basefile.project_version_name} is higher than start version {self.cache_level.start} - broken history" + ) + return + + diffs_paths = [] + # let's confirm we have all intermediate diffs needed, if not, we need to use individual diffs instead + cached_items = get_merged_versions( + self.cache_level.start, self.cache_level.end - 1 + ) + expected_diffs = ( + FileDiff.query.filter_by( + basefile_id=basefile.id, + ) + .filter( + tuple_(FileDiff.rank, FileDiff.version).in_( + [(item.rank, item.end) for item in cached_items] + ) + ) + .all() + ) + + for item in cached_items: + # basefile is a start of the diff chain + if item.start <= basefile.project_version_name: + continue + + # find diff in table and on disk + diff = next( + ( + d + for d in expected_diffs + if d.rank == item.rank and d.version == item.end + ), + None, + ) + if diff and os.path.exists(diff.abs_path): + diffs_paths.append(diff.abs_path) + else: + individual_diffs = ( + FileDiff.query.filter_by( + basefile_id=basefile.id, + rank=0, + ) + .filter( + FileDiff.version >= item.start, FileDiff.version <= item.end + ) + .order_by(FileDiff.version) + .all() + ) + if individual_diffs: + diffs_paths.extend([i.abs_path for i in individual_diffs]) + else: + logging.error( + f"Unable to find diffs for {item} for file {self.file_path_id}" + ) + return + + # we apply latest change (if any) on previous version + end_diff = FileDiff.query.filter_by( + basefile_id=basefile.id, + rank=0, + version=self.cache_level.end, + ).first() + + if end_diff: + diffs_paths.append(end_diff.abs_path) + + if not diffs_paths: + logging.warning( + f"No diffs for next checkpoint for file {self.file_path_id}" + ) + return + + project: Project = basefile.file.project + os.makedirs(project.storage.diffs_dir, exist_ok=True) + try: + project.storage.geodiff.concat_changes(diffs_paths, self.abs_path) + except (GeoDiffLibError, GeoDiffLibConflictError): + logging.error( + f"Geodiff: Failed to merge diffs for file {self.file_path_id}" + ) + return + + self.size = os.path.getsize(self.abs_path) + self.checksum = generate_checksum(self.abs_path) + db.session.commit() + class ProjectVersion(db.Model): id = db.Column(db.Integer, primary_key=True, autoincrement=True) diff --git a/server/mergin/sync/private_api_controller.py b/server/mergin/sync/private_api_controller.py index 13f059b9..6c5fd802 100644 --- a/server/mergin/sync/private_api_controller.py +++ b/server/mergin/sync/private_api_controller.py @@ -5,15 +5,7 @@ from datetime import datetime, timedelta, timezone from urllib.parse import quote from connexion import NoContent -from flask import ( - render_template, - request, - current_app, - jsonify, - abort, - make_response, - send_file, -) +from flask import render_template, request, current_app, jsonify, abort from flask_login import current_user from sqlalchemy.orm import defer from sqlalchemy import text @@ -41,8 +33,7 @@ ) from ..utils import parse_order_params, split_order_param, get_order_param from .tasks import create_project_version_zip -from .storages.disk import move_to_tmp -from .utils import get_x_accel_uri +from .utils import prepare_download_response @auth_required @@ -336,22 +327,20 @@ def download_project(id: str, 
version=None): # noqa: E501 # pylint: disable=W06 # check zip is already created if os.path.exists(project_version.zip_path): + response = prepare_download_response( + os.path.dirname(project_version.zip_path), + os.path.basename(project_version.zip_path), + ) if current_app.config["USE_X_ACCEL"]: - resp = make_response() - resp.headers["X-Accel-Redirect"] = get_x_accel_uri(project_version.zip_path) - resp.headers["X-Accel-Buffering"] = current_app.config.get( + response.headers["X-Accel-Buffering"] = current_app.config.get( "PROJECTS_ARCHIVES_X_ACCEL_BUFFERING" ) - resp.headers["X-Accel-Expires"] = "off" - resp.headers["Content-Type"] = "application/zip" - else: - resp = send_file(project_version.zip_path, mimetype="application/zip") - + # set custom file in header file_name = quote(f"{project.name}-v{lookup_version}.zip".encode("utf-8")) - resp.headers["Content-Disposition"] = ( + response.headers["Content-Disposition"] = ( f"attachment; filename*=UTF-8''{file_name}" ) - return resp + return response # GET request triggers background job if no partial zip or expired one if request.method == "GET": temp_zip_path = project_version.zip_path + ".partial" diff --git a/server/mergin/sync/public_api_controller.py b/server/mergin/sync/public_api_controller.py index b3fb3c1b..6c56d429 100644 --- a/server/mergin/sync/public_api_controller.py +++ b/server/mergin/sync/public_api_controller.py @@ -9,7 +9,6 @@ import logging from dataclasses import asdict from typing import Dict -from urllib.parse import quote import uuid from datetime import datetime @@ -20,7 +19,6 @@ from flask import ( abort, current_app, - send_from_directory, jsonify, make_response, ) @@ -28,10 +26,8 @@ from flask_login import current_user from sqlalchemy import and_, desc, asc from sqlalchemy.exc import IntegrityError -from binaryornot.check import is_binary from gevent import sleep import base64 - from werkzeug.exceptions import HTTPException from mergin.sync.forms import project_name_validation @@ -40,7 +36,6 @@ from ..auth import auth_required from ..auth.models import User from .models import ( - FileDiff, Project, ProjectVersion, Upload, @@ -78,7 +73,6 @@ from .utils import ( generate_checksum, Toucher, - get_x_accel_uri, is_file_name_blacklisted, get_ip, get_user_agent, @@ -91,7 +85,7 @@ is_valid_path, is_supported_type, is_supported_extension, - get_mimetype, + prepare_download_response, ) from .errors import StorageLimitHit, ProjectLocked from ..utils import format_time_delta @@ -353,30 +347,8 @@ def download_project_file( logging.error(f"Missing file {namespace}/{project_name}/{file_path}") abort(404) - if current_app.config["USE_X_ACCEL"]: - # encoding for nginx to be able to download file with non-ascii chars - encoded_file_path = quote(file_path.encode("utf-8")) - resp = make_response() - resp.headers["X-Accel-Redirect"] = get_x_accel_uri( - project.storage_params["location"], encoded_file_path - ) - resp.headers["X-Accel-Buffering"] = True - resp.headers["X-Accel-Expires"] = "off" - else: - resp = send_from_directory( - os.path.dirname(abs_path), os.path.basename(abs_path) - ) - - if not is_binary(abs_path): - mime_type = "text/plain" - else: - mime_type = get_mimetype(abs_path) - resp.headers["Content-Type"] = mime_type - resp.headers["Content-Disposition"] = "attachment; filename={}".format( - quote(os.path.basename(file).encode("utf-8")) - ) - resp.direct_passthrough = False - return resp + response = prepare_download_response(project.storage.project_dir, file_path) + return response def get_project(project_name, 
namespace, since="", version=None): # noqa: E501 diff --git a/server/mergin/sync/public_api_v2.yaml b/server/mergin/sync/public_api_v2.yaml index 04dbce61..45521a5d 100644 --- a/server/mergin/sync/public_api_v2.yaml +++ b/server/mergin/sync/public_api_v2.yaml @@ -219,6 +219,38 @@ paths: "404": $ref: "#/components/responses/NotFound" x-openapi-router-controller: mergin.sync.public_api_v2_controller + /projects/{id}/raw/diff/{file}: + get: + tags: + - project + summary: Download project geopackage diff file + operationId: download_diff_file + parameters: + - $ref: "#/components/parameters/ProjectId" + - name: file + required: true + in: path + description: File path + schema: + type: string + example: survey.gpkg-diff-1b9fe848-d2e4-4c53-958d-3dd97e5486f6 + responses: + "200": + description: File content (or its part) + content: + application/octet-stream: + schema: + type: string + format: binary + "400": + $ref: "#/components/responses/BadRequest" + "401": + $ref: "#/components/responses/Unauthorized" + "403": + $ref: "#/components/responses/Forbidden" + "404": + $ref: "#/components/responses/NotFound" + x-openapi-router-controller: mergin.sync.public_api_v2_controller components: responses: NoContent: diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py index 7f40c54b..3b6cda40 100644 --- a/server/mergin/sync/public_api_v2_controller.py +++ b/server/mergin/sync/public_api_v2_controller.py @@ -2,20 +2,21 @@ # # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-MerginMaps-Commercial +import os from datetime import datetime from connexion import NoContent, request from flask import abort, jsonify from flask_login import current_user -from mergin.sync.forms import project_name_validation - +from .forms import project_name_validation from .schemas import ProjectMemberSchema from .workspace import WorkspaceRole from ..app import db from ..auth import auth_required from ..auth.models import User -from .models import Project, ProjectRole, ProjectMember +from .models import FileDiff, Project, ProjectRole, ProjectMember from .permissions import ProjectPermissions, require_project_by_uuid +from .utils import prepare_download_response @auth_required @@ -128,3 +129,20 @@ def remove_project_collaborator(id, user_id): project.unset_role(user_id) db.session.commit() return NoContent, 204 + + +def download_diff_file(id: str, file: str): + """Download project geopackage diff file""" + project = require_project_by_uuid(id, ProjectPermissions.Read) + diff_file = FileDiff.query.filter_by(path=file).first_or_404() + + # create merged diff if it does not exist + if not os.path.exists(diff_file.abs_path): + diff_file.construct_checkpoint() + if not os.path.exists(diff_file.abs_path): + abort(404) + + response = prepare_download_response( + project.storage.project_dir, diff_file.location + ) + return response diff --git a/server/mergin/sync/utils.py b/server/mergin/sync/utils.py index c4497e13..a0f61fd6 100644 --- a/server/mergin/sync/utils.py +++ b/server/mergin/sync/utils.py @@ -7,13 +7,15 @@ import hashlib import re import secrets +from binaryornot.check import is_binary from dataclasses import dataclass from threading import Timer +from urllib.parse import quote from uuid import UUID from shapely import wkb from shapely.errors import ShapelyError from gevent import sleep -from flask import Request +from flask import Request, Response, make_response, send_from_directory from typing import List, Optional from sqlalchemy import text from pathvalidate 
import ( @@ -580,6 +582,30 @@ def get_x_accel_uri(*url_parts): return result +def prepare_download_response(project_dir: str, path: str) -> Response: + """Prepare flask response for file download with custom headers""" + abs_path = os.path.join(project_dir, path) + if current_app.config["USE_X_ACCEL"]: + # encoding for nginx to be able to download file with non-ascii chars + resp = make_response() + resp.headers["X-Accel-Redirect"] = get_x_accel_uri( + project_dir, quote(path.encode("utf-8")) + ) + resp.headers["X-Accel-Buffering"] = True + resp.headers["X-Accel-Expires"] = "off" + else: + resp = send_from_directory( + os.path.dirname(abs_path), os.path.basename(abs_path) + ) + + mime_type = "text/plain" if not is_binary(abs_path) else get_mimetype(abs_path) + resp.headers["Content-Type"] = mime_type + file_name = quote(os.path.basename(path).encode("utf-8")) + resp.headers["Content-Disposition"] = f"attachment; filename*=UTF-8''{file_name}" + resp.direct_passthrough = False + return resp + + LOG_BASE = 4 diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index 2d88d652..9111b796 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -1,13 +1,17 @@ # Copyright (C) Lutra Consulting Limited # # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-MerginMaps-Commercial -from .utils import add_user +import os +import shutil +from unittest.mock import patch +import uuid +from pygeodiff import GeoDiffLibError + +from .utils import add_user, diffs_are_equal, execute_query, push_change from ..app import db -from mergin.sync.models import Project from tests import test_project, test_workspace_id - from ..config import Configuration -from ..sync.models import ProjectRole +from ..sync.models import FileDiff, FileHistory, Project, ProjectFilePath, ProjectRole def test_schedule_delete_project(client): @@ -126,3 +130,136 @@ def test_project_members(client): # access provided by workspace role cannot be removed directly response = client.delete(url + f"/{user.id}") assert response.status_code == 404 + + +def test_file_diff_download(client, diff_project): + """Test download of gpkg diff files""" + gpkg_file = ProjectFilePath.query.filter_by( + project_id=diff_project.id, path="base.gpkg" + ).first() + + diff_file = FileDiff.query.filter_by( + file_path_id=gpkg_file.id, version=4, rank=0 + ).first() + + response = client.get(f"v2/projects/{diff_project.id}/raw/diff/{diff_file.path}") + assert response.status_code == 200 + assert response.content_type == "application/octet-stream" + + # add some indented merged diff to db, v5-v8 + basefile = FileHistory.get_basefile(gpkg_file.id, 8) + diff = FileDiff( + basefile=basefile, + version=8, + rank=1, + path=f"base.gpkg-{uuid.uuid4()}", + size=None, + checksum=None, + ) + db.session.add(diff) + db.session.commit() + assert not os.path.exists(diff.abs_path) + + # download merged diff with its reconstuction on the fly + response = client.get(f"v2/projects/{diff_project.id}/raw/diff/{diff.path}") + assert response.status_code == 200 + assert response.content_type == "application/octet-stream" + assert os.path.exists(diff.abs_path) + + response = client.get(f"v2/projects/{diff_project.id}/raw/diff/{diff.path}+1") + assert response.status_code == 404 + + +def test_create_diff_checkpoint(diff_project): + """Test creation of diff checkpoints""" + # add changes v11-v32 where v9 is a basefile + file_path_id = ( + ProjectFilePath.query.filter_by(project_id=diff_project.id, 
path="test.gpkg") + .first() + .id + ) + + base_gpkg = os.path.join(diff_project.storage.project_dir, "test.gpkg") + shutil.copy( + os.path.join(diff_project.storage.project_dir, "v9", "test.gpkg"), base_gpkg + ) + for i in range(22): + sql = f"UPDATE simple SET rating={i}" + execute_query(base_gpkg, sql) + pv = push_change( + diff_project, "updated", "test.gpkg", diff_project.storage.project_dir + ) + assert diff_project.latest_version == pv.name == (11 + i) + file_diff = FileDiff.query.filter_by( + file_path_id=file_path_id, version=pv.name, rank=0 + ).first() + assert file_diff and os.path.exists(file_diff.abs_path) + + basefile, diffs = FileHistory.diffs_chain(file_path_id, 32) + assert basefile.project_version_name == 9 + # so far we only have individual diffs + assert len(diffs) == 22 + + # diff for v17-v20 from individual diffs + diff = FileDiff( + basefile=basefile, path=f"test.gpkg-diff-{uuid.uuid4()}", version=20, rank=1 + ) + db.session.add(diff) + db.session.commit() + assert not os.path.exists(diff.abs_path) + diff.construct_checkpoint() + assert os.path.exists(diff.abs_path) + + basefile, diffs = FileHistory.diffs_chain(file_path_id, 20) + assert basefile.project_version_name == 9 + # 6 individual diffs (v11-v16) + merged diff (v17-v20) as the last one + assert len(diffs) == 7 + assert diffs[-1] == diff + + # repeat - nothing to do + mtime = os.path.getmtime(diff.abs_path) + diff.construct_checkpoint() + assert mtime == os.path.getmtime(diff.abs_path) + + # diff for v17-v32 with merged diffs (using one above) + diff = FileDiff( + basefile=basefile, path=f"test.gpkg-diff-{uuid.uuid4()}", version=32, rank=2 + ) + db.session.add(diff) + db.session.commit() + diff.construct_checkpoint() + assert os.path.exists(diff.abs_path) + + # assert gpkg diff is the same as it would be from merging all individual diffs + individual_diffs = ( + FileDiff.query.filter_by(file_path_id=file_path_id, rank=0) + .filter(FileDiff.version.between(17, 32)) + .all() + ) + merged_diff = os.path.join(diff_project.storage.diffs_dir, "merged-diff") + diff_project.storage.geodiff.concat_changes( + [d.abs_path for d in individual_diffs], merged_diff + ) + assert diffs_are_equal(diff.abs_path, merged_diff) + + # test various failures + with patch.object(diff_project.storage.geodiff, "concat_changes") as mock: + # diff for missing basefile (e.g. 
deleted file or not-existing file) + diff = FileDiff( + basefile=basefile, path=f"test.gpkg-diff-{uuid.uuid4()}", version=8, rank=1 + ) + db.session.add(diff) + db.session.commit() + diff.construct_checkpoint() + assert not mock.called + + # geodiff failure + mock.side_effect = GeoDiffLibError + diff = FileDiff( + basefile=basefile, path=f"test.gpkg-diff-{uuid.uuid4()}", version=16, rank=1 + ) + db.session.add(diff) + db.session.commit() + diff.construct_checkpoint() + assert mock.called + assert not os.path.exists(diff.abs_path) diff --git a/server/migrations/community/bd1ec73db389_create_file_diff_table.py b/server/migrations/community/bd1ec73db389_create_file_diff_table.py index 78ce673c..5a30b862 100644 --- a/server/migrations/community/bd1ec73db389_create_file_diff_table.py +++ b/server/migrations/community/bd1ec73db389_create_file_diff_table.py @@ -27,8 +27,8 @@ def upgrade(): sa.Column("rank", sa.Integer(), nullable=False), sa.Column("version", sa.Integer(), nullable=False), sa.Column("location", sa.String(), nullable=True), - sa.Column("size", sa.BigInteger(), nullable=False), - sa.Column("checksum", sa.String(), nullable=False), + sa.Column("size", sa.BigInteger(), nullable=True), + sa.Column("checksum", sa.String(), nullable=True), sa.ForeignKeyConstraint( ["basefile_id"], ["file_history.id"], From 85ef488a99f501be9e48f618d5d14d63af2226b2 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Thu, 25 Sep 2025 16:00:55 +0200 Subject: [PATCH 08/36] Remove celery caching job and trigger on project push Specification changed - rollback recent changes --- server/mergin/sync/db_events.py | 54 -------- server/mergin/sync/tasks.py | 119 ------------------ server/mergin/tests/test_celery.py | 117 ----------------- server/mergin/tests/test_db_hooks.py | 40 ------ .../mergin/tests/test_project_controller.py | 5 +- 5 files changed, 1 insertion(+), 334 deletions(-) diff --git a/server/mergin/sync/db_events.py b/server/mergin/sync/db_events.py index c0bb96fd..18d1ce60 100644 --- a/server/mergin/sync/db_events.py +++ b/server/mergin/sync/db_events.py @@ -5,13 +5,8 @@ import os from flask import current_app, abort from sqlalchemy import event -from sqlalchemy.sql import text from ..app import db -from .models import ProjectVersion -from .public_api_controller import project_version_created -from .tasks import create_diff_checkpoint -from .utils import get_cached_levels def check(session): @@ -19,58 +14,9 @@ def check(session): abort(503, "Service unavailable due to maintenance, please try later") -def create_checkpoints(project_version: ProjectVersion): - """ - Create version checkpoints related to new project version - """ - # for initial versions there is nothing to do - if project_version.name in (0, 1): - return - - cache_levels = get_cached_levels(project_version.name) - if not cache_levels: - return - - # get all diff-modified gpkg files - query = text( - """ - WITH gpkg_files AS ( - SELECT id - FROM project_file_path - WHERE - project_id = :project_id - AND lower(path) LIKE '%.gpkg' - ), - latest_updates AS ( - SELECT DISTINCT - gf.id, - max(fh.project_version_name) AS latest_version - FROM gpkg_files gf - INNER JOIN file_history fh ON fh.file_path_id = gf.id - GROUP BY gf.id - ) - SELECT - lu.id - FROM latest_updates lu - LEFT OUTER JOIN file_history fh ON lu.id = fh.file_path_id AND lu.latest_version = fh.project_version_name - WHERE fh.change = 'update_diff'; - """ - ) - result = db.session.execute( - query, {"project_id": project_version.project_id} - ).fetchall() - - # create batch of 
caching jobs - for row in result: - for level in cache_levels: - create_diff_checkpoint.delay(row.id, level.start, level.end) - - def register_events(): event.listen(db.session, "before_commit", check) - project_version_created.connect(create_checkpoints) def remove_events(): event.remove(db.session, "before_commit", check) - project_version_created.disconnect(create_checkpoints) diff --git a/server/mergin/sync/tasks.py b/server/mergin/sync/tasks.py index d3442efb..c40343eb 100644 --- a/server/mergin/sync/tasks.py +++ b/server/mergin/sync/tasks.py @@ -169,122 +169,3 @@ def remove_projects_archives(): os.remove(path) except OSError as e: logging.error(f"Unable to remove {path}: {str(e)}") - - -@celery.task -def create_diff_checkpoint(file_id: int, start: int, end: int): - """Create a diff file checkpoint (aka. merged diff). - Find all smaller diffs which are needed to create the final diff file and merge them. - In case of missing some lower rank checkpoint, use individual diffs instead. - """ - db.session.info = {"msg": "create_diff_checkpoint"} - diff_range = end - start + 1 - - # invalid request as there would not be a checkpoint with this range - if end % LOG_BASE or diff_range % LOG_BASE: - return - - rank = math.log(diff_range) / math.log(LOG_BASE) - if not rank.is_integer(): - return - - # checkpoint already exists - file_diff = FileDiff.query.filter_by( - file_path_id=file_id, version=end, rank=rank - ).first() - if file_diff and os.path.exists(file_diff.abs_path): - return - - basefile = FileHistory.get_basefile(file_id, end) - if not basefile: - logging.error(f"Unable to find basefile for file {file_id}") - return - - if basefile.project_version_name > start: - logging.error( - f"Basefile version {basefile.project_version_name} is higher than start version {start} - broken history" - ) - return - - diffs_paths = [] - - # let's confirm we have all intermediate diffs needed, if not, we need to use individual diffs instead - cached_items = get_merged_versions(start, end - 1) - expected_diffs = ( - FileDiff.query.filter_by( - basefile_id=basefile.id, - ) - .filter( - tuple_(FileDiff.rank, FileDiff.version).in_( - [(item.rank, item.end) for item in cached_items] - ) - ) - .all() - ) - - for item in cached_items: - # basefile is a start of the diff chain - if item.start <= basefile.project_version_name: - continue - - # find diff in table and on disk - diff = next( - ( - d - for d in expected_diffs - if d.rank == item.rank and d.version == item.end - ), - None, - ) - if diff and os.path.exists(diff.abs_path): - diffs_paths.append(diff.abs_path) - else: - individual_diffs = ( - FileDiff.query.filter_by( - basefile_id=basefile.id, - rank=0, - ) - .filter(FileDiff.version >= item.start, FileDiff.version <= item.end) - .order_by(FileDiff.version) - .all() - ) - if individual_diffs: - diffs_paths.extend([i.abs_path for i in individual_diffs]) - else: - logging.error(f"Unable to find diffs for {item} for file {file_id}") - return - - # we apply latest change (if any) on previous version - end_diff = FileDiff.query.filter_by( - basefile_id=basefile.id, - rank=0, - version=end, - ).first() - - if end_diff: - diffs_paths.append(end_diff.abs_path) - - if not diffs_paths: - logging.warning(f"No diffs for next checkpoint for file {file_id}") - return - - project: Project = basefile.file.project - checkpoint_path = f"diff-{uuid.uuid4()}" - os.makedirs(project.storage.diffs_dir, exist_ok=True) - checkpoint_file = os.path.join(project.storage.diffs_dir, checkpoint_path) - try: - 
project.storage.geodiff.concat_changes(diffs_paths, checkpoint_file) - except (GeoDiffLibError, GeoDiffLibConflictError): - logging.error(f"Geodiff: Failed to merge diffs for file {file_id}") - return - - checkpoint = FileDiff( - basefile=basefile, - path=checkpoint_path, - size=os.path.getsize(checkpoint_file), - checksum=generate_checksum(checkpoint_file), - rank=rank, - version=end, - ) - db.session.add(checkpoint) - db.session.commit() diff --git a/server/mergin/tests/test_celery.py b/server/mergin/tests/test_celery.py index 364a4970..bf1bee65 100644 --- a/server/mergin/tests/test_celery.py +++ b/server/mergin/tests/test_celery.py @@ -5,26 +5,20 @@ import os from datetime import datetime, timedelta from pathlib import Path -import shutil from flask import current_app from flask_mail import Mail -from pygeodiff import GeoDiffLibError from unittest.mock import patch from ..app import db from ..config import Configuration from ..sync.models import ( - FileDiff, - FileHistory, Project, AccessRequest, - ProjectFilePath, ProjectRole, ProjectVersion, ) from ..celery import send_email_async from ..sync.tasks import ( - create_diff_checkpoint, remove_temp_files, remove_projects_backups, create_project_version_zip, @@ -36,12 +30,8 @@ add_user, create_workspace, create_project, - diffs_are_equal, - execute_query, - gpkgs_are_equal, login, modify_file_times, - push_change, ) from ..auth.models import User from . import json_headers @@ -207,110 +197,3 @@ def test_create_project_version_zip(diff_project): modify_file_times(latest_version.zip_path, new_time) remove_projects_archives() # zip has expired -> remove assert not os.path.exists(latest_version.zip_path) - - -def test_create_diff_checkpoint(diff_project): - # add changes v11-v32 where v9 is a basefile - file_path_id = ( - ProjectFilePath.query.filter_by(project_id=diff_project.id, path="test.gpkg") - .first() - .id - ) - - base_gpkg = os.path.join(diff_project.storage.project_dir, "test.gpkg") - shutil.copy( - os.path.join(diff_project.storage.project_dir, "v9", "test.gpkg"), base_gpkg - ) - for i in range(22): - sql = f"UPDATE simple SET rating={i}" - execute_query(base_gpkg, sql) - pv = push_change( - diff_project, "updated", "test.gpkg", diff_project.storage.project_dir - ) - assert diff_project.latest_version == pv.name == (11 + i) - file_diff = FileDiff.query.filter_by( - file_path_id=file_path_id, version=pv.name, rank=0 - ).first() - assert file_diff and os.path.exists(file_diff.abs_path) - - basefile, diffs = FileHistory.diffs_chain(file_path_id, 32) - assert basefile.project_version_name == 9 - # so far we only have individual diffs - assert len(diffs) == 22 - - # diff for v17-v20 from individual diffs - create_diff_checkpoint(file_path_id, 17, 20) - diff = FileDiff.query.filter_by( - file_path_id=file_path_id, version=20, rank=1 - ).first() - assert os.path.exists(diff.abs_path) - - basefile, diffs = FileHistory.diffs_chain(file_path_id, 20) - assert basefile.project_version_name == 9 - # 6 individual diffs (v11-v16) + merged diff (v17-v20) as the last one - assert len(diffs) == 7 - assert diffs[-1] == diff - - # repeat - nothing to do - mtime = os.path.getmtime(diff.abs_path) - create_diff_checkpoint(file_path_id, 17, 20) - assert mtime == os.path.getmtime(diff.abs_path) - - # diff for v17-v32 with merged diffs (using one above) - create_diff_checkpoint(file_path_id, 17, 32) - diff = FileDiff.query.filter_by( - file_path_id=file_path_id, version=32, rank=2 - ).first() - assert os.path.exists(diff.abs_path) - # assert gpkg diff is 
the same as it would be from merging all individual diffs - individual_diffs = ( - FileDiff.query.filter_by(file_path_id=file_path_id, rank=0) - .filter(FileDiff.version.between(17, 32)) - .all() - ) - merged_diff = os.path.join(diff_project.storage.diffs_dir, "merged-diff") - diff_project.storage.geodiff.concat_changes( - [d.abs_path for d in individual_diffs], merged_diff - ) - assert diffs_are_equal(diff.abs_path, merged_diff) - - # test various failures - with patch.object(diff_project.storage.geodiff, "concat_changes") as mock: - # diff for not existing version - create_diff_checkpoint(file_path_id, 33, 36) - assert not FileDiff.query.filter_by( - file_path_id=file_path_id, version=36 - ).count() - - # diff for invalid range - create_diff_checkpoint(file_path_id, 17, 31) - assert not FileDiff.query.filter_by( - file_path_id=file_path_id, version=31, rank=1 - ).count() - - create_diff_checkpoint(file_path_id, 27, 32) - assert not FileDiff.query.filter_by( - file_path_id=file_path_id, version=32, rank=1 - ).count() - - # diff with broken history at v9 - create_diff_checkpoint(file_path_id, 5, 20) - assert not FileDiff.query.filter_by( - file_path_id=file_path_id, version=20, rank=2 - ).count() - - # diff for missing basefile (e.g. deleted file or not-existing file) - create_diff_checkpoint(file_path_id, 5, 8) - assert not FileDiff.query.filter_by( - file_path_id=file_path_id, version=8, rank=1 - ).count() - - assert not mock.called - - # geodiff failure - mock.side_effect = GeoDiffLibError - create_diff_checkpoint(file_path_id, 13, 16) - assert mock.called - assert not FileDiff.query.filter_by( - file_path_id=file_path_id, version=16, rank=1 - ).count() diff --git a/server/mergin/tests/test_db_hooks.py b/server/mergin/tests/test_db_hooks.py index f67992bc..e7f9e270 100644 --- a/server/mergin/tests/test_db_hooks.py +++ b/server/mergin/tests/test_db_hooks.py @@ -4,9 +4,6 @@ import os from pathlib import Path -from unittest.mock import patch - -import pytest from ..sync.models import ( Project, @@ -22,7 +19,6 @@ ProjectUser, ) from ..sync.files import UploadChanges -from ..sync.public_api_controller import project_version_created from ..auth.models import User from ..app import db from . import DEFAULT_USER @@ -171,39 +167,3 @@ def test_remove_project(client, diff_project): # try to remove the deleted project assert diff_project.delete() is None - - -test_caching_call_data = [ - (4, True), # success - (8, True), # success - (5, False), # call not divisible by 4 - (4, False), # fake last change to be a breaking change -] - - -@pytest.mark.parametrize("version,called", test_caching_call_data) -@patch("mergin.sync.tasks.create_diff_checkpoint.delay") -def test_trigger_diff_caching(checkpoint_mock, diff_project, version, called): - # make target version the latest version - ProjectVersion.query.filter_by(project_id=diff_project.id).filter( - ProjectVersion.name > version - ).delete() - db.session.commit() - - pv = ProjectVersion.query.filter_by( - project_id=diff_project.id, name=version - ).first() - # modify the last change to be a breaking change - if not called and version == 4: - fh = FileHistory.query.filter_by(version_id=pv.id, change="update_diff").first() - fh.change = "delete" - db.session.commit() - - project_version_created.send(pv) - assert checkpoint_mock.called == called - - if called: - # we asked for to cache first level, e.g. 
with versions 1..4 - _, start, end = checkpoint_mock.call_args[0] - assert start == version - 3 - assert end == version diff --git a/server/mergin/tests/test_project_controller.py b/server/mergin/tests/test_project_controller.py index 22a45c6f..b9ece5e2 100644 --- a/server/mergin/tests/test_project_controller.py +++ b/server/mergin/tests/test_project_controller.py @@ -1087,8 +1087,7 @@ def test_push_project_start(client, data, expected): assert failure.error_type == "push_start" -@patch("mergin.sync.tasks.create_diff_checkpoint.delay") -def test_push_to_new_project(checkpoint_mock, client): +def test_push_to_new_project(client): # create blank project p = Project.query.filter_by( name=test_project, workspace_id=test_workspace_id @@ -1108,8 +1107,6 @@ def test_push_to_new_project(checkpoint_mock, client): headers=json_headers, ) assert resp.status_code == 200 - # nothing to cache in new project - assert not checkpoint_mock.called upload_id = resp.json["transaction"] upload = Upload.query.filter_by(id=upload_id).first() From 4ab69f60cce7def5a67ffa2972ac9ecec0267e15 Mon Sep 17 00:00:00 2001 From: "marcel.kocisek" Date: Wed, 1 Oct 2025 15:21:54 +0200 Subject: [PATCH 09/36] Initial migration --- server/mergin/sync/models.py | 25 +++++++++++ ...f54ee8c4acd_add_project_version_changes.py | 42 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 server/migrations/community/4f54ee8c4acd_add_project_version_changes.py diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index b0f2795f..9beb6ec6 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -44,6 +44,7 @@ Storages = {"local": DiskStorage} project_deleted = signal("project_deleted") project_access_granted = signal("project_access_granted") +project_version_created = signal("project_version_created") class PushChangeType(Enum): @@ -897,6 +898,30 @@ def construct_checkpoint(self) -> None: db.session.commit() +class ProjectVersionChanges(db.Model): + id = db.Column(db.BigInteger, primary_key=True, autoincrement=True) + # exponential order of changes json + rank = db.Column(db.Integer, nullable=False, index=True) + # to which project version is this linked + version_id = db.Column( + db.Integer, + db.ForeignKey("project_version.id", ondelete="CASCADE"), + index=True, + nullable=False, + ) + # cached changes for versions from start to end (inclusive) + changes = db.Column(JSONB, nullable=False) + + __table_args__ = ( + db.UniqueConstraint("version_id", "rank", name="unique_changes"), + db.Index( + "ix_project_version_change_version_id_rank", + version_id, + rank, + ), + ) + + class ProjectVersion(db.Model): id = db.Column(db.Integer, primary_key=True, autoincrement=True) name = db.Column(db.Integer, index=True) diff --git a/server/migrations/community/4f54ee8c4acd_add_project_version_changes.py b/server/migrations/community/4f54ee8c4acd_add_project_version_changes.py new file mode 100644 index 00000000..f40b3ef3 --- /dev/null +++ b/server/migrations/community/4f54ee8c4acd_add_project_version_changes.py @@ -0,0 +1,42 @@ +"""Add project version changes + +Revision ID: 4f54ee8c4acd +Revises: bd1ec73db389 +Create Date: 2025-10-01 11:49:13.560320 + +""" +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. +revision = '4f54ee8c4acd' +down_revision = 'bd1ec73db389' +branch_labels = None +depends_on = None + + +def upgrade(): + # ### commands auto generated by Alembic - please adjust! 
### + op.create_table('project_version_changes', + sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False), + sa.Column('rank', sa.Integer(), nullable=False), + sa.Column('version_id', sa.Integer(), nullable=False), + sa.Column('changes', postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.ForeignKeyConstraint(['version_id'], ['project_version.id'], name=op.f('fk_project_version_changes_version_id_project_version'), ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id', name=op.f('pk_project_version_changes')), + sa.UniqueConstraint('version_id', 'rank', name='unique_changes') + ) + op.create_index('ix_project_version_change_version_id_rank', 'project_version_changes', ['version_id', 'rank'], unique=False) + op.create_index(op.f('ix_project_version_changes_rank'), 'project_version_changes', ['rank'], unique=False) + op.create_index(op.f('ix_project_version_changes_version_id'), 'project_version_changes', ['version_id'], unique=False) + # ### end Alembic commands ### + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index(op.f('ix_project_version_changes_version_id'), table_name='project_version_changes') + op.drop_index(op.f('ix_project_version_changes_rank'), table_name='project_version_changes') + op.drop_index('ix_project_version_change_version_id_rank', table_name='project_version_changes') + op.drop_table('project_version_changes') + # ### end Alembic commands ### From 71f9f0f65be49f6a8cfda601aea7a615d0d670b2 Mon Sep 17 00:00:00 2001 From: "marcel.kocisek" Date: Wed, 1 Oct 2025 16:50:49 +0200 Subject: [PATCH 10/36] Modify changes table in name --- server/mergin/sync/models.py | 6 ++++- ...adc90fca0c_add_project_version_changes.py} | 26 +++++++++---------- 2 files changed, 18 insertions(+), 14 deletions(-) rename server/migrations/community/{4f54ee8c4acd_add_project_version_changes.py => 63adc90fca0c_add_project_version_changes.py} (57%) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 9beb6ec6..d0475ec6 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -898,7 +898,7 @@ def construct_checkpoint(self) -> None: db.session.commit() -class ProjectVersionChanges(db.Model): +class ProjectVersionChange(db.Model): id = db.Column(db.BigInteger, primary_key=True, autoincrement=True) # exponential order of changes json rank = db.Column(db.Integer, nullable=False, index=True) @@ -920,6 +920,10 @@ class ProjectVersionChanges(db.Model): rank, ), ) + project = db.relationship( + "ProjectVersion", + uselist=False, + ) class ProjectVersion(db.Model): diff --git a/server/migrations/community/4f54ee8c4acd_add_project_version_changes.py b/server/migrations/community/63adc90fca0c_add_project_version_changes.py similarity index 57% rename from server/migrations/community/4f54ee8c4acd_add_project_version_changes.py rename to server/migrations/community/63adc90fca0c_add_project_version_changes.py index f40b3ef3..f7836df6 100644 --- a/server/migrations/community/4f54ee8c4acd_add_project_version_changes.py +++ b/server/migrations/community/63adc90fca0c_add_project_version_changes.py @@ -1,8 +1,8 @@ """Add project version changes -Revision ID: 4f54ee8c4acd +Revision ID: 63adc90fca0c Revises: bd1ec73db389 -Create Date: 2025-10-01 11:49:13.560320 +Create Date: 2025-10-01 16:50:08.343639 """ from alembic import op @@ -10,7 +10,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. 
-revision = '4f54ee8c4acd' +revision = '63adc90fca0c' down_revision = 'bd1ec73db389' branch_labels = None depends_on = None @@ -18,25 +18,25 @@ def upgrade(): # ### commands auto generated by Alembic - please adjust! ### - op.create_table('project_version_changes', + op.create_table('project_version_change', sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False), sa.Column('rank', sa.Integer(), nullable=False), sa.Column('version_id', sa.Integer(), nullable=False), sa.Column('changes', postgresql.JSONB(astext_type=sa.Text()), nullable=False), - sa.ForeignKeyConstraint(['version_id'], ['project_version.id'], name=op.f('fk_project_version_changes_version_id_project_version'), ondelete='CASCADE'), - sa.PrimaryKeyConstraint('id', name=op.f('pk_project_version_changes')), + sa.ForeignKeyConstraint(['version_id'], ['project_version.id'], name=op.f('fk_project_version_change_version_id_project_version'), ondelete='CASCADE'), + sa.PrimaryKeyConstraint('id', name=op.f('pk_project_version_change')), sa.UniqueConstraint('version_id', 'rank', name='unique_changes') ) - op.create_index('ix_project_version_change_version_id_rank', 'project_version_changes', ['version_id', 'rank'], unique=False) - op.create_index(op.f('ix_project_version_changes_rank'), 'project_version_changes', ['rank'], unique=False) - op.create_index(op.f('ix_project_version_changes_version_id'), 'project_version_changes', ['version_id'], unique=False) + op.create_index(op.f('ix_project_version_change_rank'), 'project_version_change', ['rank'], unique=False) + op.create_index(op.f('ix_project_version_change_version_id'), 'project_version_change', ['version_id'], unique=False) + op.create_index('ix_project_version_change_version_id_rank', 'project_version_change', ['version_id', 'rank'], unique=False) # ### end Alembic commands ### def downgrade(): # ### commands auto generated by Alembic - please adjust! 
### - op.drop_index(op.f('ix_project_version_changes_version_id'), table_name='project_version_changes') - op.drop_index(op.f('ix_project_version_changes_rank'), table_name='project_version_changes') - op.drop_index('ix_project_version_change_version_id_rank', table_name='project_version_changes') - op.drop_table('project_version_changes') + op.drop_index('ix_project_version_change_version_id_rank', table_name='project_version_change') + op.drop_index(op.f('ix_project_version_change_version_id'), table_name='project_version_change') + op.drop_index(op.f('ix_project_version_change_rank'), table_name='project_version_change') + op.drop_table('project_version_change') # ### end Alembic commands ### From 0025b3bfa2f77993b74b77849abcf51d236d55e0 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Thu, 2 Oct 2025 10:10:39 +0200 Subject: [PATCH 11/36] Cosmetic changes --- server/mergin/sync/models.py | 46 +++++++++++++++--------------------- server/mergin/sync/tasks.py | 8 +------ server/mergin/sync/utils.py | 10 ++++---- 3 files changed, 25 insertions(+), 39 deletions(-) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index b0f2795f..e1505575 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -735,6 +735,7 @@ class FileDiff(db.Model): version = db.Column(db.Integer, nullable=False, index=True) # path on FS relative to project directory location = db.Column(db.String) + # size and checksum are nullable as for merged diffs (higher orders) they might not exist on disk yet size = db.Column(db.BigInteger, nullable=True) checksum = db.Column(db.String, nullable=True) @@ -774,26 +775,6 @@ def abs_path(self) -> str: """ return os.path.join(self.file.project.storage.project_dir, self.location) - @property - def cache_level(self) -> Optional[CachedLevel]: - """ - Return cache level representation for diff file - """ - # individual diff for any version - if self.rank == 0: - return CachedLevel(rank=self.rank, index=self.version) - - # merged diffs can only be created for certain versions - if self.version % LOG_BASE: - return - - index = self.version // LOG_BASE**self.rank - # some invalid record - if index < 1 or self.rank < 0: - return - - return CachedLevel(rank=self.rank, index=index) - def construct_checkpoint(self) -> None: """Create a diff file checkpoint (aka. merged diff). Find all smaller diffs which are needed to create the final diff file and merge them. 
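For orientation between the two hunks: the `cache_level` property removed above and the
validation inlined in the hunk below encode the same arithmetic, which can be checked in
isolation. A minimal sketch, assuming only `LOG_BASE = 4` from `utils.py`;
`checkpoint_bounds` is an illustrative name, not part of the codebase:

LOG_BASE = 4  # log base for the caching strategy, from server/mergin/sync/utils.py

def checkpoint_bounds(rank, version):
    """Version range covered by a merged diff stored at (rank, version).

    Mirrors the inlined checks: merged diffs only exist at versions divisible
    by LOG_BASE, index = version // LOG_BASE**rank must be >= 1, and then
    start = LOG_BASE**rank * (index - 1) + 1, end = LOG_BASE**rank * index.
    """
    if rank < 0 or version % LOG_BASE:
        return None
    index = version // LOG_BASE**rank
    if index < 1:
        return None
    span = LOG_BASE**rank
    return (span * (index - 1) + 1, span * index)

assert checkpoint_bounds(1, 20) == (17, 20)  # rank-1 checkpoint at v20 merges v17-v20
assert checkpoint_bounds(2, 32) == (17, 32)  # rank-2 checkpoint at v32 merges v17-v32
assert checkpoint_bounds(1, 19) is None      # v19 is not divisible by LOG_BASE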
@@ -802,22 +783,33 @@ def construct_checkpoint(self) -> None: if os.path.exists(self.abs_path): return - basefile = FileHistory.get_basefile(self.file_path_id, self.cache_level.end) + # merged diffs can only be created for certain versions + if self.version % LOG_BASE: + return + + cache_level_index = self.version // LOG_BASE**self.rank + try: + cache_level = CachedLevel(rank=self.rank, index=cache_level_index) + except ValueError: + logging.error( + f"Invalid record for cached level of rank {self.rank} and index {cache_level_index} for file {self.file_path_id}" + ) + return + + basefile = FileHistory.get_basefile(self.file_path_id, cache_level.end) if not basefile: logging.error(f"Unable to find basefile for file {self.file_path_id}") return - if basefile.project_version_name > self.cache_level.start: + if basefile.project_version_name > cache_level.start: logging.error( - f"Basefile version {basefile.project_version_name} is higher than start version {self.cache_level.start} - broken history" + f"Basefile version {basefile.project_version_name} is higher than start version {cache_level.start} - broken history" ) return diffs_paths = [] # let's confirm we have all intermediate diffs needed, if not, we need to use individual diffs instead - cached_items = get_merged_versions( - self.cache_level.start, self.cache_level.end - 1 - ) + cached_items = get_merged_versions(cache_level.start, cache_level.end - 1) expected_diffs = ( FileDiff.query.filter_by( basefile_id=basefile.id, @@ -870,7 +862,7 @@ def construct_checkpoint(self) -> None: end_diff = FileDiff.query.filter_by( basefile_id=basefile.id, rank=0, - version=self.cache_level.end, + version=cache_level.end, ).first() if end_diff: diff --git a/server/mergin/sync/tasks.py b/server/mergin/sync/tasks.py index c40343eb..1a84afe2 100644 --- a/server/mergin/sync/tasks.py +++ b/server/mergin/sync/tasks.py @@ -3,22 +3,16 @@ # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-MerginMaps-Commercial import logging -import math import shutil import os import time from datetime import datetime, timedelta, timezone -import uuid from zipfile import ZIP_DEFLATED, ZipFile from flask import current_app -from pygeodiff import GeoDiffLibError -from pygeodiff.geodifflib import GeoDiffLibConflictError -from sqlalchemy import tuple_ -from .models import FileDiff, Project, ProjectVersion, FileHistory +from .models import Project, ProjectVersion, FileHistory from .storages.disk import move_to_tmp from .config import Configuration -from .utils import LOG_BASE, generate_checksum, get_merged_versions from ..celery import celery from ..app import db diff --git a/server/mergin/sync/utils.py b/server/mergin/sync/utils.py index a0f61fd6..eb672414 100644 --- a/server/mergin/sync/utils.py +++ b/server/mergin/sync/utils.py @@ -27,6 +27,9 @@ import magic from flask import current_app +# log base for caching strategy, diff checkpoints, etc. +LOG_BASE = 4 + def generate_checksum(file, chunk_size=4096): """ @@ -606,9 +609,6 @@ def prepare_download_response(project_dir: str, path: str) -> Response: return resp -LOG_BASE = 4 - - @dataclass class CachedLevel: """ @@ -649,7 +649,7 @@ def get_cached_levels(version: int) -> List[CachedLevel]: Version must divisible by BASE, and then we calculate all cached levels related to it. 
""" levels = [] - rank_max = math.floor((math.log(version) / math.log(LOG_BASE))) + rank_max = math.floor(math.log(version, LOG_BASE)) for rank in range(1, rank_max + 1): if version % LOG_BASE**rank: @@ -671,7 +671,7 @@ def get_merged_versions(start: int, end: int) -> List[CachedLevel]: if start == end: rank_max = 0 else: - rank_max = math.floor((math.log(end - start + 1) / math.log(LOG_BASE))) + rank_max = math.floor(math.log(end - start + 1, LOG_BASE)) for rank in reversed(range(0, rank_max + 1)): if (start - 1) % LOG_BASE**rank: continue From 3c2153a4b9319d889ace3cbb2297b941f5e3a989 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Fri, 3 Oct 2025 13:05:16 +0200 Subject: [PATCH 12/36] Return custom error on failed diff download + small functions renaming/refactor --- server/mergin/sync/errors.py | 7 ++ server/mergin/sync/models.py | 37 ++++++---- server/mergin/sync/public_api_v2.yaml | 14 ++++ .../mergin/sync/public_api_v2_controller.py | 7 +- server/mergin/sync/utils.py | 71 +++++++------------ server/mergin/tests/test_public_api_v2.py | 9 +++ server/mergin/tests/test_utils.py | 26 ++----- 7 files changed, 89 insertions(+), 82 deletions(-) diff --git a/server/mergin/sync/errors.py b/server/mergin/sync/errors.py index 35985ab9..33b80d74 100644 --- a/server/mergin/sync/errors.py +++ b/server/mergin/sync/errors.py @@ -95,3 +95,10 @@ def to_dict(self) -> Dict: class BigChunkError(ResponseError): code = "BigChunkError" detail = f"Chunk size exceeds maximum allowed size {MAX_CHUNK_SIZE} MB" + + +class DiffDownloadError(ResponseError): + code = "DiffDownloadError" + detail = ( + "Required diff file could not be downloaded as it could not be reconstructed" + ) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index d20578f8..4a6c00ea 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -39,9 +39,8 @@ from .storages import DiskStorage from .utils import ( LOG_BASE, - CachedLevel, + Checkpoint, generate_checksum, - get_merged_versions, Toucher, get_chunk_location, get_project_path, @@ -656,7 +655,9 @@ def diffs_chain( return None, [] diffs = [] - cached_items = get_merged_versions(basefile.project_version_name, version) + cached_items = Checkpoint.get_checkpoints( + basefile.project_version_name, version + ) expected_diffs = ( FileDiff.query.filter_by( basefile_id=basefile.id, @@ -779,41 +780,48 @@ def abs_path(self) -> str: """ return os.path.join(self.file.project.storage.project_dir, self.location) - def construct_checkpoint(self) -> None: + def construct_checkpoint(self) -> bool: """Create a diff file checkpoint (aka. merged diff). Find all smaller diffs which are needed to create the final diff file and merge them. In case of missing some lower rank checkpoint, use individual diffs instead. + + Once checkpoint is created, size and checksum are updated in the database. 
+ + Returns: + bool: True if checkpoint was successfully created or already present """ if os.path.exists(self.abs_path): - return + return True # merged diffs can only be created for certain versions if self.version % LOG_BASE: - return + return False cache_level_index = self.version // LOG_BASE**self.rank try: - cache_level = CachedLevel(rank=self.rank, index=cache_level_index) + cache_level = Checkpoint(rank=self.rank, index=cache_level_index) except ValueError: logging.error( f"Invalid record for cached level of rank {self.rank} and index {cache_level_index} for file {self.file_path_id}" ) - return + return False basefile = FileHistory.get_basefile(self.file_path_id, cache_level.end) if not basefile: logging.error(f"Unable to find basefile for file {self.file_path_id}") - return + return False if basefile.project_version_name > cache_level.start: logging.error( f"Basefile version {basefile.project_version_name} is higher than start version {cache_level.start} - broken history" ) - return + return False diffs_paths = [] # let's confirm we have all intermediate diffs needed, if not, we need to use individual diffs instead - cached_items = get_merged_versions(cache_level.start, cache_level.end - 1) + cached_items = Checkpoint.get_checkpoints( + cache_level.start, cache_level.end - 1 + ) expected_diffs = ( FileDiff.query.filter_by( basefile_id=basefile.id, @@ -860,7 +868,7 @@ def construct_checkpoint(self) -> None: logging.error( f"Unable to find diffs for {item} for file {self.file_path_id}" ) - return + return False # we apply latest change (if any) on previous version end_diff = FileDiff.query.filter_by( @@ -876,7 +884,7 @@ def construct_checkpoint(self) -> None: logging.warning( f"No diffs for next checkpoint for file {self.file_path_id}" ) - return + return False project: Project = basefile.file.project os.makedirs(project.storage.diffs_dir, exist_ok=True) @@ -886,11 +894,12 @@ def construct_checkpoint(self) -> None: logging.error( f"Geodiff: Failed to merge diffs for file {self.file_path_id}" ) - return + return False self.size = os.path.getsize(self.abs_path) self.checksum = generate_checksum(self.abs_path) db.session.commit() + return True class ProjectVersion(db.Model): diff --git a/server/mergin/sync/public_api_v2.yaml b/server/mergin/sync/public_api_v2.yaml index c8e28766..d4c87016 100644 --- a/server/mergin/sync/public_api_v2.yaml +++ b/server/mergin/sync/public_api_v2.yaml @@ -283,6 +283,14 @@ paths: $ref: "#/components/responses/Forbidden" "404": $ref: "#/components/responses/NotFound" + "422": + description: Requested diff file could not be downloaded as it was not created + # mixing content types would trigger 500 on response validation + # might be related to issue https://github.com/spec-first/connexion/issues/2054 + # content: + # application/json+problem: + # schema: + # $ref: "#/components/schemas/DiffDownloadError" x-openapi-router-controller: mergin.sync.public_api_v2_controller /projects/{id}/versions: post: @@ -479,6 +487,12 @@ components: example: code: UploadError detail: "Project version could not be created (UploadError)" + DiffDownloadError: + allOf: + - $ref: "#/components/schemas/CustomError" + example: + code: DiffDownloadError + detail: Required diff file could not be created (DiffDownloadError) # Data ProjectRole: type: string diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py index 2f69b73b..6bac0ff6 100644 --- a/server/mergin/sync/public_api_v2_controller.py +++ 
b/server/mergin/sync/public_api_v2_controller.py @@ -32,6 +32,7 @@ AnotherUploadRunning, BigChunkError, DataSyncError, + DiffDownloadError, ProjectLocked, ProjectVersionExists, StorageLimitHit, @@ -179,9 +180,9 @@ def download_diff_file(id: str, file: str): # create merged diff if it does not exist if not os.path.exists(diff_file.abs_path): - diff_file.construct_checkpoint() - if not os.path.exists(diff_file.abs_path): - abort(404) + diff_created = diff_file.construct_checkpoint() + if not diff_created: + return DiffDownloadError().response(422) response = prepare_download_response( project.storage.project_dir, diff_file.location diff --git a/server/mergin/sync/utils.py b/server/mergin/sync/utils.py index 13eee624..e6fd51c2 100644 --- a/server/mergin/sync/utils.py +++ b/server/mergin/sync/utils.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-MerginMaps-Commercial +from __future__ import annotations import logging import math import os @@ -20,7 +21,7 @@ from flask import Request, Response, make_response, send_from_directory from typing import List, Optional from flask import Request -from typing import Optional, Tuple +from typing import Optional from sqlalchemy import text from pathvalidate import ( validate_filename, @@ -616,10 +617,10 @@ def prepare_download_response(project_dir: str, path: str) -> Response: @dataclass -class CachedLevel: +class Checkpoint: """ Cached level of version tree. - Used as a checkpoint to merge individual versions / diff files into bigger chunks + Used as a checkpoint to merge individual versions / diff files into bigger chunks. """ rank: int # power of base @@ -643,51 +644,33 @@ def end(self) -> int: return LOG_BASE**self.rank * self.index def __str__(self) -> str: - return f"CachedLevel(rank={self.rank}, index={self.index}, versions=v{self.start}-v{self.end})" + return f"Checkpoint(rank={self.rank}, index={self.index}, versions=v{self.start}-v{self.end})" def __repr__(self) -> str: return str(self) - -def get_cached_levels(version: int) -> List[CachedLevel]: - """ - Return the most right part of version tree as other nodes are already cached. - Version must divisible by BASE, and then we calculate all cached levels related to it. - """ - levels = [] - rank_max = math.floor(math.log(version, LOG_BASE)) - - for rank in range(1, rank_max + 1): - if version % LOG_BASE**rank: - continue - - index = version // LOG_BASE**rank - levels.append(CachedLevel(rank=rank, index=index)) - - return levels - - -def get_merged_versions(start: int, end: int) -> List[CachedLevel]: - """ - Get all (merged) versions between start version and end version while respecting cached levels. - This basically provide the list of smaller versions (checkpoints) to be merged in order to get the final version. - """ - levels = [] - while start <= end: - if start == end: - rank_max = 0 - else: - rank_max = math.floor(math.log(end - start + 1, LOG_BASE)) - for rank in reversed(range(0, rank_max + 1)): - if (start - 1) % LOG_BASE**rank: - continue - - index = (start - 1) // LOG_BASE**rank + 1 - levels.append(CachedLevel(rank=rank, index=index)) - start = start + LOG_BASE**rank - break - - return levels + @classmethod + def get_checkpoints(cls, start: int, end: int) -> List[Checkpoint]: + """ + Get all checkpoints in a range. + This basically provide a list of smaller versions (checkpoints) to be merged in order to get the final version. 
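+
+        For example, with LOG_BASE = 4 (values mirrored from
+        test_checkpoint_utils):
+
+            get_checkpoints(1, 64) -> [Checkpoint(rank=3, index=1)]   # v1-v64
+            get_checkpoints(1, 21) -> [Checkpoint(rank=2, index=1),   # v1-v16
+                                       Checkpoint(rank=1, index=5),   # v17-v20
+                                       Checkpoint(rank=0, index=21)]  # v21
+            get_checkpoints(2, 1)  -> []                              # empty range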
+ """ + levels = [] + while start <= end: + if start == end: + rank_max = 0 + else: + rank_max = math.floor(math.log(end - start + 1, LOG_BASE)) + for rank in reversed(range(0, rank_max + 1)): + if (start - 1) % LOG_BASE**rank: + continue + + index = (start - 1) // LOG_BASE**rank + 1 + levels.append(cls(rank=rank, index=index)) + start = start + LOG_BASE**rank + break + + return levels def get_chunk_location(id: str): diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index 91abfc92..762f5a59 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -20,6 +20,7 @@ from mergin.config import Configuration from mergin.sync.errors import ( BigChunkError, + DiffDownloadError, ProjectLocked, ProjectVersionExists, AnotherUploadRunning, @@ -198,6 +199,14 @@ def test_file_diff_download(client, diff_project): assert response.content_type == "application/octet-stream" assert os.path.exists(diff.abs_path) + # try with reconstruction failure + with patch.object(FileDiff, "construct_checkpoint") as construct_checkpoint_mock: + os.remove(diff.abs_path) + construct_checkpoint_mock.return_value = False + response = client.get(f"v2/projects/{diff_project.id}/raw/diff/{diff.path}") + assert response.status_code == 422 + assert response.json["code"] == DiffDownloadError.code + response = client.get(f"v2/projects/{diff_project.id}/raw/diff/{diff.path}+1") assert response.status_code == 404 diff --git a/server/mergin/tests/test_utils.py b/server/mergin/tests/test_utils.py index 23bb2aee..c854f0c1 100644 --- a/server/mergin/tests/test_utils.py +++ b/server/mergin/tests/test_utils.py @@ -16,8 +16,6 @@ from ..utils import save_diagnostic_log_file from ..sync.utils import ( - get_cached_levels, - get_merged_versions, parse_gpkgb_header_size, gpkg_wkb_to_wkt, is_reserved_word, @@ -26,6 +24,7 @@ check_filename, is_valid_path, get_x_accel_uri, + Checkpoint, ) from ..auth.models import LoginHistory, User from . 
import json_headers @@ -286,29 +285,14 @@ def test_save_diagnostic_log_file(client, app): def test_checkpoint_utils(): """Test util functions to construct merged versions of higher ranks (checkpoints)""" - - # all cached versions ending with 64 would be v61-v64 (4) v49-v64 (16) and v1-v64 (64) - cached_levels = get_cached_levels(64) - assert len(cached_levels) == 3 - assert cached_levels[0].rank == 1 - assert cached_levels[0].index == 16 - assert cached_levels[1].rank == 2 - assert cached_levels[1].index == 4 - assert cached_levels[2].rank == 3 - assert cached_levels[2].index == 1 - - # there would not be any cached versions ending with 65 - cached_levels = get_cached_levels(65) - assert len(cached_levels) == 0 - # exact match to single rank - versions = get_merged_versions(1, 64) + versions = Checkpoint.get_checkpoints(1, 64) assert len(versions) == 1 assert versions[0].rank == 3 assert versions[0].index == 1 # v21 would be created from v1-16, v17-20 and v21 - versions = get_merged_versions(1, 21) + versions = Checkpoint.get_checkpoints(1, 21) assert len(versions) == 3 assert versions[0].rank == 2 assert versions[0].index == 1 @@ -318,7 +302,7 @@ def test_checkpoint_utils(): assert versions[2].index == 21 # no cached versions at all, only basic levels v1-v3 - versions = get_merged_versions(1, 3) + versions = Checkpoint.get_checkpoints(1, 3) assert len(versions) == 3 assert versions[0].rank == 0 assert versions[0].index == 1 @@ -328,5 +312,5 @@ def test_checkpoint_utils(): assert versions[2].index == 3 # dummy request - versions = get_merged_versions(2, 1) + versions = Checkpoint.get_checkpoints(2, 1) assert len(versions) == 0 From 83f858f0693d6a9e36ea21fc1e16d1856f0bdea0 Mon Sep 17 00:00:00 2001 From: "marcel.kocisek" Date: Fri, 3 Oct 2025 14:29:17 +0200 Subject: [PATCH 13/36] Initial version for merging diffs --- server/mergin/sync/files.py | 38 +++++- server/mergin/sync/models.py | 120 +++++++++++++++++- server/mergin/sync/utils.py | 27 ++++ ...3adc90fca0c_add_project_version_changes.py | 63 ++++++--- 4 files changed, 229 insertions(+), 19 deletions(-) diff --git a/server/mergin/sync/files.py b/server/mergin/sync/files.py index 5015e626..f7b6e2a7 100644 --- a/server/mergin/sync/files.py +++ b/server/mergin/sync/files.py @@ -2,8 +2,9 @@ # # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-MerginMaps-Commercial import datetime +from enum import Enum import os -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional, List from marshmallow import fields, EXCLUDE, pre_load, post_load, post_dump from pathvalidate import sanitize_filename @@ -11,6 +12,17 @@ from ..app import DateTimeWithZ, ma +class PushChangeType(Enum): + CREATE = "create" + UPDATE = "update" + DELETE = "delete" + UPDATE_DIFF = "update_diff" + + @classmethod + def values(cls): + return [member.value for member in cls.__members__.values()] + + def mergin_secure_filename(filename: str) -> str: """Generate secure filename for given file""" filename = os.path.normpath(filename) @@ -126,3 +138,27 @@ def patch_field(self, data, **kwargs): if not data.get("diff"): data.pop("diff") return data + + +@dataclass +class ProjectVersionChangeData: + path: str + size: int + checksum: str + change: PushChangeType + version: str + diffs: Optional[List[str]] = None + + +class ProjectVersionChangeDataSchema(ma.Schema): + """Schema for changes data in ProjectVersionChange changes column""" + + path = fields.String(required=True) + size = fields.Integer(required=True) + checksum = 
fields.String(required=True) + diffs = fields.List(fields.String(), required=False) + change = fields.Enum(PushChangeType, by_value=True, required=True) + + @post_load + def make_object(self, data, **kwargs): + return ProjectVersionChangeData(**data) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index d0475ec6..7f1e93bb 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -24,6 +24,8 @@ from .files import ( File, + ProjectVersionChangeData, + ProjectVersionChangeDataSchema, UploadChanges, ChangesSchema, ProjectFile, @@ -36,6 +38,7 @@ LOG_BASE, CachedLevel, generate_checksum, + get_all_cached_levels, get_merged_versions, is_versioned_file, is_qgis, @@ -910,7 +913,7 @@ class ProjectVersionChange(db.Model): nullable=False, ) # cached changes for versions from start to end (inclusive) - changes = db.Column(JSONB, nullable=False) + data = db.Column(JSONB, nullable=False) __table_args__ = ( db.UniqueConstraint("version_id", "rank", name="unique_changes"), @@ -925,6 +928,121 @@ class ProjectVersionChange(db.Model): uselist=False, ) + @staticmethod + def merge_data( + changes: List[ProjectVersionChange], + ) -> List[ProjectVersionChangeData]: + """Merge changes from another ProjectVersionChange into this one""" + identifier = "path" + result: Dict[str, ProjectVersionChangeData] = {} + for change in changes: + for item in change.data: + current_data: ProjectVersionChangeData = ( + ProjectVersionChangeDataSchema().load(item) + ) + existing_data = result.get(current_data.path) + if existing_data: + # merge changes data jsons + if existing_data.change == PushChangeType.CREATE: + if current_data.change == PushChangeType.DELETE: + # create + delete = nothing + del result[identifier] + elif current_data.change in ( + PushChangeType.UPDATE.value, + PushChangeType.UPDATE_DIFF.value, + ): + # create + update = create with updated info + current_data.change = existing_data.change + current_data.diffs = None + result[identifier] = current_data + elif existing_data.change == PushChangeType.UPDATE: + if current_data.change == PushChangeType.UPDATE_DIFF: + # update + update_diff = update_diff with latest info + current_data.change = existing_data.change + current_data.diffs = None + result[identifier] = current_data + elif existing_data.change == PushChangeType.UPDATE_DIFF.value: + if current_data.change == PushChangeType.UPDATE_DIFF.value: + # update_diff + update_diff = update_diff with latest info + current_data.diffs.extend(existing_data.diffs or []) + result[identifier] = current_data + else: + result[current_data.path] = current_data + return list(result.values()) + + def get_data(self, start_version=0) -> None: + """Create a changes json checkpoint (aka. merged changes). + Find all smaller changes which are needed to create the final changes json. + In case of missing some lower rank checkpoint, use individual changes instead. 
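+
+        Rank recap (with LOG_BASE = 4): a rank 0 record holds the changes of a
+        single version, while a rank r checkpoint with index i covers versions
+        (i - 1) * 4**r + 1 through i * 4**r inclusive.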
+ """ + if self.changes: + return + version_name = self.version.name + project_id = self.version.project_id + if start_version > version_name: + logging.error( + f"Start version {start_version} is higher than end version {version_name} - broken history" + ) + return + + # TODO: rename get_merged_versions to get_merged_checkpoints and move it ProjectVersion class + expected_checkpoints = get_merged_versions(start_version, version_name) + expected_changes: List[ProjectVersionChange] = ( + ProjectVersionChange.query.join( + ProjectVersion, ProjectVersionChange.version_id == ProjectVersion.id + ) + .filter( + ProjectVersion.project_id == project_id, + ProjectVersion.name >= start_version, + ProjectVersion.name <= version_name, + tuple_(ProjectVersionChange.rank, ProjectVersion.name).in_( + [(item.rank, item.end) for item in expected_checkpoints] + ), + ) + .order_by(ProjectVersion.name) + .all() + ) + expected_diffs = FileHistory.query.join( + ProjectVersion, FileHistory.version_id == ProjectVersion.id + ).filter( + ProjectVersion.project_id == project_id, + FileHistory.project_version_name >= start_version, + FileHistory.project_version_name <= version_name, + FileHistory.change == PushChangeType.UPDATE_DIFF.value, + ) + + changes = [] + for checkpoint in expected_checkpoints: + cached_change = next( + ( + c + for c in expected_changes + if c.rank == checkpoint.rank and c.version.name == checkpoint.end + ), + None, + ) + if not cached_change and checkpoint.rank > 0: + # Filter all changes that are in previous checkpoint range + individual_changes = expected_changes[: checkpoint.end + 1] + if not individual_changes: + logging.error( + f"Unable to find rank 0 changes for {checkpoint.rank} for project {project_id}" + ) + return + merged_data = self.merge_data(individual_changes) + changes.append(self.merge_data(individual_changes)) + checkpoint_change = ProjectVersionChange( + version_id=self.version_id, + rank=checkpoint.rank, + changes=[asdict(c) for c in merged_data], + ) + db.session.add(checkpoint_change) + db.session.flush() + else: + changes.append(cached_change) + changes = self.merge_data(changes) + db.session.commit() + class ProjectVersion(db.Model): id = db.Column(db.Integer, primary_key=True, autoincrement=True) diff --git a/server/mergin/sync/utils.py b/server/mergin/sync/utils.py index a0f61fd6..a8d592e6 100644 --- a/server/mergin/sync/utils.py +++ b/server/mergin/sync/utils.py @@ -661,6 +661,25 @@ def get_cached_levels(version: int) -> List[CachedLevel]: return levels +def get_all_cached_levels(start: int, end: int) -> List[CachedLevel]: + """ + Get all cached levels between start version and end version. + This basically provide the list of cached levels which are fully contained in the range. + """ + levels = [] + rank_max = math.floor(math.log(end, LOG_BASE)) + + for rank in range(1, rank_max + 1): + index_start = (start - 1) // LOG_BASE**rank + 1 + index_end = end // LOG_BASE**rank + for index in range(index_start, index_end + 1): + level = CachedLevel(rank=rank, index=index) + if level.start >= start and level.end <= end: + levels.append(level) + + return levels + + def get_merged_versions(start: int, end: int) -> List[CachedLevel]: """ Get all (merged) versions between start version and end version while respecting cached levels. 
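 
     For example, with LOG_BASE = 4, get_merged_versions(1, 10) decomposes into
     the rank 1 levels v1-v4 and v5-v8 followed by the rank 0 versions v9 and
     v10 (a worked illustration derived from the algorithm above).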
@@ -682,3 +701,11 @@ def get_merged_versions(start: int, end: int) -> List[CachedLevel]:
             break
 
     return levels
+
+
+def merge_dict_lists(base=None, new=None, key="path"):
+    """Merge two lists of dictionaries on the given key, updating existing entries and adding new ones."""
+    merged = {item[key]: item for item in base or []}
+    for item in new or []:
+        merged[item[key]] = item
+    return list(merged.values())
diff --git a/server/migrations/community/63adc90fca0c_add_project_version_changes.py b/server/migrations/community/63adc90fca0c_add_project_version_changes.py
index f7836df6..83c9f10a 100644
--- a/server/migrations/community/63adc90fca0c_add_project_version_changes.py
+++ b/server/migrations/community/63adc90fca0c_add_project_version_changes.py
@@ -5,38 +5,67 @@
 Create Date: 2025-10-01 16:50:08.343639
 
 """
+
 from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.dialects import postgresql
 
 
 # revision identifiers, used by Alembic.
-revision = '63adc90fca0c'
-down_revision = 'bd1ec73db389'
+revision = "63adc90fca0c"
+down_revision = "bd1ec73db389"
 branch_labels = None
 depends_on = None
 
 
 def upgrade():
     # ### commands auto generated by Alembic - please adjust! ###
-    op.create_table('project_version_change',
-    sa.Column('id', sa.BigInteger(), autoincrement=True, nullable=False),
-    sa.Column('rank', sa.Integer(), nullable=False),
-    sa.Column('version_id', sa.Integer(), nullable=False),
-    sa.Column('changes', postgresql.JSONB(astext_type=sa.Text()), nullable=False),
-    sa.ForeignKeyConstraint(['version_id'], ['project_version.id'], name=op.f('fk_project_version_change_version_id_project_version'), ondelete='CASCADE'),
-    sa.PrimaryKeyConstraint('id', name=op.f('pk_project_version_change')),
-    sa.UniqueConstraint('version_id', 'rank', name='unique_changes')
+    op.create_table(
+        "project_version_change",
+        sa.Column("id", sa.BigInteger(), autoincrement=True, nullable=False),
+        sa.Column("rank", sa.Integer(), nullable=False),
+        sa.Column("version_id", sa.Integer(), nullable=False),
+        sa.Column("data", postgresql.JSONB(astext_type=sa.Text()), nullable=False),
+        sa.ForeignKeyConstraint(
+            ["version_id"],
+            ["project_version.id"],
+            name=op.f("fk_project_version_change_version_id_project_version"),
+            ondelete="CASCADE",
+        ),
+        sa.PrimaryKeyConstraint("id", name=op.f("pk_project_version_change")),
+        sa.UniqueConstraint("version_id", "rank", name="unique_changes"),
+    )
+    op.create_index(
+        op.f("ix_project_version_change_rank"),
+        "project_version_change",
+        ["rank"],
+        unique=False,
+    )
+    op.create_index(
+        op.f("ix_project_version_change_version_id"),
+        "project_version_change",
+        ["version_id"],
+        unique=False,
+    )
+    op.create_index(
+        "ix_project_version_change_version_id_rank",
+        "project_version_change",
+        ["version_id", "rank"],
+        unique=False,
     )
-    op.create_index(op.f('ix_project_version_change_rank'), 'project_version_change', ['rank'], unique=False)
-    op.create_index(op.f('ix_project_version_change_version_id'), 'project_version_change', ['version_id'], unique=False)
-    op.create_index('ix_project_version_change_version_id_rank', 'project_version_change', ['version_id', 'rank'], unique=False)
     # ### end Alembic commands ###
 
 
 def downgrade():
     # ### commands auto generated by Alembic - please adjust! 
###
    op.drop_index(
        "ix_project_version_change_version_id_rank", table_name="project_version_change"
    )
    op.drop_index(
        op.f("ix_project_version_change_version_id"),
        table_name="project_version_change",
    )
    op.drop_index(
        op.f("ix_project_version_change_rank"), table_name="project_version_change"
    )
    op.drop_table("project_version_change")
    # ### end Alembic commands ###

From 3db1550c2c4967a3cfc420e26bb457c89c8b6b19 Mon Sep 17 00:00:00 2001
From: "marcel.kocisek" 
Date: Fri, 3 Oct 2025 14:40:45 +0200
Subject: [PATCH 14/36] Adapt merge versions

---
 server/mergin/sync/files.py  | 16 ++++++++++------
 server/mergin/sync/models.py |  4 +---
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/server/mergin/sync/files.py b/server/mergin/sync/files.py
index 6a4c2143..f5a9db12 100644
--- a/server/mergin/sync/files.py
+++ b/server/mergin/sync/files.py
@@ -4,11 +4,18 @@
 import datetime
 from enum import Enum
 import os
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Optional, List
 import uuid
 from flask import current_app
-from marshmallow import ValidationError, fields, EXCLUDE, post_dump, validates_schema
+from marshmallow import (
+    ValidationError,
+    fields,
+    EXCLUDE,
+    post_dump,
+    validates_schema,
+    post_load,
+)
 from pathvalidate import sanitize_filename
 
 from .utils import (
@@ -237,10 +244,7 @@ def patch_field(self, data, **kwargs):
 
 
 @dataclass
-class ProjectVersionChangeData:
-    path: str
-    size: int
-    checksum: str
+class ProjectVersionChangeData(File):
     change: PushChangeType
     version: str
     diffs: Optional[List[str]] = None
diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py
index f0592b86..7f37e458 100644
--- a/server/mergin/sync/models.py
+++ b/server/mergin/sync/models.py
@@ -24,7 +24,6 @@
 from flask import current_app
 
 from .files import (
-    File,
     ProjectVersionChangeData,
     ProjectVersionChangeDataSchema,
     ProjectDiffFile,
@@ -988,8 +987,7 @@ def get_data(self, start_version=0) -> None:
             )
             return
 
-        # TODO: rename get_merged_versions to get_merged_checkpoints and move it ProjectVersion class
-        expected_checkpoints = get_merged_versions(start_version, version_name)
+        expected_checkpoints = Checkpoint.get_checkpoints(start_version, version_name)
         expected_changes: List[ProjectVersionChange] = (
             ProjectVersionChange.query.join(
                 ProjectVersion, ProjectVersionChange.version_id == ProjectVersion.id

From 03f5098370866d99ce00e39a5dcd9321c17d1203 Mon Sep 17 00:00:00 2001
From: "marcel.kocisek" 
Date: Tue, 7 Oct 2025 18:06:07 +0200
Subject: [PATCH 15/36] Delta endpoints + logic improvements:

- introduce get_delta method with caching
- introduce delta controller
---
 server/mergin/auth/models.py                  |   1 -
 server/mergin/sync/files.py                   |  29 ++-
 server/mergin/sync/models.py                  | 210 ++++++++++++++----
 server/mergin/sync/public_api_v2.yaml         |  70 ++++++
 .../mergin/sync/public_api_v2_controller.py   |  32 ++-
 server/mergin/tests/test_public_api_v2.py     | 128 ++++++++++-
 server/mergin/tests/utils.py                  |   6 +-
 ...3adc90fca0c_add_project_version_changes.py |   2 +-
 8 files changed, 418 insertions(+), 60 deletions(-)

diff --git a/server/mergin/auth/models.py b/server/mergin/auth/models.py
index 5dcf275e..3ab05f6b 100644
--- 
a/server/mergin/auth/models.py +++ b/server/mergin/auth/models.py @@ -11,7 +11,6 @@ from sqlalchemy import or_, func, text from ..app import db -from ..sync.models import ProjectUser from ..sync.utils import get_user_agent, get_ip, get_device_id, is_reserved_word MAX_USERNAME_LENGTH = 50 diff --git a/server/mergin/sync/files.py b/server/mergin/sync/files.py index f5a9db12..7cc49e9a 100644 --- a/server/mergin/sync/files.py +++ b/server/mergin/sync/files.py @@ -244,21 +244,40 @@ def patch_field(self, data, **kwargs): @dataclass -class ProjectVersionChangeData(File): +class ChangeDiffFile: + path: str + size: Optional[int] = None + + +class ChangeDiffFileSchema(ma.Schema): + path = fields.String(required=True) + size = fields.Integer(required=False) + + +@dataclass +class ProjectVersionChangeDelta(File): change: PushChangeType version: str - diffs: Optional[List[str]] = None + diffs: Optional[List[ChangeDiffFile]] = None -class ProjectVersionChangeDataSchema(ma.Schema): +class ProjectVersionChangeDeltaSchema(ma.Schema): """Schema for changes data in ProjectVersionChange changes column""" path = fields.String(required=True) size = fields.Integer(required=True) checksum = fields.String(required=True) - diffs = fields.List(fields.String(), required=False) + version = fields.String(required=True) + diffs = fields.List(fields.Nested(ChangeDiffFileSchema())) change = fields.Enum(PushChangeType, by_value=True, required=True) @post_load def make_object(self, data, **kwargs): - return ProjectVersionChangeData(**data) + return ProjectVersionChangeDelta(**data) + + @post_dump + def patch_field(self, data, **kwargs): + # drop 'diffs' key entirely if empty or None as clients would expect + if not data.get("diffs"): + data.pop("diffs", None) + return data diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 7f37e458..6b4109bd 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -24,8 +24,9 @@ from flask import current_app from .files import ( - ProjectVersionChangeData, - ProjectVersionChangeDataSchema, + ChangeDiffFile, + ProjectVersionChangeDelta, + ProjectVersionChangeDeltaSchema, ProjectDiffFile, ProjectFileChange, ChangesSchema, @@ -915,7 +916,7 @@ class ProjectVersionChange(db.Model): nullable=False, ) # cached changes for versions from start to end (inclusive) - data = db.Column(JSONB, nullable=False) + delta = db.Column(JSONB, nullable=False) __table_args__ = ( db.UniqueConstraint("version_id", "rank", name="unique_changes"), @@ -925,30 +926,33 @@ class ProjectVersionChange(db.Model): rank, ), ) - project = db.relationship( + version = db.relationship( "ProjectVersion", uselist=False, ) @staticmethod - def merge_data( + def merge_changes( changes: List[ProjectVersionChange], - ) -> List[ProjectVersionChangeData]: - """Merge changes from another ProjectVersionChange into this one""" - identifier = "path" - result: Dict[str, ProjectVersionChangeData] = {} + ) -> List[ProjectVersionChangeDelta]: + """ + Merge multiple changes jsons into one list of changes. + Changes are merged based on file path and change type. 
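+
+        Merge rules per file path (a summary of the branches below):
+            create + delete           -> entry dropped (file is transparent)
+            create + update(_diff)    -> create, carrying the newest metadata
+            update + update_diff      -> update (a diff cannot span a full update)
+            update_diff + update_diff -> update_diff with joined diff lists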
+        """
+        result: Dict[str, ProjectVersionChangeDelta] = {}
         for change in changes:
-            for item in change.data:
-                current_data: ProjectVersionChangeData = (
-                    ProjectVersionChangeDataSchema().load(item)
+            for item in change.delta:
+                current_data: ProjectVersionChangeDelta = (
+                    ProjectVersionChangeDeltaSchema().load(item)
                 )
                 existing_data = result.get(current_data.path)
+                path = current_data.path
                 if existing_data:
                     # merge changes data jsons
                     if existing_data.change == PushChangeType.CREATE:
                         if current_data.change == PushChangeType.DELETE:
                             # create + delete = nothing
-                            del result[identifier]
+                            del result[path]
                         elif current_data.change in (
                             PushChangeType.UPDATE.value,
                             PushChangeType.UPDATE_DIFF.value,
                         ):
                             # create + update = create with updated info
                             current_data.change = existing_data.change
                             current_data.diffs = None
-                            result[identifier] = current_data
+                            result[path] = current_data
+                        else:
+                            result[path] = current_data
                     elif existing_data.change == PushChangeType.UPDATE:
                         if current_data.change == PushChangeType.UPDATE_DIFF:
                             # update + update_diff = update_diff with latest info
                             current_data.change = existing_data.change
                             current_data.diffs = None
-                            result[identifier] = current_data
+                            result[path] = current_data
                     elif existing_data.change == PushChangeType.UPDATE_DIFF.value:
                         if current_data.change == PushChangeType.UPDATE_DIFF.value:
                             # update_diff + update_diff = update_diff with latest info
                             current_data.diffs.extend(existing_data.diffs or [])
-                            result[identifier] = current_data
+                            result[path] = current_data
+                        else:
+                            # delete + anything = anything
+                            result[path] = current_data
                 else:
                     result[current_data.path] = current_data
         return list(result.values())
 
-    def get_data(self, start_version=0) -> None:
-        """Create a changes json checkpoint (aka. merged changes).
-        Find all smaller changes which are needed to create the final changes json.
-        In case of missing some lower rank checkpoint, use individual changes instead.
+    def get_delta(self, from_version=1) -> Optional[List[ProjectVersionChangeDelta]]:
+        """
+        Get changes between two versions, merging them if needed. 
""" - if self.changes: - return version_name = self.version.name project_id = self.version.project_id - if start_version > version_name: + if from_version > version_name: logging.error( - f"Start version {start_version} is higher than end version {version_name} - broken history" + f"Start version {from_version} is higher than end version {version_name} - broken history" ) return - expected_checkpoints = Checkpoint.get_checkpoints(start_version, version_name) - expected_changes: List[ProjectVersionChange] = ( - ProjectVersionChange.query.join( - ProjectVersion, ProjectVersionChange.version_id == ProjectVersion.id + if from_version == version_name: + # Return only changes for this version + changes = ( + ProjectVersionChange.query.join( + ProjectVersion, ProjectVersionChange.version_id == ProjectVersion.id + ) + .filter( + ProjectVersion.project_id == project_id, + ProjectVersion.name == version_name, + ) + .order_by(ProjectVersion.name) + .all() ) + return self.merge_changes(changes) + + expected_checkpoints = Checkpoint.get_checkpoints(from_version, version_name) + expected_changes: List[ProjectVersionChange] = ( + ProjectVersionChange.query.join(ProjectVersion) .filter( ProjectVersion.project_id == project_id, - ProjectVersion.name >= start_version, + ProjectVersion.name > from_version, ProjectVersion.name <= version_name, tuple_(ProjectVersionChange.rank, ProjectVersion.name).in_( [(item.rank, item.end) for item in expected_checkpoints] @@ -1003,18 +1021,12 @@ def get_data(self, start_version=0) -> None: .order_by(ProjectVersion.name) .all() ) - expected_diffs = FileHistory.query.join( - ProjectVersion, FileHistory.version_id == ProjectVersion.id - ).filter( - ProjectVersion.project_id == project_id, - FileHistory.project_version_name >= start_version, - FileHistory.project_version_name <= version_name, - FileHistory.change == PushChangeType.UPDATE_DIFF.value, - ) + individual_changes: List[ProjectVersionChange] = [] changes = [] for checkpoint in expected_checkpoints: - cached_change = next( + # find checkpoint change in already cached changes or any zero rank change needed + expected_change = next( ( c for c in expected_changes @@ -1022,28 +1034,103 @@ def get_data(self, start_version=0) -> None: ), None, ) - if not cached_change and checkpoint.rank > 0: - # Filter all changes that are in previous checkpoint range - individual_changes = expected_changes[: checkpoint.end + 1] + if expected_change: + changes.append(expected_change) + continue + + if checkpoint.rank > 0: + # Filter all changes that are 0 rank and are in the range of the checkpoint, cache them to prevent multiple queries + individual_changes = ( + ProjectVersionChange.query.join(ProjectVersion) + .filter( + ProjectVersion.project_id == project_id, + ProjectVersion.name > from_version, + ProjectVersion.name <= version_name, + ProjectVersionChange.rank == 0, + ) + .all() + if not individual_changes + else individual_changes + ) if not individual_changes: logging.error( f"Unable to find rank 0 changes for {checkpoint.rank} for project {project_id}" ) return - merged_data = self.merge_data(individual_changes) - changes.append(self.merge_data(individual_changes)) + merged_history = self.merge_changes( + [ + change + for change in individual_changes + if checkpoint.start < change.version.name <= checkpoint.end + ] + ) + for item in merged_history: + path = item.path + if not is_versioned_file(path): + continue + + file_history = ( + FileHistory.query.join(ProjectFilePath) + .filter( + ProjectFilePath.project_id == project_id, + 
ProjectFilePath.path == path, + FileHistory.project_version_name <= checkpoint.end, + ) + .order_by(FileHistory.project_version_name.desc()) + .limit(1) + .first() + ) + existing_diff_checkpoint = ( + FileDiff.query.filter( + FileDiff.file_path_id == file_history.file_path_id, + FileDiff.rank == checkpoint.rank, + tuple_(FileDiff.rank, FileDiff.version).in_( + [(item.rank, item.end) for item in expected_checkpoints] + ), + ) + .order_by(FileDiff.version.desc()) + .limit(1) + .first() + ) + if not existing_diff_checkpoint: + base_file = FileHistory.get_basefile( + file_history.file_path_id, checkpoint.end + ) + if not base_file: + continue + diff_path = mergin_secure_filename( + path + "-diff-" + str(uuid.uuid4()) + ) + checkpoint_diff = FileDiff( + basefile=base_file, + path=diff_path, + rank=checkpoint.rank, + version=checkpoint.end, + ) + # patching diff to new rank file name + item.diffs = [ChangeDiffFile(path=diff_path, size=0)] + db.session.add(checkpoint_diff) + db.session.flush() + latest_change = next( + (c for c in individual_changes if c.version.name == checkpoint.end), + None, + ) checkpoint_change = ProjectVersionChange( - version_id=self.version_id, + version_id=latest_change.version_id, rank=checkpoint.rank, - changes=[asdict(c) for c in merged_data], + delta=[ + {**asdict(c), "change": c.change.value} for c in merged_history + ], ) + changes.append(checkpoint_change) + db.session.add(checkpoint_change) db.session.flush() - else: - changes.append(cached_change) - changes = self.merge_data(changes) db.session.commit() + history_data = self.merge_changes(changes) + return history_data + class ProjectVersion(db.Model): id = db.Column(db.Integer, primary_key=True, autoincrement=True) @@ -1111,7 +1198,6 @@ def __init__( .filter(ProjectFilePath.path.in_(changed_files_paths)) .all() } - for item in changes: # get existing DB file reference or create a new one (for added files) db_file = existing_files_map.get( @@ -1140,6 +1226,32 @@ def __init__( else: latest_files_map[fh.path] = fh.id + # cache changes data json for version checkpoints + # rank 0 is for all changes from start to current version + changes_data = [ + ProjectVersionChangeDelta( + path=c.path, + change=c.change, + size=c.size, + checksum=c.checksum, + version=self.to_v_name(name), + diffs=( + [ChangeDiffFile(path=c.diff.path, size=c.diff.size)] + if c.diff + else None + ), + ) + for c in changes + ] + pvc = ProjectVersionChange( + version=self, + rank=0, + delta=ProjectVersionChangeDeltaSchema(many=True).dump(changes_data), + ) + + db.session.add(pvc) + db.session.flush() + # update cached values in project and push to transaction buffer so that self.files is up-to-date self.project.latest_project_files.file_history_ids = latest_files_map.values() db.session.flush() diff --git a/server/mergin/sync/public_api_v2.yaml b/server/mergin/sync/public_api_v2.yaml index d4c87016..bec63d4b 100644 --- a/server/mergin/sync/public_api_v2.yaml +++ b/server/mergin/sync/public_api_v2.yaml @@ -381,6 +381,40 @@ paths: $ref: "#/components/schemas/ProjectLocked" x-openapi-router-controller: mergin.sync.public_api_v2_controller + /projects/{id}/delta: + get: + tags: + - project + summary: Get project changes (delta) between two versions + operationId: get_project_delta + parameters: + - $ref: "#/components/parameters/ProjectId" + - name: since + in: query + required: true + schema: + type: string + example: v1 + description: Start version (exclusive) + - name: to + in: query + required: true + schema: + type: string + example: v2 + 
description: End version (inclusive) + responses: + "200": + description: Project changes between two versions + content: + application/json: + schema: + $ref: "#/components/schemas/ProjectDelta" + "400": + $ref: "#/components/responses/BadRequest" + "404": + $ref: "#/components/responses/NotFound" + x-openapi-router-controller: mergin.sync.public_api_v2_controller components: responses: NoContent: @@ -800,3 +834,39 @@ components: - editor - writer - owner + ProjectDelta: + type: object + required: + - path + - size + - checksum + - version + - change + properties: + path: + type: string + example: survey.gpkg + size: + type: integer + example: 1024 + checksum: + type: string + example: 9adb76bf81a34880209040ffe5ee262a090b62ab + version: + type: string + example: v2 + change: + type: string + enum: [create, update, delete, update_diff] + example: update + diffs: + type: array + items: + type: object + properties: + path: + type: string + example: survey.gpkg-diff-1 + size: + type: integer + example: 512 diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py index 6bac0ff6..17d89b2b 100644 --- a/server/mergin/sync/public_api_v2_controller.py +++ b/server/mergin/sync/public_api_v2_controller.py @@ -22,7 +22,7 @@ from ..app import db from ..auth import auth_required from ..auth.models import User -from .models import FileDiff, Project, ProjectRole, ProjectMember +from .models import FileDiff, Project, ProjectRole, ProjectMember, ProjectVersionChange from .permissions import ProjectPermissions, require_project_by_uuid from .utils import prepare_download_response from ..app import db @@ -38,7 +38,7 @@ StorageLimitHit, UploadError, ) -from .files import ChangesSchema +from .files import ChangesSchema, ProjectVersionChangeDeltaSchema from .forms import project_name_validation from .models import ( Project, @@ -402,3 +402,31 @@ def upload_chunk(id: str): UploadChunkSchema().dump({"id": chunk_id, "valid_until": valid_until}), 200, ) + + +@auth_required +def get_project_delta(id: str): + """Get project changes (delta) between two versions""" + since = request.args.get("since") + to = request.args.get("to") + if not since or not to: + abort(400, "Missing 'since' or 'to' query parameter") + + project = require_project_by_uuid(id, ProjectPermissions.Read) + since_version = ProjectVersion.from_v_name(since) + to_version = ProjectVersion.from_v_name(to) + + if since_version > to_version: + abort(400, "'since' version must be less than 'to' version") + + to_change = ( + ProjectVersionChange.query.join(ProjectVersion) + .filter( + ProjectVersion.project_id == project.id, + ProjectVersion.name == to_version, + ) + .first_or_404() + ) + changes = to_change.get_delta(since_version) + + return ProjectVersionChangeDeltaSchema(many=True).dump(changes), 200 diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index 762f5a59..433ebab0 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-MerginMaps-Commercial import os import shutil +from typing import List from unittest.mock import patch import uuid from pygeodiff import GeoDiffLibError @@ -11,7 +12,15 @@ from ..app import db from tests import test_project, test_workspace_id from ..config import Configuration -from ..sync.models import FileDiff, FileHistory, Project, ProjectFilePath, ProjectRole +from ..sync.models import ( + FileDiff, + 
FileHistory, + Project, + ProjectFilePath, + ProjectRole, + ProjectVersionChange, +) +from ..sync.files import PushChangeType from sqlalchemy.exc import IntegrityError import pytest from datetime import datetime, timedelta, timezone @@ -306,6 +315,82 @@ def test_create_diff_checkpoint(diff_project): assert not os.path.exists(diff.abs_path) +def test_project_version_change_delta(diff_project): + """Test that ProjectVersionChangeData and its schema work as expected""" + version = diff_project.get_latest_version() + assert version.name == 10 + pvcs: List[ProjectVersionChange] = ( + ProjectVersionChange.query.join(ProjectVersion) + .filter(ProjectVersion.project_id == diff_project.id) + .all() + ) + assert len(pvcs) == 10 + initial_pvc = pvcs[0] + assert initial_pvc.get_delta(3) is None + initial_version = initial_pvc.version + assert initial_pvc.rank == 0 + assert initial_pvc.version.id == initial_version.id + assert len(initial_pvc.get_delta()) == len(initial_version.files) + # no ranks created as we get here just first version with get_delta + assert ProjectVersionChange.query.filter_by(rank=1).count() == 0 + second_pvc = pvcs[1] + second_data = second_pvc.get_delta() + assert len(second_data) == 1 + assert second_data[0].change == PushChangeType.DELETE + # no ranks created as we get here just first version with get_delta + assert ProjectVersionChange.query.filter_by(rank=1).count() == 0 + + # delete + create version + create_pvc = pvcs[2] + create_data = create_pvc.get_delta() + assert len(create_data) == 1 + assert create_data[0].change == PushChangeType.CREATE + + # get_delta with rank creation + checkpoint_pvc = pvcs[3] + checkpoint_data = checkpoint_pvc.get_delta() + assert len(checkpoint_data) == 1 + assert checkpoint_data[0].change == PushChangeType.UPDATE_DIFF + + checkpoint_changes = ProjectVersionChange.query.filter_by(rank=1) + filediff_checkpoints = FileDiff.query.filter_by(rank=1) + checkpoint_change = checkpoint_changes.first() + filediff_checkpoint = filediff_checkpoints.first() + + assert checkpoint_changes.count() == 1 + assert checkpoint_change.version_id == checkpoint_pvc.version_id + assert filediff_checkpoints.count() == 1 + assert filediff_checkpoint.version == 4 + # check if filediff basefile is correctly set + assert ( + filediff_checkpoint.basefile_id + == FileHistory.query.filter_by(project_version_name=3).first().id + ) + file_history = FileHistory.query.filter_by(project_version_name=4).first() + assert checkpoint_data[0].version == f"v4" + assert checkpoint_data[0].path == file_history.path + assert checkpoint_data[0].size == file_history.size + assert checkpoint_data[0].checksum == file_history.checksum + assert len(checkpoint_data[0].diffs) == 1 + assert checkpoint_data[0].diffs[0]["path"] == filediff_checkpoint.path + assert checkpoint_data[0].diffs[0]["size"] == 0 + + # get data with multiple ranks = 1 level checkpoints 1-8 + 2 level checkpoint 9-10 + latest_pvc = pvcs[-1] + latest_data = latest_pvc.get_delta() + assert len(latest_data) == 2 + assert latest_data[0].change == PushChangeType.DELETE + assert latest_data[1].change == PushChangeType.CREATE + pv = push_change( + diff_project, "removed", "test.gpkg", diff_project.storage.project_dir + ) + latest_pvc = ProjectVersionChange.query.filter_by(version_id=pv.id).first() + latest_data = latest_pvc.get_delta(pv.name - 3) + assert len(latest_data) == 1 + # test.gpkg is transparent as it was created and deleted in this range + assert not next((c for c in latest_data if c.path == "test.gpkg"), None) + + 
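
# For orientation, a minimal client-side sketch of the delta endpoint exercised
# above (endpoint path, query parameters and response fields as declared in
# public_api_v2.yaml; the requests dependency and base_url handling are
# assumptions, not part of this patch):
#
#     import requests
#
#     def fetch_delta(base_url: str, project_id: str, since: str, to: str) -> list:
#         # "since" is exclusive and "to" inclusive, both in "v<N>" form
#         resp = requests.get(
#             f"{base_url}/v2/projects/{project_id}/delta",
#             params={"since": since, "to": to},
#         )
#         resp.raise_for_status()
#         # items carry: path, size, checksum, version, change and optional diffs[]
#         return resp.json()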
push_data = [
    # success
    (
@@ -647,3 +732,44 @@ def test_full_push(client):
            os.path.join(project.storage.project_dir, "v2", test_file["path"])
        )
        assert not Upload.query.filter_by(project_id=project.id).first()
+
+
+def test_project_delta(client, diff_project):
+    """Test project delta endpoint"""
+    response = client.get(f"v2/projects/{diff_project.id}/delta")
+    assert response.status_code == 400
+
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v1&to=v10")
+    assert response.status_code == 200
+    assert len(response.json) == 10
+    assert response.json[0]["version"] == "v1"
+    assert response.json[-1]["version"] == "v10"
+
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v3&to=v7")
+    assert response.status_code == 200
+    assert len(response.json) == 5
+    assert response.json[0]["version"] == "v4"
+    assert response.json[-1]["version"] == "v8"
+
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v10")
+    assert response.status_code == 200
+    assert len(response.json) == 0
+
+    response = client.get(f"v2/projects/{diff_project.id}/delta?to=v1")
+    assert response.status_code == 200
+    assert len(response.json) == 0
+
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v8&to=v12")
+    assert response.status_code == 422
+    assert response.json["code"] == "InvalidVersionRange"
+
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=3&to=7")
+    assert response.status_code == 422
+    assert response.json["code"] == "InvalidVersionFormat"
+
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v3&to=7")
+    assert response.status_code == 422
+    assert response.json["code"] == "InvalidVersionFormat"
+
+    response = client.get(f"v2/projects/9999/delta")
+    assert response.status_code == 404
diff --git a/server/mergin/tests/utils.py b/server/mergin/tests/utils.py
index fc7e4240..89766d24 100644
--- a/server/mergin/tests/utils.py
+++ b/server/mergin/tests/utils.py
@@ -303,9 +303,12 @@ def push_change(project, action, path, src_dir):
     current_files = project.files
     new_version = ProjectVersion.to_v_name(project.next_version())
     changes = {"added": [], "updated": [], "removed": []}
-    metadata = {**file_info(src_dir, path), "location": os.path.join(new_version, path)}
 
     if action == "added":
+        metadata = {
+            **file_info(src_dir, path),
+            "location": os.path.join(new_version, path),
+        }
         new_file = os.path.join(project.storage.project_dir, metadata["location"])
         os.makedirs(os.path.dirname(new_file), exist_ok=True)
         shutil.copy(os.path.join(src_dir, metadata["path"]), new_file)
@@ -349,6 +352,7 @@ def push_change(project, action, path, src_dir):
         changes["updated"].append(metadata)
     elif action == "removed":
         f_removed = next(f for f in current_files if f.path == path)
+        os.remove(os.path.join(project.storage.project_dir, f_removed.location))
         changes["removed"].append(asdict(f_removed))
     else:
         return
diff --git a/server/migrations/community/63adc90fca0c_add_project_version_changes.py b/server/migrations/community/63adc90fca0c_add_project_version_changes.py
index 83c9f10a..5d7ffe8d 100644
--- a/server/migrations/community/63adc90fca0c_add_project_version_changes.py
+++ b/server/migrations/community/63adc90fca0c_add_project_version_changes.py
@@ -24,7 +24,7 @@ def upgrade():
         sa.Column("id", sa.BigInteger(), autoincrement=True, nullable=False),
         sa.Column("rank", sa.Integer(), nullable=False),
         sa.Column("version_id", sa.Integer(), nullable=False),
-        sa.Column("data", postgresql.JSONB(astext_type=sa.Text()), nullable=False),
+        sa.Column("delta", 
postgresql.JSONB(astext_type=sa.Text()), nullable=False), sa.ForeignKeyConstraint( ["version_id"], ["project_version.id"], From 4fb5c83ec75158fc2053eb6c0ccf719a9218bb87 Mon Sep 17 00:00:00 2001 From: "marcel.kocisek" Date: Wed, 8 Oct 2025 18:00:48 +0200 Subject: [PATCH 16/36] extract method for create checkpoint - added tests for pull and checkpoints --- server/mergin/sync/models.py | 225 +++++++++--------- server/mergin/sync/public_api_v2.yaml | 5 +- .../mergin/sync/public_api_v2_controller.py | 35 ++- server/mergin/tests/test_public_api_v2.py | 131 +++++----- 4 files changed, 204 insertions(+), 192 deletions(-) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 6b4109bd..5eb1601c 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -980,40 +980,118 @@ def merge_changes( result[current_data.path] = current_data return list(result.values()) - def get_delta(self, from_version=1) -> Optional[List[ProjectVersionChangeDelta]]: + @classmethod + def _create_checkpoint( + cls, + project_id: str, + checkpoint: Checkpoint, + individual_changes: List[ProjectVersionChange], + ) -> Optional[ProjectVersionChange]: + """ + Creates and caches a new ProjectVersionChange checkpoint and any required FileDiff checkpoints. + """ + individual_changes_range = [ + change + for change in individual_changes + if checkpoint.start < change.version.name <= checkpoint.end + ] + + if not individual_changes_range: + logging.warning( + f"No individual changes found for project {project_id} in range v{checkpoint.start}-v{checkpoint.end} to create checkpoint." + ) + return None + + merged_deltas = cls.merge_changes(individual_changes_range) + versioned_deltas = [ + item for item in merged_deltas if is_versioned_file(item.path) + ] + + # Pre-fetch data for all versioned files to create FileDiff checkpoints + versioned_file_paths = [delta.path for delta in versioned_deltas] + if versioned_file_paths: + file_paths = ProjectFilePath.query.filter( + ProjectFilePath.project_id == project_id, + ProjectFilePath.path.in_(versioned_file_paths), + ).all() + file_path_map = {fp.path: fp.id for fp in file_paths} + + for item in versioned_deltas: + file_path_id = file_path_map.get(item.path) + if not file_path_id: + continue + + # Check if a FileDiff checkpoint already exists + existing_diff_checkpoint = FileDiff.query.filter_by( + file_path_id=file_path_id, + rank=checkpoint.rank, + version=checkpoint.end, + ).first() + + if not existing_diff_checkpoint: + base_file = FileHistory.get_basefile(file_path_id, checkpoint.end) + if not base_file: + continue + + diff_path = mergin_secure_filename( + f"{item.path}-diff-{uuid.uuid4()}" + ) + checkpoint_diff = FileDiff( + basefile=base_file, + path=diff_path, + rank=checkpoint.rank, + version=checkpoint.end, + ) + # Patch the delta with the path to the new diff checkpoint + item.diffs = [ChangeDiffFile(path=diff_path, size=0)] + db.session.add(checkpoint_diff) + + checkpoint_change = ProjectVersionChange( + version_id=individual_changes_range[-1].version_id, + rank=checkpoint.rank, + delta=[{**asdict(c), "change": c.change.value} for c in merged_deltas], + ) + db.session.add(checkpoint_change) + return checkpoint_change + + @classmethod + def get_delta( + cls, project_id: str, since: int, to: int + ) -> Optional[List[ProjectVersionChangeDelta]]: """ Get changes between two versions, merging them if needed. 
+ - create FileDiff checkpoints if needed + - create ProjectVersionChange checkpoints if needed with delta json """ - version_name = self.version.name - project_id = self.version.project_id - if from_version > version_name: + if since > to: logging.error( - f"Start version {from_version} is higher than end version {version_name} - broken history" + f"Start version {since} is higher than end version {to} - broken history" ) return - - if from_version == version_name: + if since == to: # Return only changes for this version - changes = ( - ProjectVersionChange.query.join( - ProjectVersion, ProjectVersionChange.version_id == ProjectVersion.id - ) + change = ( + ProjectVersionChange.query.join(ProjectVersion) .filter( ProjectVersion.project_id == project_id, - ProjectVersion.name == version_name, + ProjectVersion.name == to, + ProjectVersionChange.rank == 0, ) - .order_by(ProjectVersion.name) - .all() + .first() + ) + return ( + [ProjectVersionChangeDeltaSchema().load(item) for item in change.delta] + if change + else [] ) - return self.merge_changes(changes) - expected_checkpoints = Checkpoint.get_checkpoints(from_version, version_name) + expected_checkpoints = Checkpoint.get_checkpoints(since + 1, to) expected_changes: List[ProjectVersionChange] = ( ProjectVersionChange.query.join(ProjectVersion) .filter( ProjectVersion.project_id == project_id, - ProjectVersion.name > from_version, - ProjectVersion.name <= version_name, + ProjectVersion.name > since, + ProjectVersion.name <= to, tuple_(ProjectVersionChange.rank, ProjectVersion.name).in_( [(item.rank, item.end) for item in expected_checkpoints] ), @@ -1021,115 +1099,46 @@ def get_delta(self, from_version=1) -> Optional[List[ProjectVersionChangeDelta]] .order_by(ProjectVersion.name) .all() ) + existing_changes_map = {(c.rank, c.version.name): c for c in expected_changes} + + # Cache all individual (rank 0) changes in the required range. 
individual_changes: List[ProjectVersionChange] = [] changes = [] for checkpoint in expected_checkpoints: - # find checkpoint change in already cached changes or any zero rank change needed - expected_change = next( - ( - c - for c in expected_changes - if c.rank == checkpoint.rank and c.version.name == checkpoint.end - ), - None, + expected_change = existing_changes_map.get( + (checkpoint.rank, checkpoint.end) ) + if expected_change: changes.append(expected_change) continue if checkpoint.rank > 0: - # Filter all changes that are 0 rank and are in the range of the checkpoint, cache them to prevent multiple queries individual_changes = ( - ProjectVersionChange.query.join(ProjectVersion) - .filter( - ProjectVersion.project_id == project_id, - ProjectVersion.name > from_version, - ProjectVersion.name <= version_name, - ProjectVersionChange.rank == 0, - ) - .all() - if not individual_changes - else individual_changes - ) - if not individual_changes: - logging.error( - f"Unable to find rank 0 changes for {checkpoint.rank} for project {project_id}" - ) - return - merged_history = self.merge_changes( - [ - change - for change in individual_changes - if checkpoint.start < change.version.name <= checkpoint.end - ] - ) - for item in merged_history: - path = item.path - if not is_versioned_file(path): - continue - - file_history = ( - FileHistory.query.join(ProjectFilePath) + ( + ProjectVersionChange.query.join(ProjectVersion) .filter( - ProjectFilePath.project_id == project_id, - ProjectFilePath.path == path, - FileHistory.project_version_name <= checkpoint.end, - ) - .order_by(FileHistory.project_version_name.desc()) - .limit(1) - .first() - ) - existing_diff_checkpoint = ( - FileDiff.query.filter( - FileDiff.file_path_id == file_history.file_path_id, - FileDiff.rank == checkpoint.rank, - tuple_(FileDiff.rank, FileDiff.version).in_( - [(item.rank, item.end) for item in expected_checkpoints] - ), + ProjectVersion.project_id == project_id, + ProjectVersion.name > since, + ProjectVersion.name <= to, + ProjectVersionChange.rank == 0, ) - .order_by(FileDiff.version.desc()) - .limit(1) - .first() + .all() ) - if not existing_diff_checkpoint: - base_file = FileHistory.get_basefile( - file_history.file_path_id, checkpoint.end - ) - if not base_file: - continue - diff_path = mergin_secure_filename( - path + "-diff-" + str(uuid.uuid4()) - ) - checkpoint_diff = FileDiff( - basefile=base_file, - path=diff_path, - rank=checkpoint.rank, - version=checkpoint.end, - ) - # patching diff to new rank file name - item.diffs = [ChangeDiffFile(path=diff_path, size=0)] - db.session.add(checkpoint_diff) - db.session.flush() - latest_change = next( - (c for c in individual_changes if c.version.name == checkpoint.end), - None, + if not individual_changes + else individual_changes ) - checkpoint_change = ProjectVersionChange( - version_id=latest_change.version_id, - rank=checkpoint.rank, - delta=[ - {**asdict(c), "change": c.change.value} for c in merged_history - ], + new_checkpoint = cls._create_checkpoint( + project_id, checkpoint, individual_changes ) - changes.append(checkpoint_change) + if new_checkpoint: + changes.append(new_checkpoint) - db.session.add(checkpoint_change) - db.session.flush() db.session.commit() - history_data = self.merge_changes(changes) - return history_data + deltas = cls.merge_changes(changes) + return deltas class ProjectVersion(db.Model): diff --git a/server/mergin/sync/public_api_v2.yaml b/server/mergin/sync/public_api_v2.yaml index bec63d4b..c3f6d15a 100644 --- 
a/server/mergin/sync/public_api_v2.yaml +++ b/server/mergin/sync/public_api_v2.yaml @@ -409,7 +409,9 @@ paths: content: application/json: schema: - $ref: "#/components/schemas/ProjectDelta" + type: array + items: + $ref: "#/components/schemas/ProjectDelta" "400": $ref: "#/components/responses/BadRequest" "404": @@ -861,6 +863,7 @@ components: example: update diffs: type: array + nullable: true items: type: object properties: diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py index 17d89b2b..3a2ec924 100644 --- a/server/mergin/sync/public_api_v2_controller.py +++ b/server/mergin/sync/public_api_v2_controller.py @@ -407,26 +407,23 @@ def upload_chunk(id: str): @auth_required def get_project_delta(id: str): """Get project changes (delta) between two versions""" - since = request.args.get("since") - to = request.args.get("to") - if not since or not to: - abort(400, "Missing 'since' or 'to' query parameter") - + since = ProjectVersion.from_v_name(request.args.get("since")) + to = ProjectVersion.from_v_name(request.args.get("to")) project = require_project_by_uuid(id, ProjectPermissions.Read) - since_version = ProjectVersion.from_v_name(since) - to_version = ProjectVersion.from_v_name(to) - - if since_version > to_version: + if since < 0 or to < 0: + abort(400, "Invalid 'since' or 'to' version") + if since > to: abort(400, "'since' version must be less than 'to' version") - to_change = ( - ProjectVersionChange.query.join(ProjectVersion) - .filter( - ProjectVersion.project_id == project.id, - ProjectVersion.name == to_version, - ) - .first_or_404() - ) - changes = to_change.get_delta(since_version) + ProjectVersion.query.filter( + ProjectVersion.project_id == project.id, + ProjectVersion.name == since, + ).first_or_404() + ProjectVersion.query.filter( + ProjectVersion.project_id == project.id, + ProjectVersion.name == to, + ).first_or_404() + + delta = ProjectVersionChange.get_delta(project.id, since, to) - return ProjectVersionChangeDeltaSchema(many=True).dump(changes), 200 + return ProjectVersionChangeDeltaSchema(many=True).dump(delta), 200 diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index 433ebab0..ed6d8912 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -318,7 +318,9 @@ def test_create_diff_checkpoint(diff_project): def test_project_version_change_delta(diff_project): """Test that ProjectVersionChangeData and its schema work as expected""" version = diff_project.get_latest_version() + project_id = diff_project.id assert version.name == 10 + assert ProjectVersionChange.get_delta(project_id, 2, 1) is None pvcs: List[ProjectVersionChange] = ( ProjectVersionChange.query.join(ProjectVersion) .filter(ProjectVersion.project_id == diff_project.id) @@ -326,39 +328,41 @@ def test_project_version_change_delta(diff_project): ) assert len(pvcs) == 10 initial_pvc = pvcs[0] - assert initial_pvc.get_delta(3) is None initial_version = initial_pvc.version assert initial_pvc.rank == 0 assert initial_pvc.version.id == initial_version.id - assert len(initial_pvc.get_delta()) == len(initial_version.files) + # if version is the same, return just its delta in v1 + assert len(ProjectVersionChange.get_delta(project_id, 1, 1)) == len( + initial_version.files + ) # no ranks created as we get here just first version with get_delta assert ProjectVersionChange.query.filter_by(rank=1).count() == 0 - second_pvc = pvcs[1] - second_data = 
second_pvc.get_delta() - assert len(second_data) == 1 - assert second_data[0].change == PushChangeType.DELETE + + delta = ProjectVersionChange.get_delta(project_id, 1, 2) + assert len(delta) == 1 + assert delta[0].change == PushChangeType.DELETE # no ranks created as we get here just first version with get_delta assert ProjectVersionChange.query.filter_by(rank=1).count() == 0 # delete + create version - create_pvc = pvcs[2] - create_data = create_pvc.get_delta() - assert len(create_data) == 1 - assert create_data[0].change == PushChangeType.CREATE - - # get_delta with rank creation - checkpoint_pvc = pvcs[3] - checkpoint_data = checkpoint_pvc.get_delta() - assert len(checkpoint_data) == 1 - assert checkpoint_data[0].change == PushChangeType.UPDATE_DIFF + delta = ProjectVersionChange.get_delta(project_id, 1, 3) + assert len(delta) == 1 + assert delta[0].change == PushChangeType.CREATE + + # get_delta with update diff + delta = ProjectVersionChange.get_delta(project_id, 1, 4) + assert len(delta) == 1 + assert delta[0].change == PushChangeType.UPDATE_DIFF + assert ProjectVersionChange.query.filter_by(rank=1).count() == 0 + # create rank 1 checkpoint for v4 + delta = ProjectVersionChange.get_delta(project_id, 0, 4) checkpoint_changes = ProjectVersionChange.query.filter_by(rank=1) filediff_checkpoints = FileDiff.query.filter_by(rank=1) checkpoint_change = checkpoint_changes.first() filediff_checkpoint = filediff_checkpoints.first() - assert checkpoint_changes.count() == 1 - assert checkpoint_change.version_id == checkpoint_pvc.version_id + assert checkpoint_change.version_id == pvcs[3].version_id assert filediff_checkpoints.count() == 1 assert filediff_checkpoint.version == 4 # check if filediff basefile is correctly set @@ -367,28 +371,29 @@ def test_project_version_change_delta(diff_project): == FileHistory.query.filter_by(project_version_name=3).first().id ) file_history = FileHistory.query.filter_by(project_version_name=4).first() - assert checkpoint_data[0].version == f"v4" - assert checkpoint_data[0].path == file_history.path - assert checkpoint_data[0].size == file_history.size - assert checkpoint_data[0].checksum == file_history.checksum - assert len(checkpoint_data[0].diffs) == 1 - assert checkpoint_data[0].diffs[0]["path"] == filediff_checkpoint.path - assert checkpoint_data[0].diffs[0]["size"] == 0 - - # get data with multiple ranks = 1 level checkpoints 1-8 + 2 level checkpoint 9-10 - latest_pvc = pvcs[-1] - latest_data = latest_pvc.get_delta() - assert len(latest_data) == 2 - assert latest_data[0].change == PushChangeType.DELETE - assert latest_data[1].change == PushChangeType.CREATE + assert delta[0].change == PushChangeType.UPDATE_DIFF + assert delta[0].version == "v4" + assert delta[0].path == file_history.path + assert delta[0].size == file_history.size + assert delta[0].checksum == file_history.checksum + assert len(delta[0].diffs) == 1 + assert delta[0].diffs[0]["path"] == filediff_checkpoint.path + assert delta[0].diffs[0]["size"] == 0 + + # get data with multiple ranks = 1 level checkpoints 1-4, 5-8 + checkpoint 9 and 10 + delta = ProjectVersionChange.get_delta(project_id, 0, 10) + assert len(delta) == 2 + assert delta[0].change == PushChangeType.DELETE + assert delta[1].change == PushChangeType.CREATE + assert ProjectVersionChange.query.filter_by(rank=1).count() == 2 + pv = push_change( diff_project, "removed", "test.gpkg", diff_project.storage.project_dir ) - latest_pvc = ProjectVersionChange.query.filter_by(version_id=pv.id).first() - latest_data = 
latest_pvc.get_delta(pv.name - 3)
-    assert len(latest_data) == 1
+    delta = ProjectVersionChange.get_delta(project_id, pv.name - 3, pv.name)
+    assert len(delta) == 1
     # test.gpkg is transparent as it was created and deleted in this range
-    assert not next((c for c in latest_data if c.path == "test.gpkg"), None)
+    assert not next((c for c in delta if c.path == "test.gpkg"), None)
 
 
 push_data = [
@@ -739,37 +744,35 @@ def test_project_delta(client, diff_project):
     response = client.get(f"v2/projects/{diff_project.id}/delta")
     assert response.status_code == 400
 
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v-1&to=v1")
+    assert response.status_code == 400
+
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v1000&to=v2000")
+    assert response.status_code == 404
+
     response = client.get(f"v2/projects/{diff_project.id}/delta?since=v1&to=v10")
     assert response.status_code == 200
-    assert len(response.json) == 10
-    assert response.json[0]["version"] == "v1"
-    assert response.json[-1]["version"] == "v10"
+    assert response.json[0]["change"] == PushChangeType.DELETE.value
+    assert response.json[1]["change"] == PushChangeType.CREATE.value
 
-    response = client.get(f"v2/projects/{project.id}/delta?since=v3&to=v7")
-    assert response.status_code == 200
-    assert len(response.json) == 5
-    assert response.json[0]["version"] == "v4"
-    assert response.json[-1]["version"] == "v8"
-    response = client.get(f"v2/projects/{project.id}/delta?since=v10")
+
+# integration test for pull mechanism
+def test_project_pull(client, diff_project):
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v4&to=v8")
     assert response.status_code == 200
-    assert len(response.json) == 0
-
-    response = client.get(f"v2/projects/{project.id}/delta?to=v1")
+    delta = response.json
+    assert len(delta) == 1
+
+    # simulate pull of delta[0]
+    assert delta[0]["change"] == PushChangeType.UPDATE_DIFF.value
+    assert delta[0]["version"] == "v7"
+    assert len(delta[0]["diffs"]) == 1
+    diff = delta[0]["diffs"][0]
+    assert diff["path"].startswith("base.gpkg-")
+    assert diff["size"] == 0  # empty diff as in test data
+    response = client.get(f"v2/projects/{diff_project.id}/raw/diff/{diff['path']}")
     assert response.status_code == 200
-    assert len(response.json) == 0
-
-    response = client.get(f"v2/projects/{project.id}/delta?since=v8&to=v12")
-    assert response.status_code == 422
-    assert response.json["code"] == "InvalidVersionRange"
-
-    response = client.get(f"v2/projects/{project.id}/delta?since=3&to=7")
-    assert response.status_code == 422
-    assert response.json["code"] == "InvalidVersionFormat"
-
-    response = client.get(f"v2/projects/{project.id}/delta?since=v3&to=7")
-    assert response.status_code == 422
-    assert response.json["code"] == "InvalidVersionFormat"
-
-    response = client.get(f"v2/projects/9999/delta")
-    assert response.status_code == 404
+    created_diff = FileDiff.query.filter_by(path=diff["path"]).first()
+    assert created_diff and os.path.exists(created_diff.abs_path)
+    assert created_diff.size > 0
+    assert created_diff.checksum

From 63100a68144b9970616668e0c2b3c8d965e5141b Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Thu, 9 Oct 2025 18:13:48 +0200
Subject: [PATCH 17/36] Final fixes and changing schema

- accept numbers in endpoint
- consider storing diff instead of diffs in db
---
 server/mergin/sync/files.py                   |  81 ++++++--
 server/mergin/sync/models.py                  | 180 +++++++++---------
 server/mergin/sync/public_api_v2.yaml         |  15 +-
 .../mergin/sync/public_api_v2_controller.py   |  10 +-
server/mergin/tests/test_public_api_v2.py | 116 +++++++---- ...3adc90fca0c_add_project_version_changes.py | 38 ++++ 6 files changed, 291 insertions(+), 149 deletions(-) diff --git a/server/mergin/sync/files.py b/server/mergin/sync/files.py index 7cc49e9a..9838fba6 100644 --- a/server/mergin/sync/files.py +++ b/server/mergin/sync/files.py @@ -4,7 +4,7 @@ import datetime from enum import Enum import os -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional, List import uuid from flask import current_app @@ -244,36 +244,91 @@ def patch_field(self, data, **kwargs): @dataclass -class ChangeDiffFile: +class DeltaDiffFile: path: str - size: Optional[int] = None -class ChangeDiffFileSchema(ma.Schema): +class DeltaDiffFileSchema(ma.Schema): path = fields.String(required=True) - size = fields.Integer(required=False) @dataclass -class ProjectVersionChangeDelta(File): +class DeltaBase(File): change: PushChangeType - version: str - diffs: Optional[List[ChangeDiffFile]] = None + version: int -class ProjectVersionChangeDeltaSchema(ma.Schema): - """Schema for changes data in ProjectVersionChange changes column""" +@dataclass +class DeltaMerged(DeltaBase): + diffs: List[DeltaDiffFile] = field(default_factory=list) + + def to_data_delta(self): + """Convert DeltaMerged to DeltaData with single diff""" + result = DeltaData( + path=self.path, + size=self.size, + checksum=self.checksum, + change=self.change, + version=self.version, + ) + if self.diffs: + result.diff = self.diffs[0].path + return result + + +@dataclass +class DeltaData(File): + """Delta data stored in database""" + + change: PushChangeType + version: int + diff: Optional[str] = None + + def to_merged_delta(self) -> DeltaMerged: + """Convert DeltaData to DeltaMerged with multiple diffs""" + result = DeltaMerged( + path=self.path, + size=self.size, + checksum=self.checksum, + change=self.change, + version=self.version, + ) + if self.diff: + result.diffs = [DeltaDiffFile(path=self.diff)] + return result + + +class DeltaBaseSchema(ma.Schema): + """Base schema for detla json and response from delta endpoint""" path = fields.String(required=True) size = fields.Integer(required=True) checksum = fields.String(required=True) - version = fields.String(required=True) - diffs = fields.List(fields.Nested(ChangeDiffFileSchema())) + version = fields.Integer(required=True) change = fields.Enum(PushChangeType, by_value=True, required=True) + +class DeltaDataSchema(DeltaBaseSchema): + """Schema for delta data in database""" + + diff = fields.String(required=False) + @post_load def make_object(self, data, **kwargs): - return ProjectVersionChangeDelta(**data) + return DeltaData(**data) + + @post_dump + def patch_field(self, data, **kwargs): + # drop 'diff' key entirely if empty or None as database would expect + if not data.get("diff"): + data.pop("diff", None) + return data + + +class DeltaRespSchema(DeltaBaseSchema): + """Schema for delta data response""" + + diffs = fields.List(fields.Nested(DeltaDiffFileSchema())) @post_dump def patch_field(self, data, **kwargs): diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 5eb1601c..0d3634e5 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -24,9 +24,10 @@ from flask import current_app from .files import ( - ChangeDiffFile, - ProjectVersionChangeDelta, - ProjectVersionChangeDeltaSchema, + DeltaMerged, + DeltaDiffFile, + DeltaData, + DeltaDataSchema, ProjectDiffFile, ProjectFileChange, ChangesSchema, @@ -932,52 
+933,49 @@ class ProjectVersionChange(db.Model): ) @staticmethod - def merge_changes( - changes: List[ProjectVersionChange], - ) -> List[ProjectVersionChangeDelta]: + def merge_delta_items( + items: List[DeltaData], + ) -> List[DeltaMerged]: """ - Merge multiple changes jsons into one list of changes. + Merge multiple changes json array objects into one list of changes. Changes are merged based on file path and change type. """ - result: Dict[str, ProjectVersionChangeDelta] = {} - for change in changes: - for item in change.delta: - current_data: ProjectVersionChangeDelta = ( - ProjectVersionChangeDeltaSchema().load(item) - ) - existing_data = result.get(current_data.path) - path = current_data.path - if existing_data: - # merge changes data jsons - if existing_data.change == PushChangeType.CREATE: - if current_data.change == PushChangeType.DELETE: - # create + delete = nothing - del result[path] - elif current_data.change in ( - PushChangeType.UPDATE.value, - PushChangeType.UPDATE_DIFF.value, - ): - # create + update = create with updated info - current_data.change = existing_data.change - current_data.diffs = None - else: - result[path] = current_data - elif existing_data.change == PushChangeType.UPDATE: - if current_data.change == PushChangeType.UPDATE_DIFF: - # update + update_diff = update_diff with latest info - current_data.change = existing_data.change - current_data.diffs = None - result[path] = current_data - elif existing_data.change == PushChangeType.UPDATE_DIFF.value: - if current_data.change == PushChangeType.UPDATE_DIFF.value: - # update_diff + update_diff = update_diff with latest info - current_data.diffs.extend(existing_data.diffs or []) - result[path] = current_data + result: Dict[str, DeltaMerged] = {} + for item in items: + current = item.to_merged_delta() + existing = result.get(current.path) + path = current.path + if existing: + # merge changes data jsons + if existing.change == PushChangeType.CREATE: + if current.change == PushChangeType.DELETE: + # create + delete = nothing + del result[path] + elif current.change in ( + PushChangeType.UPDATE, + PushChangeType.UPDATE_DIFF, + ): + # create + update = create with updated info + current.change = existing.change + current.diffs = [] else: - # delete + anything = anything - result[path] = current_data + result[path] = current + elif existing.change == PushChangeType.UPDATE: + if current.change == PushChangeType.UPDATE_DIFF: + # update + update_diff = update_diff with latest info + current.change = existing.change + current.diffs = [] + result[path] = current + elif existing.change == PushChangeType.UPDATE_DIFF: + if current.change == PushChangeType.UPDATE_DIFF: + # update_diff + update_diff = update_diff with latest info + current.diffs.extend(existing.diffs or []) + result[path] = current else: - result[current_data.path] = current_data + # delete + anything = anything + result[path] = current + else: + result[current.path] = current return list(result.values()) @classmethod @@ -985,30 +983,37 @@ def _create_checkpoint( cls, project_id: str, checkpoint: Checkpoint, - individual_changes: List[ProjectVersionChange], + changes: List[ProjectVersionChange] = [], ) -> Optional[ProjectVersionChange]: """ Creates and caches a new ProjectVersionChange checkpoint and any required FileDiff checkpoints. 
""" - individual_changes_range = [ + changes_range = [ change - for change in individual_changes - if checkpoint.start < change.version.name <= checkpoint.end + for change in changes + if checkpoint.start <= change.version.name <= checkpoint.end ] - if not individual_changes_range: + if not changes_range: logging.warning( f"No individual changes found for project {project_id} in range v{checkpoint.start}-v{checkpoint.end} to create checkpoint." ) return None - merged_deltas = cls.merge_changes(individual_changes_range) - versioned_deltas = [ - item for item in merged_deltas if is_versioned_file(item.path) + # dump delta objects from database and flatten list for merging + deltas = [] + for change in changes_range: + delta_items = DeltaDataSchema(many=True).load(change.delta) + deltas.extend(delta_items) + merged_delta_items: List[DeltaData] = [ + d.to_data_delta() for d in cls.merge_delta_items(deltas) ] # Pre-fetch data for all versioned files to create FileDiff checkpoints - versioned_file_paths = [delta.path for delta in versioned_deltas] + versioned_delta_items = [ + item for item in merged_delta_items if is_versioned_file(item.path) + ] + versioned_file_paths = [delta.path for delta in versioned_delta_items] if versioned_file_paths: file_paths = ProjectFilePath.query.filter( ProjectFilePath.project_id == project_id, @@ -1016,7 +1021,7 @@ def _create_checkpoint( ).all() file_path_map = {fp.path: fp.id for fp in file_paths} - for item in versioned_deltas: + for item in versioned_delta_items: file_path_id = file_path_map.get(item.path) if not file_path_id: continue @@ -1043,21 +1048,35 @@ def _create_checkpoint( version=checkpoint.end, ) # Patch the delta with the path to the new diff checkpoint - item.diffs = [ChangeDiffFile(path=diff_path, size=0)] + if item.change == PushChangeType.UPDATE_DIFF: + item.diff = diff_path db.session.add(checkpoint_diff) checkpoint_change = ProjectVersionChange( - version_id=individual_changes_range[-1].version_id, + version_id=changes_range[-1].version_id, rank=checkpoint.rank, - delta=[{**asdict(c), "change": c.change.value} for c in merged_deltas], + delta=DeltaDataSchema(many=True).dump(merged_delta_items), ) db.session.add(checkpoint_change) + db.session.commit() return checkpoint_change + @classmethod + def query_changes(cls, project_id, since, to, rank=None): + """Query changes with specified parameters""" + query = cls.query.join(ProjectVersion).filter( + ProjectVersion.project_id == project_id, + ProjectVersion.name >= since, + ProjectVersion.name <= to, + ) + if rank is not None: + query = query.filter(ProjectVersionChange.rank == rank) + return query.order_by(ProjectVersion.name).all() + @classmethod def get_delta( cls, project_id: str, since: int, to: int - ) -> Optional[List[ProjectVersionChangeDelta]]: + ) -> Optional[List[DeltaMerged]]: """ Get changes between two versions, merging them if needed. - create FileDiff checkpoints if needed @@ -1079,11 +1098,7 @@ def get_delta( ) .first() ) - return ( - [ProjectVersionChangeDeltaSchema().load(item) for item in change.delta] - if change - else [] - ) + return [DeltaMerged(**item) for item in change.delta] if change else None expected_checkpoints = Checkpoint.get_checkpoints(since + 1, to) expected_changes: List[ProjectVersionChange] = ( @@ -1104,28 +1119,21 @@ def get_delta( # Cache all individual (rank 0) changes in the required range. 
individual_changes: List[ProjectVersionChange] = [] - changes = [] + result: List[DeltaData] = [] for checkpoint in expected_checkpoints: expected_change = existing_changes_map.get( (checkpoint.rank, checkpoint.end) ) + # we have change in database, just return delta data from it if expected_change: - changes.append(expected_change) + deltas = DeltaDataSchema(many=True).load(expected_change.delta) + result.extend(deltas) continue if checkpoint.rank > 0: individual_changes = ( - ( - ProjectVersionChange.query.join(ProjectVersion) - .filter( - ProjectVersion.project_id == project_id, - ProjectVersion.name > since, - ProjectVersion.name <= to, - ProjectVersionChange.rank == 0, - ) - .all() - ) + cls.query_changes(project_id, checkpoint.start, checkpoint.end, 0) if not individual_changes else individual_changes ) @@ -1133,12 +1141,10 @@ def get_delta( project_id, checkpoint, individual_changes ) if new_checkpoint: - changes.append(new_checkpoint) + deltas = DeltaDataSchema(many=True).load(new_checkpoint.delta) + result.extend(deltas) - db.session.commit() - - deltas = cls.merge_changes(changes) - return deltas + return cls.merge_delta_items(result) class ProjectVersion(db.Model): @@ -1237,25 +1243,21 @@ def __init__( # cache changes data json for version checkpoints # rank 0 is for all changes from start to current version - changes_data = [ - ProjectVersionChangeDelta( + delta_data = [ + DeltaData( path=c.path, change=c.change, size=c.size, checksum=c.checksum, - version=self.to_v_name(name), - diffs=( - [ChangeDiffFile(path=c.diff.path, size=c.diff.size)] - if c.diff - else None - ), + version=name, + diff=c.diff.path if c.diff else None, ) for c in changes ] pvc = ProjectVersionChange( version=self, rank=0, - delta=ProjectVersionChangeDeltaSchema(many=True).dump(changes_data), + delta=DeltaDataSchema(many=True).dump(delta_data), ) db.session.add(pvc) diff --git a/server/mergin/sync/public_api_v2.yaml b/server/mergin/sync/public_api_v2.yaml index c3f6d15a..4cd045c5 100644 --- a/server/mergin/sync/public_api_v2.yaml +++ b/server/mergin/sync/public_api_v2.yaml @@ -393,15 +393,15 @@ paths: in: query required: true schema: - type: string - example: v1 + type: integer + example: 1 description: Start version (exclusive) - name: to in: query required: true schema: - type: string - example: v2 + type: integer + example: 2 description: End version (inclusive) responses: "200": @@ -855,8 +855,8 @@ components: type: string example: 9adb76bf81a34880209040ffe5ee262a090b62ab version: - type: string - example: v2 + type: integer + example: 1 change: type: string enum: [create, update, delete, update_diff] @@ -870,6 +870,3 @@ components: path: type: string example: survey.gpkg-diff-1 - size: - type: integer - example: 512 diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py index 3a2ec924..a63332c4 100644 --- a/server/mergin/sync/public_api_v2_controller.py +++ b/server/mergin/sync/public_api_v2_controller.py @@ -38,7 +38,7 @@ StorageLimitHit, UploadError, ) -from .files import ChangesSchema, ProjectVersionChangeDeltaSchema +from .files import ChangesSchema, DeltaRespSchema from .forms import project_name_validation from .models import ( Project, @@ -407,8 +407,8 @@ def upload_chunk(id: str): @auth_required def get_project_delta(id: str): """Get project changes (delta) between two versions""" - since = ProjectVersion.from_v_name(request.args.get("since")) - to = ProjectVersion.from_v_name(request.args.get("to")) + since = int(request.args.get("since")) 
+ to = int(request.args.get("to")) project = require_project_by_uuid(id, ProjectPermissions.Read) if since < 0 or to < 0: abort(400, "Invalid 'since' or 'to' version") @@ -424,6 +424,6 @@ def get_project_delta(id: str): ProjectVersion.name == to, ).first_or_404() - delta = ProjectVersionChange.get_delta(project.id, since, to) + deltas = ProjectVersionChange.get_delta(project.id, since, to) - return ProjectVersionChangeDeltaSchema(many=True).dump(delta), 200 + return DeltaRespSchema(many=True).dump(deltas), 200 diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index ed6d8912..83d31b72 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -8,7 +8,15 @@ import uuid from pygeodiff import GeoDiffLibError -from .utils import add_user, diffs_are_equal, execute_query, push_change +from .utils import ( + add_user, + create_project, + create_workspace, + diffs_are_equal, + execute_query, + login_as_admin, + push_change, +) from ..app import db from tests import test_project, test_workspace_id from ..config import Configuration @@ -21,6 +29,7 @@ ProjectVersionChange, ) from ..sync.files import PushChangeType +from ..sync.utils import is_versioned_file from sqlalchemy.exc import IntegrityError import pytest from datetime import datetime, timedelta, timezone @@ -317,9 +326,9 @@ def test_create_diff_checkpoint(diff_project): def test_project_version_change_delta(diff_project): """Test that ProjectVersionChangeData and its schema work as expected""" - version = diff_project.get_latest_version() + latest_version = diff_project.get_latest_version() project_id = diff_project.id - assert version.name == 10 + assert latest_version.name == 10 assert ProjectVersionChange.get_delta(project_id, 2, 1) is None pvcs: List[ProjectVersionChange] = ( ProjectVersionChange.query.join(ProjectVersion) @@ -352,40 +361,47 @@ def test_project_version_change_delta(diff_project): # get_delta with update diff delta = ProjectVersionChange.get_delta(project_id, 1, 4) assert len(delta) == 1 - assert delta[0].change == PushChangeType.UPDATE_DIFF + assert delta[0].change == PushChangeType.CREATE assert ProjectVersionChange.query.filter_by(rank=1).count() == 0 # create rank 1 checkpoint for v4 delta = ProjectVersionChange.get_delta(project_id, 0, 4) + fh = FileHistory.query.filter_by(project_version_name=3).first() checkpoint_changes = ProjectVersionChange.query.filter_by(rank=1) filediff_checkpoints = FileDiff.query.filter_by(rank=1) checkpoint_change = checkpoint_changes.first() - filediff_checkpoint = filediff_checkpoints.first() + # find checkpoint for base gpkg + base_gpkg_checkpoint = FileDiff.query.filter_by(basefile_id=fh.id, rank=1).first() assert checkpoint_changes.count() == 1 assert checkpoint_change.version_id == pvcs[3].version_id - assert filediff_checkpoints.count() == 1 - assert filediff_checkpoint.version == 4 - # check if filediff basefile is correctly set - assert ( - filediff_checkpoint.basefile_id - == FileHistory.query.filter_by(project_version_name=3).first().id + assert filediff_checkpoints.count() == len( + [file for file in initial_version.files if is_versioned_file(file.path)] ) + assert base_gpkg_checkpoint.version == 4 + # check if filediff basefile is correctly set + assert base_gpkg_checkpoint.basefile_id == fh.id file_history = FileHistory.query.filter_by(project_version_name=4).first() - assert delta[0].change == PushChangeType.UPDATE_DIFF - assert delta[0].version == "v4" - assert delta[0].path == 
file_history.path
-    assert delta[0].size == file_history.size
-    assert delta[0].checksum == file_history.checksum
-    assert len(delta[0].diffs) == 1
-    assert delta[0].diffs[0]["path"] == filediff_checkpoint.path
-    assert delta[0].diffs[0]["size"] == 0
+    assert len(delta) == len(initial_version.files)
+    delta_base_gpkg = [d for d in delta if d.path == "base.gpkg"]
+    assert len(delta_base_gpkg) == 1
+    # from the history it is clear that we are just creating the geopackage in this range
+    assert delta_base_gpkg[0].change == PushChangeType.CREATE
+    assert delta_base_gpkg[0].version == 3
+    assert delta_base_gpkg[0].path == file_history.path
+    assert delta_base_gpkg[0].size == file_history.size
+    assert delta_base_gpkg[0].checksum == file_history.checksum
+    assert len(delta_base_gpkg[0].diffs) == 1
+    assert delta_base_gpkg[0].diffs[0].path == base_gpkg_checkpoint.path
 
     # get data with multiple ranks = 1 level checkpoints 1-4, 5-8 + checkpoint 9 and 10
     delta = ProjectVersionChange.get_delta(project_id, 0, 10)
-    assert len(delta) == 2
-    assert delta[0].change == PushChangeType.DELETE
-    assert delta[1].change == PushChangeType.CREATE
+    assert len(delta) == len(latest_version.files)
+    delta_test_gpkg = [d for d in delta if d.path == "test.gpkg"]
+    assert delta_test_gpkg
+    assert delta_test_gpkg[0].change == PushChangeType.CREATE
     assert ProjectVersionChange.query.filter_by(rank=1).count() == 2
+    # base gpkg is transparent
+    assert not next((c for c in delta if c.path == "base.gpkg"), None)
 
     pv = push_change(
         diff_project, "removed", "test.gpkg", diff_project.storage.project_dir
@@ -394,6 +410,16 @@ def test_project_version_change_delta(diff_project):
     assert len(delta) == 1
     # test.gpkg is transparent as it was created and deleted in this range
     assert not next((c for c in delta if c.path == "test.gpkg"), None)
+    delta_base_gpkg = next((c for c in delta if c.path == "base.gpkg"), None)
+    assert delta_base_gpkg.change == PushChangeType.DELETE
+
+    # check update diff
+    delta = ProjectVersionChange.get_delta(project_id, 5, 7)
+    assert len(delta) == 1
+    assert delta[0].change == PushChangeType.UPDATE_DIFF
+    assert len(delta[0].diffs) == 2
+    # find related diff file in file diffs to check relation
+    assert FileDiff.query.filter_by(path=delta[0].diffs[0].path)
 
 
 push_data = [
@@ -741,35 +767,59 @@ def test_full_push(client):
 
 
 def test_project_delta(client, diff_project):
     """Test project delta endpoint"""
+    login_as_admin(client)
+    user = add_user()
+    workspace = create_workspace()
+    initial_project = create_project("empty_project", workspace=workspace, user=user)
+    working_dir = os.path.join(TMP_DIR, "empty_work_dir")
+    os.makedirs(os.path.join(TMP_DIR, "empty_work_dir"), exist_ok=True)
+    # add basefile
+    shutil.copy(
+        os.path.join(test_project_dir, "base.gpkg"),
+        os.path.join(working_dir, "base.gpkg"),
+    )
+    push_change(initial_project, "added", "base.gpkg", working_dir)
+    response = client.get(f"v2/projects/{initial_project.id}/delta?since=0&to=1")
+    assert response.status_code == 200
     response = client.get(f"v2/projects/{diff_project.id}/delta")
     assert response.status_code == 400
 
-    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v-1&to=v1")
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=-1&to=1")
    assert response.status_code == 400
 
-    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v1000&to=v2000")
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=1000&to=2000")
    assert response.status_code == 404
 
-    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v1&to=v10")
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=1&to=10")
     assert response.status_code == 200
-    assert response.json[0]["change"] == PushChangeType.DELETE.value
-    assert response.json[1]["change"] == PushChangeType.CREATE.value
+    assert len(response.json) == 1
+    assert response.json[0]["change"] == PushChangeType.CREATE.value
+    assert response.json[0]["version"] == 9
+
+    # simulate update
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=4&to=8")
+    assert response.status_code == 200
+    delta = response.json
+    assert len(delta) == 1
+
+    # simulate pull of delta[0]
+    assert delta[0]["change"] == PushChangeType.UPDATE.value
+    assert delta[0]["version"] == 7
+    assert not delta[0].get("diffs")
 
 
 # integration test for pull mechanism
 def test_project_pull(client, diff_project):
-    response = client.get(f"v2/projects/{diff_project.id}/delta?since=v4&to=v8")
+    """Test project pull mechanism in v2"""
+
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since=5&to=7")
     assert response.status_code == 200
     delta = response.json
     assert len(delta) == 1
-
-    # simulate pull of delta[0]
     assert delta[0]["change"] == PushChangeType.UPDATE_DIFF.value
-    assert delta[0]["version"] == "v7"
-    assert len(delta[0]["diffs"]) == 1
+    assert delta[0]["version"] == 7
     diff = delta[0]["diffs"][0]
     assert diff["path"].startswith("base.gpkg-")
-    assert diff["size"] == 0  # empty diff as in test data
     response = client.get(f"v2/projects/{diff_project.id}/raw/diff/{diff['path']}")
     assert response.status_code == 200
     created_diff = FileDiff.query.filter_by(path=diff["path"]).first()
diff --git a/server/migrations/community/63adc90fca0c_add_project_version_changes.py b/server/migrations/community/63adc90fca0c_add_project_version_changes.py
index 5d7ffe8d..bb279743 100644
--- a/server/migrations/community/63adc90fca0c_add_project_version_changes.py
+++ b/server/migrations/community/63adc90fca0c_add_project_version_changes.py
@@ -54,6 +54,44 @@ def upgrade():
     )
     # ### end Alembic commands ###
 
+    # data migration
+    op.execute(
+        """
+        INSERT INTO project_version_change (version_id, rank, delta)
+        SELECT
+            h.version_id,
+            0 AS rank,
+            jsonb_agg(
+                jsonb_strip_nulls(
+                    jsonb_build_object(
+                        'path', fp.path,
+                        'size', h.size,
+                        'change', h.change,
+                        'version', 'v' || h.project_version_name,
+                        'checksum', h.checksum,
+                        'diff', fdj.diff_path
+                    )
+                )
+            ) AS delta
+        FROM
+            file_history h
+        JOIN
+            project_file_path fp ON h.file_path_id = fp.id
+        LEFT JOIN LATERAL (
+            SELECT
+                fd.path AS diff_path
+            FROM
+                file_diff fd
+            WHERE
+                fd.file_path_id = fp.id
+                AND fd.version = h.project_version_name
+                AND fd.rank = 0
+        ) fdj ON TRUE
+        GROUP BY
+            h.version_id;
+        """
+    )
+
 
 def downgrade():
     # ### commands auto generated by Alembic - please adjust! ###
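
For orientation before the next patch: the backfill above gives each project
version one rank 0 row whose delta column holds a JSON array with one object
per changed file. A hedged sketch of the result follows; the keys come from
the jsonb_build_object call, the values are invented for illustration, and
jsonb_strip_nulls drops the "diff" key whenever the LATERAL join finds no
rank 0 file_diff row. Note that version is built here as the string
'v' || name; a later patch in this series reworks the migration to store
plain integers.

    # Hypothetical contents of one project_version_change.delta value
    example_delta = [
        {
            "path": "survey.gpkg",
            "size": 98304,
            "change": "update_diff",
            "version": "v5",
            "checksum": "9adb76bf81a34880209040ffe5ee262a090b62ab",
            "diff": "survey.gpkg-diff-1",
        },
        {
            # no rank 0 diff row for this change, so no "diff" key at all
            "path": "notes.txt",
            "size": 42,
            "change": "create",
            "version": "v5",
            "checksum": "<sha1 of the file>",
        },
    ]
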
From 29a9eeff19263164acdda211ef7be240b41740cd Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Thu, 9 Oct 2025 18:26:12 +0200
Subject: [PATCH 18/36] Fix missing import

---
 server/mergin/auth/models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/server/mergin/auth/models.py b/server/mergin/auth/models.py
index 3ab05f6b..5dcf275e 100644
--- a/server/mergin/auth/models.py
+++ b/server/mergin/auth/models.py
@@ -11,6 +11,7 @@
 from sqlalchemy import or_, func, text
 
 from ..app import db
+from ..sync.models import ProjectUser
 from ..sync.utils import get_user_agent, get_ip, get_device_id, is_reserved_word
 
 MAX_USERNAME_LENGTH = 50

From c53b9284cbb4c608b43c8941d9d176c4a7f2c2a4 Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Thu, 9 Oct 2025 19:42:05 +0200
Subject: [PATCH 19/36] Fix tests and add checkpoints just in UPDATE_DIFF

---
 server/mergin/sync/models.py              | 10 +++---
 server/mergin/tests/test_public_api_v2.py | 44 ++++++++++++++---------
 2 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py
index 0d3634e5..7d4bc6af 100644
--- a/server/mergin/sync/models.py
+++ b/server/mergin/sync/models.py
@@ -1011,7 +1011,10 @@ def _create_checkpoint(
 
         # Pre-fetch data for all versioned files to create FileDiff checkpoints
         versioned_delta_items = [
-            item for item in merged_delta_items if is_versioned_file(item.path)
+            item
+            for item in merged_delta_items
+            if is_versioned_file(item.path)
+            and item.change == PushChangeType.UPDATE_DIFF
         ]
         versioned_file_paths = [delta.path for delta in versioned_delta_items]
         if versioned_file_paths:
@@ -1048,8 +1051,7 @@ def _create_checkpoint(
                 version=checkpoint.end,
             )
             # Patch the delta with the path to the new diff checkpoint
-            if item.change == PushChangeType.UPDATE_DIFF:
-                item.diff = diff_path
+            item.diff = diff_path
             db.session.add(checkpoint_diff)
 
         checkpoint_change = ProjectVersionChange(
@@ -1133,7 +1135,7 @@ def get_delta(
 
             if checkpoint.rank > 0:
                 individual_changes = (
-                    cls.query_changes(project_id, checkpoint.start, checkpoint.end, 0)
+                    cls.query_changes(project_id, since, to, 0)
                     if not individual_changes
                     else individual_changes
                 )
diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py
index 83d31b72..c80117e0 100644
--- a/server/mergin/tests/test_public_api_v2.py
+++ b/server/mergin/tests/test_public_api_v2.py
@@ -366,20 +366,13 @@ def test_project_version_change_delta(diff_project):
 
     # create rank 1 checkpoint for v4
     delta = ProjectVersionChange.get_delta(project_id, 0, 4)
-    fh = FileHistory.query.filter_by(project_version_name=3).first()
     checkpoint_changes = ProjectVersionChange.query.filter_by(rank=1)
     filediff_checkpoints = FileDiff.query.filter_by(rank=1)
     checkpoint_change = checkpoint_changes.first()
-    # find checkpoint for base gpkg
-    base_gpkg_checkpoint = FileDiff.query.filter_by(basefile_id=fh.id, rank=1).first()
     assert checkpoint_changes.count() == 1
     assert checkpoint_change.version_id == pvcs[3].version_id
-    assert filediff_checkpoints.count() == len(
-        [file for file in initial_version.files if is_versioned_file(file.path)]
-    )
-    assert base_gpkg_checkpoint.version == 4
+    assert filediff_checkpoints.count() == 0
     # check if filediff basefile is correctly set
-    assert base_gpkg_checkpoint.basefile_id == fh.id
     file_history = FileHistory.query.filter_by(project_version_name=4).first()
     assert len(delta) == len(initial_version.files)
     delta_base_gpkg = [d for d in delta if d.path == "base.gpkg"]
@@ -390,8 +383,7 @@ def test_project_version_change_delta(diff_project):
     assert delta_base_gpkg[0].path == file_history.path
     assert delta_base_gpkg[0].size == file_history.size
     assert delta_base_gpkg[0].checksum == file_history.checksum
-    assert len(delta_base_gpkg[0].diffs) == 1
-    assert delta_base_gpkg[0].diffs[0].path == base_gpkg_checkpoint.path
+    assert len(delta_base_gpkg[0].diffs) == 0
 
     # get data with multiple ranks = 1 level checkpoints 1-4, 5-8 + checkpoint 9 and 10
     delta = ProjectVersionChange.get_delta(project_id, 0, 10)
@@ -403,13 +395,9 @@ def test_project_version_change_delta(diff_project):
     # base gpkg is transparent
     assert not next((c for c in delta if c.path == "base.gpkg"), None)
 
-    pv = push_change(
-        diff_project, "removed", "test.gpkg", diff_project.storage.project_dir
+    delta = ProjectVersionChange.get_delta(
+        project_id, latest_version.name - 3, latest_version.name
     )
-    delta = ProjectVersionChange.get_delta(project_id, pv.name - 3, pv.name)
-    assert len(delta) == 1
-    # test.gpkg is transparent as it was created and deleted in this range
-    assert not next((c for c in delta if c.path == "test.gpkg"), None)
     delta_base_gpkg = next((c for c in delta if c.path == "base.gpkg"), None)
     assert delta_base_gpkg.change == PushChangeType.DELETE
 
@@ -421,6 +409,30 @@ def test_project_version_change_delta(diff_project):
     # find related diff file in file diffs to check relation
     assert FileDiff.query.filter_by(path=delta[0].diffs[0].path)
 
+    # create just update_diff versions with checkpoint
+    base_gpkg = os.path.join(diff_project.storage.project_dir, "test.gpkg")
+    shutil.copy(
+        os.path.join(diff_project.storage.project_dir, "v9", "test.gpkg"), base_gpkg
+    )
+    for i in range(7):
+        sql = f"UPDATE simple SET rating={i}"
+        execute_query(base_gpkg, sql)
+    pv = push_change(
+        diff_project, "updated", "test.gpkg", diff_project.storage.project_dir
+    )
+    delta = ProjectVersionChange.get_delta(project_id, 8, latest_version.name + 6)
+    assert len(delta) == 2
+    # file history in the 9th version is the basefile
+    fh = FileHistory.query.filter_by(
+        project_version_name=latest_version.name - 1
+    ).first()
+    base_gpkg_checkpoint = FileDiff.query.filter_by(basefile_id=fh.id, rank=1).first()
+    assert base_gpkg_checkpoint.basefile_id == fh.id
+
+    delta = ProjectVersionChange.get_delta(project_id, 12, latest_version.name + 6)
+    assert len(delta) == 1
+    assert delta[0].diffs[0].path == base_gpkg_checkpoint.path
 
 
 push_data = [
     # success
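
The checkpoint tests above depend on how a version range decomposes into
ranked blocks. Checkpoint.get_checkpoints itself is not visible in this
series, but the layout the tests assert (v1-v4 and v5-v8 merged at rank 1,
v9 and v10 left at rank 0) is consistent with aligned blocks of 4**rank
versions. The following standalone sketch reproduces that layout under that
assumption; it is an illustration, not the server's implementation:

    def get_checkpoints_sketch(start: int, end: int) -> list:
        """Decompose the inclusive range start..end into (rank, first, last) blocks."""
        result = []
        v = start
        while v <= end:
            rank, size = 0, 1
            # grow the block while it stays aligned to 4**rank and fits the range
            while v % (size * 4) == 1 and v + size * 4 - 1 <= end:
                size *= 4
                rank += 1
            result.append((rank, v, v + size - 1))
            v += size
        return result

    assert get_checkpoints_sketch(1, 10) == [(1, 1, 4), (1, 5, 8), (0, 9, 9), (0, 10, 10)]

Under such a scheme the number of blocks, and so the number of cached deltas
merged per pull, grows only logarithmically with the length of the history,
which is presumably the point of the exponential rank noted in the column
comments.
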
From dce0e1147427749b1bfaed9a4e1ade0321c9f2b8 Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Thu, 9 Oct 2025 19:43:46 +0200
Subject: [PATCH 20/36] Add safe check for downloading

---
 server/mergin/tests/test_public_api_v2.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py
index c80117e0..e5667f8a 100644
--- a/server/mergin/tests/test_public_api_v2.py
+++ b/server/mergin/tests/test_public_api_v2.py
@@ -324,7 +324,7 @@ def test_create_diff_checkpoint(diff_project):
     assert not os.path.exists(diff.abs_path)
 
 
-def test_project_version_change_delta(diff_project):
+def test_project_version_change_delta(client, diff_project):
     """Test that ProjectVersionChangeData and its schema work as expected"""
     latest_version = diff_project.get_latest_version()
     project_id = diff_project.id
@@ -432,6 +432,11 @@ def test_project_version_change_delta(client, diff_project):
     delta = ProjectVersionChange.get_delta(project_id, 12, latest_version.name + 6)
     assert len(delta) == 1
     assert delta[0].diffs[0].path == base_gpkg_checkpoint.path
+    # check if checkpoint will be there
+    response = client.get(
+        f"v2/projects/{diff_project.id}/raw/diff/{delta[0].diffs[0].path}"
+    )
+    assert response.status_code == 200
 
 
 push_data = [

From 832603dc4a0c0bc34a54ca478f623fc751fa37b5 Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Thu, 9 Oct 2025 20:37:16 +0200
Subject: [PATCH 21/36] Improve tests

---
 server/mergin/tests/test_public_api_v2.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py
index e5667f8a..85fcbe9f 100644
--- a/server/mergin/tests/test_public_api_v2.py
+++ b/server/mergin/tests/test_public_api_v2.py
@@ -414,7 +414,7 @@ def test_project_version_change_delta(client, diff_project):
     shutil.copy(
         os.path.join(diff_project.storage.project_dir, "v9", "test.gpkg"), base_gpkg
     )
-    for i in range(7):
+    for i in range(6):
         sql = f"UPDATE simple SET rating={i}"
         execute_query(base_gpkg, sql)
     pv = push_change(
@@ -427,11 +427,20 @@ def test_project_version_change_delta(client, diff_project):
         project_version_name=latest_version.name - 1
     ).first()
     base_gpkg_checkpoint = FileDiff.query.filter_by(basefile_id=fh.id, rank=1).first()
-    assert base_gpkg_checkpoint.basefile_id == fh.id
+    assert base_gpkg_checkpoint
+    assert base_gpkg_checkpoint.version == latest_version.name + 6
 
+    fh = FileHistory.query.filter_by(
+        project_version_name=latest_version.name + 6
+    ).first()
     delta = ProjectVersionChange.get_delta(project_id, 12, latest_version.name + 6)
     assert len(delta) == 1
+    assert len(delta[0].diffs) == 1
     assert delta[0].diffs[0].path == base_gpkg_checkpoint.path
+    assert delta[0].change == PushChangeType.UPDATE_DIFF
+    assert delta[0].checksum == fh.checksum
+    assert delta[0].size == fh.size
 
     # check if checkpoint will be there
     response = client.get(
         f"v2/projects/{diff_project.id}/raw/diff/{delta[0].diffs[0].path}"

From a9140e804d8369eb64b65e7038a6d67ddb01c836 Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Fri, 10 Oct 2025 18:46:24 +0200
Subject: [PATCH 22/36] Address comments @varmar05

1. 
part - rename table to project version delta with column changes - rename classes - add get_delta_changes to project instance (nice) - fix migration to int --- server/mergin/sync/files.py | 40 +-- server/mergin/sync/models.py | 257 +++++++++--------- server/mergin/sync/public_api_v2.yaml | 8 +- .../mergin/sync/public_api_v2_controller.py | 18 +- server/mergin/tests/test_public_api_v2.py | 47 ++-- ...9acf967e58ad_add_project_version_delta.py} | 45 ++- 6 files changed, 203 insertions(+), 212 deletions(-) rename server/migrations/community/{63adc90fca0c_add_project_version_changes.py => 9acf967e58ad_add_project_version_delta.py} (65%) diff --git a/server/mergin/sync/files.py b/server/mergin/sync/files.py index 9838fba6..f8103926 100644 --- a/server/mergin/sync/files.py +++ b/server/mergin/sync/files.py @@ -245,26 +245,34 @@ def patch_field(self, data, **kwargs): @dataclass class DeltaDiffFile: + """Diff file path in diffs list""" + path: str -class DeltaDiffFileSchema(ma.Schema): +class DeltaChangeDiffFileSchema(ma.Schema): + """Schema for diff file path in diffs list""" + path = fields.String(required=True) @dataclass -class DeltaBase(File): +class DeltaChangeBase(File): + """Base class for changes stored in json list or returned from delta endpoint""" + change: PushChangeType version: int @dataclass -class DeltaMerged(DeltaBase): +class DeltaChangeMerged(DeltaChangeBase): + """Delta item with merged diffs to list of multiple diff files""" + diffs: List[DeltaDiffFile] = field(default_factory=list) def to_data_delta(self): """Convert DeltaMerged to DeltaData with single diff""" - result = DeltaData( + result = DeltaChange( path=self.path, size=self.size, checksum=self.checksum, @@ -277,16 +285,14 @@ def to_data_delta(self): @dataclass -class DeltaData(File): - """Delta data stored in database""" +class DeltaChange(DeltaChangeBase): + """Delta items stored in database as list of this item with single diff file""" - change: PushChangeType - version: int diff: Optional[str] = None - def to_merged_delta(self) -> DeltaMerged: + def to_merged_delta(self) -> DeltaChangeMerged: """Convert DeltaData to DeltaMerged with multiple diffs""" - result = DeltaMerged( + result = DeltaChangeMerged( path=self.path, size=self.size, checksum=self.checksum, @@ -298,8 +304,8 @@ def to_merged_delta(self) -> DeltaMerged: return result -class DeltaBaseSchema(ma.Schema): - """Base schema for detla json and response from delta endpoint""" +class DeltaChangeBaseSchema(ma.Schema): + """Base schema for delta json and response from delta endpoint""" path = fields.String(required=True) size = fields.Integer(required=True) @@ -308,14 +314,14 @@ class DeltaBaseSchema(ma.Schema): change = fields.Enum(PushChangeType, by_value=True, required=True) -class DeltaDataSchema(DeltaBaseSchema): - """Schema for delta data in database""" +class DeltaChangeSchema(DeltaChangeBaseSchema): + """Schema for change data in changes column""" diff = fields.String(required=False) @post_load def make_object(self, data, **kwargs): - return DeltaData(**data) + return DeltaChange(**data) @post_dump def patch_field(self, data, **kwargs): @@ -325,10 +331,10 @@ def patch_field(self, data, **kwargs): return data -class DeltaRespSchema(DeltaBaseSchema): +class DeltaChangeRespSchema(DeltaChangeBaseSchema): """Schema for delta data response""" - diffs = fields.List(fields.Nested(DeltaDiffFileSchema())) + diffs = fields.List(fields.Nested(DeltaChangeDiffFileSchema())) @post_dump def patch_field(self, data, **kwargs): diff --git a/server/mergin/sync/models.py 
b/server/mergin/sync/models.py
index 7d4bc6af..6fd6a6a4 100644
--- a/server/mergin/sync/models.py
+++ b/server/mergin/sync/models.py
@@ -24,10 +24,10 @@
 from flask import current_app
 
 from .files import (
-    DeltaMerged,
+    DeltaChangeMerged,
     DeltaDiffFile,
-    DeltaData,
-    DeltaDataSchema,
+    DeltaChange,
+    DeltaChangeSchema,
     ProjectDiffFile,
     ProjectFileChange,
     ChangesSchema,
@@ -353,6 +353,86 @@ def bulk_roles_update(self, access: Dict) -> Set[int]:
 
         return set(id_diffs)
 
+    def get_delta_changes(
+        self, since: int, to: int
+    ) -> Optional[List[DeltaChangeMerged]]:
+        """
+        Get changes between two versions, merging them if needed.
+        - create FileDiff checkpoints if needed
+        - create ProjectVersionDelta checkpoints if needed with delta json
+        """
+        if since > to:
+            logging.error(
+                f"Start version {since} is higher than end version {to} - broken history"
+            )
+            return
+        if since == to:
+            return None
+        project_id = self.id
+        expected_checkpoints = Checkpoint.get_checkpoints(since + 1, to)
+        expected_deltas: List[ProjectVersionDelta] = (
+            ProjectVersionDelta.query.join(ProjectVersion)
+            .filter(
+                ProjectVersion.project_id == project_id,
+                ProjectVersion.name > since,
+                ProjectVersion.name <= to,
+                tuple_(ProjectVersionDelta.rank, ProjectVersion.name).in_(
+                    [(item.rank, item.end) for item in expected_checkpoints]
+                ),
+            )
+            .order_by(ProjectVersion.name)
+            .all()
+        )
+        existing_delta_map = {(c.rank, c.version.name): c for c in expected_deltas}
+
+        # Cache all individual (rank 0) delta rows in the required range.
+        individual_deltas: List[ProjectVersionDelta] = []
+
+        result: List[DeltaChange] = []
+        for checkpoint in expected_checkpoints:
+            existing_delta = existing_delta_map.get((checkpoint.rank, checkpoint.end))
+
+            # the change is already in the database, just return its delta data
+            if existing_delta:
+                result.extend(DeltaChangeSchema(many=True).load(existing_delta.changes))
+                continue
+
+            # If a higher rank delta checkpoint does not exist yet, use rank=0 deltas to create the checkpoint
+            if checkpoint.rank > 0:
+                individual_deltas = (
+                    ProjectVersionDelta.query.join(ProjectVersion)
+                    .filter(
+                        ProjectVersion.project_id == project_id,
+                        ProjectVersion.name >= since,
+                        ProjectVersion.name <= to,
+                        ProjectVersionDelta.rank == 0,
+                    )
+                    .order_by(ProjectVersion.name)
+                    .all()
+                    if not individual_deltas
+                    else individual_deltas
+                )
+
+                if not individual_deltas:
+                    logging.error(
+                        f"No individual deltas found for project {project_id} in range {since} / {to} to create checkpoint."
+                    )
+                    return
+
+            new_checkpoint = ProjectVersionDelta.create_checkpoint(
+                project_id, checkpoint, individual_deltas
+            )
+            if new_checkpoint:
+                result.extend(
+                    DeltaChangeSchema(many=True).load(new_checkpoint.changes)
+                )
+            else:
+                logging.error(
+                    f"Not possible to create checkpoint for project {project_id} in range {checkpoint.start}-{checkpoint.end}"
+                )
+
+        return ProjectVersionDelta.merge_delta_changes(result)
+
 
 class ProjectRole(Enum):
     """Project roles ordered by rank (do not change)"""
@@ -905,7 +985,7 @@ def construct_checkpoint(self) -> bool:
         return True
 
 
-class ProjectVersionChange(db.Model):
+class ProjectVersionDelta(db.Model):
     id = db.Column(db.BigInteger, primary_key=True, autoincrement=True)
     # exponential order of changes json
     rank = db.Column(db.Integer, nullable=False, index=True)
@@ -917,12 +997,12 @@ class ProjectVersionChange(db.Model):
         nullable=False,
     )
     # cached changes for versions from start to end (inclusive)
-    delta = db.Column(JSONB, nullable=False)
+    changes = db.Column(JSONB, nullable=False)
 
     __table_args__ = (
-        db.UniqueConstraint("version_id", "rank", name="unique_changes"),
+        db.UniqueConstraint("version_id", "rank", name="unique_deltas"),
         db.Index(
-            "ix_project_version_change_version_id_rank",
+            "ix_project_version_delta_version_id_rank",
             version_id,
             rank,
         ),
@@ -933,80 +1013,78 @@ class ProjectVersionChange(db.Model):
     )
 
     @staticmethod
-    def merge_delta_items(
-        items: List[DeltaData],
-    ) -> List[DeltaMerged]:
+    def merge_delta_changes(
+        items: List[DeltaChange],
+    ) -> List[DeltaChangeMerged]:
         """
-        Merge multiple changes jsons into one list of changes.
+        Merge multiple changes json array objects into one list of changes.
         Changes are merged based on file path and change type.
         """
-        result: Dict[str, DeltaMerged] = {}
-        for item in items:
-            current = item.to_merged_delta()
-            existing = result.get(current.path)
-            path = current.path
-            if existing:
-                # merge changes data jsons
-                if existing.change == PushChangeType.CREATE:
-                    if current.change == PushChangeType.DELETE:
-                        # create + delete = nothing
-                        del result[path]
-                    elif current.change in (
-                        PushChangeType.UPDATE,
-                        PushChangeType.UPDATE_DIFF,
-                    ):
-                        # create + update = create with updated info
-                        current.change = existing.change
-                        current.diffs = []
+        result: Dict[str, DeltaChangeMerged] = {}
+        for item in items:
+            current = item.to_merged_delta()
+            previous = result.get(current.path)
+            path = current.path
+            if previous:
+                # merge changes data jsons
+                if previous.change == PushChangeType.CREATE:
+                    if current.change == PushChangeType.DELETE:
+                        # create + delete = file is transparent for current changes -> delete it
+                        del result[path]
+                    elif current.change in (
+                        PushChangeType.UPDATE,
+                        PushChangeType.UPDATE_DIFF,
+                    ):
+                        # create + update = create with the most recent metadata
+                        current.change = previous.change
+                        current.diffs = []
                 else:
-                    result[path] = current
-                elif existing.change == PushChangeType.UPDATE:
+                    result[path] = current
+                elif previous.change == PushChangeType.UPDATE:
                     if current.change == PushChangeType.UPDATE_DIFF:
-                        # update + update_diff = update_diff with latest info
-                        current.change = existing.change
+                        # update + update_diff = update with latest info
+                        current.change = previous.change
                         current.diffs = []
                     result[path] = current
-                elif existing.change == PushChangeType.UPDATE_DIFF:
+                elif previous.change == PushChangeType.UPDATE_DIFF:
                     if current.change == PushChangeType.UPDATE_DIFF:
                         # update_diff + update_diff = update_diff with latest info
-                        current.diffs.extend(existing.diffs or [])
-                        result[path] = current
-                else:
-                    # delete + anything = anything
+                        current.diffs.extend(previous.diffs or [])
                     result[path] = current
+                elif previous.change == PushChangeType.DELETE:
+                    if current.change == PushChangeType.CREATE:
+                        # delete + create = create
+                        result[path] = current
             else:
                 result[current.path] = current
 
         return list(result.values())
 
     @classmethod
-    def 
_create_checkpoint( + def create_checkpoint( cls, project_id: str, checkpoint: Checkpoint, - changes: List[ProjectVersionChange] = [], - ) -> Optional[ProjectVersionChange]: + changes: List[ProjectVersionDelta] = [], + ) -> Optional[ProjectVersionDelta]: """ - Creates and caches a new ProjectVersionChange checkpoint and any required FileDiff checkpoints. + Creates and caches new checkpoint and any required FileDiff checkpoints. """ - changes_range = [ + delta_range = [ change for change in changes if checkpoint.start <= change.version.name <= checkpoint.end ] - if not changes_range: + if not delta_range: logging.warning( f"No individual changes found for project {project_id} in range v{checkpoint.start}-v{checkpoint.end} to create checkpoint." ) return None # dump delta objects from database and flatten list for merging - deltas = [] - for change in changes_range: - delta_items = DeltaDataSchema(many=True).load(change.delta) - deltas.extend(delta_items) - merged_delta_items: List[DeltaData] = [ - d.to_data_delta() for d in cls.merge_delta_items(deltas) + changes = [] + for delta in delta_range: + changes.extend(DeltaChangeSchema(many=True).load(delta.changes)) + merged_delta_items: List[DeltaChange] = [ + d.to_data_delta() for d in cls.merge_delta_changes(changes) ] # Pre-fetch data for all versioned files to create FileDiff checkpoints @@ -1054,99 +1132,14 @@ def _create_checkpoint( item.diff = diff_path db.session.add(checkpoint_diff) - checkpoint_change = ProjectVersionChange( - version_id=changes_range[-1].version_id, + checkpoint_delta = ProjectVersionDelta( + version_id=delta_range[-1].version_id, rank=checkpoint.rank, - delta=DeltaDataSchema(many=True).dump(merged_delta_items), + changes=DeltaChangeSchema(many=True).dump(merged_delta_items), ) - db.session.add(checkpoint_change) + db.session.add(checkpoint_delta) db.session.commit() - return checkpoint_change - - @classmethod - def query_changes(cls, project_id, since, to, rank=None): - """Query changes with specified parameters""" - query = cls.query.join(ProjectVersion).filter( - ProjectVersion.project_id == project_id, - ProjectVersion.name >= since, - ProjectVersion.name <= to, - ) - if rank is not None: - query = query.filter(ProjectVersionChange.rank == rank) - return query.order_by(ProjectVersion.name).all() - - @classmethod - def get_delta( - cls, project_id: str, since: int, to: int - ) -> Optional[List[DeltaMerged]]: - """ - Get changes between two versions, merging them if needed. 
- - create FileDiff checkpoints if needed - - create ProjectVersionChange checkpoints if needed with delta json - """ - if since > to: - logging.error( - f"Start version {since} is higher than end version {to} - broken history" - ) - return - if since == to: - # Return only changes for this version - change = ( - ProjectVersionChange.query.join(ProjectVersion) - .filter( - ProjectVersion.project_id == project_id, - ProjectVersion.name == to, - ProjectVersionChange.rank == 0, - ) - .first() - ) - return [DeltaMerged(**item) for item in change.delta] if change else None - - expected_checkpoints = Checkpoint.get_checkpoints(since + 1, to) - expected_changes: List[ProjectVersionChange] = ( - ProjectVersionChange.query.join(ProjectVersion) - .filter( - ProjectVersion.project_id == project_id, - ProjectVersion.name > since, - ProjectVersion.name <= to, - tuple_(ProjectVersionChange.rank, ProjectVersion.name).in_( - [(item.rank, item.end) for item in expected_checkpoints] - ), - ) - .order_by(ProjectVersion.name) - .all() - ) - existing_changes_map = {(c.rank, c.version.name): c for c in expected_changes} - - # Cache all individual (rank 0) changes in the required range. - individual_changes: List[ProjectVersionChange] = [] - - result: List[DeltaData] = [] - for checkpoint in expected_checkpoints: - expected_change = existing_changes_map.get( - (checkpoint.rank, checkpoint.end) - ) - - # we have change in database, just return delta data from it - if expected_change: - deltas = DeltaDataSchema(many=True).load(expected_change.delta) - result.extend(deltas) - continue - - if checkpoint.rank > 0: - individual_changes = ( - cls.query_changes(project_id, since, to, 0) - if not individual_changes - else individual_changes - ) - new_checkpoint = cls._create_checkpoint( - project_id, checkpoint, individual_changes - ) - if new_checkpoint: - deltas = DeltaDataSchema(many=True).load(new_checkpoint.delta) - result.extend(deltas) - - return cls.merge_delta_items(result) + return checkpoint_delta class ProjectVersion(db.Model): @@ -1246,7 +1239,7 @@ def __init__( # cache changes data json for version checkpoints # rank 0 is for all changes from start to current version delta_data = [ - DeltaData( + DeltaChange( path=c.path, change=c.change, size=c.size, @@ -1256,10 +1249,10 @@ def __init__( ) for c in changes ] - pvc = ProjectVersionChange( + pvc = ProjectVersionDelta( version=self, rank=0, - delta=DeltaDataSchema(many=True).dump(delta_data), + changes=DeltaChangeSchema(many=True).dump(delta_data), ) db.session.add(pvc) diff --git a/server/mergin/sync/public_api_v2.yaml b/server/mergin/sync/public_api_v2.yaml index 4cd045c5..e30d49c1 100644 --- a/server/mergin/sync/public_api_v2.yaml +++ b/server/mergin/sync/public_api_v2.yaml @@ -836,6 +836,10 @@ components: - editor - writer - owner + ProjectChangeType: + type: string + enum: [create, update, delete, update_diff] + example: update ProjectDelta: type: object required: @@ -858,9 +862,7 @@ components: type: integer example: 1 change: - type: string - enum: [create, update, delete, update_diff] - example: update + $ref: "#/components/schemas/ProjectChangeType" diffs: type: array nullable: true diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py index a63332c4..5debe031 100644 --- a/server/mergin/sync/public_api_v2_controller.py +++ b/server/mergin/sync/public_api_v2_controller.py @@ -22,7 +22,7 @@ from ..app import db from ..auth import auth_required from ..auth.models import User -from .models import 
FileDiff, Project, ProjectRole, ProjectMember, ProjectVersionChange +from .models import FileDiff, Project, ProjectRole, ProjectMember, ProjectVersionDelta from .permissions import ProjectPermissions, require_project_by_uuid from .utils import prepare_download_response from ..app import db @@ -38,7 +38,7 @@ StorageLimitHit, UploadError, ) -from .files import ChangesSchema, DeltaRespSchema +from .files import ChangesSchema, DeltaChangeRespSchema from .forms import project_name_validation from .models import ( Project, @@ -404,12 +404,12 @@ def upload_chunk(id: str): ) -@auth_required def get_project_delta(id: str): """Get project changes (delta) between two versions""" since = int(request.args.get("since")) to = int(request.args.get("to")) - project = require_project_by_uuid(id, ProjectPermissions.Read) + project: Project = require_project_by_uuid(id, ProjectPermissions.Read) + latest_version = project.latest_version if since < 0 or to < 0: abort(400, "Invalid 'since' or 'to' version") if since > to: @@ -419,11 +419,9 @@ def get_project_delta(id: str): ProjectVersion.project_id == project.id, ProjectVersion.name == since, ).first_or_404() - ProjectVersion.query.filter( - ProjectVersion.project_id == project.id, - ProjectVersion.name == to, - ).first_or_404() + if to > latest_version: + abort(404) - deltas = ProjectVersionChange.get_delta(project.id, since, to) + delta_changes = project.get_delta_changes(since, to) - return DeltaRespSchema(many=True).dump(deltas), 200 + return DeltaChangeRespSchema(many=True).dump(delta_changes), 200 diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index 85fcbe9f..c0eb6191 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -26,7 +26,7 @@ Project, ProjectFilePath, ProjectRole, - ProjectVersionChange, + ProjectVersionDelta, ) from ..sync.files import PushChangeType from ..sync.utils import is_versioned_file @@ -324,14 +324,15 @@ def test_create_diff_checkpoint(diff_project): assert not os.path.exists(diff.abs_path) -def test_project_version_change_delta(client, diff_project): - """Test that ProjectVersionChangeData and its schema work as expected""" +def test_project_version_delta_changes(client, diff_project: Project): + """Test that get_delta_changes and its schema work as expected""" latest_version = diff_project.get_latest_version() project_id = diff_project.id assert latest_version.name == 10 - assert ProjectVersionChange.get_delta(project_id, 2, 1) is None - pvcs: List[ProjectVersionChange] = ( - ProjectVersionChange.query.join(ProjectVersion) + assert diff_project.get_delta_changes(2, 1) is None + assert diff_project.get_delta_changes(2, 2) is None + pvcs: List[ProjectVersionDelta] = ( + ProjectVersionDelta.query.join(ProjectVersion) .filter(ProjectVersion.project_id == diff_project.id) .all() ) @@ -340,33 +341,27 @@ def test_project_version_change_delta(client, diff_project): initial_version = initial_pvc.version assert initial_pvc.rank == 0 assert initial_pvc.version.id == initial_version.id - # if version is the same, return just its delta in v1 - assert len(ProjectVersionChange.get_delta(project_id, 1, 1)) == len( - initial_version.files - ) - # no ranks created as we get here just first version with get_delta - assert ProjectVersionChange.query.filter_by(rank=1).count() == 0 - delta = ProjectVersionChange.get_delta(project_id, 1, 2) + delta = diff_project.get_delta_changes(1, 2) assert len(delta) == 1 assert delta[0].change == 
PushChangeType.DELETE # no ranks created as we get here just first version with get_delta - assert ProjectVersionChange.query.filter_by(rank=1).count() == 0 + assert ProjectVersionDelta.query.filter_by(rank=1).count() == 0 # delete + create version - delta = ProjectVersionChange.get_delta(project_id, 1, 3) + delta = diff_project.get_delta_changes(1, 3) assert len(delta) == 1 assert delta[0].change == PushChangeType.CREATE # get_delta with update diff - delta = ProjectVersionChange.get_delta(project_id, 1, 4) + delta = diff_project.get_delta_changes(1, 4) assert len(delta) == 1 assert delta[0].change == PushChangeType.CREATE - assert ProjectVersionChange.query.filter_by(rank=1).count() == 0 + assert ProjectVersionDelta.query.filter_by(rank=1).count() == 0 # create rank 1 checkpoint for v4 - delta = ProjectVersionChange.get_delta(project_id, 0, 4) - checkpoint_changes = ProjectVersionChange.query.filter_by(rank=1) + delta = diff_project.get_delta_changes(0, 4) + checkpoint_changes = ProjectVersionDelta.query.filter_by(rank=1) filediff_checkpoints = FileDiff.query.filter_by(rank=1) checkpoint_change = checkpoint_changes.first() assert checkpoint_changes.count() == 1 @@ -386,23 +381,21 @@ def test_project_version_change_delta(client, diff_project): assert len(delta_base_gpkg[0].diffs) == 0 # get data with multiple ranks = 1 level checkpoints 1-4, 5-8 + checkpoint 9 and 10 - delta = ProjectVersionChange.get_delta(project_id, 0, 10) + delta = diff_project.get_delta_changes(0, 10) assert len(delta) == len(latest_version.files) delta_test_gpkg = [d for d in delta if d.path == "test.gpkg"] assert delta_test_gpkg assert delta_test_gpkg[0].change == PushChangeType.CREATE - assert ProjectVersionChange.query.filter_by(rank=1).count() == 2 + assert ProjectVersionDelta.query.filter_by(rank=1).count() == 2 # base gpgk is transparent assert not next((c for c in delta if c.path == "base.gpkg"), None) - delta = ProjectVersionChange.get_delta( - project_id, latest_version.name - 3, latest_version.name - ) + delta = diff_project.get_delta_changes(latest_version.name - 3, latest_version.name) delta_base_gpkg = next((c for c in delta if c.path == "base.gpkg"), None) assert delta_base_gpkg.change == PushChangeType.DELETE # check update diff - delta = ProjectVersionChange.get_delta(project_id, 5, 7) + delta = diff_project.get_delta_changes(5, 7) assert len(delta) == 1 assert delta[0].change == PushChangeType.UPDATE_DIFF assert len(delta[0].diffs) == 2 @@ -420,7 +413,7 @@ def test_project_version_change_delta(client, diff_project): pv = push_change( diff_project, "updated", "test.gpkg", diff_project.storage.project_dir ) - delta = ProjectVersionChange.get_delta(project_id, 8, latest_version.name + 6) + delta = diff_project.get_delta_changes(8, latest_version.name + 6) assert len(delta) == 2 # file history in 9.th version is basefile fh = FileHistory.query.filter_by( @@ -433,7 +426,7 @@ def test_project_version_change_delta(client, diff_project): fh = FileHistory.query.filter_by( project_version_name=latest_version.name + 6 ).first() - delta = ProjectVersionChange.get_delta(project_id, 12, latest_version.name + 6) + delta = diff_project.get_delta_changes(12, latest_version.name + 6) assert len(delta) == 1 assert len(delta[0].diffs) == 1 assert delta[0].diffs[0].path == base_gpkg_checkpoint.path diff --git a/server/migrations/community/63adc90fca0c_add_project_version_changes.py b/server/migrations/community/9acf967e58ad_add_project_version_delta.py similarity index 65% rename from 
server/migrations/community/63adc90fca0c_add_project_version_changes.py rename to server/migrations/community/9acf967e58ad_add_project_version_delta.py index bb279743..793a5978 100644 --- a/server/migrations/community/63adc90fca0c_add_project_version_changes.py +++ b/server/migrations/community/9acf967e58ad_add_project_version_delta.py @@ -1,8 +1,8 @@ -"""Add project version changes +"""Add project version delta -Revision ID: 63adc90fca0c +Revision ID: 9acf967e58ad Revises: bd1ec73db389 -Create Date: 2025-10-01 16:50:08.343639 +Create Date: 2025-10-10 17:33:31.740232 """ @@ -11,7 +11,7 @@ from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. -revision = "63adc90fca0c" +revision = "9acf967e58ad" down_revision = "bd1ec73db389" branch_labels = None depends_on = None @@ -20,35 +20,35 @@ def upgrade(): # ### commands auto generated by Alembic - please adjust! ### op.create_table( - "project_version_change", + "project_version_delta", sa.Column("id", sa.BigInteger(), autoincrement=True, nullable=False), sa.Column("rank", sa.Integer(), nullable=False), sa.Column("version_id", sa.Integer(), nullable=False), - sa.Column("delta", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.Column("changes", postgresql.JSONB(astext_type=sa.Text()), nullable=False), sa.ForeignKeyConstraint( ["version_id"], ["project_version.id"], - name=op.f("fk_project_version_change_version_id_project_version"), + name=op.f("fk_project_version_delta_version_id_project_version"), ondelete="CASCADE", ), - sa.PrimaryKeyConstraint("id", name=op.f("pk_project_version_change")), - sa.UniqueConstraint("version_id", "rank", name="unique_changes"), + sa.PrimaryKeyConstraint("id", name=op.f("pk_project_version_delta")), + sa.UniqueConstraint("version_id", "rank", name="unique_deltas"), ) op.create_index( - op.f("ix_project_version_change_rank"), - "project_version_change", + op.f("ix_project_version_delta_rank"), + "project_version_delta", ["rank"], unique=False, ) op.create_index( - op.f("ix_project_version_change_version_id"), - "project_version_change", + op.f("ix_project_version_delta_version_id"), + "project_version_delta", ["version_id"], unique=False, ) op.create_index( - "ix_project_version_change_version_id_rank", - "project_version_change", + "ix_project_version_delta_version_id_rank", + "project_version_delta", ["version_id", "rank"], unique=False, ) @@ -57,7 +57,7 @@ def upgrade(): # data migration op.execute( """ - INSERT INTO project_version_change (version_id, rank, delta) + INSERT INTO project_version_delta (version_id, rank, changes) SELECT h.version_id, 0 AS rank, @@ -67,12 +67,12 @@ def upgrade(): 'path', fp.path, 'size', h.size, 'change', h.change, - 'version', 'v' || h.project_version_name, + 'version', h.project_version_name, 'checksum', h.checksum, 'diff', fdj.diff_path ) ) - ) AS delta + ) AS changes FROM file_history h JOIN @@ -96,14 +96,13 @@ def upgrade(): def downgrade(): # ### commands auto generated by Alembic - please adjust! 
###
     op.drop_index(
-        "ix_project_version_change_version_id_rank", table_name="project_version_change"
+        "ix_project_version_delta_version_id_rank", table_name="project_version_delta"
     )
     op.drop_index(
-        op.f("ix_project_version_change_version_id"),
-        table_name="project_version_change",
+        op.f("ix_project_version_delta_version_id"), table_name="project_version_delta"
     )
     op.drop_index(
-        op.f("ix_project_version_change_rank"), table_name="project_version_change"
+        op.f("ix_project_version_delta_rank"), table_name="project_version_delta"
     )
-    op.drop_table("project_version_change")
+    op.drop_table("project_version_delta")
     # ### end Alembic commands ###

From 8f1361311ef5aa1b98a6ab272e21dece45ab925c Mon Sep 17 00:00:00 2001
From: Martin Varga
Date: Mon, 13 Oct 2025 08:38:36 +0200
Subject: [PATCH 23/36] Fix alembic migration for file diff

---
 .../bd1ec73db389_create_file_diff_table.py    | 57 +++++++++++++++----
 1 file changed, 45 insertions(+), 12 deletions(-)

diff --git a/server/migrations/community/bd1ec73db389_create_file_diff_table.py b/server/migrations/community/bd1ec73db389_create_file_diff_table.py
index ebc95bd4..1ee671dd 100644
--- a/server/migrations/community/bd1ec73db389_create_file_diff_table.py
+++ b/server/migrations/community/bd1ec73db389_create_file_diff_table.py
@@ -86,19 +86,23 @@ def upgrade():
             b.basefile_version
         FROM diffs d
         LEFT OUTER JOIN basefiles b ON b.file_path_id = d.file_path_id AND b.basefile_version < d.project_version_name
-    )
+    ),
+    file_diffs AS (
+        SELECT DISTINCT
+            d.file_path_id,
+            FIRST_VALUE(rb.basefile_id) OVER (PARTITION BY rb.id ORDER BY rb.basefile_version DESC) as basefile_id,
+            0 AS rank,
+            d.project_version_name AS version,
+            (d.diff ->> 'path') AS path,
+            (d.diff ->> 'size')::bigint AS size,
+            d.diff ->> 'checksum' AS checksum,
+            d.diff ->> 'location' AS location
+        FROM diffs d
+        LEFT OUTER JOIN relevant_basefiles rb ON rb.id = d.id
+    )
    INSERT INTO file_diff (file_path_id, basefile_id, rank, version, path, size, checksum, location)
-    SELECT DISTINCT
-        d.file_path_id,
-        FIRST_VALUE(rb.basefile_id) OVER (PARTITION BY rb.id ORDER BY rb.basefile_version DESC) as basefile_id,
-        0 AS rank,
-        d.project_version_name AS version,
-        (d.diff ->> 'path') AS path,
-        (d.diff ->> 'size')::bigint AS size,
-        d.diff ->> 'checksum' AS checksum,
-        d.diff ->> 'location' AS location
-    FROM diffs d
-    LEFT OUTER JOIN relevant_basefiles rb ON rb.id = d.id;
+    -- some projects / files might be broken, so we need to play it safe here
+    SELECT * FROM file_diffs WHERE basefile_id IS NOT NULL;
     """
     )
 
@@ -132,6 +136,35 @@ def downgrade():
         """
     )
 
+    # if there were any broken gpkg files (omitted in upgrade), add a dummy diff for them
+    conn.execute(
+        """
+        UPDATE file_history fh
+        SET diff = jsonb_build_object(
+            'path', 'missing-diff',
+            'size', 0,
+            'checksum', '',
+            'location', ''
+        )
+        WHERE fh.change = 'update_diff' AND fh.diff IS NULL;
+        """
+    )
+
+    # add back consistency constraint
+    conn.execute(
+        sa.text(
+            """
+            ALTER TABLE file_history
+            ADD CONSTRAINT ck_file_history_changes_with_diff CHECK (
+                CASE
+                    WHEN (change = 'update_diff') THEN diff IS NOT NULL
+                    ELSE diff IS NULL
+                END
+            );
+            """
+        )
+    )
+
     op.drop_index(op.f("ix_file_diff_version"), table_name="file_diff")
     op.drop_index(op.f("ix_file_diff_rank"), table_name="file_diff")
     op.drop_index(op.f("ix_file_diff_path"), table_name="file_diff")
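Note: the reworked upgrade above deliberately skips update_diff rows whose basefile cannot be resolved. A post-migration sanity check could look like the sketch below, run inside the server's application context; the import path and the query are illustrative assumptions, not part of this patch:

    from sqlalchemy import text

    from mergin.app import db

    # count update_diff history entries that the backfill above left without
    # a matching rank-0 row in file_diff (i.e. the broken projects / files)
    orphans = db.session.execute(
        text(
            """
            SELECT count(*)
            FROM file_history fh
            LEFT OUTER JOIN file_diff fd
                ON fd.file_path_id = fh.file_path_id
                AND fd.version = fh.project_version_name
                AND fd.rank = 0
            WHERE fh.change = 'update_diff' AND fd.id IS NULL
            """
        )
    ).scalar()
    print(f"update_diff rows without a migrated diff: {orphans}")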
From d0ef271437b2524208ee948dcb6bbf71bed7dc6d Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Thu, 16 Oct 2025 09:13:09 +0200
Subject: [PATCH 24/36] enhancements v2

---
 server/mergin/sync/models.py                  |  8 ++++++++
 server/mergin/sync/public_api_v2.yaml         |  5 +++--
 .../mergin/sync/public_api_v2_controller.py   | 19 ++++++-------------
 3 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py
index 6fd6a6a4..9343db13 100644
--- a/server/mergin/sync/models.py
+++ b/server/mergin/sync/models.py
@@ -1038,6 +1038,8 @@ def merge_delta_changes(
                     # create + update = create with with the most recent metadata
                     current.change = previous.change
                     current.diffs = []
+                else:
+                    result[path] = current
             elif previous.change == PushChangeType.UPDATE:
                 if current.change == PushChangeType.UPDATE_DIFF:
                     # update + update_diff = update with latest info
@@ -1053,6 +1055,12 @@
                 if current.change == PushChangeType.CREATE:
                     # delete + create = create
                     result[path] = current
+                elif current.change in (
+                    PushChangeType.UPDATE,
+                    PushChangeType.UPDATE_DIFF,
+                ):
+                    # delete + update = invalid sequence, keep delete
+                    continue
             else:
                 result[current.path] = current
         return list(result.values())
diff --git a/server/mergin/sync/public_api_v2.yaml b/server/mergin/sync/public_api_v2.yaml
index e30d49c1..9ac4a2d0 100644
--- a/server/mergin/sync/public_api_v2.yaml
+++ b/server/mergin/sync/public_api_v2.yaml
@@ -394,14 +394,15 @@ paths:
         required: true
         schema:
           type: integer
           example: 1
+          minimum: 0
         description: Start version (exclusive)
       - name: to
         in: query
-        required: true
         schema:
           type: integer
           example: 2
+          minimum: 1
         description: End version (inclusive)
      responses:
        "200":
diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py
index 5debe031..8e5eed23 100644
--- a/server/mergin/sync/public_api_v2_controller.py
+++ b/server/mergin/sync/public_api_v2_controller.py
@@ -4,6 +4,7 @@
 import os
 from datetime import datetime
+from typing import Optional
 import uuid
 import gevent
 import logging
@@ -404,24 +405,16 @@ def upload_chunk(id: str):
     )
 
 
-def get_project_delta(id: str):
+def get_project_delta(id: str, since: int, to: Optional[int] = None):
     """Get project changes (delta) between two versions"""
-    since = int(request.args.get("since"))
-    to = int(request.args.get("to"))
+
     project: Project = require_project_by_uuid(id, ProjectPermissions.Read)
-    latest_version = project.latest_version
-    if since < 0 or to < 0:
-        abort(400, "Invalid 'since' or 'to' version")
+    if to is None:
+        to = project.latest_version
+
     if since > to:
         abort(400, "'since' version must be less than 'to' version")
-    ProjectVersion.query.filter(
-        ProjectVersion.project_id == project.id,
-        ProjectVersion.name == since,
-    ).first_or_404()
-    if to > latest_version:
-        abort(404)
 
     delta_changes = project.get_delta_changes(since, to)
 
     return DeltaChangeRespSchema(many=True).dump(delta_changes), 200
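Note: with `since` and `to` now bound from the query string by the OpenAPI layer, calling the endpoint is straightforward. A minimal client-side sketch; the server URL, project id and auth header are placeholders, not part of this series:

    import requests

    # omitting "to" lets the server default to the latest project version
    resp = requests.get(
        "https://example.com/v2/projects/<project-uuid>/delta",
        params={"since": 5, "to": 7},
        headers={"Authorization": "Bearer <token>"},
    )
    resp.raise_for_status()
    # at this point in the series the endpoint returns a plain list of changes
    for change in resp.json():
        print(change["path"], change["change"], change.get("diffs", []))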
From 8ceee04ea557bf617014c40fe31ca863d634c09a Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Fri, 24 Oct 2025 17:48:58 +0200
Subject: [PATCH 25/36] Address discussions:
- update new table schema for project version delta
- add tests and upgrade existing to handle order of diffs
---
 server/mergin/sync/models.py                  | 168 ++++++++------
 .../mergin/sync/public_api_v2_controller.py   |  11 +-
 server/mergin/tests/test_public_api_v2.py     | 213 ++++++++++++++----
 .../4b4648483770_add_project_version_delta.py | 129 +++++++++++
 .../9acf967e58ad_add_project_version_delta.py | 108 ---------
 5 files changed, 396 insertions(+), 233 deletions(-)
 create mode 100644 server/migrations/community/4b4648483770_add_project_version_delta.py
 delete mode 100644 server/migrations/community/9acf967e58ad_add_project_version_delta.py

diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py
index 9343db13..0d342303 100644
--- a/server/mergin/sync/models.py
+++ b/server/mergin/sync/models.py
@@ -359,7 +359,7 @@ def get_delta_changes(
         """
         Get changes between two versions, merging them if needed.
         - create FileDiff checkpoints if needed
-        - create ProjectVersionDelta checkpoints if needed with delta json
+        - create ProjectVersionDelta checkpoints if needed with changes json
         """
         if since > to:
             logging.error(
@@ -371,19 +371,18 @@
         project_id = self.id
         expected_checkpoints = Checkpoint.get_checkpoints(since + 1, to)
         expected_deltas: List[ProjectVersionDelta] = (
-            ProjectVersionDelta.query.join(ProjectVersion)
-            .filter(
-                ProjectVersion.project_id == project_id,
-                ProjectVersion.name > since,
-                ProjectVersion.name <= to,
-                tuple_(ProjectVersionDelta.rank, ProjectVersion.name).in_(
+            ProjectVersionDelta.query.filter(
+                ProjectVersionDelta.project_id == project_id,
+                ProjectVersionDelta.version > since,
+                ProjectVersionDelta.version <= to,
+                tuple_(ProjectVersionDelta.rank, ProjectVersionDelta.version).in_(
                     [(item.rank, item.end) for item in expected_checkpoints]
                 ),
             )
-            .order_by(ProjectVersion.name)
+            .order_by(ProjectVersionDelta.version)
             .all()
         )
-        existing_delta_map = {(c.rank, c.version.name): c for c in expected_deltas}
+        existing_delta_map = {(c.rank, c.version): c for c in expected_deltas}
 
         # Cache all individual (rank 0) delta rows in the required range.
         individual_deltas: List[ProjectVersionDelta] = []
@@ -400,14 +399,13 @@
             # If a higher-rank delta checkpoint does not exist, use rank=0 deltas to create the checkpoint
             if checkpoint.rank > 0:
                 individual_deltas = (
-                    ProjectVersionDelta.query.join(ProjectVersion)
-                    .filter(
-                        ProjectVersion.project_id == project_id,
-                        ProjectVersion.name >= since,
-                        ProjectVersion.name <= to,
+                    ProjectVersionDelta.query.filter(
+                        ProjectVersionDelta.project_id == project_id,
+                        ProjectVersionDelta.version >= since,
+                        ProjectVersionDelta.version <= to,
                         ProjectVersionDelta.rank == 0,
                     )
-                    .order_by(ProjectVersion.name)
+                    .order_by(ProjectVersionDelta.version)
                     .all()
                     if not individual_deltas
                     else individual_deltas
@@ -431,7 +429,7 @@
                 f"Not possible to create checkpoint for project {project_id} in range {checkpoint.start}-{checkpoint.end}"
             )
 
-        return ProjectVersionDelta.merge_delta_changes(result)
+        return ProjectVersionDelta.merge_changes(result)
 
 
 class ProjectRole(Enum):
@@ -987,12 +985,13 @@ class ProjectVersionDelta(db.Model):
     id = db.Column(db.BigInteger, primary_key=True, autoincrement=True)
+    version = db.Column(db.Integer, nullable=False, index=True)
     # exponential order of changes json
     rank = db.Column(db.Integer, nullable=False, index=True)
-    # to which project version is this linked
-    version_id = db.Column(
-        db.Integer,
-        db.ForeignKey("project_version.id", ondelete="CASCADE"),
+    # to which project version is this linked
+    project_id = db.Column(
+        UUID(as_uuid=True),
+        db.ForeignKey("project.id", ondelete="CASCADE"),
         index=True,
         nullable=False,
     )
@@ -1000,69 +999,91 @@
     changes = db.Column(JSONB, nullable=False)
 
     __table_args__ = (
-        db.UniqueConstraint("version_id", "rank", name="unique_deltas"),
+        db.UniqueConstraint("project_id", "version", "rank", name="unique_deltas"),
         db.Index(
"ix_project_version_delta_version_id_rank", - version_id, + "ix_project_version_delta_project_id_version_rank", + project_id, + version, rank, ), ) - version = db.relationship( - "ProjectVersion", + project = db.relationship( + "Project", uselist=False, ) @staticmethod - def merge_delta_changes( + def merge_changes( items: List[DeltaChange], ) -> List[DeltaChangeMerged]: """ - Merge multiple changes json array objects into one list of changes. + Merge changes json array objects into one list of changes. Changes are merged based on file path and change type. """ result: Dict[str, DeltaChangeMerged] = {} + # sorting changes by version to apply them in correct order + items.sort(key=lambda x: x.version) + + def handle_replace(result, path, current, previous): + result[path] = current + + def handle_delete(result, path, current, previous): + del result[path] + + def handle_update(result, path, current, previous): + # handle update case, when previous change was create - just revert to create with new metadata + current.change = previous.change + current.version = previous.version + current.diffs = [] + result[path] = current + + def handle_update_diff(result, path, current, previous): + current.diffs = (previous.diffs or []) + (current.diffs or []) + result[path] = current + + dispatch = { + # create + delete = file is transparent for current changes -> delete it + (PushChangeType.CREATE, PushChangeType.DELETE): handle_delete, + # create + update = create with updated info + (PushChangeType.CREATE, PushChangeType.UPDATE): handle_update, + (PushChangeType.CREATE, PushChangeType.UPDATE_DIFF): handle_update, + (PushChangeType.CREATE, PushChangeType.CREATE): None, + # update + update_diff = update with latest info + ( + PushChangeType.UPDATE, + PushChangeType.UPDATE_DIFF, + ): handle_update, + (PushChangeType.UPDATE, PushChangeType.UPDATE): handle_replace, + (PushChangeType.UPDATE, PushChangeType.DELETE): handle_replace, + (PushChangeType.UPDATE, PushChangeType.CREATE): handle_replace, + # update_diff + update_diff = update_diff with latest info with proper order of diffs + ( + PushChangeType.UPDATE_DIFF, + PushChangeType.UPDATE_DIFF, + ): handle_update_diff, + (PushChangeType.UPDATE_DIFF, PushChangeType.UPDATE): handle_replace, + (PushChangeType.UPDATE_DIFF, PushChangeType.DELETE): handle_replace, + (PushChangeType.UPDATE_DIFF, PushChangeType.CREATE): None, + (PushChangeType.DELETE, PushChangeType.CREATE): handle_replace, + # delete + update = invalid sequence, keep delete + (PushChangeType.DELETE, PushChangeType.UPDATE): None, + (PushChangeType.DELETE, PushChangeType.UPDATE_DIFF): None, + (PushChangeType.DELETE, PushChangeType.DELETE): None, + } + for item in items: current = item.to_merged_delta() - previous = result.get(current.path) path = current.path - if previous: - # merge changes data jsons - if previous.change == PushChangeType.CREATE: - if current.change == PushChangeType.DELETE: - # create + delete = file is transparent for current changes -> delete it - del result[path] - elif current.change in ( - PushChangeType.UPDATE, - PushChangeType.UPDATE_DIFF, - ): - # create + update = create with with the most recent metadata - current.change = previous.change - current.diffs = [] - else: - result[path] = current - elif previous.change == PushChangeType.UPDATE: - if current.change == PushChangeType.UPDATE_DIFF: - # update + update_diff = update with latest info - current.change = previous.change - current.diffs = [] - result[path] = current - elif previous.change == 
PushChangeType.UPDATE_DIFF: - if current.change == PushChangeType.UPDATE_DIFF: - # update_diff + update_diff = update_diff with latest info - current.diffs.extend(previous.diffs or []) - result[path] = current - elif previous.change == PushChangeType.DELETE: - if current.change == PushChangeType.CREATE: - # delete + create = create - result[path] = current - elif current.change in ( - PushChangeType.UPDATE, - PushChangeType.UPDATE_DIFF, - ): - # delete + update = invalid sequence, keep delete - continue - else: - result[current.path] = current + previous = result.get(path) + + if not previous: + result[path] = current + continue + + handler = dispatch.get((previous.change, current.change)) + if handler: + handler(result, path, current, previous) + return list(result.values()) @classmethod @@ -1078,7 +1099,7 @@ def create_checkpoint( delta_range = [ change for change in changes - if checkpoint.start <= change.version.name <= checkpoint.end + if checkpoint.start <= change.version <= checkpoint.end ] if not delta_range: @@ -1092,7 +1113,7 @@ def create_checkpoint( for delta in delta_range: changes.extend(DeltaChangeSchema(many=True).load(delta.changes)) merged_delta_items: List[DeltaChange] = [ - d.to_data_delta() for d in cls.merge_delta_changes(changes) + d.to_data_delta() for d in cls.merge_changes(changes) ] # Pre-fetch data for all versioned files to create FileDiff checkpoints @@ -1104,6 +1125,7 @@ def create_checkpoint( ] versioned_file_paths = [delta.path for delta in versioned_delta_items] if versioned_file_paths: + # get versioned files from DB and lookup their paths to next processing file_paths = ProjectFilePath.query.filter( ProjectFilePath.project_id == project_id, ProjectFilePath.path.in_(versioned_file_paths), @@ -1121,7 +1143,7 @@ def create_checkpoint( rank=checkpoint.rank, version=checkpoint.end, ).first() - + # If does not exists, let's create diff with higher rank and some generated path (name of diff file) if not existing_diff_checkpoint: base_file = FileHistory.get_basefile(file_path_id, checkpoint.end) if not base_file: @@ -1141,7 +1163,8 @@ def create_checkpoint( db.session.add(checkpoint_diff) checkpoint_delta = ProjectVersionDelta( - version_id=delta_range[-1].version_id, + project_id=project_id, + version=checkpoint.end, rank=checkpoint.rank, changes=DeltaChangeSchema(many=True).dump(merged_delta_items), ) @@ -1257,13 +1280,14 @@ def __init__( ) for c in changes ] - pvc = ProjectVersionDelta( - version=self, + pvd = ProjectVersionDelta( + project_id=project.id, + version=name, rank=0, changes=DeltaChangeSchema(many=True).dump(delta_data), ) - db.session.add(pvc) + db.session.add(pvd) db.session.flush() # update cached values in project and push to transaction buffer so that self.files is up-to-date diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py index 8e5eed23..5f539b50 100644 --- a/server/mergin/sync/public_api_v2_controller.py +++ b/server/mergin/sync/public_api_v2_controller.py @@ -409,12 +409,15 @@ def get_project_delta(id: str, since: int, to: Optional[int] = None): """Get project changes (delta) between two versions""" project: Project = require_project_by_uuid(id, ProjectPermissions.Read) - if to is None: - to = project.latest_version + to = project.latest_version if to is None else to + if to > project.latest_version: + abort(400, "'to' version exceeds latest project version") - if since > to: - abort(400, "'since' version must be less than 'to' version") + if since >= to: + abort(400, "'since' 
version must be less than 'to' version")
 
     delta_changes = project.get_delta_changes(since, to)
+    if delta_changes is None:
+        abort(404)
 
     return DeltaChangeRespSchema(many=True).dump(delta_changes), 200
diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py
index c0eb6191..eceee077 100644
--- a/server/mergin/tests/test_public_api_v2.py
+++ b/server/mergin/tests/test_public_api_v2.py
@@ -28,7 +28,7 @@
     ProjectRole,
     ProjectVersionDelta,
 )
-from ..sync.files import PushChangeType
+from ..sync.files import DeltaChange, PushChangeType
 from ..sync.utils import is_versioned_file
 from sqlalchemy.exc import IntegrityError
 import pytest
@@ -324,6 +324,89 @@ def test_create_diff_checkpoint(diff_project):
     assert not os.path.exists(diff.abs_path)
 
 
+def test_delta_merge_changes():
+    """Test merging of delta changes works as expected"""
+
+    create = DeltaChange(
+        path="file1.gpkg",
+        change=PushChangeType.CREATE,
+        version=1,
+        size=100,
+        checksum="abc",
+    )
+    update = DeltaChange(
+        path="file1.gpkg",
+        change=PushChangeType.UPDATE,
+        version=2,
+        size=120,
+        checksum="def",
+    )
+    delete = DeltaChange(
+        path="file1.gpkg",
+        change=PushChangeType.DELETE,
+        version=3,
+        size=0,
+        checksum="ghi",
+    )
+    update_diff1 = DeltaChange(
+        path="file1.gpkg",
+        change=PushChangeType.UPDATE_DIFF,
+        version=4,
+        size=130,
+        checksum="xyz",
+        diff="diff1",
+    )
+    update_diff2 = DeltaChange(
+        path="file1.gpkg",
+        change=PushChangeType.UPDATE_DIFF,
+        version=5,
+        size=140,
+        checksum="uvw",
+        diff="diff2",
+    )
+
+    # CREATE + UPDATE -> CREATE
+    merged = ProjectVersionDelta.merge_changes([create, update])
+    assert len(merged) == 1
+    assert merged[0].change == PushChangeType.CREATE
+    assert merged[0].version == create.version
+    # check reverse order as well
+    merged = ProjectVersionDelta.merge_changes([update, create])
+    assert len(merged) == 1
+    assert merged[0].change == PushChangeType.CREATE
+    assert merged[0].version == create.version
+
+    # CREATE + DELETE -> removed
+    merged = ProjectVersionDelta.merge_changes([create, delete])
+    assert len(merged) == 0
+
+    # UPDATE + DELETE -> DELETE
+    merged = ProjectVersionDelta.merge_changes([update, delete])
+    assert len(merged) == 1
+    assert merged[0].change == PushChangeType.DELETE
+
+    # CREATE + UPDATE_DIFF -> CREATE
+    merged = ProjectVersionDelta.merge_changes([create, update_diff1])
+    assert len(merged) == 1
+    assert merged[0].change == PushChangeType.CREATE
+    assert merged[0].diffs == []
+
+    # UPDATE + UPDATE_DIFF -> UPDATE
+    merged = ProjectVersionDelta.merge_changes([update, update_diff1])
+    assert len(merged) == 1
+    assert merged[0].change == PushChangeType.UPDATE
+    assert merged[0].diffs == []
+
+    # UPDATE_DIFF + UPDATE_DIFF -> merged diffs
+    merged = ProjectVersionDelta.merge_changes([update_diff1, update_diff2])
+    assert len(merged) == 1
+    assert merged[0].change == PushChangeType.UPDATE_DIFF
+    assert merged[0].version == update_diff2.version
+    assert merged[0].size == update_diff2.size
+    assert merged[0].checksum == update_diff2.checksum
+    assert [d.path for d in merged[0].diffs] == ["diff1", "diff2"]
+
+
 def test_project_version_delta_changes(client, diff_project: Project):
     """Test that get_delta_changes and its schema work as expected"""
     latest_version = diff_project.get_latest_version()
     project_id = diff_project.id
     assert latest_version.name == 10
     assert diff_project.get_delta_changes(2, 1) is None
     assert diff_project.get_delta_changes(2, 2) is 
None - pvcs: List[ProjectVersionDelta] = ( - ProjectVersionDelta.query.join(ProjectVersion) - .filter(ProjectVersion.project_id == diff_project.id) + deltas: List[ProjectVersionDelta] = ( + ProjectVersionDelta.query.filter_by(project_id=project_id) + .order_by(ProjectVersionDelta.version) .all() ) - assert len(pvcs) == 10 - initial_pvc = pvcs[0] - initial_version = initial_pvc.version - assert initial_pvc.rank == 0 - assert initial_pvc.version.id == initial_version.id + assert len(deltas) == 10 + initial_delta = deltas[0] + initial_version = ProjectVersion.query.filter_by( + project_id=project_id, name=initial_delta.version + ).first() + assert initial_version + assert initial_delta.version + assert initial_delta.rank == 0 + assert initial_delta.version == 1 delta = diff_project.get_delta_changes(1, 2) assert len(delta) == 1 @@ -352,6 +439,8 @@ def test_project_version_delta_changes(client, diff_project: Project): delta = diff_project.get_delta_changes(1, 3) assert len(delta) == 1 assert delta[0].change == PushChangeType.CREATE + assert delta[0].version == 3 + assert delta[0].checksum == deltas[3].changes[0]["checksum"] # get_delta with update diff delta = diff_project.get_delta_changes(1, 4) @@ -365,27 +454,27 @@ def test_project_version_delta_changes(client, diff_project: Project): filediff_checkpoints = FileDiff.query.filter_by(rank=1) checkpoint_change = checkpoint_changes.first() assert checkpoint_changes.count() == 1 - assert checkpoint_change.version_id == pvcs[3].version_id + assert checkpoint_change.version == deltas[3].version assert filediff_checkpoints.count() == 0 # check if filediff basefile is correctly set file_history = FileHistory.query.filter_by(project_version_name=4).first() assert len(delta) == len(initial_version.files) - delta_base_gpkg = [d for d in delta if d.path == "base.gpkg"] - assert len(delta_base_gpkg) == 1 + delta_base_gpkg = next((d for d in delta if d.path == "base.gpkg"), None) + assert delta_base_gpkg # from history is clear, that we are just creating geopackage in this range - assert delta_base_gpkg[0].change == PushChangeType.CREATE - assert delta_base_gpkg[0].version == 3 - assert delta_base_gpkg[0].path == file_history.path - assert delta_base_gpkg[0].size == file_history.size - assert delta_base_gpkg[0].checksum == file_history.checksum - assert len(delta_base_gpkg[0].diffs) == 0 + assert delta_base_gpkg.change == PushChangeType.CREATE + assert delta_base_gpkg.version == 3 + assert delta_base_gpkg.path == file_history.path + assert delta_base_gpkg.size == file_history.size + assert delta_base_gpkg.checksum == file_history.checksum + assert len(delta_base_gpkg.diffs) == 0 # get data with multiple ranks = 1 level checkpoints 1-4, 5-8 + checkpoint 9 and 10 delta = diff_project.get_delta_changes(0, 10) assert len(delta) == len(latest_version.files) - delta_test_gpkg = [d for d in delta if d.path == "test.gpkg"] + delta_test_gpkg = next((d for d in delta if d.path == "test.gpkg"), None) assert delta_test_gpkg - assert delta_test_gpkg[0].change == PushChangeType.CREATE + assert delta_test_gpkg.change == PushChangeType.CREATE assert ProjectVersionDelta.query.filter_by(rank=1).count() == 2 # base gpgk is transparent assert not next((c for c in delta if c.path == "base.gpkg"), None) @@ -394,14 +483,6 @@ def test_project_version_delta_changes(client, diff_project: Project): delta_base_gpkg = next((c for c in delta if c.path == "base.gpkg"), None) assert delta_base_gpkg.change == PushChangeType.DELETE - # check update diff - delta = 
diff_project.get_delta_changes(5, 7) - assert len(delta) == 1 - assert delta[0].change == PushChangeType.UPDATE_DIFF - assert len(delta[0].diffs) == 2 - # find related diff file in file diffs to check relation - assert FileDiff.query.filter_by(path=delta[0].diffs[0].path) - # create just update_diff versions with checkpoint base_gpkg = os.path.join(diff_project.storage.project_dir, "test.gpkg") shutil.copy( @@ -410,7 +491,7 @@ def test_project_version_delta_changes(client, diff_project: Project): for i in range(6): sql = f"UPDATE simple SET rating={i}" execute_query(base_gpkg, sql) - pv = push_change( + push_change( diff_project, "updated", "test.gpkg", diff_project.storage.project_dir ) delta = diff_project.get_delta_changes(8, latest_version.name + 6) @@ -798,24 +879,51 @@ def test_project_delta(client, diff_project): os.path.join(working_dir, "base.gpkg"), ) push_change(initial_project, "added", "base.gpkg", working_dir) - response = client.get(f"v2/projects/{initial_project.id}/delta?since=0&to=1") + response = client.get(f"v2/projects/{initial_project.id}/delta?since=0") + assert response.status_code == 200 + delta = response.json + assert len(delta) == 1 + assert delta[0]["change"] == PushChangeType.CREATE.value + assert delta[0]["version"] == 1 + + # remove the file and get changes from 0 -> 2 where base gpgkg is removed -> transparent + push_change(initial_project, "removed", "base.gpkg", working_dir) + response = client.get(f"v2/projects/{initial_project.id}/delta?since=0") assert response.status_code == 200 + delta = response.json + assert len(delta) == 0 + + # non valid cases response = client.get(f"v2/projects/{diff_project.id}/delta") assert response.status_code == 400 - - response = client.get(f"v2/projects/{diff_project.id}/delta?since=-1&to=1") + response = client.get(f"v2/projects/{diff_project.id}/delta?since=2&to=1") + assert response.status_code == 400 + response = client.get(f"v2/projects/{diff_project.id}/delta?since=-2") + assert response.status_code == 400 + response = client.get(f"v2/projects/{diff_project.id}/delta?since=-2&to=-1") + assert response.status_code == 400 + # exceeding latest version + response = client.get(f"v2/projects/{diff_project.id}/delta?since=0&to=2000") + assert response.status_code == 400 + # no changes between versions with same number + response = client.get(f"v2/projects/{diff_project.id}/delta?since=1&to=1") assert response.status_code == 400 - response = client.get(f"v2/projects/{diff_project.id}/delta?since=1000&to=2000") - assert response.status_code == 404 - - response = client.get(f"v2/projects/{diff_project.id}/delta?since=1&to=10") + # since 1 to latest version + response = client.get(f"v2/projects/{diff_project.id}/delta?since=1") assert response.status_code == 200 assert len(response.json) == 1 assert response.json[0]["change"] == PushChangeType.CREATE.value assert response.json[0]["version"] == 9 + files = ( + ProjectVersion.query.filter_by( + project_id=diff_project.id, name=response.json[0]["version"] + ) + .first() + .files + ) - # simplate update + # simple update response = client.get(f"v2/projects/{diff_project.id}/delta?since=4&to=8") assert response.status_code == 200 delta = response.json @@ -823,25 +931,32 @@ def test_project_delta(client, diff_project): # simulate pull of delta[0] assert delta[0]["change"] == PushChangeType.UPDATE.value - assert delta[0]["version"] == 7 + assert delta[0]["version"] == 5 assert not delta[0].get("diffs") # integration test for pull mechanism -def test_project_pull(client, 
diff_project):
-    """Test project pull mechanisom in v2"""
-
-    response = client.get(f"v2/projects/{diff_project.id}/delta?since=5&to=7")
+def test_project_pull_diffs(client, diff_project):
+    """Test project pull mechanism in v2 with diff files"""
+    since = 5
+    to = 7
+    # fetch diff files from the database to verify their order and metadata
+    current_diffs = (
+        FileDiff.query.filter(FileDiff.version > since, FileDiff.version <= to)
+        .order_by(FileDiff.version)
+        .all()
+    )
+    response = client.get(f"v2/projects/{diff_project.id}/delta?since={since}&to={to}")
     assert response.status_code == 200
     delta = response.json
     assert len(delta) == 1
     assert delta[0]["change"] == PushChangeType.UPDATE_DIFF.value
     assert delta[0]["version"] == 7
-    diff = delta[0]["diffs"][0]
-    assert diff["path"].startswith("base.gpkg-")
-    response = client.get(f"v2/projects/{diff_project.id}/raw/diff/{diff['path']}")
+    first_diff = delta[0]["diffs"][0]
+    second_diff = delta[0]["diffs"][1]
+    assert first_diff["path"] == current_diffs[0].path
+    assert second_diff["path"] == current_diffs[1].path
+    response = client.get(
+        f"v2/projects/{diff_project.id}/raw/diff/{first_diff['path']}"
+    )
     assert response.status_code == 200
-    created_diff = FileDiff.query.filter_by(path=diff["path"]).first()
-    assert created_diff and os.path.exists(created_diff.abs_path)
-    assert created_diff.size > 0
-    assert created_diff.checksum
diff --git a/server/migrations/community/4b4648483770_add_project_version_delta.py b/server/migrations/community/4b4648483770_add_project_version_delta.py
new file mode 100644
index 00000000..9f13eced
--- /dev/null
+++ b/server/migrations/community/4b4648483770_add_project_version_delta.py
@@ -0,0 +1,129 @@
+"""Add project version delta
+
+Revision ID: 4b4648483770
+Revises: bd1ec73db389
+Create Date: 2025-10-24 09:55:18.286286
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+# revision identifiers, used by Alembic.
+revision = "4b4648483770"
+down_revision = "bd1ec73db389"
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! 
### + op.create_table( + "project_version_delta", + sa.Column("id", sa.BigInteger(), autoincrement=True, nullable=False), + sa.Column("version", sa.Integer(), nullable=False), + sa.Column("rank", sa.Integer(), nullable=False), + sa.Column("project_id", postgresql.UUID(as_uuid=True), nullable=False), + sa.Column("changes", postgresql.JSONB(astext_type=sa.Text()), nullable=False), + sa.ForeignKeyConstraint( + ["project_id"], + ["project.id"], + name=op.f("fk_project_version_delta_project_id_project"), + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id", name=op.f("pk_project_version_delta")), + sa.UniqueConstraint("project_id", "version", "rank", name="unique_deltas"), + ) + op.create_index( + op.f("ix_project_version_delta_project_id"), + "project_version_delta", + ["project_id"], + unique=False, + ) + op.create_index( + "ix_project_version_delta_project_id_version_rank", + "project_version_delta", + ["project_id", "version", "rank"], + unique=False, + ) + op.create_index( + op.f("ix_project_version_delta_rank"), + "project_version_delta", + ["rank"], + unique=False, + ) + op.create_index( + op.f("ix_project_version_delta_version"), + "project_version_delta", + ["version"], + unique=False, + ) + # ### end Alembic commands ### + + op.execute( + """ + INSERT INTO project_version_delta (project_id, version, rank, changes) + WITH delta AS ( + SELECT + h.version_id, + jsonb_agg( + jsonb_strip_nulls( + jsonb_build_object( + 'path', fp.path, + 'size', h.size, + 'change', h.change, + 'version', h.project_version_name, + 'checksum', h.checksum, + 'diff', fdj.diff_path + ) + ) + ) AS changes + FROM + file_history h + JOIN + project_file_path fp ON h.file_path_id = fp.id + LEFT JOIN LATERAL ( + SELECT + fd.path AS diff_path + FROM + file_diff fd + WHERE + fd.file_path_id = fp.id + AND fd.version = h.project_version_name + AND fd.rank = 0 + ) fdj ON TRUE + GROUP BY + -- Group by the single unique version identifier + h.version_id + ) + SELECT + pv.project_id, + pv.name, + 0 AS rank, + d.changes + FROM + delta AS d + JOIN project_version AS pv ON d.version_id = pv.id + ; + """ + ) + + +def downgrade(): + # ### commands auto generated by Alembic - please adjust! ### + op.drop_index( + op.f("ix_project_version_delta_version"), table_name="project_version_delta" + ) + op.drop_index( + op.f("ix_project_version_delta_rank"), table_name="project_version_delta" + ) + op.drop_index( + "ix_project_version_delta_project_id_version_rank", + table_name="project_version_delta", + ) + op.drop_index( + op.f("ix_project_version_delta_project_id"), table_name="project_version_delta" + ) + op.drop_table("project_version_delta") + # ### end Alembic commands ### diff --git a/server/migrations/community/9acf967e58ad_add_project_version_delta.py b/server/migrations/community/9acf967e58ad_add_project_version_delta.py deleted file mode 100644 index 793a5978..00000000 --- a/server/migrations/community/9acf967e58ad_add_project_version_delta.py +++ /dev/null @@ -1,108 +0,0 @@ -"""Add project version delta - -Revision ID: 9acf967e58ad -Revises: bd1ec73db389 -Create Date: 2025-10-10 17:33:31.740232 - -""" - -from alembic import op -import sqlalchemy as sa -from sqlalchemy.dialects import postgresql - -# revision identifiers, used by Alembic. -revision = "9acf967e58ad" -down_revision = "bd1ec73db389" -branch_labels = None -depends_on = None - - -def upgrade(): - # ### commands auto generated by Alembic - please adjust! 
###
-    op.create_table(
-        "project_version_delta",
-        sa.Column("id", sa.BigInteger(), autoincrement=True, nullable=False),
-        sa.Column("rank", sa.Integer(), nullable=False),
-        sa.Column("version_id", sa.Integer(), nullable=False),
-        sa.Column("changes", postgresql.JSONB(astext_type=sa.Text()), nullable=False),
-        sa.ForeignKeyConstraint(
-            ["version_id"],
-            ["project_version.id"],
-            name=op.f("fk_project_version_delta_version_id_project_version"),
-            ondelete="CASCADE",
-        ),
-        sa.PrimaryKeyConstraint("id", name=op.f("pk_project_version_delta")),
-        sa.UniqueConstraint("version_id", "rank", name="unique_deltas"),
-    )
-    op.create_index(
-        op.f("ix_project_version_delta_rank"),
-        "project_version_delta",
-        ["rank"],
-        unique=False,
-    )
-    op.create_index(
-        op.f("ix_project_version_delta_version_id"),
-        "project_version_delta",
-        ["version_id"],
-        unique=False,
-    )
-    op.create_index(
-        "ix_project_version_delta_version_id_rank",
-        "project_version_delta",
-        ["version_id", "rank"],
-        unique=False,
-    )
-    # ### end Alembic commands ###
-
-    # data migration
-    op.execute(
-        """
-        INSERT INTO project_version_delta (version_id, rank, changes)
-        SELECT
-            h.version_id,
-            0 AS rank,
-            jsonb_agg(
-                jsonb_strip_nulls(
-                    jsonb_build_object(
-                        'path', fp.path,
-                        'size', h.size,
-                        'change', h.change,
-                        'version', h.project_version_name,
-                        'checksum', h.checksum,
-                        'diff', fdj.diff_path
-                    )
-                )
-            ) AS changes
-        FROM
-            file_history h
-        JOIN
-            project_file_path fp ON h.file_path_id = fp.id
-        LEFT JOIN LATERAL (
-            SELECT
-                fd.path AS diff_path
-            FROM
-                file_diff fd
-            WHERE
-                fd.file_path_id = fp.id
-                AND fd.version = h.project_version_name
-                AND fd.rank = 0
-        ) fdj ON TRUE
-        GROUP BY
-            h.version_id;
-        """
-    )
-
-
-def downgrade():
-    # ### commands auto generated by Alembic - please adjust! ###
-    op.drop_index(
-        "ix_project_version_delta_version_id_rank", table_name="project_version_delta"
-    )
-    op.drop_index(
-        op.f("ix_project_version_delta_version_id"), table_name="project_version_delta"
-    )
-    op.drop_index(
-        op.f("ix_project_version_delta_rank"), table_name="project_version_delta"
-    )
-    op.drop_table("project_version_delta")
-    # ### end Alembic commands ###
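Note: the checkpoint expectations in the tests above (rank 1 deltas covering versions 1-4 and 5-8, plus single rank 0 entries for versions 9 and 10) follow from how Checkpoint.get_checkpoints() tiles a version range. A rough re-implementation for illustration only, assuming a rank r tile spans 4**r aligned versions; the authoritative rules live in the Checkpoint class, which this series does not touch:

    def tile(since: int, to: int) -> list:
        """Return (rank, start, end) tiles covering versions since..to."""
        tiles = []
        v = since
        while v <= to:
            rank = 0
            # grow the tile while a larger aligned block still fits in the range
            while (v - 1) % 4 ** (rank + 1) == 0 and v + 4 ** (rank + 1) - 1 <= to:
                rank += 1
            end = v + 4**rank - 1
            tiles.append((rank, v, end))
            v = end + 1
        return tiles

    # tile(1, 10) -> [(1, 1, 4), (1, 5, 8), (0, 9, 9), (0, 10, 10)]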
From 99194b01d76a27a68740fa80473827c7b13ea1ae Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Tue, 28 Oct 2025 15:58:33 +0100
Subject: [PATCH 26/36] add mechanism for handling previous history files.
- handle cleanup of delta checkpoints when a project is deleted
---
 server/mergin/sync/files.py                   |  6 +-
 server/mergin/sync/models.py                  | 22 +++++-
 .../mergin/tests/test_project_controller.py   |  4 +
 server/mergin/tests/test_public_api_v2.py     | 77 ++++++++++++++-----
 4 files changed, 81 insertions(+), 28 deletions(-)

diff --git a/server/mergin/sync/files.py b/server/mergin/sync/files.py
index f8103926..d7d995d6 100644
--- a/server/mergin/sync/files.py
+++ b/server/mergin/sync/files.py
@@ -286,12 +286,12 @@ def to_data_delta(self):
 
 @dataclass
 class DeltaChange(DeltaChangeBase):
-    """Delta items stored in database as list of this item with single diff file"""
+    """Change items stored in the database as a list of these items, each with a single diff file"""
 
     diff: Optional[str] = None
 
-    def to_merged_delta(self) -> DeltaChangeMerged:
-        """Convert DeltaData to DeltaMerged with multiple diffs"""
+    def to_merged(self) -> DeltaChangeMerged:
+        """Convert to DeltaChangeMerged with multiple diffs"""
         result = DeltaChangeMerged(
             path=self.path,
             size=self.size,
diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py
index 0d342303..9e12b68f 100644
--- a/server/mergin/sync/models.py
+++ b/server/mergin/sync/models.py
@@ -275,6 +275,11 @@ def delete(self, removed_by: int = None):
         db.session.execute(
             upload_table.delete().where(upload_table.c.project_id == self.id)
         )
+        # remove project version delta related to project
+        delta_table = ProjectVersionDelta.__table__
+        db.session.execute(
+            delta_table.delete().where(delta_table.c.project_id == self.id)
+        )
         self.project_users.clear()
         access_requests = (
             AccessRequest.query.filter_by(project_id=self.id)
@@ -391,7 +396,7 @@ def get_delta_changes(
         for checkpoint in expected_checkpoints:
             existing_delta = existing_delta_map.get((checkpoint.rank, checkpoint.end))
 
-            # we have change in database, just return delta data from it
+            # we have delta in database, just return delta data from it
             if existing_delta:
                 result.extend(DeltaChangeSchema(many=True).load(existing_delta.changes))
                 continue
@@ -1021,6 +1026,7 @@ def merge_changes(
         Changes are merged based on file path and change type.
""" result: Dict[str, DeltaChangeMerged] = {} + updating_files: Set[str] = set() # sorting changes by version to apply them in correct order items.sort(key=lambda x: x.version) @@ -1028,12 +1034,15 @@ def handle_replace(result, path, current, previous): result[path] = current def handle_delete(result, path, current, previous): - del result[path] + + if path in updating_files: + result[path] = current + else: + del result[path] def handle_update(result, path, current, previous): # handle update case, when previous change was create - just revert to create with new metadata current.change = previous.change - current.version = previous.version current.diffs = [] result[path] = current @@ -1072,12 +1081,17 @@ def handle_update_diff(result, path, current, previous): } for item in items: - current = item.to_merged_delta() + current = item.to_merged() path = current.path + # path is key for merging changes previous = result.get(path) + # adding new file change if not seen before if not previous: result[path] = current + # track existing paths to avoid deleting created files later + if current.change != PushChangeType.CREATE: + updating_files.add(path) continue handler = dispatch.get((previous.change, current.change)) diff --git a/server/mergin/tests/test_project_controller.py b/server/mergin/tests/test_project_controller.py index 5b34bfba..b0d89502 100644 --- a/server/mergin/tests/test_project_controller.py +++ b/server/mergin/tests/test_project_controller.py @@ -28,6 +28,7 @@ from ..sync.models import ( FileDiff, Project, + ProjectVersionDelta, Upload, ProjectVersion, SyncFailuresHistory, @@ -536,6 +537,7 @@ def test_delete_project(client): assert not Project.query.filter_by( workspace_id=test_workspace_id, name=test_project ).count() + assert not ProjectVersionDelta.query.filter_by(project_id=project.id).count() assert not os.path.exists(project_dir) rm_project = Project.query.get(project.id) assert rm_project.removed_at and not rm_project.storage_params @@ -1781,6 +1783,8 @@ def test_optimize_storage(app, client, diff_project): diff_project.latest_version = 8 ProjectVersion.query.filter_by(project_id=diff_project.id, name=9).delete() ProjectVersion.query.filter_by(project_id=diff_project.id, name=10).delete() + ProjectVersionDelta.query.filter_by(project_id=diff_project.id, version=9).delete() + ProjectVersionDelta.query.filter_by(project_id=diff_project.id, version=10).delete() db.session.commit() diff_project.cache_latest_files() assert diff_project.latest_version == 8 diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index eceee077..81ef52c1 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -369,12 +369,12 @@ def test_delta_merge_changes(): merged = ProjectVersionDelta.merge_changes([create, update]) assert len(merged) == 1 assert merged[0].change == PushChangeType.CREATE - assert merged[0].version == create.version + assert merged[0].version == update.version # check reverse order as well merged = ProjectVersionDelta.merge_changes([update, create]) assert len(merged) == 1 assert merged[0].change == PushChangeType.CREATE - assert merged[0].version == create.version + assert merged[0].version == update.version # CREATE + DELETE -> removed merged = ProjectVersionDelta.merge_changes([create, delete]) @@ -406,6 +406,36 @@ def test_delta_merge_changes(): assert merged[0].checksum == update_diff2.checksum assert [d.path for d in merged[0].diffs] == ["diff1", "diff2"] + # case when trying to 
delete a file that already exists in history
+    # re-declare the changes with new version numbers
+    delete = DeltaChange(
+        path="file1.gpkg",
+        change=PushChangeType.DELETE,
+        version=6,
+        size=0,
+        checksum="ghi",
+    )
+    created = DeltaChange(
+        path="file1.gpkg",
+        change=PushChangeType.CREATE,
+        version=7,
+        size=100,
+        checksum="abc",
+    )
+    delete8 = DeltaChange(
+        path="file1.gpkg",
+        change=PushChangeType.DELETE,
+        version=8,
+        size=0,
+        checksum="abc2",
+    )
+    merged = ProjectVersionDelta.merge_changes([delete, created, delete8])
+    assert len(merged) == 1
+    assert merged[0].change == PushChangeType.DELETE
+    assert merged[0].version == delete8.version
+    assert merged[0].size == delete8.size
+    assert merged[0].checksum == delete8.checksum
+
 
 def test_project_version_delta_changes(client, diff_project: Project):
     """Test that get_delta_changes and its schema work as expected"""
@@ -419,6 +449,7 @@ def test_project_version_delta_changes(client, diff_project: Project):
         .order_by(ProjectVersionDelta.version)
         .all()
     )
+    # check if deltas are created after pushes
     assert len(deltas) == 10
     initial_delta = deltas[0]
     initial_version = ProjectVersion.query.filter_by(
@@ -429,16 +460,16 @@
     assert initial_delta.rank == 0
     assert initial_delta.version == 1
 
+    # delete file
     delta = diff_project.get_delta_changes(1, 2)
     assert len(delta) == 1
     assert delta[0].change == PushChangeType.DELETE
-    # no ranks created as we get here just first version with get_delta
-    assert ProjectVersionDelta.query.filter_by(rank=1).count() == 0
 
     # delete + create version
     delta = diff_project.get_delta_changes(1, 3)
     assert len(delta) == 1
     assert delta[0].change == PushChangeType.CREATE
+    # file was created in v3
     assert delta[0].version == 3
     assert delta[0].checksum == deltas[3].changes[0]["checksum"]
 
@@ -450,10 +481,10 @@
     # create rank 1 checkpoint for v4
     delta = diff_project.get_delta_changes(0, 4)
-    checkpoint_changes = ProjectVersionDelta.query.filter_by(rank=1)
+    checkpoint = ProjectVersionDelta.query.filter_by(rank=1)
     filediff_checkpoints = FileDiff.query.filter_by(rank=1)
-    checkpoint_change = checkpoint_changes.first()
-    assert checkpoint_changes.count() == 1
+    checkpoint_change = checkpoint.first()
+    assert checkpoint.count() == 1
     assert checkpoint_change.version == deltas[3].version
     assert filediff_checkpoints.count() == 0
     # check if filediff basefile is correctly set
@@ -463,7 +494,7 @@
     assert delta_base_gpkg
     # from history is clear, that we are just creating geopackage in this range
     assert delta_base_gpkg.change == PushChangeType.CREATE
-    assert delta_base_gpkg.version == 3
+    assert delta_base_gpkg.version == 4
     assert delta_base_gpkg.path == file_history.path
     assert delta_base_gpkg.size == file_history.size
     assert delta_base_gpkg.checksum == file_history.checksum
@@ -476,7 +507,11 @@
     assert delta_test_gpkg
     assert delta_test_gpkg.change == PushChangeType.CREATE
     assert ProjectVersionDelta.query.filter_by(rank=1).count() == 2
-    # base gpgk is transparent
+    assert ProjectVersionDelta.query.filter_by(rank=2).count() == 0
+    # check that rank 1 checkpoints exist with the proper end versions
+    assert ProjectVersionDelta.query.filter_by(rank=1, version=4).first()
+    assert ProjectVersionDelta.query.filter_by(rank=1, version=8).first()
+    # base gpkg is transparent, 
because we are requesting from 0
     assert not next((c for c in delta if c.path == "base.gpkg"), None)
 
     delta = diff_project.get_delta_changes(latest_version.name - 3, latest_version.name)
@@ -912,26 +947,26 @@ def test_project_delta(client, diff_project):
     # since 1 to latest version
     response = client.get(f"v2/projects/{diff_project.id}/delta?since=1")
     assert response.status_code == 200
-    assert len(response.json) == 1
-    assert response.json[0]["change"] == PushChangeType.CREATE.value
+    # create of test.gpkg and delete of base.gpkg
+    assert len(response.json) == 2
+    assert response.json[0]["change"] == PushChangeType.DELETE.value
     assert response.json[0]["version"] == 9
-    files = (
-        ProjectVersion.query.filter_by(
-            project_id=diff_project.id, name=response.json[0]["version"]
-        )
-        .first()
-        .files
-    )
+    assert response.json[0]["path"] == "base.gpkg"
+    assert response.json[0]["size"] == 98304
+
+    assert response.json[1]["change"] == PushChangeType.CREATE.value
+    assert response.json[1]["version"] == 9
+    assert response.json[1]["path"] == "test.gpkg"
+    assert response.json[1]["size"] == 98304
 
     # simple update
     response = client.get(f"v2/projects/{diff_project.id}/delta?since=4&to=8")
     assert response.status_code == 200
     delta = response.json
     assert len(delta) == 1
     assert delta[0]["change"] == PushChangeType.UPDATE.value
-    assert delta[0]["version"] == 5
+    # version is the latest version of the change
+    assert delta[0]["version"] == 7
     assert not delta[0].get("diffs")
 

From 2077d89dcde02b157ed543a2d77b7b6740f197d1 Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Tue, 28 Oct 2025 17:19:42 +0100
Subject: [PATCH 27/36] fix integrity test

---
 server/mergin/tests/test_project_controller.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/server/mergin/tests/test_project_controller.py b/server/mergin/tests/test_project_controller.py
index b0d89502..916c2bc4 100644
--- a/server/mergin/tests/test_project_controller.py
+++ b/server/mergin/tests/test_project_controller.py
@@ -2341,6 +2341,10 @@ def _get_user_agent():
         .order_by(desc(ProjectVersion.created))
         .first()
     )
+    # remove project version delta entries
+    ProjectVersionDelta.query.filter_by(
+        project_id=upload.project_id, version=pv.name
+    ).delete()
     db.session.delete(pv)
     db.session.commit()
     upload.project.cache_latest_files()

From 3685e43c590fdb8e1cc5da14430d91f85bc26eff Mon Sep 17 00:00:00 2001
From: "marcel.kocisek"
Date: Wed, 29 Oct 2025 17:52:26 +0100
Subject: [PATCH 28/36] Upgrade logic
- update response to items: []
- make the changes logic clearer
- @varmar05 --- server/mergin/sync/files.py | 13 +- server/mergin/sync/models.py | 187 +++++++++++++----- server/mergin/sync/public_api_v2.yaml | 15 +- .../mergin/sync/public_api_v2_controller.py | 8 +- server/mergin/tests/test_public_api_v2.py | 67 ++++--- 5 files changed, 198 insertions(+), 92 deletions(-) diff --git a/server/mergin/sync/files.py b/server/mergin/sync/files.py index d7d995d6..63c387fd 100644 --- a/server/mergin/sync/files.py +++ b/server/mergin/sync/files.py @@ -331,8 +331,8 @@ def patch_field(self, data, **kwargs): return data -class DeltaChangeRespSchema(DeltaChangeBaseSchema): - """Schema for delta data response""" +class DeltaChangeItemSchema(DeltaChangeBaseSchema): + """Schema for delta changes response""" diffs = fields.List(fields.Nested(DeltaChangeDiffFileSchema())) @@ -342,3 +342,12 @@ def patch_field(self, data, **kwargs): if not data.get("diffs"): data.pop("diffs", None) return data + + +class DeltaChangeRespSchema(ma.Schema): + """Schema for list of delta changes wrapped in items field""" + + items = fields.List(fields.Nested(DeltaChangeItemSchema())) + + class Meta: + unknown = EXCLUDE diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 9e12b68f..985453c8 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -65,6 +65,16 @@ class FileSyncErrorType(Enum): SYNC_ERROR = "sync error" +class ChangeComparisonAction(Enum): + """Actions to take when comparing two changes""" + + REPLACE = "replace" + DELETE = "delete" + UPDATE = "update" + UPDATE_DIFF = "update_diff" + EXCLUDE = "exclude" # Return None to exclude the file + + class Project(db.Model): id = db.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) name = db.Column(db.String, index=True) @@ -993,7 +1003,7 @@ class ProjectVersionDelta(db.Model): version = db.Column(db.Integer, nullable=False, index=True) # exponential order of changes json rank = db.Column(db.Integer, nullable=False, index=True) - # to which project version is this linked + # to which project is this linked project_id = db.Column( UUID(as_uuid=True), db.ForeignKey("project.id", ondelete="CASCADE"), @@ -1025,94 +1035,167 @@ def merge_changes( Merge changes json array objects into one list of changes. Changes are merged based on file path and change type. 
""" - result: Dict[str, DeltaChangeMerged] = {} updating_files: Set[str] = set() # sorting changes by version to apply them in correct order items.sort(key=lambda x: x.version) - def handle_replace(result, path, current, previous): - result[path] = current - - def handle_delete(result, path, current, previous): + # Merge changes for each file in a single pass + result: Dict[str, DeltaChangeMerged] = {} + for item in items: + path = item.path + current = item.to_merged() - if path in updating_files: + # First change for this file + if path not in result: result[path] = current + # track existing paths to avoid deleting files that are already in history before + if current.change != PushChangeType.CREATE: + updating_files.add(path) + continue + + # Compare and merge with previous change for this file + can_delete = path in updating_files + new_change = ProjectVersionDelta._compare_changes( + result[path], current, can_delete + ) + + # Update result (or remove if no change is detected) + if new_change is not None: + result[path] = new_change else: del result[path] - def handle_update(result, path, current, previous): - # handle update case, when previous change was create - just revert to create with new metadata - current.change = previous.change - current.diffs = [] - result[path] = current + return list(result.values()) - def handle_update_diff(result, path, current, previous): - current.diffs = (previous.diffs or []) + (current.diffs or []) - result[path] = current + @staticmethod + def _compare_changes( + previous: DeltaChangeMerged, + new: DeltaChangeMerged, + prevent_delete_change: bool, + ) -> Optional[DeltaChangeMerged]: + """ + Compare and merge two changes for the same file. + + Args: + previous: Previously accumulated change + new: New change to compare + prevent_delete_change: Whether the change can be deleted when resolving create+delete sequences + + Returns: + Merged change or None if file should be excluded + """ - dispatch = { + # Map change type pairs to actions + action_map = { # create + delete = file is transparent for current changes -> delete it - (PushChangeType.CREATE, PushChangeType.DELETE): handle_delete, + ( + PushChangeType.CREATE, + PushChangeType.DELETE, + ): ChangeComparisonAction.DELETE, # create + update = create with updated info - (PushChangeType.CREATE, PushChangeType.UPDATE): handle_update, - (PushChangeType.CREATE, PushChangeType.UPDATE_DIFF): handle_update, - (PushChangeType.CREATE, PushChangeType.CREATE): None, + ( + PushChangeType.CREATE, + PushChangeType.UPDATE, + ): ChangeComparisonAction.UPDATE, + ( + PushChangeType.CREATE, + PushChangeType.UPDATE_DIFF, + ): ChangeComparisonAction.UPDATE, + ( + PushChangeType.CREATE, + PushChangeType.CREATE, + ): ChangeComparisonAction.EXCLUDE, # update + update_diff = update with latest info ( PushChangeType.UPDATE, PushChangeType.UPDATE_DIFF, - ): handle_update, - (PushChangeType.UPDATE, PushChangeType.UPDATE): handle_replace, - (PushChangeType.UPDATE, PushChangeType.DELETE): handle_replace, - (PushChangeType.UPDATE, PushChangeType.CREATE): handle_replace, + ): ChangeComparisonAction.UPDATE, + ( + PushChangeType.UPDATE, + PushChangeType.UPDATE, + ): ChangeComparisonAction.REPLACE, + ( + PushChangeType.UPDATE, + PushChangeType.DELETE, + ): ChangeComparisonAction.REPLACE, + ( + PushChangeType.UPDATE, + PushChangeType.CREATE, + ): ChangeComparisonAction.REPLACE, # update_diff + update_diff = update_diff with latest info with proper order of diffs ( PushChangeType.UPDATE_DIFF, PushChangeType.UPDATE_DIFF, - ): 
handle_update_diff, - (PushChangeType.UPDATE_DIFF, PushChangeType.UPDATE): handle_replace, - (PushChangeType.UPDATE_DIFF, PushChangeType.DELETE): handle_replace, - (PushChangeType.UPDATE_DIFF, PushChangeType.CREATE): None, - (PushChangeType.DELETE, PushChangeType.CREATE): handle_replace, - # delete + update = invalid sequence, keep delete - (PushChangeType.DELETE, PushChangeType.UPDATE): None, - (PushChangeType.DELETE, PushChangeType.UPDATE_DIFF): None, - (PushChangeType.DELETE, PushChangeType.DELETE): None, + ): ChangeComparisonAction.UPDATE_DIFF, + ( + PushChangeType.UPDATE_DIFF, + PushChangeType.UPDATE, + ): ChangeComparisonAction.REPLACE, + ( + PushChangeType.UPDATE_DIFF, + PushChangeType.DELETE, + ): ChangeComparisonAction.REPLACE, + ( + PushChangeType.UPDATE_DIFF, + PushChangeType.CREATE, + ): ChangeComparisonAction.EXCLUDE, + ( + PushChangeType.DELETE, + PushChangeType.CREATE, + ): ChangeComparisonAction.REPLACE, + # delete + update = invalid sequence + ( + PushChangeType.DELETE, + PushChangeType.UPDATE, + ): ChangeComparisonAction.EXCLUDE, + ( + PushChangeType.DELETE, + PushChangeType.UPDATE_DIFF, + ): ChangeComparisonAction.EXCLUDE, + ( + PushChangeType.DELETE, + PushChangeType.DELETE, + ): ChangeComparisonAction.EXCLUDE, } - for item in items: - current = item.to_merged() - path = current.path - # path is key for merging changes - previous = result.get(path) + action = action_map.get((previous.change, new.change)) + result = None + if action == ChangeComparisonAction.REPLACE: + result = new - # adding new file change if not seen before - if not previous: - result[path] = current - # track existing paths to avoid deleting created files later - if current.change != PushChangeType.CREATE: - updating_files.add(path) - continue + elif action == ChangeComparisonAction.DELETE: + # if change is create + delete, we can just remove the change from accumulated changes + # only if this action is allowed (file existed before) + if prevent_delete_change: + result = new - handler = dispatch.get((previous.change, current.change)) - if handler: - handler(result, path, current, previous) + elif action == ChangeComparisonAction.UPDATE: + # handle update case, when previous change was create - just revert to create with new metadata + new.change = previous.change + new.diffs = [] + result = new - return list(result.values()) + elif action == ChangeComparisonAction.UPDATE_DIFF: + new.diffs = (previous.diffs or []) + (new.diffs or []) + result = new + + return result @classmethod def create_checkpoint( cls, project_id: str, checkpoint: Checkpoint, - changes: List[ProjectVersionDelta] = [], + from_deltas: List[ProjectVersionDelta] = [], ) -> Optional[ProjectVersionDelta]: """ Creates and caches new checkpoint and any required FileDiff checkpoints. + Use from_deltas to create checkpoint from existing individual deltas. + Returns created ProjectVersionDelta object with checkpoint. 
""" delta_range = [ change - for change in changes + for change in from_deltas if checkpoint.start <= change.version <= checkpoint.end ] @@ -1122,7 +1205,7 @@ def create_checkpoint( ) return None - # dump delta objects from database and flatten list for merging + # dump changes lists from database and flatten list for merging changes = [] for delta in delta_range: changes.extend(DeltaChangeSchema(many=True).load(delta.changes)) diff --git a/server/mergin/sync/public_api_v2.yaml b/server/mergin/sync/public_api_v2.yaml index 9ac4a2d0..7436e71d 100644 --- a/server/mergin/sync/public_api_v2.yaml +++ b/server/mergin/sync/public_api_v2.yaml @@ -410,9 +410,7 @@ paths: content: application/json: schema: - type: array - items: - $ref: "#/components/schemas/ProjectDelta" + $ref: "#/components/schemas/ProjectDeltaResponse" "400": $ref: "#/components/responses/BadRequest" "404": @@ -841,7 +839,7 @@ components: type: string enum: [create, update, delete, update_diff] example: update - ProjectDelta: + ProjectDeltaChange: type: object required: - path @@ -873,3 +871,12 @@ components: path: type: string example: survey.gpkg-diff-1 + ProjectDeltaResponse: + type: object + required: + - items + properties: + items: + type: array + items: + $ref: "#/components/schemas/ProjectDeltaChange" diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py index 5f539b50..dd1802df 100644 --- a/server/mergin/sync/public_api_v2_controller.py +++ b/server/mergin/sync/public_api_v2_controller.py @@ -414,10 +414,8 @@ def get_project_delta(id: str, since: int, to: Optional[int] = None): abort(400, "'to' version exceeds latest project version") if since >= to: - abort(400, "'since' version must be less than or equal to 'to' version") + abort(400, "'since' version must be less than 'to' version") - delta_changes = project.get_delta_changes(since, to) - if delta_changes is None: - abort(404) + delta_changes = project.get_delta_changes(since, to) or [] - return DeltaChangeRespSchema(many=True).dump(delta_changes), 200 + return DeltaChangeRespSchema().dump({"items": delta_changes}), 200 diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index 81ef52c1..d9efe7f1 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -415,7 +415,7 @@ def test_delta_merge_changes(): size=0, checksum="ghi", ) - created = DeltaChange( + create = DeltaChange( path="file1.gpkg", change=PushChangeType.CREATE, version=7, @@ -449,7 +449,7 @@ def test_project_version_delta_changes(client, diff_project: Project): .order_by(ProjectVersionDelta.version) .all() ) - # check if deltas are created after pushes + # check if deltas are created after pushes within ProjectVersion creation assert len(deltas) == 10 initial_delta = deltas[0] initial_version = ProjectVersion.query.filter_by( @@ -535,9 +535,18 @@ def test_project_version_delta_changes(client, diff_project: Project): fh = FileHistory.query.filter_by( project_version_name=latest_version.name - 1 ).first() - base_gpkg_checkpoint = FileDiff.query.filter_by(basefile_id=fh.id, rank=1).first() - assert base_gpkg_checkpoint - assert base_gpkg_checkpoint.version == latest_version.name + 6 + # testing constistency of db entries FileDiff and ProjectVersionDelta + test_gpkg_checkpoint = FileDiff.query.filter_by(basefile_id=fh.id, rank=1).first() + assert test_gpkg_checkpoint + assert test_gpkg_checkpoint.version == latest_version.name + 6 + delta_checkpoint = 
diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py
index 5f539b50..dd1802df 100644
--- a/server/mergin/sync/public_api_v2_controller.py
+++ b/server/mergin/sync/public_api_v2_controller.py
@@ -414,10 +414,8 @@ def get_project_delta(id: str, since: int, to: Optional[int] = None):
        abort(400, "'to' version exceeds latest project version")

    if since >= to:
-        abort(400, "'since' version must be less than or equal to 'to' version")
+        abort(400, "'since' version must be less than 'to' version")

-    delta_changes = project.get_delta_changes(since, to)
-    if delta_changes is None:
-        abort(404)
+    delta_changes = project.get_delta_changes(since, to) or []

-    return DeltaChangeRespSchema(many=True).dump(delta_changes), 200
+    return DeltaChangeRespSchema().dump({"items": delta_changes}), 200
diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py
index 81ef52c1..d9efe7f1 100644
--- a/server/mergin/tests/test_public_api_v2.py
+++ b/server/mergin/tests/test_public_api_v2.py
@@ -415,7 +415,7 @@ def test_delta_merge_changes():
        size=0,
        checksum="ghi",
    )
-    created = DeltaChange(
+    create = DeltaChange(
        path="file1.gpkg",
        change=PushChangeType.CREATE,
        version=7,
        size=100,
        checksum="abc",
    )
@@ -449,7 +449,7 @@ def test_project_version_delta_changes(client, diff_project: Project):
        .order_by(ProjectVersionDelta.version)
        .all()
    )
-    # check if deltas are created after pushes
+    # check that deltas are created after pushes, as part of ProjectVersion creation
    assert len(deltas) == 10
    initial_delta = deltas[0]
    initial_version = ProjectVersion.query.filter_by(
@@ -535,9 +535,18 @@ def test_project_version_delta_changes(client, diff_project: Project):
    fh = FileHistory.query.filter_by(
        project_version_name=latest_version.name - 1
    ).first()
-    base_gpkg_checkpoint = FileDiff.query.filter_by(basefile_id=fh.id, rank=1).first()
-    assert base_gpkg_checkpoint
-    assert base_gpkg_checkpoint.version == latest_version.name + 6
+    # test consistency of the FileDiff and ProjectVersionDelta db entries
+    test_gpkg_checkpoint = FileDiff.query.filter_by(basefile_id=fh.id, rank=1).first()
+    assert test_gpkg_checkpoint
+    assert test_gpkg_checkpoint.version == latest_version.name + 6
+    delta_checkpoint = ProjectVersionDelta.query.filter_by(
+        project_id=diff_project.id, version=latest_version.name + 6, rank=1
+    ).first()
+    assert delta_checkpoint
+    assert len(delta_checkpoint.changes) == 1
+    assert delta_checkpoint.changes[0]["version"] == latest_version.name + 6
+    assert delta_checkpoint.changes[0]["change"] == PushChangeType.UPDATE_DIFF.value
+    assert delta_checkpoint.changes[0]["diff"] == test_gpkg_checkpoint.path

    fh = FileHistory.query.filter_by(
        project_version_name=latest_version.name + 6
@@ -545,7 +554,7 @@ def test_project_version_delta_changes(client, diff_project: Project):
    delta = diff_project.get_delta_changes(12, latest_version.name + 6)
    assert len(delta) == 1
    assert len(delta[0].diffs) == 1
-    assert delta[0].diffs[0].path == base_gpkg_checkpoint.path
+    assert delta[0].diffs[0].path == test_gpkg_checkpoint.path
    assert delta[0].change == PushChangeType.UPDATE_DIFF
    assert delta[0].checksum == fh.checksum
    assert delta[0].size == fh.size
@@ -916,17 +925,17 @@ def test_project_delta(client, diff_project):
    push_change(initial_project, "added", "base.gpkg", working_dir)
    response = client.get(f"v2/projects/{initial_project.id}/delta?since=0")
    assert response.status_code == 200
-    delta = response.json
-    assert len(delta) == 1
-    assert delta[0]["change"] == PushChangeType.CREATE.value
-    assert delta[0]["version"] == 1
+    changes = response.json["items"]
+    assert len(changes) == 1
+    assert changes[0]["change"] == PushChangeType.CREATE.value
+    assert changes[0]["version"] == 1

    # remove the file and get changes from 0 -> 2 where base gpkg is removed -> transparent
    push_change(initial_project, "removed", "base.gpkg", working_dir)
    response = client.get(f"v2/projects/{initial_project.id}/delta?since=0")
    assert response.status_code == 200
-    delta = response.json
-    assert len(delta) == 0
+    changes = response.json["items"]
+    assert len(changes) == 0

    # invalid cases
    response = client.get(f"v2/projects/{diff_project.id}/delta")
    assert response.status_code == 400
@@ -947,32 +956,32 @@ def test_project_delta(client, diff_project):
    # since 1 to latest version
    response = client.get(f"v2/projects/{diff_project.id}/delta?since=1")
    assert response.status_code == 200
+    changes = response.json["items"]
    # create of test.gpkg and delete base.gpkg
-    assert len(response.json) == 2
-    assert response.json[0]["change"] == PushChangeType.DELETE.value
-    assert response.json[0]["version"] == 9
-    assert response.json[0]["path"] == "base.gpkg"
-    assert response.json[0]["size"] == 98304
+    assert len(changes) == 2
+    assert changes[0]["change"] == PushChangeType.DELETE.value
+    assert changes[0]["version"] == 9
+    assert changes[0]["path"] == "base.gpkg"
+    assert changes[0]["size"] == 98304

-    assert response.json[1]["change"] == PushChangeType.CREATE.value
-    assert response.json[1]["version"] == 9
-    assert response.json[1]["path"] == "test.gpkg"
-    assert response.json[1]["size"] == 98304
+    assert changes[1]["change"] == PushChangeType.CREATE.value
+    assert changes[1]["version"] == 9
+    assert changes[1]["path"] == "test.gpkg"
+    assert changes[1]["size"] == 98304

    # simple update
    response = client.get(f"v2/projects/{diff_project.id}/delta?since=4&to=8")
    assert response.status_code == 200
-    delta = response.json
-    assert len(delta) == 1
-    assert delta[0]["change"] == PushChangeType.UPDATE.value
+    changes = response.json["items"]
+    assert len(changes) == 1
+    assert changes[0]["change"] == PushChangeType.UPDATE.value
    # version is the latest version of the change
-    assert delta[0]["version"] == 7
+    assert changes[0]["version"]
== 7 + assert not changes[0].get("diffs") -# integration test for pull mechanism def test_project_pull_diffs(client, diff_project): - """Test project pull mechanisom in v2 with diff files""" + """Test project pull mechanisom in v2 with diff files. Integration test for pull mechanism""" since = 5 to = 7 # check diff files in database where we can get them with right order and metadata @@ -983,7 +992,7 @@ def test_project_pull_diffs(client, diff_project): ) response = client.get(f"v2/projects/{diff_project.id}/delta?since={since}&to={to}") assert response.status_code == 200 - delta = response.json + delta = response.json["items"] assert len(delta) == 1 assert delta[0]["change"] == PushChangeType.UPDATE_DIFF.value assert delta[0]["version"] == 7 From 02ff0271ddd990595cb328cd51b4539a57fcdb81 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Thu, 6 Nov 2025 08:31:54 +0100 Subject: [PATCH 29/36] API: add 'v' prefix to version in delta endpoint --- server/mergin/sync/files.py | 1 + server/mergin/sync/public_api_v2.yaml | 15 ++++----- .../mergin/sync/public_api_v2_controller.py | 11 +++++-- server/mergin/tests/test_public_api_v2.py | 32 ++++++++++--------- 4 files changed, 34 insertions(+), 25 deletions(-) diff --git a/server/mergin/sync/files.py b/server/mergin/sync/files.py index 63c387fd..8fca7e39 100644 --- a/server/mergin/sync/files.py +++ b/server/mergin/sync/files.py @@ -334,6 +334,7 @@ def patch_field(self, data, **kwargs): class DeltaChangeItemSchema(DeltaChangeBaseSchema): """Schema for delta changes response""" + version = fields.Function(lambda obj: f"v{obj.version}") diffs = fields.List(fields.Nested(DeltaChangeDiffFileSchema())) @post_dump diff --git a/server/mergin/sync/public_api_v2.yaml b/server/mergin/sync/public_api_v2.yaml index 7436e71d..cb3282e7 100644 --- a/server/mergin/sync/public_api_v2.yaml +++ b/server/mergin/sync/public_api_v2.yaml @@ -393,16 +393,12 @@ paths: in: query required: true schema: - type: integer - example: - minimum: 0 + $ref: "#/components/schemas/VersionName" description: Start version (exclusive) - name: to in: query schema: - type: integer - example: 2 - minimum: 1 + $ref: "#/components/schemas/VersionName" description: End version (inclusive) responses: "200": @@ -858,8 +854,7 @@ components: type: string example: 9adb76bf81a34880209040ffe5ee262a090b62ab version: - type: integer - example: 1 + $ref: "#/components/schemas/VersionName" change: $ref: "#/components/schemas/ProjectChangeType" diffs: @@ -880,3 +875,7 @@ components: type: array items: $ref: "#/components/schemas/ProjectDeltaChange" + VersionName: + type: string + pattern: '^v\d+$' + example: v2 diff --git a/server/mergin/sync/public_api_v2_controller.py b/server/mergin/sync/public_api_v2_controller.py index dd1802df..11339f1e 100644 --- a/server/mergin/sync/public_api_v2_controller.py +++ b/server/mergin/sync/public_api_v2_controller.py @@ -405,11 +405,18 @@ def upload_chunk(id: str): ) -def get_project_delta(id: str, since: int, to: Optional[int] = None): +def get_project_delta(id: str, since: str, to: Optional[str] = None): """Get project changes (delta) between two versions""" project: Project = require_project_by_uuid(id, ProjectPermissions.Read) - to = project.latest_version if to is None else to + since = ProjectVersion.from_v_name(since) + to = project.latest_version if to is None else ProjectVersion.from_v_name(to) + if since < 0 or to < 1: + abort( + 400, + "Invalid version number, minimum version for 'since' is 0 and minimum version for 'to' is 1", + ) + if to > 
project.latest_version: abort(400, "'to' version exceeds latest project version") diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index d9efe7f1..b71f465b 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -923,16 +923,16 @@ def test_project_delta(client, diff_project): os.path.join(working_dir, "base.gpkg"), ) push_change(initial_project, "added", "base.gpkg", working_dir) - response = client.get(f"v2/projects/{initial_project.id}/delta?since=0") + response = client.get(f"v2/projects/{initial_project.id}/delta?since=v0") assert response.status_code == 200 changes = response.json["items"] assert len(changes) == 1 assert changes[0]["change"] == PushChangeType.CREATE.value - assert changes[0]["version"] == 1 + assert changes[0]["version"] == "v1" # remove the file and get changes from 0 -> 2 where base gpgkg is removed -> transparent push_change(initial_project, "removed", "base.gpkg", working_dir) - response = client.get(f"v2/projects/{initial_project.id}/delta?since=0") + response = client.get(f"v2/projects/{initial_project.id}/delta?since=v0") assert response.status_code == 200 changes = response.json["items"] assert len(changes) == 0 @@ -940,43 +940,43 @@ def test_project_delta(client, diff_project): # non valid cases response = client.get(f"v2/projects/{diff_project.id}/delta") assert response.status_code == 400 - response = client.get(f"v2/projects/{diff_project.id}/delta?since=2&to=1") + response = client.get(f"v2/projects/{diff_project.id}/delta?since=v2&to=v1") assert response.status_code == 400 - response = client.get(f"v2/projects/{diff_project.id}/delta?since=-2") + response = client.get(f"v2/projects/{diff_project.id}/delta?since=v-2") assert response.status_code == 400 - response = client.get(f"v2/projects/{diff_project.id}/delta?since=-2&to=-1") + response = client.get(f"v2/projects/{diff_project.id}/delta?since=v-2&to=v-1") assert response.status_code == 400 # exceeding latest version - response = client.get(f"v2/projects/{diff_project.id}/delta?since=0&to=2000") + response = client.get(f"v2/projects/{diff_project.id}/delta?since=v0&to=v2000") assert response.status_code == 400 # no changes between versions with same number - response = client.get(f"v2/projects/{diff_project.id}/delta?since=1&to=1") + response = client.get(f"v2/projects/{diff_project.id}/delta?since=v1&to=v1") assert response.status_code == 400 # since 1 to latest version - response = client.get(f"v2/projects/{diff_project.id}/delta?since=1") + response = client.get(f"v2/projects/{diff_project.id}/delta?since=v1") assert response.status_code == 200 changes = response.json["items"] # create of test.gpkg and delete base.gpkg assert len(changes) == 2 assert changes[0]["change"] == PushChangeType.DELETE.value - assert changes[0]["version"] == 9 + assert changes[0]["version"] == "v9" assert changes[0]["path"] == "base.gpkg" assert changes[0]["size"] == 98304 assert changes[1]["change"] == PushChangeType.CREATE.value - assert changes[1]["version"] == 9 + assert changes[1]["version"] == "v9" assert changes[1]["path"] == "test.gpkg" assert changes[1]["size"] == 98304 # simple update - response = client.get(f"v2/projects/{diff_project.id}/delta?since=4&to=8") + response = client.get(f"v2/projects/{diff_project.id}/delta?since=v4&to=v8") assert response.status_code == 200 changes = response.json["items"] assert len(changes) == 1 assert changes[0]["change"] == PushChangeType.UPDATE.value # version is new latest 
version of the change - assert changes[0]["version"] == 7 + assert changes[0]["version"] == "v7" assert not changes[0].get("diffs") @@ -990,12 +990,14 @@ def test_project_pull_diffs(client, diff_project): .order_by(FileDiff.version) .all() ) - response = client.get(f"v2/projects/{diff_project.id}/delta?since={since}&to={to}") + response = client.get( + f"v2/projects/{diff_project.id}/delta?since=v{since}&to=v{to}" + ) assert response.status_code == 200 delta = response.json["items"] assert len(delta) == 1 assert delta[0]["change"] == PushChangeType.UPDATE_DIFF.value - assert delta[0]["version"] == 7 + assert delta[0]["version"] == "v7" first_diff = delta[0]["diffs"][0] second_diff = delta[0]["diffs"][1] assert first_diff["path"] == current_diffs[0].path From f4f00f0baec4d389f3c50660266bd40a5e04ba98 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Fri, 14 Nov 2025 09:58:51 +0100 Subject: [PATCH 30/36] Make construct diff method recursive --- server/mergin/sync/models.py | 47 +++++++++++++---------- server/mergin/tests/test_public_api_v2.py | 7 +++- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 985453c8..61df0637 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -775,7 +775,7 @@ def diffs_chain( ), None, ) - if diff: + if diff and os.path.exists(diff.abs_path): diffs.append(diff) elif item.rank > 0: # fallback if checkpoint does not exist: replace merged diff with individual diffs @@ -879,13 +879,17 @@ def abs_path(self) -> str: def construct_checkpoint(self) -> bool: """Create a diff file checkpoint (aka. merged diff). Find all smaller diffs which are needed to create the final diff file and merge them. - In case of missing some lower rank checkpoint, use individual diffs instead. + In case of missing some lower rank checkpoints, create them recursively. Once checkpoint is created, size and checksum are updated in the database. 
Returns: bool: True if checkpoint was successfully created or already present """ + logging.debug( + f"Construct checkpoint for file {self.path} v{self.version} of rank {self.rank}" + ) + if os.path.exists(self.abs_path): return True @@ -914,7 +918,7 @@ def construct_checkpoint(self) -> bool: return False diffs_paths = [] - # let's confirm we have all intermediate diffs needed, if not, we need to use individual diffs instead + # let's confirm we have all intermediate diffs needed, if not, we need to create them (recursively) first cached_items = Checkpoint.get_checkpoints( cache_level.start, cache_level.end - 1 ) @@ -944,27 +948,28 @@ def construct_checkpoint(self) -> bool: ), None, ) - if diff and os.path.exists(diff.abs_path): + + # lower rank diff not even in DB yet - create it and try to construct merged file + if not diff: + diff = FileDiff( + basefile=basefile, + version=item.end, + rank=item.rank, + path=f"{basefile.file.path}-diff-{uuid.uuid4()}", + size=None, + checksum=None, + ) + db.session.add(diff) + db.session.commit() + + diff_exists = diff.construct_checkpoint() + if diff_exists: diffs_paths.append(diff.abs_path) else: - individual_diffs = ( - FileDiff.query.filter_by( - basefile_id=basefile.id, - rank=0, - ) - .filter( - FileDiff.version >= item.start, FileDiff.version <= item.end - ) - .order_by(FileDiff.version) - .all() + logging.error( + f"Unable to create checkpoint diff for {item} for file {self.file_path_id}" ) - if individual_diffs: - diffs_paths.extend([i.abs_path for i in individual_diffs]) - else: - logging.error( - f"Unable to find diffs for {item} for file {self.file_path_id}" - ) - return False + return False # we apply latest change (if any) on previous version end_diff = FileDiff.query.filter_by( diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index b71f465b..cc96b5bf 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -280,7 +280,10 @@ def test_create_diff_checkpoint(diff_project): diff.construct_checkpoint() assert mtime == os.path.getmtime(diff.abs_path) - # diff for v17-v32 with merged diffs (using one above) + # some lower rank diffs still missing + assert not FileDiff.query.filter_by(version=24, rank=1).count() + + # diff for v17-v32 with merged diffs, this will also create lower missing ranks diff = FileDiff( basefile=basefile, path=f"test.gpkg-diff-{uuid.uuid4()}", version=32, rank=2 ) @@ -288,6 +291,8 @@ def test_create_diff_checkpoint(diff_project): db.session.commit() diff.construct_checkpoint() assert os.path.exists(diff.abs_path) + lower_diff = FileDiff.query.filter_by(version=24, rank=1).first() + assert os.path.exists(lower_diff.abs_path) # assert gpkg diff is the same as it would be from merging all individual diffs individual_diffs = ( From 21dfb2d8196093b0763279cf6dc89562689d71d4 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Fri, 14 Nov 2025 16:43:12 +0100 Subject: [PATCH 31/36] Make delta project function to create missing checkpoints recursively - for file diffs we only create higher ranks if it makes sense (contains changes) - if delta cannot be created / fetched return on error --- server/mergin/sync/models.py | 134 ++++++++++++++++++++++------------- 1 file changed, 83 insertions(+), 51 deletions(-) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 61df0637..92f656d7 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -399,9 +399,6 @@ def get_delta_changes( ) 
existing_delta_map = {(c.rank, c.version): c for c in expected_deltas}

-        # Cache all individual (rank 0) delta rows in the required range.
-        individual_deltas: List[ProjectVersionDelta] = []
-        result: List[DeltaChange] = []
+        result: List[DeltaChange] = []
        for checkpoint in expected_checkpoints:
            existing_delta = existing_delta_map.get((checkpoint.rank, checkpoint.end))
@@ -411,29 +408,10 @@ def get_delta_changes(
                result.extend(DeltaChangeSchema(many=True).load(existing_delta.changes))
                continue

-            # If higher rank delta checkopoint does not exists, we are using rank=0 deltas to create checkopoint
+            # If a higher rank delta checkpoint does not exist, we need to create it
            if checkpoint.rank > 0:
-                individual_deltas = (
-                    ProjectVersionDelta.query.filter(
-                        ProjectVersionDelta.project_id == project_id,
-                        ProjectVersionDelta.version >= since,
-                        ProjectVersionDelta.version <= to,
-                        ProjectVersionDelta.rank == 0,
-                    )
-                    .order_by(ProjectVersionDelta.version)
-                    .all()
-                    if not individual_deltas
-                    else individual_deltas
-                )
-
-                if not individual_deltas:
-                    logging.error(
-                        f"No individual deltas found for project {project_id} in range {since} / {to} to create checkpoint."
-                    )
-                    return
-
                new_checkpoint = ProjectVersionDelta.create_checkpoint(
-                    project_id, checkpoint, individual_deltas
+                    project_id, checkpoint
                )
                if new_checkpoint:
                    result.extend(
@@ -443,6 +421,7 @@ def get_delta_changes(
                    logging.error(
                        f"Not possible to create checkpoint for project {project_id} in range {checkpoint.start}-{checkpoint.end}"
                    )
+                    return

        return ProjectVersionDelta.merge_changes(result)
@@ -520,6 +499,10 @@ def __init__(self, project_id, path):
        self.project_id = project_id
        self.path = path

+    def generate_diff_name(self):
+        """Generate a unique diff file name for a server-generated diff"""
+        return mergin_secure_filename(f"{self.path}-diff-{uuid.uuid4()}")
+

 class LatestProjectFiles(db.Model):
    """Store project latest version files history ids"""
@@ -876,6 +859,23 @@ def abs_path(self) -> str:
        """
        return os.path.join(self.file.project.storage.project_dir, self.location)

+    @staticmethod
+    def can_create_checkpoint(file_path_id: int, checkpoint: Checkpoint) -> bool:
+        """Check if it makes sense to create a diff file for a checkpoint, e.g. there were changes within the range"""
+        if checkpoint.rank == 0:
+            return True
+
+        return (
+            FileDiff.query.filter_by(file_path_id=file_path_id)
+            .filter(
+                FileDiff.version >= checkpoint.start,
+                FileDiff.version <= checkpoint.end,
+                FileDiff.rank == 0,
+            )
+            .count()
+            > 0
+        )
+
    def construct_checkpoint(self) -> bool:
@@ -940,6 +940,7 @@ def construct_checkpoint(self) -> bool:
                continue

            # find diff in table and on disk
+            # diffs might not exist because they were not created yet or there were no changes (e.g.
for zeroth rank diffs) diff = next( ( d @@ -949,18 +950,24 @@ def construct_checkpoint(self) -> bool: None, ) - # lower rank diff not even in DB yet - create it and try to construct merged file if not diff: - diff = FileDiff( - basefile=basefile, - version=item.end, - rank=item.rank, - path=f"{basefile.file.path}-diff-{uuid.uuid4()}", - size=None, - checksum=None, - ) - db.session.add(diff) - db.session.commit() + # lower rank diff not even in DB yet - create it and try to construct merged file + if item.rank > 0 and FileDiff.can_create_checkpoint( + self.file_path_id, item + ): + diff = FileDiff( + basefile=basefile, + version=item.end, + rank=item.rank, + path=basefile.file.generate_diff_name(), + size=None, + checksum=None, + ) + db.session.add(diff) + db.session.commit() + else: + # such diff is not expected to exist + continue diff_exists = diff.construct_checkpoint() if diff_exists: @@ -1191,26 +1198,51 @@ def create_checkpoint( cls, project_id: str, checkpoint: Checkpoint, - from_deltas: List[ProjectVersionDelta] = [], ) -> Optional[ProjectVersionDelta]: """ - Creates and caches new checkpoint and any required FileDiff checkpoints. - Use from_deltas to create checkpoint from existing individual deltas. - Returns created ProjectVersionDelta object with checkpoint. + Creates and caches new checkpoint and any required FileDiff checkpoints recursively if needed. """ - delta_range = [ - change - for change in from_deltas - if checkpoint.start <= change.version <= checkpoint.end - ] + delta_range = [] + # our new checkpoint will be created by adding last individual delta to previous checkpoints + expected_checkpoints = Checkpoint.get_checkpoints( + checkpoint.start, checkpoint.end - 1 + ) + expected_checkpoints.append(Checkpoint(rank=0, index=checkpoint.end)) + + expected_deltas = ( + ProjectVersionDelta.query.filter( + ProjectVersionDelta.project_id == project_id, + tuple_(ProjectVersionDelta.rank, ProjectVersionDelta.version).in_( + [(item.rank, item.end) for item in expected_checkpoints] + ), + ) + .order_by(ProjectVersionDelta.version) + .all() + ) + + existing_delta_map = {(c.rank, c.version): c for c in expected_deltas} + # make sure we have all components, if not, created them (recursively) + for item in expected_checkpoints: + existing_delta = existing_delta_map.get((item.rank, item.end)) + if not existing_delta: + existing_delta = cls.create_checkpoint(project_id, item) + + if existing_delta: + delta_range.append(existing_delta) + else: + logging.error( + f"Missing project delta endpoint for {project_id} v{item.end} rank {item.rank} which could not be recreated" + ) + return if not delta_range: logging.warning( - f"No individual changes found for project {project_id} in range v{checkpoint.start}-v{checkpoint.end} to create checkpoint." + f"No changes found for project {project_id} in range v{checkpoint.start}-v{checkpoint.end} to create checkpoint." 
) return None # dump changes lists from database and flatten list for merging + delta_range = sorted(delta_range, key=lambda x: x.version) changes = [] for delta in delta_range: changes.extend(DeltaChangeSchema(many=True).load(delta.changes)) @@ -1218,7 +1250,7 @@ def create_checkpoint( d.to_data_delta() for d in cls.merge_changes(changes) ] - # Pre-fetch data for all versioned files to create FileDiff checkpoints + # Pre-fetch data for all versioned files to create FileDiff checkpoints where it makes sense versioned_delta_items = [ item for item in merged_delta_items @@ -1251,17 +1283,17 @@ def create_checkpoint( if not base_file: continue - diff_path = mergin_secure_filename( - f"{item.path}-diff-{uuid.uuid4()}" - ) + if not FileDiff.can_create_checkpoint(file_path_id, checkpoint): + continue + checkpoint_diff = FileDiff( basefile=base_file, - path=diff_path, + path=base_file.file.generate_diff_name(), rank=checkpoint.rank, version=checkpoint.end, ) # Patch the delta with the path to the new diff checkpoint - item.diff = diff_path + item.diff = checkpoint_diff.path db.session.add(checkpoint_diff) checkpoint_delta = ProjectVersionDelta( From f7da89001362e3d0a25b4582f3e336635f7ae6c4 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Tue, 25 Nov 2025 08:10:07 +0100 Subject: [PATCH 32/36] Make diff checkoint validation check more robust --- server/mergin/sync/models.py | 37 ++++++++++++++++++----- server/mergin/tests/test_public_api_v2.py | 35 ++++++++++++++++++++- 2 files changed, 63 insertions(+), 9 deletions(-) diff --git a/server/mergin/sync/models.py b/server/mergin/sync/models.py index 92f656d7..30e0901b 100644 --- a/server/mergin/sync/models.py +++ b/server/mergin/sync/models.py @@ -861,20 +861,41 @@ def abs_path(self) -> str: @staticmethod def can_create_checkpoint(file_path_id: int, checkpoint: Checkpoint) -> bool: - """Check if it makes sense to create a diff file for a checkpoint, e.g. there where changes within the range""" - if checkpoint.rank == 0: - return True + """Check if it makes sense to create a diff file for a checkpoint, e.g. there were relevant changes within the range without breaking changes""" - return ( - FileDiff.query.filter_by(file_path_id=file_path_id) + basefile = FileHistory.get_basefile(file_path_id, checkpoint.end) + if not basefile: + return False + + file_was_deleted = ( + FileHistory.query.filter_by(file_path_id=file_path_id) .filter( - FileDiff.version >= checkpoint.start, - FileDiff.version <= checkpoint.end, - FileDiff.rank == 0, + FileHistory.project_version_name + >= max(basefile.project_version_name, checkpoint.start), + FileHistory.project_version_name <= checkpoint.end, + FileHistory.change == PushChangeType.DELETE.value, ) .count() > 0 ) + if file_was_deleted: + return False + + query = FileDiff.query.filter_by(basefile_id=basefile.id).filter( + FileDiff.rank == 0 + ) + + # rank 0 is a special case we only verify it exists + if checkpoint.rank == 0: + query = query.filter(FileDiff.version == checkpoint.end) + # for higher ranks we need to check if there were diff updates in that range + else: + query = query.filter( + FileDiff.version >= checkpoint.start, + FileDiff.version <= checkpoint.end, + ) + + return query.count() > 0 def construct_checkpoint(self) -> bool: """Create a diff file checkpoint (aka. merged diff). 
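To make the rank/range arithmetic behind can_create_checkpoint and its tests easier to follow, here is a hedged sketch of the layout the code implies, assuming the exponential factor of 4 suggested by the Checkpoint(rank, index) calls in the tests below (the helper below is illustrative and not part of the codebase):

    def checkpoint_range(rank, index):
        """Versions covered by a checkpoint: a rank r unit spans 4**r versions."""
        size = 4**rank
        return (size * (index - 1) + 1, size * index)

    assert checkpoint_range(0, 4) == (4, 4)    # rank 0 is a single version
    assert checkpoint_range(1, 2) == (5, 8)    # v5-v8, as in the tests below
    assert checkpoint_range(2, 1) == (1, 16)   # v1-v16 contains the delete at v9

Under this layout a higher-rank checkpoint is built recursively from the lower-rank units that tile its range, which is exactly what create_checkpoint and construct_checkpoint do above.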
diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index cc96b5bf..b9a1891c 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -29,7 +29,7 @@ ProjectVersionDelta, ) from ..sync.files import DeltaChange, PushChangeType -from ..sync.utils import is_versioned_file +from ..sync.utils import Checkpoint, is_versioned_file from sqlalchemy.exc import IntegrityError import pytest from datetime import datetime, timedelta, timezone @@ -260,6 +260,7 @@ def test_create_diff_checkpoint(diff_project): assert len(diffs) == 22 # diff for v17-v20 from individual diffs + assert FileDiff.can_create_checkpoint(file_path_id, Checkpoint(1, 5)) is True diff = FileDiff( basefile=basefile, path=f"test.gpkg-diff-{uuid.uuid4()}", version=20, rank=1 ) @@ -329,6 +330,38 @@ def test_create_diff_checkpoint(diff_project): assert not os.path.exists(diff.abs_path) +def test_can_create_checkpoint(diff_project): + """Test if diff file checkpoint can be created""" + file_path_id = ( + ProjectFilePath.query.filter_by(project_id=diff_project.id, path="base.gpkg") + .first() + .id + ) + + # we target v1 where file was uploaded => no diff + assert FileDiff.can_create_checkpoint(file_path_id, Checkpoint(0, 1)) is False + + # for zero rank diffs we can always create a checkpoint (but that should already exist) + assert FileDiff.can_create_checkpoint(file_path_id, Checkpoint(0, 4)) is True + + # there are diffs in both ranges, v1-v4 and v5-v8 + assert FileDiff.can_create_checkpoint(file_path_id, Checkpoint(1, 1)) is True + assert FileDiff.can_create_checkpoint(file_path_id, Checkpoint(1, 2)) is True + + # higher ranks cannot be created as file was removed at v9 + assert FileDiff.can_create_checkpoint(file_path_id, Checkpoint(2, 1)) is False + + # there is no diff for such file in this range + file_path_id = ( + ProjectFilePath.query.filter_by( + project_id=diff_project.id, path="inserted_1_A.gpkg" + ) + .first() + .id + ) + assert FileDiff.can_create_checkpoint(file_path_id, Checkpoint(1, 1)) is False + + def test_delta_merge_changes(): """Test merging of delta changes works as expected""" From 2938e21c256a9f80e7e5075bae05d4ab1e9e6660 Mon Sep 17 00:00:00 2001 From: Martin Varga Date: Tue, 25 Nov 2025 08:35:15 +0100 Subject: [PATCH 33/36] Add more tests --- server/mergin/tests/test_public_api_v2.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py index b9a1891c..8c9eae03 100644 --- a/server/mergin/tests/test_public_api_v2.py +++ b/server/mergin/tests/test_public_api_v2.py @@ -539,6 +539,7 @@ def test_project_version_delta_changes(client, diff_project: Project): assert len(delta_base_gpkg.diffs) == 0 # get data with multiple ranks = 1 level checkpoints 1-4, 5-8 + checkpoint 9 and 10 + assert not ProjectVersionDelta.query.filter_by(rank=1, version=8).first() delta = diff_project.get_delta_changes(0, 10) assert len(delta) == len(latest_version.files) delta_test_gpkg = next((d for d in delta if d.path == "test.gpkg"), None) @@ -548,6 +549,7 @@ def test_project_version_delta_changes(client, diff_project: Project): assert ProjectVersionDelta.query.filter_by(rank=2).count() == 0 # check if version is having rank 1 checkpoint with proper end version assert ProjectVersionDelta.query.filter_by(rank=1, version=4).first() + # missing lower checkpoint is recreated assert ProjectVersionDelta.query.filter_by(rank=1, version=8).first() # base gpgk is 
transparent, because we are requesting from 0
     assert not next((c for c in delta if c.path == "base.gpkg"), None)
@@ -603,6 +605,14 @@ def test_project_version_delta_changes(client, diff_project: Project):
     )
     assert response.status_code == 200

+    # remove intermediate deltas and assert they would be recreated if needed for higher ranks
+    ProjectVersionDelta.query.filter(ProjectVersionDelta.rank > 0).delete()
+    db.session.commit()
+    # v1-v16 would be created from v1-v4, v5-v8 and v9-v12 and 4 individual deltas
+    delta = diff_project.get_delta_changes(0, diff_project.latest_version)
+    assert ProjectVersionDelta.query.filter_by(rank=1).count() == 3
+    assert ProjectVersionDelta.query.filter_by(rank=2, version=16).count() == 1
+

 push_data = [
     # success

From a9986b15e21b7502131ae2d4e2bfd61d931a5b4a Mon Sep 17 00:00:00 2001
From: Martin Varga
Date: Tue, 25 Nov 2025 13:44:41 +0100
Subject: [PATCH 34/36] Add cli command to trigger checkpoints caching

---
 server/mergin/sync/commands.py  | 47 +++++++++++++++
 server/mergin/tests/test_cli.py | 69 ++++++++++++++++++++++-
 server/mergin/tests/test_public_api_v2.py | 16 +++++-
 3 files changed, 128 insertions(+), 4 deletions(-)

diff --git a/server/mergin/sync/commands.py b/server/mergin/sync/commands.py
index 21f5ef15..882b73b6 100644
--- a/server/mergin/sync/commands.py
+++ b/server/mergin/sync/commands.py
@@ -127,3 +127,50 @@ def remove(project_name):
        project.removed_by = None
        db.session.commit()
        click.secho("Project removed", fg="green")
+
+    @project.command()
+    @click.argument("project-name", callback=normalize_input(lowercase=False))
+    @click.option("--since", type=int, required=False)
+    @click.option("--to", type=int, required=False)
+    def create_checkpoint(project_name, since=None, to=None):
+        """Create a project delta checkpoint, the corresponding lower-rank checkpoints and merged diffs"""
+        ws, name = split_project_path(project_name)
+        workspace = current_app.ws_handler.get_by_name(ws)
+        if not workspace:
+            click.secho("ERROR: Workspace does not exist", fg="red", err=True)
+            sys.exit(1)
+        project = (
+            Project.query.filter_by(workspace_id=workspace.id, name=name)
+            .filter(Project.storage_params.isnot(None))
+            .first()
+        )
+        if not project:
+            click.secho("ERROR: Project does not exist", fg="red", err=True)
+            sys.exit(1)
+
+        since = since if since is not None else 0
+        to = to if to is not None else project.latest_version
+        if since < 0 or to < 1:
+            click.secho(
+                "ERROR: Invalid version number, minimum version for 'since' is 0 and minimum version for 'to' is 1",
+                fg="red",
+                err=True,
+            )
+            sys.exit(1)
+
+        if to > project.latest_version:
+            click.secho(
+                "ERROR: 'to' version exceeds latest project version", fg="red", err=True
+            )
+            sys.exit(1)
+
+        if since >= to:
+            click.secho(
+                "ERROR: 'since' version must be less than 'to' version",
+                fg="red",
+                err=True,
+            )
+            sys.exit(1)
+
+        project.get_delta_changes(since, to)
+        click.secho("Project checkpoint(s) created", fg="green")
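The command and option names come from the tests below; assuming the usual Flask CLI entry point for this server, a hypothetical invocation would look like `flask project create-checkpoint <workspace>/<project> --since 0 --to 8`, with both bounds defaulting to the full project history when omitted.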
diff --git a/server/mergin/tests/test_cli.py b/server/mergin/tests/test_cli.py
index d0b91717..d4197f86 100644
--- a/server/mergin/tests/test_cli.py
+++ b/server/mergin/tests/test_cli.py
@@ -13,7 +13,7 @@
 from mergin.auth.models import User
 from mergin.commands import _check_permissions, _check_celery
 from mergin.stats.models import MerginInfo
-from mergin.sync.models import Project, ProjectVersion
+from mergin.sync.models import FileDiff, Project, ProjectVersion, ProjectVersionDelta
 from mergin.tests import (
     test_project,
     test_workspace_id,
@@ -545,3 +545,70 @@ def test_check_celery(mock_ping, ping, result, output, capsys):
     out, err = capsys.readouterr()  # capture what was echoed to stdout
     assert ("Error: " not in out) == result
     assert output in out
+
+
+create_project_checkpoint_data = [
+    (
+        f"{test_workspace_name}/non-existing",
+        0,
+        1,
+        "ERROR: Project does not exist",
+    ),
+    (
+        f"{test_workspace_name}/{test_project}",
+        4,
+        1,
+        "ERROR: 'since' version must be less than 'to' version",
+    ),
+    (
+        f"{test_workspace_name}/{test_project}",
+        0,
+        100,
+        "ERROR: 'to' version exceeds latest project version",
+    ),
+    (
+        f"{test_workspace_name}/{test_project}",
+        0,
+        0,
+        "ERROR: Invalid version number, minimum version for 'since' is 0 and minimum version for 'to' is 1",
+    ),
+    (
+        f"{test_workspace_name}/{test_project}",
+        0,
+        4,
+        "Project checkpoint(s) created",
+    ),
+    (
+        f"{test_workspace_name}/{test_project}",
+        None,
+        None,
+        "Project checkpoint(s) created",
+    ),
+]
+
+
+@pytest.mark.parametrize("project_name,since,to,output", create_project_checkpoint_data)
+def test_create_checkpoint(runner, project_name, since, to, output, diff_project):
+    """Test 'project create-checkpoint' command"""
+    ProjectVersionDelta.query.filter_by(project_id=diff_project.id).filter(
+        ProjectVersionDelta.rank > 0
+    ).delete()
+    db.session.commit()
+
+    result = runner.invoke(
+        args=[
+            "project",
+            "create-checkpoint",
+            project_name,
+            "--since",
+            since,
+            "--to",
+            to,
+        ]
+    )
+    assert output in result.output
+    checkpoints = ProjectVersionDelta.query.filter(ProjectVersionDelta.rank > 0).count()
+    if result.exit_code == 0:
+        assert checkpoints > 0
+    else:
+        assert checkpoints == 0
diff --git a/server/mergin/tests/test_public_api_v2.py b/server/mergin/tests/test_public_api_v2.py
index 8c9eae03..f30fe462 100644
--- a/server/mergin/tests/test_public_api_v2.py
+++ b/server/mergin/tests/test_public_api_v2.py
@@ -606,12 +606,22 @@ def test_project_version_delta_changes(client, diff_project: Project):
     assert response.status_code == 200

     # remove intermediate deltas and assert they would be recreated if needed for higher ranks
-    ProjectVersionDelta.query.filter(ProjectVersionDelta.rank > 0).delete()
+    ProjectVersionDelta.query.filter_by(project_id=diff_project.id).filter(
+        ProjectVersionDelta.rank > 0
+    ).delete()
     db.session.commit()
     # v1-v16 would be created from v1-v4, v5-v8 and v9-v12 and 4 individual deltas
     delta = diff_project.get_delta_changes(0, diff_project.latest_version)
-    assert ProjectVersionDelta.query.filter_by(rank=1).count() == 3
+    assert (
+        ProjectVersionDelta.query.filter_by(project_id=diff_project.id, rank=1).count()
+        == 3
+    )
-    assert ProjectVersionDelta.query.filter_by(rank=2, version=16).count() == 1
+    assert (
+        ProjectVersionDelta.query.filter_by(
+            project_id=diff_project.id, rank=2, version=16
+        ).count()
+        == 1
+    )

From 2b872e582f84f90cc7b2664b9f22deff0e9f6b81 Mon Sep 17 00:00:00 2001
From: Martin Varga
Date: Tue, 25 Nov 2025 16:01:34 +0100
Subject: [PATCH 35/36] Fix failing tests with random 504

Do not update the global config variable for gevent mode. Make sure we do not
use the gevent env for tests apart from dedicated tests. In those tests, mock
the configuration rather than modifying the global variable.
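As a minimal illustration of the pattern this commit adopts (illustrative code, not part of the diff): mutating a class attribute on a config object leaks into every later test, while patch.object restores the original value on exit.

    from unittest.mock import patch

    class Configuration:
        GEVENT_WORKER = False

    def careless():
        Configuration.GEVENT_WORKER = True  # leaks: stays True for later tests

    def careful():
        with patch.object(Configuration, "GEVENT_WORKER", True):
            assert Configuration.GEVENT_WORKER is True
        assert Configuration.GEVENT_WORKER is False  # restored on exit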
--- server/.test.env | 1 + server/mergin/tests/test_middleware.py | 99 +++++++++++++++----------- 2 files changed, 59 insertions(+), 41 deletions(-) diff --git a/server/.test.env b/server/.test.env index bdaa7bfa..63294a3f 100644 --- a/server/.test.env +++ b/server/.test.env @@ -24,3 +24,4 @@ SECURITY_BEARER_SALT='bearer' SECURITY_EMAIL_SALT='email' SECURITY_PASSWORD_SALT='password' DIAGNOSTIC_LOGS_DIR=/tmp/diagnostic_logs +GEVENT_WORKER=0 \ No newline at end of file diff --git a/server/mergin/tests/test_middleware.py b/server/mergin/tests/test_middleware.py index 82b9cf26..2f5cbe4f 100644 --- a/server/mergin/tests/test_middleware.py +++ b/server/mergin/tests/test_middleware.py @@ -6,6 +6,7 @@ import psycogreen.gevent import pytest import sqlalchemy +from unittest.mock import patch from ..app import create_simple_app, GeventTimeoutMiddleware, db from ..config import Configuration @@ -14,58 +15,74 @@ @pytest.mark.parametrize("use_middleware", [True, False]) def test_use_middleware(use_middleware): """Test using middleware""" - Configuration.GEVENT_WORKER = use_middleware - Configuration.GEVENT_REQUEST_TIMEOUT = 1 - application = create_simple_app() + with patch.object( + Configuration, + "GEVENT_WORKER", + use_middleware, + ), patch.object( + Configuration, + "GEVENT_REQUEST_TIMEOUT", + 1, + ): + application = create_simple_app() - def ping(): - gevent.sleep(Configuration.GEVENT_REQUEST_TIMEOUT + 1) - return "pong" + def ping(): + gevent.sleep(Configuration.GEVENT_REQUEST_TIMEOUT + 1) + return "pong" - application.add_url_rule("/test", "ping", ping) - app_context = application.app_context() - app_context.push() + application.add_url_rule("/test", "ping", ping) + app_context = application.app_context() + app_context.push() - assert isinstance(application.wsgi_app, GeventTimeoutMiddleware) == use_middleware - # in case of gevent, dummy endpoint it set to time out - assert application.test_client().get("/test").status_code == ( - 504 if use_middleware else 200 - ) + assert ( + isinstance(application.wsgi_app, GeventTimeoutMiddleware) == use_middleware + ) + # in case of gevent, dummy endpoint it set to time out + assert application.test_client().get("/test").status_code == ( + 504 if use_middleware else 200 + ) def test_catch_timeout(): """Test proper handling of gevent timeout with db.session.rollback""" psycogreen.gevent.patch_psycopg() - Configuration.GEVENT_WORKER = True - Configuration.GEVENT_REQUEST_TIMEOUT = 1 - application = create_simple_app() + with patch.object( + Configuration, + "GEVENT_WORKER", + True, + ), patch.object( + Configuration, + "GEVENT_REQUEST_TIMEOUT", + 1, + ): + application = create_simple_app() - def unhandled(): - try: - db.session.execute("SELECT pg_sleep(1.1);") - finally: - db.session.execute("SELECT 1;") - return "" + def unhandled(): + try: + db.session.execute("SELECT pg_sleep(1.1);") + finally: + db.session.execute("SELECT 1;") + return "" - def timeout(): - try: - db.session.execute("SELECT pg_sleep(1.1);") - except gevent.timeout.Timeout: - db.session.rollback() - raise - finally: - db.session.execute("SELECT 1;") - return "" + def timeout(): + try: + db.session.execute("SELECT pg_sleep(1.1);") + except gevent.timeout.Timeout: + db.session.rollback() + raise + finally: + db.session.execute("SELECT 1;") + return "" - application.add_url_rule("/unhandled", "unhandled", unhandled) - application.add_url_rule("/timeout", "timeout", timeout) - app_context = application.app_context() - app_context.push() + application.add_url_rule("/unhandled", "unhandled", 
unhandled) + application.add_url_rule("/timeout", "timeout", timeout) + app_context = application.app_context() + app_context.push() - assert application.test_client().get("/timeout").status_code == 504 + assert application.test_client().get("/timeout").status_code == 504 - # in case of missing rollback sqlalchemy would raise error - with pytest.raises(sqlalchemy.exc.PendingRollbackError): - application.test_client().get("/unhandled") + # in case of missing rollback sqlalchemy would raise error + with pytest.raises(sqlalchemy.exc.PendingRollbackError): + application.test_client().get("/unhandled") - db.session.rollback() + db.session.rollback() From 80adef2f4ff0c1f92c2d8fca71e5a3647b561151 Mon Sep 17 00:00:00 2001 From: "marcel.kocisek" Date: Thu, 27 Nov 2025 09:53:20 +0100 Subject: [PATCH 36/36] Publish v2 pull enabled flag --- server/mergin/sync/config.py | 2 ++ server/mergin/tests/fixtures.py | 1 + server/mergin/tests/test_config.py | 1 + 3 files changed, 4 insertions(+) diff --git a/server/mergin/sync/config.py b/server/mergin/sync/config.py index 7200dae5..c2556f25 100644 --- a/server/mergin/sync/config.py +++ b/server/mergin/sync/config.py @@ -75,3 +75,5 @@ class Configuration(object): UPLOAD_CHUNKS_EXPIRATION = config( "UPLOAD_CHUNKS_EXPIRATION", default=86400, cast=int ) + # whether client can pull using v2 apis + V2_PULL_ENABLED = config("V2_PULL_ENABLED", default=True, cast=bool) diff --git a/server/mergin/tests/fixtures.py b/server/mergin/tests/fixtures.py index e1f3859f..5d719878 100644 --- a/server/mergin/tests/fixtures.py +++ b/server/mergin/tests/fixtures.py @@ -36,6 +36,7 @@ def flask_app(request): "COLLECT_STATISTICS", "USER_SELF_REGISTRATION", "V2_PUSH_ENABLED", + "V2_PULL_ENABLED", ] ) register(application) diff --git a/server/mergin/tests/test_config.py b/server/mergin/tests/test_config.py index af677cb0..494bb438 100644 --- a/server/mergin/tests/test_config.py +++ b/server/mergin/tests/test_config.py @@ -22,6 +22,7 @@ def test_config(client): "user_self_registration", "build_hash", "v2_push_enabled", + "v2_pull_enabled", } resp = client.get("/config") assert resp.status_code == 200
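Since the new flag is surfaced through the public /config endpoint alongside v2_push_enabled, a client could gate its pull behaviour on it. A hypothetical check (server URL assumed; the key name comes from the test above):

    import requests

    cfg = requests.get("https://server.example.com/config").json()
    if cfg.get("v2_pull_enabled"):
        ...  # the client may pull using the v2 delta endpoints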