From e96d289bd0b737b0cd8ac91004515a8baf8ddfd9 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Fri, 19 Nov 2021 17:02:09 +0800
Subject: [PATCH 01/57] First stab at finding a format to help analyse the problem

---
 DEVELOPMENT.md | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 9abdd166392..dd1420cb3f3 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -173,4 +173,57 @@ echo c56a8e7aa92c86c41a923bc760d2dc39e8a31cf7 | git cat-file --batch | tail +2
Thus one has to post-process the file by reducing its size by one using `truncate -s -1 fixture`, **removing the newline byte**.

# Discovery: data consistency and resource usage of packed objects

## Summary

Concurrent writes to packs observable by applications pose a challenge to the current implementation, and we need to find ways around that.

## Motivation

Use this document to shed light on the entire problem space surrounding data consistency and resource usage of packed objects, to aid in finding solutions
that are best for various use cases without committing to high costs in one case or another.

## Status Quo

`gitoxide`s implementation as of 2021-11-19 is known to be unsuitable in the presence of concurrent writes to packs due to its inability to automatically respond
to changed packs on disk if objects cannot be found on the first attempt. Other implementations like `git2` and canonical `git` handle this by using
thread-safe interior mutability at the cost of scalability.

`gitoxide`s implementation may be insufficient in that regard, but it shows how read-only data allows scaling across cores as well as the CPU architecture allows.

The usage of system resources like file handles is simple but potentially wasteful, as all packs are memory-mapped in full immediately. Lazy and partial memory-mapping
of packs is used in other implementations. Laziness allows for more efficiency, and partial mapping makes it possible to handle big packs on 32 bit systems.

Failure to acquire a memory map due to limits on the number of open file handles results in an error when initializing the pack database in `gitoxide`s implementation.
To my knowledge, this is handled similarly in other implementations. All implementations assume there is unlimited memory, but the effect of running out of memory is only
known to me in the case of `gitoxide`, which will panic.

## Values

- We value _highly_ the ability to scale object access performance with cores.
- We value _more_ offering a choice of trade-offs than aiming for a one-size-fits-all solution, unless the latter has no shortcomings.

### Out of scope

- We don't seek to handle out-of-memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though.
- We do not want to enable 32 bit applications to handle pack files greater than 4GB and leave this field entirely to the other implementations.

## Existing technical solutions

| **Problem** | **Solution** | **Benefits** | **Shortcomings** | **Example Implementation** |
|---|---|---|---|---|
| 1. initialization | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide |
| | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git |
| 2. file limit hit | 1. fail | read-only possible | | gitoxide |
| | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) |
| 3. file handle reduction/avoid hitting file limit | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count | libgit2 (only within pack database) |
| | 2. process-wide pooling of memory maps | perfect control over | | |
| 4. object miss | 1. fail | | | gitoxide |
| | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | | | libgit2 |
| 5. race when creating/altering more than a pack at a time | 1. ignore | | | all of them (I think) |

## Approach

We will look at typical access patterns holistically based on various use-cases and decide which existing technical solution would fit best.

From f14cd615dbf3f4aea01d32476c694d2258a11ab6 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sat, 20 Nov 2021 09:59:00 +0800
Subject: [PATCH 02/57] Finish technical problems and solutions (#259)

---
 DEVELOPMENT.md | 45 +++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index dd1420cb3f3..895bdeb6f6c 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -204,24 +204,33 @@ known to me in the case of `gitoxide`, which will panic.
- We value _highly_ the ability to scale object access performance with cores.
- We value _more_ offering a choice of trade-offs than aiming for a one-size-fits-all solution, unless the latter has no shortcomings.

### Out of scope

- We don't seek to handle out-of-memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though.
- We do not want to enable 32 bit applications to handle pack files greater than 4GB and leave this field entirely to the other implementations.

## Existing technical solutions

| **Problem** | **Solution** | **Benefits** | **Shortcomings** | **Example Implementation** |
|---|---|---|---|---|
| 1. initialization | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide |
| | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git |
| 2. file limit hit | 1. fail | read-only possible | | gitoxide |
| | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) |
| 3. file handle reduction/avoid hitting file limit | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count | libgit2 (only within pack database) |
| | 2. process-wide pooling of memory maps | perfect control over | | |
| 4. object miss | 1. fail | | | gitoxide |
| | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | | | libgit2 |
| 5. race when creating/altering more than a pack at a time | 1. ignore | | | all of them (I think) |

- We don't value the handling of out-of-memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though.
- We don't value enabling 32 bit applications to deal with pack files greater than 4GB and leave this field entirely to the other implementations.

## Existing technical problems and their solutions

| **Problem** | **Solution** | **Benefits** | **Shortcomings** | **Example Implementation** |
|---|---|---|---|---|
| **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide |
| | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git |
| **2. file limit hit** | 1. fail | read-only possible | | gitoxide |
| | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) |
| **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS-imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) |
| | 2. process-wide pooling of memory maps | share packs across multiple repository instances | pack databases aren't self-contained anymore | |
| **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide |
| | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause a huge amount of extra work if object misses are expected; does not handle problem 5. | libgit2 |
| ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them |
| | 2. retry more than one time | greatly reduced likelihood of object misses | | |

### Addendum: problem 5.

Refreshing packs if an object is missed is the current way of handling writes to the pack database. As outlined in
how [geometric repacking works](https://github.blog/2021-04-29-scaling-monorepo-maintenance/#geometric-repacking) it can indeed
happen that multiple packs are changed, which isn't atomic. However, since this will be done in an additive fashion, first adding the new packs based on existing packs
and loose objects, and then removing the packs and loose objects they replace, there is no race happening, as all objects stay reachable at all times.

## Understanding changes to repositories

A change to any file in a git repository has the potential to affect the process operating on it. TBD

## Approach

We will look at typical access patterns holistically based on various use-cases and decide which existing technical solution would fit best.

From 0c02c1315997083e742f3667c765f1d21e8ee687 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sat, 20 Nov 2021 12:07:26 +0800
Subject: [PATCH 03/57] Analysis about loose refs db (#259)

---
 DEVELOPMENT.md | 97 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 92 insertions(+), 5 deletions(-)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 895bdeb6f6c..89770c65a68 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -177,7 +177,8 @@ Thus one has to post-process the file by reducing its size by one using `truncat

## Summary

Concurrent writes to packs observable by applications pose a challenge to the current implementation, and we need to find ways around that. There may be other limitations
related to any resource that changes often, most prominently refs.

## Motivation

Use this document to shed light on the entire problem space surrounding data consistency and resource usage of packed objects, to aid in finding solutions
that are best for various use cases without committing to high costs in one case or another.

## Status Quo

### Object Databases

In repositories there may be millions of objects, and accessing them quickly is relevant to many of git's features.

#### Packed object database

Packs and the object database are inherently append-only, i.e. objects are never deleted *[0], allowing concurrent readers to observe a consistent state even in the presence
of writers. Writers create new pack files and may remove them after adding all changes successfully.

`gitoxide`s implementation as of 2021-11-19 is known to be unsuitable in the presence of concurrent writes to packs due to its inability to automatically respond
to changed packs on disk if objects cannot be found on the first attempt. Other implementations like `git2` and canonical `git` handle this by using
thread-safe interior mutability at the cost of scalability.

`gitoxide`s implementation may be insufficient in that regard, but it shows how read-only data allows scaling across cores as well as the CPU architecture allows.

The usage of system resources like file handles is simple but potentially wasteful, as all packs are memory-mapped in full immediately. Lazy and partial memory-mapping
of packs is used in other implementations. Laziness allows for more efficiency, and partial mapping makes it possible to handle big packs on 32 bit systems.

Failure to acquire a memory map due to limits on the number of open file handles results in an error when initializing the pack database in `gitoxide`s implementation.
To my knowledge, this is handled similarly in other implementations. All implementations assume there is unlimited memory, but the effect of running out of memory is only
known to me in the case of `gitoxide`, which will panic.

[0] deletion is possible but doesn't happen instantly; it requires time to pass, calls to git-maintenance, and the objects to be unreachable, i.e. not used in the
    entire repository.

#### Loose object database

Each object is stored in a single file on disk, partitioned by the first byte of its content hash. All implementations handle it similarly.

### Loose reference database

References, i.e. pointers to objects or other references, are stored in files, one reference at a time, or with multiple references in a single well-known file, `packed-refs`.
`packed-refs` is updated during maintenance to keep direct references only.

Multiple references can change at the same time, but multiple changes aren't atomic, as changes are made one file at a time. All implementations may observe intermediate states
where some but not all references are updated.
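To make the two storage forms concrete, here is what a `packed-refs` file might look like; the names and hashes below are invented for illustration, and the `^` line carries the peeled target of the annotated tag right above it:

```
# pack-refs with: peeled fully-peeled sorted
188c3d5e27cf0f0b1649bbf2a45f261bcf0d1133 refs/heads/main
3b18e512dba79e4c8300dd08aeb37f8e728b8dad refs/tags/v1.0
^bd4a0c4e7b746482b0e2d0df852c2ab97dc19c6e
```

A loose reference like `refs/heads/main` would otherwise live in its own file `.git/refs/heads/main`, containing just the hash followed by a newline.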
`packed-refs` may change during maintenance or upon deletion of references. All implementations cache the `packed-refs` file but check for a stale cache (i.e. see if the file on disk
changed in the meantime) before each use of the cached data.

Read performance, i.e. when accessing individual reference values or iterating references,
is heavily limited by disk-IO when accessing loose files. Handling of `packed-refs` is speedy even in the presence of hundreds of thousands
of references, due to optimizations performed in all implementations.

Reference update/add performance is parallel per reference as long as concurrent writers don't overlap in the references they change, but it is bound by disk-IO,
due to writes happening one file at a time. `packed-refs` is not changed, but typically read to validate write constraints that help with atomicity,
i.e. only change a value if it matches the previously observed one.

Deletions are slow and a worst-case scenario, as not only will the loose reference(s) be deleted, but potentially the `packed-refs` file will be updated as well if
it contained the (possibly only) copy or copies. Such an update implies rewriting the `packed-refs` file entirely. During that time it is locked, blocking or failing other writers and forming
a choking point.

`gitoxide`s implementation keeps one `packed-refs` cache per handle to the underlying repository, which costs a file handle for a memory map if the `packed-refs` file is larger than
32kB, theoretically configurable but currently hardcoded based on the default in `git`.
Other implementations maintain one per repository instance (libgit2) or one per process (git).

| **Operation** | **read loose file** | **locks** | **costs** | **read packed-refs** | **concurrency granularity** | **Limit/Bottleneck** |
|---|---|---|---|---|---|---|
| **Add/Update** | only if comparing to previous value | loose ref | .lock file per reference; write new value; move into place; create intermediate directories as needed; delete empty directories in the way as needed; read packed-refs | only if comparing to previous value and loose reference file isn't present | per reference | disk-IO |
| **Read** | always | | read loose file; read packed-refs | if loose file didn't exist | n/a | disk-IO |
| **Delete** | only if asserting previous value | loose ref; packed-ref | .lock file per reference; delete loose file; delete lock file; .lock for packed-refs; rewrite packed-refs.lock; move packed-refs.lock into place | always | per reference (and packed-refs) | disk-IO (and CPU if packed-refs is involved) |
| **maintenance** | always all (or by filter) | loose ref; packed-ref | .lock file per reference; read loose reference; .lock for packed-refs; read entire packed-refs; insert loose reference values; write entire altered packed-refs into packed-refs.lock; move packed-refs.lock into place; delete existing loose references and delete their .lock | always | per reference and packed-refs | disk-IO and CPU |

Failures to add/update/delete may occur if the constraint isn't met.
It's possible to wait in the presence of a lock file instead of failing immediately,
which is beneficial if there is no value constraint.
Typically, value constraints will be used for safety though, so waiting for a lock to be acquired usually results in failure right afterwards,
caused by a value mismatch. However, in the presence of deletions it is always useful to wait for locks, as after a deletion the previous value can't be checked anymore,
allowing the operation to succeed.

Races do not exist for writers, but they do exist for readers, as they may observe intermediate states of transactions involving multiple updates.

Semantically, `gitoxide`s implementation of this database is equivalent to the one of `git`.

### Ref-table database

**TBD**

## Understanding changes to repositories

A change to any file in a git repository has the potential to affect the process operating on it. Related changes to multiple files are never atomic and can be observed
in an in-between state.

**TBD**

## Values

- We value _highly_ the ability to scale object access performance with cores.
- We value _more_ offering a choice of trade-offs than aiming for a one-size-fits-all solution, unless the latter has no shortcomings.

- We don't value the handling of out-of-memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though.
- We don't value enabling 32 bit applications to deal with pack files greater than 4GB and leave this field entirely to the other implementations.

## Approach

We will look at typical access patterns holistically based on various use-cases and decide which existing technical solution would fit best.

## Problems and solutions

### Loose references database has inconsistent reads

When updating multiple references in a single transaction, readers may observe intermediate states, with some refs pointing to the previous value and some pointing to the new one.

The known **solution** is to switch to the `ref-table` implementation.

## Learnings

### Loose references database

- When deleting (with or without value constraint), wait for locks instead of failing, to work around `packed-refs` being a choking point.
- When adding/updating references, prefer to fail immediately, as the chance for the same change being made concurrently is low, and failure
  would occur after waiting for the lock due to constraint mismatch.

From 3ada0c9da6edcce4de7a7642bad8567821ee85a1 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sat, 20 Nov 2021 13:47:38 +0800
Subject: [PATCH 04/57] Read up on ref-table a little and fill in some details (#259)

---
 DEVELOPMENT.md | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 89770c65a68..48b7efdb070 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -189,7 +189,7 @@ that are best for various use cases without committing to high costs in one case

### Object Databases

In repositories there may be millions to tens of millions of objects like commits, blobs, trees and tags, and accessing them quickly is relevant to many of git's features.

#### Packed object database

@@ -217,9 +217,16 @@ known to me in the case of `gitoxide`, which will panic.

Each object is stored in a single file on disk, partitioned by the first byte of its content hash. All implementations handle it similarly.

### Reference databases

References are pointers to objects or other references and are crucial to `git`, as they form the entry points of all git operations. If objects aren't reachable by starting
graph traversal at a reference (or its historical reflog information) they are effectively lost forever, i.e. unreachable.

There may be just a couple to hundreds of thousands of references in a repository, and they are changed right after new objects are added to the object database.

#### Loose reference database

References are stored in files, one reference at a time, or with multiple references in a single well-known file, `packed-refs`.
`packed-refs` is updated during maintenance to keep direct references only.

`gitoxide`s implementation keeps one `packed-refs` cache per handle to the underlying repository, which costs a file handle for a memory map if the `packed-refs` file is larger than
32kB, theoretically configurable but currently hardcoded based on the default in `git`.
Other implementations maintain one per repository instance (libgit2) or one per process (git).

Even the biggest transactions will only additionally open one loose reference file at a time, and close it right after use.

| **Operation** | **read loose file** | **locks** | **costs** | **read packed-refs** | **concurrency granularity** | **Limit/Bottleneck** |
|---|---|---|---|---|---|---|
| **Add/Update** | only if comparing to previous value | loose ref | .lock file per reference; write new value; move into place; create intermediate directories as needed; delete empty directories in the way as needed; read packed-refs; possibly append to reflog | only if comparing to previous value and loose reference file isn't present | per reference | disk-IO |
| **Read** | always | | read loose file; read packed-refs | if loose file didn't exist | n/a | disk-IO |
| **Delete** | only if asserting previous value | loose ref; packed-ref | .lock file per reference; delete loose file; delete lock file; .lock for packed-refs; rewrite packed-refs.lock; move packed-refs.lock into place; possibly delete reflog per ref | always | per reference (and packed-refs) | disk-IO (and CPU if packed-refs is involved) |
| **maintenance** | always all (or by filter) | loose ref; packed-ref | .lock file per reference; read loose reference; .lock for packed-refs; read entire packed-refs; insert loose reference values; write entire altered packed-refs into packed-refs.lock; move packed-refs.lock into place; delete existing loose references and delete their .lock | always | per reference and packed-refs | disk-IO and CPU |

#### Ref-table database

Even though `gitoxide` doesn't have an implementation yet, it's useful to understand its semantics as a possible path forward.

As a major difference to the _loose reference database_, it doesn't operate on user-editable files but uses a binary format optimized for read performance and consistency, allowing
readers to always have a consistent view. There can be any number of readers.

All changes like updates, additions or deletions are fully transactional, but there can only be one writer at a time. This has the potential for contention in busy
(mono-)repositories, as file-based locking mechanisms aren't fair, and waiting strategies with exponential backoff may cause some writers to wait forever.

Due to the lack of experience with this database, details around resource consumption in terms of file handles can't be provided at this time.

### Loose references database

- When deleting (with or without value constraint), wait for locks instead of failing, to work around `packed-refs` being a choking point. It's certainly worth it to split transactions
  so that deletions are done separately from updates, to allow usage of the most suitable locking strategies.
- When adding/updating references, prefer to fail immediately, as the chance for the same change being made concurrently is low, and failure
  would occur after waiting for the lock due to constraint mismatch.
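The locking learnings above can be made concrete with a small sketch. The following is illustrative only and not `gitoxide` API: names like `acquire_lock` and `OnLock` are invented for this example, and it uses only the Rust standard library to show the two waiting strategies for acquiring a git-style `.lock` file.

```rust
use std::{fs, io, path::Path, thread, time::Duration};

/// Behaviour when the `.lock` file of a reference is already taken by another writer.
enum OnLock {
    /// Give up immediately: suited for add/update, where a value constraint
    /// would most likely fail right after waiting anyway.
    Fail,
    /// Retry with exponential backoff: suited for deletions, where waiting
    /// can still lead to success as there is no previous value to compare against.
    RetryWithBackoff { attempts: u32, initial_delay: Duration },
}

/// Acquire `<ref_path>.lock` the way git does: by creating the file exclusively.
fn acquire_lock(ref_path: &Path, mode: OnLock) -> io::Result<fs::File> {
    let lock_path = ref_path.with_extension("lock");
    let try_create = || {
        fs::OpenOptions::new()
            .write(true)
            .create_new(true) // fails with `AlreadyExists` if another writer holds the lock
            .open(&lock_path)
    };
    match mode {
        OnLock::Fail => try_create(),
        OnLock::RetryWithBackoff { attempts, initial_delay } => {
            let mut delay = initial_delay;
            for _ in 1..attempts {
                match try_create() {
                    Err(err) if err.kind() == io::ErrorKind::AlreadyExists => {
                        thread::sleep(delay);
                        delay *= 2; // exponential backoff; note that this is not fair
                    }
                    other => return other,
                }
            }
            try_create() // last attempt, propagating the final error if any
        }
    }
}

fn main() -> io::Result<()> {
    // Deleting: waiting is worth it, as there is no value constraint to invalidate.
    let _lock = acquire_lock(
        Path::new(".git/refs/heads/feature"),
        OnLock::RetryWithBackoff { attempts: 5, initial_delay: Duration::from_millis(50) },
    )?;
    // ... perform the deletion, update packed-refs, then remove the lock file ...
    Ok(())
}
```

The split-transaction advice falls out of this sketch naturally: deletions use `RetryWithBackoff` while additions and updates use `Fail`, so one transaction can't force the other's locking strategy.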
From 3bdf6c3e1afd1d0de92ff11def9e55f8b9a593c8 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 20 Nov 2021 14:18:27 +0800 Subject: [PATCH 05/57] Write about git configuration to assert it doesn't affect the discovery (#259) --- DEVELOPMENT.md | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 48b7efdb070..8dd73b50d26 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -185,19 +185,28 @@ related to any resource that changes often, most prominently refs. Use this document to shed light on the entire problem space surrounding data consistency and resource usage of packed objects to aid in finding solutions that are best for various use cases without committing to high costs in one case or another. -## Status Quo +## Git as database + +Databases are well understood and there is common semantics they typically adhere to. The following is an exploration of the various kinds of databases that make up a git +repository along with some of their interplay. ### Object Databases In repositories there may be millions to 10th of millions of objects like commits, blobs, trees and tags, and accessing them quickly is relevant to many of git's features. +#### Loose object database + +Each object is stored in a single file on disk, partitions by the first byte of its content hash. All implementations handle it similarly. + #### Packed object database -As packs and the object database is inherently append-only, i.e. objects are never *[0] deleted, allowing concurrent readers to observe a consistent state even in presence -of writers. Writers create new pack files and may remove them after adding all changes successfully. +Packs are single files containing one or more objects. Delta-compression allows them to be very size-efficient, e.g. 90x compression for the linux kernel. + +Packs and the object database is inherently append-only, i.e. objects are never *[0] deleted, allowing concurrent readers to observe a consistent state even in presence +of writers. Writers create new pack files and may remove them after adding all changes successfully, without the need for locking. `gitoxide`s implementation as of 2021-11-19 is known to be unsuitable in the presence of concurrent writes to packs due to its inability to automatically respond -to changed packs on disk if objects cannot be found on the first attempt. Other implementations like `git2` and canonical `git` handle this by using +to changed packs on disk if objects cannot be found on the first attempt. Other implementations like `libgit2` and canonical `git` handle this by using thread-safe interior mutability at the cost of scalability. `gitoxide`s implementation may be insufficient in that regard, but it shows how read-only data allows to scale across core as well as the CPU architecture allows. @@ -213,10 +222,6 @@ known to me in case of `gitoxide` which will panic. [0] deletion is possible but doesn't happen instantly, instead requiring time to pass and calls to git-maintenance and for them to be unreachable, i.e. not used in the entire repository. -#### Loose object database - -Each object is stored in a single file on disk, partitions by the first byte of its content hash. All implementations handle it similarly. - ### Reference databases References are pointers to objects or other references and are crucial to `git` as they form the entry points in all git operations. 
If objects aren't reachable by starting @@ -271,7 +276,7 @@ Races exist do not exist for writers, but do exist for readers as they may obser Semantically, `gitoxide`s implementation of this database is equivalent to the one of `git`. -### Ref-table database +#### Ref-table database Even though `gitoxide` doesn't have an implementation yet it's useful to understand its semantics as a possible path forward. @@ -283,6 +288,16 @@ All changes like updates, additions or deletions are fully transactional, but th Due to the lack of experience with this database details around resource consumption in terms of file handles can't be provided at this time. +### Configuration + +`git` employs cascaded configuration files to affect processes running on the repository. Their effect can vary widely, and so can their use in applications handling repositories. + +`gitoxide`s implementation currently uses only the repository git configuration file when initializing a repository to correctly determine whether or not it is _bare. +It does not expose git configuration at this time and doesn't use git configuration when repository initialization is complete. + +We have no reason to believe that other implementations use git configuration beyond first initialization either or do anything to assure it remains up to date in memory +after reading it. + ## Understanding changes to repositories A change to any file in a git repository has the potential to affect the process operating on it. Related changes to multiple files are never atomic, and can be observed @@ -301,6 +316,8 @@ in an in-between state. ## Existing technical problems and their solutions +Solutions aren't always mutually exclusive despite the form of presentation suggesting it. + | **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** | |---------------------------------------------------------------|----------------------------------------------------------------------------|----------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------| | **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide | @@ -309,10 +326,12 @@ in an in-between state. | | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | | **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) | | | 2. process-wide pooling of memory maps | share packs across multiple repositories instances | pack databases aren't self-contained anymore | | +| | 3. Multi-pack index files | A single index can be used N packs, saving N-1 packs for N>1 packs | | | | **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide | | | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause huge amount of extra work if object misses are expected; does not handle problem 5. 
| libgit2 | | ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them | | | 2. retry more than one time | greatly reduced likelyhood of object misses | | | + ### Amendum problem 5. Refreshing packs if an object is missed is the current way of handling writes to the pack database. As outlined in @@ -340,3 +359,7 @@ The known **solution** is to switch to the `ref-table` implementation. so that deletions are done separately from updates to allow usage of the most suitable locking strategies. - When adding/updating references, prefer to fail immediately as the chance for the same change being made concurrently is low, and failure would occur after waiting for the lock due to constraint mismatch. + +### Git configuration + +- some application types might never use it, which is why it should be controllable how and what is loaded (when we actually implement it). From a6d73e43ad9a112cf938abe1f1a44e6ef9fb8d5d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 20 Nov 2021 15:10:21 +0800 Subject: [PATCH 06/57] beef up problem/solution list and add git index info for completeness (#259) --- DEVELOPMENT.md | 64 +++++++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 8dd73b50d26..618b6edd792 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -173,7 +173,7 @@ echo c56a8e7aa92c86c41a923bc760d2dc39e8a31cf7 | git cat-file --batch | tail +2 Thus one has to post-process the file by reducing its size by one using `truncate -s -1 fixture`, **removing the newline byte**. -# Discovery: data consistency and resource usage of packed objects +# Discovery: data consistency and resource usage ## Summary @@ -298,14 +298,25 @@ It does not expose git configuration at this time and doesn't use git configurat We have no reason to believe that other implementations use git configuration beyond first initialization either or do anything to assure it remains up to date in memory after reading it. +### Index + +The `index` is a single file for the purpose acting as a staging area for building and manipulating upcoming commits. + +It is created and updated when checking out files into the working tree, and is used to keep track of working tree states while no commit object +was created yet, i.e. during `git add …`. + +`gitoxide` neither implements it nor it is used in concurrent environments, which is why we exclude it from this discovery. + ## Understanding changes to repositories -A change to any file in a git repository has the potential to affect the process operating on it. Related changes to multiple files are never atomic, and can be observed +A change to any file in a git repository has the potential to affect the process operating on it. For the sake of practicality and from experience, we focus exclusively +on changes to the object database and the reference database along with typical interactions. + +Related changes to multiple files are never atomic, and can be observed in an in-between state. **TBD** - ## Values - We value _highly_ to scale object access performance with cores. @@ -314,23 +325,30 @@ in an in-between state. - We don't value the handling of out of memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though. - We don't value enabling 32 bit applications to deal with pack files greater than 4GB and leave this field entirely to the other implementations. 
-## Existing technical problems and their solutions +## Known technical problems and their solutions Solutions aren't always mutually exclusive despite the form of presentation suggesting it. -| **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** | -|---------------------------------------------------------------|----------------------------------------------------------------------------|----------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------| -| **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide | -| | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git | -| **2. file limit hit** | 1. fail | read-only possible | | gitoxide | -| | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | -| **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) | -| | 2. process-wide pooling of memory maps | share packs across multiple repositories instances | pack databases aren't self-contained anymore | | -| | 3. Multi-pack index files | A single index can be used N packs, saving N-1 packs for N>1 packs | | | -| **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide | -| | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause huge amount of extra work if object misses are expected; does not handle problem 5. | libgit2 | -| ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them | -| | 2. retry more than one time | greatly reduced likelyhood of object misses | | | +| **Database** | **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** | +|-------------------|----------------------------------------------------------------------------------------|----------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------| +| **pack** | **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide | +| | | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git | +| **pack** | **2. file limit hit** | 1. 
fail | read-only possible | | gitoxide | +| | | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | +| **pack** | **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) | +| | | 2. process-wide pooling of memory maps | share packs across multiple repositories instances | pack databases aren't self-contained anymore | | +| | | 3. map whole packs at once (instead of non-overlapping parts of them) | Only consume one file handle per pack (as opposed to one per region) | cannot handle packs larger than 4GB on 32bit systems | gitoxide | +| | | 4. Multi-pack index files | A single index can be used N packs, saving N-1 packs for N>1 packs | | | +| **pack** | **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide | +| | | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause huge amount of extra work if object misses are expected; does not handle problem 5. | libgit2 | +| **pack** | ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them | +| | | 2. retry more than one time | greatly reduced likelyhood of object misses | | | +| **pack** | **6.too many (small) packs (i.e. due to pack-receive) reduce lookup performance** | 1. explode pack into loose objects (and deal with them separately) | can run in parallel (but is typically bound by max IOP/s) | might take a while if many objects are contained in the pack due to file IOP/s; needs recompresssion and looses delta compression; risk of too many small objects | | +| | | 2. combine multiple packs into one | keep all benefits of packs; very fast if pack-to-pack copy is used; can run in parallel (but is typically bound by max IOP/s) | combining with big packs takes has to write a lot of data; can be costly if pack delta compression is used | | +| **loose refs** | **7. readers observe in-between states of transactions** | 1. switch to ref-table database | | switches to shortcomings of ref-table | | +| **ref-table** | **8. contention of writers around unfair lock** | 1. implement process-internal queuing mechanism | minimal cost and high throughput with minimal, in-process synchronization | doesn't help with contention caused by multiple processes | | +| | | 2. use ref-table partitions | works for in-process and multi-process case, even though it might be slower than queues. | I can't find information about it anymore | | +| **loose objects** | **9. too many loose objects reduce overall performance** | 1. use packs | | needs scheduled or otherwise managed maintenance, and winning strategies depend on the size and business of the repository | | ### Amendum problem 5. @@ -339,17 +357,9 @@ how [geometric repacking works](https://github.blog/2021-04-29-scaling-monorepo- happen that multiple packs are changed which isn't atomic. However, since this will be done in an additive fashion, first adding the new packs based on existing packs and loose objects, and then removing the packs and loose objects they replace, there is no race happening as all objects stay reachable at all times. 
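For illustration, here is roughly what solution 2 of problem 4 could look like when combined with solution 2 of problem 5, i.e. refresh the pack list on a miss and retry more than once. This is a minimal sketch with all names invented for this example, not the actual `gitoxide` API, and with the pack search and directory scanning elided:

```rust
use std::{io, path::PathBuf};

struct Pack; // stand-in for a memory-mapped pack index + data file pair

struct PackDatabase {
    pack_dir: PathBuf,
    packs: Vec<Pack>, // packs mapped so far
}

impl PackDatabase {
    /// Search all currently mapped packs; the actual lookup is elided.
    fn lookup_in_loaded_packs(&self, _id: &[u8; 20]) -> Option<Vec<u8>> {
        None
    }

    /// Re-read the pack directory, mapping packs we haven't seen yet and
    /// dropping those which disappeared on disk; elided as well.
    fn refresh(&mut self) -> io::Result<()> {
        let _ = &self.pack_dir; // would use fs::read_dir(&self.pack_dir)
        Ok(())
    }

    /// Find an object, refreshing the pack list on a miss, more than once,
    /// to counter the additive pack rewrites described above.
    fn find(&mut self, id: &[u8; 20]) -> io::Result<Option<Vec<u8>>> {
        const REFRESH_ATTEMPTS: usize = 2;
        if let Some(obj) = self.lookup_in_loaded_packs(id) {
            return Ok(Some(obj));
        }
        for _ in 0..REFRESH_ATTEMPTS {
            self.refresh()?;
            if let Some(obj) = self.lookup_in_loaded_packs(id) {
                return Ok(Some(obj));
            }
        }
        Ok(None) // truly absent, or writers raced us more often than we retried
    }
}

fn main() -> io::Result<()> {
    let mut db = PackDatabase { pack_dir: PathBuf::from(".git/objects/pack"), packs: Vec::new() };
    let found = db.find(&[0u8; 20])?;
    assert!(found.is_none()); // the all-zero id is never present
    Ok(())
}
```

Solution 1 of problem 4 corresponds to dropping the refresh loop entirely, which keeps `find()` free of mutation and thus trivially shareable across threads, at the price of spurious misses.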
-## Approach - -We will look at typical access patterns holistically based on various use-cases and decide which existing technical solution would fit best. - -## Problems and solutions - -### Loose references database has inconsistent reads - -When updating multiple references in a single transaction, writers may observe an intermediate states with some refs pointing to the previous value, some pointing to the new. +## Use-Cases -The known **solution** is to switch to the `ref-table` implementation. +We will look at typical access patterns holistically based on various use-cases, look at the problems they would run into and pick their preferred solution. ## Learnings From a0bcae0f8a4f8bbbd89a203473d4b67500bc77c4 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 20 Nov 2021 15:32:30 +0800 Subject: [PATCH 07/57] get started with analysing changes (soon changes in parallel) (#259) --- DEVELOPMENT.md | 80 +++++++++++++++++++++++++++----------------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 618b6edd792..8a233296c7b 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -307,48 +307,47 @@ was created yet, i.e. during `git add …`. `gitoxide` neither implements it nor it is used in concurrent environments, which is why we exclude it from this discovery. -## Understanding changes to repositories +## Applying changes to repositories while maintaining consistency -A change to any file in a git repository has the potential to affect the process operating on it. For the sake of practicality and from experience, we focus exclusively -on changes to the object database and the reference database along with typical interactions. +A change to any file in a git repository has the potential to affect the process operating on it, hence it's important to perform them so these +observe only valid repository states. For the sake of simplicity, we assume atomicity for all operations, but these still have to be ordered correctly +to assure a consistent state. -Related changes to multiple files are never atomic, and can be observed -in an in-between state. +### Adding objects to the database -**TBD** - -## Values - -- We value _highly_ to scale object access performance with cores. -- We value _more_ to offer a choice of trade-offs than to aim for a one-size-fits-all solution, unless the latter has no shortcomings. - -- We don't value the handling of out of memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though. -- We don't value enabling 32 bit applications to deal with pack files greater than 4GB and leave this field entirely to the other implementations. +1. add objects + * this is safe as these objects aren't reachable yet. + * object databases are append, only, so no existing state is ever affected. +2. adjust references + * point related references to the entry-points of the newly added objects (typically commits or tags) + * now the new objects can be discovered ## Known technical problems and their solutions +From what we know, changing repositories can have the following problems, many of which have decent solutions. Solutions aren't always mutually exclusive despite the form of presentation suggesting it. 
-| **Database** | **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** | -|-------------------|----------------------------------------------------------------------------------------|----------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------| -| **pack** | **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide | -| | | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git | -| **pack** | **2. file limit hit** | 1. fail | read-only possible | | gitoxide | -| | | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | -| **pack** | **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) | -| | | 2. process-wide pooling of memory maps | share packs across multiple repositories instances | pack databases aren't self-contained anymore | | -| | | 3. map whole packs at once (instead of non-overlapping parts of them) | Only consume one file handle per pack (as opposed to one per region) | cannot handle packs larger than 4GB on 32bit systems | gitoxide | -| | | 4. Multi-pack index files | A single index can be used N packs, saving N-1 packs for N>1 packs | | | -| **pack** | **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide | -| | | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause huge amount of extra work if object misses are expected; does not handle problem 5. | libgit2 | -| **pack** | ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them | -| | | 2. retry more than one time | greatly reduced likelyhood of object misses | | | -| **pack** | **6.too many (small) packs (i.e. due to pack-receive) reduce lookup performance** | 1. explode pack into loose objects (and deal with them separately) | can run in parallel (but is typically bound by max IOP/s) | might take a while if many objects are contained in the pack due to file IOP/s; needs recompresssion and looses delta compression; risk of too many small objects | | -| | | 2. combine multiple packs into one | keep all benefits of packs; very fast if pack-to-pack copy is used; can run in parallel (but is typically bound by max IOP/s) | combining with big packs takes has to write a lot of data; can be costly if pack delta compression is used | | -| **loose refs** | **7. readers observe in-between states of transactions** | 1. switch to ref-table database | | switches to shortcomings of ref-table | | -| **ref-table** | **8. 
contention of writers around unfair lock** | 1. implement process-internal queuing mechanism | minimal cost and high throughput with minimal, in-process synchronization | doesn't help with contention caused by multiple processes | | -| | | 2. use ref-table partitions | works for in-process and multi-process case, even though it might be slower than queues. | I can't find information about it anymore | | -| **loose objects** | **9. too many loose objects reduce overall performance** | 1. use packs | | needs scheduled or otherwise managed maintenance, and winning strategies depend on the size and business of the repository | | +| **Database** | **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** | +|-------------------|----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------| +| **pack** | **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide | +| | | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git | +| **pack** | **2. file limit hit** | 1. fail | read-only possible | | gitoxide | +| | | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | +| **pack** | **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) | +| | | 2. process-wide pooling of memory maps | share packs across multiple repositories instances | pack databases aren't self-contained anymore | | +| | | 3. map whole packs at once (instead of non-overlapping parts of them) | Only consume one file handle per pack (as opposed to one per region) | cannot handle packs larger than 4GB on 32bit systems | gitoxide | +| | | 4. Multi-pack index files | A single index can be used N packs, saving N-1 packs for N>1 packs | | | +| **pack** | **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide | +| | | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause huge amount of extra work if object misses are expected; does not handle problem 5. | libgit2 | +| **pack** | ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them | +| | | 2. retry more than one time | greatly reduced likelyhood of object misses | | | +| **pack** | **6.too many (small) packs (i.e. 
due to pack-receive) reduce lookup performance** | 1. explode pack into loose objects (and deal with them separately) | can run in parallel (but is typically bound by max IOP/s) | might take a while if many objects are contained in the pack due to file IOP/s; needs recompresssion and looses delta compression; risk of too many small objects | | +| | | 2. combine multiple packs into one | keep all benefits of packs; very fast if pack-to-pack copy is used; can run in parallel (but is typically bound by max IOP/s) | combining with big packs takes has to write a lot of data; can be costly if pack delta compression is used | | +| **loose refs** | **7. readers observe in-between states of transactions** | 1. switch to ref-table database | | switches to shortcomings of ref-table | | +| **ref-table** | **8. contention of writers around unfair lock** | 1. implement process-internal queuing mechanism | minimal cost and high throughput with minimal, in-process synchronization | doesn't help with contention caused by multiple processes | | +| | | 2. use ref-table partitions | works for in-process and multi-process case, even though it might be slower than queues. | I can't find information about it anymore | | +| **loose objects** | **9. too many loose objects reduce overall performance** | 1. use packs | | needs scheduled or otherwise managed maintenance, and winning strategies depend on the size and business of the repository | | +| **all** | **10. disk full/write failure/OOM** | 1. write temporary files first, with robust auto-removal, move into place when completed; partial transactions are robustly rolled back or stray files aren't discoverable or are valid on their own | | | gitoxide; git (not libgit2, it leaves partial packs on receive for example) | ### Amendum problem 5. @@ -359,9 +358,18 @@ and loose objects, and then removing the packs and loose objects they replace, t ## Use-Cases -We will look at typical access patterns holistically based on various use-cases, look at the problems they would run into and pick their preferred solution. +We will look at typical access patterns holistically based on various use-cases, look at the problems they would run into and pick their preferred solution that optimizes +for efficiency/performance/reduction of waste. -## Learnings +## Learnings and Actions Items + +Please note that these are based on the following value system: + +- We value _highly_ to scale object access performance with cores. +- We value _more_ to offer a choice of trade-offs than to aim for a one-size-fits-all solution, unless the latter has no shortcomings. + +- We don't value the handling of out of memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though. +- We don't value enabling 32 bit applications to deal with pack files greater than 4GB and leave this field entirely to the other implementations. 
### Loose references database From c4f1db00df89db6681235377ca8938db00b53b77 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sat, 20 Nov 2021 17:51:05 +0800 Subject: [PATCH 08/57] Analysis of repository changes in presence of caches and lack of atomicity (#259) --- DEVELOPMENT.md | 164 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 121 insertions(+), 43 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 8a233296c7b..387b133374a 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -173,6 +173,8 @@ echo c56a8e7aa92c86c41a923bc760d2dc39e8a31cf7 | git cat-file --batch | tail +2 Thus one has to post-process the file by reducing its size by one using `truncate -s -1 fixture`, **removing the newline byte**. + + # Discovery: data consistency and resource usage ## Summary @@ -307,69 +309,133 @@ was created yet, i.e. during `git add …`. `gitoxide` neither implements it nor it is used in concurrent environments, which is why we exclude it from this discovery. +## Known technical problems and their solutions + +Before looking at how changes affect data consistency and resource usage affects reliability, let's list all known technical issues thus far. + +Solutions aren't always mutually exclusive despite the form of presentation suggesting it. + +| **Database** | **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** | +|-------------------|----------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------| +| **pack** | **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide | +| | | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git | +| **pack** | **2. file limit hit** | 1. fail | read-only possible | | gitoxide | +| | | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | +| **pack** | **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) | +| | | 2. process-wide pooling of memory maps | share packs across multiple repositories instances | pack databases aren't self-contained anymore | | +| | | 3. map whole packs at once (instead of non-overlapping parts of them) | Only consume one file handle per pack (as opposed to one per region) | cannot handle packs larger than 4GB on 32bit systems | gitoxide | +| | | 4. 
Multi-pack index files | A single index can be used N packs, saving N-1 packs for N>1 packs | | | +| **pack** | **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide | +| | | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause huge amount of extra work if object misses are expected; does not handle problem 5. | libgit2 | +| | | 3. catch error, force a pack refresh, repeat | can work in conjunction with similar shortcomings of loose reference database | needs mutability, burden on the API user; | | +| | | 4. write new objects into loose db only; maintenance is controlled by this process to synchronize pack placement with refresh of packs in 'managed' repository instances | | high implementation complexity; assumes complete control of one process over git repository, excluding running git-maintenance; new readers aren't allowed for a while until the new pack is placed causing some moments of unresponsiveness/waiting | | +| **pack** | ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them | +| | | 2. retry more than one time | greatly reduced likelyhood of object misses | | | +| **pack** | **6.too many (small) packs (i.e. due to pack-receive) reduce lookup performance** | 1. explode pack into loose objects (and deal with them separately) | can run in parallel (but is typically bound by max IOP/s) | might take a while if many objects are contained in the pack due to file IOP/s; needs recompresssion and looses delta compression; risk of too many small objects | | +| | | 2. combine multiple packs into one | keep all benefits of packs; very fast if pack-to-pack copy is used; can run in parallel (but is typically bound by max IOP/s) | combining with big packs takes has to write a lot of data; can be costly if pack delta compression is used | | +| **loose refs** | **7. readers of multiple references observe in-between states of transactions which change a subset more than one of these references** | 1. switch to ref-table database | | switches to shortcomings of ref-table | | +| | | 2. repeat reference resolution until it's stable twice in a row | works without ref-table | cost for reference resolution at least doubles if more than one reference is needed; burden on API user | | +| **ref-table** | **8. contention of writers around unfair lock** | 1. implement process-internal queuing mechanism | minimal cost and high throughput with minimal, in-process synchronization | doesn't help with contention caused by multiple processes | | +| | | 2. use ref-table partitions | works for in-process and multi-process case, even though it might be slower than queues. | I can't find information about it anymore | | +| **loose objects** | **9. too many loose objects reduce overall performance** | 1. use packs | | needs scheduled or otherwise managed maintenance, and winning strategies depend on the size and business of the repository | | +| **all** | **10. disk full/write failure** | 1. write temporary files first, with robust auto-removal, move into place when completed; partial transactions are robustly rolled back or stray files aren't discoverable or are valid on their own | | | gitoxide; git (not libgit2, it leaves partial packs on receive for example) | + +### Amendum problem 5. 
+ +Refreshing packs if an object is missed is the current way of handling writes to the pack database. As outlined in +how [geometric repacking works](https://github.blog/2021-04-29-scaling-monorepo-maintenance/#geometric-repacking) it can indeed +happen that multiple packs are changed which isn't atomic. However, since this will be done in an additive fashion, first adding the new packs based on existing packs +and loose objects, and then removing the packs and loose objects they replace, there is no race happening as all objects stay reachable at all times. + ## Applying changes to repositories while maintaining consistency -A change to any file in a git repository has the potential to affect the process operating on it, hence it's important to perform them so these +A change to any file in a git repository has the potential to affect the processes operating on it, hence it's important to perform them so these observe only valid repository states. For the sake of simplicity, we assume atomicity for all operations, but these still have to be ordered correctly to assure a consistent state. -### Adding objects to the database +**Adding objects to the database** 1. add objects - * this is safe as these objects aren't reachable yet. - * object databases are append, only, so no existing state is ever affected. -2. adjust references - * point related references to the entry-points of the newly added objects (typically commits or tags) - * now the new objects can be discovered + * this is safe as these objects aren't reachable yet. + * object databases are append, only, so no existing state is ever affected. +2. adjust reference(s) + * point related references to the entry-points of the newly added objects (typically commits or tags) + * now the new objects can be discovered -## Known technical problems and their solutions +**Multiple writers** will run into conflicts with each other if the references they are handling overlap. +**Readers** will see the object graph from a snapshot of a set of references. -From what we know, changing repositories can have the following problems, many of which have decent solutions. -Solutions aren't always mutually exclusive despite the form of presentation suggesting it. +**Reorganize objects or references for higher performance** -| **Database** | **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** | -|-------------------|----------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------| -| **pack** | **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide | -| | | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git | -| **pack** | **2. file limit hit** | 1. 
fail | read-only possible | | gitoxide | -| | | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | -| **pack** | **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) | -| | | 2. process-wide pooling of memory maps | share packs across multiple repositories instances | pack databases aren't self-contained anymore | | -| | | 3. map whole packs at once (instead of non-overlapping parts of them) | Only consume one file handle per pack (as opposed to one per region) | cannot handle packs larger than 4GB on 32bit systems | gitoxide | -| | | 4. Multi-pack index files | A single index can be used N packs, saving N-1 packs for N>1 packs | | | -| **pack** | **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide | -| | | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause huge amount of extra work if object misses are expected; does not handle problem 5. | libgit2 | -| **pack** | ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them | -| | | 2. retry more than one time | greatly reduced likelyhood of object misses | | | -| **pack** | **6.too many (small) packs (i.e. due to pack-receive) reduce lookup performance** | 1. explode pack into loose objects (and deal with them separately) | can run in parallel (but is typically bound by max IOP/s) | might take a while if many objects are contained in the pack due to file IOP/s; needs recompresssion and looses delta compression; risk of too many small objects | | -| | | 2. combine multiple packs into one | keep all benefits of packs; very fast if pack-to-pack copy is used; can run in parallel (but is typically bound by max IOP/s) | combining with big packs takes has to write a lot of data; can be costly if pack delta compression is used | | -| **loose refs** | **7. readers observe in-between states of transactions** | 1. switch to ref-table database | | switches to shortcomings of ref-table | | -| **ref-table** | **8. contention of writers around unfair lock** | 1. implement process-internal queuing mechanism | minimal cost and high throughput with minimal, in-process synchronization | doesn't help with contention caused by multiple processes | | -| | | 2. use ref-table partitions | works for in-process and multi-process case, even though it might be slower than queues. | I can't find information about it anymore | | -| **loose objects** | **9. too many loose objects reduce overall performance** | 1. use packs | | needs scheduled or otherwise managed maintenance, and winning strategies depend on the size and business of the repository | | -| **all** | **10. disk full/write failure/OOM** | 1. write temporary files first, with robust auto-removal, move into place when completed; partial transactions are robustly rolled back or stray files aren't discoverable or are valid on their own | | | gitoxide; git (not libgit2, it leaves partial packs on receive for example) | +This happens regularly during maintenance operations, which don't alter the repository content but its structure. 
-### Amendum problem 5.
+1. find objects or references to include in the reorganization
+2. write the reorganized data
+3. remove the now obsolete data
-Refreshing packs if an object is missed is the current way of handling writes to the pack database. As outlined in
-how [geometric repacking works](https://github.blog/2021-04-29-scaling-monorepo-maintenance/#geometric-repacking) it can indeed
-happen that multiple packs are changed which isn't atomic. However, since this will be done in an additive fashion, first adding the new packs based on existing packs
-and loose objects, and then removing the packs and loose objects they replace, there is no race happening as all objects stay reachable at all times.
+Multiple of these operations must synchronize or else unpredictable outcomes may occur.
+**Writers** may add packs which remain untouched; the same applies to loose objects.
+**Readers** will see the object graph from a snapshot of a set of references.
+
+**Read references and find objects reachable through them**
+
+1. obtain object entry-points pointed to by references
+2. traverse object graph with entry points
+
+Multiple readers are always safe.
+
+It's worth repeating that as long as all of the numbered items are truly atomic, readers will always observe a consistent state in the presence of writers.
+
+### What breaks consistency
+
+#### Packs
+
+If all objects were loose, there would be no issue. However, for the above changes to work one must always be able to see all objects.
+This isn't the case when packs are involved, as accessing them is too costly to redo the entire pack initialization work each time an object is retrieved
+(e.g. list packs directory, memory map packs).
+
+Thus packs are cached in order to reuse them each time an object is accessed. Staleness is typically checked if an object isn't present, as it's assumed
+that all objects accessed are reachable through the object graph, which means they ought to be present, triggering a pack refresh and retrying the retrieval.
+
+This assumption isn't necessarily the case (_anymore_) due to git features causing partial object repositories:
+
+- depth-pruned clones (i.e. `git clone --depth X`) where all commits after a certain topological distance to the latest commit are left out.
+  Running into this case during traversal is cheap despite triggering a refresh to detect that the object is truly not present, as it runs a single time
+  while stopping an entire traversal.
+- partial clones with `git clone --filter=blob` which can be used to download entire histories but blobs only for a portion of the (usually recent) history.
+  Operations like `git diff`, without special precautions, may run into a lot of object misses causing costly refreshes each time.
+
+Any writer that eventually creates a pack may break readers that operate on a stale cache, making mitigation strategies like the one above mandatory.
+Such a mitigation, however, requires interior mutability as object access is a read-only operation, which in turn is costly in multi-threaded applications or
+for applications that don't need it, like CLIs.
+
+#### Loose References
+
+Writing loose references isn't actually atomic, so readers may observe some refs in an old and some in a new state. This isn't always a breaking issue like it is
+the case for packs; the program can still operate and is likely to produce correct (enough) outcomes.
+
+Mitigations are possible with careful programming on the API user's side or by using the `ref-table` database instead.
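To make mitigation 7.2 from the table concrete, here is a minimal sketch of "repeat reference resolution until it's stable twice in a row"; `RefStore` and `resolve()` are hypothetical stand-ins, not actual `gitoxide` API:

```rust
use std::collections::BTreeMap;

type ObjectId = [u8; 20]; // stand-in for a real hash type

/// Hypothetical loose-ref store: `resolve` returns the current target of a reference.
trait RefStore {
    fn resolve(&self, name: &str) -> Option<ObjectId>;
}

/// Re-read all `names` until two consecutive snapshots are identical, so a
/// transaction that moved a subset of them between reads cannot go unnoticed.
fn stable_snapshot<S: RefStore>(store: &S, names: &[&str]) -> BTreeMap<String, Option<ObjectId>> {
    let read_all = |store: &S| -> BTreeMap<String, Option<ObjectId>> {
        names
            .iter()
            .map(|name| (name.to_string(), store.resolve(name)))
            .collect()
    };
    let mut previous = read_all(store);
    loop {
        let current = read_all(store);
        if current == previous {
            return current; // stable twice in a row
        }
        previous = current; // a writer interfered, read again
    }
}
```

Note how the cost of reference resolution at least doubles even in the common case where no writer interferes, which is exactly the shortcoming listed in the table.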
+
+### What reduces reliability
+
+System resources are limited and avoiding waste (along with decent configuration of `ulimits` and maintenance of disk-space) is one of the ways to keep an application
+from failing.
+
+The biggest consumer of system resources most certainly is the pack database due to its cache of memory-mapped pack and index files, and avoiding duplication of these within
+a process is paramount.
 
 ## Use-Cases
 
 We will look at typical access patterns holistically based on various use-cases, look at the problems they would run into and pick their preferred solution that optimizes
 for efficiency/performance/reduction of waste.
 
-## Learnings and Action Items
+## Learnings
 
-Please note that these are based on the following value system:
+### Server Applications
 
-- We value _highly_ to scale object access performance with cores.
-- We value _more_ to offer a choice of trade-offs than to aim for a one-size-fits-all solution, unless the latter has no shortcomings.
-
-- We don't value the handling of out of memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though.
-- We don't value enabling 32 bit applications to deal with pack files greater than 4GB and leave this field entirely to the other implementations.
+- There is an off-chance that, on busy repositories, interactions which change multiple references with a single push are observable in fetches or clones, with part of the refs
+  pointing to the old value and a part pointing to the new one. Though unlikely, it might be that these refs are only valid together, so such a fetch would observe an invalid
+  state. Using a ref-table is the only known mitigation.
 
 ### Loose references database
@@ -381,3 +447,15 @@
 ### Git configuration
 
 - some application types might never use it, which is why it should be controllable how and what is loaded (when we actually implement it).
+
+## Action Items
+
+Please note that these are based on the following value system:
+
+- We value _highly_ to scale object access performance with cores.
+- We value _more_ to offer a choice of trade-offs than to aim for a one-size-fits-all solution, unless the latter has no shortcomings.
+
+- We don't value the handling of out of memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though.
+- We don't value enabling 32 bit applications to deal with pack files greater than 4GB and leave this field entirely to the other implementations.
+
+**TBD**

From 64947769d2c620893163b3defdbe187051e5db46 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sat, 20 Nov 2021 18:17:03 +0800
Subject: [PATCH 09/57] =?UTF-8?q?Add=20CLI=20example=E2=80=A6=20(#259)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…which definitely is the use case for the current implementation. It's clear
it would benefit from lazy-loading too though, even though it's not the most
important thing right now.
--- DEVELOPMENT.md | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 387b133374a..434fb2e80b9 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -317,7 +317,7 @@ Solutions aren't always mutually exclusive despite the form of presentation sugg | **Database** | **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** | |-------------------|----------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------| -| **pack** | **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read | gitoxide | +| **pack** | **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read, some applications might run out of system resources early even though they would not have needed all packs. | gitoxide | | | | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git | | **pack** | **2. file limit hit** | 1. fail | read-only possible | | gitoxide | | | | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | @@ -356,11 +356,11 @@ to assure a consistent state. **Adding objects to the database** 1. add objects - * this is safe as these objects aren't reachable yet. - * object databases are append, only, so no existing state is ever affected. + * this is safe as these objects aren't reachable yet. + * object databases are append, only, so no existing state is ever affected. 2. adjust reference(s) - * point related references to the entry-points of the newly added objects (typically commits or tags) - * now the new objects can be discovered + * point related references to the entry-points of the newly added objects (typically commits or tags) + * now the new objects can be discovered **Multiple writers** will run into conflicts with each other if the references they are handling overlap. **Readers** will see the object graph from a snapshot of a set of references. @@ -411,7 +411,7 @@ for applications that don't need it, like CLIs. #### Loose References -Writing loose references isn't actually atomic, so readers may observe some refs in an old and some in a new state. This isn't always a breaking issue like it is +Writing loose references isn't actually atomic, so readers may observe some references in an old and some in a new state. 
This isn't always a breaking issue like it is
 the case for packs; the program can still operate and is likely to produce correct (enough) outcomes.
 
 Mitigations are possible with careful programming on the API user's side or by using the `ref-table` database instead.
@@ -429,6 +429,26 @@
 We will look at typical access patterns holistically based on various use-cases, look at the problems they would run into and pick their preferred solution that optimizes
 for efficiency/performance/reduction of waste.
 
+### Multi-threaded CLI tool operating on filtered partial repository clone
+
+The tool displays the reference graph, shows differences and creates 'blame'-like views on files. As it's operated by the user, other writers are unlikely
+to be active while it is running.
+
+Near-perfect scaling with added CPU cores and great performance even in the biggest mono-repos are its hallmarks.
+
+As the program is written with missing objects being the default, it gracefully handles and expects such cases. While running in TUI mode, it offers a manual
+refresh to the user in case they fetched or pulled in the meantime, to refresh the screen and update its object database to make all newly added objects available.
+
+**Problems and Solutions**
+
+The program only deals with _1) initialization_ and _4) object misses_, where the latter are expected and handled gracefully. 1) is handled with solution 1., spending time to make
+all packs available to get the best and most consistent multi-threaded object access performance.
+
+**Drawbacks**
+
+The program could benefit from using 1.2 instead of 1.1, as the latter could cause exhaustion of file handles despite the user having no interest in evaluating all available objects,
+but ideally that is possible without losing performance during multi-threading.
 
 ## Learnings
 
 ### Server Applications

From ac8f5940e5e3de21c1669bda7f12a6f855e27cce Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sat, 20 Nov 2021 20:32:50 +0800
Subject: [PATCH 10/57] Add professional git hosting server use case (#259)

---
 DEVELOPMENT.md | 109 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 84 insertions(+), 25 deletions(-)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 434fb2e80b9..cd7b5382fff 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -315,30 +315,30 @@ Before looking at how changes affect data consistency and resource usage affects
 Solutions aren't always mutually exclusive despite the form of presentation suggesting it.
 
-| **Database** | **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** |
-|---|---|---|---|---|---|
-| **pack** | **1. initialization** | 1.
map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read, some applications might run out of system resources early even though they would not have needed all packs. | gitoxide | -| | | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git | -| **pack** | **2. file limit hit** | 1. fail | read-only possible | | gitoxide | -| | | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | -| **pack** | **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) | -| | | 2. process-wide pooling of memory maps | share packs across multiple repositories instances | pack databases aren't self-contained anymore | | -| | | 3. map whole packs at once (instead of non-overlapping parts of them) | Only consume one file handle per pack (as opposed to one per region) | cannot handle packs larger than 4GB on 32bit systems | gitoxide | -| | | 4. Multi-pack index files | A single index can be used N packs, saving N-1 packs for N>1 packs | | | -| **pack** | **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide | -| | | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause huge amount of extra work if object misses are expected; does not handle problem 5. | libgit2 | -| | | 3. catch error, force a pack refresh, repeat | can work in conjunction with similar shortcomings of loose reference database | needs mutability, burden on the API user; | | -| | | 4. write new objects into loose db only; maintenance is controlled by this process to synchronize pack placement with refresh of packs in 'managed' repository instances | | high implementation complexity; assumes complete control of one process over git repository, excluding running git-maintenance; new readers aren't allowed for a while until the new pack is placed causing some moments of unresponsiveness/waiting | | -| **pack** | ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them | -| | | 2. retry more than one time | greatly reduced likelyhood of object misses | | | -| **pack** | **6.too many (small) packs (i.e. due to pack-receive) reduce lookup performance** | 1. explode pack into loose objects (and deal with them separately) | can run in parallel (but is typically bound by max IOP/s) | might take a while if many objects are contained in the pack due to file IOP/s; needs recompresssion and looses delta compression; risk of too many small objects | | -| | | 2. combine multiple packs into one | keep all benefits of packs; very fast if pack-to-pack copy is used; can run in parallel (but is typically bound by max IOP/s) | combining with big packs takes has to write a lot of data; can be costly if pack delta compression is used | | -| **loose refs** | **7. 
readers of multiple references observe in-between states of transactions which change a subset more than one of these references** | 1. switch to ref-table database | | switches to shortcomings of ref-table | | -| | | 2. repeat reference resolution until it's stable twice in a row | works without ref-table | cost for reference resolution at least doubles if more than one reference is needed; burden on API user | | -| **ref-table** | **8. contention of writers around unfair lock** | 1. implement process-internal queuing mechanism | minimal cost and high throughput with minimal, in-process synchronization | doesn't help with contention caused by multiple processes | | -| | | 2. use ref-table partitions | works for in-process and multi-process case, even though it might be slower than queues. | I can't find information about it anymore | | -| **loose objects** | **9. too many loose objects reduce overall performance** | 1. use packs | | needs scheduled or otherwise managed maintenance, and winning strategies depend on the size and business of the repository | | -| **all** | **10. disk full/write failure** | 1. write temporary files first, with robust auto-removal, move into place when completed; partial transactions are robustly rolled back or stray files aren't discoverable or are valid on their own | | | gitoxide; git (not libgit2, it leaves partial packs on receive for example) | +| **Database** | **Problem** | **Solution** | **Benefits** | **shortcomings** | **Example Implementation** | +|-------------------|----------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------| +| **pack** | **1. initialization** | 1. map all packs at once | read-only possible; same latency for all objects | worst case init delay, highest resource usage, some packs may never be read, some applications might run out of system resources early even though they would not have needed all packs. | gitoxide | +| | | 2. map packs later on object access miss | nearly no init delay, no unnecessary work, resource usage as needed | needs mutability; first access of some objects may be slow | libgit2, git | +| **pack** | **2. file limit hit** | 1. fail | read-only possible | | gitoxide | +| | | 2. free resources and retry, then possibly fail | higher reliability | needs mutability | libgit2 (only on self-imposed limit) | +| **pack** | **3. file handle reduction/avoid hitting file limit** | 1. do not exceed internal handle count | some control over file handles | the entire application needs to respect count, needs sync with actual OS imposed limit, no sharing across multiple in-process pack databases | libgit2 (only within pack database) | +| | | 2. process-wide pooling of memory maps | share packs across multiple repositories instances | pack databases aren't self-contained anymore | | +| | | 3. 
map whole packs at once (instead of non-overlapping parts of them) | Only consume one file handle per pack (as opposed to one per region) | cannot handle packs larger than 4GB on 32bit systems | gitoxide | +| | | 4. Multi-pack index files | A single index can be used N packs, saving N-1 packs for N>1 packs | | | +| **pack** | **4. object miss** | 1. fail | fast if object misses are expected | incorrect or a burden in user code if miss is due to changed packs | gitoxide | +| | | 2. lazy-load more packs, retry, refresh known packs, retry, then fail | always correct even in the light of writers | can cause huge amount of extra work if object misses are expected; does not handle problem 5. | libgit2 | +| | | 3. catch error, force a pack refresh, repeat | can work in conjunction with similar shortcomings of loose reference database | needs mutability, burden on the API user; | | +| | | 4. writers force an update of the process-wide pool of packs after creating new packs and before updating references with the new objects | | high implementation complexity; assumes complete control of one process over git repository, excluding running git-maintenance; new readers aren't allowed for a while until the new pack is placed causing some moments of unresponsiveness/waiting | | +| **pack** | ~~5. race when creating/altering more than a pack at a time~~ | 1. ignore | | a chance for occasional object misses | all of them | +| | | 2. retry more than one time | greatly reduced likelyhood of object misses | | | +| **pack** | **6.too many (small) packs (i.e. due to pack-receive) reduce lookup performance** | 1. explode pack into loose objects (and deal with them separately) | can run in parallel (but is typically bound by max IOP/s) | might take a while if many objects are contained in the pack due to file IOP/s; needs recompresssion and looses delta compression; risk of too many small objects | | +| | | 2. combine multiple packs into one | keep all benefits of packs; very fast if pack-to-pack copy is used; can run in parallel (but is typically bound by max IOP/s) | combining with big packs takes has to write a lot of data; can be costly if pack delta compression is used | | +| **loose refs** | **7. readers of multiple references observe in-between states of transactions which change a subset more than one of these references** | 1. switch to ref-table database | | switches to shortcomings of ref-table | | +| | | 2. repeat reference resolution until it's stable twice in a row | works without ref-table | cost for reference resolution at least doubles if more than one reference is needed; burden on API user | | +| **ref-table** | **8. contention of writers around unfair lock** | 1. implement process-internal queuing mechanism | minimal cost and high throughput with minimal, in-process synchronization | doesn't help with contention caused by multiple processes | | +| | | 2. use ref-table partitions | works for in-process and multi-process case, even though it might be slower than queues. | I can't find information about it anymore | | +| **loose objects** | **9. too many loose objects reduce overall performance** | 1. use packs | | needs scheduled or otherwise managed maintenance, and winning strategies depend on the size and business of the repository | | +| **all** | **10. disk full/write failure** | 1. 
write temporary files first, with robust auto-removal, move into place when completed; partial transactions are robustly rolled back or stray files aren't discoverable or are valid on their own | | | gitoxide; git (not libgit2, it leaves partial packs on receive for example) |
 
 ### Amendum problem 5.
@@ -441,7 +441,7 @@ refresh to the user in case they fetched or pulled in the meantime, to refresh t
 
 **Problems and Solutions**
 
-The program only deals with _1) initialization_ and _4) object misses_, where the latter are expected and handled gracefully. 1) is handled with solution 1., spending time to make
+ * The program only deals with _1) initialization_ and _4) object misses_, where the latter are expected and handled gracefully. 1) is handled with solution 1., spending time to make
 all packs available to get the best and most consistent multi-threaded object access performance.
 
 **Drawbacks**
 
 The program could benefit from using 1.2 instead of 1.1, as the latter could cause exhaustion of file handles despite the user having no interest in evaluating all available objects,
 but ideally that is possible without losing performance during multi-threading.
 
+### Professional git-hosting mono-repo server with git-maintenance tasks and just-in-time replication
+
+A single server process handles all reads and writes concurrently and receives and sends using the git protocol using an async TCP/IO framework.
+All operations calling into `gitoxide` are unblocked using a thread pool. There is only one repository with a reference namespace for each client repository of which it
+contains 50.000, each averaging 5000 objects and 5 references for a total of 250.000.000 objects and 250.000 references. As many of these repositories have PNG files and
+binaries checked in, they average to about 25MB of storage space for a total of 1.25TB.
+
+At night there are about 1 send-pack (fetch) and 0.25 send-pack (clone) operations and 0.5 receive-packs per second, ramping up to 6 send-pack (fetch) and 1.5 send-pack (clone)
+and 3 receive-packs per second. At peak times during the day, these values are 10 times higher due to "push and ~~run~~ go home", causing 30 receive-packs per second for an hour or two.
+_(assumption is 50.000 * 5 pushes and 10 fetches and 2.5 clones)_. Most client-side push operations are sending just 1 or 2 commits with a couple of changes, averaging pack sizes
+of about 20 objects for a total of 5.000.000 new objects per day or an average of ~208.000 per hour. Only 1 branch is typically updated for 250.000 updates per day or ~10400 per hour.
+
+After each receive-pack was run and the client connection was informed of success, a background job is started to push the changes using the git protocol to a sibling server
+which also takes part in serving clients and also pushes its changes to this server (the writes caused by it are already part of the above measurements).
+
+`git-maintenance` runs every 5 minutes using the built-in scheduling mechanism to repack references, place loose objects into packs, create and update reverse index caches
+and reachability bitmaps and repacks existing packs geometrically. Every 24h it recompresses the newly created packs created during geometric repacking.
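A back-of-the-envelope check of these figures, with the stated assumptions spelled out (50.000 repositories, 5 pushes of ~20 objects and one branch update each per day); this is plain arithmetic, not `gitoxide` API:

```rust
// Verify the hourly load figures quoted above from the daily assumptions.
fn main() {
    let repos: u64 = 50_000;
    let pushes_per_day = repos * 5; // 250.000 receive-packs per day
    let objects_per_day = pushes_per_day * 20; // 5.000.000 new objects per day

    println!("receive-packs/hour: ~{}", pushes_per_day / 24); // ~10.416, i.e. ~3 per second
    println!("new objects/hour:   ~{}", objects_per_day / 24); // ~208.333
    println!("ref updates/hour:   ~{}", pushes_per_day / 24); // one branch per push, ~10.400
}
```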
+
+**Problems and Solutions**
+
+* **1.1** pack database - initialization → all at once
+  - as the server comes up, there is a moment to map all 27 to 28 packs and their indices, totalling up to 56 required file handles, along with maybe twice that many
+    additional packs in case of many pushes of new repositories happening within the `git-maintenance` time window.
+  - there is traffic at all times with some portions of all objects needed in the course of any 24h period
+* **2.1** pack database - file limit hit → fail
+  - The worst case scenario for the amount of required file handles can be estimated and overcommitted.
+  - Update the `ulimit` if there are not enough available handles for peak operations.
+* **3.2 & 3.3** pack database - file handle reduction/avoid hitting file limit → process-wide memory map pooling & mapping entire packs/indices at once
+  - Note that multipack indices are related to maximizing object-count performance in conjunction with bitmaps, which is a separate topic, but in short: yes, want that, too :)
+* **4.3 & 4.4** pack database - object miss → catch error, refresh packs, retry & writers force update shared memory map pool (see the sketch after this list)
+  - we read references at well known times and assure their objects exist right away. If not, a forced (but efficient) pack refresh should make the new objects available.
+  - For readers this mostly happens if external processes change the pack database, like git-maintenance, as writers keep the view on memory mapped packs fresh (efficiently)
+    by informing about the exact changes to avoid directory scanning.
+* **6.1 & 6.2** pack database - reduced object lookup performance/too many small packs - explode small packs to loose objects & keep packs with more than 10.000 objects
+  - The trick is to strike the balance to keeping the amount of small packs low and to try not to have too many loose objects due to file system performance limitations with
+    directories containing a lot of files. Within the maintenance window, both can be balanced against each other.
+* **7.1 or 7.2** loose reference db - readers of multiple references observe in-between states of transactions […] → switch to ref-table or read references at least twice
+  - With about 3 ref writes per second ref-table should not suffer from lock-contention, and should offer better overall performance.
+  - Otherwise, _7.2_ seems doable as well even though using the loose ref database definitely is a nightmare for reference deletions.
+* **8** ref-table - contention of writers around unfair lock - deal-breaking contention would be unexpected
+  - With the luxury of having a ref-table at all it would be even more of a luxury to use more advanced features to increase write performance. It's all a little bit too unknown
+    right now.
+* **9** loose object database - too many loose objects reduce overall performance - fixed with git-maintenance
+  - See drawbacks though
+* **10** - disk full
+  - It will be interesting to think of solutions involving the introduction of a bigger machine and migration of all repositories to there before the disk space runs out.
+    Otherwise we believe that managing disk space is part of operation and not the server process itself.
+* **10** - write failure - fail connection
+  - write failures aren't specifically handled but result in typical Rust error behaviour probably alongside error reporting on the respective channels of the git-transport sideband.
+  - `gitoxide` is made to clean up on failure and leave nothing behind that could accumulate.
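As announced in the 4.3 & 4.4 item, a minimal sketch of the retry half of that approach, assuming a hypothetical `Odb` trait whose `refresh_packs()` hides interior mutability; this is not the actual `gitoxide` API:

```rust
/// Placeholder for decoded object data.
struct Object;

/// Hypothetical object database handle; `refresh_packs()` would rely on interior
/// mutability, as object access is otherwise a read-only operation.
trait Odb {
    fn find_object(&self, id: &[u8; 20]) -> Option<Object>;
    /// Rescan the pack directory, mapping added packs and dropping removed ones.
    fn refresh_packs(&self);
}

/// Look up `id`, forcing a single pack refresh if the first attempt misses,
/// e.g. because `git-maintenance` replaced packs since the last refresh.
fn find_with_refresh(db: &impl Odb, id: &[u8; 20]) -> Option<Object> {
    db.find_object(id).or_else(|| {
        db.refresh_packs();
        db.find_object(id) // a second miss now means the object truly doesn't exist
    })
}
```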
+
+**Drawbacks**
+
+* running `git-maintenance` every 5 minutes during off-hours seems like overkill and it seems better to use a built-in repacking scheduler that is based on actual load.
+
+### Self-hosted git server with front-end and zero-conf
+
+**TBD**
 
 ## Learnings
 
 ### Server Applications

From 55773b87feb246927a7421a822c45d9e101e985e Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sun, 21 Nov 2021 10:31:48 +0800
Subject: [PATCH 11/57] Describe and propose fix for ref namespace-sharing
 issue (#259)

---
 DEVELOPMENT.md                            | 13 ++++++++++++-
 git-repository/src/easy/ext/reference.rs |  2 ++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index cd7b5382fff..37603428cc7 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -339,6 +339,7 @@ Solutions aren't always mutually exclusive despite the form of presentation sugg
 | | | 2. use ref-table partitions | works for in-process and multi-process case, even though it might be slower than queues. | I can't find information about it anymore | |
 | **loose objects** | **9. too many loose objects reduce overall performance** | 1. use packs | | needs scheduled or otherwise managed maintenance, and winning strategies depend on the size and busyness of the repository | |
 | **all** | **10. disk full/write failure** | 1. write temporary files first, with robust auto-removal, move into place when completed; partial transactions are robustly rolled back or stray files aren't discoverable or are valid on their own | | | gitoxide; git (not libgit2, it leaves partial packs on receive for example) |
+| **loose refs** | **11. namespace is stored in database instance, so different `Easy` handles share it** | 1. Have one loose ref database per state (optionally) | | A default must be chosen, and either choice might surprise some users; the preference depends entirely on the use case, but a shared namespace seems like an unsafe default. | |
 
 ### Amendum problem 5.
@@ -499,6 +500,9 @@ and reachability bitmaps and repacks existing packs geometrically. Every 24h it
 * **10** - write failure - fail connection
   - write failures aren't specifically handled but result in typical Rust error behaviour probably alongside error reporting on the respective channels of the git-transport sideband.
   - `gitoxide` is made to clean up on failure and leave nothing behind that could accumulate.
+* **11** - loose ref database - namespace isn't per connection
+  - This needs fixing in `gitoxide` to probably be unshared by default. Namespaces are most useful on the server, which would use an `EasyArcExclusive` per connection.
+    Sharing ref namespaces would be surprising and wrong.
 
 **Drawbacks**
 
 * running `git-maintenance` every 5 minutes during off-hours seems like overkill and it seems better to use a built-in repacking scheduler that is based on actual load.
@@ -537,4 +541,11 @@ Please note that these are based on the following value system:
 - We value _highly_ to scale object access performance with cores.
 - We value _more_ to offer a choice of trade-offs than to aim for a one-size-fits-all solution, unless the latter has no shortcomings.
 
 - We don't value the handling of out of memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though.
 - We don't value enabling 32 bit applications to deal with pack files greater than 4GB and leave this field entirely to the other implementations.
 
-**TBD**
+1. **per `Easy…` ref namespace**
+   - As loose ref databases are cheap, one should live on each state by default, and if these namespaces were cloned with the `Easy…` it would be straightforward to propagate
+     this configuration.
+   - There is still the open question of how this should work when `ref-table` appears on the scene with its multi-file database, which can most certainly benefit from shared memory maps
+     similarly to pack databases, thus sharing it on the `repo()`. Maybe it would not contain the namespace but pass it as a parameter every time, which for consistency would be
+     best ported to the loose ref database as well. That way, there would be one API governing both, unifying sharing on the `repo()`.
+   - Ref databases could have methods like `find_in_namespace()` along with the current ones, whereas the current ones delegate to the ones with namespace which may be `None`,
+     to accommodate for the fact that most won't use namespaces.

diff --git a/git-repository/src/easy/ext/reference.rs b/git-repository/src/easy/ext/reference.rs
index 68bc95d5905..c3efdeb5c1c 100644
--- a/git-repository/src/easy/ext/reference.rs
+++ b/git-repository/src/easy/ext/reference.rs
@@ -71,6 +71,8 @@ pub trait ReferenceAccessExt: easy::Access + Sized {
     }
 
     /// Set the reference namespace to the given value, like `"foo"` or `"foo/bar"`.
+    ///
+    /// Note that this value is shared across all `Easy…` instances as the value is stored in the shared `Repository`.
     fn set_namespace<'a, Name, E>(
         &mut self,
         namespace: Name,

From ca109c1a3adb513a6df037e68ab4e755d9b9fe36 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sun, 21 Nov 2021 11:32:59 +0800
Subject: [PATCH 12/57] A use-case for an intranet scale repository server
 (without problems/solutions) (#259)

---
 DEVELOPMENT.md | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 37603428cc7..7a6e9b80b74 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -452,7 +452,7 @@ for efficiency/performance/reduction of waste.
 
 ### Professional git-hosting mono-repo server with git-maintenance tasks and just-in-time replication
 
-A single server process handles all reads and writes concurrently and receives and sends using the git protocol using an async TCP/IO framework.
+A single server process handles all reads and writes concurrently and receives and sends using the git protocol over an async TCP/IO framework with a custom transport.
 All operations calling into `gitoxide` are unblocked using a thread pool. There is only one repository with a reference namespace for each client repository of which it
 contains 50.000, each averaging 5000 objects and 5 references for a total of 250.000.000 objects and 250.000 references. As many of these repositories have PNG files and
 binaries checked in, they average to about 25MB of storage space for a total of 1.25TB.
@@ -510,7 +510,24 @@ and reachability bitmaps and repacks existing packs geometrically. Every 24h it
 
 ### Self-hosted git server with front-end and zero-conf
 
-**TBD**
+This server is typically used by teams and runs within the intranet. Teams love it because it's easy to set up and usually 'just works' thanks to a local sqlite database
+and a directory to store repositories in.
+
+It provides a front-end which displays repository data and allows team-members to create issues and comment on each other's merge requests, among other things. This browser
+application uses websockets to keep a connection to the server through which data can be retrieved using a simple request-response protocol. After some time of inactivity
+it automatically disconnects, but is automatically revived if the browser tab is activated again.
+
+The implementation prides itself on showing each commit message affecting the files currently displayed without caching them in the database due to its clever use of
+multithreading, offloading segments of the history to all threads available for processing, sending the results back as they come in and stopping the processing once all
+files and directories are annotated. It uses a single `Repository` instance per thread which changes as the client browses to different repositories, and expects all
+objects to exist even in presence of pushes happening in the meantime. It checks for the latter by regularly polling if the commit of the current
+branch changed compared to the previous time it checked.
+
+Clients can push and fetch to the server via SSH and HTTP(S) transports. The SSH transport is implemented with a helper program that ultimately
+calls `git receive-pack` and `git upload-pack`. When HTTP(S) is used, the server program handles the connection itself, using one thread per connection and
+opens up a new `Repository` for each of them. Multi-threading is used to build packs when sending or resolve them when receiving.
+
+The default favors speed and using all available cores, but savvy users can run it with `--threads 1` to only ever use a single thread for processing.

From 2028f58f3a5365268d4a99e32a6fa71025efe95b Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sun, 21 Nov 2021 14:03:39 +0800
Subject: [PATCH 13/57] Finish zero-conf easy-maintenance server problem
 statement (#259)

---
 DEVELOPMENT.md | 47 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 7a6e9b80b74..3d5a1d805a2 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -333,11 +333,13 @@ Solutions aren't always mutually exclusive despite the form of presentation sugg
 | | | 2. retry more than one time | greatly reduced likelihood of object misses | | |
 | **pack** | **6. too many (small) packs (i.e. due to pack-receive) reduce lookup performance** | 1. explode pack into loose objects (and deal with them separately) | can run in parallel (but is typically bound by max IOP/s) | might take a while if many objects are contained in the pack due to file IOP/s; needs recompression and loses delta compression; risk of too many small objects | |
 | | | 2. combine multiple packs into one | keep all benefits of packs; very fast if pack-to-pack copy is used; can run in parallel (but is typically bound by max IOP/s) | combining with big packs has to write a lot of data; can be costly if pack delta compression is used | |
+| | | 3. Just-in-time maintenance after writes | tuned to run at just the right time and do just as much work as needed | an implementation isn't trivial as there must only be one maintenance operation per repository at a time, so some queue should be made available to not skip maintenance just because one is running already. | |
 | **loose refs** | **7. readers of multiple references observe in-between states of transactions which change more than one of these references** | 1. switch to ref-table database | | switches to shortcomings of ref-table | |
 | | | 2. repeat reference resolution until it's stable twice in a row | works without ref-table | cost for reference resolution at least doubles if more than one reference is needed; burden on API user | |
 | **ref-table** | **8. contention of writers around unfair lock** | 1. implement process-internal queuing mechanism | minimal cost and high throughput with minimal, in-process synchronization | doesn't help with contention caused by multiple processes | |
 | | | 2. use ref-table partitions | works for in-process and multi-process case, even though it might be slower than queues. | I can't find information about it anymore | |
 | **loose objects** | **9. too many loose objects reduce overall performance** | 1. use packs | | needs scheduled or otherwise managed maintenance, and winning strategies depend on the size and busyness of the repository | |
+| | | 2. Just-in-time maintenance after writes | tuned to run at just the right time and do just as much work as needed | an implementation isn't trivial as there must only be one maintenance operation per repository at a time, so some queue should be made available to not skip maintenance just because one is running already. | |
 | **all** | **10. disk full/write failure** | 1. write temporary files first, with robust auto-removal, move into place when completed; partial transactions are robustly rolled back or stray files aren't discoverable or are valid on their own | | | gitoxide; git (not libgit2, it leaves partial packs on receive for example) |
 | **loose refs** | **11. namespace is stored in database instance, so different `Easy` handles share it** | 1. Have one loose ref database per state (optionally) | | A default must be chosen, and either choice might surprise some users; the preference depends entirely on the use case, but a shared namespace seems like an unsafe default. | |
 
 ### Amendum problem 5.
@@ -486,7 +488,7 @@ and reachability bitmaps and repacks existing packs geometrically. Every 24h it
 * **6.1 & 6.2** pack database - reduced object lookup performance/too many small packs - explode small packs to loose objects & keep packs with more than 10.000 objects
   - The trick is to strike the balance to keeping the amount of small packs low and to try not to have too many loose objects due to file system performance limitations with
     directories containing a lot of files. Within the maintenance window, both can be balanced against each other.
-* **7.1 or 7.2** loose reference db - readers of multiple references observe in-between states of transactions […] → switch to ref-table or read references at least twice
+* **7.1 or 7.2** loose reference database - readers of multiple references observe in-between states of transactions […] → switch to ref-table or read references at least twice
   - With about 3 ref writes per second ref-table should not suffer from lock-contention, and should offer better overall performance.
   - Otherwise, _7.2_ seems doable as well even though using the loose ref database definitely is a nightmare for reference deletions.
 * **8** ref-table - contention of writers around unfair lock - deal-breaking contention would be unexpected
@@ -508,14 +510,15 @@ and reachability bitmaps and repacks existing packs geometrically. Every 24h it
 
-### Self-hosted git server with front-end and zero-conf
+### Self-hosted git server with front-end and zero-conf and auto-maintenance
 
 This server is typically used by teams and runs within the intranet. Teams love it because it's easy to set up and usually 'just works' thanks to a local sqlite database
-and a directory to store repositories in.
+and a directory to store repositories in. Some teams have it running for 10 years without issues.
It provides a front-end which displays repository data and allows team-members to create issues and comment on each other's merge requests, among other things. This
browser application uses websockets to keep a connection to the server through which data can be retrieved using a simple request-response protocol. After some time of inactivity
it automatically disconnects, but is automatically revived if the browser tab is activated again. There are prominent warnings if disk space is low, along with suggestions
for a fix.

The implementation prides itself on showing each commit message affecting the files currently displayed without caching them in the database due to its clever use of
multithreading, offloading segments of the history to all threads available for processing, sending the results back as they come in and stopping the processing once all
files and directories are annotated. It uses a single `Repository` instance per thread which changes as the client browses to different repositories, and expects all
objects to exist even in the presence of pushes happening in the meantime. It checks for the latter by regularly polling whether the commit of the current
branch changed compared to the previous time it checked.

Clients can push and fetch to the server via SSH and HTTP(S) transports. The SSH transport is implemented with a helper program that ultimately
calls `git receive-pack` and `git upload-pack`. When HTTP(S) is used, the server program handles the connection itself, using one thread per connection and
opens up a new `Repository` for each of them. Multi-threading is used to build packs when sending or resolve them when receiving. After writing a pack the server
will schedule a background maintenance process to keep repositories fast and small.

The default favors speed and using all available cores, but savvy users can run it with `--threads 1` to only ever use a single thread for processing.

+**Problems and Solutions**
+
+* **1.2** pack database - map packs later on object access miss
+  - The server generally creates one `Repository` instance per connection which won't know the objects to access in advance. Minimizing initialization times is paramount.
+  - The code is willing to differentiate between single- and multi-threaded mode when both are sufficiently different, but otherwise uses multi-threading compatible types
+    even if using only one thread. As it doesn't expect to use too many cores at once this seems acceptable.
+* **2.1** pack database - file limit hit → fail
+  - Since `Repository` instances aren't shared across connections, there is no way to free files. The system relies heavily on lazy-loading of pack data to use system
+    resources only when needed.
+  - Update the `ulimit` if there are not enough available handles for peak operations.
+* **3.2 & 3.3** pack database - file handle reduction/avoid hitting file limit → process-wide memory map pooling & mapping entire packs/indices at once
+  - Note that 3.2 is very interesting as a way to deduplicate memory maps of multiple connections to the same repository. It should be fine to do without such an optimization though
+    and just increase the limit for file handles.
+* **4.2** pack database - object miss → lazy-load next pack, retry, repeat until there is no more pack to load
+  - client code doesn't want to know about internal optimizations and thus prefers lazy-loading. It's notable that non-existent objects will force loading all packs that way,
+    but that isn't expected on a server that definitely holds the entire object database. 
+
+* **6.1 & 6.2 & 6.3** pack database - reduced object lookup performance/too many small packs - explode small packs to loose objects & keep packs with more than 10.000 objects & just-in-time maintenance
+  - Even in low-traffic servers it's important to maintain them to avoid running into unavailability.
+* **7.1** loose reference database - readers of multiple references observe in-between states of transactions […] → switch to ref-table
+  - concurrency issues are less likely on a low-traffic server with people mostly pushing a single branch at a time. However, switching to ref-table solves a potential
+    issue and should be chosen if there is no other reason to use a loose reference database
+* **8** ref-table - contention of writers around unfair lock - n/a
+  - not an issue as there isn't enough traffic here
+* **9.2** loose object database - too many loose objects reduce overall performance - just-in-time maintenance
+* **10** - disk full - display early warnings in the front-end to every user to get it fixed
+  - This solution is implemented on the application side (and not in `gitoxide`), it's interesting enough to mention though for systems that operate themselves.
+  - One could also imagine that it tries to spend the nights aggressively compressing repositories, some low-hanging fruit there.
+* **10** - write failure - fail connection
+  - write failures aren't specifically handled but result in typical Rust error behaviour, probably alongside error reporting on the respective channels of the git-transport sideband.
+  - `gitoxide` is made to clean up on failure and leave nothing behind that could accumulate.
+* **11** - loose ref database - namespace isn't per connection
+  - Needs fixing in `gitoxide`.
+
 ## Learnings

From c042072833bdf780ffea0ba5c3a5b1c84427c4ef Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Sun, 21 Nov 2021 15:34:18 +0800
Subject: [PATCH 14/57] First thoughts on how to tackle the pack problem (#259)

Even though I definitely don't like all that complexity :/
---
 DEVELOPMENT.md | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 3d5a1d805a2..ea8bd07b00c 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -603,3 +603,22 @@ Please note that these are based on the following value system:
   best ported to the loose ref database as well. That way, there would be one API governing both, unifying sharing on the `repo()`.
   - Ref databases could have methods like `find_in_namespace()` along with the current ones, whereas the current ones delegate to the ones with namespace which may be
     `None`, to accommodate for the fact that most won't use namespaces.
+
+2. **Parameterize some sort of Policy into linked::ODB/compound::ODB**
+   - First off, this needs an experiment to try it out quickly.
+   - **thoughts**
+     - Depending on the actual implementation of `Policy`, `Repository/Easy` will or will not be thread-safe. This excludes using a `Box<…>` there as it has different
+       trait bounds (once with and once without `Send + Sync`. I would like to avoid more feature toggles in `git-repository`, but could live with it.
+     - `Repository` would end up with type parameters if feature toggles aren't used, which could be compensated for with typedefs for the few known policies. However, this
+       would also lead in a type-explosion for `Easy` and may force it to have a type parameter too. 
+ - To keep the `Repository` free of type parameters we could boil policies down to typical policies, like Eager, Lazy, LazyThreadSafe, PooledLazy, PooledLazyThreadSafe, + all with different tradeoffs. On that level, maybe 3 to 5 feature toggles would work, but who likes feature toggles especially if they aren't additive? + - `contains(oid)` is not actually exposed in any trait and not used much in `git` either, even though it is optimized for by loading pack data only on demand. We, however, + use `git_pack::Bundle` as smallest unit, which is a mapped index _and_ data file, thus forcing more work to be done in some cases. There is only one multi-pack index + per repository, but that would force all packs to be loaded if it was implemented similarly, but that shows that Bundle's probably aren't the right abstraction or + have to make their pack data optional. If that happens, we definitely need some sort of policy to make this work. Definitely put `contains(oid)` into the `Find` trait + or a separate trait to enforce us dealing with this and keep multi-pack indices in mind. + - Some operations rely on pack-ids and or indices into a vector to find a pack, and this must either be stable or stable for long enough especially in the presence + of refreshes. + - Can we be sure that there won't be another type parameter in `Repository` for the refs database? If yes, we basically say that `ref-table` will work read-only or + hides its interior mutability behind RwLocks. It's probably going to be the latter as it should be fast enough, but it's sad there is inevitably some loss :/. From 63dd57b0dde9461e659d0a0a2de3f30e24214e24 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 21 Nov 2021 17:56:46 +0800 Subject: [PATCH 15/57] Some more thoughts, it _could_ even work (#259) --- DEVELOPMENT.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index ea8bd07b00c..523bd4672fd 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -606,7 +606,7 @@ Please note that these are based on the following value system: 2. **Parameterize some sort of Policy into linked::ODB/compound::ODB** - First off, this needs an experiment to try it out quickly. - - **thoughts** + - **initial thoughts** - Depending on the actual implementation of `Policy`, `Repository/Easy` will or will not be thread-safe. This excludes using a `Box<…>` there as it has different trait bounds (once with and once without `Send + Sync`. I would like to avoid more feature toggles in `git-repository`, but could live with it. - `Repository` would end up with type parameters if feature toggles aren't used, which could be compensated for with typedefs for the few known policies. However, this @@ -622,3 +622,10 @@ Please note that these are based on the following value system: of refreshes. - Can we be sure that there won't be another type parameter in `Repository` for the refs database? If yes, we basically say that `ref-table` will work read-only or hides its interior mutability behind RwLocks. It's probably going to be the latter as it should be fast enough, but it's sad there is inevitably some loss :/. + - There is also the idea of 'views' which provide an owned set of bundles to iterate over so that pack access doesn't have to go through a lock most of the time + unless there is the need for a refresh. This means bundles are not owned by the compound::Store anymore, but rather their container is. 
+ - There should be a way to gradually build up that container, so that one says: get next pack while we look for an object, otherwise the first refresh would + map all files right away. Ideally it's index by index for `contains()` and index + data of a bundle at a time for `find()`. + - This also means that if `contains()` is even a possibility, that on each call one will have to refresh the `view`. This might mean we want to split out that functionality + into their own traits and rather hand people an object which is created after the `view` was configured - i.e. after calls to `contains()` one has to set the + view to also contain packs. Ideally that happens on demand though… right now indices and packs are quite coupled so maybe this has to go away. From 22caa6c84cd8a7619c479359ea4d1adcab6d3654 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 21 Nov 2021 18:48:36 +0800 Subject: [PATCH 16/57] Clarification of thoughts, onto something here (#259) --- DEVELOPMENT.md | 36 ++++++++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 523bd4672fd..746f0f3161f 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -607,25 +607,45 @@ Please note that these are based on the following value system: 2. **Parameterize some sort of Policy into linked::ODB/compound::ODB** - First off, this needs an experiment to try it out quickly. - **initial thoughts** - - Depending on the actual implementation of `Policy`, `Repository/Easy` will or will not be thread-safe. This excludes using a `Box<…>` there as it has different + - ✔️ Depending on the actual implementation of `Policy`, `Repository/Easy` will or will not be thread-safe. This excludes using a `Box<…>` there as it has different trait bounds (once with and once without `Send + Sync`. I would like to avoid more feature toggles in `git-repository`, but could live with it. - - `Repository` would end up with type parameters if feature toggles aren't used, which could be compensated for with typedefs for the few known policies. However, this - would also lead in a type-explosion for `Easy` and may force it to have a type parameter too. - - To keep the `Repository` free of type parameters we could boil policies down to typical policies, like Eager, Lazy, LazyThreadSafe, PooledLazy, PooledLazyThreadSafe, + - `Repository` would end up with type parameters if feature toggles aren't used, which could be compensated for with typedefs for the few known policies. However, this + would also lead in a type-explosion for `Easy` and may force it to have a type parameter too. + - ❌ To keep the `Repository` free of type parameters we could boil policies down to typical policies, like Eager, Lazy, LazyThreadSafe, PooledLazy, PooledLazyThreadSafe, all with different tradeoffs. On that level, maybe 3 to 5 feature toggles would work, but who likes feature toggles especially if they aren't additive? - - `contains(oid)` is not actually exposed in any trait and not used much in `git` either, even though it is optimized for by loading pack data only on demand. We, however, + - ✔️ `contains(oid)` is not actually exposed in any trait and not used much in `git` either, even though it is optimized for by loading pack data only on demand. We, however, use `git_pack::Bundle` as smallest unit, which is a mapped index _and_ data file, thus forcing more work to be done in some cases. 
There is only one multi-pack index per repository, but that would force all packs to be loaded if it was implemented similarly, but that shows that Bundle's probably aren't the right abstraction or have to make their pack data optional. If that happens, we definitely need some sort of policy to make this work. Definitely put `contains(oid)` into the `Find` trait or a separate trait to enforce us dealing with this and keep multi-pack indices in mind. - - Some operations rely on pack-ids and or indices into a vector to find a pack, and this must either be stable or stable for long enough especially in the presence + - ✔️ Some operations rely on pack-ids and or indices into a vector to find a pack, and this must either be stable or stable for long enough especially in the presence of refreshes. - - Can we be sure that there won't be another type parameter in `Repository` for the refs database? If yes, we basically say that `ref-table` will work read-only or + - Make sure pack-ids are always incrementing, which is how it's currently implemented, too (more or less, it always restarts from 0 which should be fine but why risk it). + - ✔️ Can we be sure that there won't be another type parameter in `Repository` for the refs database? If yes, we basically say that `ref-table` will work read-only or hides its interior mutability behind RwLocks. It's probably going to be the latter as it should be fast enough, but it's sad there is inevitably some loss :/. - - There is also the idea of 'views' which provide an owned set of bundles to iterate over so that pack access doesn't have to go through a lock most of the time + - Technically it has a similar problem as the pack database, except that it's not append only which tips the scale towards interior mutability or `find(…)` + implementations that are `&mut self`. However, that would exclude sharing of the ref-db, which probably is acceptable given that its usually just used to kick off + multi-threaded computations. We don't even want to it hide its mutability as it's probably a catastrophe if multiple threads tried to read from it. So it would have + to be behind a good old RWLock and maybe that's something we can live with for the Multi-RefDB that works for either loose refs or ref-table. A single lock to + make the ref-table version sync. Unknown how that would relate to supporting parallel writing some time, but let's just say that _we don't think that another type + parameter will be necessary for this one_, ever. It might even be an option to have one per `Easy::state` if it's reasonably fast, so yes, let's not bother with that now. + - ❓ There is also the idea of 'views' which provide an owned set of bundles to iterate over so that pack access doesn't have to go through a lock most of the time unless there is the need for a refresh. This means bundles are not owned by the compound::Store anymore, but rather their container is. - There should be a way to gradually build up that container, so that one says: get next pack while we look for an object, otherwise the first refresh would map all files right away. Ideally it's index by index for `contains()` and index + data of a bundle at a time for `find()`. - This also means that if `contains()` is even a possibility, that on each call one will have to refresh the `view`. This might mean we want to split out that functionality into their own traits and rather hand people an object which is created after the `view` was configured - i.e. 
after calls to `contains()` one has to set the view to also contain packs. Ideally that happens on demand though… right now indices and packs are quite coupled so maybe this has to go away. + - If there are views, these really should be per-thread because only then we can turn RwLocks/Mutexes into RefCells for mutation of the internal view, which is then + also specific to the access pattern of the actual reader _and_ will be discarded when done (as opposed to a shared view in the Repository which lives much longer). + Or in other words, `Policy` implementation could optionally be thread-safe, whereas the acutal object repo is not, but the policy could be shared then if behind `Borrow`. + - **in the clear** + - `Repository` and `Easy…` are parameterized over the `pack::Policy` + - There should be type-defs for typical setups + - Algorithmically, object access starts out with `Indices`, loading one at a time, and once the right pack is found, the pack data is loaded (if needed) to possibly extract + object data. + - `pack::Bundle` won't be used within the ODB anymore as it doesn't allow such separation and won't work well with multi pack indices. + - a `Policy` could implement the building blocks needed by that algorithm. + - The `Policy` should go through `Deref` to + - Having `views` would complicate the generics tremendously and it's probably enough to handle the containment without generics entirely. Maybe there could be an optional + code path that's only really useful for eagerly loaded policies that can provide vectors of things (i.e. their internal storage). From 2e62eb808f7669ec2522addc5d43466992a8fde1 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 21 Nov 2021 19:33:30 +0800 Subject: [PATCH 17/57] More clarification around Policies, which makes `Repository` alone unusable (#259) --- DEVELOPMENT.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 746f0f3161f..560c2a5dc77 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -638,7 +638,8 @@ Please note that these are based on the following value system: view to also contain packs. Ideally that happens on demand though… right now indices and packs are quite coupled so maybe this has to go away. - If there are views, these really should be per-thread because only then we can turn RwLocks/Mutexes into RefCells for mutation of the internal view, which is then also specific to the access pattern of the actual reader _and_ will be discarded when done (as opposed to a shared view in the Repository which lives much longer). - Or in other words, `Policy` implementation could optionally be thread-safe, whereas the acutal object repo is not, but the policy could be shared then if behind `Borrow`. + Or in other words, `Policy` implementation could optionally be thread-safe, whereas the actual object repo is not, but the policy could be shared then if behind `Borrow`. + Doing this would also mean that the Repository doesn't even have a repository anymore, but just a `pack::Policy`. - **in the clear** - `Repository` and `Easy…` are parameterized over the `pack::Policy` - There should be type-defs for typical setups @@ -646,6 +647,7 @@ Please note that these are based on the following value system: object data. - `pack::Bundle` won't be used within the ODB anymore as it doesn't allow such separation and won't work well with multi pack indices. - a `Policy` could implement the building blocks needed by that algorithm. 
- - The `Policy` should go through `Deref` to + - The `Policy` should go through `Deref` to allow for different ways of internal shared ownership of actual indices, but that would also mean multiple implementations + would either duplicate code or forward to even more generic implementations. - Having `views` would complicate the generics tremendously and it's probably enough to handle the containment without generics entirely. Maybe there could be an optional code path that's only really useful for eagerly loaded policies that can provide vectors of things (i.e. their internal storage). From b7eaf747eba703e0d7dc6a65687ca72ae7e2ec5b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 21 Nov 2021 20:32:55 +0800 Subject: [PATCH 18/57] Some more ideas that will modify what `Repository` is fundamentally (#259) Because it really is a storage location for shared data that doesn't do anything on its own. --- DEVELOPMENT.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 560c2a5dc77..309f58e25c4 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -629,7 +629,7 @@ Please note that these are based on the following value system: to be behind a good old RWLock and maybe that's something we can live with for the Multi-RefDB that works for either loose refs or ref-table. A single lock to make the ref-table version sync. Unknown how that would relate to supporting parallel writing some time, but let's just say that _we don't think that another type parameter will be necessary for this one_, ever. It might even be an option to have one per `Easy::state` if it's reasonably fast, so yes, let's not bother with that now. - - ❓ There is also the idea of 'views' which provide an owned set of bundles to iterate over so that pack access doesn't have to go through a lock most of the time + - ✔️ There is also the idea of 'views' which provide an owned set of bundles to iterate over so that pack access doesn't have to go through a lock most of the time unless there is the need for a refresh. This means bundles are not owned by the compound::Store anymore, but rather their container is. - There should be a way to gradually build up that container, so that one says: get next pack while we look for an object, otherwise the first refresh would map all files right away. Ideally it's index by index for `contains()` and index + data of a bundle at a time for `find()`. @@ -641,13 +641,17 @@ Please note that these are based on the following value system: Or in other words, `Policy` implementation could optionally be thread-safe, whereas the actual object repo is not, but the policy could be shared then if behind `Borrow`. Doing this would also mean that the Repository doesn't even have a repository anymore, but just a `pack::Policy`. - **in the clear** - - `Repository` and `Easy…` are parameterized over the `pack::Policy` - - There should be type-defs for typical setups - Algorithmically, object access starts out with `Indices`, loading one at a time, and once the right pack is found, the pack data is loaded (if needed) to possibly extract object data. - `pack::Bundle` won't be used within the ODB anymore as it doesn't allow such separation and won't work well with multi pack indices. - a `Policy` could implement the building blocks needed by that algorithm. 
- The `Policy` should go through `Deref` to allow for different ways of internal shared ownership of actual indices, but that would also mean multiple implementations would either duplicate code or forward to even more generic implementations. - - Having `views` would complicate the generics tremendously and it's probably enough to handle the containment without generics entirely. Maybe there could be an optional - code path that's only really useful for eagerly loaded policies that can provide vectors of things (i.e. their internal storage). + - `Policy` might need to be implemented for `&T` and `Box|Rc|Arc` where `T: Policy` as well, let's see. + - `Views` work if they are placed in the state and are thread-local for that reason, with interior mutability. A `view` will just be the linked odb implementation itself. + - It should contain a borrowed `Policy` which is owned in the shared `Repository`. The latter should contains a list of paths to object databases (i.e. alternates) to + allow seeing all multi-pack indices and indices like it is one repository. + - `Repository` turns into `RepositoryLocal` with a `Rc` that isn't `Sync` and adds a `Repository` type that does the same but with `Arc`. + - each of these repository types has their own `Easy` types. + - _Difficulties_: Some `Platform` types store `repo: Access::RepoRef` and use `repo.deref().odb`, but that now only has a `Policy` from which a new `State` should be created + or they store the State right away… . From dad4bf981801716b59b1060b865b38e08c4948b9 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 22 Nov 2021 07:40:24 +0800 Subject: [PATCH 19/57] even more notes about implementation details (#259) But the best of it is that auto-refreshing Policies should be the new default. --- DEVELOPMENT.md | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 309f58e25c4..70e034c42cf 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -568,15 +568,9 @@ The default favors speed and using all available cores, but savvy users can run ## Learnings -### Server Applications - -- There is an off-chance that on busy repositories interactions that change multiple references with a single push are observable in fetches, clones with part of the refs - pointing to the old value, and a part pointing to the new one. Despite unlikely it might be that these refs are only valid together so such fetch would observe an invalid - state. Using a ref-table is the only known mitigation. - ### Loose references database -- When deleting (with or without value constraint), wait for locks instead of failing to workaround `packed-refs` as chocking point. It's certainly worth it to split transactions +- When deleting (with or without value constraint), wait for locks instead of failing to workaround `packed-refs` as choking point. It's certainly worth it to split transactions so that deletions are done separately from updates to allow usage of the most suitable locking strategies. - When adding/updating references, prefer to fail immediately as the chance for the same change being made concurrently is low, and failure would occur after waiting for the lock due to constraint mismatch. @@ -629,25 +623,32 @@ Please note that these are based on the following value system: to be behind a good old RWLock and maybe that's something we can live with for the Multi-RefDB that works for either loose refs or ref-table. A single lock to make the ref-table version sync. 
Unknown how that would relate to supporting parallel writing some time, but let's just say that _we don't think that another type
        parameter will be necessary for this one_, ever. It might even be an option to have one per `Easy::state` if it's reasonably fast, so yes, let's not bother with that now.
+      - The ref-table DB probably must be `&mut` for finding references and then be placed behind an Rc or Arc respectively. The `Repository` will just be
+        a shared storage container and maybe there will be a better name for it.
     - ✔️ There is also the idea of 'views' which provide an owned set of bundles to iterate over so that pack access doesn't have to go through a lock most of the time
       unless there is the need for a refresh. This means bundles are not owned by the compound::Store anymore, but rather their container is.
       - There should be a way to gradually build up that container, so that one says: get next pack while we look for an object, otherwise the first refresh would
         map all files right away. Ideally it's index by index for `contains()` and index + data of a bundle at a time for `find()`.
-      - This also means that if `contains()` is even a possibility, that on each call one will have to refresh the `view`. This might mean we want to split out that functionality
+      - ❌ This also means that if `contains()` is even a possibility, that on each call one will have to refresh the `view`. This might mean we want to split out that functionality
         into their own traits and rather hand people an object which is created after the `view` was configured - i.e. after calls to `contains()` one has to set the
         view to also contain packs. Ideally that happens on demand though… right now indices and packs are quite coupled so maybe this has to go away.
       - If there are views, these really should be per-thread because only then we can turn RwLocks/Mutexes into RefCells for mutation of the internal view, which is then
         also specific to the access pattern of the actual reader _and_ will be discarded when done (as opposed to a shared view in the Repository which lives much longer).
         Or in other words, `Policy` implementation could optionally be thread-safe, whereas the actual object repo is not, but the policy could be shared then if behind `Borrow`.
         Doing this would also mean that the Repository doesn't even have a repository anymore, but just a `pack::Policy`.
+  - It's still unclear how to best control repeated refreshes in the presence of the possibility of blobs missing due to `--filter=blob`. Maybe this is the moment where one would
+    access the policy directly to turn off refreshes for the duration of the operation.
   - **in the clear**
-    - Algorithmically, object access starts out with `Indices`, loading one at a time, and once the right pack is found, the pack data is loaded (if needed) to possibly extract
-      object data.
+    - Algorithmically, object access starts out with `Indices`, fetching more until the object is contained within, then the corresponding pack data is loaded (if needed)
+      to possibly extract object data.
+    - The result of a contains check would have to be a pack id to allow looking it up in its own cache and then ask it to be loaded/returned by the `Policy`, along with
+      maybe the indices required to fetch it from the pack, maybe just a `bundle::Location` of sorts. It could also be a struct to encode the pack-id and the index within it.
     - `pack::Bundle` won't be used within the ODB anymore as it doesn't allow such separation and won't work well with multi pack indices. 
- a `Policy` could implement the building blocks needed by that algorithm. - The `Policy` should go through `Deref` to allow for different ways of internal shared ownership of actual indices, but that would also mean multiple implementations - would either duplicate code or forward to even more generic implementations. + would either duplicate code or forward to even more generic implementations. - `Policy` might need to be implemented for `&T` and `Box|Rc|Arc` where `T: Policy` as well, let's see. + - Of course, the `Policy` must be object safe - `Views` work if they are placed in the state and are thread-local for that reason, with interior mutability. A `view` will just be the linked odb implementation itself. - It should contain a borrowed `Policy` which is owned in the shared `Repository`. The latter should contains a list of paths to object databases (i.e. alternates) to allow seeing all multi-pack indices and indices like it is one repository. @@ -655,3 +656,6 @@ Please note that these are based on the following value system: - each of these repository types has their own `Easy` types. - _Difficulties_: Some `Platform` types store `repo: Access::RepoRef` and use `repo.deref().odb`, but that now only has a `Policy` from which a new `State` should be created or they store the State right away… . + - The default `Policy` should be the least surprising, hence incur mapping costs over time and automatically refresh itself if needed. + - Make sure auto-refresh can be turned off on policy level to allow users to probe for objects despite `--filter=blob` or similar without slowing down to a crawl due to + a refresh each time an object is missing. From 761c63a3cd8303a1272da3f1f90735e2c14cd34a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Sun, 21 Nov 2021 19:53:04 +0800 Subject: [PATCH 20/57] add experiment for type system (#259) --- Cargo.lock | 4 ++++ Cargo.toml | 1 + experiments/odb-redesign/Cargo.toml | 8 ++++++++ experiments/odb-redesign/src/lib.rs | 8 ++++++++ 4 files changed, 21 insertions(+) create mode 100644 experiments/odb-redesign/Cargo.toml create mode 100644 experiments/odb-redesign/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 7e4829568e5..154dc0b2ad8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1836,6 +1836,10 @@ dependencies = [ "rayon", ] +[[package]] +name = "odb-redesign" +version = "0.1.0" + [[package]] name = "once_cell" version = "1.8.0" diff --git a/Cargo.toml b/Cargo.toml index 29abdc9c983..fb6fb7840d5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -132,6 +132,7 @@ members = [ "experiments/object-access", "experiments/diffing", "experiments/traversal", + "experiments/odb-redesign", "cargo-smart-release", diff --git a/experiments/odb-redesign/Cargo.toml b/experiments/odb-redesign/Cargo.toml new file mode 100644 index 00000000000..47df98245e6 --- /dev/null +++ b/experiments/odb-redesign/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "odb-redesign" +version = "0.1.0" +edition = "2021" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs new file mode 100644 index 00000000000..1b4a90c9383 --- /dev/null +++ b/experiments/odb-redesign/src/lib.rs @@ -0,0 +1,8 @@ +#[cfg(test)] +mod tests { + #[test] + fn it_works() { + let result = 2 + 2; + assert_eq!(result, 4); + } +} From cf17816640ce2418624de04c3979a2c03a1cc917 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 22 Nov 2021 09:00:05 +0800 Subject: [PATCH 
21/57] A first stab at sketching types (#259) Already showing some issues but I think it can be done smartly nonetheless. --- Cargo.lock | 4 ++ experiments/odb-redesign/Cargo.toml | 2 + experiments/odb-redesign/src/lib.rs | 57 ++++++++++++++++++++++++++--- 3 files changed, 57 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 154dc0b2ad8..67234d9962b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1839,6 +1839,10 @@ dependencies = [ [[package]] name = "odb-redesign" version = "0.1.0" +dependencies = [ + "git-pack", + "parking_lot", +] [[package]] name = "once_cell" diff --git a/experiments/odb-redesign/Cargo.toml b/experiments/odb-redesign/Cargo.toml index 47df98245e6..488acefbfdd 100644 --- a/experiments/odb-redesign/Cargo.toml +++ b/experiments/odb-redesign/Cargo.toml @@ -6,3 +6,5 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +git-pack = { path = "../../git-pack", version = "*" } +parking_lot = { version = "0.11.0", default-features = false } diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 1b4a90c9383..5833b2928e8 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -1,8 +1,53 @@ -#[cfg(test)] -mod tests { - #[test] - fn it_works() { - let result = 2 + 2; - assert_eq!(result, 4); +#![allow(dead_code)] + +use std::cell::RefCell; +use std::ops::Deref; +use std::rc::Rc; +use std::sync::Arc; + +pub mod policy { + pub struct IndexMarker(u32); + + pub mod next_indices { + use crate::policy::IndexMarker; + + pub enum Outcome { + Next { + indices: Vec, // should probably be small vec to get around most allocations + mark: IndexMarker, // use to show where you left off next time you call + }, + /// No new indices to look at, caller should stop give up + NoMoreIndices, + } } } + +trait Policy { + type PackData: Deref; + type PackIndex: Deref; + + fn next_indices(&mut self) -> std::io::Result>; +} + +struct EagerLocal {} + +type DynRcPolicy = dyn Policy, PackData = Rc>; +type DynArcPolicy = dyn Policy, PackData = Arc>; + +/// Formerly RepositoryLocal +struct SharedLocal { + pack_policy: Rc, PackData = Rc>>>, +} + +/// Formerly Repository +struct Shared { + pack_policy: Arc>, +} + +/// Using generics here would mean we need policy to handle its mutability itself, pushing it down might be easiest if generics +/// should be a thing. +/// Without generics, there would be a thread-safe and thread-local version of everything. +/// Maybe this should be solved with a feature toggle instead? Aka thread-safe or not? +struct SharedGeneric { + pack_policy: PackPolicy, +} From 882dde9790d2f0d9bae653c73f74789fcb401e39 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 22 Nov 2021 09:14:31 +0800 Subject: [PATCH 22/57] FAIL: try to make thread-safety togglable in just git-features (#259) However, dyn-traits can't be combined with non-auto traits which prevents this. Maybe a macro would do though. 
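For the record, the failure described above can be reproduced in a few lines, since Rust only permits auto traits as additional bounds on a trait object. A minimal illustration with made-up names, where `ThreadSafe` stands in for any non-auto marker trait:

    trait Policy {}

    // A regular (non-auto) marker trait.
    trait ThreadSafe: Send + Sync {}

    // error[E0225]: only auto traits can be used as additional
    // traits in a trait object
    // type DynPolicy = dyn Policy + ThreadSafe;

    // Auto traits are fine, which is why spelling out the bounds works:
    type DynPolicySync = dyn Policy + Send + Sync;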
--- experiments/odb-redesign/src/lib.rs | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 5833b2928e8..8aa6e3dd8c6 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -31,8 +31,35 @@ trait Policy { struct EagerLocal {} +mod thread_safe { + use std::sync::Arc; + + pub(crate) trait ThreadSafe: Send + Sync {} + + pub fn into_shared(v: T) -> Arc { + Arc::new(v) + } + pub fn add_interior_mutability(v: T) -> parking_lot::Mutex { + parking_lot::Mutex::new(v) + } +} + +mod thread_local { + use std::rc::Rc; + + trait ThreadSafe: Send + Sync {} + + pub fn into_shared(v: T) -> Rc { + Rc::new(v) + } + pub fn add_interior_mutability(v: T) -> parking_lot::Mutex { + parking_lot::Mutex::new(v) + } +} + type DynRcPolicy = dyn Policy, PackData = Rc>; -type DynArcPolicy = dyn Policy, PackData = Arc>; +type DynArcPolicy = + dyn Policy, PackData = Arc> + thread_safe::ThreadSafe; /// Formerly RepositoryLocal struct SharedLocal { From 3ce7f30a980c6eb1715558e8ca7e2b10e93aba73 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 22 Nov 2021 10:45:44 +0800 Subject: [PATCH 23/57] =?UTF-8?q?Sketch=20types=20across=20git-features,?= =?UTF-8?q?=20git-odb=20and=20git-repository=E2=80=A6=20(#259)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The goal is to make thread-safety toggalble with features to get type complexity down to just a single type in git-repository without needing more feature toggles in parent crates. This should work actually, this is an intermediate commit. Something that will change is the settings of Easy, as there probably will only be an Easy and EasyShared, don't know how the types can be filled in based on a feature toggle though. Maybe it can work with more typedefs. --- DEVELOPMENT.md | 3 + experiments/odb-redesign/Cargo.toml | 4 + experiments/odb-redesign/src/lib.rs | 179 +++++++++++++++++++--------- 3 files changed, 127 insertions(+), 59 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 70e034c42cf..f0642cec94a 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -659,3 +659,6 @@ Please note that these are based on the following value system: - The default `Policy` should be the least surprising, hence incur mapping costs over time and automatically refresh itself if needed. - Make sure auto-refresh can be turned off on policy level to allow users to probe for objects despite `--filter=blob` or similar without slowing down to a crawl due to a refresh each time an object is missing. + - The way this is going, `Deref` can panic in single-threaded applications only if recursion is used. Thread-safe versions of this will hang + unless a reentrant mutex is used. Since we don't call ourselves recursively (i.e. during `find_object(…)`, this won't be an issue. It should also be impossible in single-threaded + mode even with multiple `Easy` instances. 
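As a small aside on the `Deref`/reentrancy note just added above: a `RefCell` behaves like a non-reentrant lock, so the single-threaded hazard can be shown in isolation. A standalone illustration, not part of the experiment:

    use std::cell::RefCell;

    fn main() {
        let state = RefCell::new(vec![1, 2, 3]);
        let first = state.borrow_mut();
        // A nested/recursive call that borrows again while `first` is alive
        // panics at runtime with a BorrowMutError, the RefCell analogue of
        // the deadlock a non-reentrant Mutex would produce:
        // let second = state.borrow_mut();
        drop(first);
        // Re-borrowing after the first borrow ended is fine:
        let _second = state.borrow_mut();
    }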
diff --git a/experiments/odb-redesign/Cargo.toml b/experiments/odb-redesign/Cargo.toml index 488acefbfdd..f07a3b12a7f 100644 --- a/experiments/odb-redesign/Cargo.toml +++ b/experiments/odb-redesign/Cargo.toml @@ -5,6 +5,10 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html +[features] +default = ["thread-safe"] +thread-safe = [] + [dependencies] git-pack = { path = "../../git-pack", version = "*" } parking_lot = { version = "0.11.0", default-features = false } diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 8aa6e3dd8c6..b22c44620a8 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -1,80 +1,141 @@ -#![allow(dead_code)] - -use std::cell::RefCell; -use std::ops::Deref; -use std::rc::Rc; -use std::sync::Arc; - -pub mod policy { - pub struct IndexMarker(u32); - - pub mod next_indices { - use crate::policy::IndexMarker; - - pub enum Outcome { - Next { - indices: Vec, // should probably be small vec to get around most allocations - mark: IndexMarker, // use to show where you left off next time you call - }, - /// No new indices to look at, caller should stop give up - NoMoreIndices, +#![allow(dead_code, unused_variables)] + +mod features { + pub mod threaded { + use std::sync::Arc; + + #[cfg(feature = "thread-safe")] + #[macro_export] + macro_rules! marker_traits { + ($target:ident, $trait:tt) => { + pub trait $target: $trait + Send + Sync {} + }; + } + + pub type OwnShared = Arc; + pub type Mutable = parking_lot::Mutex; + + pub fn into_shared(v: T) -> OwnShared { + Arc::new(v) + } + pub fn with_interior_mutability(v: T) -> Mutable { + parking_lot::Mutex::new(v) } } -} -trait Policy { - type PackData: Deref; - type PackIndex: Deref; + pub(crate) mod local { + use std::cell::RefCell; + use std::rc::Rc; + + pub type OwnShared = Rc; + pub type Mutable = RefCell; + + #[cfg(not(feature = "thread-safe"))] + #[macro_export] + macro_rules! 
marker_traits { + ($target:ident, $trait:ty) => { + type $target = dyn ($trait); + }; + } + + pub fn into_shared(v: T) -> Rc { + Rc::new(v) + } + pub fn with_interior_mutability(v: T) -> Mutable { + RefCell::new(v) + } + } - fn next_indices(&mut self) -> std::io::Result>; + #[cfg(not(feature = "thread-safe"))] + pub use local::*; + #[cfg(feature = "thread-safe")] + pub use threaded::*; } -struct EagerLocal {} +mod odb { + use crate::features; + use std::ops::Deref; -mod thread_safe { - use std::sync::Arc; + pub mod pack { + pub struct IndexMarker(u32); - pub(crate) trait ThreadSafe: Send + Sync {} + pub mod next_indices { + use crate::odb::pack::IndexMarker; - pub fn into_shared(v: T) -> Arc { - Arc::new(v) - } - pub fn add_interior_mutability(v: T) -> parking_lot::Mutex { - parking_lot::Mutex::new(v) + pub enum Outcome { + Next { + indices: Vec, // should probably be small vec to get around most allocations + mark: IndexMarker, // use to show where you left off next time you call + }, + /// No new indices to look at, caller should stop give up + NoMoreIndices, + } + } } -} -mod thread_local { - use std::rc::Rc; + pub trait Policy { + type PackData: Deref; + type PackIndex: Deref; - trait ThreadSafe: Send + Sync {} + fn next_indices(&mut self) -> std::io::Result>; + } - pub fn into_shared(v: T) -> Rc { - Rc::new(v) + #[derive(Default)] + pub struct EagerLocal { + bundles: features::local::Mutable>>, } - pub fn add_interior_mutability(v: T) -> parking_lot::Mutex { - parking_lot::Mutex::new(v) + + impl Policy for EagerLocal { + type PackData = features::local::OwnShared; + type PackIndex = features::local::OwnShared; + + fn next_indices(&mut self) -> std::io::Result> { + todo!() + } } -} -type DynRcPolicy = dyn Policy, PackData = Rc>; -type DynArcPolicy = - dyn Policy, PackData = Arc> + thread_safe::ThreadSafe; + fn try_setup() { + let policy = EagerLocal::default(); + // let policy = Box::new(EagerLocal::default()) + // as Box< + // dyn DynPolicy< + // PackIndex = features::OwnShared, + // PackData = features::OwnShared, + // >, + // >; + } -/// Formerly RepositoryLocal -struct SharedLocal { - pack_policy: Rc, PackData = Rc>>>, + crate::marker_traits!(DynPolicy, Policy); } -/// Formerly Repository -struct Shared { - pack_policy: Arc>, -} +mod repository { + // type DynPolicy = dyn Policy< + // PackIndex = threading::OwnShared, + // PackData = threading::OwnShared, + // > + Send + // + Sync; + + use crate::{features, odb}; + // We probably don't need to use a macro like that as we have a feature toggle in Repository, or do we? + // We need it, as otherwise there is no way to instantiate the correct version of the policy, or is there? + // Should that be delegated to the caller, but if so that would lock them in to a choice and need custom code + // depending on a feature toggle that they should only switch on or off. + // crate::marker_traits!(DynPolicy, Policy); + + struct Repository { + pack_policy: features::OwnShared< + dyn odb::DynPolicy< + PackIndex = features::OwnShared, + PackData = features::OwnShared, + >, + >, + } -/// Using generics here would mean we need policy to handle its mutability itself, pushing it down might be easiest if generics -/// should be a thing. -/// Without generics, there would be a thread-safe and thread-local version of everything. -/// Maybe this should be solved with a feature toggle instead? Aka thread-safe or not? 
-struct SharedGeneric { - pack_policy: PackPolicy, + // /// Using generics here would mean we need policy to handle its mutability itself, pushing it down might be easiest if generics + // /// should be a thing. + // /// Without generics, there would be a thread-safe and thread-local version of everything. + // /// Maybe this should be solved with a feature toggle instead? Aka thread-safe or not? + // struct RepositoryGeneric { + // pack_policy: PackPolicy, + // } } From 1e861485608c3cf8ec74f2f67c4b8ded77481200 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 22 Nov 2021 11:21:23 +0800 Subject: [PATCH 24/57] =?UTF-8?q?A=20few=20more=20steps=20towards=20implem?= =?UTF-8?q?entation=E2=80=A6=20(#259)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …but it shows that ideally we just export the correct Policy type (i.e. one with Send and Sync requirements as needed) to avoid having to do some conversion acrobatics when we want to use it later. --- Cargo.lock | 7 ++-- experiments/odb-redesign/Cargo.toml | 3 ++ experiments/odb-redesign/src/lib.rs | 53 +++++++++++++++++++++-------- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 67234d9962b..e0ba3758586 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,9 +30,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.45" +version = "1.0.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee10e43ae4a853c0a3591d4e2ada1719e553be18199d9da9d4a83f5927c2f5c7" +checksum = "38d9ff5d688f1c13395289f67db01d4826b46dd694e7580accdc3e8430f2d98e" [[package]] name = "argh" @@ -1840,8 +1840,11 @@ dependencies = [ name = "odb-redesign" version = "0.1.0" dependencies = [ + "anyhow", + "git-odb", "git-pack", "parking_lot", + "thiserror", ] [[package]] diff --git a/experiments/odb-redesign/Cargo.toml b/experiments/odb-redesign/Cargo.toml index f07a3b12a7f..55d7520be37 100644 --- a/experiments/odb-redesign/Cargo.toml +++ b/experiments/odb-redesign/Cargo.toml @@ -11,4 +11,7 @@ thread-safe = [] [dependencies] git-pack = { path = "../../git-pack", version = "*" } +git-odb = { path = "../../git-odb", version = "*" } parking_lot = { version = "0.11.0", default-features = false } +thiserror = "1.0.30" +anyhow = "1.0.47" diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index b22c44620a8..ff7850e62d6 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -1,7 +1,7 @@ #![allow(dead_code, unused_variables)] mod features { - pub mod threaded { + mod threaded { use std::sync::Arc; #[cfg(feature = "thread-safe")] @@ -23,7 +23,7 @@ mod features { } } - pub(crate) mod local { + mod local { use std::cell::RefCell; use std::rc::Rc; @@ -33,8 +33,8 @@ mod features { #[cfg(not(feature = "thread-safe"))] #[macro_export] macro_rules! 
marker_traits { - ($target:ident, $trait:ty) => { - type $target = dyn ($trait); + ($target:ident, $trait:tt) => { + pub trait $target: $trait {} }; } @@ -54,7 +54,9 @@ mod features { mod odb { use crate::features; + use std::borrow::BorrowMut; use std::ops::Deref; + use std::path::PathBuf; pub mod pack { pub struct IndexMarker(u32); @@ -77,25 +79,47 @@ mod odb { type PackData: Deref; type PackIndex: Deref; - fn next_indices(&mut self) -> std::io::Result>; + fn next_indices( + &mut self, + marker: Option, + ) -> std::io::Result>; + } + + pub struct Eager { + db_paths: Vec, + bundles: features::Mutable>>, } - #[derive(Default)] - pub struct EagerLocal { - bundles: features::local::Mutable>>, + #[derive(Debug, thiserror::Error)] + pub enum Error { + #[error(transparent)] + AlternateResolve(#[from] git_odb::alternate::Error), + } + + impl Eager { + pub fn at(objects_directory: impl Into) -> Result { + Ok(Eager { + db_paths: git_odb::alternate::resolve(objects_directory)?, + bundles: features::Mutable::new(Vec::new()), + }) + } } - impl Policy for EagerLocal { - type PackData = features::local::OwnShared; - type PackIndex = features::local::OwnShared; + impl Policy for Eager { + type PackData = features::OwnShared; + type PackIndex = features::OwnShared; - fn next_indices(&mut self) -> std::io::Result> { + fn next_indices( + &mut self, + marker: Option, + ) -> std::io::Result> { + let bundles = self.bundles.borrow_mut(); todo!() } } - fn try_setup() { - let policy = EagerLocal::default(); + fn try_setup() -> anyhow::Result<()> { + let policy = Eager::at(".git/objects")?; // let policy = Box::new(EagerLocal::default()) // as Box< // dyn DynPolicy< @@ -103,6 +127,7 @@ mod odb { // PackData = features::OwnShared, // >, // >; + Ok(()) } crate::marker_traits!(DynPolicy, Policy); From e6ae34f9984e98318da0d81616d425ab481e3d25 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 22 Nov 2021 14:43:12 +0800 Subject: [PATCH 25/57] use generic Repository type with typedef default to avoid macro-madness (#259) --- DEVELOPMENT.md | 3 +- experiments/odb-redesign/src/lib.rs | 59 +++++++---------------------- 2 files changed, 15 insertions(+), 47 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index f0642cec94a..b7c1198edc2 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -647,8 +647,6 @@ Please note that these are based on the following value system: - a `Policy` could implement the building blocks needed by that algorithm. - The `Policy` should go through `Deref` to allow for different ways of internal shared ownership of actual indices, but that would also mean multiple implementations would either duplicate code or forward to even more generic implementations. - - `Policy` might need to be implemented for `&T` and `Box|Rc|Arc` where `T: Policy` as well, let's see. - - Of course, the `Policy` must be object safe - `Views` work if they are placed in the state and are thread-local for that reason, with interior mutability. A `view` will just be the linked odb implementation itself. - It should contain a borrowed `Policy` which is owned in the shared `Repository`. The latter should contains a list of paths to object databases (i.e. alternates) to allow seeing all multi-pack indices and indices like it is one repository. @@ -662,3 +660,4 @@ Please note that these are based on the following value system: - The way this is going, `Deref` can panic in single-threaded applications only if recursion is used. Thread-safe versions of this will hang unless a reentrant mutex is used. 
Since we don't call ourselves recursively (i.e. during `find_object(…)`, this won't be an issue. It should also be impossible in single-threaded mode even with multiple `Easy` instances. + - memory map pooling across repositories can work if the odb informs about the entry-point path when asking for more indices to check diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index ff7850e62d6..5a5cbd03d54 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -4,14 +4,6 @@ mod features { mod threaded { use std::sync::Arc; - #[cfg(feature = "thread-safe")] - #[macro_export] - macro_rules! marker_traits { - ($target:ident, $trait:tt) => { - pub trait $target: $trait + Send + Sync {} - }; - } - pub type OwnShared = Arc; pub type Mutable = parking_lot::Mutex; @@ -30,14 +22,6 @@ mod features { pub type OwnShared = Rc; pub type Mutable = RefCell; - #[cfg(not(feature = "thread-safe"))] - #[macro_export] - macro_rules! marker_traits { - ($target:ident, $trait:tt) => { - pub trait $target: $trait {} - }; - } - pub fn into_shared(v: T) -> Rc { Rc::new(v) } @@ -129,38 +113,23 @@ mod odb { // >; Ok(()) } - - crate::marker_traits!(DynPolicy, Policy); } mod repository { - // type DynPolicy = dyn Policy< - // PackIndex = threading::OwnShared, - // PackData = threading::OwnShared, - // > + Send - // + Sync; - - use crate::{features, odb}; - // We probably don't need to use a macro like that as we have a feature toggle in Repository, or do we? - // We need it, as otherwise there is no way to instantiate the correct version of the policy, or is there? - // Should that be delegated to the caller, but if so that would lock them in to a choice and need custom code - // depending on a feature toggle that they should only switch on or off. - // crate::marker_traits!(DynPolicy, Policy); - - struct Repository { - pack_policy: features::OwnShared< - dyn odb::DynPolicy< - PackIndex = features::OwnShared, - PackData = features::OwnShared, - >, - >, + use crate::odb; + + pub mod raw { + use crate::odb; + + /// Using generics here would mean we need policy to handle its mutability itself, pushing it down might be easiest if generics + /// should be a thing. + /// Without generics, there would be a thread-safe and thread-local version of everything. + /// Maybe this should be solved with a feature toggle instead? Aka thread-safe or not? + pub struct RepositoryGeneric { + pack_policy: PackPolicy, + } } - // /// Using generics here would mean we need policy to handle its mutability itself, pushing it down might be easiest if generics - // /// should be a thing. - // /// Without generics, there would be a thread-safe and thread-local version of everything. - // /// Maybe this should be solved with a feature toggle instead? Aka thread-safe or not? 
- // struct RepositoryGeneric { - // pack_policy: PackPolicy, - // } + /// Exposed type top-level repository to hide generic complexity, with one-size-fits-most default + type Repository = raw::RepositoryGeneric; } From e7793116a5f51887ff5a7dc4651cf907f9a63719 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 22 Nov 2021 15:10:22 +0800 Subject: [PATCH 26/57] Now it's getting somewhere, with the potential for resource pooling (#259) --- DEVELOPMENT.md | 1 + experiments/odb-redesign/src/lib.rs | 71 +++++++++++++++++------------ 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index b7c1198edc2..07e03484a6d 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -661,3 +661,4 @@ Please note that these are based on the following value system: unless a reentrant mutex is used. Since we don't call ourselves recursively (i.e. during `find_object(…)`, this won't be an issue. It should also be impossible in single-threaded mode even with multiple `Easy` instances. - memory map pooling across repositories can work if the odb informs about the entry-point path when asking for more indices to check + - It's OK to use a feature toggle to switch between Rc and Arc diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 5a5cbd03d54..b356ef648b7 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -7,26 +7,20 @@ mod features { pub type OwnShared = Arc; pub type Mutable = parking_lot::Mutex; - pub fn into_shared(v: T) -> OwnShared { - Arc::new(v) - } - pub fn with_interior_mutability(v: T) -> Mutable { - parking_lot::Mutex::new(v) + pub fn get_mut(v: &Mutable) -> parking_lot::MutexGuard<'_, T> { + v.lock() } } mod local { - use std::cell::RefCell; + use std::cell::{RefCell, RefMut}; use std::rc::Rc; pub type OwnShared = Rc; pub type Mutable = RefCell; - pub fn into_shared(v: T) -> Rc { - Rc::new(v) - } - pub fn with_interior_mutability(v: T) -> Mutable { - RefCell::new(v) + pub fn get_mut(v: &Mutable) -> RefMut<'_, T> { + v.borrow_mut() } } @@ -38,9 +32,10 @@ mod features { mod odb { use crate::features; - use std::borrow::BorrowMut; + use crate::features::get_mut; + use std::io; use std::ops::Deref; - use std::path::PathBuf; + use std::path::Path; pub mod pack { pub struct IndexMarker(u32); @@ -64,28 +59,25 @@ mod odb { type PackIndex: Deref; fn next_indices( - &mut self, + &self, + objects_directory: &Path, marker: Option, ) -> std::io::Result>; } + #[derive(Default)] pub struct Eager { - db_paths: Vec, - bundles: features::Mutable>>, + state: features::Mutable, } - #[derive(Debug, thiserror::Error)] - pub enum Error { - #[error(transparent)] - AlternateResolve(#[from] git_odb::alternate::Error), - } + mod eager { + use crate::features; + use std::path::PathBuf; - impl Eager { - pub fn at(objects_directory: impl Into) -> Result { - Ok(Eager { - db_paths: git_odb::alternate::resolve(objects_directory)?, - bundles: features::Mutable::new(Vec::new()), - }) + #[derive(Default)] + pub struct State { + pub(crate) db_paths: Vec, + pub(crate) bundles: Vec>, } } @@ -94,16 +86,35 @@ mod odb { type PackIndex = features::OwnShared; fn next_indices( - &mut self, + &self, + objects_directory: &Path, marker: Option, ) -> std::io::Result> { - let bundles = self.bundles.borrow_mut(); + let mut state = get_mut(&self.state); + if state.db_paths.is_empty() { + state.db_paths.extend( + git_odb::alternate::resolve(objects_directory) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?, + ); 
+            } else {
+                debug_assert_eq!(
+                    Some(objects_directory),
+                    state.db_paths.get(0).map(|p| p.as_path()),
+                    "Eager policy can't be shared across different repositories"
+                );
+            }
+
+            // if bundles.is_empty()
+            // match marker {
+            //     Some(marker) if marker == bundles.len() => todo!()
+            // }
+            // git_odb::alternate::resolve(objects_directory)
             todo!()
         }
     }
 
     fn try_setup() -> anyhow::Result<()> {
-        let policy = Eager::at(".git/objects")?;
+        let policy = Eager::default();
         // let policy = Box::new(EagerLocal::default())
         //     as Box<
         //         dyn DynPolicy<

From 4b84a4800f8cb26847fff3bc39c87a2aee5a1a5b Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Mon, 22 Nov 2021 18:12:15 +0800
Subject: =?UTF-8?q?All=20logic=20for=20handling=20eager=20re?=
 =?UTF-8?q?freshing=E2=80=A6=20(#259)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…which looks like it will be similar in other implementations too. Maybe
there could be one store that has configuration that can be used with a
'frontend', producing its own local state for later use in Easy state?
That looks more like it actually, and generics don't have to stay either,
nice.
---
 DEVELOPMENT.md                      |   4 +-
 experiments/odb-redesign/src/lib.rs | 123 ++++++++++++++++++++++------
 2 files changed, 101 insertions(+), 26 deletions(-)

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
index 07e03484a6d..e403877762a 100644
--- a/DEVELOPMENT.md
+++ b/DEVELOPMENT.md
@@ -568,7 +568,7 @@ The default favors speed and using all available cores, but savvy users can run

 ## Learnings

-### Loose references database

 - When deleting (with or without value constraint), wait for locks instead of failing to workaround `packed-refs` as choking point.
   It's certainly worth it to split transactions so that deletions are done separately from updates to allow usage of the most suitable locking strategies.
@@ -584,7 +584,7 @@ The default favors speed and using all available cores, but savvy users can run

 Please note that these are based on the following value system:

 - We value _highly_ to scale object access performance with cores.
-- We value _more_ to offer a choice of trade-offs than to aim for a one-size-fits-all solution, unless the latter has no shortcomings.
 - We don't value the handling of out of memory situations differently than panicking. This might change if `gitoxide` should fly to Mars or land in the linux kernel though.
 - We don't value enabling 32 bit applications to deal with pack files greater than 4GB and leave this field entirely to the other implementations.
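To make the marker protocol in the following diff concrete, here is a minimal sketch of how a caller would drive it, assuming the patch's `Eager` policy and the `policy::{load_indices, PackIndexMarker, RefreshMode}` items are in scope. It uses `RefreshMode::Never` so a lookup of a missing object terminates via `NoMoreIndices` rather than triggering endless refreshes; `lookup()` is `git_pack::index::File::lookup`. This is an illustration of the protocol, not code from the patch.

```rust
// Sketch only: search known pack indices for `id`, asking the policy for
// more indices until there is a hit or nothing new arrives.
fn contains_object(
    eager: &Eager,
    objects_directory: &std::path::Path,
    id: &git_hash::oid,
) -> std::io::Result<bool> {
    let mut marker: Option<PackIndexMarker> = None;
    let mut indices = Vec::new();
    loop {
        let options = load_indices::Options {
            objects_directory,
            mode: RefreshMode::Never,
        };
        match eager.load_next_indices(options, marker.take())? {
            load_indices::Outcome::Replace { indices: all, mark } => {
                indices = all; // a new generation invalidates everything held so far
                marker = Some(mark);
            }
            load_indices::Outcome::Extend { indices: new, mark } => {
                indices.extend(new); // same generation: only new indices arrive
                marker = Some(mark);
            }
            load_indices::Outcome::NoMoreIndices => return Ok(false),
        }
        if indices.iter().any(|index| index.lookup(id).is_some()) {
            return Ok(true);
        }
    }
}
```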
diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index b356ef648b7..59657afdc1d 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -33,20 +33,46 @@ mod features { mod odb { use crate::features; use crate::features::get_mut; + use crate::odb::policy::{load_indices, PackIndexMarker}; use std::io; use std::ops::Deref; use std::path::Path; - pub mod pack { - pub struct IndexMarker(u32); + pub mod policy { + /// A way to indicate which pack indices we have seen already + pub struct PackIndexMarker { + /// The generation the marker belongs to, is incremented on each refresh and possibly only if there is an actual change + pub generation: u8, + /// The amount of pack indices available + pub pack_index_count: usize, + } + + /// Define how packs will be refreshed when all indices are loaded + pub enum RefreshMode { + AfterAllIndicesLoaded, + /// Use this if you expect a lot of missing objects that shouldn't trigger refreshes + Never, + } - pub mod next_indices { - use crate::odb::pack::IndexMarker; + pub mod load_indices { + use crate::odb::policy::{PackIndexMarker, RefreshMode}; + use std::path::Path; + + pub struct Options<'a> { + pub objects_directory: &'a Path, + pub mode: RefreshMode, + } pub enum Outcome { - Next { + /// Replace your set with the given one + Replace { + indices: Vec, + mark: PackIndexMarker, + }, + /// Extend with the given indices and keep searching + Extend { indices: Vec, // should probably be small vec to get around most allocations - mark: IndexMarker, // use to show where you left off next time you call + mark: PackIndexMarker, // use to show where you left off next time you call }, /// No new indices to look at, caller should stop give up NoMoreIndices, @@ -58,11 +84,11 @@ mod odb { type PackData: Deref; type PackIndex: Deref; - fn next_indices( + fn load_next_indices( &self, - objects_directory: &Path, - marker: Option, - ) -> std::io::Result>; + options: policy::load_indices::Options<'_>, + marker: Option, + ) -> std::io::Result>; } #[derive(Default)] @@ -77,7 +103,8 @@ mod odb { #[derive(Default)] pub struct State { pub(crate) db_paths: Vec, - pub(crate) bundles: Vec>, + pub(crate) indices: Vec>, + pub(crate) generation: u8, } } @@ -85,17 +112,17 @@ mod odb { type PackData = features::OwnShared; type PackIndex = features::OwnShared; - fn next_indices( + fn load_next_indices( &self, - objects_directory: &Path, - marker: Option, - ) -> std::io::Result> { + policy::load_indices::Options { + objects_directory, + mode, + }: policy::load_indices::Options<'_>, + marker: Option, + ) -> std::io::Result> { let mut state = get_mut(&self.state); if state.db_paths.is_empty() { - state.db_paths.extend( - git_odb::alternate::resolve(objects_directory) - .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?, - ); + return Self::refresh(&mut state, objects_directory); } else { debug_assert_eq!( Some(objects_directory), @@ -104,11 +131,59 @@ mod odb { ); } - // if bundles.is_empty() - // match marker { - // Some(marker) if marker == bundles.len() => todo!() - // } - // git_odb::alternate::resolve(objects_directory) + Ok(match marker { + Some(marker) => { + if marker.generation != state.generation { + load_indices::Outcome::Replace { + indices: state.indices.clone(), + mark: PackIndexMarker { + generation: state.generation, + pack_index_count: state.indices.len(), + }, + } + } else { + if marker.pack_index_count == state.indices.len() { + match mode { + policy::RefreshMode::Never => 
load_indices::Outcome::NoMoreIndices, + policy::RefreshMode::AfterAllIndicesLoaded => { + return Self::refresh(&mut state, objects_directory) + } + } + } else { + load_indices::Outcome::Replace { + indices: state.indices[marker.pack_index_count..].to_vec(), + mark: PackIndexMarker { + generation: state.generation, + pack_index_count: state.indices.len(), + }, + } + } + } + } + None => load_indices::Outcome::Replace { + indices: state.indices.clone(), + mark: PackIndexMarker { + generation: state.generation, + pack_index_count: state.indices.len(), + }, + }, + }) + } + } + + impl Eager { + fn refresh( + eager::State { + db_paths, + indices: bundles, + generation, + }: &mut eager::State, + objects_directory: &Path, + ) -> io::Result>> { + db_paths.extend( + git_odb::alternate::resolve(objects_directory) + .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?, + ); todo!() } } From 1837abc79c6ff5b03135dba96100985f80d8c8ef Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Mon, 22 Nov 2021 18:19:44 +0800 Subject: [PATCH 28/57] change direction towards something simpler that will still be fast (#259) --- DEVELOPMENT.md | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index e403877762a..2d599e9bc1a 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -638,27 +638,29 @@ Please note that these are based on the following value system: Doing this would also mean that the Repository doesn't even have a repository anymore, but just a `pack::Policy`. - It's still unclear how to best control repeated refreshes in presence of the possibility of blobs missing due to `--filter=blob`. Maybe this is the moment where one would access the policy directly to turn off refreshes for the duration of the operation. - - **in the clear** - - Algorithmically, object access starts out with `Indices`, fetching more until the object is contained within, then the corresponding the pack data is loaded (if needed) - to possibly extract object data. - - The result of a contains check would have to be a pack id to allow looking it up in the own cache and then ask it to be loaded/returned by the `Policy`, along with - maybe the indices required to fetch it from the pack, maybe just a `bundle::Location` of sorts. It could also be a struct to encode the pack-id and the index within it. - - `pack::Bundle` won't be used within the ODB anymore as it doesn't allow such separation and won't work well with multi pack indices. - - a `Policy` could implement the building blocks needed by that algorithm. - - The `Policy` should go through `Deref` to allow for different ways of internal shared ownership of actual indices, but that would also mean multiple implementations - would either duplicate code or forward to even more generic implementations. - `Views` work if they are placed in the state and are thread-local for that reason, with interior mutability. A `view` will just be the linked odb implementation itself. - - It should contain a borrowed `Policy` which is owned in the shared `Repository`. The latter should contains a list of paths to object databases (i.e. alternates) to - allow seeing all multi-pack indices and indices like it is one repository. + - It should contain a borrowed `Policy` which is owned in the shared `Repository`. The latter should contains a list of paths to object databases (i.e. alternates) to + allow seeing all multi-pack indices and indices like it is one repository. 
- `Repository` turns into `RepositoryLocal` with a `Rc` that isn't `Sync` and adds a `Repository` type that does the same but with `Arc`. - - each of these repository types has their own `Easy` types. - - _Difficulties_: Some `Platform` types store `repo: Access::RepoRef` and use `repo.deref().odb`, but that now only has a `Policy` from which a new `State` should be created - or they store the State right away… . + - each of these repository types has their own `Easy` types. + - _Difficulties_: Some `Platform` types store `repo: Access::RepoRef` and use `repo.deref().odb`, but that now only has a `Policy` from which a new `State` should be created + or they store the State right away… . - The default `Policy` should be the least surprising, hence incur mapping costs over time and automatically refresh itself if needed. - Make sure auto-refresh can be turned off on policy level to allow users to probe for objects despite `--filter=blob` or similar without slowing down to a crawl due to a refresh each time an object is missing. - - The way this is going, `Deref` can panic in single-threaded applications only if recursion is used. Thread-safe versions of this will hang + - The way this is going, `Deref` can panic in single-threaded applications only if recursion is used. Thread-safe versions of this will hang unless a reentrant mutex is used. Since we don't call ourselves recursively (i.e. during `find_object(…)`, this won't be an issue. It should also be impossible in single-threaded mode even with multiple `Easy` instances. - memory map pooling across repositories can work if the odb informs about the entry-point path when asking for more indices to check - It's OK to use a feature toggle to switch between Rc and Arc + - **in the clear** + - Algorithmically, object access starts out with `Indices`, fetching more until the object is contained within, then the corresponding the pack data is loaded (if needed) + to possibly extract object data. + - The result of a contains check would have to be a pack id to allow looking it up in the own cache and then ask it to be loaded/returned by the `Policy`, along with + maybe the indices required to fetch it from the pack, maybe just a `bundle::Location` of sorts. It could also be a struct to encode the pack-id and the index within it. + - `pack::Bundle` won't be used within the ODB anymore as it doesn't allow such separation and won't work well with multi pack indices. + - a `Policy` could implement the building blocks needed by that algorithm. + - The `Policy` should go through `Deref` to allow for different ways of internal shared ownership of actual indices, but that would also mean multiple implementations + would either duplicate code or forward to even more generic implementations. + - It looks like building a configurable 'can-do-it-call' store is more like it and would use compile-time types to avoid generics entirely. This could live in the Repository + as before. From 5e9250f5027e4b2c701ceae72a6038ac2a4a2093 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 09:03:16 +0800 Subject: [PATCH 29/57] =?UTF-8?q?Turns=20out=20the=20new=20`PolicyStore`?= =?UTF-8?q?=20can=20co-exist=20with=20existing=20one=E2=80=A6=20(#259)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …which needs refactoring too to become simpler and deal with alternates in a less convoluted fashion. 
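What "co-exist" could look like from the outside: several lightweight per-repository frontends sharing one `Policy`, so memory maps are pooled across repositories. The names below come from this patch's diff, but the function is a sketch that assumes it lives in the same module as `PolicyStore` (its fields are private), and `features::OwnShared` is the `Arc`/`Rc` alias from the experiment.

```rust
// Sketch: one shared Policy pools resources for any number of
// repository-local frontends; cloning OwnShared is a refcount bump.
fn frontends_sharing_one_policy(repos: Vec<std::path::PathBuf>) -> Vec<PolicyStore> {
    let shared = features::OwnShared::new(Policy::eager());
    repos
        .into_iter()
        .map(|objects_directory| PolicyStore {
            policy: features::OwnShared::clone(&shared),
            objects_directory,
        })
        .collect()
}
```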
--- Cargo.lock | 1 + DEVELOPMENT.md | 9 ++ experiments/odb-redesign/Cargo.toml | 1 + experiments/odb-redesign/src/lib.rs | 215 ++++++++++++++++------------ git-pack/src/find_traits.rs | 2 +- 5 files changed, 139 insertions(+), 89 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e0ba3758586..bcfcd6226c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1841,6 +1841,7 @@ name = "odb-redesign" version = "0.1.0" dependencies = [ "anyhow", + "git-hash", "git-odb", "git-pack", "parking_lot", diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 2d599e9bc1a..65559fbe057 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -664,3 +664,12 @@ Please note that these are based on the following value system: would either duplicate code or forward to even more generic implementations. - It looks like building a configurable 'can-do-it-call' store is more like it and would use compile-time types to avoid generics entirely. This could live in the Repository as before. + - Having it in a shared-ownership configurable `Policy` is probably the way to go as it would allow sharing mappings across repositories while implementing ways of handling + them. + - when building packs, it's vital that the pack indices stay stable and don't go through a refresh (which isn't observable for the one finding objects). Thus it's vital + that packs are built with their own object database that is configured to not refresh packs or better even, eager policy without refresh. The latter requires a well-maintained + object database due to lots of additional file handles needed, or alternatively an algorithm which fails if a refresh would be needed but instead retries if an object wasn't found, + for example when a pack can't be (lazy) loaded as it was removed during maintenance. In other words, those creating packs definitely have to deal with failure specifically and + probably just retry + - Also it seems that the existing implementation has merrit, but should probably be altered to be a single store (without policy) instead also to fix latent issues around + addressing packs in alternate repositories. 
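The "deal with failure specifically and probably just retry" idea for pack creators in the notes above can be captured generically. A minimal, self-contained sketch, where `lookup` stands in for an object query and `refresh` for a hypothetical rescan of the objects directory:

```rust
// Retry exactly once after a refresh: if a counted object's pack was removed
// during maintenance, a single rescan either brings it back or proves it gone.
fn with_one_refresh_retry<T>(
    mut lookup: impl FnMut() -> Option<T>,
    refresh: impl FnOnce() -> std::io::Result<()>,
) -> std::io::Result<Option<T>> {
    if let Some(found) = lookup() {
        return Ok(Some(found));
    }
    refresh()?; // hypothetical: rescan the objects directory once
    Ok(lookup())
}
```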
diff --git a/experiments/odb-redesign/Cargo.toml b/experiments/odb-redesign/Cargo.toml index 55d7520be37..9b212fdb8dc 100644 --- a/experiments/odb-redesign/Cargo.toml +++ b/experiments/odb-redesign/Cargo.toml @@ -12,6 +12,7 @@ thread-safe = [] [dependencies] git-pack = { path = "../../git-pack", version = "*" } git-odb = { path = "../../git-odb", version = "*" } +git-hash = { path = "../../git-hash", version ="^0.8.0" } parking_lot = { version = "0.11.0", default-features = false } thiserror = "1.0.30" anyhow = "1.0.47" diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 59657afdc1d..e4a1db0afe6 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -34,11 +34,33 @@ mod odb { use crate::features; use crate::features::get_mut; use crate::odb::policy::{load_indices, PackIndexMarker}; + use git_odb::data::Object; + use git_odb::pack::bundle::Location; + use git_odb::pack::cache::DecodeEntry; + use git_odb::pack::find::Entry; + use git_odb::pack::Bundle; use std::io; - use std::ops::Deref; - use std::path::Path; + use std::ops::DerefMut; + use std::path::{Path, PathBuf}; pub mod policy { + + pub(crate) enum State { + Eager(eager::State), + } + + pub(crate) mod eager { + use crate::features; + use std::path::PathBuf; + + #[derive(Default)] + pub struct State { + pub(crate) db_paths: Vec, + pub(crate) indices: Vec>, + pub(crate) generation: u8, + } + } + /// A way to indicate which pack indices we have seen already pub struct PackIndexMarker { /// The generation the marker belongs to, is incremented on each refresh and possibly only if there is an actual change @@ -80,104 +102,89 @@ mod odb { } } - pub trait Policy { - type PackData: Deref; - type PackIndex: Deref; - - fn load_next_indices( - &self, - options: policy::load_indices::Options<'_>, - marker: Option, - ) -> std::io::Result>; + pub struct Policy { + state: features::Mutable, } - #[derive(Default)] - pub struct Eager { - state: features::Mutable, - } - - mod eager { - use crate::features; - use std::path::PathBuf; - - #[derive(Default)] - pub struct State { - pub(crate) db_paths: Vec, - pub(crate) indices: Vec>, - pub(crate) generation: u8, + impl Policy { + fn eager() -> Self { + Policy { + state: features::Mutable::new(policy::State::Eager(policy::eager::State::default())), + } } } - impl Policy for Eager { - type PackData = features::OwnShared; - type PackIndex = features::OwnShared; - - fn load_next_indices( + impl Policy { + pub fn load_next_indices( &self, - policy::load_indices::Options { + load_indices::Options { objects_directory, mode, - }: policy::load_indices::Options<'_>, + }: load_indices::Options<'_>, marker: Option, - ) -> std::io::Result> { + ) -> std::io::Result>> { let mut state = get_mut(&self.state); - if state.db_paths.is_empty() { - return Self::refresh(&mut state, objects_directory); - } else { - debug_assert_eq!( - Some(objects_directory), - state.db_paths.get(0).map(|p| p.as_path()), - "Eager policy can't be shared across different repositories" - ); - } + match state.deref_mut() { + policy::State::Eager(state) => { + if state.db_paths.is_empty() { + return Self::refresh(state, objects_directory); + } else { + debug_assert_eq!( + Some(objects_directory), + state.db_paths.get(0).map(|p| p.as_path()), + "Eager policy can't be shared across different repositories" + ); + } - Ok(match marker { - Some(marker) => { - if marker.generation != state.generation { - load_indices::Outcome::Replace { + Ok(match marker { + Some(marker) => { + if 
marker.generation != state.generation { + load_indices::Outcome::Replace { + indices: state.indices.clone(), + mark: PackIndexMarker { + generation: state.generation, + pack_index_count: state.indices.len(), + }, + } + } else { + if marker.pack_index_count == state.indices.len() { + match mode { + policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices, + policy::RefreshMode::AfterAllIndicesLoaded => { + return Self::refresh(state, objects_directory) + } + } + } else { + load_indices::Outcome::Replace { + indices: state.indices[marker.pack_index_count..].to_vec(), + mark: PackIndexMarker { + generation: state.generation, + pack_index_count: state.indices.len(), + }, + } + } + } + } + None => load_indices::Outcome::Replace { indices: state.indices.clone(), mark: PackIndexMarker { generation: state.generation, pack_index_count: state.indices.len(), }, - } - } else { - if marker.pack_index_count == state.indices.len() { - match mode { - policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices, - policy::RefreshMode::AfterAllIndicesLoaded => { - return Self::refresh(&mut state, objects_directory) - } - } - } else { - load_indices::Outcome::Replace { - indices: state.indices[marker.pack_index_count..].to_vec(), - mark: PackIndexMarker { - generation: state.generation, - pack_index_count: state.indices.len(), - }, - } - } - } + }, + }) } - None => load_indices::Outcome::Replace { - indices: state.indices.clone(), - mark: PackIndexMarker { - generation: state.generation, - pack_index_count: state.indices.len(), - }, - }, - }) + } } - } - impl Eager { + /// If there is only additive changes, there is no need for a new `generation` actually, which helps + /// callers to retain stability. fn refresh( - eager::State { + policy::eager::State { db_paths, indices: bundles, generation, - }: &mut eager::State, + }: &mut policy::eager::State, objects_directory: &Path, ) -> io::Result>> { db_paths.extend( @@ -188,8 +195,39 @@ mod odb { } } + /// The store shares a policy and + pub struct PolicyStore { + policy: features::OwnShared, + objects_directory: PathBuf, + } + + impl git_odb::Find for PolicyStore { + type Error = git_odb::compound::find::Error; + + fn try_find<'a>( + &self, + id: impl AsRef, + buffer: &'a mut Vec, + pack_cache: &mut impl DecodeEntry, + ) -> Result>, Self::Error> { + todo!() + } + + fn location_by_oid(&self, id: impl AsRef, buf: &mut Vec) -> Option { + todo!() + } + + fn bundle_by_pack_id(&self, pack_id: u32) -> Option<&Bundle> { + todo!() + } + + fn entry_by_location(&self, location: &Location) -> Option> { + todo!() + } + } + fn try_setup() -> anyhow::Result<()> { - let policy = Eager::default(); + let policy = Policy::eager(); // let policy = Box::new(EagerLocal::default()) // as Box< // dyn DynPolicy< @@ -204,18 +242,19 @@ mod odb { mod repository { use crate::odb; - pub mod raw { - use crate::odb; + mod raw { + use git_pack::Find; - /// Using generics here would mean we need policy to handle its mutability itself, pushing it down might be easiest if generics - /// should be a thing. - /// Without generics, there would be a thread-safe and thread-local version of everything. - /// Maybe this should be solved with a feature toggle instead? Aka thread-safe or not? - pub struct RepositoryGeneric { - pack_policy: PackPolicy, + pub struct Repository + where + Odb: Find, // + Contains + Refresh/Reset maybe? 
+ { + odb: Odb, } } - /// Exposed type top-level repository to hide generic complexity, with one-size-fits-most default - type Repository = raw::RepositoryGeneric; + /// Maybe we will end up providing a generic version as there still seems to be benefits in having a read-only Store implementation. + pub struct Repository { + odb: odb::PolicyStore, + } } diff --git a/git-pack/src/find_traits.rs b/git-pack/src/find_traits.rs index aeb2c28efbd..00e41818fbb 100644 --- a/git-pack/src/find_traits.rs +++ b/git-pack/src/find_traits.rs @@ -4,7 +4,7 @@ use crate::{data, find}; /// /// ## Notes /// -/// Locate effectively needs [generic associated types][issue] to allow a trait for the returned object type. +/// Find effectively needs [generic associated types][issue] to allow a trait for the returned object type. /// Until then, we will have to make due with explicit types and give them the potentially added features we want. /// /// [issue]: https://github.com/rust-lang/rust/issues/44265 From ab1562e2d39f1b67ce3a61af53af625e5707c522 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 13:49:11 +0800 Subject: [PATCH 30/57] some more clarity around generations and cache updates (#259) --- experiments/odb-redesign/src/lib.rs | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index e4a1db0afe6..b5e194c2432 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -63,16 +63,20 @@ mod odb { /// A way to indicate which pack indices we have seen already pub struct PackIndexMarker { - /// The generation the marker belongs to, is incremented on each refresh and possibly only if there is an actual change + /// The generation the marker belongs to, is incremented to indicate that there were changes that caused the removal of a + /// pack and require the caller to rebuild their cache to free resources. pub generation: u8, - /// The amount of pack indices available - pub pack_index_count: usize, + /// An ever increasing number within a generation indicating the number of loaded pack indices. It's reset with + /// each new generation. + pub pack_index_sequence: usize, } /// Define how packs will be refreshed when all indices are loaded pub enum RefreshMode { + /// Check for new or changed pack indices when the last known index is loaded. AfterAllIndicesLoaded, - /// Use this if you expect a lot of missing objects that shouldn't trigger refreshes + /// Use this if you expect a lot of missing objects that shouldn't trigger refreshes even after all packs are loaded. + /// This comes at the risk of not learning that the packs have changed in the mean time. 
Never, } @@ -143,11 +147,11 @@ mod odb { indices: state.indices.clone(), mark: PackIndexMarker { generation: state.generation, - pack_index_count: state.indices.len(), + pack_index_sequence: state.indices.len(), }, } } else { - if marker.pack_index_count == state.indices.len() { + if marker.pack_index_sequence == state.indices.len() { match mode { policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices, policy::RefreshMode::AfterAllIndicesLoaded => { @@ -155,11 +159,11 @@ mod odb { } } } else { - load_indices::Outcome::Replace { - indices: state.indices[marker.pack_index_count..].to_vec(), + load_indices::Outcome::Extend { + indices: state.indices[marker.pack_index_sequence..].to_vec(), mark: PackIndexMarker { generation: state.generation, - pack_index_count: state.indices.len(), + pack_index_sequence: state.indices.len(), }, } } @@ -169,7 +173,7 @@ mod odb { indices: state.indices.clone(), mark: PackIndexMarker { generation: state.generation, - pack_index_count: state.indices.len(), + pack_index_sequence: state.indices.len(), }, }, }) From 3c790e01de0dbd3ffa2683d5cf060723d11d64a5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 14:44:16 +0800 Subject: [PATCH 31/57] remove trait bounds to allow single-threaded applications to exist (#259) As long as there is a feature toggle to build single-threaded applications, they can benefit only if trait bounds don't enforce thread Send and Sync unless truly needed. As long as everything is goverened by the same feature toggles to create fitting types, this will work transparently. Previously this wasn't done as it was the assumption that code is always capable of multi-threading even if it choses not to do it. Now we will have types that don't actually support it due to interior mutability, hence the relaxation of bounds where possible. --- DEVELOPMENT.md | 10 ++++++++++ git-features/src/parallel/serial.rs | 10 ++++------ git-pack/src/data/output/count/objects/mod.rs | 12 ++++++------ git-pack/src/data/output/entry/iter_from_counts.rs | 4 ++-- 4 files changed, 22 insertions(+), 14 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 65559fbe057..9d140a41e80 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -673,3 +673,13 @@ Please note that these are based on the following value system: probably just retry - Also it seems that the existing implementation has merrit, but should probably be altered to be a single store (without policy) instead also to fix latent issues around addressing packs in alternate repositories. + - The new ODB implementation and the previous one are pretty incompatible because the old one is Sync, but the new one is designed not to be (i.e.) use thread-local storage. + Existing implementations need to adjust their trait bounds and operate differently to accommodate. Counting objects and packing would be a good benchmark though, even though + the latter doesn't even scale that well due to the required dashmap to check for existing objects. In other words, currently there seems to be no actual benchmark for parallel + usage. + - In single-threaded operation the trait-bounds would prevent creation of packs unless they are adjusted as well, leading to `git-pack` requiring its own feature toggle + which we really try hard to avoid, but probably can be placed on application level, which has to use that to setup git-features accordingly, making it bearable. 
This + means though that we need to implement single-threaded and multi-threaded versions of everything important, like pack generation based on the count (which already has + a single-threaded version). + - Maybe… after some benchmarks, we can entirely drop the single-threaded version if it's not significantly faster on a single thread (without thread primiives) than the + same with multi-threading (but a single-thread). diff --git a/git-features/src/parallel/serial.rs b/git-features/src/parallel/serial.rs index ea22a37bb5b..ddc6c3ef589 100644 --- a/git-features/src/parallel/serial.rs +++ b/git-features/src/parallel/serial.rs @@ -2,7 +2,7 @@ use crate::parallel::Reduce; /// Runs `left` and then `right`, one after another, returning their output when both are done. #[cfg(not(feature = "parallel"))] -pub fn join(left: impl FnOnce() -> O1 + Send, right: impl FnOnce() -> O2 + Send) -> (O1, O2) { +pub fn join(left: impl FnOnce() -> O1, right: impl FnOnce() -> O2) -> (O1, O2) { (left(), right()) } @@ -17,16 +17,14 @@ pub fn join(left: impl FnOnce() -> O1 + Send, right: impl Fn /// /// **This serial version performing all calculations on the current thread.** pub fn in_parallel( - input: impl Iterator + Send, + input: impl Iterator, _thread_limit: Option, - new_thread_state: impl Fn(usize) -> S + Send + Sync, - consume: impl Fn(I, &mut S) -> O + Send + Sync, + new_thread_state: impl Fn(usize) -> S, + consume: impl Fn(I, &mut S) -> O, mut reducer: R, ) -> Result<::Output, ::Error> where R: Reduce, - I: Send, - O: Send, { let mut state = new_thread_state(0); for item in input { diff --git a/git-pack/src/data/output/count/objects/mod.rs b/git-pack/src/data/output/count/objects/mod.rs index f87ed99b7da..6c7e466c459 100644 --- a/git-pack/src/data/output/count/objects/mod.rs +++ b/git-pack/src/data/output/count/objects/mod.rs @@ -121,9 +121,9 @@ pub fn objects_unthreaded( input_object_expansion: ObjectExpansion, ) -> Result, IterErr> where - Find: crate::Find + Send + Sync, - Oid: Into + Send, - IterErr: std::error::Error + Send, + Find: crate::Find, + Oid: Into, + IterErr: std::error::Error, { let seen_objs = RefCell::new(HashSet::::default()); @@ -175,9 +175,9 @@ mod expand { allow_pack_lookups: bool, ) -> super::Result, IterErr> where - Find: crate::Find + Send + Sync, - Oid: Into + Send, - IterErr: std::error::Error + Send, + Find: crate::Find, + Oid: Into, + IterErr: std::error::Error, { use ObjectExpansion::*; diff --git a/git-pack/src/data/output/entry/iter_from_counts.rs b/git-pack/src/data/output/entry/iter_from_counts.rs index 5e5a634dd92..a4038254f37 100644 --- a/git-pack/src/data/output/entry/iter_from_counts.rs +++ b/git-pack/src/data/output/entry/iter_from_counts.rs @@ -42,7 +42,7 @@ use crate::data::{output, output::ChunkId}; pub fn iter_from_counts( mut counts: Vec, db: Find, - make_cache: impl Fn() -> Cache + Send + Clone + Sync + 'static, + make_cache: impl Fn() -> Cache + Send + Sync + 'static, mut progress: impl Progress, Options { version, @@ -54,7 +54,7 @@ pub fn iter_from_counts( ) -> impl Iterator), Error>> + parallel::reduce::Finalize>> where - Find: crate::Find + Clone + Send + Sync + 'static, + Find: crate::Find + Send + Sync + 'static, ::Error: Send, Cache: crate::cache::DecodeEntry, { From c805d0b231cf4d2f51dae7705bfbbc6562f86c32 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 16:02:14 +0800 Subject: [PATCH 32/57] unify trait bounds for parallel code: prefer Clone over Sync (#259) --- DEVELOPMENT.md | 6 ++++++ git-features/src/parallel/in_parallel.rs 
| 8 ++++---- git-features/src/parallel/mod.rs | 4 ++-- git-features/src/parallel/reduce.rs | 20 +++++++------------ git-pack/src/data/output/count/objects/mod.rs | 4 ++-- .../src/data/output/entry/iter_from_counts.rs | 6 +++--- 6 files changed, 24 insertions(+), 24 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 9d140a41e80..2040bb0dd7f 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -683,3 +683,9 @@ Please note that these are based on the following value system: a single-threaded version). - Maybe… after some benchmarks, we can entirely drop the single-threaded version if it's not significantly faster on a single thread (without thread primiives) than the same with multi-threading (but a single-thread). + - Servers would be split into readers and writers, where… + - …readers share a common pool and thus all maps, with lazy loading and refresh (but their pack-ids change due to that, and packs might disappear) + - …writers when… + - …cloning use an eagerly loaded Repository just for that particular clone for stable pack ids + - …fetches use a lazy-loaded Repository with refresh disabled, and full retries if the pack they were referring to goes away. Maybe there can be a policy for that to keep + pack ids stable despite refresh, which would also solve clones which could then lazy-load. diff --git a/git-features/src/parallel/in_parallel.rs b/git-features/src/parallel/in_parallel.rs index 8c9b6d228bf..55f8935c40b 100644 --- a/git-features/src/parallel/in_parallel.rs +++ b/git-features/src/parallel/in_parallel.rs @@ -22,8 +22,8 @@ pub fn join(left: impl FnOnce() -> O1 + Send, right: impl Fn pub fn in_parallel( input: impl Iterator + Send, thread_limit: Option, - new_thread_state: impl Fn(usize) -> S + Send + Sync, - consume: impl Fn(I, &mut S) -> O + Send + Sync, + new_thread_state: impl Fn(usize) -> S + Send + Clone, + consume: impl Fn(I, &mut S) -> O + Send + Clone, mut reducer: R, ) -> Result<::Output, ::Error> where @@ -32,8 +32,6 @@ where O: Send, { let num_threads = num_threads(thread_limit); - let new_thread_state = &new_thread_state; - let consume = &consume; crossbeam_utils::thread::scope(move |s| { let receive_result = { let (send_input, receive_input) = crossbeam_channel::bounded::(num_threads); @@ -42,6 +40,8 @@ where s.spawn({ let send_result = send_result.clone(); let receive_input = receive_input.clone(); + let new_thread_state = new_thread_state.clone(); + let consume = consume.clone(); move |_| { let mut state = new_thread_state(thread_id); for item in receive_input { diff --git a/git-features/src/parallel/mod.rs b/git-features/src/parallel/mod.rs index 7d6a38e8aa0..9ad5af7decc 100644 --- a/git-features/src/parallel/mod.rs +++ b/git-features/src/parallel/mod.rs @@ -133,8 +133,8 @@ pub fn in_parallel_if( condition: impl FnOnce() -> bool, input: impl Iterator + Send, thread_limit: Option, - new_thread_state: impl Fn(usize) -> S + Send + Sync, - consume: impl Fn(I, &mut S) -> O + Send + Sync, + new_thread_state: impl Fn(usize) -> S + Send + Clone, + consume: impl Fn(I, &mut S) -> O + Send + Clone, reducer: R, ) -> Result<::Output, ::Error> where diff --git a/git-features/src/parallel/reduce.rs b/git-features/src/parallel/reduce.rs index c3835724c1a..3a700317a32 100644 --- a/git-features/src/parallel/reduce.rs +++ b/git-features/src/parallel/reduce.rs @@ -138,11 +138,9 @@ mod stepped { impl Stepwise where - InputIter: Iterator + Send, - ConsumeFn: Fn(I, &mut S) -> O + Send + Sync, + InputIter: Iterator, + ConsumeFn: Fn(I, &mut S) -> O, Reduce: super::Reduce, - 
I: Send, - O: Send, { /// Instantiate a new iterator. /// For a description of parameters, see [`in_parallel()`][crate::parallel::in_parallel()]. @@ -154,7 +152,7 @@ mod stepped { reducer: Reduce, ) -> Self where - ThreadStateFn: Fn(usize) -> S + Send + Sync, + ThreadStateFn: Fn(usize) -> S, { Stepwise { input, @@ -175,11 +173,9 @@ mod stepped { impl Iterator for Stepwise where - InputIter: Iterator + Send, - ConsumeFn: Fn(I, &mut ThreadState) -> O + Send + Sync, + InputIter: Iterator, + ConsumeFn: Fn(I, &mut ThreadState) -> O, Reduce: super::Reduce, - I: Send, - O: Send, { type Item = Result; @@ -192,11 +188,9 @@ mod stepped { impl super::Finalize for Stepwise where - InputIter: Iterator + Send, - ConsumeFn: Fn(I, &mut S) -> O + Send + Sync, + InputIter: Iterator, + ConsumeFn: Fn(I, &mut S) -> O, R: super::Reduce, - I: Send, - O: Send, { type Reduce = R; diff --git a/git-pack/src/data/output/count/objects/mod.rs b/git-pack/src/data/output/count/objects/mod.rs index 6c7e466c459..0f5de74bc68 100644 --- a/git-pack/src/data/output/count/objects/mod.rs +++ b/git-pack/src/data/output/count/objects/mod.rs @@ -39,7 +39,7 @@ pub type Result = std::result::Result<(Vec, Outcome), Err /// * more configuration pub fn objects( db: Find, - make_caches: impl Fn() -> (PackCache, ObjectCache) + Send + Sync, + make_caches: impl Fn() -> (PackCache, ObjectCache) + Send + Clone, objects_ids: Iter, progress: impl Progress, should_interrupt: &AtomicBool, @@ -50,7 +50,7 @@ pub fn objects( }: Options, ) -> Result, IterErr> where - Find: crate::Find + Send + Sync, + Find: crate::Find + Send + Clone, ::Error: Send, Iter: Iterator> + Send, Oid: Into + Send, diff --git a/git-pack/src/data/output/entry/iter_from_counts.rs b/git-pack/src/data/output/entry/iter_from_counts.rs index a4038254f37..06b9660799d 100644 --- a/git-pack/src/data/output/entry/iter_from_counts.rs +++ b/git-pack/src/data/output/entry/iter_from_counts.rs @@ -42,7 +42,7 @@ use crate::data::{output, output::ChunkId}; pub fn iter_from_counts( mut counts: Vec, db: Find, - make_cache: impl Fn() -> Cache + Send + Sync + 'static, + make_cache: impl Fn() -> Cache + Send + Clone + Sync + 'static, mut progress: impl Progress, Options { version, @@ -54,7 +54,7 @@ pub fn iter_from_counts( ) -> impl Iterator), Error>> + parallel::reduce::Finalize>> where - Find: crate::Find + Send + Sync + 'static, + Find: crate::Find + Send + Clone + 'static, ::Error: Send, Cache: crate::cache::DecodeEntry, { @@ -78,7 +78,7 @@ where { let progress = Arc::clone(&progress); let counts = &counts; - let db = &db; + let db = db.clone(); move |chunk_range, buf| { let chunk = { let c = &counts[chunk_range]; From 4c00f78fcc52ddd93accda74e5711bbc62f20581 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 16:29:22 +0800 Subject: [PATCH 33/57] Sketch the future of the ref-db with namespace fix and upcoming reftable (#259) --- DEVELOPMENT.md | 9 +++++--- experiments/odb-redesign/src/lib.rs | 36 +++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 2040bb0dd7f..9794e0dca76 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -601,9 +601,9 @@ Please note that these are based on the following value system: 2. **Parameterize some sort of Policy into linked::ODB/compound::ODB** - First off, this needs an experiment to try it out quickly. - **initial thoughts** - - ✔️ Depending on the actual implementation of `Policy`, `Repository/Easy` will or will not be thread-safe. 
This excludes using a `Box<…>` there as it has different + - ❌ Depending on the actual implementation of `Policy`, `Repository/Easy` will or will not be thread-safe. This excludes using a `Box<…>` there as it has different trait bounds (once with and once without `Send + Sync`. I would like to avoid more feature toggles in `git-repository`, but could live with it. - - `Repository` would end up with type parameters if feature toggles aren't used, which could be compensated for with typedefs for the few known policies. However, this + - ✔️ `Repository` would end up with type parameters if feature toggles aren't used, which could be compensated for with typedefs for the few known policies. However, this would also lead in a type-explosion for `Easy` and may force it to have a type parameter too. - ❌ To keep the `Repository` free of type parameters we could boil policies down to typical policies, like Eager, Lazy, LazyThreadSafe, PooledLazy, PooledLazyThreadSafe, all with different tradeoffs. On that level, maybe 3 to 5 feature toggles would work, but who likes feature toggles especially if they aren't additive? @@ -671,8 +671,11 @@ Please note that these are based on the following value system: object database due to lots of additional file handles needed, or alternatively an algorithm which fails if a refresh would be needed but instead retries if an object wasn't found, for example when a pack can't be (lazy) loaded as it was removed during maintenance. In other words, those creating packs definitely have to deal with failure specifically and probably just retry - - Also it seems that the existing implementation has merrit, but should probably be altered to be a single store (without policy) instead also to fix latent issues around + - Also it seems that the existing implementation has merit, but should probably be altered to be a single store (without policy) instead also to fix latent issues around addressing packs in alternate repositories. + - The current store is Sync, and can easily be passed around behind an `Arc`, which is actually how it works currently as well even though the `Arc` is on the `Repository` + instead. + - Shared-ownership `Policy` based stores would work just like it, but each one has its own thread-local interior mutable cache. - The new ODB implementation and the previous one are pretty incompatible because the old one is Sync, but the new one is designed not to be (i.e.) use thread-local storage. Existing implementations need to adjust their trait bounds and operate differently to accommodate. Counting objects and packing would be a good benchmark though, even though the latter doesn't even scale that well due to the required dashmap to check for existing objects. 
In other words, currently there seems to be no actual benchmark for parallel diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index b5e194c2432..dcc2f0030c0 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -243,6 +243,42 @@ mod odb { } } +mod refs { + use crate::features; + use std::path::PathBuf; + + pub struct LooseStore { + path: PathBuf, + reflog_mode: u32, + // namespace absent + } + + pub struct RefTable { + path: PathBuf, + reflog_mode: u32, + // lot's of caching things, no namespace, needs mutability when doing anything (let's not hide it at all) + } + + mod inner { + use crate::features; + use crate::refs::{LooseStore, RefTable}; + + pub(crate) enum StoreSelection { + Loose(LooseStore), + RefTable(features::Mutable), + } + } + + #[derive(Clone)] + struct Store { + inner: features::OwnShared, + namespace: u32, + } + + // impl common interface but check how all this works with iterators, there is some implementation around that already + // and maybe this should just be its own State like thing… bet its own Easy so to say. +} + mod repository { use crate::odb; From 8cafc8d745ca5471a8218e23337b5bd6865b91dc Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 16:48:38 +0800 Subject: [PATCH 34/57] a path forward to deciding which ODB implementation to pick (#259) After testing performance, we can either decide to allow multiple impls thanks to type parameters, or just use one and keep them out of it. --- experiments/odb-redesign/src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index dcc2f0030c0..a9ffae18a2a 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -285,6 +285,10 @@ mod repository { mod raw { use git_pack::Find; + /// Let's avoid generics and rather switch the actual implementation with a feature toggle or just for testing. + /// After all, there is no use for keeping multiple implementations around just for a minor gain and a lot of added complexity. + /// Definitely run the existing experiments which are exercising the parallel code-paths perfectly and could be adjusted + /// to also try the access through easy. pub struct Repository where Odb: Find, // + Contains + Refresh/Reset maybe? From 977bd80d99f75296e28b97332c26a8acf8fd1a5b Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 17:06:22 +0800 Subject: [PATCH 35/57] This experiment shows that working through an Arc isn't slower than through a reference (#259) This is quite an amazing result and maybe it's just an M1 thing, but it's clearly showing that modern CPUs can handle that pretty well. We still keep the idea alive to make thread-safety switchable to avoid forcing overhead on other architectures or those who don't want to use threads. 
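For reference, the access pattern this experiment benchmarks, reduced to its essence: each worker thread gets its own `Arc` handle to one shared store instead of a borrowed reference. This sketch reuses the `linked::Store::at` and `contains()` calls that appear in the diff below; the `git_repository::hash::ObjectId` path is an assumed re-export, and error handling is simplified.

```rust
use std::sync::Arc;

// Cloning the Arc per worker is a reference-count bump, not a copy of the
// store or its memory maps.
fn all_exist(
    hashes: Vec<git_repository::hash::ObjectId>,
    repo: &std::path::Path,
) -> anyhow::Result<bool> {
    let odb = Arc::new(git_repository::odb::linked::Store::at(repo)?);
    let workers: Vec<_> = hashes
        .chunks(1000)
        .map(|chunk| {
            let odb = Arc::clone(&odb);
            let chunk = chunk.to_vec();
            std::thread::spawn(move || chunk.iter().all(|id| odb.contains(id)))
        })
        .collect();
    Ok(workers.into_iter().all(|w| w.join().expect("worker not to panic")))
}
```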
--- experiments/object-access/src/main.rs | 70 +++++++++++++++++++++++---- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/experiments/object-access/src/main.rs b/experiments/object-access/src/main.rs index bf774a5b4d2..f4c6d9b7deb 100644 --- a/experiments/object-access/src/main.rs +++ b/experiments/object-access/src/main.rs @@ -1,3 +1,4 @@ +use std::sync::Arc; use std::{path::Path, time::Instant}; use anyhow::anyhow; @@ -21,6 +22,7 @@ fn main() -> anyhow::Result<()> { }; let objs_per_sec = |elapsed: std::time::Duration| hashes.len() as f32 / elapsed.as_secs_f32(); + let start = Instant::now(); do_gitoxide_in_parallel(&hashes, &repo, || odb::pack::cache::Never, AccessMode::ObjectExists)?; let elapsed = start.elapsed(); @@ -32,16 +34,25 @@ fn main() -> anyhow::Result<()> { ); let start = Instant::now(); - let bytes = do_gitoxide_in_parallel( + let bytes = do_gitoxide_in_parallel_through_arc( &hashes, - &repo, - || odb::pack::cache::lru::MemoryCappedHashmap::new(GITOXIDE_CACHED_OBJECT_DATA_PER_THREAD_IN_BYTES), + &repo.odb.dbs[0].loose.path, + odb::pack::cache::Never::default, AccessMode::ObjectData, )?; let elapsed = start.elapsed(); println!( - "parallel gitoxide (cache = {:.0}MB): confirmed {} bytes in {:?} ({:0.0} objects/s)", - GITOXIDE_CACHED_OBJECT_DATA_PER_THREAD_IN_BYTES as f32 / (1024 * 1024) as f32, + "parallel gitoxide (uncached, Arc): confirmed {} bytes in {:?} ({:0.0} objects/s)", + bytes, + elapsed, + objs_per_sec(elapsed) + ); + + let start = Instant::now(); + let bytes = do_gitoxide_in_parallel(&hashes, &repo, odb::pack::cache::Never::default, AccessMode::ObjectData)?; + let elapsed = start.elapsed(); + println!( + "parallel gitoxide (uncached): confirmed {} bytes in {:?} ({:0.0} objects/s)", bytes, elapsed, objs_per_sec(elapsed) @@ -51,22 +62,28 @@ fn main() -> anyhow::Result<()> { let bytes = do_gitoxide_in_parallel( &hashes, &repo, - odb::pack::cache::lru::StaticLinkedList::::default, + || odb::pack::cache::lru::MemoryCappedHashmap::new(GITOXIDE_CACHED_OBJECT_DATA_PER_THREAD_IN_BYTES), AccessMode::ObjectData, )?; let elapsed = start.elapsed(); println!( - "parallel gitoxide (small static cache): confirmed {} bytes in {:?} ({:0.0} objects/s)", + "parallel gitoxide (cache = {:.0}MB): confirmed {} bytes in {:?} ({:0.0} objects/s)", + GITOXIDE_CACHED_OBJECT_DATA_PER_THREAD_IN_BYTES as f32 / (1024 * 1024) as f32, bytes, elapsed, objs_per_sec(elapsed) ); let start = Instant::now(); - let bytes = do_gitoxide_in_parallel(&hashes, &repo, odb::pack::cache::Never::default, AccessMode::ObjectData)?; + let bytes = do_gitoxide_in_parallel( + &hashes, + &repo, + odb::pack::cache::lru::StaticLinkedList::::default, + AccessMode::ObjectData, + )?; let elapsed = start.elapsed(); println!( - "parallel gitoxide (uncached): confirmed {} bytes in {:?} ({:0.0} objects/s)", + "parallel gitoxide (small static cache): confirmed {} bytes in {:?} ({:0.0} objects/s)", bytes, elapsed, objs_per_sec(elapsed) @@ -213,3 +230,38 @@ where Ok(bytes.load(std::sync::atomic::Ordering::Acquire)) } + +fn do_gitoxide_in_parallel_through_arc( + hashes: &[ObjectId], + repo: &Path, + new_cache: impl Fn() -> C + Send + Clone, + mode: AccessMode, +) -> anyhow::Result +where + C: odb::pack::cache::DecodeEntry, +{ + let bytes = std::sync::atomic::AtomicU64::default(); + let odb = Arc::new(git_repository::odb::linked::Store::at(repo)?); + + git_repository::parallel::in_parallel( + hashes.chunks(1000), + None, + move |_| (Vec::new(), new_cache(), odb.clone()), + |hashes, (buf, cache, odb)| { + for hash in 
hashes { + match mode { + AccessMode::ObjectData => { + let obj = odb.find(hash, buf, cache)?; + bytes.fetch_add(obj.data.len() as u64, std::sync::atomic::Ordering::Relaxed); + } + AccessMode::ObjectExists => { + assert!(odb.contains(hash), "each traversed object exists"); + } + } + } + Ok(()) + }, + git_repository::parallel::reduce::IdentityWithResult::<(), anyhow::Error>::default(), + )?; + Ok(bytes.load(std::sync::atomic::Ordering::Acquire)) +} From d40288a65e773ed2bf8f623ca98a5f33632abd6c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 17:17:04 +0800 Subject: [PATCH 36/57] Show that arcs can cost up to 16.5% when having 10mio ops compared to no-arc (#259) However, that can only be caused when doing contains checks, which probably are not usually what one does, and in standard object access operation going through an arc (and even an arc-lock) isn't a problem. Going through a Mutex is slow though, so it's probably better to use read-locks when possible and upgrade them when needed. --- Cargo.lock | 1 + experiments/object-access/Cargo.toml | 1 + experiments/object-access/src/main.rs | 80 +++++++++++++++++++++++++++ 3 files changed, 82 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index bcfcd6226c2..19184079585 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1833,6 +1833,7 @@ dependencies = [ "anyhow", "git-repository", "git2", + "parking_lot", "rayon", ] diff --git a/experiments/object-access/Cargo.toml b/experiments/object-access/Cargo.toml index faa01e2f14d..379c1ef16ec 100644 --- a/experiments/object-access/Cargo.toml +++ b/experiments/object-access/Cargo.toml @@ -12,3 +12,4 @@ anyhow = "1" git-repository = { version ="^0.12.0", path = "../../git-repository", features = ["unstable"] } git2 = "0.13" rayon = "1.5.0" +parking_lot = { version = "0.11.0", default-features = false } diff --git a/experiments/object-access/src/main.rs b/experiments/object-access/src/main.rs index f4c6d9b7deb..9b1b713f331 100644 --- a/experiments/object-access/src/main.rs +++ b/experiments/object-access/src/main.rs @@ -48,6 +48,51 @@ fn main() -> anyhow::Result<()> { objs_per_sec(elapsed) ); + let start = Instant::now(); + do_gitoxide_in_parallel_through_arc( + &hashes, + &repo.odb.dbs[0].loose.path, + odb::pack::cache::Never::default, + AccessMode::ObjectExists, + )?; + let elapsed = start.elapsed(); + println!( + "parallel gitoxide (Arc): confirmed {} objects exists in {:?} ({:0.0} objects/s)", + hashes.len(), + elapsed, + objs_per_sec(elapsed) + ); + + let start = Instant::now(); + let bytes = do_gitoxide_in_parallel_through_arc_rw_lock( + &hashes, + &repo.odb.dbs[0].loose.path, + odb::pack::cache::Never::default, + AccessMode::ObjectData, + )?; + let elapsed = start.elapsed(); + println!( + "parallel gitoxide (uncached, Arc, Lock): confirmed {} bytes in {:?} ({:0.0} objects/s)", + bytes, + elapsed, + objs_per_sec(elapsed) + ); + + let start = Instant::now(); + do_gitoxide_in_parallel_through_arc_rw_lock( + &hashes, + &repo.odb.dbs[0].loose.path, + odb::pack::cache::Never::default, + AccessMode::ObjectExists, + )?; + let elapsed = start.elapsed(); + println!( + "parallel gitoxide (Arc, Lock): confirmed {} objects exists in {:?} ({:0.0} objects/s)", + hashes.len(), + elapsed, + objs_per_sec(elapsed) + ); + let start = Instant::now(); let bytes = do_gitoxide_in_parallel(&hashes, &repo, odb::pack::cache::Never::default, AccessMode::ObjectData)?; let elapsed = start.elapsed(); @@ -265,3 +310,38 @@ where )?; Ok(bytes.load(std::sync::atomic::Ordering::Acquire)) } + +fn 
do_gitoxide_in_parallel_through_arc_rw_lock( + hashes: &[ObjectId], + repo: &Path, + new_cache: impl Fn() -> C + Send + Clone, + mode: AccessMode, +) -> anyhow::Result +where + C: odb::pack::cache::DecodeEntry, +{ + let bytes = std::sync::atomic::AtomicU64::default(); + let odb = Arc::new(parking_lot::RwLock::new(git_repository::odb::linked::Store::at(repo)?)); + + git_repository::parallel::in_parallel( + hashes.chunks(1000), + None, + move |_| (Vec::new(), new_cache(), odb.clone()), + |hashes, (buf, cache, odb)| { + for hash in hashes { + match mode { + AccessMode::ObjectData => { + let obj = odb.read().find(hash, buf, cache)?; + bytes.fetch_add(obj.data.len() as u64, std::sync::atomic::Ordering::Relaxed); + } + AccessMode::ObjectExists => { + assert!(odb.read().contains(hash), "each traversed object exists"); + } + } + } + Ok(()) + }, + git_repository::parallel::reduce::IdentityWithResult::<(), anyhow::Error>::default(), + )?; + Ok(bytes.load(std::sync::atomic::Ordering::Acquire)) +} From 9696383e70818baa92025fa648b9743c17516327 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 17:48:24 +0800 Subject: [PATCH 37/57] RwLocks are preferred over Mutexes if there is a decent read-only path (#259) For this we skip the Eager-portion and go straight to lazy loading, while introducing a flag that should help to keep packs available (as well as indices) indefinitely. I imagine the coordinator of readers (upload-pack) to check for the amount of open handles from time to time and the amount of reuse, and replace it with a new one to clear handles. This also means this must not be implemented as a flag per call, but as a setting on the policy. --- DEVELOPMENT.md | 4 +- experiments/odb-redesign/src/lib.rs | 142 ++++++++++++++-------------- 2 files changed, 74 insertions(+), 72 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 9794e0dca76..0062359f3d1 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -687,8 +687,8 @@ Please note that these are based on the following value system: - Maybe… after some benchmarks, we can entirely drop the single-threaded version if it's not significantly faster on a single thread (without thread primiives) than the same with multi-threading (but a single-thread). - Servers would be split into readers and writers, where… - - …readers share a common pool and thus all maps, with lazy loading and refresh (but their pack-ids change due to that, and packs might disappear) - - …writers when… + - …readers (receive-pack) share a common pool and thus all maps, with lazy loading and refresh (but their pack-ids change due to that, and packs might disappear, which they don't mind) + - …writers (upload-pack) use a lazy loaded repository pool like readers during negotiation, but when… - …cloning use an eagerly loaded Repository just for that particular clone for stable pack ids - …fetches use a lazy-loaded Repository with refresh disabled, and full retries if the pack they were referring to goes away. Maybe there can be a policy for that to keep pack ids stable despite refresh, which would also solve clones which could then lazy-load. 
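The locking recipe this patch settles on, shown in isolation as a small sketch: readers take an upgradable read lock on the fast path and only pay for exclusive access on the rare path that actually mutates shared state. It uses the same `parking_lot` API as the diff below; the cached `db_paths` value is just an example.

```rust
use parking_lot::{RwLock, RwLockUpgradableReadGuard};

fn db_path_count(db_paths: &RwLock<Vec<std::path::PathBuf>>, default: &std::path::Path) -> usize {
    let guard = db_paths.upgradable_read();
    if !guard.is_empty() {
        return guard.len(); // fast path: no exclusive lock taken
    }
    // slow path: upgrade atomically, initialize, then read as usual
    let mut guard = RwLockUpgradableReadGuard::upgrade(guard);
    guard.push(default.to_owned());
    guard.len()
}
```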
diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index a9ffae18a2a..71d1abf7080 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -5,10 +5,16 @@ mod features { use std::sync::Arc; pub type OwnShared = Arc; - pub type Mutable = parking_lot::Mutex; + pub type MutableOnDemand = parking_lot::RwLock; - pub fn get_mut(v: &Mutable) -> parking_lot::MutexGuard<'_, T> { - v.lock() + pub fn get_ref(v: &MutableOnDemand) -> parking_lot::RwLockUpgradableReadGuard<'_, T> { + v.upgradable_read() + } + + pub fn upgrade_ref_to_mut( + v: parking_lot::RwLockUpgradableReadGuard<'_, T>, + ) -> parking_lot::RwLockWriteGuard<'_, T> { + parking_lot::RwLockUpgradableReadGuard::upgrade(v) } } @@ -32,7 +38,7 @@ mod features { mod odb { use crate::features; - use crate::features::get_mut; + use crate::features::{get_ref, upgrade_ref_to_mut}; use crate::odb::policy::{load_indices, PackIndexMarker}; use git_odb::data::Object; use git_odb::pack::bundle::Location; @@ -40,25 +46,18 @@ mod odb { use git_odb::pack::find::Entry; use git_odb::pack::Bundle; use std::io; - use std::ops::DerefMut; use std::path::{Path, PathBuf}; pub mod policy { - pub(crate) enum State { - Eager(eager::State), - } - - pub(crate) mod eager { - use crate::features; - use std::path::PathBuf; + use crate::features; + use std::path::PathBuf; - #[derive(Default)] - pub struct State { - pub(crate) db_paths: Vec, - pub(crate) indices: Vec>, - pub(crate) generation: u8, - } + #[derive(Default)] + pub(crate) struct State { + pub(crate) db_paths: Vec, + pub(crate) indices: Vec>, + pub(crate) generation: u8, } /// A way to indicate which pack indices we have seen already @@ -73,8 +72,11 @@ mod odb { /// Define how packs will be refreshed when all indices are loaded pub enum RefreshMode { - /// Check for new or changed pack indices when the last known index is loaded. - AfterAllIndicesLoaded, + /// Check for new or changed pack indices (and pack data files) when the last known index is loaded. + /// If allow unloading pack data files is true, we will start fresh and previous packs (by their ids) might be lost + /// forever causing problems retrieving data. + /// If it the flag is false, pack files will never be unloaded when they aren't available on disk anymore. + AfterAllIndicesLoaded { allow_unloading_pack_data: bool }, /// Use this if you expect a lot of missing objects that shouldn't trigger refreshes even after all packs are loaded. /// This comes at the risk of not learning that the packs have changed in the mean time. 
             Never,
@@ -107,13 +109,13 @@ mod odb {
     }
 
     pub struct Policy {
-        state: features::Mutable<policy::State>,
+        state: features::MutableOnDemand<policy::State>,
    }
 
     impl Policy {
         fn eager() -> Self {
             Policy {
-                state: features::Mutable::new(policy::State::Eager(policy::eager::State::default())),
+                state: features::MutableOnDemand::new(policy::State::default()),
             }
         }
     }
@@ -127,68 +129,68 @@ mod odb {
             }: load_indices::Options<'_>,
             marker: Option<PackIndexMarker>,
         ) -> std::io::Result<load_indices::Outcome<features::OwnShared<git_pack::Bundle>>> {
-            let mut state = get_mut(&self.state);
-            match state.deref_mut() {
-                policy::State::Eager(state) => {
-                    if state.db_paths.is_empty() {
-                        return Self::refresh(state, objects_directory);
-                    } else {
-                        debug_assert_eq!(
-                            Some(objects_directory),
-                            state.db_paths.get(0).map(|p| p.as_path()),
-                            "Eager policy can't be shared across different repositories"
-                        );
-                    }
+            let state = get_ref(&self.state);
+            if state.db_paths.is_empty() {
+                let mut state = upgrade_ref_to_mut(state);
+                return Self::refresh(&mut state, objects_directory);
+            } else {
+                debug_assert_eq!(
+                    Some(objects_directory),
+                    state.db_paths.get(0).map(|p| p.as_path()),
+                    "Eager policy can't be shared across different repositories"
+                );
+            }
 
-                    Ok(match marker {
-                        Some(marker) => {
-                            if marker.generation != state.generation {
-                                load_indices::Outcome::Replace {
-                                    indices: state.indices.clone(),
-                                    mark: PackIndexMarker {
-                                        generation: state.generation,
-                                        pack_index_sequence: state.indices.len(),
-                                    },
-                                }
-                            } else {
-                                if marker.pack_index_sequence == state.indices.len() {
-                                    match mode {
-                                        policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices,
-                                        policy::RefreshMode::AfterAllIndicesLoaded => {
-                                            return Self::refresh(state, objects_directory)
-                                        }
-                                    }
-                                } else {
-                                    load_indices::Outcome::Extend {
-                                        indices: state.indices[marker.pack_index_sequence..].to_vec(),
-                                        mark: PackIndexMarker {
-                                            generation: state.generation,
-                                            pack_index_sequence: state.indices.len(),
-                                        },
-                                    }
-                                }
-                            }
-                        }
-                        None => load_indices::Outcome::Replace {
+            Ok(match marker {
+                Some(marker) => {
+                    if marker.generation != state.generation {
+                        load_indices::Outcome::Replace {
                             indices: state.indices.clone(),
                             mark: PackIndexMarker {
                                 generation: state.generation,
                                 pack_index_sequence: state.indices.len(),
                             },
-                        },
-                    })
+                        }
+                    } else {
+                        if marker.pack_index_sequence == state.indices.len() {
+                            match mode {
+                                policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices,
+                                policy::RefreshMode::AfterAllIndicesLoaded {
+                                    allow_unloading_pack_data: _,
+                                } => {
+                                    let mut state = upgrade_ref_to_mut(state);
+                                    return Self::refresh(&mut state, objects_directory);
+                                }
+                            }
+                        } else {
+                            load_indices::Outcome::Extend {
+                                indices: state.indices[marker.pack_index_sequence..].to_vec(),
+                                mark: PackIndexMarker {
+                                    generation: state.generation,
+                                    pack_index_sequence: state.indices.len(),
+                                },
+                            }
+                        }
+                    }
                 }
-            }
+                None => load_indices::Outcome::Replace {
+                    indices: state.indices.clone(),
+                    mark: PackIndexMarker {
+                        generation: state.generation,
+                        pack_index_sequence: state.indices.len(),
+                    },
+                },
+            })
         }
 
         /// If there are only additive changes, there is actually no need for a new `generation`, which helps
         /// callers to retain stability.
         fn refresh(
-            policy::eager::State {
+            policy::State {
                 db_paths,
                 indices: bundles,
                 generation,
-            }: &mut policy::eager::State,
+            }: &mut policy::State,
             objects_directory: &Path,
         ) -> io::Result<load_indices::Outcome<features::OwnShared<git_pack::Bundle>>> {
             db_paths.extend(
                 git_odb::alternate::resolve(objects_directory)
                     .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?,
             );
@@ -265,7 +267,7 @@ mod refs {
 
     pub(crate) enum StoreSelection {
         Loose(LooseStore),
-        RefTable(features::Mutable),
+        RefTable(features::MutableOnDemand),
     }
 }

From 4160b4a76b8cc0fb81bd00255eb1fc3bd636edda Mon Sep 17 00:00:00 2001
From: Sebastian Thiel
Date: Tue, 23 Nov 2021 18:14:45 +0800
Subject: [PATCH 38/57] Adjust unload mode to be a policy-wide setting,
 probably along with some more soon (#259)

---
 experiments/odb-redesign/src/lib.rs | 56 ++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 20 deletions(-)

diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs
index 71d1abf7080..1049202fb91 100644
--- a/experiments/odb-redesign/src/lib.rs
+++ b/experiments/odb-redesign/src/lib.rs
@@ -53,11 +53,32 @@ mod odb {
         use crate::features;
         use std::path::PathBuf;
 
+        /// Unloading here means to drop the shared reference to the mapped pack data file.
+        ///
+        /// Indices are always handled like that if their file on disk disappears.
+        pub enum PackDataUnloadMode {
+            /// Keep pack data always available in loaded memory maps even if the underlying data file (and usually index) are gone.
+            /// This means algorithms that keep track of packs, like pack-generators, will always be able to resolve the data they reference.
+            /// This also means, however, that one might run out of system resources at some point, which means the coordinator of such users
+            /// needs to check resource usage against the amount of reuse and replace this instance with a new policy to eventually have the
+            /// memory-mapped packs drop (as references to them disappear once consumers disappear).
+            Never,
+            /// Forget/drop the mapped pack data file when its file on disk disappeared.
+            WhenDiskFileIsMissing,
+        }
+
+        impl Default for PackDataUnloadMode {
+            fn default() -> Self {
+                PackDataUnloadMode::WhenDiskFileIsMissing
+            }
+        }
+
         #[derive(Default)]
         pub(crate) struct State {
             pub(crate) db_paths: Vec<PathBuf>,
             pub(crate) indices: Vec<features::OwnShared<git_pack::Bundle>>,
             pub(crate) generation: u8,
+            pub(crate) allow_unload: bool,
         }
 
         /// A way to indicate which pack indices we have seen already
@@ -73,10 +94,9 @@ mod odb {
         /// Define how packs will be refreshed when all indices are loaded
         pub enum RefreshMode {
             /// Check for new or changed pack indices (and pack data files) when the last known index is loaded.
-            /// If `allow_unloading_pack_data` is true, we will start fresh, and previous packs (by their ids) might be lost
-            /// forever, causing problems when retrieving data.
-            /// If the flag is false, pack files will never be unloaded, even when they aren't available on disk anymore.
-            AfterAllIndicesLoaded { allow_unloading_pack_data: bool },
+            /// Note that this might not yield stable pack data or index ids unless the Policy is set to never actually unload indices.
+            /// The caller cannot know, though.
+            AfterAllIndicesLoaded,
             /// Use this if you expect a lot of missing objects that shouldn't trigger refreshes even after all packs are loaded.
             /// This comes at the risk of not learning that the packs have changed in the meantime.
Never, @@ -108,14 +128,18 @@ mod odb { } } + #[derive(Default)] pub struct Policy { state: features::MutableOnDemand, } impl Policy { - fn eager() -> Self { + fn new(unload_mode: policy::PackDataUnloadMode) -> Self { Policy { - state: features::MutableOnDemand::new(policy::State::default()), + state: features::MutableOnDemand::new(policy::State { + allow_unload: matches!(unload_mode, policy::PackDataUnloadMode::WhenDiskFileIsMissing), + ..policy::State::default() + }), } } } @@ -155,9 +179,7 @@ mod odb { if marker.pack_index_sequence == state.indices.len() { match mode { policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices, - policy::RefreshMode::AfterAllIndicesLoaded { - allow_unloading_pack_data: _, - } => { + policy::RefreshMode::AfterAllIndicesLoaded => { let mut state = upgrade_ref_to_mut(state); return Self::refresh(&mut state, objects_directory); } @@ -188,6 +210,7 @@ mod odb { fn refresh( policy::State { db_paths, + allow_unload: _, indices: bundles, generation, }: &mut policy::State, @@ -202,12 +225,12 @@ mod odb { } /// The store shares a policy and - pub struct PolicyStore { + pub struct Store { policy: features::OwnShared, objects_directory: PathBuf, } - impl git_odb::Find for PolicyStore { + impl git_odb::Find for Store { type Error = git_odb::compound::find::Error; fn try_find<'a>( @@ -233,14 +256,7 @@ mod odb { } fn try_setup() -> anyhow::Result<()> { - let policy = Policy::eager(); - // let policy = Box::new(EagerLocal::default()) - // as Box< - // dyn DynPolicy< - // PackIndex = features::OwnShared, - // PackData = features::OwnShared, - // >, - // >; + let policy = Policy::default(); Ok(()) } } @@ -301,6 +317,6 @@ mod repository { /// Maybe we will end up providing a generic version as there still seems to be benefits in having a read-only Store implementation. pub struct Repository { - odb: odb::PolicyStore, + odb: odb::Store, } } From 3d533a1d4aedd2ac1b543c1701749391e0e069b1 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 19:18:11 +0800 Subject: [PATCH 39/57] Fix single-threaded version of design-sketch (#259) --- experiments/odb-redesign/src/lib.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 1049202fb91..f23db3438a4 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -23,11 +23,15 @@ mod features { use std::rc::Rc; pub type OwnShared = Rc; - pub type Mutable = RefCell; + pub type MutableOnDemand = RefCell; - pub fn get_mut(v: &Mutable) -> RefMut<'_, T> { + pub fn get_ref(v: &RefCell) -> RefMut<'_, T> { v.borrow_mut() } + + pub fn upgrade_ref_to_mut(v: RefMut<'_, T>) -> RefMut<'_, T> { + v + } } #[cfg(not(feature = "thread-safe"))] From f90207dc8fb2d8c773389c217baac0c2a733d3b5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Tue, 23 Nov 2021 19:52:36 +0800 Subject: [PATCH 40/57] More work towards understanding indices and multi-pack indices for eventual pack lookup (#259) --- DEVELOPMENT.md | 4 +++ experiments/odb-redesign/src/lib.rs | 52 ++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 0062359f3d1..87a778c5a8f 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -597,6 +597,10 @@ Please note that these are based on the following value system: best ported to the loose ref database as well. That way, there would be one API governing both, unifying sharing on the `repo()`. 
   - Ref databases could have methods like `find_in_namespace()` along with the current ones, whereas the current ones delegate to the ones with namespace
     which may be `None`, to accommodate for the fact that most won't use namespaces.
+  - Use this opportunity to implement a general `Store` for later use with ref-table, and for immediate use in the `Easy` state. The packed-buffer along with the stat logic
+    should definitely be placed in there, too, I think, as technically without a maintained packed buffer the whole loose ref DB is quite useless AND the `Store` API won't be
+    equivalent between different implementations. In other words, hide the implementation detail that the packed-ref buffer is. This would also give it its own buffer for logs, but that's
+    quite alright; after all, these are quite separate from object buffers.
 
 2. **Parameterize some sort of Policy into linked::ODB/compound::ODB**
     - First off, this needs an experiment to try it out quickly.
diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs
index f23db3438a4..e73f3014fd0 100644
--- a/experiments/odb-redesign/src/lib.rs
+++ b/experiments/odb-redesign/src/lib.rs
@@ -55,8 +55,34 @@ mod odb {
 
     pub mod policy {
         use crate::features;
+        use crate::odb::policy;
         use std::path::PathBuf;
 
+        mod index_file {
+            pub enum State {
+                Index { id: usize, file: git_pack::index::File },
+                MultipackIndex { id: usize },
+            }
+        }
+
+        pub(crate) struct IndexForObjectInPack {
+            /// The internal identifier of the pack itself
+            pack_id: u32,
+            /// The index of the object within the pack
+            object_index: u32,
+            /// The id of the corresponding index file, in case of a multi-pack index.
+            /// This could probably be packed into a u64 with a bitmap, but that's probably not useful.
+            multipack_index_id: Option<usize>,
+        }
+
+        pub(crate) struct IndexFile {
+            inner: index_file::State,
+        }
+
+        impl IndexFile {
+            // TODO: typical access, like by Oid, yielding a pack id and object index for this pack
+        }
+
         /// Unloading here means to drop the shared reference to the mapped pack data file.
         ///
         /// Indices are always handled like that if their file on disk disappears.
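To make the lookup that `IndexForObjectInPack` enables more concrete, here is a hedged, self-contained toy model of the two-step access pattern: resolve an object id to a `(pack id, index within pack)` pair via the loaded index files first, and only then resolve the pack data file by its id. Every type below is an illustrative stand-in, not gitoxide's actual API.

```rust
use std::collections::BTreeMap;

// Stand-in for a 20-byte object id.
type ObjectId = [u8; 20];

// What an index lookup yields: enough to find the object's data later.
#[derive(Debug, Clone, Copy, PartialEq)]
struct IndexForObjectInPack {
    pack_id: u32,              // identifies the pack data file to load
    object_index_in_pack: u32, // the object's slot within that pack
}

// One lookup table per index file, mapping oids to pack-relative indices.
struct IndexFile {
    id: u32,
    entries: BTreeMap<ObjectId, u32>,
}

impl IndexFile {
    fn lookup(&self, oid: &ObjectId) -> Option<IndexForObjectInPack> {
        self.entries.get(oid).map(|&object_index_in_pack| IndexForObjectInPack {
            pack_id: self.id,
            object_index_in_pack,
        })
    }
}

// Searching consults indices only; the (possibly not yet mapped) pack data
// file is loaded in a separate second step, keyed by `pack_id`.
fn find(indices: &[IndexFile], oid: &ObjectId) -> Option<IndexForObjectInPack> {
    indices.iter().find_map(|index| index.lookup(oid))
}
```

The important property is that a pack id returned here stays meaningful even if the pack data is loaded much later, which is exactly what the following patches work to guarantee.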
@@ -80,7 +106,7 @@ mod odb { #[derive(Default)] pub(crate) struct State { pub(crate) db_paths: Vec, - pub(crate) indices: Vec>, + pub(crate) loaded_indices: Vec>, pub(crate) generation: u8, pub(crate) allow_unload: bool, } @@ -149,14 +175,14 @@ mod odb { } impl Policy { - pub fn load_next_indices( + pub(crate) fn load_next_indices( &self, load_indices::Options { objects_directory, mode, }: load_indices::Options<'_>, marker: Option, - ) -> std::io::Result>> { + ) -> std::io::Result>> { let state = get_ref(&self.state); if state.db_paths.is_empty() { let mut state = upgrade_ref_to_mut(state); @@ -173,14 +199,14 @@ mod odb { Some(marker) => { if marker.generation != state.generation { load_indices::Outcome::Replace { - indices: state.indices.clone(), + indices: state.loaded_indices.clone(), mark: PackIndexMarker { generation: state.generation, - pack_index_sequence: state.indices.len(), + pack_index_sequence: state.loaded_indices.len(), }, } } else { - if marker.pack_index_sequence == state.indices.len() { + if marker.pack_index_sequence == state.loaded_indices.len() { match mode { policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices, policy::RefreshMode::AfterAllIndicesLoaded => { @@ -190,20 +216,20 @@ mod odb { } } else { load_indices::Outcome::Extend { - indices: state.indices[marker.pack_index_sequence..].to_vec(), + indices: state.loaded_indices[marker.pack_index_sequence..].to_vec(), mark: PackIndexMarker { generation: state.generation, - pack_index_sequence: state.indices.len(), + pack_index_sequence: state.loaded_indices.len(), }, } } } } None => load_indices::Outcome::Replace { - indices: state.indices.clone(), + indices: state.loaded_indices.clone(), mark: PackIndexMarker { generation: state.generation, - pack_index_sequence: state.indices.len(), + pack_index_sequence: state.loaded_indices.len(), }, }, }) @@ -215,11 +241,11 @@ mod odb { policy::State { db_paths, allow_unload: _, - indices: bundles, + loaded_indices: bundles, generation, }: &mut policy::State, objects_directory: &Path, - ) -> io::Result>> { + ) -> io::Result>> { db_paths.extend( git_odb::alternate::resolve(objects_directory) .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?, @@ -243,6 +269,8 @@ mod odb { buffer: &'a mut Vec, pack_cache: &mut impl DecodeEntry, ) -> Result>, Self::Error> { + // TODO: if the generation changes, we need to clear the pack-cache as it depends on pack-ids. + // Can we simplify this so it's more obvious what generation does? 
todo!() } From 3fce8f2b35ec6c2076f66fdde16a5f99a68326ac Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 24 Nov 2021 08:31:04 +0800 Subject: [PATCH 41/57] sketch a little more how packs could be accessed (#259) --- experiments/odb-redesign/src/lib.rs | 33 +++++++++++++++++++++-------- git-pack/src/bundle/mod.rs | 1 + git-pack/src/data/mod.rs | 1 + 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index e73f3014fd0..66a071500d2 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -56,31 +56,42 @@ mod odb { use crate::features; use crate::odb::policy; + use git_hash::oid; use std::path::PathBuf; mod index_file { pub enum State { - Index { id: usize, file: git_pack::index::File }, + Index { id: u32, file: git_pack::index::File }, MultipackIndex { id: usize }, } } pub(crate) struct IndexForObjectInPack { - /// The internal identifier of the pack itself - pack_id: u32, + /// The internal identifier of the pack itself, which either is referred to by an index or a multi-pack index. + pack_id: PackId, /// The index of the object within the pack - object_index: u32, - /// The id of the corresponding index file, in case of a multi-pack index - /// This could probably be packed into a u64 with a bitmap, but probably not useful. - multipack_index_id: Option, + object_index_in_pack: u32, } + /// A way to load and refer to a pack uniquely. + pub struct PackId(u32); + pub(crate) struct IndexFile { - inner: index_file::State, + state: index_file::State, } impl IndexFile { - // TODO: typical access, like by Oid, yielding a pack id and object index for this pack + fn lookup(&self, object_id: &oid) -> Option { + match &self.state { + index_file::State::Index { id, file } => { + file.lookup(object_id).map(|object_index_in_pack| IndexForObjectInPack { + pack_id: PackId(*id), + object_index_in_pack, + }) + } + index_file::State::MultipackIndex { .. } => todo!("required with multi-pack index"), + } + } } /// Unloading here means to drop the shared reference to the mapped pack data file. @@ -175,6 +186,9 @@ mod odb { } impl Policy { + pub(crate) fn load_pack(id: policy::PackId) -> std::io::Result> { + todo!("find in our own cache or load it, which needs a way to find their path by id") + } pub(crate) fn load_next_indices( &self, load_indices::Options { @@ -278,6 +292,7 @@ mod odb { todo!() } + // TODO: turn this into a pack-id fn bundle_by_pack_id(&self, pack_id: u32) -> Option<&Bundle> { todo!() } diff --git a/git-pack/src/bundle/mod.rs b/git-pack/src/bundle/mod.rs index 12bd24f1500..2c169ddfa69 100644 --- a/git-pack/src/bundle/mod.rs +++ b/git-pack/src/bundle/mod.rs @@ -1,4 +1,5 @@ /// A way to uniquely identify the location of an object within a pack bundle +// TODO: this should move to become a pack location, it has nothing to do with an index, it's all about the pack data file #[derive(PartialEq, Eq, Debug, Hash, Ord, PartialOrd, Clone)] #[cfg_attr(feature = "serde1", derive(serde::Serialize, serde::Deserialize))] pub struct Location { diff --git a/git-pack/src/data/mod.rs b/git-pack/src/data/mod.rs index b468288c4c3..56cbab87796 100644 --- a/git-pack/src/data/mod.rs +++ b/git-pack/src/data/mod.rs @@ -71,6 +71,7 @@ pub struct File { /// /// Note that `path` might not be canonicalized, thus different hashes might actually refer to the same pack on disk. This will /// only lead to less efficient cache usage. 
+ /// TODO: remove this intrinsic id as it will henceforth be handled separately by the object store pub id: u32, version: Version, num_objects: u32, From babfb7b3e18beaa5866905b3a4e227a371defb83 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 24 Nov 2021 09:00:29 +0800 Subject: [PATCH 42/57] figure out how pack-ids can be stable even though multi-packs are still unclear (#259) --- experiments/odb-redesign/src/lib.rs | 100 +++++++++++++++++++--------- 1 file changed, 67 insertions(+), 33 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 66a071500d2..58ed99253f8 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -61,8 +61,15 @@ mod odb { mod index_file { pub enum State { - Index { id: u32, file: git_pack::index::File }, - MultipackIndex { id: usize }, + Index { + /// The identifier for the index, matching its location in a lookup table for the paths to the files so that it can + /// be used to load pack data files on the fly. + id: u32, + file: git_pack::index::File, + }, + MultipackIndex { + id: usize, + }, } } @@ -73,8 +80,11 @@ mod odb { object_index_in_pack: u32, } - /// A way to load and refer to a pack uniquely. - pub struct PackId(u32); + /// A way to load and refer to a pack uniquely, namespaced by their indexing mechanism, aka multi-pack or not. + pub struct PackId { + id: u32, + in_multi_pack: bool, + } pub(crate) struct IndexFile { state: index_file::State, @@ -85,7 +95,10 @@ mod odb { match &self.state { index_file::State::Index { id, file } => { file.lookup(object_id).map(|object_index_in_pack| IndexForObjectInPack { - pack_id: PackId(*id), + pack_id: PackId { + id: *id, + in_multi_pack: false, + }, object_index_in_pack, }) } @@ -94,20 +107,6 @@ mod odb { } } - /// Unloading here means to drop the shared reference to the mapped pack data file. - /// - /// Indices are always handled like that if their file on disk disappears. - pub enum PackDataUnloadMode { - /// Keep pack data always available in loaded memory maps even if the underlying data file (and usually index) are gone. - /// This means algorithms that keep track of packs like pack-generators will always be able to resolve the data they reference. - /// This also means, however, that one might run out of system resources some time, which means the coordinator of such users - /// needs to check resource usage vs amount of uses and replace this instance with a new policy to eventually have the memory - /// mapped packs drop (as references to them disappear once consumers disappear) - Never, - /// Forget/drop the mapped pack data file when its file on disk disappeared. - WhenDiskFileIsMissing, - } - impl Default for PackDataUnloadMode { fn default() -> Self { PackDataUnloadMode::WhenDiskFileIsMissing @@ -117,7 +116,7 @@ mod odb { #[derive(Default)] pub(crate) struct State { pub(crate) db_paths: Vec, - pub(crate) loaded_indices: Vec>, + pub(crate) loaded_indices: Vec>>, pub(crate) generation: u8, pub(crate) allow_unload: bool, } @@ -126,25 +125,44 @@ mod odb { pub struct PackIndexMarker { /// The generation the marker belongs to, is incremented to indicate that there were changes that caused the removal of a /// pack and require the caller to rebuild their cache to free resources. - pub generation: u8, + pub(crate) generation: u8, /// An ever increasing number within a generation indicating the number of loaded pack indices. It's reset with /// each new generation. 
-            pub pack_index_sequence: usize,
+            pub(crate) pack_index_sequence: usize,
         }
 
-        /// Define how packs will be refreshed when all indices are loaded
+        /// Define how packs will be refreshed when all indices are loaded, which is useful if a lot of objects are missing.
         pub enum RefreshMode {
             /// Check for new or changed pack indices (and pack data files) when the last known index is loaded.
-            /// Note that this might not yield stable pack data or index ids unless the Policy is set to never actually unload indices.
-            /// The caller cannot know, though.
+            /// During runtime we will keep pack indices stable by never reusing them; however, there is the option for
+            /// clearing internal caches, which is likely to change pack ids, and it will trigger unloading of packs as they are missing on disk.
             AfterAllIndicesLoaded,
             /// Use this if you expect a lot of missing objects that shouldn't trigger refreshes even after all packs are loaded.
             /// This comes at the risk of not learning that the packs have changed in the meantime.
             Never,
         }
 
+        /// Unloading here means to drop the shared reference to the mapped pack data file.
+        ///
+        /// Indices are always handled like that if their file on disk disappears, as objects usually never disappear unless they are unreachable,
+        /// meaning that new indices always contain the old objects in some way.
+        pub enum PackDataUnloadMode {
+            /// Keep pack data always available in loaded memory maps even if the underlying data file (and usually index) are gone.
+            /// This means algorithms that keep track of packs, like pack-generators, will always be able to resolve the data they reference.
+            /// This also means, however, that one might run out of system resources at some point, which means the coordinator of such users
+            /// needs to check resource usage against the amount of reuse and replace this instance with a new policy to eventually have the
+            /// memory-mapped packs drop (as references to them disappear once consumers disappear).
+            /// We will also not ask Store handles to remove their pack data references.
+            /// We will never rebuild our internal data structures, to keep pack ids unique indefinitely (i.e. we won't reuse a pack id with a different pack).
+            Never,
+            /// Forget/drop the mapped pack data file when its file on disk disappeared, and ask store handles to remove their pack data references
+            /// so they can go out of scope.
+            /// We are allowed to rebuild our internal data structures to save on memory, but this invalidates all previous pack ids.
+            WhenDiskFileIsMissing,
+        }
+
         pub mod load_indices {
-            use crate::odb::policy::{PackIndexMarker, RefreshMode};
+            use crate::odb::policy::{PackId, PackIndexMarker, RefreshMode};
             use std::path::Path;
 
             pub struct Options<'a> {
@@ -153,15 +171,16 @@ mod odb {
             }
 
             pub enum Outcome<IndexRef> {
-                /// Replace your set with the given one
+                /// Replace your set with the given one and drop all pack data handles.
                 Replace {
                     indices: Vec<IndexRef>,
                     mark: PackIndexMarker,
                 },
-                /// Extend with the given indices and keep searching
+                /// Extend with the given indices and keep searching.
                 Extend {
-                    indices: Vec<IndexRef>, // should probably be small vec to get around most allocations
-                    mark: PackIndexMarker,  // use to show where you left off next time you call
+                    indices: Vec<IndexRef>,  // should probably be small vec to get around most allocations
+                    drop_packs: Vec<PackId>, // which packs to forget about because they were removed from disk.
+ mark: PackIndexMarker, // use to show where you left off next time you call }, /// No new indices to look at, caller should stop give up NoMoreIndices, @@ -213,7 +232,12 @@ mod odb { Some(marker) => { if marker.generation != state.generation { load_indices::Outcome::Replace { - indices: state.loaded_indices.clone(), + indices: state + .loaded_indices + .iter() + .filter_map(Option::as_ref) + .cloned() + .collect(), mark: PackIndexMarker { generation: state.generation, pack_index_sequence: state.loaded_indices.len(), @@ -230,7 +254,12 @@ mod odb { } } else { load_indices::Outcome::Extend { - indices: state.loaded_indices[marker.pack_index_sequence..].to_vec(), + indices: state.loaded_indices[marker.pack_index_sequence..] + .iter() + .filter_map(Option::as_ref) + .cloned() + .collect(), + drop_packs: Vec::new(), mark: PackIndexMarker { generation: state.generation, pack_index_sequence: state.loaded_indices.len(), @@ -240,7 +269,12 @@ mod odb { } } None => load_indices::Outcome::Replace { - indices: state.loaded_indices.clone(), + indices: state + .loaded_indices + .iter() + .filter_map(Option::as_ref) + .cloned() + .collect(), mark: PackIndexMarker { generation: state.generation, pack_index_sequence: state.loaded_indices.len(), From fae3e7c40497c62cadb1738fa63cda2a9fbe639e Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 24 Nov 2021 09:20:38 +0800 Subject: [PATCH 43/57] A better idea on how multi-pack indices (and changes to them) work (#259) We keep them like a normal index, which may mean we have HAD multiple of them. Thus we can't make it special, I think, but keep all in the same index-providing array. --- experiments/odb-redesign/src/lib.rs | 30 ++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 58ed99253f8..6e918dd2775 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -61,15 +61,8 @@ mod odb { mod index_file { pub enum State { - Index { - /// The identifier for the index, matching its location in a lookup table for the paths to the files so that it can - /// be used to load pack data files on the fly. - id: u32, - file: git_pack::index::File, - }, - MultipackIndex { - id: usize, - }, + Single(git_pack::index::File), + Multi(()), } } @@ -80,29 +73,33 @@ mod odb { object_index_in_pack: u32, } + /// An id to refer to an index file or a multipack index file + pub struct IndexId(u32); + /// A way to load and refer to a pack uniquely, namespaced by their indexing mechanism, aka multi-pack or not. pub struct PackId { id: u32, - in_multi_pack: bool, + multipack_index: Option, } pub(crate) struct IndexFile { state: index_file::State, + id: IndexId, } impl IndexFile { fn lookup(&self, object_id: &oid) -> Option { match &self.state { - index_file::State::Index { id, file } => { + index_file::State::Single(file) => { file.lookup(object_id).map(|object_index_in_pack| IndexForObjectInPack { pack_id: PackId { - id: *id, - in_multi_pack: false, + id: self.id.0, + multipack_index: None, }, object_index_in_pack, }) } - index_file::State::MultipackIndex { .. 
} => todo!("required with multi-pack index"), + index_file::State::Multi(()) => todo!("required with multi-pack index"), } } } @@ -117,6 +114,7 @@ mod odb { pub(crate) struct State { pub(crate) db_paths: Vec, pub(crate) loaded_indices: Vec>>, + pub(crate) multi_index: Option>, pub(crate) generation: u8, pub(crate) allow_unload: bool, } @@ -147,7 +145,8 @@ mod odb { /// Indices are always handled like that if their file on disk disappears as objects usually never disappear unless they are unreachable, /// meaning that new indices always contain the old objects in some way. pub enum PackDataUnloadMode { - /// Keep pack data always available in loaded memory maps even if the underlying data file (and usually index) are gone. + /// Keep pack data (and multi-pack index-to-pack lookup tables) always available in loaded memory maps even + /// if the underlying data file (and usually index) are gone. /// This means algorithms that keep track of packs like pack-generators will always be able to resolve the data they reference. /// This also means, however, that one might run out of system resources some time, which means the coordinator of such users /// needs to check resource usage vs amount of uses and replace this instance with a new policy to eventually have the memory @@ -289,6 +288,7 @@ mod odb { policy::State { db_paths, allow_unload: _, + multi_index: _, loaded_indices: bundles, generation, }: &mut policy::State, From 5124abfb62545d3eb4640f9042412985daf4df57 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Wed, 24 Nov 2021 18:17:37 +0800 Subject: [PATCH 44/57] =?UTF-8?q?A=20big=20step=20towards=20getting=20IDs?= =?UTF-8?q?=20straight,=20but=E2=80=A6=20(#259)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …pack loading states need some more detail to be able to decide what to do. --- experiments/odb-redesign/src/lib.rs | 98 +++++++++++++++++++---------- 1 file changed, 64 insertions(+), 34 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 6e918dd2775..1fcff55a130 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -53,14 +53,14 @@ mod odb { use std::path::{Path, PathBuf}; pub mod policy { - use crate::features; use crate::odb::policy; use git_hash::oid; + use std::collections::BTreeMap; use std::path::PathBuf; mod index_file { - pub enum State { + pub enum SingleOrMulti { Single(git_pack::index::File), Multi(()), } @@ -78,28 +78,30 @@ mod odb { /// A way to load and refer to a pack uniquely, namespaced by their indexing mechanism, aka multi-pack or not. pub struct PackId { - id: u32, - multipack_index: Option, + /// Note that if `multipack_index = None`, this id is corresponding to the index id. + /// If it is a multipack index, `id` is the id / offset of the pack in the `multipack_index`. 
+ pub(crate) index: u32, + pub(crate) multipack_index: Option, } pub(crate) struct IndexFile { - state: index_file::State, - id: IndexId, + file: index_file::SingleOrMulti, + pub id: IndexId, } impl IndexFile { fn lookup(&self, object_id: &oid) -> Option { - match &self.state { - index_file::State::Single(file) => { + match &self.file { + index_file::SingleOrMulti::Single(file) => { file.lookup(object_id).map(|object_index_in_pack| IndexForObjectInPack { pack_id: PackId { - id: self.id.0, + index: self.id.0, multipack_index: None, }, object_index_in_pack, }) } - index_file::State::Multi(()) => todo!("required with multi-pack index"), + index_file::SingleOrMulti::Multi(()) => todo!("required with multi-pack index"), } } } @@ -110,22 +112,29 @@ mod odb { } } + type PackDataFiles = Vec>>; + #[derive(Default)] pub(crate) struct State { pub(crate) db_paths: Vec, pub(crate) loaded_indices: Vec>>, - pub(crate) multi_index: Option>, + pub(crate) loaded_packs: PackDataFiles, + /// Each change in the multi-pack index creates a new index entry here and typically drops all knowledge about its removed packs. + /// We do this because there can be old pack ids around that refer to the old (now deleted) multi-pack index along with a possibly + /// now invalid (or ambiguous) local pack id, i.e. pack A was meant, but now that's pack B because the multi-index has changed. + pub(crate) loaded_packs_by_multi_index: BTreeMap, + /// Generations are incremented whenever we decide to clear out our vectors if they are too big and contains too many empty slots. + /// If we are not allowed to unload files, the generation will never be changed. pub(crate) generation: u8, pub(crate) allow_unload: bool, } /// A way to indicate which pack indices we have seen already pub struct PackIndexMarker { - /// The generation the marker belongs to, is incremented to indicate that there were changes that caused the removal of a - /// pack and require the caller to rebuild their cache to free resources. + /// The generation the `pack_index_sequence` belongs to. Indices of different generations are completely incompatible. pub(crate) generation: u8, - /// An ever increasing number within a generation indicating the number of loaded pack indices. It's reset with - /// each new generation. + /// An ever increasing number within a generation indicating the maximum number of loaded pack indices and + /// the amount of slots we have for indices and packs. pub(crate) pack_index_sequence: usize, } @@ -161,27 +170,32 @@ mod odb { } pub mod load_indices { - use crate::odb::policy::{PackId, PackIndexMarker, RefreshMode}; + use crate::features; + use crate::odb::policy; + use crate::odb::policy::{IndexId, PackId, PackIndexMarker, RefreshMode}; use std::path::Path; - pub struct Options<'a> { + pub(crate) struct Options<'a> { pub objects_directory: &'a Path, - pub mode: RefreshMode, + pub refresh_mode: RefreshMode, } - pub enum Outcome { - /// Replace your set with the given one and drop all packs data handles. + pub(crate) enum Outcome { + /// Drop all data and fully replace it with `indices`. + /// This happens if we have witnessed a generational change invalidating all of our ids and causing currently loaded + /// indices and maps to be dropped. Replace { - indices: Vec, - mark: PackIndexMarker, + indices: Vec>, // should probably be small vec to get around most allocations + mark: PackIndexMarker, // use to show where the caller left off last time }, - /// Extend with the given indices and keep searching. 
+ /// Extend with the given indices and keep searching, while dropping invalidated/unloaded ids. Extend { - indices: Vec, // should probably be small vec to get around most allocations - drop_packs: Vec, // which packs to forget about because they were removed from disk. - mark: PackIndexMarker, // use to show where you left off next time you call + drop_packs: Vec, // which packs to forget about because they were removed from disk. + drop_indices: Vec, // which packs to forget about because they were removed from disk. + indices: Vec>, // should probably be small vec to get around most allocations + mark: PackIndexMarker, // use to show where the caller left off last time }, - /// No new indices to look at, caller should stop give up + /// No new indices to look at, caller should give up NoMoreIndices, } } @@ -204,17 +218,31 @@ mod odb { } impl Policy { - pub(crate) fn load_pack(id: policy::PackId) -> std::io::Result> { - todo!("find in our own cache or load it, which needs a way to find their path by id") + /// If Ok(None) is returned, the pack-id was stale and referred to an unloaded pack. + /// If pack-ids are kept long enough for this to happen or things are racy, the store policy must be changed to never unload packs + /// along with regular cleanup to not run out of handles while getting some reuse, or if the oid is known, just refresh indices + /// and redo the entire lookup for a valid pack id whose pack can probably be loaded next time. + pub(crate) fn load_pack( + &self, + id: policy::PackId, + ) -> std::io::Result>> { + match id.multipack_index { + None => { + let state = get_ref(&self.state); + // state.loaded_packs.get(id.index).map(|p| p.clone()).flatten() + todo!() + } + Some(multipack_id) => todo!("load from given multipack which needs additional lookup"), + } } pub(crate) fn load_next_indices( &self, load_indices::Options { objects_directory, - mode, + refresh_mode, }: load_indices::Options<'_>, marker: Option, - ) -> std::io::Result>> { + ) -> std::io::Result { let state = get_ref(&self.state); if state.db_paths.is_empty() { let mut state = upgrade_ref_to_mut(state); @@ -244,7 +272,7 @@ mod odb { } } else { if marker.pack_index_sequence == state.loaded_indices.len() { - match mode { + match refresh_mode { policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices, policy::RefreshMode::AfterAllIndicesLoaded => { let mut state = upgrade_ref_to_mut(state); @@ -259,6 +287,7 @@ mod odb { .cloned() .collect(), drop_packs: Vec::new(), + drop_indices: Vec::new(), mark: PackIndexMarker { generation: state.generation, pack_index_sequence: state.loaded_indices.len(), @@ -288,12 +317,13 @@ mod odb { policy::State { db_paths, allow_unload: _, - multi_index: _, loaded_indices: bundles, + loaded_packs: _, + loaded_packs_by_multi_index: _, generation, }: &mut policy::State, objects_directory: &Path, - ) -> io::Result>> { + ) -> io::Result { db_paths.extend( git_odb::alternate::resolve(objects_directory) .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?, From 5180f1185495c89ec0c43bcb3e16bfd572b1325a Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 08:57:30 +0800 Subject: [PATCH 45/57] Flesh out Sync requirement for Repository (#259) --- Cargo.lock | 1 + DEVELOPMENT.md | 1 + experiments/odb-redesign/Cargo.toml | 1 + experiments/odb-redesign/src/lib.rs | 89 +++++++++++++++++++++++------ 4 files changed, 73 insertions(+), 19 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 19184079585..6755a823413 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ 
-1845,6 +1845,7 @@ dependencies = [ "git-hash", "git-odb", "git-pack", + "git-ref", "parking_lot", "thiserror", ] diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index 87a778c5a8f..80d147b1170 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -696,3 +696,4 @@ Please note that these are based on the following value system: - …cloning use an eagerly loaded Repository just for that particular clone for stable pack ids - …fetches use a lazy-loaded Repository with refresh disabled, and full retries if the pack they were referring to goes away. Maybe there can be a policy for that to keep pack ids stable despite refresh, which would also solve clones which could then lazy-load. + - `Repository` must remain `Sync`. diff --git a/experiments/odb-redesign/Cargo.toml b/experiments/odb-redesign/Cargo.toml index 9b212fdb8dc..f04615c380b 100644 --- a/experiments/odb-redesign/Cargo.toml +++ b/experiments/odb-redesign/Cargo.toml @@ -13,6 +13,7 @@ thread-safe = [] git-pack = { path = "../../git-pack", version = "*" } git-odb = { path = "../../git-odb", version = "*" } git-hash = { path = "../../git-hash", version ="^0.8.0" } +git-ref = { path = "../../git-ref", version ="^0.9.1" } parking_lot = { version = "0.11.0", default-features = false } thiserror = "1.0.30" anyhow = "1.0.47" diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 1fcff55a130..c69c119a078 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -68,7 +68,7 @@ mod odb { pub(crate) struct IndexForObjectInPack { /// The internal identifier of the pack itself, which either is referred to by an index or a multi-pack index. - pack_id: PackId, + pub(crate) pack_id: PackId, /// The index of the object within the pack object_index_in_pack: u32, } @@ -201,23 +201,22 @@ mod odb { } } - #[derive(Default)] - pub struct Policy { + pub struct Store { state: features::MutableOnDemand, } - impl Policy { - fn new(unload_mode: policy::PackDataUnloadMode) -> Self { - Policy { + impl Store { + pub fn new(unload_mode: policy::PackDataUnloadMode) -> features::OwnShared { + features::OwnShared::new(Store { state: features::MutableOnDemand::new(policy::State { allow_unload: matches!(unload_mode, policy::PackDataUnloadMode::WhenDiskFileIsMissing), ..policy::State::default() }), - } + }) } } - impl Policy { + impl Store { /// If Ok(None) is returned, the pack-id was stale and referred to an unloaded pack. 
/// If pack-ids are kept long enough for this to happen or things are racy, the store policy must be changed to never unload packs /// along with regular cleanup to not run out of handles while getting some reuse, or if the oid is known, just refresh indices @@ -332,13 +331,14 @@ mod odb { } } - /// The store shares a policy and - pub struct Store { - policy: features::OwnShared, + /// The store shares a policy and keeps a couple of thread-local configuration + pub struct Handle { + store: features::OwnShared, objects_directory: PathBuf, + refresh_mode: policy::RefreshMode, } - impl git_odb::Find for Store { + impl git_odb::Find for Handle { type Error = git_odb::compound::find::Error; fn try_find<'a>( @@ -367,14 +367,14 @@ mod odb { } fn try_setup() -> anyhow::Result<()> { - let policy = Policy::default(); + let policy = Store::new(policy::PackDataUnloadMode::Never); Ok(()) } } mod refs { use crate::features; - use std::path::PathBuf; + use std::path::{Path, PathBuf}; pub struct LooseStore { path: PathBuf, @@ -398,10 +398,41 @@ mod refs { } } + pub struct Store { + inner: inner::StoreSelection, + packed_refs: features::MutableOnDemand>, + // And of course some more information to track if packed buffer is fresh + } + + impl Store { + pub fn new(path: impl AsRef) -> features::OwnShared { + features::OwnShared::new(Store { + inner: inner::StoreSelection::Loose(LooseStore { + path: path.as_ref().to_owned(), + reflog_mode: 0, + }), + packed_refs: features::MutableOnDemand::new(None), + }) + } + pub fn to_handle(self: &features::OwnShared) -> Handle { + Handle { + store: self.clone(), + namespace: None, + } + } + + pub fn to_namespaced_handle(self: &features::OwnShared, namespace: git_ref::Namespace) -> Handle { + Handle { + store: self.clone(), + namespace: namespace.into(), + } + } + } + #[derive(Clone)] - struct Store { - inner: features::OwnShared, - namespace: u32, + pub struct Handle { + store: features::OwnShared, + namespace: Option, } // impl common interface but check how all this works with iterators, there is some implementation around that already @@ -409,7 +440,7 @@ mod refs { } mod repository { - use crate::odb; + use crate::{features, odb, refs}; mod raw { use git_pack::Find; @@ -427,7 +458,27 @@ mod repository { } /// Maybe we will end up providing a generic version as there still seems to be benefits in having a read-only Store implementation. + /// MUST BE `Sync` + #[derive(Clone)] pub struct Repository { - odb: odb::Store, + odb: features::OwnShared, + refs: features::OwnShared, + } + + #[cfg(test)] + #[cfg(feature = "thread-safe")] + mod tests { + use super::*; + + #[test] + fn is_sync_and_send() { + fn spawn(_v: T) {} + let repository = Repository { + odb: odb::Store::new(odb::policy::PackDataUnloadMode::WhenDiskFileIsMissing), + refs: refs::Store::new("./foo"), + }; + spawn(&repository); + spawn(repository); + } } } From 70dc445c895eaa26f06e49c8378f29dba8c7b3e7 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 11:25:31 +0800 Subject: [PATCH 46/57] Make handles tracked to allow automatic handling of pack-unloading (#259) This enables a single repository instance to be used for pack receivers and pack writers. 
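Before the diff, a hedged sketch of how a caller could drive the `load_next_indices()` contract built up in the previous patches: search the indices you already have, then ask the shared store for more using the marker, and either extend your snapshot or replace it wholesale on a generation change. `Marker` and `Outcome` are simplified stand-ins, not the experiment's real types.

```rust
// Simplified stand-ins; indices are reduced to plain numbers.
struct Marker {
    generation: u8,
    pack_index_sequence: usize,
}

enum Outcome {
    Replace { indices: Vec<u32>, mark: Marker },
    Extend { indices: Vec<u32>, mark: Marker },
    NoMoreIndices,
}

// Stand-in for the store call; a real implementation consults shared state
// and may trigger a refresh from disk.
fn load_next_indices(_marker: Option<&Marker>) -> Outcome {
    Outcome::NoMoreIndices
}

// A handle searches its private snapshot first and only talks to the shared
// store when it misses, remembering where it left off via the marker.
fn find_object(mut snapshot: Vec<u32>, mut marker: Option<Marker>, wanted: u32) -> bool {
    loop {
        if snapshot.contains(&wanted) {
            return true;
        }
        match load_next_indices(marker.as_ref()) {
            // A generation change invalidated all ids: drop everything we held.
            Outcome::Replace { indices, mark } => {
                snapshot = indices;
                marker = Some(mark);
            }
            // Only additive changes: keep what we have, search the new tail.
            Outcome::Extend { indices, mark } => {
                snapshot.extend(indices);
                marker = Some(mark);
            }
            Outcome::NoMoreIndices => return false,
        }
    }
}
```

The handle tracking introduced below exists to decide when `Replace` outcomes (and thus generation bumps) are permissible at all.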
--- experiments/odb-redesign/src/lib.rs | 176 ++++++++++++++++++++-------- 1 file changed, 127 insertions(+), 49 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index c69c119a078..5774c58304b 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -7,10 +7,18 @@ mod features { pub type OwnShared = Arc; pub type MutableOnDemand = parking_lot::RwLock; - pub fn get_ref(v: &MutableOnDemand) -> parking_lot::RwLockUpgradableReadGuard<'_, T> { + pub fn get_ref_upgradeable(v: &MutableOnDemand) -> parking_lot::RwLockUpgradableReadGuard<'_, T> { v.upgradable_read() } + pub fn get_ref(v: &MutableOnDemand) -> parking_lot::RwLockReadGuard<'_, T> { + v.read() + } + + pub fn get_mut(v: &MutableOnDemand) -> parking_lot::RwLockWriteGuard<'_, T> { + v.write() + } + pub fn upgrade_ref_to_mut( v: parking_lot::RwLockUpgradableReadGuard<'_, T>, ) -> parking_lot::RwLockWriteGuard<'_, T> { @@ -19,16 +27,24 @@ mod features { } mod local { - use std::cell::{RefCell, RefMut}; + use std::cell::{Ref, RefCell, RefMut}; use std::rc::Rc; pub type OwnShared = Rc; pub type MutableOnDemand = RefCell; - pub fn get_ref(v: &RefCell) -> RefMut<'_, T> { + pub fn get_ref_upgradeable(v: &RefCell) -> RefMut<'_, T> { + v.borrow_mut() + } + + pub fn get_mut(v: &RefCell) -> RefMut<'_, T> { v.borrow_mut() } + pub fn get_ref(v: &RefCell) -> Ref<'_, T> { + v.borrow() + } + pub fn upgrade_ref_to_mut(v: RefMut<'_, T>) -> RefMut<'_, T> { v } @@ -42,7 +58,7 @@ mod features { mod odb { use crate::features; - use crate::features::{get_ref, upgrade_ref_to_mut}; + use crate::features::{get_mut, get_ref_upgradeable, upgrade_ref_to_mut}; use crate::odb::policy::{load_indices, PackIndexMarker}; use git_odb::data::Object; use git_odb::pack::bundle::Location; @@ -50,7 +66,7 @@ mod odb { use git_odb::pack::find::Entry; use git_odb::pack::Bundle; use std::io; - use std::path::{Path, PathBuf}; + use std::path::PathBuf; pub mod policy { use crate::features; @@ -114,6 +130,13 @@ mod odb { type PackDataFiles = Vec>>; + pub(crate) type HandleId = u32; + pub(crate) enum HandleMode { + /// Pack-ids may change which may cause lookups by pack-id (without an oid available) to fail. + Unstable, + Stable, + } + #[derive(Default)] pub(crate) struct State { pub(crate) db_paths: Vec, @@ -126,7 +149,9 @@ mod odb { /// Generations are incremented whenever we decide to clear out our vectors if they are too big and contains too many empty slots. /// If we are not allowed to unload files, the generation will never be changed. pub(crate) generation: u8, - pub(crate) allow_unload: bool, + pub(crate) next_handle_id: HandleId, + pub(crate) handles: BTreeMap, + pub(crate) objects_directory: PathBuf, } /// A way to indicate which pack indices we have seen already @@ -139,6 +164,7 @@ mod odb { } /// Define how packs will be refreshed when all indices are loaded, which is useful if a lot of objects are missing. + #[derive(Clone, Copy)] pub enum RefreshMode { /// Check for new or changed pack indices (and pack data files) when the last known index is loaded. 
/// During runtime we will keep pack indices stable by never reusing them, however, there is the option for @@ -172,13 +198,7 @@ mod odb { pub mod load_indices { use crate::features; use crate::odb::policy; - use crate::odb::policy::{IndexId, PackId, PackIndexMarker, RefreshMode}; - use std::path::Path; - - pub(crate) struct Options<'a> { - pub objects_directory: &'a Path, - pub refresh_mode: RefreshMode, - } + use crate::odb::policy::{IndexId, PackId, PackIndexMarker}; pub(crate) enum Outcome { /// Drop all data and fully replace it with `indices`. @@ -201,33 +221,74 @@ mod odb { } } + /// Note that each store is strictly per repository, and that we don't implement any kind of limit of file handles. + /// The reason for the latter is that it's close to impossible to enforce correctly without heuristics that are probably + /// better done on the server if there is an indication. + /// + /// There is, however, a way to obtain the current amount of open files held by this instance and it's possible to trigger them + /// to be closed. Alternatively, it's safe to replace this instance with a new one which starts fresh. + /// + /// Note that it's possible to let go of loaded files here as well, even though that would be based on a setting allowed file handles + /// which would be managed elsewhere. + /// + /// For maintenance, I think it would be best create an instance when a connection comes in, share it across connections to the same + /// repository, and remove it as the last connection to it is dropped. pub struct Store { state: features::MutableOnDemand, } impl Store { - pub fn new(unload_mode: policy::PackDataUnloadMode) -> features::OwnShared { + pub fn at(objects_directory: impl Into) -> features::OwnShared { features::OwnShared::new(Store { state: features::MutableOnDemand::new(policy::State { - allow_unload: matches!(unload_mode, policy::PackDataUnloadMode::WhenDiskFileIsMissing), + objects_directory: objects_directory.into(), ..policy::State::default() }), }) } + + /// Allow actually finding things + pub fn to_handle(self: &features::OwnShared) -> Handle { + let next = self.register_handle(); + Handle { + store: self.clone(), + refresh_mode: policy::RefreshMode::AfterAllIndicesLoaded, // todo: remove this + id: next, + } + } } impl Store { + pub(crate) fn register_handle(&self) -> policy::HandleId { + let mut state = get_mut(&self.state); + let next = state.next_handle_id; + state.next_handle_id = state.next_handle_id.wrapping_add(1); + state.handles.insert(next, policy::HandleMode::Unstable); + next + } + pub(crate) fn remove_handle(&self, id: &policy::HandleId) { + let mut state = get_mut(&self.state); + state.handles.remove(id); + } + pub(crate) fn upgrade_handle(&self, id: &policy::HandleId) { + let mut state = get_mut(&self.state); + *state + .handles + .get_mut(&id) + .expect("BUG: handles must be made known on creation") = policy::HandleMode::Stable; + } + /// If Ok(None) is returned, the pack-id was stale and referred to an unloaded pack. - /// If pack-ids are kept long enough for this to happen or things are racy, the store policy must be changed to never unload packs - /// along with regular cleanup to not run out of handles while getting some reuse, or if the oid is known, just refresh indices + /// If the oid is known, just refresh indices /// and redo the entire lookup for a valid pack id whose pack can probably be loaded next time. + /// Otherwise one should use or upgrade the handle to enforce stable indices. 
pub(crate) fn load_pack( &self, id: policy::PackId, ) -> std::io::Result>> { match id.multipack_index { None => { - let state = get_ref(&self.state); + let state = get_ref_upgradeable(&self.state); // state.loaded_packs.get(id.index).map(|p| p.clone()).flatten() todo!() } @@ -236,22 +297,13 @@ mod odb { } pub(crate) fn load_next_indices( &self, - load_indices::Options { - objects_directory, - refresh_mode, - }: load_indices::Options<'_>, + refresh_mode: policy::RefreshMode, marker: Option, ) -> std::io::Result { - let state = get_ref(&self.state); + let state = get_ref_upgradeable(&self.state); if state.db_paths.is_empty() { let mut state = upgrade_ref_to_mut(state); - return Self::refresh(&mut state, objects_directory); - } else { - debug_assert_eq!( - Some(objects_directory), - state.db_paths.get(0).map(|p| p.as_path()), - "Eager policy can't be shared across different repositories" - ); + return Self::refresh(&mut state); } Ok(match marker { @@ -275,7 +327,7 @@ mod odb { policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices, policy::RefreshMode::AfterAllIndicesLoaded => { let mut state = upgrade_ref_to_mut(state); - return Self::refresh(&mut state, objects_directory); + return Self::refresh(&mut state); } } } else { @@ -309,33 +361,59 @@ mod odb { }, }) } + } - /// If there is only additive changes, there is no need for a new `generation` actually, which helps - /// callers to retain stability. - fn refresh( - policy::State { - db_paths, - allow_unload: _, - loaded_indices: bundles, - loaded_packs: _, - loaded_packs_by_multi_index: _, - generation, - }: &mut policy::State, - objects_directory: &Path, - ) -> io::Result { - db_paths.extend( - git_odb::alternate::resolve(objects_directory) + /// Utilities + impl Store { + /// refresh and possibly clear out our existing data structures, causing all pack ids to be invalidated. + fn refresh(state: &mut policy::State) -> io::Result { + state.db_paths.extend( + git_odb::alternate::resolve(&state.objects_directory) .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?, ); todo!() } + + /// If there is no handle with stable pack ids requirements, unload them. + /// This property also relates to us pruning our internal state/doing internal maintenance which affects ids, too. + fn may_unload_packs(state: &policy::State) -> bool { + state + .handles + .values() + .all(|m| matches!(m, policy::HandleMode::Unstable)) + } } /// The store shares a policy and keeps a couple of thread-local configuration pub struct Handle { store: features::OwnShared, - objects_directory: PathBuf, refresh_mode: policy::RefreshMode, + id: policy::HandleId, + } + + impl Handle { + /// Call once if pack ids are stored and later used for lookup, meaning they should always remain mapped and not be unloaded + /// even if they disappear from disk. + /// This must be called if there is a chance that git maintenance is happening while a pack is created. 
+ pub fn prevent_pack_unload(&self) { + self.store.upgrade_handle(&self.id); + } + } + + impl Clone for Handle { + fn clone(&self) -> Self { + Handle { + store: self.store.clone(), + refresh_mode: self.refresh_mode, + id: self.store.register_handle(), + } + } + } + + impl Drop for Handle { + fn drop(&mut self) { + self.store.remove_handle(&self.id) + } } impl git_odb::Find for Handle { @@ -367,7 +445,7 @@ mod odb { } fn try_setup() -> anyhow::Result<()> { - let policy = Store::new(policy::PackDataUnloadMode::Never); + let policy = Store::at("foo"); Ok(()) } } @@ -474,7 +552,7 @@ mod repository { fn is_sync_and_send() { fn spawn(_v: T) {} let repository = Repository { - odb: odb::Store::new(odb::policy::PackDataUnloadMode::WhenDiskFileIsMissing), + odb: odb::Store::at("./foo/objects"), refs: refs::Store::new("./foo"), }; spawn(&repository); From ef9b08ceb4a40eef6c8fa7dcdba06e760ed9ac95 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 11:42:00 +0800 Subject: [PATCH 47/57] Learn about store's resource consumption (#259) --- experiments/odb-redesign/src/lib.rs | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 5774c58304b..e4358361346 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -58,7 +58,7 @@ mod features { mod odb { use crate::features; - use crate::features::{get_mut, get_ref_upgradeable, upgrade_ref_to_mut}; + use crate::features::{get_mut, get_ref, get_ref_upgradeable, upgrade_ref_to_mut}; use crate::odb::policy::{load_indices, PackIndexMarker}; use git_odb::data::Object; use git_odb::pack::bundle::Location; @@ -219,6 +219,13 @@ mod odb { NoMoreIndices, } } + + /// A snapshot about resource usage. + pub struct StateInformation { + pub num_handles: usize, + pub open_indices: usize, + pub open_packs: usize, + } } /// Note that each store is strictly per repository, and that we don't implement any kind of limit of file handles. @@ -256,6 +263,23 @@ mod odb { id: next, } } + + /// Get a snapshot of the current amount of handles and open packs and indices. + /// If there are no handles, we are only consuming resources, which might indicate that this instance should be + /// discarded. 
+ pub fn state_snapshot(&self) -> policy::StateInformation { + let state = get_ref(&self.state); + policy::StateInformation { + num_handles: state.handles.len(), + open_packs: state.loaded_packs.iter().filter(|p| p.is_some()).count() + + state + .loaded_packs_by_multi_index + .values() + .map(|packs| packs.iter().filter(|p| p.is_some()).count()) + .sum::(), + open_indices: state.loaded_indices.iter().filter(|i| i.is_some()).count(), + } + } } impl Store { From 58b0bcf622d0c3e518fe2450b0932c102a0931f0 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 12:10:52 +0800 Subject: [PATCH 48/57] Get a better understanding on how on-disk state can be represented in our state (#259) --- experiments/odb-redesign/src/lib.rs | 46 ++++++++++------------------- 1 file changed, 16 insertions(+), 30 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index e4358361346..cfeb7a45ee9 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -122,13 +122,17 @@ mod odb { } } - impl Default for PackDataUnloadMode { - fn default() -> Self { - PackDataUnloadMode::WhenDiskFileIsMissing - } - } - - type PackDataFiles = Vec>>; + pub(crate) enum PackDataFile { + /// The file is on disk and can be loaded from there. + Unloaded(PathBuf), + Loaded(git_pack::data::File), + /// The file was loaded, but appeared to be missing on disk after reconciling our state with what's on disk. + /// As there were handles that required pack-id stability we had to keep the pack. + Garbage(git_pack::data::File), + /// File is missing on disk and could not be loaded when we tried or turned missing after reconciling our state. + Missing, + } + type PackDataFiles = Vec>>; pub(crate) type HandleId = u32; pub(crate) enum HandleMode { @@ -175,26 +179,6 @@ mod odb { Never, } - /// Unloading here means to drop the shared reference to the mapped pack data file. - /// - /// Indices are always handled like that if their file on disk disappears as objects usually never disappear unless they are unreachable, - /// meaning that new indices always contain the old objects in some way. - pub enum PackDataUnloadMode { - /// Keep pack data (and multi-pack index-to-pack lookup tables) always available in loaded memory maps even - /// if the underlying data file (and usually index) are gone. - /// This means algorithms that keep track of packs like pack-generators will always be able to resolve the data they reference. - /// This also means, however, that one might run out of system resources some time, which means the coordinator of such users - /// needs to check resource usage vs amount of uses and replace this instance with a new policy to eventually have the memory - /// mapped packs drop (as references to them disappear once consumers disappear). - /// We will also not ask Store handles to remove their pack data references. - /// We will never rebuild our internal data structures to keep pack ids unique indefinitely (i.e. won't reuse a pack id with a different pack). - Never, - /// Forget/drop the mapped pack data file when its file on disk disappeared and store handles to remove their pack data references - /// for them to be able to go out of scope. - /// We are allowed to rebuild our internal data structures to save on memory but invalidate all previous pack ids. 
-        WhenDiskFileIsMissing,
-    }
-
         pub mod load_indices {
             use crate::features;
             use crate::odb::policy;
@@ -302,10 +286,11 @@ mod odb {
                 .expect("BUG: handles must be made known on creation") = policy::HandleMode::Stable;
         }
 
-        /// If Ok(None) is returned, the pack-id was stale and referred to an unloaded pack.
-        /// If the oid is known, just refresh indices
+        /// If Ok(None) is returned, the pack-id was stale and referred to an unloaded pack or a pack which couldn't be
+        /// loaded as its file didn't exist on disk anymore.
+        /// If the oid is known, just load indices again to continue
+        /// (objects rarely ever removed so should be present, maybe in another pack though),
         /// and redo the entire lookup for a valid pack id whose pack can probably be loaded next time.
-        /// Otherwise one should use or upgrade the handle to enforce stable indices.
        pub(crate) fn load_pack(
            &self,
            id: policy::PackId,
@@ -314,6 +299,7 @@ mod odb {
                 None => {
                     let state = get_ref_upgradeable(&self.state);
                     // state.loaded_packs.get(id.index).map(|p| p.clone()).flatten()
+                    // If the file on disk wasn't found, reconcile the on-disk state with our state right away and try again.
                     todo!()
                 }
                 Some(multipack_id) => todo!("load from given multipack which needs additional lookup"),

From 6187b5cbb323cadffc09d280bcaf880def91f554 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel 
Date: Thu, 25 Nov 2021 15:08:36 +0800
Subject: [PATCH 49/57] sketch a more concise way of keeping indices and packs
 relative to disk locations (#259)

---
 experiments/odb-redesign/src/lib.rs | 34 +++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs
index cfeb7a45ee9..fa18a8181ae 100644
--- a/experiments/odb-redesign/src/lib.rs
+++ b/experiments/odb-redesign/src/lib.rs
@@ -100,12 +100,12 @@ mod odb {
             pub(crate) multipack_index: Option,
         }
 
-        pub(crate) struct IndexFile {
+        pub(crate) struct IndexLookup {
             file: index_file::SingleOrMulti,
             pub id: IndexId,
         }
 
-        impl IndexFile {
+        impl IndexLookup {
             fn lookup(&self, object_id: &oid) -> Option {
                 match &self.file {
                     index_file::SingleOrMulti::Single(file) => {
@@ -122,17 +122,17 @@ mod odb {
             }
         }
 
-        pub(crate) enum PackDataFile {
+        pub(crate) enum OnDiskFile<T> {
             /// The file is on disk and can be loaded from there.
             Unloaded(PathBuf),
-            Loaded(git_pack::data::File),
+            Loaded(PathBuf, T),
             /// The file was loaded, but appeared to be missing on disk after reconciling our state with what's on disk.
             /// As there were handles that required pack-id stability we had to keep the pack.
-            Garbage(git_pack::data::File),
+            Garbage(PathBuf, T),
             /// File is missing on disk and could not be loaded when we tried or turned missing after reconciling our state.
Missing, } - type PackDataFiles = Vec>>; + type PackDataFiles = Vec>>>; pub(crate) type HandleId = u32; pub(crate) enum HandleMode { @@ -141,10 +141,26 @@ mod odb { Stable, } + pub(crate) struct IndexFileBundle { + index: OnDiskFile, + data: OnDiskFile, + } + + pub(crate) struct MultiIndexFileBundle { + multi_index: OnDiskFile, // TODO: turn that into multi-index file when available + data: Vec>, + } + + pub(crate) enum IndexAndPacks { + Index(IndexFileBundle), + MultiIndex(MultiIndexFileBundle), + } + #[derive(Default)] pub(crate) struct State { pub(crate) db_paths: Vec, - pub(crate) loaded_indices: Vec>>, + pub(crate) files: Vec>, + pub(crate) loaded_indices: Vec>>, pub(crate) loaded_packs: PackDataFiles, /// Each change in the multi-pack index creates a new index entry here and typically drops all knowledge about its removed packs. /// We do this because there can be old pack ids around that refer to the old (now deleted) multi-pack index along with a possibly @@ -189,14 +205,14 @@ mod odb { /// This happens if we have witnessed a generational change invalidating all of our ids and causing currently loaded /// indices and maps to be dropped. Replace { - indices: Vec>, // should probably be small vec to get around most allocations + indices: Vec>, // should probably be small vec to get around most allocations mark: PackIndexMarker, // use to show where the caller left off last time }, /// Extend with the given indices and keep searching, while dropping invalidated/unloaded ids. Extend { drop_packs: Vec, // which packs to forget about because they were removed from disk. drop_indices: Vec, // which packs to forget about because they were removed from disk. - indices: Vec>, // should probably be small vec to get around most allocations + indices: Vec>, // should probably be small vec to get around most allocations mark: PackIndexMarker, // use to show where the caller left off last time }, /// No new indices to look at, caller should give up From 407be6c517cfb89e8bf6931a8b6237c33cb58229 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 15:29:35 +0800 Subject: [PATCH 50/57] Simplify tracking of handles, using tokens to make that safer (#259) --- experiments/odb-redesign/src/lib.rs | 62 +++++++++++++++-------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index fa18a8181ae..ee5fb7bb182 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -135,7 +135,8 @@ mod odb { type PackDataFiles = Vec>>>; pub(crate) type HandleId = u32; - pub(crate) enum HandleMode { + /// These are not to be created by anyone except for the State + pub(crate) enum HandleModeToken { /// Pack-ids may change which may cause lookups by pack-id (without an oid available) to fail. Unstable, Stable, @@ -169,9 +170,10 @@ mod odb { /// Generations are incremented whenever we decide to clear out our vectors if they are too big and contains too many empty slots. /// If we are not allowed to unload files, the generation will never be changed. 
pub(crate) generation: u8,
-        pub(crate) next_handle_id: HandleId,
-        pub(crate) handles: BTreeMap<HandleId, HandleMode>,
         pub(crate) objects_directory: PathBuf,
+
+        pub(crate) num_handles_stable: usize,
+        pub(crate) num_handles_unstable: usize,
     }
 
     /// A way to indicate which pack indices we have seen already
@@ -256,11 +258,11 @@ mod odb {
 
         /// Allow actually finding things
        pub fn to_handle(self: &features::OwnShared<Self>) -> Handle {
-            let next = self.register_handle();
+            let mode = self.register_handle();
             Handle {
                 store: self.clone(),
-                refresh_mode: policy::RefreshMode::AfterAllIndicesLoaded, // todo: remove this
-                id: next,
+                refresh_mode: policy::RefreshMode::AfterAllIndicesLoaded,
+                mode: mode.into(),
             }
         }
 
@@ -270,7 +272,7 @@ mod odb {
         pub fn state_snapshot(&self) -> policy::StateInformation {
             let state = get_ref(&self.state);
             policy::StateInformation {
-                num_handles: state.handles.len(),
+                num_handles: state.num_handles_unstable + state.num_handles_stable,
                 open_packs: state.loaded_packs.iter().filter(|p| p.is_some()).count()
                     + state
                         .loaded_packs_by_multi_index
                         .values()
                         .map(|packs| packs.iter().filter(|p| p.is_some()).count())
                         .sum::<usize>(),
                 open_indices: state.loaded_indices.iter().filter(|i| i.is_some()).count(),
             }
         }
     }
 
+    /// Handle interaction
     impl Store {
-        pub(crate) fn register_handle(&self) -> policy::HandleId {
+        pub(crate) fn register_handle(&self) -> policy::HandleModeToken {
             let mut state = get_mut(&self.state);
-            let next = state.next_handle_id;
-            state.next_handle_id = state.next_handle_id.wrapping_add(1);
-            state.handles.insert(next, policy::HandleMode::Unstable);
-            next
+            state.num_handles_unstable += 1;
+            policy::HandleModeToken::Unstable
         }
 
-        pub(crate) fn remove_handle(&self, id: &policy::HandleId) {
+        pub(crate) fn remove_handle(&self, mode: policy::HandleModeToken) {
             let mut state = get_mut(&self.state);
-            state.handles.remove(id);
+            match mode {
+                policy::HandleModeToken::Stable => state.num_handles_stable -= 1,
+                policy::HandleModeToken::Unstable => state.num_handles_unstable -= 1,
+            }
         }
 
-        pub(crate) fn upgrade_handle(&self, id: &policy::HandleId) {
-            let mut state = get_mut(&self.state);
-            *state
-                .handles
-                .get_mut(&id)
-                .expect("BUG: handles must be made known on creation") = policy::HandleMode::Stable;
+        pub(crate) fn upgrade_handle(&self, mode: policy::HandleModeToken) -> policy::HandleModeToken {
+            if let policy::HandleModeToken::Unstable = mode {
+                let mut state = get_mut(&self.state);
+                state.num_handles_unstable -= 1;
+                state.num_handles_stable += 1;
+            }
+            policy::HandleModeToken::Stable
         }
+    }
 
+    impl Store {
         /// If Ok(None) is returned, the pack-id was stale and referred to an unloaded pack or a pack which couldn't be
         /// loaded as its file didn't exist on disk anymore.
         /// If the oid is known, just load indices again to continue
         /// (objects rarely ever removed so should be present, maybe in another pack though),
         /// and redo the entire lookup for a valid pack id whose pack can probably be loaded next time.
@@ -403,10 +410,7 @@ mod odb {
         /// If there is no handle with stable pack ids requirements, unload them.
         /// This property also relates to us pruning our internal state/doing internal maintenance which affects ids, too.
         fn may_unload_packs(state: &policy::State) -> bool {
-            state
-                .handles
-                .values()
-                .all(|m| matches!(m, policy::HandleMode::Unstable))
+            state.num_handles_stable == 0
         }
     }
 
@@ -414,15 +418,15 @@ mod odb {
     pub struct Handle {
         store: features::OwnShared<Store>,
         refresh_mode: policy::RefreshMode,
-        id: policy::HandleId,
+        mode: Option<policy::HandleModeToken>,
     }
 
     impl Handle {
         /// Call once if pack ids are stored and later used for lookup, meaning they should always remain mapped and not be unloaded
         /// even if they disappear from disk.
         /// This must be called if there is a chance that git maintenance is happening while a pack is created.
- pub fn prevent_pack_unload(&self) { - self.store.upgrade_handle(&self.id); + pub fn prevent_pack_unload(&mut self) { + self.mode = self.mode.take().map(|mode| self.store.upgrade_handle(mode)); } } @@ -431,14 +435,14 @@ mod odb { Handle { store: self.store.clone(), refresh_mode: self.refresh_mode, - id: self.store.register_handle(), + mode: self.store.register_handle().into(), } } } impl Drop for Handle { fn drop(&mut self) { - self.store.remove_handle(&self.id) + self.mode.take().map(|mode| self.store.remove_handle(mode)); } } From d8d5a606679ee4d0e332732a0b0156a7f9455749 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 15:46:44 +0800 Subject: [PATCH 51/57] refactor --- experiments/odb-redesign/src/lib.rs | 75 +++++++++++++++-------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index ee5fb7bb182..bca570bb537 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -65,7 +65,6 @@ mod odb { use git_odb::pack::cache::DecodeEntry; use git_odb::pack::find::Entry; use git_odb::pack::Bundle; - use std::io; use std::path::PathBuf; pub mod policy { @@ -73,6 +72,7 @@ mod odb { use crate::odb::policy; use git_hash::oid; use std::collections::BTreeMap; + use std::io; use std::path::PathBuf; mod index_file { @@ -149,18 +149,23 @@ mod odb { pub(crate) struct MultiIndexFileBundle { multi_index: OnDiskFile, // TODO: turn that into multi-index file when available + /// The parent repository owning us. This path could be derived from our own file path, but this seems more controlled + objects_directory: PathBuf, data: Vec>, } pub(crate) enum IndexAndPacks { Index(IndexFileBundle), + /// Note that there can only be one multi-pack file per repository, but thanks to git alternates, there can be multiple overall. MultiIndex(MultiIndexFileBundle), } #[derive(Default)] pub(crate) struct State { + /// The root level directory from which we resolve alternates files. + pub(crate) objects_directory: PathBuf, pub(crate) db_paths: Vec, - pub(crate) files: Vec>, + pub(crate) files: Vec, pub(crate) loaded_indices: Vec>>, pub(crate) loaded_packs: PackDataFiles, /// Each change in the multi-pack index creates a new index entry here and typically drops all knowledge about its removed packs. @@ -170,12 +175,40 @@ mod odb { /// Generations are incremented whenever we decide to clear out our vectors if they are too big and contains too many empty slots. /// If we are not allowed to unload files, the generation will never be changed. pub(crate) generation: u8, - pub(crate) objects_directory: PathBuf, pub(crate) num_handles_stable: usize, pub(crate) num_handles_unstable: usize, } + impl State { + pub(crate) fn snapshot(&self) -> StateInformation { + // state.files.iter().map(|f| match f {IndexAndPacks::}); + StateInformation { + num_handles: self.num_handles_unstable + self.num_handles_stable, + open_packs: self.loaded_packs.iter().filter(|p| p.is_some()).count() + + self + .loaded_packs_by_multi_index + .values() + .map(|packs| packs.iter().filter(|p| p.is_some()).count()) + .sum::(), + open_indices: self.loaded_indices.iter().filter(|i| i.is_some()).count(), + } + } + + /// refresh and possibly clear out our existing data structures, causing all pack ids to be invalidated. 
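+            ///
+            /// A sketch of what 'clearing out' could mean once implemented - the `is_gone` helper is
+            /// hypothetical and only illustrates the intended generation bump (see `PackIndexMarker`):
+            ///
+            /// ```ignore
+            /// if self.may_unload_packs() {
+            ///     self.files.retain(|f| !f.is_gone()); // compact stale slots...
+            ///     self.generation = self.generation.wrapping_add(1); // ...and invalidate all pack ids
+            /// }
+            /// ```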
+            pub(crate) fn refresh(&mut self) -> io::Result<load_indices::Outcome> {
+                self.db_paths = git_odb::alternate::resolve(&self.objects_directory)
+                    .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
+                todo!()
+            }
+
+            /// If there is no handle with stable pack ids requirements, unload them.
+            /// This property also relates to us pruning our internal state/doing internal maintenance which affects ids, too.
+            pub(crate) fn may_unload_packs(&mut self) -> bool {
+                self.num_handles_stable == 0
+            }
+        }
+
         /// A way to indicate which pack indices we have seen already
         pub struct PackIndexMarker {
             /// The generation the `pack_index_sequence` belongs to. Indices of different generations are completely incompatible.
@@ -270,17 +303,7 @@ mod odb {
             /// If there are no handles, we are only consuming resources, which might indicate that this instance should be
             /// discarded.
             pub fn state_snapshot(&self) -> policy::StateInformation {
-                let state = get_ref(&self.state);
-                policy::StateInformation {
-                    num_handles: state.num_handles_unstable + state.num_handles_stable,
-                    open_packs: state.loaded_packs.iter().filter(|p| p.is_some()).count()
-                        + state
-                            .loaded_packs_by_multi_index
-                            .values()
-                            .map(|packs| packs.iter().filter(|p| p.is_some()).count())
-                            .sum::<usize>(),
-                    open_indices: state.loaded_indices.iter().filter(|i| i.is_some()).count(),
-                }
+                get_ref(&self.state).snapshot()
             }
         }
 
@@ -335,8 +358,7 @@ mod odb {
             ) -> std::io::Result<load_indices::Outcome> {
                 let state = get_ref_upgradeable(&self.state);
                 if state.db_paths.is_empty() {
-                    let mut state = upgrade_ref_to_mut(state);
-                    return Self::refresh(&mut state);
+                    return upgrade_ref_to_mut(state).refresh();
                 }
 
                 Ok(match marker {
@@ -359,8 +381,7 @@ mod odb {
                     match refresh_mode {
                         policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices,
                         policy::RefreshMode::AfterAllIndicesLoaded => {
-                            let mut state = upgrade_ref_to_mut(state);
-                            return Self::refresh(&mut state);
+                            return upgrade_ref_to_mut(state).refresh()
                         }
                     }
                 } else {
@@ -396,24 +417,6 @@ mod odb {
             }
         }
 
-        /// Utilities
-        impl Store {
-            /// refresh and possibly clear out our existing data structures, causing all pack ids to be invalidated.
-            fn refresh(state: &mut policy::State) -> io::Result<load_indices::Outcome> {
-                state.db_paths.extend(
-                    git_odb::alternate::resolve(&state.objects_directory)
-                        .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?,
-                );
-                todo!()
-            }
-
-            /// If there is no handle with stable pack ids requirements, unload them.
-            /// This property also relates to us pruning our internal state/doing internal maintenance which affects ids, too.
-            fn may_unload_packs(state: &policy::State) -> bool {
-                state.num_handles_stable == 0
-            }
-        }
-
        /// The store shares a policy and keeps a couple of thread-local configuration
        pub struct Handle {
            store: features::OwnShared<Store>,
            refresh_mode: policy::RefreshMode,

From 6f5eaf145f9462629c873af2f49bf029bf962972 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel 
Date: Thu, 25 Nov 2021 16:00:48 +0800
Subject: [PATCH 52/57] Allow on-disk files to be reused at will in case they re-appear (#259)

Probably rare, but maybe the reconciliation can handle that without overhead, and if so, it's just a benefit.
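To make the idea concrete, here is a minimal, self-contained model of the intended
state machine - the `reconcile` helper, its signature and the `may_unload` flag are
assumptions of this sketch, not code from the experiment:

    use std::path::PathBuf;

    enum State<T> {
        Unloaded,
        Loaded(T),
        /// Still mapped, but the file vanished while stable handles kept it alive.
        Garbage(T),
        Missing,
    }

    struct OnDiskFile<T> {
        path: PathBuf,
        state: State<T>,
    }

    impl<T> OnDiskFile<T> {
        /// Compare our state with the file's presence on disk and transition accordingly.
        /// `Garbage` flips back to `Loaded` if the file re-appears - the reuse described above.
        fn reconcile(&mut self, exists_on_disk: bool, may_unload: bool) {
            use State::*;
            self.state = match (std::mem::replace(&mut self.state, Missing), exists_on_disk) {
                (Loaded(v), true) | (Garbage(v), true) => Loaded(v), // reuse the existing map
                (Loaded(v), false) | (Garbage(v), false) if !may_unload => Garbage(v),
                (Loaded(_), false) | (Garbage(_), false) => Missing,
                (Unloaded, true) | (Missing, true) => Unloaded, // load lazily on next access
                (Unloaded, false) | (Missing, false) => Missing,
            };
        }
    }

    fn main() {
        let mut file = OnDiskFile { path: PathBuf::from("pack-1234.pack"), state: State::Loaded(vec![0u8; 4]) };
        file.reconcile(false, false); // deleted on disk, stable handles exist: keep as Garbage
        file.reconcile(true, false); // re-appeared: reused as Loaded without remapping
        assert!(matches!(file.state, State::Loaded(_)));
        println!("reused the existing map for {:?}", file.path);
    }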
---
 experiments/odb-redesign/src/lib.rs | 58 ++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 14 deletions(-)

diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs
index bca570bb537..634db67f000 100644
--- a/experiments/odb-redesign/src/lib.rs
+++ b/experiments/odb-redesign/src/lib.rs
@@ -122,16 +122,30 @@ mod odb {
             }
         }
 
-        pub(crate) enum OnDiskFile<T> {
+        pub(crate) struct OnDiskFile<T> {
+            /// The last known path of the file
+            path: PathBuf,
+            state: OnDiskFileState<T>,
+        }
+
+        pub(crate) enum OnDiskFileState<T> {
            /// The file is on disk and can be loaded from there.
-            Unloaded(PathBuf),
-            Loaded(PathBuf, T),
+            Unloaded,
+            Loaded(T),
            /// The file was loaded, but appeared to be missing on disk after reconciling our state with what's on disk.
            /// As there were handles that required pack-id stability we had to keep the pack.
-            Garbage(PathBuf, T),
+            Garbage(T),
            /// File is missing on disk and could not be loaded when we tried or turned missing after reconciling our state.
            Missing,
        }
+
+        impl<T> OnDiskFile<T> {
+            /// Return true if we hold a memory map of the file already.
+            pub fn is_loaded(&self) -> bool {
+                matches!(self.state, OnDiskFileState::Loaded(_) | OnDiskFileState::Garbage(_))
+            }
+        }
+
        type PackDataFiles = Vec>>>;

        pub(crate) type HandleId = u32;
@@ -144,13 +158,13 @@ mod odb {
 
         pub(crate) struct IndexFileBundle {
             index: OnDiskFile,
+            path: PathBuf,
             data: OnDiskFile,
         }
 
         pub(crate) struct MultiIndexFileBundle {
             multi_index: OnDiskFile, // TODO: turn that into multi-index file when available
-            /// The parent repository owning us. This path could be derived from our own file path, but this seems more controlled
-            objects_directory: PathBuf,
+            path: PathBuf,
             data: Vec>,
         }
 
@@ -182,16 +196,32 @@ mod odb {
 
         impl State {
             pub(crate) fn snapshot(&self) -> StateInformation {
-                // state.files.iter().map(|f| match f {IndexAndPacks::});
+                let mut open_packs = 0;
+                let mut open_indices = 0;
+
+                for f in &self.files {
+                    match f {
+                        IndexAndPacks::Index(bundle) => {
+                            if bundle.index.is_loaded() {
+                                open_indices += 1;
+                            }
+                            if bundle.data.is_loaded() {
+                                open_packs += 1;
+                            }
+                        }
+                        IndexAndPacks::MultiIndex(multi) => {
+                            if multi.multi_index.is_loaded() {
+                                open_indices += 1;
+                            }
+                            open_packs += multi.data.iter().filter(|f| f.is_loaded()).count();
+                        }
+                    }
+                }
+
                 StateInformation {
                     num_handles: self.num_handles_unstable + self.num_handles_stable,
-                    open_packs: self.loaded_packs.iter().filter(|p| p.is_some()).count()
-                        + self
-                            .loaded_packs_by_multi_index
-                            .values()
-                            .map(|packs| packs.iter().filter(|p| p.is_some()).count())
-                            .sum::(),
-                    open_indices: self.loaded_indices.iter().filter(|i| i.is_some()).count(),
+                    open_packs,
+                    open_indices,
                 }
             }
 
From 87544c343401fdd6183e1f63d67df3caf523dfd5 Mon Sep 17 00:00:00 2001
From: Sebastian Thiel 
Date: Thu, 25 Nov 2021 17:23:59 +0800
Subject: [PATCH 53/57] btree/hashmap free lookup of packs in store, keeping
 things more bundled (#259)

Now even in the Store packs belong to indices. Indices are sparse
in the Policy but are dense in the store. Packs for multi-pack indices
are always dense at first but get less dense over time.
Not quite sure how changes are communicated in case of multi-pack indices.
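The bundling can be illustrated with a toy model - `Bundle`, `Oid` and the offset
lookup are stand-ins invented for this sketch, not the experiment's actual types:

    use std::collections::HashMap;

    type Oid = u64;
    struct Pack;
    struct Index {
        entries: HashMap<Oid, u64>, // oid -> offset in the sibling pack
    }

    /// One slot per index file with its pack kept right next to it,
    /// so no extra map from pack-id to pack is needed.
    struct Bundle {
        index: Index,
        pack: Option<Pack>, // loaded lazily, `None` until first access
    }

    struct Store {
        files: Vec<Bundle>, // dense in the store, as described above
    }

    impl Store {
        /// Find the object and hand out the mutable pack slot of its index,
        /// allowing the caller to trigger a lazy load by position alone.
        fn lookup(&mut self, oid: Oid) -> Option<(usize, u64, &mut Option<Pack>)> {
            self.files.iter_mut().enumerate().find_map(|(id, bundle)| {
                let ofs = *bundle.index.entries.get(&oid)?;
                Some((id, ofs, &mut bundle.pack))
            })
        }
    }

    fn main() {
        let mut store = Store {
            files: vec![Bundle {
                index: Index { entries: HashMap::from([(42, 7)]) },
                pack: None,
            }],
        };
        if let Some((id, ofs, slot)) = store.lookup(42) {
            let _pack = slot.get_or_insert(Pack); // first access would map the pack file
            println!("oid 42 lives in the pack of index {id} at offset {ofs}");
        }
    }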
--- .../src/command/release/manifest.rs | 3 +- experiments/object-access/src/main.rs | 3 +- experiments/odb-redesign/src/lib.rs | 208 ++++++++++-------- git-object/src/commit/write.rs | 3 +- git-object/src/object/mod.rs | 7 +- git-odb/src/alternate/mod.rs | 2 +- .../loose/reflog/create_or_update/tests.rs | 8 +- .../prepare_and_commit/create_or_update.rs | 21 +- .../transaction/prepare_and_commit/delete.rs | 17 +- 9 files changed, 155 insertions(+), 117 deletions(-) diff --git a/cargo-smart-release/src/command/release/manifest.rs b/cargo-smart-release/src/command/release/manifest.rs index 4b88438b2fd..1d4f895991b 100644 --- a/cargo-smart-release/src/command/release/manifest.rs +++ b/cargo-smart-release/src/command/release/manifest.rs @@ -11,12 +11,11 @@ use git_repository::lock::File; use semver::{Version, VersionReq}; use super::{cargo, git, Context, Oid, Options}; -use crate::utils::version_req_unset_or_default; use crate::{ changelog, changelog::{write::Linkables, Section}, traverse::Dependency, - utils::{names_and_versions, try_to_published_crate_and_new_version, will}, + utils::{names_and_versions, try_to_published_crate_and_new_version, version_req_unset_or_default, will}, version, ChangeLog, }; diff --git a/experiments/object-access/src/main.rs b/experiments/object-access/src/main.rs index 9b1b713f331..b7bf8b7df0d 100644 --- a/experiments/object-access/src/main.rs +++ b/experiments/object-access/src/main.rs @@ -1,5 +1,4 @@ -use std::sync::Arc; -use std::{path::Path, time::Instant}; +use std::{path::Path, sync::Arc, time::Instant}; use anyhow::anyhow; use git_repository::{hash::ObjectId, odb, prelude::*, Repository}; diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 634db67f000..883e49480e2 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -1,4 +1,4 @@ -#![allow(dead_code, unused_variables)] +#![allow(dead_code, unused_variables, unreachable_code)] mod features { mod threaded { @@ -27,8 +27,10 @@ mod features { } mod local { - use std::cell::{Ref, RefCell, RefMut}; - use std::rc::Rc; + use std::{ + cell::{Ref, RefCell, RefMut}, + rc::Rc, + }; pub type OwnShared = Rc; pub type MutableOnDemand = RefCell; @@ -57,28 +59,38 @@ mod features { } mod odb { - use crate::features; - use crate::features::{get_mut, get_ref, get_ref_upgradeable, upgrade_ref_to_mut}; - use crate::odb::policy::{load_indices, PackIndexMarker}; - use git_odb::data::Object; - use git_odb::pack::bundle::Location; - use git_odb::pack::cache::DecodeEntry; - use git_odb::pack::find::Entry; - use git_odb::pack::Bundle; use std::path::PathBuf; + use git_odb::{ + data::Object, + pack::{bundle::Location, cache::DecodeEntry, find::Entry, Bundle}, + }; + + use crate::{ + features, + features::{get_mut, get_ref, get_ref_upgradeable, upgrade_ref_to_mut}, + odb::policy::{load_indices, PackIndexMarker}, + }; + pub mod policy { - use crate::features; - use crate::odb::policy; + use std::{io, path::PathBuf}; + use git_hash::oid; - use std::collections::BTreeMap; - use std::io; - use std::path::PathBuf; + + use crate::features; mod index_file { + use crate::{features, odb::policy}; + pub enum SingleOrMulti { - Single(git_pack::index::File), - Multi(()), + Single { + index: features::OwnShared, + data: Option>, + }, + Multi { + index: features::OwnShared, + data: Vec>>, + }, } } @@ -90,13 +102,14 @@ mod odb { } /// An id to refer to an index file or a multipack index file - pub struct IndexId(u32); + pub type IndexId = usize; /// A way to load and 
refer to a pack uniquely, namespaced by their indexing mechanism, aka multi-pack or not. pub struct PackId { - /// Note that if `multipack_index = None`, this id is corresponding to the index id. - /// If it is a multipack index, `id` is the id / offset of the pack in the `multipack_index`. - pub(crate) index: u32, + /// Note that if `multipack_index = None`, this index is corresponding to the index id. + /// So a pack is always identified by its corresponding index. + /// If it is a multipack index, this is the id / offset of the pack in the `multipack_index`. + pub(crate) index: IndexId, pub(crate) multipack_index: Option, } @@ -106,18 +119,34 @@ mod odb { } impl IndexLookup { - fn lookup(&self, object_id: &oid) -> Option { - match &self.file { - index_file::SingleOrMulti::Single(file) => { - file.lookup(object_id).map(|object_index_in_pack| IndexForObjectInPack { - pack_id: PackId { - index: self.id.0, - multipack_index: None, - }, - object_index_in_pack, + /// See if the oid is contained in this index, and return its full id for lookup possibly alongside its data file if already + /// loaded. + /// If it is not loaded, ask it to be loaded and put it into the returned mutable option for safe-keeping. + fn lookup( + &mut self, + object_id: &oid, + ) -> Option<( + IndexForObjectInPack, + &mut Option>, + )> { + match &mut self.file { + index_file::SingleOrMulti::Single { index, data } => { + index.lookup(object_id).map(|object_index_in_pack| { + ( + IndexForObjectInPack { + pack_id: PackId { + index: self.id, + multipack_index: None, + }, + object_index_in_pack, + }, + data, + ) }) } - index_file::SingleOrMulti::Multi(()) => todo!("required with multi-pack index"), + index_file::SingleOrMulti::Multi { index, data } => { + todo!("find respective pack and return it as &mut Option<>") + } } } } @@ -144,10 +173,17 @@ mod odb { pub fn is_loaded(&self) -> bool { matches!(self.state, OnDiskFileState::Loaded(_) | OnDiskFileState::Garbage(_)) } - } - type PackDataFiles = Vec>>>; + pub fn loaded(&self) -> Option<&T> { + use OnDiskFileState::*; + match &self.state { + Loaded(v) | Garbage(v) => Some(v), + Unloaded | Missing => None, + } + } + } + pub(crate) type MultiIndex = (); pub(crate) type HandleId = u32; /// These are not to be created by anyone except for the State pub(crate) enum HandleModeToken { @@ -157,15 +193,14 @@ mod odb { } pub(crate) struct IndexFileBundle { - index: OnDiskFile, + index: OnDiskFile>, path: PathBuf, - data: OnDiskFile, + data: OnDiskFile>, } pub(crate) struct MultiIndexFileBundle { - multi_index: OnDiskFile, // TODO: turn that into multi-index file when available - path: PathBuf, - data: Vec>, + multi_index: OnDiskFile>, + data: Vec>>, } pub(crate) enum IndexAndPacks { @@ -178,14 +213,10 @@ mod odb { pub(crate) struct State { /// The root level directory from which we resolve alternates files. pub(crate) objects_directory: PathBuf, + /// All of our database paths, including `objects_directory` as entrypoint pub(crate) db_paths: Vec, pub(crate) files: Vec, - pub(crate) loaded_indices: Vec>>, - pub(crate) loaded_packs: PackDataFiles, - /// Each change in the multi-pack index creates a new index entry here and typically drops all knowledge about its removed packs. - /// We do this because there can be old pack ids around that refer to the old (now deleted) multi-pack index along with a possibly - /// now invalid (or ambiguous) local pack id, i.e. pack A was meant, but now that's pack B because the multi-index has changed. 
- pub(crate) loaded_packs_by_multi_index: BTreeMap, + /// Generations are incremented whenever we decide to clear out our vectors if they are too big and contains too many empty slots. /// If we are not allowed to unload files, the generation will never be changed. pub(crate) generation: u8, @@ -227,8 +258,10 @@ mod odb { /// refresh and possibly clear out our existing data structures, causing all pack ids to be invalidated. pub(crate) fn refresh(&mut self) -> io::Result { - self.db_paths = git_odb::alternate::resolve(&self.objects_directory) + let mut db_paths = git_odb::alternate::resolve(&self.objects_directory) .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?; + // These are in addition to our objects directory + db_paths.insert(0, self.objects_directory.clone()); todo!() } @@ -237,6 +270,29 @@ mod odb { pub(crate) fn may_unload_packs(&mut self) -> bool { self.num_handles_stable == 0 } + + pub(crate) fn collect_replace_outcome(&self) -> load_indices::Outcome { + let indices = self.files.iter().enumerate().filter_map(|(id, file)| { + let lookup = match file { + IndexAndPacks::Index(bundle) => index_file::SingleOrMulti::Single { + index: bundle.index.loaded()?.clone(), + data: bundle.data.loaded().cloned(), + }, + IndexAndPacks::MultiIndex(multi) => index_file::SingleOrMulti::Multi { + index: multi.multi_index.loaded()?.clone(), + data: multi.data.iter().map(|f| f.loaded().cloned()).collect(), + }, + }; + IndexLookup { file: lookup, id }.into() + }); + load_indices::Outcome::Replace { + indices: todo!(), + mark: PackIndexMarker { + generation: self.generation, + pack_index_sequence: self.files.len(), + }, + } + } } /// A way to indicate which pack indices we have seen already @@ -261,24 +317,25 @@ mod odb { } pub mod load_indices { - use crate::features; - use crate::odb::policy; - use crate::odb::policy::{IndexId, PackId, PackIndexMarker}; + use crate::odb::{ + policy, + policy::{IndexId, PackId, PackIndexMarker}, + }; pub(crate) enum Outcome { /// Drop all data and fully replace it with `indices`. /// This happens if we have witnessed a generational change invalidating all of our ids and causing currently loaded /// indices and maps to be dropped. Replace { - indices: Vec>, // should probably be small vec to get around most allocations - mark: PackIndexMarker, // use to show where the caller left off last time + indices: Vec, // should probably be small vec to get around most allocations + mark: PackIndexMarker, // use to show where the caller left off last time }, /// Extend with the given indices and keep searching, while dropping invalidated/unloaded ids. Extend { drop_packs: Vec, // which packs to forget about because they were removed from disk. drop_indices: Vec, // which packs to forget about because they were removed from disk. 
- indices: Vec>, // should probably be small vec to get around most allocations - mark: PackIndexMarker, // use to show where the caller left off last time + indices: Vec, // should probably be small vec to get around most allocations + mark: PackIndexMarker, // use to show where the caller left off last time }, /// No new indices to look at, caller should give up NoMoreIndices, @@ -394,20 +451,9 @@ mod odb { Ok(match marker { Some(marker) => { if marker.generation != state.generation { - load_indices::Outcome::Replace { - indices: state - .loaded_indices - .iter() - .filter_map(Option::as_ref) - .cloned() - .collect(), - mark: PackIndexMarker { - generation: state.generation, - pack_index_sequence: state.loaded_indices.len(), - }, - } + state.collect_replace_outcome() } else { - if marker.pack_index_sequence == state.loaded_indices.len() { + if marker.pack_index_sequence == state.files.len() { match refresh_mode { policy::RefreshMode::Never => load_indices::Outcome::NoMoreIndices, policy::RefreshMode::AfterAllIndicesLoaded => { @@ -416,33 +462,18 @@ mod odb { } } else { load_indices::Outcome::Extend { - indices: state.loaded_indices[marker.pack_index_sequence..] - .iter() - .filter_map(Option::as_ref) - .cloned() - .collect(), + indices: todo!("state.files[marker.pack_index_sequence..]"), drop_packs: Vec::new(), drop_indices: Vec::new(), mark: PackIndexMarker { generation: state.generation, - pack_index_sequence: state.loaded_indices.len(), + pack_index_sequence: state.files.len(), }, } } } } - None => load_indices::Outcome::Replace { - indices: state - .loaded_indices - .iter() - .filter_map(Option::as_ref) - .cloned() - .collect(), - mark: PackIndexMarker { - generation: state.generation, - pack_index_sequence: state.loaded_indices.len(), - }, - }, + None => state.collect_replace_outcome(), }) } } @@ -514,9 +545,10 @@ mod odb { } mod refs { - use crate::features; use std::path::{Path, PathBuf}; + use crate::features; + pub struct LooseStore { path: PathBuf, reflog_mode: u32, @@ -530,8 +562,10 @@ mod refs { } mod inner { - use crate::features; - use crate::refs::{LooseStore, RefTable}; + use crate::{ + features, + refs::{LooseStore, RefTable}, + }; pub(crate) enum StoreSelection { Loose(LooseStore), diff --git a/git-object/src/commit/write.rs b/git-object/src/commit/write.rs index 24d4be2dda7..ce16bf82843 100644 --- a/git-object/src/commit/write.rs +++ b/git-object/src/commit/write.rs @@ -1,6 +1,7 @@ -use bstr::ByteSlice; use std::io; +use bstr::ByteSlice; + use crate::{encode, encode::NL, Commit, CommitRef, Kind}; impl crate::WriteTo for Commit { diff --git a/git-object/src/object/mod.rs b/git-object/src/object/mod.rs index 2e4a01d0b6a..710356d626e 100644 --- a/git-object/src/object/mod.rs +++ b/git-object/src/object/mod.rs @@ -162,10 +162,13 @@ impl Object { } } -use crate::decode::{loose_header, Error as DecodeError, LooseHeaderDecodeError}; -use crate::{BlobRef, CommitRef, Kind, ObjectRef, TagRef, TreeRef}; use quick_error::quick_error; +use crate::{ + decode::{loose_header, Error as DecodeError, LooseHeaderDecodeError}, + BlobRef, CommitRef, Kind, ObjectRef, TagRef, TreeRef, +}; + quick_error! { #[derive(Debug)] #[allow(missing_docs)] diff --git a/git-odb/src/alternate/mod.rs b/git-odb/src/alternate/mod.rs index 505f266db90..5dac0464695 100644 --- a/git-odb/src/alternate/mod.rs +++ b/git-odb/src/alternate/mod.rs @@ -40,7 +40,7 @@ pub enum Error { } /// Given an objects directory, try to resolve alternate object directories possibly located in the -/// `./info/alternates` file. 
+/// `./info/alternates` file into canonical paths. /// If no alternate object database was resolved, the resulting `Vec` is empty (it is not an error /// if there are no alternates). /// It is an error once a repository is seen again as it would lead to a cycle. diff --git a/git-ref/src/store/file/loose/reflog/create_or_update/tests.rs b/git-ref/src/store/file/loose/reflog/create_or_update/tests.rs index 3f7102b9c98..6b46576277d 100644 --- a/git-ref/src/store/file/loose/reflog/create_or_update/tests.rs +++ b/git-ref/src/store/file/loose/reflog/create_or_update/tests.rs @@ -1,12 +1,14 @@ -use super::*; -use crate::{file::WriteReflog, FullNameRef}; +use std::{convert::TryInto, path::Path}; + use git_actor::{Sign, Signature, Time}; use git_lock::acquire::Fail; use git_object::bstr::ByteSlice; use git_testtools::hex_to_id; -use std::{convert::TryInto, path::Path}; use tempfile::TempDir; +use super::*; +use crate::{file::WriteReflog, FullNameRef}; + type Result = std::result::Result>; fn empty_store(writemode: WriteReflog) -> Result<(TempDir, file::Store)> { diff --git a/git-ref/tests/file/transaction/prepare_and_commit/create_or_update.rs b/git-ref/tests/file/transaction/prepare_and_commit/create_or_update.rs index df4024de4cb..7aba8f60236 100644 --- a/git-ref/tests/file/transaction/prepare_and_commit/create_or_update.rs +++ b/git-ref/tests/file/transaction/prepare_and_commit/create_or_update.rs @@ -1,23 +1,22 @@ -use crate::file::{ - store_with_packed_refs, store_writable, - transaction::prepare_and_commit::{committer, empty_store, log_line, reflog_lines}, -}; +use std::convert::TryInto; + use git_hash::ObjectId; use git_lock::acquire::Fail; -use git_object::bstr::BString; -use git_object::bstr::ByteSlice; -use git_ref::file::ReferenceExt; -use git_ref::transaction::PreviousValue; +use git_object::bstr::{BString, ByteSlice}; use git_ref::{ file::{ transaction::{self, PackedRefs}, - WriteReflog, + ReferenceExt, WriteReflog, }, - transaction::{Change, LogChange, RefEdit, RefLog}, + transaction::{Change, LogChange, PreviousValue, RefEdit, RefLog}, Target, }; use git_testtools::hex_to_id; -use std::convert::TryInto; + +use crate::file::{ + store_with_packed_refs, store_writable, + transaction::prepare_and_commit::{committer, empty_store, log_line, reflog_lines}, +}; #[test] fn reference_with_equally_named_empty_or_non_empty_directory_already_in_place_can_potentially_recover() -> crate::Result diff --git a/git-ref/tests/file/transaction/prepare_and_commit/delete.rs b/git-ref/tests/file/transaction/prepare_and_commit/delete.rs index 8d36f66ea1d..029803f3393 100644 --- a/git-ref/tests/file/transaction/prepare_and_commit/delete.rs +++ b/git-ref/tests/file/transaction/prepare_and_commit/delete.rs @@ -1,16 +1,17 @@ -use crate::file::{ - store_writable, - transaction::prepare_and_commit::{committer, empty_store}, -}; +use std::convert::TryInto; + use git_lock::acquire::Fail; -use git_ref::file::ReferenceExt; -use git_ref::transaction::PreviousValue; use git_ref::{ - transaction::{Change, RefEdit, RefLog}, + file::ReferenceExt, + transaction::{Change, PreviousValue, RefEdit, RefLog}, Reference, Target, }; use git_testtools::hex_to_id; -use std::convert::TryInto; + +use crate::file::{ + store_writable, + transaction::prepare_and_commit::{committer, empty_store}, +}; #[test] fn delete_a_ref_which_is_gone_succeeds() -> crate::Result { From 87544c343401fdd6183e1f63d67df3caf523dfd5 Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 18:09:02 +0800 Subject: [PATCH 54/57] 
=?UTF-8?q?try=20loading=20actual=20packs=20with=20t?=
 =?UTF-8?q?he=20current=20data=20structure=E2=80=A6=20(#259)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…and run into a hopefully fixable lifetime issue
---
 experiments/odb-redesign/src/lib.rs | 72 ++++++++++++++++++++++++-----
 1 file changed, 60 insertions(+), 12 deletions(-)

diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs
index 883e49480e2..90861f081ae 100644
--- a/experiments/odb-redesign/src/lib.rs
+++ b/experiments/odb-redesign/src/lib.rs
@@ -73,6 +73,7 @@ mod odb {
     };
 
     pub mod policy {
+        use std::path::Path;
         use std::{io, path::PathBuf};
 
         use git_hash::oid;
@@ -181,6 +182,25 @@ mod odb {
                     Unloaded | Missing => None,
                 }
             }
+
+            pub fn load_if_needed(&mut self, load: impl FnOnce(&Path) -> io::Result) -> io::Result> {
+                use OnDiskFileState::*;
+                Ok(match &mut self.state {
+                    Loaded(v) | Garbage(v) => Some(v),
+                    Missing => None,
+                    Unloaded => match load(&self.path) {
+                        Ok(v) => {
+                            // self.state = OnDiskFileState::Loaded(v);
+                            todo!()
+                        }
+                        Err(err) if err.kind() == io::ErrorKind::NotFound => {
+                            // self.state = OnDiskFileState::Missing;
+                            None
+                        }
+                        Err(err) => return Err(err),
+                    },
+                })
+            }
         }
 
         pub(crate) type MultiIndex = ();
@@ -193,14 +213,14 @@ mod odb {
         }
 
         pub(crate) struct IndexFileBundle {
-            index: OnDiskFile>,
-            path: PathBuf,
-            data: OnDiskFile>,
+            pub index: OnDiskFile>,
+            pub path: PathBuf,
+            pub data: OnDiskFile>,
         }
 
         pub(crate) struct MultiIndexFileBundle {
-            multi_index: OnDiskFile>,
-            data: Vec>>,
+            pub multi_index: OnDiskFile>,
+            pub data: Vec>>,
         }
 
        pub(crate) enum IndexAndPacks {
@@ -257,6 +277,10 @@ mod odb {
         }
 
         /// refresh and possibly clear out our existing data structures, causing all pack ids to be invalidated.
+        ///
+        /// Note that updates to multi-pack indices always cause the old index to be invalidated (Missing) and transferred
+        /// into the new MultiPack index (reusing previous maps as well as possible), effectively treating them as a new index entirely.
+        /// That way, extension still works as before such that old indices may be pruned, and changes/new ones are always appended.
         pub(crate) fn refresh(&mut self) -> io::Result<load_indices::Outcome> {
             let mut db_paths = git_odb::alternate::resolve(&self.objects_directory)
                 .map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
@@ -319,7 +343,7 @@ mod odb {
... load_indices mod import change elided ...
@@ -332,8 +356,7 @@ mod odb {
                 },
                 /// Extend with the given indices and keep searching, while dropping invalidated/unloaded ids.
                 Extend {
-                    drop_packs: Vec, // which packs to forget about because they were removed from disk.
-                    drop_indices: Vec, // which packs to forget about because they were removed from disk.
+                    drop_indices: Vec, // which packs to forget about (along their packs) because they were removed from disk.
                     indices: Vec, // should probably be small vec to get around most allocations
                     mark: PackIndexMarker, // use to show where the caller left off last time
                 },
                 /// No new indices to look at, caller should give up
@@ -431,9 +454,35 @@ mod odb {
             match id.multipack_index {
                 None => {
                     let state = get_ref_upgradeable(&self.state);
-                    // state.loaded_packs.get(id.index).map(|p| p.clone()).flatten()
-                    // If the file on disk wasn't found, reconcile the on-disk state with our state right away and try again.
- todo!() + match state.files.get(id.index) { + Some(f) => match f { + policy::IndexAndPacks::Index(bundle) => match bundle.data.loaded() { + Some(pack) => Ok(Some(pack.clone())), + None => { + let mut state = upgrade_ref_to_mut(state); + let f = &mut state.files[id.index]; + match f { + policy::IndexAndPacks::Index(bundle) => Ok(bundle + .data + .load_if_needed(|path| { + Ok(features::OwnShared::new(git_pack::data::File::at(path).map_err( + |err| match err { + git_odb::data::header::decode::Error::Io { source, .. } => { + source + } + other => std::io::Error::new(std::io::ErrorKind::Other, other), + }, + )?)) + })? + .cloned()), + _ => unreachable!(), + } + } + }, + policy::IndexAndPacks::MultiIndex(_) => Ok(None), + }, + None => Ok(None), + } } Some(multipack_id) => todo!("load from given multipack which needs additional lookup"), } @@ -463,7 +512,6 @@ mod odb { } else { load_indices::Outcome::Extend { indices: todo!("state.files[marker.pack_index_sequence..]"), - drop_packs: Vec::new(), drop_indices: Vec::new(), mark: PackIndexMarker { generation: state.generation, From 297751858f4d36068bde1008b2d35c9c2de6662c Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 18:12:39 +0800 Subject: [PATCH 55/57] workaround borrow check, which in this case fits nicely (#259) --- experiments/odb-redesign/src/lib.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 90861f081ae..a5c340f7505 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -183,23 +183,26 @@ mod odb { } } - pub fn load_if_needed(&mut self, load: impl FnOnce(&Path) -> io::Result) -> io::Result> { + pub fn do_load(&mut self, load: impl FnOnce(&Path) -> io::Result) -> io::Result> { use OnDiskFileState::*; - Ok(match &mut self.state { - Loaded(v) | Garbage(v) => Some(v), - Missing => None, + match &mut self.state { + Loaded(_) | Garbage(_) => unreachable!("BUG: check before calling this"), + Missing => Ok(None), Unloaded => match load(&self.path) { Ok(v) => { - // self.state = OnDiskFileState::Loaded(v); - todo!() + self.state = OnDiskFileState::Loaded(v); + match &self.state { + Loaded(v) => Ok(Some(v)), + _ => unreachable!(), + } } Err(err) if err.kind() == io::ErrorKind::NotFound => { - // self.state = OnDiskFileState::Missing; - None + self.state = OnDiskFileState::Missing; + Ok(None) } - Err(err) => return Err(err), + Err(err) => Err(err), }, - }) + } } } @@ -464,7 +467,7 @@ mod odb { match f { policy::IndexAndPacks::Index(bundle) => Ok(bundle .data - .load_if_needed(|path| { + .do_load(|path| { Ok(features::OwnShared::new(git_pack::data::File::at(path).map_err( |err| match err { git_odb::data::header::decode::Error::Io { source, .. 
} => { From a5a6a783750657721cfd997fda466d2a2a03301d Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 18:18:40 +0800 Subject: [PATCH 56/57] realize that pack lookup needs the marker or wrong packs might be returned (#259) --- experiments/odb-redesign/src/lib.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index a5c340f7505..5a18b676b61 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -183,6 +183,8 @@ mod odb { } } + /// We do it like this as we first have to check for a loaded interior in read-only mode, and then upgrade + /// when we know that loading is necessary. This also works around borrow check, which is a nice coincidence. pub fn do_load(&mut self, load: impl FnOnce(&Path) -> io::Result) -> io::Result> { use OnDiskFileState::*; match &mut self.state { @@ -468,22 +470,25 @@ mod odb { policy::IndexAndPacks::Index(bundle) => Ok(bundle .data .do_load(|path| { - Ok(features::OwnShared::new(git_pack::data::File::at(path).map_err( + git_pack::data::File::at(path).map(features::OwnShared::new).map_err( |err| match err { git_odb::data::header::decode::Error::Io { source, .. } => { source } other => std::io::Error::new(std::io::ErrorKind::Other, other), }, - )?)) + ) })? .cloned()), _ => unreachable!(), } } }, + // This can also happen if they use an old index into our new and refreshed data which might have a multi-index + // here. policy::IndexAndPacks::MultiIndex(_) => Ok(None), }, + // This can happen if we condense our data, returning None tells the caller to refresh their indices None => Ok(None), } } From a14e4d0c7f530a0fb1b32642a4b5b75cdb10c65f Mon Sep 17 00:00:00 2001 From: Sebastian Thiel Date: Thu, 25 Nov 2021 18:25:50 +0800 Subject: [PATCH 57/57] A good way to avoid using a potentially wrong pack (#259) --- experiments/odb-redesign/src/lib.rs | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/experiments/odb-redesign/src/lib.rs b/experiments/odb-redesign/src/lib.rs index 5a18b676b61..b1a553f87f1 100644 --- a/experiments/odb-redesign/src/lib.rs +++ b/experiments/odb-redesign/src/lib.rs @@ -251,6 +251,12 @@ mod odb { } impl State { + pub(crate) fn marker(&self) -> PackIndexMarker { + PackIndexMarker { + generation: self.generation, + pack_index_sequence: self.files.len(), + } + } pub(crate) fn snapshot(&self) -> StateInformation { let mut open_packs = 0; let mut open_indices = 0; @@ -325,6 +331,7 @@ mod odb { } /// A way to indicate which pack indices we have seen already + #[derive(Copy, Clone)] pub struct PackIndexMarker { /// The generation the `pack_index_sequence` belongs to. Indices of different generations are completely incompatible. pub(crate) generation: u8, @@ -452,17 +459,19 @@ mod odb { /// If the oid is known, just load indices again to continue /// (objects rarely ever removed so should be present, maybe in another pack though), /// and redo the entire lookup for a valid pack id whose pack can probably be loaded next time. + /// The caller has to check the generation of the returned pack and compare it with their last generation, + /// reloading the indices and retrying if it doesn't match. 
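+        /// A caller might hold on to the marker of its last index refresh and react like this
+        /// (a sketch of the intended contract, not code in this experiment):
+        ///
+        /// ```ignore
+        /// match store.load_pack(id)? {
+        ///     Some((marker, pack)) if marker.generation == last_seen.generation => use_pack(pack),
+        ///     _ => { /* reload indices, re-resolve the pack id, then retry */ }
+        /// }
+        /// ```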
pub(crate) fn load_pack( &self, id: policy::PackId, - ) -> std::io::Result>> { + ) -> std::io::Result)>> { match id.multipack_index { None => { let state = get_ref_upgradeable(&self.state); match state.files.get(id.index) { Some(f) => match f { policy::IndexAndPacks::Index(bundle) => match bundle.data.loaded() { - Some(pack) => Ok(Some(pack.clone())), + Some(pack) => Ok(Some((state.marker(), pack.clone()))), None => { let mut state = upgrade_ref_to_mut(state); let f = &mut state.files[id.index]; @@ -479,7 +488,8 @@ mod odb { }, ) })? - .cloned()), + .cloned() + .map(|f| (state.marker(), f))), _ => unreachable!(), } } @@ -521,10 +531,7 @@ mod odb { load_indices::Outcome::Extend { indices: todo!("state.files[marker.pack_index_sequence..]"), drop_indices: Vec::new(), - mark: PackIndexMarker { - generation: state.generation, - pack_index_sequence: state.files.len(), - }, + mark: state.marker(), } } }