diff --git a/lsm-tree.cabal b/lsm-tree.cabal index 3d18ab109..ca13efd7a 100644 --- a/lsm-tree.cabal +++ b/lsm-tree.cabal @@ -366,6 +366,7 @@ test-suite lsm-tree-test Test.Database.LSMTree.Internal.Serialise Test.Database.LSMTree.Internal.Serialise.Class Test.Database.LSMTree.Internal.Vector + Test.Database.LSMTree.Internal.Vector.Growing Test.Database.LSMTree.Model.Table Test.Database.LSMTree.Monoidal Test.Database.LSMTree.Normal.StateMachine diff --git a/test/Main.hs b/test/Main.hs index 6178ecb74..fba957a06 100644 --- a/test/Main.hs +++ b/test/Main.hs @@ -28,6 +28,7 @@ import qualified Test.Database.LSMTree.Internal.RunReaders import qualified Test.Database.LSMTree.Internal.Serialise import qualified Test.Database.LSMTree.Internal.Serialise.Class import qualified Test.Database.LSMTree.Internal.Vector +import qualified Test.Database.LSMTree.Internal.Vector.Growing import qualified Test.Database.LSMTree.Model.Table import qualified Test.Database.LSMTree.Monoidal import qualified Test.Database.LSMTree.Normal.StateMachine @@ -63,6 +64,7 @@ main = defaultMain $ testGroup "lsm-tree" , Test.Database.LSMTree.Internal.Serialise.tests , Test.Database.LSMTree.Internal.Serialise.Class.tests , Test.Database.LSMTree.Internal.Vector.tests + , Test.Database.LSMTree.Internal.Vector.Growing.tests , Test.Database.LSMTree.Model.Table.tests , Test.Database.LSMTree.Monoidal.tests , Test.Database.LSMTree.Normal.UnitTests.tests diff --git a/test/Test/Database/LSMTree/Internal/Vector/Growing.hs b/test/Test/Database/LSMTree/Internal/Vector/Growing.hs new file mode 100644 index 000000000..4b748e6d5 --- /dev/null +++ b/test/Test/Database/LSMTree/Internal/Vector/Growing.hs @@ -0,0 +1,222 @@ +module Test.Database.LSMTree.Internal.Vector.Growing (tests) where + +import Prelude hiding (head, length, tail) + +import Control.Category ((>>>)) +import Control.Monad.ST.Strict (ST, runST) +import qualified Data.List as List (length) +import Data.Vector (Vector, toList) +import Database.LSMTree.Internal.Vector.Growing (GrowingVector, + append, freeze, new) +import Test.QuickCheck (Arbitrary (arbitrary, shrink), Gen, + NonNegative (NonNegative, getNonNegative), + Positive (Positive), Property, Small (Small), Testable, + chooseInt, coverTable, shrinkList, shrinkMap, shrinkMapBy, + sized, tabulate, (===)) +import Test.Tasty (TestTree, testGroup) +import Test.Tasty.QuickCheck (testProperty) + +-- * Tests + +tests :: TestTree +tests = testGroup "Test.Database.LSMTree.Internal.Vector.Growing" $ + [ + testProperty "Final vector is correct" + prop_finalVectorIsCorrect + ] + +-- * Utilities + +-- ** Segments + +data Segment a = Replicate Int a deriving stock Show + +segmentLength :: Segment a -> Int +segmentLength (Replicate count _) = max 0 count + +segmentToList :: Segment a -> [a] +segmentToList (Replicate count val) = replicate count val + +appendSegment :: GrowingVector s a -> Segment a -> ST s () +appendSegment vec (Replicate count val) = append vec count val + +-- ** Vector construction + +{- + Constructs an ordinary vector by creating a growing vector, appending + segments to it, and finally freezing it. +-} +finalVector :: Int -- Initial buffer size + -> [Segment a] -- Segments + -> Vector a -- Final vector +finalVector initialBufferSize segments = runST $ do + vec <- new initialBufferSize + mapM_ (appendSegment vec) segments + freeze vec + +{- + Supplies the final contents of a growing vector for constructing a property. + + The resulting property additionally provides information about the + occurrence of buffer size scaling exponents, where a buffer size scaling + exponent is the binary logarithm of the ratio between the buffer size after + and before appending a segment. If buffer enlargement for an append would + happen in cycles where in each cycle the buffer size is doubled, then the + buffer size scaling exponent would be equal to the number of such cycles. + Therefore, a buffer size scaling exponent tells if the buffer is enlarged + and, if yes, how much. +-} +withFinalVector + :: Testable prop + => Int -- Initial buffer size + -> [Segment a] -- Segments + -> (Vector a -> prop) -- Property from final vector + -> Property -- Resulting property +withFinalVector initialBufferSize segments consumer + = coverTable + "Buffer size scaling exponents" + [(show scalingExp, 10) | scalingExp <- [0 .. 3] :: [Int]] + $ + tabulate + "Buffer size scaling exponents" + (show <$> bufferSizeScalingExponents availableBufferSizes 0 segments) + $ + consumer (finalVector initialBufferSize segments) + where + + availableBufferSizes :: [Int] + availableBufferSizes = iterate (* 2) initialBufferSize + {- + We do not need to account for overflow here, because the lengths of the + vectors we construct are small compared to @maxBound :: Int@. + -} + + bufferSizeScalingExponents :: [Int] -> Int -> [Segment a] -> [Int] + bufferSizeScalingExponents _ _ [] + = [] + bufferSizeScalingExponents availableSizes length (head : tail) + = List.length insufficient : + bufferSizeScalingExponents sufficient length' tail + where + + length' :: Int + !length' = length + segmentLength head + + insufficient, sufficient :: [Int] + (insufficient, sufficient) = span (< length') availableSizes + +-- * Properties to test + +prop_finalVectorIsCorrect :: Positive (Small Int) + -> Segments Integer + -> Property +prop_finalVectorIsCorrect (Positive (Small initialBufferSize)) + (Segments segments) + = withFinalVector initialBufferSize segments $ \ vec -> + toList vec === concatMap segmentToList segments + +-- * Test case generation and shrinking + +generateSegment :: Arbitrary a => Int -> Gen (Segment a) +generateSegment maxCount = Replicate <$> chooseInt (0, maxCount) <*> arbitrary + +shrinkSegment :: Arbitrary a => Segment a -> [Segment a] +shrinkSegment (Replicate count val) + = [Replicate count' val | count' <- shrinkNat count] ++ + [Replicate count val' | val' <- shrink val] + where + + shrinkNat :: Int -> [Int] + shrinkNat = shrinkMap getNonNegative NonNegative + +{- + A list of segments to be appended to an initially empty vector. 'Segments a' + is isomorphic to '[Segment a]' but has a special way of generating test + cases, which avoids skewing with respect to buffer size scaling exponents. + + The key issue of segment list generation is how to generate the length of an + individual segment. It seems worthwhile to randomly pick a natural number + smaller than or equal to some maximum length, using a uniform distribution. + The question to be answered is how to choose the maximum lengths for the + different segments. + + A naïve approach is to use a common maximum length for all segments in a + particular list, possibly computed from the size parameter. However, this + results in most appends not enlarging the buffer, meaning that the buffer + size scaling exponent is zero in most cases. This is because, as the buffer + size increases, the number of elements needed to cause the next buffer + enlargement increases too. + + Note that it does not help much to choose the maximum segment length large + compared to the initial buffer size. If this is done, then the buffer is + enlarged very quickly, likely during the first append, and the distributions + of buffer size scaling exponents up to this first enlargement may actually + be very good. However, after the first enlargement, the buffer size is + typically of the same order of magnitude as the maximum segment length, so + that the buffer size scaling exponents of the following appends are likely + to be zero or close to it, with further buffer enlargements making the + situation only worse. + + A better idea is to choose the maximum length of each segment but the first + proportional to the length of the vector just before appending it (for the + first segment, such a choice is not appropriate, because it would lead to a + maximum length of zero). With this approach, the distribution of buffer size + scaling exponents stays roughly the same as soon as the buffer has been + enlarged for the first time, because then the ratio between the current + buffer size and the current vector length is always in the interval \([1, + 2)\) and thus varies only slightly. Reasonable distributions of buffer size + scaling exponents can be achieved by choosing the ratio between maximum + segment lengths and corresponding current-vector lengths appropriately. + + A downside of this approach is that it requires tracking the current vector + length. This tracking can be avoided by considering only the /expected/ + current vector length and choosing the maximum segment length proportional + to it. The expected length of a vector is the sum of the expected lengths of + the segments constituting it, and the expected length of a segment is + proportional to the maximum length used for generating it. Therefore, this + modified solution boils down to choosing the maximum length of each segment + but the first proportional to the sum of the maximum lengths of the segments + preceding it. + + Note that, with the approach just set out, the maximum lengths of the + segments in a segment list almost form an exponential sequence, whose base + is determined by the chosen ratio between maximum segment lengths and + corresponding sums of maximum predecessor segment lengths. Therefore, a + similarly good, but simpler, solution is to have the maximum segment lengths + precisely form an exponential sequence. This is the approach that we use in + our segment list generation algorithm. + + The concrete exponential sequences that we employ start with the size + parameter and use 3 as the base of the exponentiation. Since initial buffer + sizes are generated using a uniform distribution with the size parameter as + the maximum, the choice of the size parameter as the first segment’s maximum + length leads to non-trivial distributions of buffer size scaling exponents + for the appends up to the one that causes the first buffer enlargement. The + choice of 3 as the exponentiation’s base leads to a good overall + distribution of buffer size scaling exponents, as experiments have shown. + + Because of the exponential growth of maximum segment lengths, vectors + constructed during testing get very large for longer segment lists. + Therefore, we do not pick the lengths of segment lists in the usual way but + instead make all segment lists have a common, small length. Concretely, we + use 9 as this common length, because this leads to vectors that are not + trivial but still consume reasonable amounts of memory. +-} +newtype Segments a = Segments [Segment a] deriving stock Show + +instance Arbitrary a => Arbitrary (Segments a) where + + arbitrary = sized $ iterate (* scalingFactor) >>> + take count >>> + mapM generateSegment >>> + fmap Segments + where + + scalingFactor :: Int + scalingFactor = 3 + + count :: Int + count = 9 + + shrink = shrinkMapBy Segments (\ (Segments segments) -> segments) $ + shrinkList shrinkSegment