
Commit

Use vectors in prepLookups
jorisdral committed Mar 28, 2024
1 parent ded5a16 commit f3b01eb
Showing 7 changed files with 60 additions and 63 deletions.
42 changes: 9 additions & 33 deletions bench/micro/Bench/Database/LSMTree/Internal/Lookup.hs
@@ -9,7 +9,7 @@
{-# LANGUAGE TypeApplications #-}
{- HLINT ignore "Use const" -}

module Bench.Database.LSMTree.Internal.Lookup (benchmarks, analysis) where
module Bench.Database.LSMTree.Internal.Lookup (benchmarks) where

import Bench.Database.LSMTree.Internal.Run.BloomFilter (elems)
import Bench.Database.LSMTree.Internal.Run.Index.Compact
@@ -23,6 +23,8 @@ import Data.List.NonEmpty (NonEmpty (..))
import qualified Data.List.NonEmpty as NonEmpty
import Data.Maybe (fromMaybe)
import Data.Proxy (Proxy (..))
import Data.Vector (Vector)
import qualified Data.Vector as V
import Database.LSMTree.Generators (ChunkSize (..), RFPrecision (..),
UTxOKey)
import Database.LSMTree.Internal.Lookup (prepLookups)
@@ -78,40 +80,14 @@ benchmarks = bgroup "Bench.Database.LSMTree.Internal.Lookup" [
bench "Bloomfilter query" $ whnf (elems b) ks
-- the compact index is only searched for (true and false) positive
-- lookup keys
, env (pure $ filter (`Bloom.elem` b) ks) $ \ks' ->
bench "Compact index search" $ whnf (searches ci) ks'
, env (pure $ V.filter (`Bloom.elem` b) ks) $ \ks' ->
bench "Compact index search" $ whnf (searches ci) ks'
-- All prepped lookups are going to be used eventually so we use
-- @nf@ on the result of 'prepLookups' to ensure that we actually
-- compute the full list.
, bench "In-memory lookup" $ nf (prepLookups [((), b, ci)]) ks
, bench "In-memory lookup" $ nf (prepLookups (V.singleton ((), b, ci))) ks
]
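For context, not part of this commit: a minimal, self-contained sketch of the Criterion env/whnf/nf pattern the group above relies on. All names and data below are illustrative only; 'whnf' is enough when the measured function already folds down to a small value (as 'elems' and 'searches' do), while 'nf' forces the full result, which is what the 'prepLookups' benchmark wants for its returned vector.

import Criterion.Main (bench, bgroup, defaultMain, env, nf, whnf)
import Data.Foldable (foldl')
import qualified Data.Vector as V

-- Force each element to WHNF and discard it, mirroring the shape of
-- 'elems' and 'searches' used in these benchmarks.
forceAll :: V.Vector Int -> ()
forceAll = foldl' (\acc x -> x `seq` acc) ()

main :: IO ()
main = defaultMain
  [ bgroup "sketch"
      [ env (pure (V.enumFromTo 1 (10000 :: Int))) $ \xs ->
          -- the result of 'forceAll' is already (), so WHNF is enough
          bench "fold to whnf" $ whnf forceAll xs
      , env (pure (V.enumFromTo 1 (10000 :: Int))) $ \xs ->
          -- the full result vector is forced with 'nf'
          bench "map to nf" $ nf (V.map (+ 1)) xs
      ]
  ]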

{-------------------------------------------------------------------------------
Analysis
-------------------------------------------------------------------------------}

-- In this analysis, around @15%@ to @20%@ of the measured time for
-- 'prepLookups' is not accounted for by bloom filter queries and compact index
-- searches.
analysis :: IO ()
analysis = do
-- (name, bloomfilter query, compact index search, prepLookups)
let def = ("default", 1.722 , 0.966 , 3.294)
onlyPos = ("onlyPos", 0.9108, 0.8873 , 2.139)
onlyNeg = ("onlyNeg", 0.6784, 0.009573, 0.8683)
highFpr = ("highFpr", 1.155 , 1.652 , 3.417)
small = ("small" , 0.1602, 0.06589 , 0.2823)

results :: [(String, Double, Double, Double)]
results = [def, onlyPos, onlyNeg, highFpr, small]

forM_ results $ \(name, query, search, prep) -> do
-- the measured time for 'prepLookups' should be close to the time spent on
-- bloom filter queries and compact index searches
let diff = prep - (query + search)
diffPercent = diff / prep
print (name, query, search, prep, diff, diffPercent)

{-------------------------------------------------------------------------------
Environments
-------------------------------------------------------------------------------}
@@ -140,7 +116,7 @@ data Config = Config {

defaultConfig :: Config
defaultConfig = Config {
name = "default"
name = "defaulty"
, rfprecDef = Nothing
, csize = ChunkSize 100
, npages = 50_000
@@ -156,15 +132,15 @@ prepLookupsEnv ::
forall k. (Ord k, Uniform k, SerialiseKey k)
=> Proxy k
-> Config
-> IO (Bloom SerialisedKey, CompactIndex, [SerialisedKey])
-> IO (Bloom SerialisedKey, CompactIndex, Vector SerialisedKey)
prepLookupsEnv _ Config {..} = do
(storedKeys, lookupKeys) <- lookupsEnv @k (mkStdGen 17) totalEntries npos nneg
let b = Bloom.fromList fpr $ fmap serialiseKey storedKeys
ps = mkPages rfprec $ NonEmpty.fromList storedKeys
ps' = fmap serialiseKey ps
ps'' = fromPage <$> getPages ps'
ci = constructCompactIndex csize (rfprec, ps'')
pure (b, ci, fmap serialiseKey lookupKeys)
pure (b, ci, serialiseKey <$> V.fromList lookupKeys)
where
totalEntries = npages * npageEntries
rfprec = RFPrecision $
@@ -14,6 +14,8 @@ import qualified Data.BloomFilter.Easy as Bloom.Easy
import Data.Foldable (Foldable (..))
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Vector (Vector)
import qualified Data.Vector as V
import Database.LSMTree.Extras
import Database.LSMTree.Generators
import Database.LSMTree.Internal.Run.BloomFilter as Bloom
@@ -52,18 +54,18 @@ elemEnv ::
-> Int -- ^ Number of entries in the bloom filter
-> Int -- ^ Number of positive lookups
-> Int -- ^ Number of negative lookups
-> IO (Bloom SerialisedKey, [SerialisedKey])
-> IO (Bloom SerialisedKey, Vector SerialisedKey)
elemEnv fpr nbloom nelemsPositive nelemsNegative = do
stdgen <- newStdGen
stdgen' <- newStdGen
let (xs, ys1) = splitAt nbloom
$ uniformWithoutReplacement @UTxOKey stdgen (nbloom + nelemsNegative)
ys2 = sampleUniformWithReplacement @UTxOKey stdgen' nelemsPositive xs
zs <- generate $ shuffle (ys1 ++ ys2)
pure (Bloom.Easy.easyList fpr (fmap serialiseKey xs), fmap serialiseKey zs)
pure (Bloom.Easy.easyList fpr (fmap serialiseKey xs), V.fromList $ fmap serialiseKey zs)

-- | Used for benchmarking 'Bloom.elem'.
elems :: Bloom.Hashable a => Bloom a -> [a] -> ()
elems :: Bloom.Hashable a => Bloom a -> Vector a -> ()
elems b xs = foldl' (\acc x -> Bloom.elem x b `seq` acc) () xs

-- | Input environment for benchmarking 'constructBloom'.
@@ -13,6 +13,8 @@ import Control.DeepSeq (deepseq)
import Control.Monad.ST (runST)
import Criterion.Main
import Data.Foldable (Foldable (..))
import Data.Vector (Vector)
import qualified Data.Vector as V
import Database.LSMTree.Generators
import Database.LSMTree.Internal.Run.Index.Compact
import Database.LSMTree.Internal.Run.Index.Compact.Construction
@@ -44,17 +46,17 @@ searchEnv ::
RFPrecision -- ^ Range-finder bit-precision
-> Int -- ^ Number of pages
-> Int -- ^ Number of searches
-> IO (CompactIndex, [SerialisedKey])
-> IO (CompactIndex, Vector SerialisedKey)
searchEnv rfprec npages nsearches = do
ci <- constructCompactIndex 100 <$> constructionEnv rfprec npages
stdgen <- newStdGen
let ks = serialiseKey <$> uniformWithReplacement @UTxOKey stdgen nsearches
pure (ci, ks)
pure (ci, V.fromList ks)

-- | Used for benchmarking 'search'.
searches ::
CompactIndex
-> [SerialisedKey] -- ^ Keys to search for
-> Vector SerialisedKey -- ^ Keys to search for
-> ()
searches ci ks = foldl' (\acc k -> search k ci `deepseq` acc) () ks
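As an aside, not from the repository: 'deepseq' is used here rather than 'seq' because 'seq' only evaluates a value to its outermost constructor, so a structured search result could keep unevaluated fields and the benchmark would understate the real work. A toy illustration with a hypothetical two-field type:

import Control.DeepSeq (NFData (..), deepseq)

-- Stand-in for a small structured result.
data Pair = Pair Int Int

instance NFData Pair where
  rnf (Pair a b) = a `seq` b `seq` ()

shallow :: ()
shallow = Pair (1 + 1) (2 + 2) `seq` ()      -- only the constructor is forced

deep :: ()
deep = Pair (1 + 1) (2 + 2) `deepseq` ()     -- the fields are forced as well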

1 change: 1 addition & 0 deletions lsm-tree.cabal
@@ -312,6 +312,7 @@ benchmark lsm-tree-micro-bench
, primitive
, QuickCheck
, random
, vector

ghc-options: -threaded

40 changes: 25 additions & 15 deletions src/Database/LSMTree/Internal/Lookup.hs
@@ -1,13 +1,18 @@
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE TupleSections #-}

module Database.LSMTree.Internal.Lookup (
Run
, prepLookups
) where

import Data.Foldable (Foldable (..))
import Data.Vector (Vector)
import qualified Data.Vector as V
-- import qualified Data.Vector.Unboxed as VU
import Database.LSMTree.Internal.Run.BloomFilter (Bloom)
import qualified Database.LSMTree.Internal.Run.BloomFilter as Bloom
import Database.LSMTree.Internal.Run.Index.Compact (CompactIndex,
PageSpan)
PageSpan, SearchResult)
import qualified Database.LSMTree.Internal.Run.Index.Compact as Index
import Database.LSMTree.Internal.Serialise

@@ -16,17 +21,22 @@ type Run fd = (fd, Bloom SerialisedKey, CompactIndex)

-- | Prepare disk lookups by doing bloom filter queries and index searches.
--
-- Note: results are grouped by key instead of file descriptor, because this
-- means that results for a single key are close together.
prepLookups :: [Run fd] -> [SerialisedKey] -> [(SerialisedKey, (fd, PageSpan))]
prepLookups runs ks =
[ (k, (fd, pspan))
| k <- ks
, r@(fd,_,_) <- runs
, pspan <- toList (prepLookup r k)
]
-- Note: results are tagged with a 'KeyIx', an index into the 'SerialisedKey'
-- vector.
prepLookups :: Vector (Run fd) -> Vector SerialisedKey -> Vector (KeyIx, fd, PageSpan)
prepLookups runs ks = V.concatMap f runs
where
f r@(fd,_,_) =
let x = bloomBatch r ks
y = searchBatch r ks x
in V.imapMaybe (\i sres -> (i, fd,) <$> Index.toPageSpan sres) y

type KeyIx = Int

bloomBatch :: Run fd -> Vector SerialisedKey -> Vector Bool
bloomBatch (_fd, b, _ix) = V.map (`Bloom.elem` b)

prepLookup :: Run fd -> SerialisedKey -> Maybe PageSpan
prepLookup (_fd, b, fpix) k
| Bloom.elem k b = Index.toPageSpan $ Index.search k fpix
| otherwise = Nothing
searchBatch :: Run fd -> Vector SerialisedKey -> Vector Bool -> Vector SearchResult
searchBatch (_fd, _, cix) = V.zipWith f
where f k b | b = Index.search k cix
| otherwise = Index.NoResult
4 changes: 2 additions & 2 deletions src/Database/LSMTree/Internal/Run/Index/Compact.hs
@@ -508,8 +508,8 @@ data SearchResult =

-- | A span of pages, representing an inclusive interval of page numbers.
data PageSpan = PageSpan {
pageSpanStart :: PageNo
, pageSpanEnd :: PageNo
pageSpanStart :: !PageNo
, pageSpanEnd :: !PageNo
}

toPageSpan :: SearchResult -> Maybe PageSpan
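The added bangs make both page numbers strict fields, so they are evaluated when a 'PageSpan' is constructed rather than stored as thunks, which keeps vectors of spans from holding on to suspended computations and makes the 'deepseq' in the index benchmarks cheap. A generic illustration with made-up types, not taken from the repository:

-- Strict fields ('!') are standard Haskell; the difference is only in
-- when the field values get evaluated.
data LazySpan   = LazySpan   { lStart ::  Int, lEnd ::  Int }
data StrictSpan = StrictSpan { sStart :: !Int, sEnd :: !Int }

lazySpan :: LazySpan
lazySpan = LazySpan (1 + 1) (2 + 2)      -- fields remain thunks until demanded

strictSpan :: StrictSpan
strictSpan = StrictSpan (1 + 1) (2 + 2)  -- fields are evaluated here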
20 changes: 13 additions & 7 deletions test/Test/Database/LSMTree/Internal/Lookup.hs
@@ -17,12 +17,14 @@ import qualified Data.ByteString as BS
import qualified Data.ByteString.Builder as BB
import qualified Data.ByteString.Short as SBS
import Data.Coerce (coerce)
import qualified Data.Foldable as F
import Data.Map.Strict (Map)
import qualified Data.Map.Strict as Map
import Data.Maybe
import Data.Primitive.ByteArray (ByteArray (ByteArray),
sizeofByteArray)
import qualified Data.Set as Set
import qualified Data.Vector as V
import qualified Data.Vector.Primitive as P
import Data.Word
import Database.LSMTree.Generators
Expand All @@ -48,13 +50,12 @@ import Test.Util.Orphans ()
import Test.Util.QuickCheck as Util.QC

tests :: TestTree
tests = testGroup "Test.Database.LSMTree.Internal.Integration" [
tests = testGroup "Test.Database.LSMTree.Internal.Lookup" [
testGroup "With multi-page values" [
testGroup "InMemLookupData" $
prop_arbitraryAndShrinkPreserveInvariant (deepseqInvariant @(InMemLookupData SerialisedKey SerialisedValue))
, localOption (QuickCheckMaxSize 1000) $
testProperty "prop_inMemRunLookupAndConstruction" prop_inMemRunLookupAndConstruction

]
, testGroup "Without multi-page values" [
testGroup "InMemLookupData" $
@@ -71,6 +72,9 @@ tests = testGroup "Test.Database.LSMTree.Internal.Integration" [
genNoMultiPage = liftArbitrary arbitrary
shrinkNoMultiPage = liftShrink shrink

conjoinF :: (Testable prop, Foldable f) => f prop -> Property
conjoinF = conjoin . F.toList

-- | Construct a run incrementally, then test a number of positive and negative lookups.
prop_inMemRunLookupAndConstruction :: InMemLookupData SerialisedKey SerialisedValue -> Property
prop_inMemRunLookupAndConstruction dat =
@@ -80,7 +84,7 @@ prop_inMemRunLookupAndConstruction dat =
$ tabulateNumKeyEntryPairs
$ tabulateNumPages
$ tabulateNumLookups
$ conjoin (fmap checkMaybeInRun keysMaybeInRun) .&&. conjoin (fmap checkNotInRun keysNotInRun)
$ conjoinF (fmap checkMaybeInRun keysMaybeInRun) .&&. conjoinF (fmap checkNotInRun keysNotInRun)
where
InMemLookupData{runData, lookups} = dat

@@ -91,10 +95,12 @@ prop_inMemRunLookupAndConstruction dat =
tabulateNumLookups = tabulate "Number of lookups" [showPowersOf10 (length lookups)]

run = mkTestRun runData
keys = V.fromList lookups
-- prepLookups says that a key /could be/ in the given page
keysMaybeInRun = prepLookups [run] lookups
keyixsMaybeInRun = prepLookups (V.singleton run) keys
keysMaybeInRun = V.map (\(kix, ps, pspan) -> (keys V.! kix, ps, pspan)) keyixsMaybeInRun
-- prepLookups says that a key /is definitely not/ in the given page
keysNotInRun = Set.toList (Set.fromList lookups Set.\\ Set.fromList (fmap fst keysMaybeInRun))
keysNotInRun = Set.toList (Set.fromList lookups Set.\\ Set.fromList (V.toList (V.map (\(k, _, _) -> k) keysMaybeInRun)))

-- Check that a key /is definitely not/ in the given page.
checkNotInRun :: SerialisedKey -> Property
@@ -105,8 +111,8 @@ prop_inMemRunLookupAndConstruction dat =
test = Nothing

-- | Check that a key /could be/ in the given page
checkMaybeInRun :: (SerialisedKey, (Map Int RawPage, PageSpan)) -> Property
checkMaybeInRun (k, (ps, PageSpan (PageNo i) (PageNo j)))
checkMaybeInRun :: (SerialisedKey, Map Int RawPage, PageSpan) -> Property
checkMaybeInRun (k, ps, PageSpan (PageNo i) (PageNo j))
| i <= j = tabulate "PageSpan size" [showPowersOf10 $ j - i + 1]
$ tabulate1Pre (classifyBin (isJust truth) True)
$ truth === test
