WIP

IntersectMBO · Apr 25, 2024 · a429b2d · a429b2d
1 parent 431901b
commit a429b2d
Show file tree

Hide file tree

Showing 4 changed files with 327 additions and 2 deletions.
diff --git a/bench/macro/Bench/Database/LSMTree/Internal/BloomFilter.hs b/bench/macro/Bench/Database/LSMTree/Internal/BloomFilter.hs
@@ -6,6 +6,7 @@ module Bench.Database.LSMTree.Internal.BloomFilter (
     benchmarks
   ) where
 
+import           Control.DeepSeq
 import           Control.Exception (evaluate)
 import           Control.Monad
 import           Control.Monad.ST
@@ -76,7 +77,7 @@ benchmarks = do
 
     putStrLn "Generating bloom filters..."
     let rng0 = mkStdGen 42
-    vbs <- elemManyEnv filterSizes rng0
+    !vbs <- elemManyEnv filterSizes rng0
     putStrLn " finished."
     putStrLn ""
 
@@ -233,7 +234,8 @@ elemManyEnv filterSizes rng0 =
            (cycle [ mb'
                   | (mb, (_, sizeFactor, _, _)) <- zip mbs filterSizes
                   , mb' <- replicate sizeFactor mb ]))
-    Vector.fromList <$> mapM Bloom.unsafeFreeze mbs
+    vbs <- Vector.fromList <$> mapM Bloom.unsafeFreeze mbs
+    pure $!! vbs
 
 -- | This gives us a baseline cost of just calculating the series of keys.
 benchBaseline :: Vector (Bloom SerialisedKey) -> StdGen -> Int -> ()

diff --git a/bench/macro/Bench/Database/LSMTree/Internal/Lookup.hs b/bench/macro/Bench/Database/LSMTree/Internal/Lookup.hs
@@ -0,0 +1,317 @@
+{-# LANGUAGE BangPatterns       #-}
+{-# LANGUAGE CPP                #-}
+{-# LANGUAGE DerivingStrategies #-}
+{-# LANGUAGE NumericUnderscores #-}
+{-# LANGUAGE TypeApplications   #-}
+
+module Bench.Database.LSMTree.Internal.Lookup (
+    benchmarks
+  ) where
+
+import           Control.DeepSeq
+import           Control.Exception (evaluate)
+import           Control.Monad
+import           Data.Bits ((.&.))
+import           Data.BloomFilter (Bloom)
+import qualified Data.BloomFilter as Bloom
+import qualified Data.BloomFilter.Hash as Bloom
+import qualified Data.BloomFilter.Mutable as MBloom
+import           Data.List (foldl')
+import           Data.Proxy
+import           Data.Time
+import           Data.Vector (Vector)
+import qualified Data.Vector as V
+import qualified Data.Vector as Vector
+import qualified Data.Vector.Unboxed as VU
+import           Data.WideWord.Word256 (Word256)
+import           Data.Word (Word64)
+import           Database.LSMTree.Extras.Orphans ()
+import           Database.LSMTree.Extras.UTxO
+import           Database.LSMTree.Internal.Entry (Entry (Insert),
+                     NumEntries (..))
+import           Database.LSMTree.Internal.IndexCompact (IndexCompact)
+import           Database.LSMTree.Internal.Lookup
+import           Database.LSMTree.Internal.Monkey
+import           Database.LSMTree.Internal.Run (RunFsPaths (..))
+import qualified Database.LSMTree.Internal.RunBuilder as MRun
+import           Database.LSMTree.Internal.Serialise (SerialisedKey,
+                     serialiseKey, serialiseValue)
+import           GHC.Stats
+import           Numeric
+import qualified System.FS.API as FS
+import           System.FS.API (MountPoint (..))
+import qualified System.FS.IO as FS
+import           System.IO.Temp (withSystemTempDirectory)
+import           System.Mem (performMajorGC)
+import           System.Random
+import           Text.Printf (printf)
+
+
+-- Benchmark parameters you can tweak. The defaults are rather small, so the
+-- runtime is short and do not show the effects of data sizes no longer fitting
+-- into the CPU cache.
+
+-- | The number of entries in the filter in the smallest LSM runs is
+-- @2^benchmarkSizeBase@
+benchmarkSizeBase :: SizeBase
+benchmarkSizeBase = 16
+
+-- | The number of lookups to do. This has to be smaller than the total size of
+-- all the filters (otherwise we will not get true positive probes, which is
+-- part of the point of this benchmark).
+benchmarkNumLookups :: Int
+benchmarkNumLookups = 25_000_000
+
+benchmarkBatchSize :: BatchSize
+benchmarkBatchSize = BatchSize 64
+
+benchmarkExtBatchSize :: Int
+benchmarkExtBatchSize = 256
+
+-- A value of 0.02 results in 5 hashes and just over 8 bits per key.
+benchmarkRequestedFPR :: RequestedFPR
+benchmarkRequestedFPR = 0.02
+
+benchmarks :: IO ()
+benchmarks = do
+#ifdef NO_IGNORE_ASSERTS
+    putStrLn "BENCHMARKING A BUILD WITH -fno-ignore-asserts"
+#endif
+
+    enabled <- getRTSStatsEnabled
+    when (not enabled) $ fail "Need RTS +T statistics enabled"
+    let filterSizes = lsmStyleBloomFilters benchmarkSizeBase
+                                          benchmarkRequestedFPR
+    putStrLn "Bloom filter stats:"
+    putStrLn "(numEntries, sizeFactor, numBits, numHashFuncs)"
+    mapM_ print filterSizes
+    putStrLn $ "total number of entries:\t " ++ show (totalNumEntries filterSizes)
+    putStrLn $ "total filter size in bytes:\t " ++ show (totalNumBytes filterSizes)
+    putStrLn $ "total number of key lookups:\t " ++ show benchmarkNumLookups
+
+    unless (totalNumEntriesSanityCheck benchmarkSizeBase filterSizes) $
+      fail "totalNumEntriesSanityCheck failed"
+    unless (totalNumEntries filterSizes >= benchmarkNumLookups) $
+      fail "number of key lookups is more than number of entries"
+
+    putStrLn "Generating bloom filters..."
+    let keyRng0 = mkStdGen 42
+        valRng0 = mkStdGen 17
+    (blooms, indexes, handles, numEntriess) <- prepLookupsEnv filterSizes keyRng0 valRng0
+
+    putStrLn " finished."
+    putStrLn ""
+
+    baseline <-
+      benchmark "baseline"
+                "(This is the cost of just computing the keys.)"
+                (benchBaseline blooms keyRng0) benchmarkNumLookups
+                (0, 0)
+#ifdef NO_IGNORE_ASSERTS
+                80  -- https://gitlab.haskell.org/ghc/ghc/-/issues/24625
+#else
+                48
+#endif
+
+    hashcost <-
+      benchmark "makeCheapHashes"
+                "(This includes the cost of hashing the keys, less the cost of computing the keys)"
+                (benchMakeCheapHashes blooms keyRng0) benchmarkNumLookups
+                baseline
+                0
+
+    _ <-
+      benchmark "elemCheapHashes"
+                "(this is the simple one-by-one lookup, less the cost of computing and hashing the keys)"
+                (benchElemCheapHashes blooms keyRng0) benchmarkNumLookups
+                (fst hashcost + fst baseline, snd hashcost + snd baseline)
+                0
+
+    return ()
+
+type Alloc = Int
+
+benchmark :: String
+          -> String
+          -> (Int -> ())
+          -> Int
+          -> (NominalDiffTime, Alloc)
+          -> Int
+          -> IO (NominalDiffTime, Alloc)
+benchmark name description action n (subtractTime, subtractAlloc) expectedAlloc = do
+    putStrLn $ "Benchmarking " ++ name ++ " ... "
+    putStrLn description
+    performMajorGC
+    allocBefore <- allocated_bytes <$> getRTSStats
+    timeBefore  <- getCurrentTime
+    evaluate (action n)
+    timeAfter   <- getCurrentTime
+    performMajorGC
+    allocAfter  <- allocated_bytes <$> getRTSStats
+    putStrLn "Finished."
+    let allocTotal :: Alloc
+        timeTotal  = timeAfter `diffUTCTime` timeBefore
+        allocTotal = fromIntegral allocAfter - fromIntegral allocBefore
+        timeNet    = timeTotal - subtractTime
+        allocNet   = allocTotal - subtractAlloc
+
+        timePerKey, allocPerKey :: Double
+        timePerKey     = realToFrac timeNet / fromIntegral n
+        allocPerKey    = fromIntegral allocNet / fromIntegral n
+    let printStat :: String -> Double -> String -> IO ()
+        printStat label v unit =
+          putStrLn $ label ++ showGFloat (Just 2) v (' ':unit)
+    printStat "Time total:        " (realToFrac timeTotal) "seconds"
+    printStat "Alloc total:       " (fromIntegral allocTotal) "bytes"
+    printStat "Time net:          " (realToFrac timeNet) "seconds"
+    printStat "Alloc net:         " (fromIntegral allocNet) "bytes"
+    printStat "Time net per key:  " timePerKey "seconds"
+    printStat "Alloc net per key: " allocPerKey "bytes"
+
+    unless (truncate allocPerKey == expectedAlloc) $ do
+        printf "WARNING: expecting %d, got %d bytes allocated per key\n"
+            expectedAlloc
+            (truncate allocPerKey :: Int)
+
+    putStrLn ""
+    return (timeNet, allocNet)
+
+-- | (numEntries, sizeFactor, numBits, numHashFuncs)
+type BloomFilterSizeInfo = (Int, Int, Word64, Int)
+type SizeBase     = Int
+type RequestedFPR = Double
+
+-- | Calculate the sizes of a realistic LSM style set of Bloom filters, one
+-- for each LSM run. This uses base 4, with 4 disk levels, using tiering
+-- for internal levels and leveling for the final (biggest) level.
+--
+-- Due to the incremental merging, each level actually has (in the worst case)
+-- 2x the number of runs, hence 8 per level for tiering levels.
+--
+lsmStyleBloomFilters :: SizeBase -> RequestedFPR -> [BloomFilterSizeInfo]
+lsmStyleBloomFilters l1 requestedFPR =
+    [ (numEntries, sizeFactor, fromIntegral numBits, numHashFuncs)
+    | (numEntries, sizeFactor)
+        <- replicate 8 (2^(l1+0), 1)   -- 8 runs at level 1 (tiering)
+        ++ replicate 8 (2^(l1+2), 4)   -- 8 runs at level 2 (tiering)
+        ++ replicate 8 (2^(l1+4),16)   -- 8 runs at level 3 (tiering)
+        ++            [(2^(l1+8),256)] -- 1 run  at level 4 (leveling)
+    , let numBits      = monkeyBits numEntries requestedFPR
+          numHashFuncs = monkeyHashFuncs numBits numEntries
+    ]
+
+totalNumEntries, totalNumBytes :: [BloomFilterSizeInfo] -> Int
+totalNumEntries filterSizes =
+    sum [ numEntries | (numEntries, _, _, _) <- filterSizes ]
+
+totalNumBytes filterSizes =
+    fromIntegral $ sum [ numBits | (_,_,numBits,_) <- filterSizes ] `div` 8
+
+totalNumEntriesSanityCheck :: SizeBase -> [BloomFilterSizeInfo] -> Bool
+totalNumEntriesSanityCheck l1 filterSizes =
+    totalNumEntries filterSizes
+    ==
+    sum [ 2^l1 * sizeFactor | (_, sizeFactor, _, _) <- filterSizes ]
+
+
+-- | Input environment for benchmarking 'Bloom.elemMany'.
+--
+-- The idea here is to have a collection of bloom filters corresponding to
+-- the sizes used in a largeish LSM. In particular, the sizes are in increasing
+-- powers of 4, and the largest ones should be bigger than the CPU L3 cache.
+-- Furthermore, the keys in the filters are non-overlapping, and lookups will
+-- be true positives in only one filter. Thus most lookups will be true
+-- negatives.
+--
+-- The goal is to benchmark the benefits of optimisations for the LSM situation:
+--
+-- * where the same key is being looked up in many filters,
+-- * with a hit in only one filter expected, and
+-- * where the total size of the filters is too large to fully fit in cache
+--   (though the smaller ones may fit in the caches).
+--
+prepLookupsEnv :: [BloomFilterSizeInfo]
+                   -> StdGen
+                   -> StdGen
+                   -> IO ( V.Vector (Bloom SerialisedKey)
+                         , V.Vector IndexCompact
+                         , V.Vector (FS.Handle ())
+                         , V.Vector NumEntries
+                         )
+prepLookupsEnv filterSizes keyRng0 valRng0 =
+    withSystemTempDirectory "macro-bench-prepLookups" $ \tmpDir -> do
+      let hasFS = FS.ioHasFS (MountPoint tmpDir)
+      -- create the filters
+      mrs <- sequence
+              [ MRun.new hasFS (RunFsPaths rfp) (NumEntries numEntries) (numEntries * 42)
+              | (rfp, (numEntries, _, numBits, numHashFuncs)) <- zip [0..] filterSizes ]
+      -- add elements
+      foldM_
+        (\(keyRng, valRng) (i, mr) -> do
+          -- progress
+          when (i .&. 0xFFFF == 0) (putStr ".")
+          -- insert n elements into filter b
+          let k :: UTxOKey
+              (!k, !keyRng') = uniform keyRng
+              v :: UTxOValue
+              (!v, !valRng') = uniform valRng
+          MRun.addKeyOp hasFS mr (serialiseKey k) (Insert (serialiseValue v))
+          return (keyRng', valRng')
+        )
+        (keyRng0, valRng0)
+        (zip [0 .. totalNumEntries filterSizes - 1]
+            (cycle [ mb'
+                    | (mb, (_, sizeFactor, _, _)) <- zip mrs filterSizes
+                    , mb' <- replicate sizeFactor mb ]))
+      ress <- V.fromList <$> mapM (MRun.unsafeFinalise hasFS) mrs
+      let (_, blooms, indexes, numEntriess) = V.unzip4 ress
+          handles = V.replicate (V.length blooms) (FS.Handle () (FS.mkFsPath []))
+
+      pure $!! (blooms, indexes, handles, numEntriess)
+
+-- | This gives us a baseline cost of just calculating the series of keys.
+benchBaseline :: Vector (Bloom SerialisedKey) -> StdGen -> Int -> ()
+benchBaseline !_  !_   0 = ()
+benchBaseline !bs !rng !n =
+    let k :: Word256
+        (!k, !rng') = uniform rng
+        !k' = serialiseKey k
+     in k' `seq` benchBaseline bs rng' (n-1)
+
+-- | This gives us a combined cost of calculating the series of keys and their
+-- hashes.
+benchMakeCheapHashes :: Vector (Bloom SerialisedKey) -> StdGen -> Int -> ()
+benchMakeCheapHashes !_  !_   0 = ()
+benchMakeCheapHashes !bs !rng !n =
+    let k :: Word256
+        (!k, !rng') = uniform rng
+        !kh = Bloom.makeHashes (serialiseKey k) :: Bloom.CheapHashes SerialisedKey
+     in kh `seq` benchMakeCheapHashes bs rng' (n-1)
+
+-- | This gives us a combined cost of calculating the series of keys, their
+-- hashes, and then using 'Bloom.elemCheapHashes' with each filter.
+benchElemCheapHashes :: Vector (Bloom SerialisedKey) -> StdGen -> Int -> ()
+benchElemCheapHashes !_  !_   0  = ()
+benchElemCheapHashes !bs !rng !n =
+    let k :: Word256
+        (!k, !rng') = uniform rng
+        !kh =  Bloom.makeHashes (serialiseKey k)
+     in foldl' (\_ b -> Bloom.elemHashes kh b `seq` ()) () bs
+  `seq` benchElemCheapHashes bs rng' (n-1)
+
+-- -- | This gives us a combined cost of calculating the series of keys, their
+-- -- hashes, and then using 'Bloom.elemCheapHashes' with each filter.
+-- benchPrepLookups :: Vector (Bloom SerialisedKey) -> StdGen -> Int -> ()
+-- benchPrepLookups !_  !_   0  = ()
+-- benchPrepLookups !bs !rng !n =
+--     let (!rng1, !rng2) = split rng
+--         vks :: V.Vector UTxOKey
+--         !vks = vUniformWithReplacement' (Proxy @UTxOKey) rng1 benchmarkExtBatchSize
+--         !kh =  Bloom.makeHashes (serialiseKey k)
+--      in foldl' (\_ b -> Bloom.elemHashes kh b `seq` ()) () bs
+--   `seq` benchElemCheapHashes bs rng' (n-1)
+
+
+
+-- vUniformWithReplacement' :: Uniform a => Proxy a -> StdGen -> Int -> V.Vector a
+-- vUniformWithReplacement' _ rng0 n0 =  V.unfoldrExactN n0 uniform rng0
diff --git a/bench/macro/Main.hs b/bench/macro/Main.hs
@@ -10,10 +10,12 @@
 module Main (main) where
 
 import qualified Bench.Database.LSMTree.Internal.BloomFilter
+import qualified Bench.Database.LSMTree.Internal.Lookup
 
 import           System.IO
 
 main :: IO ()
 main = do
    hSetBuffering stdout NoBuffering
    Bench.Database.LSMTree.Internal.BloomFilter.benchmarks
+   Bench.Database.LSMTree.Internal.Lookup.benchmarks
diff --git a/lsm-tree.cabal b/lsm-tree.cabal
@@ -323,17 +323,21 @@ benchmark lsm-tree-macro-bench
   main-is:          Main.hs
   other-modules:
     Bench.Database.LSMTree.Internal.BloomFilter
+    Bench.Database.LSMTree.Internal.Lookup
     Database.LSMTree.Extras.Orphans
+    Database.LSMTree.Extras.UTxO
 
   build-depends:
     , base
+    , bytestring
     , deepseq
     , fs-api
     , lsm-tree
     , lsm-tree:bloomfilter
     , lsm-tree:fs-api-blockio
     , primitive
     , random
+    , temporary
     , time
     , vector
     , wide-word