From 872f315aa74cb8ff3446ab92de4a48ea3cbd63f1 Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Mon, 26 May 2025 18:07:49 +0100 Subject: [PATCH 01/11] feat: add dump-diskio-complexities --- README.md | 8 +- lsm-tree.cabal | 8 +- scripts/dump-diskio-complexities.hs | 124 ++++++++++++++++++++++++++++ 3 files changed, 132 insertions(+), 8 deletions(-) create mode 100755 scripts/dump-diskio-complexities.hs diff --git a/README.md b/README.md index 17166064f..fc249eba9 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ schedule are determined by the `TableConfig` parameters Session -Create/Open +Open N/A N/A O(1) @@ -166,7 +166,7 @@ schedule are determined by the `TableConfig` parameters Table -Create +New N/A N/A O(1) @@ -253,7 +253,7 @@ schedule are determined by the `TableConfig` parameters Cursor -Create +New LazyLevelling N/A $O(T \: \log_T \frac{n}{B})$ @@ -267,7 +267,7 @@ schedule are determined by the `TableConfig` parameters -Read next entry +Next N/A N/A $O(\frac{1}{P})$ diff --git a/lsm-tree.cabal b/lsm-tree.cabal index 9bba65a5b..892abff25 100644 --- a/lsm-tree.cabal +++ b/lsm-tree.cabal @@ -87,11 +87,11 @@ description: +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | Resource | Operation | Merge policy | Merge schedule | Worst-case disk I\/O complexity | +==========+========================+=================+=================+================================================+ - | Session | Create\/Open | N\/A | N\/A | \(O(1)\) | + | Session | Open | N\/A | N\/A | \(O(1)\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | | Close | @LazyLevelling@ | N\/A | \(O(o \: T \: \log_T \frac{n}{B})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ - | Table | Create | N\/A | N\/A | \(O(1)\) | + | Table | New | N\/A | N\/A | \(O(1)\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | | Close | @LazyLevelling@ | N\/A | \(O(T \: \log_T \frac{n}{B})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ @@ -115,11 +115,11 @@ description: +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | | List | N\/A | N\/A | \(O(s)\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ - | Cursor | Create | @LazyLevelling@ | N\/A | \(O(T \: \log_T \frac{n}{B})\) | + | Cursor | New | @LazyLevelling@ | N\/A | \(O(T \: \log_T \frac{n}{B})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | | Close | @LazyLevelling@ | N\/A | \(O(T \: \log_T \frac{n}{B})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ - | | Read next entry | N\/A | N\/A | \(O(\frac{1}{P})\) | + | | Next | N\/A | N\/A | \(O(\frac{1}{P})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ (*The variable \(b\) refers to the number of entries retrieved by the range lookup.) diff --git a/scripts/dump-diskio-complexities.hs b/scripts/dump-diskio-complexities.hs new file mode 100755 index 000000000..069788714 --- /dev/null +++ b/scripts/dump-diskio-complexities.hs @@ -0,0 +1,124 @@ +#!/usr/bin/env cabal +{- cabal: +build-depends: + , base >=4.16 + , bytestring >=0.11 + , Cabal-syntax ^>=3.10 || ^>=3.12 + , cassava ^>=0.5 + , pandoc ^>=3.6.4 + , pandoc-types ^>=1.23.1 + , text >=2.1 +-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} + +module Main (main) where + +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as BSL (putStr) +import Data.Csv (Header, NamedRecord, ToField (..), + ToNamedRecord (..), encodeByName, header, namedRecord, + (.=)) +import Data.IORef (atomicModifyIORef, newIORef) +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as TIO +import Data.Traversable (for) +import Debug.Trace (traceShow) +import Distribution.PackageDescription.Parsec + (parseGenericPackageDescriptionMaybe) +import qualified Distribution.Types.GenericPackageDescription as GenericPackageDescription +import qualified Distribution.Types.PackageDescription as PackageDescription +import Distribution.Utils.ShortText (fromShortText) +import System.IO (hPutStrLn, stderr) +import Text.Pandoc (runIOorExplode) +import Text.Pandoc.Definition (Block (..), Inline (..), Row (..), + TableBody (..)) +import Text.Pandoc.Extensions (getDefaultExtensions) +import Text.Pandoc.Options (ReaderOptions (..), WriterOptions (..), + def) +import Text.Pandoc.Readers (readHaddock) +import Text.Pandoc.Walk (Walkable (query)) + + +main :: IO () +main = do + let lsmTreeCabalFile = "lsm-tree.cabal" + lsmTreeCabalContent <- BS.readFile lsmTreeCabalFile + case parseGenericPackageDescriptionMaybe lsmTreeCabalContent of + Nothing -> hPutStrLn stderr $ "error: Could not parse '" <> lsmTreeCabalFile <> "'" + Just genericPackageDescription -> do + let packageDescription = GenericPackageDescription.packageDescription genericPackageDescription + let description = T.pack . fromShortText $ PackageDescription.description packageDescription + doc <- runIOorExplode $ readHaddock def description + -- Get the disk I/O complexity table + let diskIOComplexityTable = query dumpDiskIOComplexityTable doc + resourceRef <- newIORef "" + operationsRef <- newIORef "" + entries <- + fmap concat . for diskIOComplexityTable $ \row -> do + let fullRow = replicate (5 - length row) "" <> row + let [newResource, newOperations, mergePolicy, mergeSchedule, rawWorstCaseDiskIOComplexity] = fullRow + resource <- atomicModifyIORef resourceRef (merge newResource) + operations <- atomicModifyIORef operationsRef (merge newOperations) + let worstCaseDiskIOComplexity = T.dropWhileEnd (`elem`[' ','*']) rawWorstCaseDiskIOComplexity + for (T.splitOn "/" operations) $ \operation -> do + let function + | (resource == "Table" || resource == "Cursor") && operation `notElem` ["New", "Close"] = toCamel (T.splitOn " " operation) + | resource == "Snapshot" && operation == "List" = "listSnapshots" -- plural + | otherwise = toCamel [operation, resource] + pure $ DiskIOComplexityEntry (function, mergePolicy, mergeSchedule, worstCaseDiskIOComplexity) + let csvData = encodeByName diskIOComplexityHeader entries + BSL.putStr csvData + +-------------------------------------------------------------------------------- +-- Helper functions +-------------------------------------------------------------------------------- + +newtype DiskIOComplexityEntry = DiskIOComplexityEntry (Text, Text, Text, Text) + +diskIOComplexityHeader :: Header +diskIOComplexityHeader = + header + ["Function", "Merge policy", "Merge schedule", "Worst-case disk I/O complexity"] + +instance ToNamedRecord DiskIOComplexityEntry where + toNamedRecord (DiskIOComplexityEntry (function, mergePolicy, mergeSchedule, worstCaseDiskIOComplexity)) = + namedRecord + [ "Function" .= toField function + , "Merge policy" .= toField mergePolicy + , "Merge schedule" .= toField mergeSchedule + , "Worst-case disk I/O complexity" .= toField worstCaseDiskIOComplexity + ] + +contents :: Walkable Block a => a -> [Text] +contents = query forBlock + where + forBlock :: Block -> [Text] + forBlock block = [T.unwords (query forInline block)] + forInline :: Inline -> [Text] + forInline (Code _ text) = [text] + forInline (Math _ text) = [text] + forInline (Str text) = [text] + forInline _ = [] + +diskIOComplexityTableHeader :: [Text] +diskIOComplexityTableHeader = + ["Resource", "Operation", "Merge policy", "Merge schedule", "Worst-case disk I/O complexity"] + +dumpDiskIOComplexityTable :: Block -> [[Text]] +dumpDiskIOComplexityTable table@(Table _attr _caption _cols tableHead tableBodies _tableFoot) + | contents tableHead == diskIOComplexityTableHeader = + [ [T.unwords (contents cell) | cell <- cells] + | TableBody _ _ _ rows <- tableBodies + , Row _ cells <- rows + ] +dumpDiskIOComplexityTable _ = [] + +merge :: Text -> Text -> (Text, Text) +merge new old = let res = if T.null new then old else new in (res, res) + +toCamel :: [Text] -> Text +toCamel [] = mempty +toCamel (x : xs) = T.toLower x <> T.concat (T.toTitle <$> xs) From 2c02aaed54f561b7675c01c5894c45bb2598a121 Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Tue, 27 May 2025 17:05:13 +0100 Subject: [PATCH 02/11] fix: move both scripts into diskio-complexities directory --- .../dump-from-package-description.hs} | 0 .../from-hs.sh => diskio-complexities/dump-from-source.sh} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename scripts/{dump-diskio-complexities.hs => diskio-complexities/dump-from-package-description.hs} (100%) rename scripts/{time-complexities/from-hs.sh => diskio-complexities/dump-from-source.sh} (100%) diff --git a/scripts/dump-diskio-complexities.hs b/scripts/diskio-complexities/dump-from-package-description.hs similarity index 100% rename from scripts/dump-diskio-complexities.hs rename to scripts/diskio-complexities/dump-from-package-description.hs diff --git a/scripts/time-complexities/from-hs.sh b/scripts/diskio-complexities/dump-from-source.sh similarity index 100% rename from scripts/time-complexities/from-hs.sh rename to scripts/diskio-complexities/dump-from-source.sh From 0e0845d74bf295d120a5824c29ba84f1c48b7a88 Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Tue, 27 May 2025 17:05:52 +0100 Subject: [PATCH 03/11] fix: change mention of time complexity to disk I/O complexity --- Makefile | 13 +++++++++++++ scripts/diskio-complexities/dump-from-source.sh | 6 +++--- 2 files changed, 16 insertions(+), 3 deletions(-) create mode 100755 Makefile diff --git a/Makefile b/Makefile new file mode 100755 index 000000000..c8064e82e --- /dev/null +++ b/Makefile @@ -0,0 +1,13 @@ +SOURCE=lsm-tree.cabal $(shell find src -type f -name '*.hs') + +.PHONY: watch +watch: + fswatch -o $(SOURCE) | xargs -n1 -I{} make build + +.PHONY: build +build: $(SOURCE) + time cabal haddock lsm-tree:lib:lsm-tree --builddir="dist-newstyle/haddock" + +.PHONY: serve +serve: + python -m http.server -d "dist-newstyle/haddock/build/"*"/ghc-"*"/lsm-tree-"*"/doc/html/lsm-tree/" diff --git a/scripts/diskio-complexities/dump-from-source.sh b/scripts/diskio-complexities/dump-from-source.sh index 7649364cf..cfe2e75f8 100755 --- a/scripts/diskio-complexities/dump-from-source.sh +++ b/scripts/diskio-complexities/dump-from-source.sh @@ -44,7 +44,7 @@ newline_rs='\ printf '%s\n' 'Function,Merge policy,Merge schedule,Worst-case disk I/O complexity,Condition' sed -En -e ' - # Collect complexity information from a Haddock annotation + # Collect all disk I/O complexity information from a Haddock annotation /^\{- *\|/,/-}/ { # Store an unconditional disk I/O complexity s/'"$unconditional_re"'/,,\1,/ @@ -61,7 +61,7 @@ sed -En -e ' H d } - # Possibly fetch parameter-specific complexity information + # Possibly fetch parameter-specific disk I/O complexity information /^\['"'"'[^'"'"']+'"'"'(\\\/'"'"'[^'"'"']+'"'"')?]/ { # Construct the parameter fields s/\['"'"'([^'"'"']+)'"'"'/\1,/ @@ -69,7 +69,7 @@ sed -En -e ' s/]:$// # Get the next line N - # Store the complexity from a disk I/O complexity item + # Store the disk I/O complexity from a disk I/O complexity item s/\n *'"$o_expr_re"'\.$/,\1,/ t store # Ignore an item that is not a disk I/O complexity item From eeeeb741fd35ebd455160ae6e79a81a15d1a1e52 Mon Sep 17 00:00:00 2001 From: Wolfgang Jeltsch Date: Tue, 27 May 2025 19:47:58 +0300 Subject: [PATCH 04/11] =?UTF-8?q?Fit=20`sed`=20script=20comments=20into=20?= =?UTF-8?q?72=C2=A0columns=20again?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- scripts/diskio-complexities/dump-from-source.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/diskio-complexities/dump-from-source.sh b/scripts/diskio-complexities/dump-from-source.sh index cfe2e75f8..29152413e 100755 --- a/scripts/diskio-complexities/dump-from-source.sh +++ b/scripts/diskio-complexities/dump-from-source.sh @@ -61,7 +61,7 @@ sed -En -e ' H d } - # Possibly fetch parameter-specific disk I/O complexity information + # Possibly fetch parameter-specific complexity information /^\['"'"'[^'"'"']+'"'"'(\\\/'"'"'[^'"'"']+'"'"')?]/ { # Construct the parameter fields s/\['"'"'([^'"'"']+)'"'"'/\1,/ @@ -69,7 +69,7 @@ sed -En -e ' s/]:$// # Get the next line N - # Store the disk I/O complexity from a disk I/O complexity item + # Store the complexity from a disk I/O complexity item s/\n *'"$o_expr_re"'\.$/,\1,/ t store # Ignore an item that is not a disk I/O complexity item From 2faacefcd1fc59d95486a9a16f47f73007b72951 Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Tue, 27 May 2025 17:55:38 +0100 Subject: [PATCH 05/11] fix: rename sub-directory to lint-diskio-complexities --- .../dump-from-package-description.hs | 0 .../dump-from-source.sh | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename scripts/{diskio-complexities => lint-diskio-complexities}/dump-from-package-description.hs (100%) rename scripts/{diskio-complexities => lint-diskio-complexities}/dump-from-source.sh (100%) diff --git a/scripts/diskio-complexities/dump-from-package-description.hs b/scripts/lint-diskio-complexities/dump-from-package-description.hs similarity index 100% rename from scripts/diskio-complexities/dump-from-package-description.hs rename to scripts/lint-diskio-complexities/dump-from-package-description.hs diff --git a/scripts/diskio-complexities/dump-from-source.sh b/scripts/lint-diskio-complexities/dump-from-source.sh similarity index 100% rename from scripts/diskio-complexities/dump-from-source.sh rename to scripts/lint-diskio-complexities/dump-from-source.sh From cdbcfa2da49f65a2df0ed584db393a6703db830c Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Tue, 27 May 2025 18:01:17 +0100 Subject: [PATCH 06/11] fix: replace N/A with the empty string --- .../lint-diskio-complexities/dump-from-package-description.hs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/lint-diskio-complexities/dump-from-package-description.hs b/scripts/lint-diskio-complexities/dump-from-package-description.hs index 069788714..585dc31cb 100755 --- a/scripts/lint-diskio-complexities/dump-from-package-description.hs +++ b/scripts/lint-diskio-complexities/dump-from-package-description.hs @@ -59,9 +59,11 @@ main = do entries <- fmap concat . for diskIOComplexityTable $ \row -> do let fullRow = replicate (5 - length row) "" <> row - let [newResource, newOperations, mergePolicy, mergeSchedule, rawWorstCaseDiskIOComplexity] = fullRow + let [newResource, newOperations, newMergePolicy, newMergeSchedule, rawWorstCaseDiskIOComplexity] = fullRow resource <- atomicModifyIORef resourceRef (merge newResource) operations <- atomicModifyIORef operationsRef (merge newOperations) + let mergePolicy = if newMergePolicy == "N/A" then "" else newMergePolicy + let mergeSchedule = if newMergeSchedule == "N/A" then "" else newMergeSchedule let worstCaseDiskIOComplexity = T.dropWhileEnd (`elem`[' ','*']) rawWorstCaseDiskIOComplexity for (T.splitOn "/" operations) $ \operation -> do let function From 3fc80801b2d1db32261adddea9ae7097317477ba Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Tue, 27 May 2025 18:36:15 +0100 Subject: [PATCH 07/11] fix: change emitted conditions to identifiers --- scripts/lint-diskio-complexities/dump-from-source.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/lint-diskio-complexities/dump-from-source.sh b/scripts/lint-diskio-complexities/dump-from-source.sh index 29152413e..ff3196f64 100755 --- a/scripts/lint-diskio-complexities/dump-from-source.sh +++ b/scripts/lint-diskio-complexities/dump-from-source.sh @@ -50,10 +50,10 @@ sed -En -e ' s/'"$unconditional_re"'/,,\1,/ t store # Store a disk I/O complexity for the “nothing left open” case - s/'"$nothing_left_open_re"'/,,\1,Nothing left open/ + s/'"$nothing_left_open_re"'/,,\1,NothingLeftOpen/ t store # Store a disk I/O complexity with unknown condition - s/^.*'"$o_expr_re"'.*$/,,\1,Unknown condition/ + s/^.*'"$o_expr_re"'.*$/,,\1,UnknownCondition/ t store # Note down the occurrence of a “same merge policy” restriction /'"$same_merge_policy_re"'/ { @@ -81,7 +81,7 @@ sed -En -e ' :store H x - s/\n=\n(.*)/'"$newline_rs"'\1Same merge policy'"$newline_rs"'=/ + s/\n=\n(.*)/'"$newline_rs"'\1SameMergePolicy'"$newline_rs"'=/ x # Continue with the next line d From 014fec12f79040c930ea46b2cb9dcb95fc6c2834 Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Tue, 27 May 2025 17:55:58 +0100 Subject: [PATCH 08/11] feat(scripts): add lint-diskio-complexities --- scripts/lint-diskio-complexities.hs | 186 ++++++++++++++++++++++++++++ 1 file changed, 186 insertions(+) create mode 100755 scripts/lint-diskio-complexities.hs diff --git a/scripts/lint-diskio-complexities.hs b/scripts/lint-diskio-complexities.hs new file mode 100755 index 000000000..ab57c7dd6 --- /dev/null +++ b/scripts/lint-diskio-complexities.hs @@ -0,0 +1,186 @@ +#!/usr/bin/env cabal +{- cabal: +build-depends: + , base >=4.16 && <5 + , bytestring ^>=0.11 + , cassava ^>=0.5 + , containers ^>=0.6 || ^>=0.7 || ^>=0.8 + , process ^>=1.6 + , text ^>=2.1 + , vector ^>=0.12 || ^>=0.13 +-} +{-# LANGUAGE InstanceSigs #-} +{-# LANGUAGE OverloadedRecordDot #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE TypeApplications #-} + +import Control.Applicative (Alternative (..)) +import qualified Data.ByteString.Lazy as BSL +import Data.Csv +import Data.List (zip4) +import Data.Map (Map) +import qualified Data.Map as M +import Data.Maybe (fromMaybe, isNothing) +import Data.Set (Set) +import qualified Data.Set as S +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import qualified Data.Text.IO as TIO +import Data.Traversable (for) +import Data.Vector (Vector) +import qualified Data.Vector as V +import System.Process (readProcess) + +type Function = Text +type MergePolicy = Text +type MergeSchedule = Text +type WorstCaseDiskIOComplexity = Text +type Condition = Text + +data DiskIOComplexity = DiskIOComplexity + { function :: Function + , mergePolicy :: Maybe MergePolicy + , mergeSchedule :: Maybe MergeSchedule + , worstCaseDiskIOComplexity :: WorstCaseDiskIOComplexity + , condition :: Maybe Condition + } + deriving (Eq, Show) + +looseEq :: DiskIOComplexity -> DiskIOComplexity -> Bool +dioc1 `looseEq` dioc2 = + and + [ dioc1.function == dioc2.function + , dioc1.mergePolicy == dioc2.mergePolicy + , dioc1.mergeSchedule == dioc2.mergeSchedule + , dioc1.worstCaseDiskIOComplexity == dioc2.worstCaseDiskIOComplexity + , isNothing dioc1.condition || dioc1.condition == dioc2.condition + ] + +main :: IO () +main = do + -- Get the disk I/O complexities from the package description + mapForPackageDescription <- + buildDiskIOComplexityMap . decodeDiskIOComplexities + <$> readProcess "./scripts/lint-diskio-complexities/dump-from-package-description.hs" [] "" + + -- Get the disk I/O complexities from Database.LSMTree + mapForFullApi <- + buildDiskIOComplexityMap . decodeDiskIOComplexities + <$> readProcess "./scripts/lint-diskio-complexities/dump-from-source.sh" ["./src/Database/LSMTree.hs"] "" + + -- Get the disk I/O complexities from Database.LSMTree.Simple + mapForSimpleApi <- + buildDiskIOComplexityMap . decodeDiskIOComplexities + <$> readProcess "./scripts/lint-diskio-complexities/dump-from-source.sh" ["./src/Database/LSMTree/Simple.hs"] "" + + -- Comparing Database.LSMTree.Simple to Database.LSMTree + putStrLn "Comparing Database.LSMTree.Simple to Database.LSMTree:" + diskIOComplexityComparisonSimpleToFull <- + for (concat . M.elems $ mapForSimpleApi) $ \diskIOComplexity@DiskIOComplexity{..} -> do + case M.lookup function mapForFullApi of + Nothing -> + pure [("Database.LSMTree.Simple", diskIOComplexity)] + Just fullDiskIOComplexities + | diskIOComplexity `elem` fullDiskIOComplexities -> pure [] + | otherwise -> pure (("Database.LSMTree.Simple", diskIOComplexity) : (("Database.LSMTree",) <$> fullDiskIOComplexities)) + TIO.putStrLn . prettyDiskIOComplexityTable . concat $ diskIOComplexityComparisonSimpleToFull + + -- Comparing lsm-tree.cabal to Database.LSMTree + putStrLn "Comparing lsm-tree.cabal to Database.LSMTree:" + diskIOComplexityComparisonPackageDescriptionToFull <- + for (concat . M.elems $ mapForPackageDescription) $ \diskIOComplexity@DiskIOComplexity{..} -> do + case M.lookup function mapForFullApi of + Nothing -> + pure [("lsm-tree.cabal", diskIOComplexity)] + Just fullDiskIOComplexities + | any (looseEq diskIOComplexity) fullDiskIOComplexities -> pure [] + | otherwise -> pure (("lsm-tree.cabal", diskIOComplexity) : (("Database.LSMTree",) <$> fullDiskIOComplexities)) + TIO.putStrLn . prettyDiskIOComplexityTable . concat $ diskIOComplexityComparisonPackageDescriptionToFull + +-------------------------------------------------------------------------------- +-- Helper functions +-------------------------------------------------------------------------------- + +-- | Typeset a tagged list of 'DiskIOComplexity' records as an aligned table. +prettyDiskIOComplexityTable :: [(Text, DiskIOComplexity)] -> Text +prettyDiskIOComplexityTable diskIOComplexities = + T.unlines + [ T.unwords + [ tag `padUpTo` maxTagLen + , function `padUpTo` maxFunctionLen + , condition `padUpTo` maxConditionLen + , worstCaseDiskIOComplexity `padUpTo` maxWorstCaseDiskIOComplexityLen + ] + | (tag, function, condition, worstCaseDiskIOComplexity) <- + zip4 tags functions conditions worstCaseDiskIOComplexities + ] + where + tags = fst <$> diskIOComplexities + maxTagLen = maximum (T.length <$> tags) + + functions = ((.function) . snd) <$> diskIOComplexities + maxFunctionLen = maximum (T.length <$> functions) + + conditions = (prettyCondition . snd) <$> diskIOComplexities + maxConditionLen = maximum (T.length <$> conditions) + + worstCaseDiskIOComplexities = ((.worstCaseDiskIOComplexity) . snd) <$> diskIOComplexities + maxWorstCaseDiskIOComplexityLen = maximum (T.length <$> worstCaseDiskIOComplexities) + + padUpTo :: Text -> Int -> Text + padUpTo txt len = txt <> T.replicate (len - T.length txt) " " + + prettyCondition :: DiskIOComplexity -> Text + prettyCondition DiskIOComplexity{..} = + fromMaybe "*" (unionMaybeWith slash mergePolicy (unionMaybeWith slash mergeSchedule condition)) + where + slash :: Text -> Text -> Text + x `slash` y = x <> "/" <> y + + unionMaybeWith :: (a -> a -> a) -> Maybe a -> Maybe a -> Maybe a + unionMaybeWith op (Just x) (Just y) = Just (x `op` y) + unionMaybeWith _op (Just x) Nothing = Just x + unionMaybeWith _op Nothing (Just y) = Just y + unionMaybeWith _op Nothing Nothing = Nothing + +-- | Structure vector of 'DiskIOComplexity' records into lookup table by function name. +buildDiskIOComplexityMap :: Vector DiskIOComplexity -> Map Function [DiskIOComplexity] +buildDiskIOComplexityMap = M.unionsWith (<>) . fmap toSingletonMap . V.toList + where + toSingletonMap :: DiskIOComplexity -> Map Function [DiskIOComplexity] + toSingletonMap diskIOComplexity = M.singleton diskIOComplexity.function [diskIOComplexity] + +-- | Parse CSV file into vector of 'DiskIOComplexity' records. +decodeDiskIOComplexities :: String -> Vector DiskIOComplexity +decodeDiskIOComplexities = + either error snd . decodeByName . BSL.fromStrict . TE.encodeUtf8 . T.pack + +-- | CSV file header for 'DiskIOComplexity' records. +diskIOComplexityHeader :: Header +diskIOComplexityHeader = + header ["Function", "Merge policy", "Merge schedule", "Worst-case disk I/O complexity", "Condition"] + +normaliseWorstCaseDiskIOComplexity :: WorstCaseDiskIOComplexity -> WorstCaseDiskIOComplexity +normaliseWorstCaseDiskIOComplexity = + T.replace "+ " "+" . T.replace " +" "+" . T.replace " \\" "\\" . T.replace " " " " . T.replace "\\:" "" + +-- | Parse CSV row into 'DiskIOComplexity' record. +instance FromNamedRecord DiskIOComplexity where + parseNamedRecord :: NamedRecord -> Parser DiskIOComplexity + parseNamedRecord m = do + function <- m .: "Function" + mergePolicy <- orNotApplicable (m .: "Merge policy") + mergeSchedule <- orNotApplicable (m .: "Merge schedule") + worstCaseDiskIOComplexity <- normaliseWorstCaseDiskIOComplexity <$> (m .: "Worst-case disk I/O complexity") + condition <- orNotApplicable (m .: "Condition") + pure DiskIOComplexity{..} + where + orNotApplicable :: Parser Text -> Parser (Maybe Text) + orNotApplicable pText = (emptyTextToNothing <$> pText) <|> pure Nothing + where + emptyTextToNothing :: Text -> Maybe Text + emptyTextToNothing txt = + if T.null txt then Nothing else Just txt From f0c2d64a26bd7e562f26bbd1399521a54b999465 Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Wed, 28 May 2025 13:11:42 +0100 Subject: [PATCH 09/11] fix(scripts): ensure consistent data type definitions between lint and dump scripts and move entry-to-function mapping to top-level --- .../dump-from-package-description.hs | 34 +++++++++++++------ 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/scripts/lint-diskio-complexities/dump-from-package-description.hs b/scripts/lint-diskio-complexities/dump-from-package-description.hs index 585dc31cb..b7d284958 100755 --- a/scripts/lint-diskio-complexities/dump-from-package-description.hs +++ b/scripts/lint-diskio-complexities/dump-from-package-description.hs @@ -12,6 +12,7 @@ build-depends: {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RecordWildCards #-} module Main (main) where @@ -41,6 +42,11 @@ import Text.Pandoc.Options (ReaderOptions (..), WriterOptions (..), import Text.Pandoc.Readers (readHaddock) import Text.Pandoc.Walk (Walkable (query)) +tableEntryToFunction :: Text -> Text -> Text +tableEntryToFunction resource operation + | (resource == "Table" || resource == "Cursor") && operation `notElem` ["New", "Close"] = toCamel (T.splitOn " " operation) + | resource == "Snapshot" && operation == "List" = "listSnapshots" -- plural + | otherwise = toCamel [operation, resource] main :: IO () main = do @@ -62,15 +68,12 @@ main = do let [newResource, newOperations, newMergePolicy, newMergeSchedule, rawWorstCaseDiskIOComplexity] = fullRow resource <- atomicModifyIORef resourceRef (merge newResource) operations <- atomicModifyIORef operationsRef (merge newOperations) - let mergePolicy = if newMergePolicy == "N/A" then "" else newMergePolicy - let mergeSchedule = if newMergeSchedule == "N/A" then "" else newMergeSchedule + let mergePolicy = if newMergePolicy == "N/A" then Nothing else Just newMergePolicy + let mergeSchedule = if newMergeSchedule == "N/A" then Nothing else Just newMergeSchedule let worstCaseDiskIOComplexity = T.dropWhileEnd (`elem`[' ','*']) rawWorstCaseDiskIOComplexity for (T.splitOn "/" operations) $ \operation -> do - let function - | (resource == "Table" || resource == "Cursor") && operation `notElem` ["New", "Close"] = toCamel (T.splitOn " " operation) - | resource == "Snapshot" && operation == "List" = "listSnapshots" -- plural - | otherwise = toCamel [operation, resource] - pure $ DiskIOComplexityEntry (function, mergePolicy, mergeSchedule, worstCaseDiskIOComplexity) + let function = tableEntryToFunction resource operation + pure $ DiskIOComplexity {..} let csvData = encodeByName diskIOComplexityHeader entries BSL.putStr csvData @@ -78,15 +81,26 @@ main = do -- Helper functions -------------------------------------------------------------------------------- -newtype DiskIOComplexityEntry = DiskIOComplexityEntry (Text, Text, Text, Text) +type Function = Text +type MergePolicy = Text +type MergeSchedule = Text +type WorstCaseDiskIOComplexity = Text + +data DiskIOComplexity = DiskIOComplexity + { function :: Function + , mergePolicy :: Maybe MergePolicy + , mergeSchedule :: Maybe MergeSchedule + , worstCaseDiskIOComplexity :: WorstCaseDiskIOComplexity + } + deriving (Eq, Show) diskIOComplexityHeader :: Header diskIOComplexityHeader = header ["Function", "Merge policy", "Merge schedule", "Worst-case disk I/O complexity"] -instance ToNamedRecord DiskIOComplexityEntry where - toNamedRecord (DiskIOComplexityEntry (function, mergePolicy, mergeSchedule, worstCaseDiskIOComplexity)) = +instance ToNamedRecord DiskIOComplexity where + toNamedRecord DiskIOComplexity {..} = namedRecord [ "Function" .= toField function , "Merge policy" .= toField mergePolicy From 063e6f51faedd3751423e935ebc9a8ca2d42fa11 Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Wed, 28 May 2025 13:12:03 +0100 Subject: [PATCH 10/11] fix(scripts): remap openTableFromSnapshot --- .../lint-diskio-complexities/dump-from-package-description.hs | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/lint-diskio-complexities/dump-from-package-description.hs b/scripts/lint-diskio-complexities/dump-from-package-description.hs index b7d284958..5f7770f15 100755 --- a/scripts/lint-diskio-complexities/dump-from-package-description.hs +++ b/scripts/lint-diskio-complexities/dump-from-package-description.hs @@ -45,6 +45,7 @@ import Text.Pandoc.Walk (Walkable (query)) tableEntryToFunction :: Text -> Text -> Text tableEntryToFunction resource operation | (resource == "Table" || resource == "Cursor") && operation `notElem` ["New", "Close"] = toCamel (T.splitOn " " operation) + | resource == "Snapshot" && operation == "Open" = "openTableFromSnapshot" | resource == "Snapshot" && operation == "List" = "listSnapshots" -- plural | otherwise = toCamel [operation, resource] From 09f8d1624d6b86bf10f4de7e9e7773a14de69c19 Mon Sep 17 00:00:00 2001 From: Wen Kokke Date: Wed, 28 May 2025 13:40:18 +0100 Subject: [PATCH 11/11] fix(scripts): clean up lint-diskio-complexities --- scripts/lint-diskio-complexities.hs | 147 ++++++++++++++-------------- 1 file changed, 71 insertions(+), 76 deletions(-) diff --git a/scripts/lint-diskio-complexities.hs b/scripts/lint-diskio-complexities.hs index ab57c7dd6..a6c84012e 100755 --- a/scripts/lint-diskio-complexities.hs +++ b/scripts/lint-diskio-complexities.hs @@ -17,6 +17,7 @@ build-depends: {-# LANGUAGE TypeApplications #-} import Control.Applicative (Alternative (..)) +import Control.Monad (unless) import qualified Data.ByteString.Lazy as BSL import Data.Csv import Data.List (zip4) @@ -32,33 +33,9 @@ import qualified Data.Text.IO as TIO import Data.Traversable (for) import Data.Vector (Vector) import qualified Data.Vector as V +import System.Exit (ExitCode (..), exitWith) import System.Process (readProcess) -type Function = Text -type MergePolicy = Text -type MergeSchedule = Text -type WorstCaseDiskIOComplexity = Text -type Condition = Text - -data DiskIOComplexity = DiskIOComplexity - { function :: Function - , mergePolicy :: Maybe MergePolicy - , mergeSchedule :: Maybe MergeSchedule - , worstCaseDiskIOComplexity :: WorstCaseDiskIOComplexity - , condition :: Maybe Condition - } - deriving (Eq, Show) - -looseEq :: DiskIOComplexity -> DiskIOComplexity -> Bool -dioc1 `looseEq` dioc2 = - and - [ dioc1.function == dioc2.function - , dioc1.mergePolicy == dioc2.mergePolicy - , dioc1.mergeSchedule == dioc2.mergeSchedule - , dioc1.worstCaseDiskIOComplexity == dioc2.worstCaseDiskIOComplexity - , isNothing dioc1.condition || dioc1.condition == dioc2.condition - ] - main :: IO () main = do -- Get the disk I/O complexities from the package description @@ -78,91 +55,109 @@ main = do -- Comparing Database.LSMTree.Simple to Database.LSMTree putStrLn "Comparing Database.LSMTree.Simple to Database.LSMTree:" - diskIOComplexityComparisonSimpleToFull <- - for (concat . M.elems $ mapForSimpleApi) $ \diskIOComplexity@DiskIOComplexity{..} -> do + comparisonSimpleToFull <- + fmap concat . for (concat . M.elems $ mapForSimpleApi) $ \simpleEntry@DiskIOComplexity{..} -> do case M.lookup function mapForFullApi of Nothing -> - pure [("Database.LSMTree.Simple", diskIOComplexity)] - Just fullDiskIOComplexities - | diskIOComplexity `elem` fullDiskIOComplexities -> pure [] - | otherwise -> pure (("Database.LSMTree.Simple", diskIOComplexity) : (("Database.LSMTree",) <$> fullDiskIOComplexities)) - TIO.putStrLn . prettyDiskIOComplexityTable . concat $ diskIOComplexityComparisonSimpleToFull + pure [("Database.LSMTree.Simple", simpleEntry)] + Just fullEntries + | simpleEntry `elem` fullEntries -> pure [] + | otherwise -> pure (("Database.LSMTree.Simple", simpleEntry) : (("Database.LSMTree",) <$> fullEntries)) + TIO.putStrLn (prettyDiskIOComplexityTable comparisonSimpleToFull) -- Comparing lsm-tree.cabal to Database.LSMTree putStrLn "Comparing lsm-tree.cabal to Database.LSMTree:" - diskIOComplexityComparisonPackageDescriptionToFull <- - for (concat . M.elems $ mapForPackageDescription) $ \diskIOComplexity@DiskIOComplexity{..} -> do + comparisonPackageDescriptionToFull <- + fmap concat . for (concat . M.elems $ mapForPackageDescription) $ \simpleEntry@DiskIOComplexity{..} -> do case M.lookup function mapForFullApi of Nothing -> - pure [("lsm-tree.cabal", diskIOComplexity)] - Just fullDiskIOComplexities - | any (looseEq diskIOComplexity) fullDiskIOComplexities -> pure [] - | otherwise -> pure (("lsm-tree.cabal", diskIOComplexity) : (("Database.LSMTree",) <$> fullDiskIOComplexities)) - TIO.putStrLn . prettyDiskIOComplexityTable . concat $ diskIOComplexityComparisonPackageDescriptionToFull + pure [("lsm-tree.cabal", simpleEntry)] + Just fullEntries + | any (looseEq simpleEntry) fullEntries -> pure [] + | otherwise -> pure (("lsm-tree.cabal", simpleEntry) : (("Database.LSMTree",) <$> fullEntries)) + TIO.putStrLn (prettyDiskIOComplexityTable comparisonPackageDescriptionToFull) + + -- Set the exit code based on whether any differences were found + unless (null comparisonSimpleToFull && null comparisonPackageDescriptionToFull) $ + exitWith (ExitFailure 1) -------------------------------------------------------------------------------- -- Helper functions -------------------------------------------------------------------------------- +type Function = Text +type MergePolicy = Text +type MergeSchedule = Text +type WorstCaseDiskIOComplexity = Text +type Condition = Text + +data DiskIOComplexity = DiskIOComplexity + { function :: Function + , mergePolicy :: Maybe MergePolicy + , mergeSchedule :: Maybe MergeSchedule + , worstCaseDiskIOComplexity :: WorstCaseDiskIOComplexity + , condition :: Maybe Condition + } + deriving (Eq, Show) + +-- | Loose equality which is used when comparing the disk I/O complexities +-- listed in the package description to those in the modules. Those in the +-- package description do not list complex side conditions, such as all +-- tables having been closed beforehand, or all tables having the same merge +-- policy. Therefore, this equality disregards mismatches when the first +-- entry does not list a condition. +looseEq :: DiskIOComplexity -> DiskIOComplexity -> Bool +entry1 `looseEq` entry2 = + and + [ entry1.function == entry2.function + , entry1.mergePolicy == entry2.mergePolicy + , entry1.mergeSchedule == entry2.mergeSchedule + , entry1.worstCaseDiskIOComplexity == entry2.worstCaseDiskIOComplexity + , isNothing entry1.condition || entry1.condition == entry2.condition + ] + -- | Typeset a tagged list of 'DiskIOComplexity' records as an aligned table. prettyDiskIOComplexityTable :: [(Text, DiskIOComplexity)] -> Text -prettyDiskIOComplexityTable diskIOComplexities = +prettyDiskIOComplexityTable [] = "No differences found.\n" +prettyDiskIOComplexityTable entries = T.unlines [ T.unwords - [ tag `padUpTo` maxTagLen - , function `padUpTo` maxFunctionLen - , condition `padUpTo` maxConditionLen - , worstCaseDiskIOComplexity `padUpTo` maxWorstCaseDiskIOComplexityLen + [ prettyCellForColumn tag tags + , prettyCellForColumn function functions + , prettyCellForColumn fullCondition fullConditions + , prettyCellForColumn worstCaseDiskIOComplexity worstCaseDiskIOComplexities ] - | (tag, function, condition, worstCaseDiskIOComplexity) <- - zip4 tags functions conditions worstCaseDiskIOComplexities + | (tag, function, fullCondition, worstCaseDiskIOComplexity) <- + zip4 tags functions fullConditions worstCaseDiskIOComplexities ] where - tags = fst <$> diskIOComplexities - maxTagLen = maximum (T.length <$> tags) + tags = fst <$> entries + functions = ((.function) . snd) <$> entries + fullConditions = (prettyFullCondition . snd) <$> entries + worstCaseDiskIOComplexities = ((.worstCaseDiskIOComplexity) . snd) <$> entries - functions = ((.function) . snd) <$> diskIOComplexities - maxFunctionLen = maximum (T.length <$> functions) + prettyCellForColumn :: Text -> [Text] -> Text + prettyCellForColumn cell column = cell <> T.replicate (maximum (T.length <$> column) - T.length cell) " " - conditions = (prettyCondition . snd) <$> diskIOComplexities - maxConditionLen = maximum (T.length <$> conditions) - - worstCaseDiskIOComplexities = ((.worstCaseDiskIOComplexity) . snd) <$> diskIOComplexities - maxWorstCaseDiskIOComplexityLen = maximum (T.length <$> worstCaseDiskIOComplexities) - - padUpTo :: Text -> Int -> Text - padUpTo txt len = txt <> T.replicate (len - T.length txt) " " - - prettyCondition :: DiskIOComplexity -> Text - prettyCondition DiskIOComplexity{..} = - fromMaybe "*" (unionMaybeWith slash mergePolicy (unionMaybeWith slash mergeSchedule condition)) + prettyFullCondition :: DiskIOComplexity -> Text + prettyFullCondition DiskIOComplexity{..} = + fromMaybe "*" mergePolicy `slashWith` mergeSchedule `slashWith` condition where - slash :: Text -> Text -> Text - x `slash` y = x <> "/" <> y - - unionMaybeWith :: (a -> a -> a) -> Maybe a -> Maybe a -> Maybe a - unionMaybeWith op (Just x) (Just y) = Just (x `op` y) - unionMaybeWith _op (Just x) Nothing = Just x - unionMaybeWith _op Nothing (Just y) = Just y - unionMaybeWith _op Nothing Nothing = Nothing + slashWith :: Text -> Maybe Text -> Text + slashWith x my = maybe x (\y -> x <> "/" <> y) my -- | Structure vector of 'DiskIOComplexity' records into lookup table by function name. buildDiskIOComplexityMap :: Vector DiskIOComplexity -> Map Function [DiskIOComplexity] buildDiskIOComplexityMap = M.unionsWith (<>) . fmap toSingletonMap . V.toList where toSingletonMap :: DiskIOComplexity -> Map Function [DiskIOComplexity] - toSingletonMap diskIOComplexity = M.singleton diskIOComplexity.function [diskIOComplexity] + toSingletonMap simpleEntry = M.singleton simpleEntry.function [simpleEntry] -- | Parse CSV file into vector of 'DiskIOComplexity' records. decodeDiskIOComplexities :: String -> Vector DiskIOComplexity decodeDiskIOComplexities = either error snd . decodeByName . BSL.fromStrict . TE.encodeUtf8 . T.pack --- | CSV file header for 'DiskIOComplexity' records. -diskIOComplexityHeader :: Header -diskIOComplexityHeader = - header ["Function", "Merge policy", "Merge schedule", "Worst-case disk I/O complexity", "Condition"] - normaliseWorstCaseDiskIOComplexity :: WorstCaseDiskIOComplexity -> WorstCaseDiskIOComplexity normaliseWorstCaseDiskIOComplexity = T.replace "+ " "+" . T.replace " +" "+" . T.replace " \\" "\\" . T.replace " " " " . T.replace "\\:" ""