diff --git a/Makefile b/Makefile new file mode 100755 index 000000000..c8064e82e --- /dev/null +++ b/Makefile @@ -0,0 +1,13 @@ +SOURCE=lsm-tree.cabal $(shell find src -type f -name '*.hs') + +.PHONY: watch +watch: + fswatch -o $(SOURCE) | xargs -n1 -I{} make build + +.PHONY: build +build: $(SOURCE) + time cabal haddock lsm-tree:lib:lsm-tree --builddir="dist-newstyle/haddock" + +.PHONY: serve +serve: + python -m http.server -d "dist-newstyle/haddock/build/"*"/ghc-"*"/lsm-tree-"*"/doc/html/lsm-tree/" diff --git a/README.md b/README.md index 17166064f..fc249eba9 100644 --- a/README.md +++ b/README.md @@ -151,7 +151,7 @@ schedule are determined by the `TableConfig` parameters Session -Create/Open +Open N/A N/A O(1) @@ -166,7 +166,7 @@ schedule are determined by the `TableConfig` parameters Table -Create +New N/A N/A O(1) @@ -253,7 +253,7 @@ schedule are determined by the `TableConfig` parameters Cursor -Create +New LazyLevelling N/A $O(T \: \log_T \frac{n}{B})$ @@ -267,7 +267,7 @@ schedule are determined by the `TableConfig` parameters -Read next entry +Next N/A N/A $O(\frac{1}{P})$ diff --git a/lsm-tree.cabal b/lsm-tree.cabal index 9bba65a5b..892abff25 100644 --- a/lsm-tree.cabal +++ b/lsm-tree.cabal @@ -87,11 +87,11 @@ description: +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | Resource | Operation | Merge policy | Merge schedule | Worst-case disk I\/O complexity | +==========+========================+=================+=================+================================================+ - | Session | Create\/Open | N\/A | N\/A | \(O(1)\) | + | Session | Open | N\/A | N\/A | \(O(1)\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | | Close | @LazyLevelling@ | N\/A | \(O(o \: T \: \log_T \frac{n}{B})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ - | Table | Create | N\/A | N\/A | \(O(1)\) | + | Table | New | N\/A | N\/A | \(O(1)\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | | Close | @LazyLevelling@ | N\/A | \(O(T \: \log_T \frac{n}{B})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ @@ -115,11 +115,11 @@ description: +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | | List | N\/A | N\/A | \(O(s)\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ - | Cursor | Create | @LazyLevelling@ | N\/A | \(O(T \: \log_T \frac{n}{B})\) | + | Cursor | New | @LazyLevelling@ | N\/A | \(O(T \: \log_T \frac{n}{B})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ | | Close | @LazyLevelling@ | N\/A | \(O(T \: \log_T \frac{n}{B})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ - | | Read next entry | N\/A | N\/A | \(O(\frac{1}{P})\) | + | | Next | N\/A | N\/A | \(O(\frac{1}{P})\) | +----------+------------------------+-----------------+-----------------+------------------------------------------------+ (*The variable \(b\) refers to the number of entries retrieved by the range lookup.) diff --git a/scripts/lint-diskio-complexities.hs b/scripts/lint-diskio-complexities.hs new file mode 100755 index 000000000..a6c84012e --- /dev/null +++ b/scripts/lint-diskio-complexities.hs @@ -0,0 +1,181 @@ +#!/usr/bin/env cabal +{- cabal: +build-depends: + , base >=4.16 && <5 + , bytestring ^>=0.11 + , cassava ^>=0.5 + , containers ^>=0.6 || ^>=0.7 || ^>=0.8 + , process ^>=1.6 + , text ^>=2.1 + , vector ^>=0.12 || ^>=0.13 +-} +{-# LANGUAGE InstanceSigs #-} +{-# LANGUAGE OverloadedRecordDot #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RecordWildCards #-} +{-# LANGUAGE TupleSections #-} +{-# LANGUAGE TypeApplications #-} + +import Control.Applicative (Alternative (..)) +import Control.Monad (unless) +import qualified Data.ByteString.Lazy as BSL +import Data.Csv +import Data.List (zip4) +import Data.Map (Map) +import qualified Data.Map as M +import Data.Maybe (fromMaybe, isNothing) +import Data.Set (Set) +import qualified Data.Set as S +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.Encoding as TE +import qualified Data.Text.IO as TIO +import Data.Traversable (for) +import Data.Vector (Vector) +import qualified Data.Vector as V +import System.Exit (ExitCode (..), exitWith) +import System.Process (readProcess) + +main :: IO () +main = do + -- Get the disk I/O complexities from the package description + mapForPackageDescription <- + buildDiskIOComplexityMap . decodeDiskIOComplexities + <$> readProcess "./scripts/lint-diskio-complexities/dump-from-package-description.hs" [] "" + + -- Get the disk I/O complexities from Database.LSMTree + mapForFullApi <- + buildDiskIOComplexityMap . decodeDiskIOComplexities + <$> readProcess "./scripts/lint-diskio-complexities/dump-from-source.sh" ["./src/Database/LSMTree.hs"] "" + + -- Get the disk I/O complexities from Database.LSMTree.Simple + mapForSimpleApi <- + buildDiskIOComplexityMap . decodeDiskIOComplexities + <$> readProcess "./scripts/lint-diskio-complexities/dump-from-source.sh" ["./src/Database/LSMTree/Simple.hs"] "" + + -- Comparing Database.LSMTree.Simple to Database.LSMTree + putStrLn "Comparing Database.LSMTree.Simple to Database.LSMTree:" + comparisonSimpleToFull <- + fmap concat . for (concat . M.elems $ mapForSimpleApi) $ \simpleEntry@DiskIOComplexity{..} -> do + case M.lookup function mapForFullApi of + Nothing -> + pure [("Database.LSMTree.Simple", simpleEntry)] + Just fullEntries + | simpleEntry `elem` fullEntries -> pure [] + | otherwise -> pure (("Database.LSMTree.Simple", simpleEntry) : (("Database.LSMTree",) <$> fullEntries)) + TIO.putStrLn (prettyDiskIOComplexityTable comparisonSimpleToFull) + + -- Comparing lsm-tree.cabal to Database.LSMTree + putStrLn "Comparing lsm-tree.cabal to Database.LSMTree:" + comparisonPackageDescriptionToFull <- + fmap concat . for (concat . M.elems $ mapForPackageDescription) $ \simpleEntry@DiskIOComplexity{..} -> do + case M.lookup function mapForFullApi of + Nothing -> + pure [("lsm-tree.cabal", simpleEntry)] + Just fullEntries + | any (looseEq simpleEntry) fullEntries -> pure [] + | otherwise -> pure (("lsm-tree.cabal", simpleEntry) : (("Database.LSMTree",) <$> fullEntries)) + TIO.putStrLn (prettyDiskIOComplexityTable comparisonPackageDescriptionToFull) + + -- Set the exit code based on whether any differences were found + unless (null comparisonSimpleToFull && null comparisonPackageDescriptionToFull) $ + exitWith (ExitFailure 1) + +-------------------------------------------------------------------------------- +-- Helper functions +-------------------------------------------------------------------------------- + +type Function = Text +type MergePolicy = Text +type MergeSchedule = Text +type WorstCaseDiskIOComplexity = Text +type Condition = Text + +data DiskIOComplexity = DiskIOComplexity + { function :: Function + , mergePolicy :: Maybe MergePolicy + , mergeSchedule :: Maybe MergeSchedule + , worstCaseDiskIOComplexity :: WorstCaseDiskIOComplexity + , condition :: Maybe Condition + } + deriving (Eq, Show) + +-- | Loose equality which is used when comparing the disk I/O complexities +-- listed in the package description to those in the modules. Those in the +-- package description do not list complex side conditions, such as all +-- tables having been closed beforehand, or all tables having the same merge +-- policy. Therefore, this equality disregards mismatches when the first +-- entry does not list a condition. +looseEq :: DiskIOComplexity -> DiskIOComplexity -> Bool +entry1 `looseEq` entry2 = + and + [ entry1.function == entry2.function + , entry1.mergePolicy == entry2.mergePolicy + , entry1.mergeSchedule == entry2.mergeSchedule + , entry1.worstCaseDiskIOComplexity == entry2.worstCaseDiskIOComplexity + , isNothing entry1.condition || entry1.condition == entry2.condition + ] + +-- | Typeset a tagged list of 'DiskIOComplexity' records as an aligned table. +prettyDiskIOComplexityTable :: [(Text, DiskIOComplexity)] -> Text +prettyDiskIOComplexityTable [] = "No differences found.\n" +prettyDiskIOComplexityTable entries = + T.unlines + [ T.unwords + [ prettyCellForColumn tag tags + , prettyCellForColumn function functions + , prettyCellForColumn fullCondition fullConditions + , prettyCellForColumn worstCaseDiskIOComplexity worstCaseDiskIOComplexities + ] + | (tag, function, fullCondition, worstCaseDiskIOComplexity) <- + zip4 tags functions fullConditions worstCaseDiskIOComplexities + ] + where + tags = fst <$> entries + functions = ((.function) . snd) <$> entries + fullConditions = (prettyFullCondition . snd) <$> entries + worstCaseDiskIOComplexities = ((.worstCaseDiskIOComplexity) . snd) <$> entries + + prettyCellForColumn :: Text -> [Text] -> Text + prettyCellForColumn cell column = cell <> T.replicate (maximum (T.length <$> column) - T.length cell) " " + + prettyFullCondition :: DiskIOComplexity -> Text + prettyFullCondition DiskIOComplexity{..} = + fromMaybe "*" mergePolicy `slashWith` mergeSchedule `slashWith` condition + where + slashWith :: Text -> Maybe Text -> Text + slashWith x my = maybe x (\y -> x <> "/" <> y) my + +-- | Structure vector of 'DiskIOComplexity' records into lookup table by function name. +buildDiskIOComplexityMap :: Vector DiskIOComplexity -> Map Function [DiskIOComplexity] +buildDiskIOComplexityMap = M.unionsWith (<>) . fmap toSingletonMap . V.toList + where + toSingletonMap :: DiskIOComplexity -> Map Function [DiskIOComplexity] + toSingletonMap simpleEntry = M.singleton simpleEntry.function [simpleEntry] + +-- | Parse CSV file into vector of 'DiskIOComplexity' records. +decodeDiskIOComplexities :: String -> Vector DiskIOComplexity +decodeDiskIOComplexities = + either error snd . decodeByName . BSL.fromStrict . TE.encodeUtf8 . T.pack + +normaliseWorstCaseDiskIOComplexity :: WorstCaseDiskIOComplexity -> WorstCaseDiskIOComplexity +normaliseWorstCaseDiskIOComplexity = + T.replace "+ " "+" . T.replace " +" "+" . T.replace " \\" "\\" . T.replace " " " " . T.replace "\\:" "" + +-- | Parse CSV row into 'DiskIOComplexity' record. +instance FromNamedRecord DiskIOComplexity where + parseNamedRecord :: NamedRecord -> Parser DiskIOComplexity + parseNamedRecord m = do + function <- m .: "Function" + mergePolicy <- orNotApplicable (m .: "Merge policy") + mergeSchedule <- orNotApplicable (m .: "Merge schedule") + worstCaseDiskIOComplexity <- normaliseWorstCaseDiskIOComplexity <$> (m .: "Worst-case disk I/O complexity") + condition <- orNotApplicable (m .: "Condition") + pure DiskIOComplexity{..} + where + orNotApplicable :: Parser Text -> Parser (Maybe Text) + orNotApplicable pText = (emptyTextToNothing <$> pText) <|> pure Nothing + where + emptyTextToNothing :: Text -> Maybe Text + emptyTextToNothing txt = + if T.null txt then Nothing else Just txt diff --git a/scripts/lint-diskio-complexities/dump-from-package-description.hs b/scripts/lint-diskio-complexities/dump-from-package-description.hs new file mode 100755 index 000000000..5f7770f15 --- /dev/null +++ b/scripts/lint-diskio-complexities/dump-from-package-description.hs @@ -0,0 +1,141 @@ +#!/usr/bin/env cabal +{- cabal: +build-depends: + , base >=4.16 + , bytestring >=0.11 + , Cabal-syntax ^>=3.10 || ^>=3.12 + , cassava ^>=0.5 + , pandoc ^>=3.6.4 + , pandoc-types ^>=1.23.1 + , text >=2.1 +-} +{-# LANGUAGE FlexibleContexts #-} +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE RecordWildCards #-} + +module Main (main) where + +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as BSL (putStr) +import Data.Csv (Header, NamedRecord, ToField (..), + ToNamedRecord (..), encodeByName, header, namedRecord, + (.=)) +import Data.IORef (atomicModifyIORef, newIORef) +import Data.Text (Text) +import qualified Data.Text as T +import qualified Data.Text.IO as TIO +import Data.Traversable (for) +import Debug.Trace (traceShow) +import Distribution.PackageDescription.Parsec + (parseGenericPackageDescriptionMaybe) +import qualified Distribution.Types.GenericPackageDescription as GenericPackageDescription +import qualified Distribution.Types.PackageDescription as PackageDescription +import Distribution.Utils.ShortText (fromShortText) +import System.IO (hPutStrLn, stderr) +import Text.Pandoc (runIOorExplode) +import Text.Pandoc.Definition (Block (..), Inline (..), Row (..), + TableBody (..)) +import Text.Pandoc.Extensions (getDefaultExtensions) +import Text.Pandoc.Options (ReaderOptions (..), WriterOptions (..), + def) +import Text.Pandoc.Readers (readHaddock) +import Text.Pandoc.Walk (Walkable (query)) + +tableEntryToFunction :: Text -> Text -> Text +tableEntryToFunction resource operation + | (resource == "Table" || resource == "Cursor") && operation `notElem` ["New", "Close"] = toCamel (T.splitOn " " operation) + | resource == "Snapshot" && operation == "Open" = "openTableFromSnapshot" + | resource == "Snapshot" && operation == "List" = "listSnapshots" -- plural + | otherwise = toCamel [operation, resource] + +main :: IO () +main = do + let lsmTreeCabalFile = "lsm-tree.cabal" + lsmTreeCabalContent <- BS.readFile lsmTreeCabalFile + case parseGenericPackageDescriptionMaybe lsmTreeCabalContent of + Nothing -> hPutStrLn stderr $ "error: Could not parse '" <> lsmTreeCabalFile <> "'" + Just genericPackageDescription -> do + let packageDescription = GenericPackageDescription.packageDescription genericPackageDescription + let description = T.pack . fromShortText $ PackageDescription.description packageDescription + doc <- runIOorExplode $ readHaddock def description + -- Get the disk I/O complexity table + let diskIOComplexityTable = query dumpDiskIOComplexityTable doc + resourceRef <- newIORef "" + operationsRef <- newIORef "" + entries <- + fmap concat . for diskIOComplexityTable $ \row -> do + let fullRow = replicate (5 - length row) "" <> row + let [newResource, newOperations, newMergePolicy, newMergeSchedule, rawWorstCaseDiskIOComplexity] = fullRow + resource <- atomicModifyIORef resourceRef (merge newResource) + operations <- atomicModifyIORef operationsRef (merge newOperations) + let mergePolicy = if newMergePolicy == "N/A" then Nothing else Just newMergePolicy + let mergeSchedule = if newMergeSchedule == "N/A" then Nothing else Just newMergeSchedule + let worstCaseDiskIOComplexity = T.dropWhileEnd (`elem`[' ','*']) rawWorstCaseDiskIOComplexity + for (T.splitOn "/" operations) $ \operation -> do + let function = tableEntryToFunction resource operation + pure $ DiskIOComplexity {..} + let csvData = encodeByName diskIOComplexityHeader entries + BSL.putStr csvData + +-------------------------------------------------------------------------------- +-- Helper functions +-------------------------------------------------------------------------------- + +type Function = Text +type MergePolicy = Text +type MergeSchedule = Text +type WorstCaseDiskIOComplexity = Text + +data DiskIOComplexity = DiskIOComplexity + { function :: Function + , mergePolicy :: Maybe MergePolicy + , mergeSchedule :: Maybe MergeSchedule + , worstCaseDiskIOComplexity :: WorstCaseDiskIOComplexity + } + deriving (Eq, Show) + +diskIOComplexityHeader :: Header +diskIOComplexityHeader = + header + ["Function", "Merge policy", "Merge schedule", "Worst-case disk I/O complexity"] + +instance ToNamedRecord DiskIOComplexity where + toNamedRecord DiskIOComplexity {..} = + namedRecord + [ "Function" .= toField function + , "Merge policy" .= toField mergePolicy + , "Merge schedule" .= toField mergeSchedule + , "Worst-case disk I/O complexity" .= toField worstCaseDiskIOComplexity + ] + +contents :: Walkable Block a => a -> [Text] +contents = query forBlock + where + forBlock :: Block -> [Text] + forBlock block = [T.unwords (query forInline block)] + forInline :: Inline -> [Text] + forInline (Code _ text) = [text] + forInline (Math _ text) = [text] + forInline (Str text) = [text] + forInline _ = [] + +diskIOComplexityTableHeader :: [Text] +diskIOComplexityTableHeader = + ["Resource", "Operation", "Merge policy", "Merge schedule", "Worst-case disk I/O complexity"] + +dumpDiskIOComplexityTable :: Block -> [[Text]] +dumpDiskIOComplexityTable table@(Table _attr _caption _cols tableHead tableBodies _tableFoot) + | contents tableHead == diskIOComplexityTableHeader = + [ [T.unwords (contents cell) | cell <- cells] + | TableBody _ _ _ rows <- tableBodies + , Row _ cells <- rows + ] +dumpDiskIOComplexityTable _ = [] + +merge :: Text -> Text -> (Text, Text) +merge new old = let res = if T.null new then old else new in (res, res) + +toCamel :: [Text] -> Text +toCamel [] = mempty +toCamel (x : xs) = T.toLower x <> T.concat (T.toTitle <$> xs) diff --git a/scripts/time-complexities/from-hs.sh b/scripts/lint-diskio-complexities/dump-from-source.sh similarity index 94% rename from scripts/time-complexities/from-hs.sh rename to scripts/lint-diskio-complexities/dump-from-source.sh index 7649364cf..ff3196f64 100755 --- a/scripts/time-complexities/from-hs.sh +++ b/scripts/lint-diskio-complexities/dump-from-source.sh @@ -44,16 +44,16 @@ newline_rs='\ printf '%s\n' 'Function,Merge policy,Merge schedule,Worst-case disk I/O complexity,Condition' sed -En -e ' - # Collect complexity information from a Haddock annotation + # Collect all disk I/O complexity information from a Haddock annotation /^\{- *\|/,/-}/ { # Store an unconditional disk I/O complexity s/'"$unconditional_re"'/,,\1,/ t store # Store a disk I/O complexity for the “nothing left open” case - s/'"$nothing_left_open_re"'/,,\1,Nothing left open/ + s/'"$nothing_left_open_re"'/,,\1,NothingLeftOpen/ t store # Store a disk I/O complexity with unknown condition - s/^.*'"$o_expr_re"'.*$/,,\1,Unknown condition/ + s/^.*'"$o_expr_re"'.*$/,,\1,UnknownCondition/ t store # Note down the occurrence of a “same merge policy” restriction /'"$same_merge_policy_re"'/ { @@ -81,7 +81,7 @@ sed -En -e ' :store H x - s/\n=\n(.*)/'"$newline_rs"'\1Same merge policy'"$newline_rs"'=/ + s/\n=\n(.*)/'"$newline_rs"'\1SameMergePolicy'"$newline_rs"'=/ x # Continue with the next line d