Skip to content

Commit

Permalink
Add a concrete syntax for macaw
Browse files Browse the repository at this point in the history
This will enable caching analysis results in a text format that can be parsed
much more efficiently than re-running the entire fixed point analysis.
  • Loading branch information
travitch committed Nov 29, 2021
1 parent 952fe55 commit 73a77af
Show file tree
Hide file tree
Showing 4 changed files with 567 additions and 0 deletions.
4 changes: 4 additions & 0 deletions base/macaw-base.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ library
galois-dwarf >= 0.2.2,
IntervalMap >= 0.5,
lens >= 4.7,
megaparsec >= 7 && < 10,
mtl,
parameterized-utils >= 2.1.0 && < 2.2,
prettyprinter >= 1.7.0,
Expand Down Expand Up @@ -78,6 +79,9 @@ library
Data.Macaw.Memory.LoadCommon
Data.Macaw.Memory.Permissions
Data.Macaw.Memory.Symbols
Data.Macaw.Syntax.Atom
Data.Macaw.Syntax.Parser
Data.Macaw.Syntax.SExpr
Data.Macaw.Types
Data.Macaw.Utils.Changed
Data.Macaw.Utils.IncComp
Expand Down
103 changes: 103 additions & 0 deletions base/src/Data/Macaw/Syntax/Atom.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
{-# LANGUAGE OverloadedStrings #-}
-- | The atoms of the concrete syntax for macaw
module Data.Macaw.Syntax.Atom (
Keyword(..)
, keywords
, AtomName(..)
, Atom(..)
)
where

import qualified Data.Map.Strict as Map
import qualified Data.Text as T
import Numeric.Natural ( Natural )

-- | Macaw syntax keywords
--
-- These are the keywords for the *base* macaw IR (i.e., without
-- architecture-specific extensions). The architecture-specific operations are
-- parsed as 'AtomName's initially and resolved by architecture-specific parsers
-- at the atom level.
data Keyword = BVAdd
| BVSub
| BVMul
| BVSDiv
| BVUDiv
| Lt
| Le
| Sle
| Slt
-- Syntax
| Assign
-- Statements
| Comment
| InstructionStart
| WriteMemory
| CondWriteMemory
-- Boolean operations
| Not_
| And_
| Or_
| Xor_
-- Endianness
| BigEndian
| LittleEndian
-- MemRepr
| BVMemRepr
-- Types
| Bool_
| BV_
| Float_
| Tuple_
| Vec_
-- Values
| True_
| False_
| BV
deriving (Eq, Ord, Show)

-- | Uninterpreted atoms
newtype AtomName = AtomText T.Text
deriving (Eq, Ord, Show)

data Atom = Keyword !Keyword -- ^ Keywords include all of the built-in expressions and operators
| AtomName !AtomName -- ^ Non-keyword syntax atoms (to be interpreted at parse time)
| Register !Natural -- ^ A numbered local register (e.g., @r12@)
| AddressWord !Natural -- ^ An arbitrary address rendered in hex ('ArchAddrWord')
| SegmentOffset !Natural -- ^ A segment offset address rendered in hex (validation against the Memory object is required)
| Integer_ !Integer -- ^ Literal integers
| Natural_ !Natural -- ^ Literal naturals
| String_ !T.Text -- ^ Literal strings
deriving (Eq, Ord, Show)

keywords :: Map.Map T.Text Keyword
keywords = Map.fromList [ ("bv-add", BVAdd)
, ("bv-sub", BVSub)
, ("bv-mul", BVMul)
, ("bv-sdiv", BVSDiv)
, ("bv-udiv", BVUDiv)
, ("<", Lt)
, ("<=", Le)
, ("<$", Slt)
, ("<=$", Sle)
, ("not", Not_)
, ("and", And_)
, ("or", Or_)
, ("xor", Xor_)
, ("Bool", Bool_)
, ("BV", BV_)
, ("Float", Float_)
, ("Tuple", Tuple_)
, ("Vec", Vec_)
, ("true", True_)
, ("false", False_)
, ("bv", BV)
, (":=", Assign)
, ("comment", Comment)
, ("instruction-start", InstructionStart)
, ("write-memory", WriteMemory)
, ("cond-write-memory", CondWriteMemory)
, ("big-endian", BigEndian)
, ("little-endian", LittleEndian)
, ("bv-mem", BVMemRepr)
]
230 changes: 230 additions & 0 deletions base/src/Data/Macaw/Syntax/Parser.hs
Original file line number Diff line number Diff line change
@@ -0,0 +1,230 @@
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ViewPatterns #-}
-- | This module defines a concrete syntax parser for macaw, which is meant to
-- make it easy to persist analysis results and reuse them without recomputing
-- everything
--
-- The persisted artifact is a zip file with one file per function, plus a
-- manifest file that records metadata about the binary to enable the loader to
-- check to ensure that the loaded macaw IR actually matches the given binary.
module Data.Macaw.Syntax.Parser (
parseDiscoveryFunInfo
, parseApp
, parseType
) where

import Control.Applicative ( (<|>) )
import qualified Data.ByteString as BS
import qualified Data.Map as Map
import qualified Data.Parameterized.Classes as PC
import qualified Data.Parameterized.NatRepr as PN
import Data.Parameterized.Some ( Some(..) )
import qualified Data.Text as T
import GHC.TypeLits ( type (<=) )
import Numeric.Natural ( Natural )
import qualified Text.Megaparsec as TM

import qualified Data.Macaw.CFG as MC
import qualified Data.Macaw.Discovery as MD
import qualified Data.Macaw.Discovery.State as MDS
import qualified Data.Macaw.Discovery.ParsedContents as Parsed
import qualified Data.Macaw.Memory as MM
import Data.Macaw.Syntax.Atom
import qualified Data.Macaw.Syntax.SExpr as SExpr
import qualified Data.Macaw.Types as MT


parse
:: (SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError a)
-> SExpr.Parser arch ids a
parse asSomething = do
at <- SExpr.sexp parseAtom
case asSomething at of
Left err -> TM.customFailure err
Right r -> return r

parseSExpr
:: forall arch ids a
. SExpr.Syntax Atom
-> (SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError a)
-> SExpr.Parser arch ids a
parseSExpr sexp asSomething =
case asSomething sexp of
Left err -> TM.customFailure err
Right r -> return r

data WidthRepr where
WidthRepr :: (1 <= n) => PN.NatRepr n -> WidthRepr

-- | Attempt to convert a 'Natural' into a non-zero 'PN.NatRepr'
asWidthRepr :: Natural -> Maybe WidthRepr
asWidthRepr nat =
case PN.mkNatRepr nat of
Some nr
| Right PN.LeqProof <- PN.isZeroOrGT1 nr -> Just (WidthRepr nr)
| otherwise -> Nothing

asEndianness :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError MM.Endianness
asEndianness at =
case at of
SExpr.A (Keyword BigEndian) -> Right MM.BigEndian
SExpr.A (Keyword LittleEndian) -> Right MM.LittleEndian
_ -> Left (SExpr.InvalidEndianness at)

asMemRepr :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError (SomeTyped MC.MemRepr)
asMemRepr at =
case at of
SExpr.L [ SExpr.A (Keyword BVMemRepr), SExpr.A (Natural_ w), send ]
| Just (WidthRepr nr) <- asWidthRepr w -> do
end <- asEndianness send
let nr8 = PN.natMultiply (PN.knownNat @8) nr
-- This cannot fail because we already checked that @w@ is >= 1 above
case PN.isZeroOrGT1 nr8 of
Left _ -> error ("Impossible; w was >= 1, so w * 8 must be >= 1: " ++ show nr8)
Right PN.LeqProof -> do
let tyRep = MT.BVTypeRepr nr8
return (SomeTyped tyRep (MC.BVMemRepr nr end))

data SomeTyped f where
SomeTyped :: MT.TypeRepr tp -> f tp -> SomeTyped f

asValue :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError (SomeTyped (MC.Value arch ids))
asValue at =
case at of
SExpr.A (Keyword True_) -> Right (SomeTyped MT.BoolTypeRepr (MC.BoolValue True))
SExpr.A (Keyword False_) -> Right (SomeTyped MT.BoolTypeRepr (MC.BoolValue False))
SExpr.L [SExpr.A (Keyword BV), SExpr.A (Natural_ w), SExpr.A (Integer_ i)]
| Just (WidthRepr nr) <- asWidthRepr w -> Right (SomeTyped (MT.BVTypeRepr nr) (MC.BVValue nr i))
| otherwise -> Left SExpr.InvalidZeroWidthBV

parseApp :: SExpr.Parser arch ids (SomeTyped (MC.App (MC.Value arch ids)))
parseApp = undefined

parseAtom :: SExpr.Parser arch ids Atom
parseAtom = TM.try parseKeywordOrAtom

parseIdentifier :: SExpr.Parser arch ids T.Text
parseIdentifier = undefined

parseKeywordOrAtom :: SExpr.Parser arch ids Atom
parseKeywordOrAtom = do
x <- parseIdentifier
return $! maybe (AtomName (AtomText x)) Keyword (Map.lookup x keywords)

-- | Parse a single type as a 'MT.TypeRepr'
--
-- The type forms are:
--
-- * @Bool@
-- * @(BV n)@
-- * @(Vec n ty)@
asTypeRepr :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError (Some MT.TypeRepr)
asTypeRepr at =
case at of
SExpr.A (Keyword Bool_) -> Right (Some MT.BoolTypeRepr)
SExpr.L [SExpr.A (Keyword BV_), SExpr.A (Natural_ w)]
| Just (WidthRepr nr) <- asWidthRepr w -> Right (Some (MT.BVTypeRepr nr))
| otherwise -> Left SExpr.InvalidZeroWidthBV
SExpr.L [SExpr.A (Keyword Vec_), SExpr.A (Natural_ w), mty] ->
case PN.mkNatRepr w of
Some nr ->
-- Note that zero-width vectors are technically allowed
case asTypeRepr mty of
Right (Some ty) -> Right (Some (MT.VecTypeRepr nr ty))
Left _errs -> Left (SExpr.InvalidVectorPayload mty)
_ -> Left (SExpr.InvalidType at)

asRHS :: SExpr.Syntax Atom -> Either SExpr.MacawSyntaxError (Some (MC.AssignRhs arch (MC.Value arch ids)))
asRHS = undefined

parseType :: SExpr.Parser arch ids (Some MT.TypeRepr)
parseType = parse asTypeRepr

-- | Forms:
--
-- * @(comment "msg")@
-- * @(instruction-start addr decoded-asm-text)@
-- * @(write-memory addr mem-rep value)@
-- * @(cond-write-memory cond addr mem-rep value)@
-- * @(reg := rhs)@
parseStmt :: forall arch ids . SExpr.Parser arch ids (MC.Stmt arch ids)
parseStmt = do
at <- SExpr.sexp parseAtom
case at of
SExpr.L [ SExpr.A (Keyword Comment), SExpr.A (String_ txt) ] ->
return (MC.Comment txt)
SExpr.L [ SExpr.A (Keyword InstructionStart), SExpr.A (AddressWord addr), SExpr.A (String_ txt) ] ->
return (MC.InstructionStart (MM.memWord (fromIntegral addr)) txt)
SExpr.L [ SExpr.A (Keyword WriteMemory), stargetAddr, smemRepr, svalue ] -> do
SomeTyped addrTy addr <- parseSExpr stargetAddr (asValue @arch @ids)
SomeTyped vtp value <- parseSExpr svalue (asValue @arch @ids)
SomeTyped mtp memRepr <- parseSExpr smemRepr asMemRepr
memWidth <- SExpr.archMemWidth
let addrRepr = MT.BVTypeRepr memWidth
case (PC.testEquality vtp mtp, PC.testEquality addrTy addrRepr) of
(Just PC.Refl, Just PC.Refl) -> do
return (MC.WriteMem addr memRepr value)
-- FIXME: Make a more-specific error
_ -> TM.customFailure (SExpr.InvalidStatement at)
SExpr.L [ SExpr.A (Keyword CondWriteMemory), scond, stargetAddr, smemRepr, svalue ] -> do
SomeTyped condTy cond <- parseSExpr scond (asValue @arch @ids)
SomeTyped addrTy addr <- parseSExpr stargetAddr (asValue @arch @ids)
SomeTyped vtp value <- parseSExpr svalue (asValue @arch @ids)
SomeTyped mtp memRepr <- parseSExpr smemRepr asMemRepr
memWidth <- SExpr.archMemWidth
let addrRepr = MT.BVTypeRepr memWidth
case (PC.testEquality vtp mtp, PC.testEquality addrTy addrRepr, PC.testEquality condTy MT.BoolTypeRepr) of
(Just PC.Refl, Just PC.Refl, Just PC.Refl) -> do
return (MC.CondWriteMem cond addr memRepr value)
-- FIXME: Make a more-specific error
_ -> TM.customFailure (SExpr.InvalidStatement at)
SExpr.L [ SExpr.A (Register r), SExpr.A (Keyword Assign), srhs ] -> do
Some rhs <- parseSExpr srhs (asRHS @arch @ids)
Some asgn <- SExpr.defineRegister r rhs
return (MC.AssignStmt asgn)
_ -> TM.customFailure (SExpr.InvalidStatement at)

parseBlock :: SExpr.Parser arch ids (Parsed.ParsedBlock arch ids)
parseBlock = do

stmts <- TM.many parseStmt

return Parsed.ParsedBlock { Parsed.pblockAddr = undefined
, Parsed.pblockPrecond = undefined
, Parsed.blockSize = undefined
, Parsed.blockReason = undefined
, Parsed.blockAbstractState = undefined
, Parsed.blockJumpBounds = undefined
, Parsed.pblockStmts = stmts
, Parsed.pblockTermStmt = undefined
}

parseFunction :: SExpr.Parser arch ids (Some (MD.DiscoveryFunInfo arch))
parseFunction = do

blocks <- TM.many parseBlock

let dfi = MDS.DiscoveryFunInfo { MDS.discoveredFunReason = undefined
, MDS.discoveredFunAddr = undefined
, MDS.discoveredFunSymbol = undefined
, MDS._parsedBlocks = Map.fromList [ (Parsed.pblockAddr pb, pb)
| pb <- blocks
]
, MDS.discoveredClassifyFailureResolutions = undefined
}
return (Some dfi)

parseDiscoveryFunInfo
:: (MC.ArchConstraints arch)
=> MM.Memory (MC.ArchAddrWidth arch)
-- ^ The memory of the binary we are loading results for
-> BS.ByteString
-- ^ The bytes of the persisted file
-> Either SExpr.MacawParseError (Some (MD.DiscoveryFunInfo arch))
parseDiscoveryFunInfo mem bytes = SExpr.runParser mem bytes parseFunction
Loading

0 comments on commit 73a77af

Please sign in to comment.