-
Notifications
You must be signed in to change notification settings - Fork 463
/
Type.hs
197 lines (175 loc) · 7.24 KB
/
Type.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE OverloadedStrings #-}
module PlutusCore.Lexer.Type
( Keyword (..)
, Special (..)
, Token (..)
, allKeywords
, IdentifierState
, newIdentifier
, emptyIdentifierState
, identifierStateFrom
) where
import PlutusPrelude
import PlutusCore.Name
import Control.Monad.State
import Data.Map qualified as M
import Data.Text qualified as T
import Prettyprinter.Internal ((<+>))
import Text.Megaparsec (SourcePos)
{- Note [Literal Constants]
For literal constants, we accept certain types of character sequences that are
then passed to user-defined parsers which convert them to built-in constants.
Literal constants have to specify the type of the constant, so we have (con
integer 9), (con string "Hello"), and so on. This allows us to use the same
literal syntax for different types (eg, integer, short, etc) and shift most
of the responsibility for parsing constants out of the lexer and into the
parser (and eventually out of the parser to parsers supplied by the types
themselves).
In the body of a constant we allow:
* ()
* Single-quoted possibly empty sequences of printable characters
* Double-quoted possibly empty sequences of printable characters
* Unquoted non-empty sequences of printable characters not including '(' or ')',
and not beginning with a single or double quote. Spaces are allowed in the
body of the sequence, but are ignored at the beginning or end.
"Printable" here uses Alex's definition: Unicode code points 32 to 0x10ffff.
This includes spaces but excludes tabs amongst other things. One can use the
usual escape sequences though, as long as the type-specific parser deals with
them.
These allow us to parse all of the standard types. We just return all of the
characters in a TkLiteralConst token, not attempting to do things like stripping
off quotes or interpreting escape sequences: it's the responsibility of the
parser for the relevant type to do these things. Note that 'read' will often do
the right thing.
The final item above even allows the possibility of parsing complex types such as
tuples and lists as long as parentheses are not involved. For example, (con
tuple <1,2.3,"yes">) and (con intlist [1, 2, -7]) are accepted by the lexer, as
is the somewhat improbable-looking (con intseq 12 4 55 -4). Comment characters
are also allowed, but are not treated specially. We don't allow (con )) or (con
tuple (1,2,3)) because it would be difficult for the lexer to decide when it
had reached the end of the literal: consider a tuple containing a quoted string
containing ')', for example.
-}
-- | A keyword in Plutus Core. Some of these are only for UPLC or TPLC, but it's simplest to share
-- the lexer, so we have a joint enumeration of them.
data Keyword
= KwLam
| KwProgram
| KwCon --TODO `andBegin` conargs
-- ^ (con tyname) or (con tyname const)
| KwBuiltin --TODO `andBegin` builtin
-- ^ Switch the lexer into a mode where it's looking for a builtin id.
-- These are converted into Builtin names in the parser.
-- Outside this mode, all ids are parsed as Names.
| KwError
-- TPLC only
| KwAbs
| KwFun
| KwAll
| KwType
| KwIFix
| KwIWrap
| KwUnwrap
-- UPLC only
| KwForce
| KwDelay
deriving (Show, Eq, Ord, Enum, Bounded, Generic, NFData)
-- See note [Literal Constants].
-- | A literal constant.
data LiteralConst
= EmptyBrackets
-- ^ ()
| SingleQuotedChars
-- ^ Single-quoted possibly empty sequences of printable characters
| DoubleQuotedChars
-- ^ Double-quoted possibly empty sequences of printable characters
| UnQuotedChars
-- ^ Unquoted non-empty sequences of printable characters not including '(' or ')',
-- and not beginning with a single or double quote. Spaces are allowed in the
-- body of the sequence, but are ignored at the beginning or end.
deriving (Show, Eq, Ord, Generic, NFData)
-- | A special character. This type is only used internally between the lexer
-- and the parser.
data Special
= OpenParen --TODO `andBegin` kwd
| CloseParen
| OpenBracket
| CloseBracket
| Dot
| OpenBrace
| CloseBrace
deriving (Show, Eq, Ord, Generic, NFData)
-- | A token generated by the tker.
data Token
= TkName { tkLoc :: SourcePos
, tkName :: T.Text
, tkIdentifier :: Unique -- ^ A 'Unique' assigned to the identifier during lexing.
}
| TkBuiltinFnId { tkLoc :: SourcePos, tkBuiltinFnId :: T.Text }
| TkBuiltinTypeId { tkLoc :: SourcePos, tkBuiltinTypeId :: T.Text }
| TkConArgs { tkLoc :: SourcePos, tkConArgsTy :: T.Text, tkConArgsName :: LiteralConst}
-- ^ Things that can follow 'con': the name of a built-in type and possibly a literal constant of that type.
| TkKeyword { tkLoc :: SourcePos, tkKeyword :: Keyword }
| TkLiteralConst { tkLoc :: SourcePos, tkLiteralConst :: LiteralConst }
| TkNat { tkLoc :: SourcePos, tkNat :: Natural }
| TkSpecial { tkLoc :: SourcePos, tkSpecial :: Special }
| EOF { tkLoc :: SourcePos }
deriving (Show, Eq, Ord, Generic, NFData)
instance Pretty Keyword where
pretty KwAbs = "abs"
pretty KwLam = "lam"
pretty KwIFix = "ifix"
pretty KwFun = "fun"
pretty KwAll = "all"
pretty KwType = "type"
pretty KwProgram = "program"
pretty KwCon = "con"
pretty KwIWrap = "iwrap"
pretty KwBuiltin = "builtin"
pretty KwUnwrap = "unwrap"
pretty KwError = "error"
pretty KwForce = "force"
pretty KwDelay = "delay"
instance Pretty LiteralConst where
pretty EmptyBrackets = "lit ()"
pretty SingleQuotedChars = "lit '"
pretty DoubleQuotedChars = "lit \""
pretty UnQuotedChars = "lit"
instance Pretty Special where
pretty OpenParen = "("
pretty CloseParen = ")"
pretty OpenBracket = "["
pretty CloseBracket = "]"
pretty Dot = "."
pretty OpenBrace = "{"
pretty CloseBrace = "}"
instance Pretty Token where
pretty (TkName _ n _) = pretty n
pretty (TkNat _ n) = pretty n
pretty (TkBuiltinFnId _ ident) = pretty ident
pretty (TkBuiltinTypeId _ ident) = pretty ident
pretty (TkConArgs _ ty lit) = pretty ty <+> pretty lit
pretty (TkLiteralConst _ lit) = pretty lit
pretty (TkKeyword _ kw) = pretty kw
pretty (TkSpecial _ s) = pretty s
pretty EOF{} = mempty
-- | The list of all 'Keyword's.
allKeywords :: [Keyword]
allKeywords = [minBound .. maxBound]
-- | An 'IdentifierState' includes a map indexed by 'Int's as well as a map
-- indexed by 'ByteString's. It is used during parsing.
type IdentifierState = (M.Map T.Text Unique, Unique)
emptyIdentifierState :: IdentifierState
emptyIdentifierState = (mempty, Unique 0)
identifierStateFrom :: Unique -> IdentifierState
identifierStateFrom u = (mempty, u)
newIdentifier :: (MonadState IdentifierState m) => T.Text -> m Unique
newIdentifier str = do
(ss, nextU) <- get
case M.lookup str ss of
Just k -> pure k
Nothing -> do
let nextU' = Unique $ unUnique nextU + 1
put (M.insert str nextU ss, nextU')
pure nextU