-
Notifications
You must be signed in to change notification settings - Fork 463
/
Lexer.x
294 lines (234 loc) · 11.7 KB
/
Lexer.x
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
{
{-# OPTIONS_GHC -fno-warn-unused-imports #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE MultiParamTypeClasses #-}
module PlutusCore.Lexer ( alexMonadScan
, runAlexST'
-- * Types
, AlexPosn (..)
, Alex (..)
, topAlexPosn
) where
import PlutusPrelude
import PlutusCore.Error
import PlutusCore.Parser.Type
import PlutusCore.Name
import Control.Monad.Except
import Control.Monad.State
import qualified Data.Text as T
import qualified Data.Text.Encoding as T
import Data.Char (isSpace)
import qualified Data.ByteString.Lazy as BSL
import qualified Data.ByteString.Lazy.Char8 as ASCII
import Language.Haskell.TH.Syntax (Lift)
import Text.Read (readMaybe)
{- Note [Keywords] This version of the lexer relaxes the syntax so that keywords
(con, lam, ...) and built-in names can be re-used as variable names, reducing
the risk of textual source code being unparsable due to illegal names (produced
by a compiler, for example). To achieve this, we use Alex's "start codes" which
allow you to put the lexer into modes in which only certain actions are valid.
In the PLC grammar, keywords like `abs`, `con` and so on can only occur after a
`(`, so when we see one of these we put the lexer into a special mode where
these are interpreted as keywords and converted into elements of the LexKeyword
type; having done this, we return to the initial lexer state, denoted by 0,
where we can use keywords as variable names. A similar strategy is used for
built in type names. -}
{- Note [Literal Constants]
For literal constants, we accept certain types of character sequences that are
then passed to user-defined parsers which convert them to built-in constants.
Literal constants have to specify the type of the constant, so we have (con
integer 9), (con string "Hello"), and so on. This allows us to use the same
literal syntax for different types (eg, integer, short, etc) and shift most
of the responsibility for parsing constants out of the lexer and into the
parser (and eventually out of the parser to parsers supplied by the types
themselves).
In the body of a constant we allow:
* ()
* Single-quoted possibly empty sequences of printable characters
* Double-quoted possibly empty sequences of printable characters
* Unquoted non-empty sequences of printable characters not including '(' or ')',
and not beginning with a single or double quote. Spaces are allowed in the
body of the sequence, but are ignored at the beginning or end.
"Printable" here uses Alex's definition: Unicode code points 32 to 0x10ffff.
This includes spaces but excludes tabs amongst other things. One can use the
usual escape sequences though, as long as the type-specific parser deals with
them.
These allow us to parse all of the standard types. We just return all of the
characters in a TkLiteralConst token, not attempting to do things like stripping
off quotes or interpreting escape sequences: it's the responsibility of the
parser for the relevant type to do these things. Note that 'read' will often do
the right thing.
The final item above even allows the possibility of parsing complex types such as
tuples and lists as long as parentheses are not involved. For example, (con
tuple <1,2.3,"yes">) and (con intlist [1, 2, -7]) are accepted by the lexer, as
is the somewhat improbable-looking (con intseq 12 4 55 -4). Comment characters
are also allowed, but are not treated specially. We don't allow (con )) or (con
tuple (1,2,3)) because it would be difficult for the lexer to decide when it
had reached the end of the literal: consider a tuple containing a quoted string
containing ')', for example.
-}
{- Note [Precedence of regular expression matches]
For reference, Section 3.2.2 of the Alex manual says "When the input stream
matches more than one rule, the rule which matches the longest prefix of the
input stream wins. If there are still several rules which match an equal
number of characters, then the rule which appears earliest in the file wins."
-}
}
-- Use the monadic wrapper with a user state and lazy-bytestring input.
%wrapper "monadUserState-bytestring"

-- $ = character set macro
-- @ = regular expression macro

$digit = 0-9
$lower = [a-z]
$upper = [A-Z]

@nat = $digit+
@name = [$lower $upper][$lower $upper $digit \_ \']*
-- NOTE(review): @builtinid is currently identical to @name; presumably kept
-- separate so the two can diverge later — confirm before merging them.
@builtinid = [$lower $upper][$lower $upper $digit \_ \']*

-- Regular expressions for literal constants

-- A single quoted string, allowing escaped characters including \'.
-- This says "Single quotes enclosing a sequence of either (a) printable
-- characters excluding ' and \ , or (b) a backslash followed by
-- any printable character (single quote included)"
@sqs = ' ( ($printable # ['\\]) | (\\$printable) )* '

-- A double quoted string, allowing escaped characters including \". Similar to @sqs
@dqs = \" ( ($printable # [\"\\]) | (\\$printable) )* \"

-- A sequence of printable characters not containing '(' or ')' such that the
-- first character is not a space or a single or double quote. If there are any
-- further characters then they must comprise a sequence of printable characters
-- possibly including spaces, followed by a non-space character. If there are
-- any leading or trailing spaces they will be consumed by the $white+ token
-- below.
$nonparen = $printable # [\(\)]
@chars = ($nonparen # ['\"$white]) ($nonparen* ($nonparen # $white))?

tokens :-

    -- Whitespace and single-line comments are discarded in every mode.
    $white+                  ;
    "--".*                   ;
    -- Multi-line comments may nest; handled by nested_comment (defined below).
    "{-"                     { \_ _ -> nested_comment }

    -- Keywords: we only expect these after '('; elsewhere they can be
    -- used freely as identifiers: see Note [Keywords].
    <kwd> abs                { mkKeyword KwAbs `andBegin` 0 }
    <kwd> lam                { mkKeyword KwLam `andBegin` 0 }
    <kwd> ifix               { mkKeyword KwIFix `andBegin` 0 }
    <kwd> fun                { mkKeyword KwFun `andBegin` 0 }
    <kwd> all                { mkKeyword KwAll `andBegin` 0 }
    <kwd> type               { mkKeyword KwType `andBegin` 0 }
    <kwd> program            { mkKeyword KwProgram `andBegin` 0 }
    <kwd> iwrap              { mkKeyword KwIWrap `andBegin` 0 }
    <kwd> unwrap             { mkKeyword KwUnwrap `andBegin` 0 }
    <kwd> error              { mkKeyword KwError `andBegin` 0 }
    <kwd> force              { mkKeyword KwForce `andBegin` 0 }
    <kwd> delay              { mkKeyword KwDelay `andBegin` 0 }
    <kwd> builtin            { mkKeyword KwBuiltin `andBegin` builtin }
    -- ^ Switch the lexer into a mode where it's looking for a builtin id.
    -- These are converted into Builtin names in the parser.
    -- Outside this mode, all ids are parsed as Names.
    <kwd> con                { mkKeyword KwCon `andBegin` conargs }
    -- ^ (con tyname) or (con tyname const)

    -- Various special characters.  '(' enters keyword mode, ')' returns to
    -- the initial mode; see Note [Keywords].
    "("                      { mkSpecial OpenParen `andBegin` kwd }
    ")"                      { mkSpecial CloseParen `andBegin` 0}
    "["                      { mkSpecial OpenBracket }
    "]"                      { mkSpecial CloseBracket }
    "."                      { mkSpecial Dot }
    "{"                      { mkSpecial OpenBrace }
    "}"                      { mkSpecial CloseBrace }

    -- Natural literal, used in version numbers
    <0> @nat                 { tok (\p s -> alex $ TkNat p (readBSL s)) }

    -- Identifiers
    <0> @name                { tok (\p s -> handle_name p (textOf s)) }

    -- Names of built-in functions
    <builtin> @builtinid     { tok (\p s -> alex $ TkBuiltinFnId p (textOf s)) `andBegin` 0 }

    -- Things that can follow 'con': the name of a built-in type and possibly a literal constant of that type
    <conargs> @name          { tok (\p s -> alex $ TkBuiltinTypeId p (textOf s)) `andBegin` literalconst }

    -- Literal built-in constants. See Note [Literal Constants].
    <literalconst> "()" | @sqs | @dqs | @chars { tok (\p s -> alex $ TkLiteralConst p (textOf s)) `andBegin` 0 }
{
-- Standalone-derived instances for the position type that Alex's wrapper
-- generates (it only derives Eq/Show itself).
deriving instance Generic AlexPosn
deriving instance NFData AlexPosn
deriving instance Lift AlexPosn
deriving instance Ord AlexPosn

-- | A zeroed-out position value.
-- NOTE(review): Alex's real start position is @AlexPn 0 1 1@ (lines and
-- columns are 1-based); the all-zero value here looks like a deliberate
-- "no position" sentinel — confirm at use sites.
topAlexPosn :: AlexPosn
topAlexPosn = AlexPn 0 0 0

-- Render a position as "line L, column C" (the byte offset is not shown).
instance Pretty (AlexPosn) where
    pretty (AlexPn _ line col) = "line " <> pretty line <> ", column " <> pretty col
-- | Decode one raw input byte as a 'Char' by treating the byte value as a
-- code point (Latin-1 style).  Used when scanning nested comments, where
-- only the ASCII delimiters '-', '{' and '}' are inspected.
chr8 :: Word8 -> Char
chr8 b = Data.Char.chr (fromIntegral b)
-- | Decode the (lazy, UTF-8 encoded) bytes of a matched token into strict
-- 'T.Text'.
textOf :: BSL.ByteString -> T.Text
textOf bs = T.decodeUtf8 (BSL.toStrict bs)
-- Taken from example by Simon Marlow.
-- This handles Haskell-style (possibly nested) comments.
--
-- Invariant: @go n@ scans with @n@ unclosed "{-" openers outstanding; the
-- comment is finished when @n@ reaches 0, at which point we resume normal
-- scanning from the current input position.
nested_comment :: Alex (Token AlexPosn)
nested_comment = go 1 =<< alexGetInput
  where
    go :: Int -> AlexInput -> Alex (Token AlexPosn)
    go 0 input = alexSetInput input >> alexMonadScan
    go n input =
        case alexGetChar input of
            Nothing -> err input  -- EOF inside an unterminated comment
            Just (c, input') ->
                case c of
                    '-' ->
                        case alexGetChar input' of
                            Nothing            -> err input'
                            Just ('}', input'') -> go (n-1) input''
                            -- The '-' did not start a closer, but the
                            -- character after it may itself begin a "-}" or
                            -- "{-" (e.g. in "--}" or "-{-"), so re-scan from
                            -- just after the '-' instead of skipping both.
                            Just _              -> go n input'
                    '{' ->
                        case alexGetChar input' of
                            Nothing            -> err input'
                            Just ('-', input'') -> go (n+1) input''
                            -- As above: "{{-" must still open a new level at
                            -- the second '{', so only consume the first one.
                            Just _              -> go n input'
                    _ -> go n input'
    -- Alex's bytestring wrapper yields bytes; view them as characters.
    alexGetChar = fmap (first chr8) . alexGetByte
    err (pos,_,_,_) =
        let (AlexPn _ line col) = pos in
        alexError ("Error in nested comment at line " ++ show line ++ ", column " ++ show col)
-- | Build an Alex action from a token constructor taking a position and a
-- fixed payload; the matched input text is discarded.
constructor c t = tok (\p _ -> alex $ c p t)

-- Emit a 'TkSpecial' token (parentheses, brackets, dot, braces).
mkSpecial = constructor TkSpecial

-- Emit a 'TkKeyword' token (see Note [Keywords]).
mkKeyword = constructor TkKeyword
-- | Turn a lexed identifier into a 'TkName' token, threading the identifier
-- state kept in the lexer's user state through 'newIdentifier' so the name
-- is paired with its unique.
handle_name :: AlexPosn -> T.Text -> Alex (Token AlexPosn)
handle_name pos name = do
    st <- get
    let (unique, identState') = runState (newIdentifier name) (alex_ust st)
    put st { alex_ust = identState' }
    pure (TkName pos name unique)
-- | Read a value out of an ASCII bytestring.  The partial 'read' is safe at
-- the call site because this is only applied to digit sequences matched by
-- the @nat rule.
readBSL :: (Read a) => BSL.ByteString -> a
readBSL bs = read (ASCII.unpack bs)
-- | Lift a pure value into the 'Alex' monad.
alex :: a -> Alex a
alex = pure

-- | Adapt a function over (start position, matched bytes) into the action
-- shape Alex expects: given the 'AlexInput' at the start of the token and
-- the token's length, pass the first @len@ bytes of the remaining input on.
tok f (p,_,s,_) len = f p (BSL.take len s)

{- Rule (stuff in {}):
   { ... } :: user       -- predicate state
           -> AlexInput  -- input stream before the token
           -> Int        -- length of the token
           -> AlexInput  -- input stream after the token
           -> Bool       -- True <=> accept the token
-}
-- | The lexer's user state is the identifier state threaded through
-- 'handle_name' to allocate uniques for names.
type AlexUserState = IdentifierState

-- | Initial user state, as required by the monadUserState wrapper.
alexInitUserState :: AlexUserState
alexInitUserState = emptyIdentifierState
-- Expose the whole lexer state through MonadState so that actions can use
-- 'get'/'gets'/'modify' directly (see 'handle_name' and 'alexEOF').  The
-- 'Alex' newtype wraps @AlexState -> Either String (AlexState, a)@, so these
-- never fail.
instance MonadState AlexState Alex where
    get = Alex (\s -> Right (s, s))
    put s = Alex (\_ -> Right (s, ()))
-- | Action run by Alex at end of input: emit an 'EOF' token carrying the
-- final source position.
alexEOF :: Alex (Token AlexPosn)
alexEOF = do
    s <- get
    pure (EOF (alex_pos s))
-- | Rewrap a raw Alex error string as a 'LexErr' parse error, leaving
-- successful results untouched.
liftError :: Either String a -> Either (ParseError b) a
liftError (Left s)  = Left (LexErr s)
liftError (Right a) = Right a
-- | Run a lexer computation over the given input, seeding the user state
-- with the supplied identifier state, and return the final identifier state
-- alongside the result.  Lexer failures are rewrapped via 'liftError'.
runAlexST :: ByteString.ByteString -> Alex a -> IdentifierState -> Either (ParseError AlexPosn) (IdentifierState, a)
runAlexST input (Alex f) initial = liftError $ first alex_ust <$>
    f (AlexState { alex_pos = alexStartPos
                 , alex_bpos = 0
                 , alex_inp = input
                 , alex_chr = '\n'    -- "previous character" placeholder before any input
                 , alex_ust = initial
                 , alex_scd = 0       -- start in mode 0, the initial start code
                 })
-- | As 'runAlexST', but packaged as a 'StateT' computation over the
-- identifier state, failing in 'Except' on a lexical error.  The pair is
-- swapped because 'runAlexST' returns (state, result) while 'StateT'
-- expects (result, state).
runAlexST' :: forall a. ByteString.ByteString -> Alex a -> StateT IdentifierState (Except (ParseError AlexPosn)) a
runAlexST' input al = StateT $ \is ->
    liftEither (fmap (\(is', a) -> (a, is')) (runAlexST input al is))
}