Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 479 lines (449 sloc) 11.754 kb
fccc685 Initial open-source release
MLstate authored
1 (*
2 Copyright © 2011 MLstate
3
4 This file is part of OPA.
5
6 OPA is free software: you can redistribute it and/or modify it under the
7 terms of the GNU Affero General Public License, version 3, as published by
8 the Free Software Foundation.
9
10 OPA is distributed in the hope that it will be useful, but WITHOUT ANY
11 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for
13 more details.
14
15 You should have received a copy of the GNU Affero General Public License
16 along with OPA. If not, see <http://www.gnu.org/licenses/>.
17 *)
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
18
fccc685 Initial open-source release
MLstate authored
19 {
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
20 (* to know what these tokens correspond to, simply
21 * look at association list below *)
fccc685 Initial open-source release
MLstate authored
(* Tokens produced by the JavaScript lexer.
 * The spelling of each keyword token is given by [keywords_list]; the
 * punctuation tokens are produced by the [main] rule.
 * The constructors are kept in their original declaration order. *)
type token =
  | Yield
  | With | While
  | Void | Var
  | Typeof | Try | True | TimesEqual | Times | Tilda | Throw | This
  | Switch | Super
  | String of string
  | Static | Semic
  | Rparen | Return
  | Regexp of (string * string)
  | Rcurly | Rbracket
  | Question
  | Public | Protected | Private | PlusPlus | PlusEqual | Plus
  | PercentEqual | Percent | Package
  | Null | New
  | MinusMinus | MinusEqual | Minus
  | LtLtEqual | LtLt | Lt | Lparen | Let | Le | Lcurly | Lbracket | LT
  | Interface
  | Integer of string
  | Instanceof | In | Import | Implements | If
  | Ident of string
  | GtGtGtEqual | GtGtGt | GtGtEqual | GtGt | Gt | Ge
  | Function | For | Finally | False
  | Extends | Export | EqualEqualEqual | EqualEqual | Equal | Enum | Else | EOF
  | Dot | Do | DivEqual | Div | Delete | Default | Debugger
  | Continue | Const | Comma | Colon | Class | ChapeauEqual | Chapeau
  | Catch | Case
  | Break | BarEqual | BarBar | Bar | BangEqualEqual | BangEqual | Bang
  | AmperEqual | AmperAmper | Amper
122
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
123 (* the ecmascript defines two kinds of lexing: for the places in the ast
124 * where a token starting with / is a regular expression, and the places where
125 * it is the division or the division-assignment /=
126 * to avoid having to have some information flow from the parser to the lexer,
127 * this is done by looking at the previous token (see function lex) *)
fccc685 Initial open-source release
MLstate authored
128 let can_have_a_division = ref false
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
129
130 (* mapping keywords to tokens *)
fccc685 Initial open-source release
MLstate authored
(* Association list from the spelling of each reserved word to its token. *)
let keywords_list =
  [ (* keywords *)
    "break", Break
  ; "case", Case
  ; "catch", Catch
  ; "continue", Continue
  ; "debugger", Debugger
  ; "default", Default
  ; "delete", Delete
  ; "do", Do
  ; "else", Else
  ; "finally", Finally
  ; "for", For
  ; "function", Function
  ; "if", If
  ; "in", In
  ; "instanceof", Instanceof
  ; "new", New
  ; "return", Return
  ; "switch", Switch
  ; "this", This
  ; "throw", Throw
  ; "try", Try
  ; "typeof", Typeof
  ; "var", Var
  ; "void", Void
  ; "while", While
  ; "with", With

    (* future reserved words *)
  ; "class", Class
  ; "const", Const
  ; "enum", Enum
  ; "export", Export
  ; "extends", Extends
  ; "import", Import
  ; "super", Super

    (* future reserved words of strict mode *)
  ; "implements", Implements
  ; "interface", Interface
  ; "let", Let
  ; "package", Package
  ; "private", Private
  ; "protected", Protected
  ; "public", Public
  ; "static", Static
  ; "yield", Yield

    (* literals *)
  ; "null", Null
  ; "true", True
  ; "false", False
  ]

(* Hash table view of [keywords_list], filled once when the module is
 * initialised; the [main] rule looks every identifier up in it. *)
let keywords =
  let table = Hashtbl.create 100 in
  List.iter (fun (word, tok) -> Hashtbl.add table word tok) keywords_list;
  table
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
183
184 (* using a single buffer to store the result of parsing string literals, regexp literals, etc. *)
fccc685 Initial open-source release
MLstate authored
185 let b = Buffer.create 1000
186 }
187
188 let identifier_part = ['a'-'z''A'-'Z''_''$''0'-'9']
189 let identifier = ['a'-'z''A'-'Z''_''$'] identifier_part*
190 let hexa = ['0'-'9''a'-'f''A'-'F']
191
(* Entry rule of the lexer: returns the next raw token.
 * Blank space and comments are skipped, line terminators are returned as LT,
 * and a leading slash is read either as a division or as the start of a
 * regexp literal depending on [can_have_a_division].
 * Raises Stream.Error on a character that cannot start any token. *)
rule main = parse
  (* blank space: tab, vertical tab (decimal 011), form feed (decimal 012) and
   * space. OCaml character escapes are decimal, so 013 would be the carriage
   * return, which is a line terminator and must reach the next case instead
   * of being skipped here *)
  | ['\t' '\011' '\012' ' ']+ { main lexbuf }

  (* beware that we must not throw newlines away, to be able to implement
   * semicolon insertion *)
  | ['\n' '\r']+ { LT }

  | "//" [^'\n' '\r']* { main lexbuf }

  (* beware that if a newline appears in a multi line comment
   * then we _must_ generate a newline token *)
  | "/*" { multiline_comment false lexbuf }

  | '/' { if !can_have_a_division then Div else (Buffer.clear b; regexp_body lexbuf) }
  | "/=" { if !can_have_a_division then DivEqual else (Buffer.clear b; Buffer.add_char b '='; regexp_body lexbuf) }

  | '{' { Lcurly }
  | '}' { Rcurly }
  | '[' { Lbracket }
  | ']' { Rbracket }
  | '(' { Lparen }
  | ')' { Rparen }
  | '.' { Dot }
  | ';' { Semic }
  | ',' { Comma }
  | '<' { Lt }
  | '>' { Gt }
  | "<=" { Le }
  | ">=" { Ge }
  | "==" { EqualEqual }
  | "!=" { BangEqual }
  | "===" { EqualEqualEqual }
  | "!==" { BangEqualEqual }
  | "+" { Plus }
  | "-" { Minus }
  | "*" { Times }
  | "%" { Percent }
  | "++" { PlusPlus }
  | "--" { MinusMinus }
  | "<<" { LtLt }
  | ">>" { GtGt }
  | ">>>" { GtGtGt }
  | "&" { Amper }
  | "|" { Bar }
  | "^" { Chapeau }
  | "!" { Bang }
  | "~" { Tilda }
  | "&&" { AmperAmper }
  | "||" { BarBar }
  | "?" { Question }
  | ":" { Colon }
  | "=" { Equal }
  | "+=" { PlusEqual }
  | "-=" { MinusEqual }
  | "*=" { TimesEqual }
  | "%=" { PercentEqual }
  | "<<=" { LtLtEqual }
  | ">>=" { GtGtEqual }
  | ">>>=" { GtGtGtEqual }
  | "&=" { AmperEqual }
  | "|=" { BarEqual }
  | "^=" { ChapeauEqual }

  (* reserved words share the syntax of identifiers and are told apart by a
   * lookup in the [keywords] table *)
  | identifier as s { try Hashtbl.find keywords s with Not_found -> Ident s }

  (* numeric literals, kept as their source text. A hexadecimal literal needs
   * at least one digit after the 0x prefix *)
  | ('0' | ['1'-'9'] ['0'-'9']*) '.' ['0'-'9']* (['e''E'] ['-''+']? ['0'-'9']+)?
  | '.' ['0'-'9']+ (['e''E'] ['-''+']? ['0'-'9']+)?
  | ('0' | ['1'-'9'] ['0'-'9']*) (['e''E'] ['-''+']? ['0'-'9']+)?
  | '0' ['x''X'] hexa+
    as s { Integer s }

  | "'" { Buffer.clear b; string false lexbuf }
  | '"' { Buffer.clear b; string true lexbuf }
  | eof { EOF }
  | _ as c { raise (Stream.Error (Printf.sprintf "unexpected character %C in main lexing" c)) }
265
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
266 (* regular expressions are not really parsed: they are interpreted just enough
267 * to find the end of the regexp.
268 * in particular, escapes are *not* interpreted, and so the string in the regexp
269 * node and token should not be escaped when printed *)
fccc685 Initial open-source release
MLstate authored
(* Reads the body of a regexp literal, after the opening slash, into the
 * shared buffer [b], then hands over to [regexp_flags].
 * Raises Stream.Error on a line terminator (even after a backslash) and on
 * an unterminated literal. *)
and regexp_body = parse
  | '\\'? ['\r' '\n'] { raise (Stream.Error "Line terminator inside a regexp literal") }
  (* a backslash sequence is copied verbatim, backslash included *)
  | ('\\' [^'\r' '\n']) as s { Buffer.add_string b s; regexp_body lexbuf }
  | '[' as c { Buffer.add_char b c; character_class lexbuf; regexp_body lexbuf }
  | [^'\\' '\r' '\n' '[' '/']+ as s { Buffer.add_string b s; regexp_body lexbuf }
  | '/' { let s = Buffer.contents b in
          Buffer.clear b;
          regexp_flags s lexbuf }
  | _ as c { raise (Stream.Error (Printf.sprintf "unexpected character %C in regexp body" c)) }
  | eof { raise (Stream.Error "unterminated regexp body ") }

(* Reads a character class, after its opening bracket, into [b], up to and
 * including the closing bracket. A slash inside a class does not end the
 * regexp. Line terminators are rejected here exactly as in [regexp_body], so
 * that an unclosed class cannot swallow the following lines. *)
and character_class = parse
  | ']' as c { Buffer.add_char b c }
  | '\\'? ['\r' '\n'] { raise (Stream.Error "Line terminator inside a regexp literal") }
  | ('\\' [^'\r' '\n']) as s { Buffer.add_string b s; character_class lexbuf }
  | [^'\\' ']' '\r' '\n']+ as s { Buffer.add_string b s; character_class lexbuf }
  | _ as c { raise (Stream.Error (Printf.sprintf "unexpected character %C in character class" c)) }
  | eof { raise (Stream.Error "unterminated character class ") }

(* Reads the (possibly empty) flags that follow the closing slash and builds
 * the Regexp token from the body [s1] and the flags. *)
and regexp_flags s1 = parse
  | identifier_part* as s2 { Regexp (s1, s2) }
288
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
289 (* [double] is true when the string is enclosed in double quotes
290 * and false when it is enclosed in single quotes *)
fccc685 Initial open-source release
MLstate authored
(* Reads the contents of a string literal, after its opening quote, into the
 * shared buffer [b] and returns the String token with escapes interpreted.
 * [double] tells which kind of quote closes the literal.
 * Raises Stream.Error on a raw line terminator and on an unterminated
 * literal. *)
and string double = parse
  | "'" { if double then (Buffer.add_char b '\''; string double lexbuf)
          else String (Buffer.contents b) }
  | '"' { if double then String (Buffer.contents b)
          else (Buffer.add_char b '"'; string double lexbuf) }
  | [^'\'' '"' '\\' '\n' '\r']+ as s { Buffer.add_string b s; string double lexbuf }
  | ['\n' '\r'] { raise (Stream.Error "Line terminator inside a single string literal") }

  (* octal escape: the digits are read in base 8, and three digits are only
   * taken when the first one is at most 3, so the value never exceeds 255.
   * The code point is encoded the same way as for the hexadecimal escapes
   * below *)
  | "\\" ((['0'-'3'] ['0'-'7'] ['0'-'7'] | ['0'-'7'] ['0'-'7']?) as s)
      { Buffer.add_string b (Scanf.sscanf s "%o" (fun d -> Cactutf.cons d)); string double lexbuf }
  | "\\b" { Buffer.add_char b '\008'; string double lexbuf }
  | "\\t" { Buffer.add_char b '\t'; string double lexbuf }
  | "\\n" { Buffer.add_char b '\n'; string double lexbuf }
  | "\\v" { Buffer.add_char b '\011'; string double lexbuf }
  | "\\f" { Buffer.add_char b '\012'; string double lexbuf }
  | "\\r" { Buffer.add_char b '\r'; string double lexbuf }
  | "\\\\" { Buffer.add_char b '\\'; string double lexbuf }
  | "\\" (['"' '\''] as c) { Buffer.add_char b c; string double lexbuf }
  | "\\u" (hexa hexa hexa hexa as s) { Buffer.add_string b (Scanf.sscanf s "%x" (fun d -> Cactutf.cons d)); string double lexbuf }
  | "\\x" (hexa hexa as s) { Buffer.add_string b (Scanf.sscanf s "%x" (fun d -> Cactutf.cons d)); string double lexbuf }
  | eof { raise (Stream.Error "unterminated string literal") }

  (* any other backslash is kept as is; the character after it is then read
   * by the cases above *)
  | "\\" { Buffer.add_char b '\\'; string double lexbuf }
  | _ as c { raise (Stream.Error (Printf.sprintf "unexpected character %C in a string literal" c)) }
312
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
313 (* [newline] is true when a newline has been parsed in the comment *)
fccc685 Initial open-source release
MLstate authored
(* Skips the rest of a multi line comment, after its opening delimiter.
 * When the comment spans several lines it counts as a line terminator and LT
 * is returned; otherwise lexing resumes with [main].
 * Raises Stream.Error when the input ends inside the comment. *)
and multiline_comment newline = parse
  | [^'*' '\n' '\r']* { multiline_comment newline lexbuf }
  | "*/" { if not newline then main lexbuf else LT }
  (* a star that does not close the comment *)
  | '*' { multiline_comment newline lexbuf }
  | ['\r' '\n'] { multiline_comment true lexbuf }
  | eof { raise (Stream.Error "unterminated multiline comment") }
320
321 {
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
322 (* this global variable is used to ensure that the lexer never returns
323 * two consecutive new lines, which is useful in the parser, because
324 * if you want to look at the first non newline token, you need a lookahead
325 * of 2 with this (otherwise the lookahead would be unbounded) *)
fccc685 Initial open-source release
MLstate authored
326 let just_parsed_a_line_terminator = ref true
5496ee5 [doc] jslang: adding some comments in the js lexer/parser
Valentin Gatien-Baron authored
327
328 (* the main lexing function: calls the actual lexer and updates the global
329 * state *)
fccc685 Initial open-source release
MLstate authored
(* Returns the next token of [lexbuf] and updates the two global flags:
 * - [just_parsed_a_line_terminator], so that two LT tokens are never
 *   returned in a row;
 * - [can_have_a_division], which tells [main] whether a slash following the
 *   returned token is a division or the start of a regexp literal. *)
let rec lex lexbuf =
  match main lexbuf with
  | LT when !just_parsed_a_line_terminator ->
      (* INVARIANT: there are never two consecutive LT in the token stream.
       * Neither flag changes: skip this LT and read the next token *)
      lex lexbuf
  | LT ->
      (* can_have_a_division does not change *)
      just_parsed_a_line_terminator := true;
      LT

  (* these symbols cannot be followed by a division: a slash after them
   * starts a regexp literal *)
  | Lbracket | Lcurly | Rcurly | Lparen
  | Dot | Semic | Comma
  | Lt | Gt | Le | Ge
  | EqualEqual | BangEqual | EqualEqualEqual | BangEqualEqual
  | Plus | Minus | Times | Percent
  | LtLt | GtGt | GtGtGt
  | Amper | Bar | Chapeau | Bang | Tilda
  | AmperAmper | BarBar
  | Question | Colon
  | Equal | PlusEqual | MinusEqual | TimesEqual | PercentEqual
  | LtLtEqual | GtGtEqual | GtGtGtEqual
  | AmperEqual | BarEqual | ChapeauEqual
  | Div | DivEqual
  | Break | Case | Catch | Continue | Debugger | Default | Delete | Do
  | Else | Finally | For | Function | If | In | Instanceof | New | Return
  | Switch | Throw | Typeof | Try | Var | Void | While | With
  | Class | Const | Enum | Export | Extends | Import | Super
  | Implements | Interface | Let | Package | Private | Protected | Public
  | Static | Yield
    as r ->
      just_parsed_a_line_terminator := false;
      can_have_a_division := false;
      r

  (* these symbols can be followed by a division: they can end an
   * expression. This is among them because it is an expression on its own,
   * so the slash in [this / 2] must be read as a division *)
  | EOF (* don't care *)
  | Rbracket | Rparen
  | PlusPlus | MinusMinus
  | Ident _
  | This
  | False | True | Null
  | Regexp _ | String _ | Integer _
    as r ->
      just_parsed_a_line_terminator := false;
      can_have_a_division := true;
      r
(* Puts the global lexer state back to its start-of-input values: as if a line
 * terminator had just been read, and with a leading slash read as the start
 * of a regexp literal. *)
let init_lexer () =
  just_parsed_a_line_terminator := true;
  can_have_a_division := false
454
(* Wraps [lex] into a token stream over [lexbuf]; the stream ends at the
 * first EOF token, which is not part of it. *)
let stream lexbuf =
  let next_token _ =
    match lex lexbuf with
    | EOF -> None
    | tok -> Some tok
  in
  Stream.from next_token
462
(* Resets the lexer state and returns the token stream of [file] together
 * with the underlying lexing buffer.
 * When a Sys_error is raised (typically because the file cannot be opened),
 * a message is printed on standard output and the program exits with
 * status 1. *)
let stream_of_file file =
  init_lexer ();
  try
    let channel = open_in file in
    (* the channel is closed when the garbage collector reclaims it *)
    Gc.finalise close_in channel;
    let lexbuf = Lexing.from_channel channel in
    let tokens = stream lexbuf in
    (tokens, lexbuf)
  with Sys_error reason ->
    Printf.printf "Couldn't open file %s: %s\n%!" file reason;
    exit 1
473
(* Resets the lexer state and returns the token stream of the source text
 * [string] together with the underlying lexing buffer. *)
let stream_of_string string =
  init_lexer ();
  let lexbuf = Lexing.from_string string in
  let tokens = stream lexbuf in
  (tokens, lexbuf)
478 }
Something went wrong with that request. Please try again.