Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

[doc] jslang: adding some comments in the js lexer/parser

  • Loading branch information...
commit 5496ee5aa9fa3b0587ea7f5911c14a1e6e488bbc 1 parent 24ec553
Valentin Gatien-Baron authored
View
12 jslang/jsLex.mli
@@ -15,6 +15,18 @@
You should have received a copy of the GNU Affero General Public License
along with OPA. If not, see <http://www.gnu.org/licenses/>.
*)
+
+(**
+ The javascript lexer
+
+ Beware that the lexer is stateful, so it should not be used
+ by several threads simultaneously, nor should you try to intersperse
+ the lexing of several inputs
+
+ This module is not meant to be called directly, use instead the high level
+ functions provided in jsParse if you want to parse some javascript.
+*)
+
type token =
| Yield
| With
View
33 jslang/jsLex.mll
@@ -15,7 +15,10 @@
You should have received a copy of the GNU Affero General Public License
along with OPA. If not, see <http://www.gnu.org/licenses/>.
*)
+
{
+ (* to know what these tokens correspond to, simply
+ * look at association list below *)
type token =
| Yield
| With
@@ -117,7 +120,14 @@
| AmperAmper
| Amper
+  (* the ecmascript specification defines two kinds of lexing: the places in the ast
+   * where a token starting with / is a regular expression, and the places where
+   * it is the division or the division-assignment /=
+   * to avoid having information flow from the parser to the lexer,
+   * this is done by looking at the previous token (see the function lex) *)
let can_have_a_division = ref false
+
+ (* mapping keywords to tokens *)
let keywords_list = [
"break", Break;
"case", Case;
@@ -170,6 +180,8 @@
]
let keywords = Hashtbl.create 100
let () = List.iter (fun (a,b) -> Hashtbl.add keywords a b) keywords_list
+
+ (* using a single buffer to store the result of parsing string literals, regexp literals, etc. *)
let b = Buffer.create 1000
}
@@ -179,8 +191,15 @@ let hexa = ['0'-'9''a'-'f''A'-'F']
rule main = parse
| ['\t''\012''\013'' ']+ { main lexbuf }
+
+(* beware that we must not throw away newlines, so that we are able
+ * to implement semicolon insertion *)
| ['\n' '\r']+ { LT }
+
| "//" [^'\n''\r']* { main lexbuf }
+
+(* beware that if a newline appears in a multi line comment
+ * then we _must_ generate a newline token *)
| "/*" { multiline_comment false lexbuf }
| '/' { if !can_have_a_division then Div else (Buffer.clear b; regexp_body lexbuf) }
@@ -244,6 +263,10 @@ rule main = parse
| eof { EOF }
| _ as c { raise (Stream.Error (Printf.sprintf "unexpected character %C in main lexing" c)) }
+(* regular expressions are not really parsed; we simply interpret them enough
+ * to find the end of the regexp
+ * in particular, escapes are *not* interpreted, and so the string in the regexp
+ * node and token should not be escaped when printed *)
and regexp_body = parse
| ['\r''\n'] { raise (Stream.Error "Line terminator inside a regexp literal") }
| '\\' _ as s { Buffer.add_string b s; regexp_body lexbuf }
@@ -263,6 +286,8 @@ and character_class = parse
and regexp_flags s1 = parse
| identifier_part* as s2 { Regexp (s1,s2) }
+(* [double] is true when the string is enclosed in double quotes
+ * and false when it is enclosed in single quotes *)
and string double = parse
| "'" { if double then (Buffer.add_char b '\''; string double lexbuf)
else String (Buffer.contents b) }
@@ -284,6 +309,7 @@ and string double = parse
| eof { raise (Stream.Error "unterminated string literal comment") }
| _ as c { raise (Stream.Error (Printf.sprintf "unexpected character %C in a string literal" c)) }
+(* [newline] is true when a newline has been parsed in the comment *)
and multiline_comment newline = parse
| [^'*''\n''\r']* { multiline_comment newline lexbuf }
| ['\r''\n'] { multiline_comment true lexbuf }
@@ -292,7 +318,14 @@ and multiline_comment newline = parse
| eof { raise (Stream.Error "unterminated multiline comment") }
{
+(* this global variable is used to ensure that the lexer never returns
+ * two consecutive newlines, which is useful in the parser: if you want
+ * to look at the first non-newline token, you only need a lookahead
+ * of 2 (otherwise the lookahead would be unbounded) *)
let just_parsed_a_line_terminator = ref true
+
+(* the main lexing function: calls the actual lexer, and updates the global
+ * state *)
let rec lex lexbuf =
match main lexbuf with
| LT when !just_parsed_a_line_terminator ->
View
24 jslang/jsParse.ml
@@ -18,10 +18,12 @@
module J = JsAst
open JsLex (* bringing token in the scope *)
+(* right now, the parser doesn't insert any positions in the ast *)
let dummy_pos = FilePos.nopos "jsParse"
let label () = Annot.next_label dummy_pos
let native_ident = JsCons.Ident.native
+(* used for debug only, not error messages *)
let string_of_token = function
| Break -> "break"
| Case -> "case"
@@ -123,19 +125,27 @@ let string_of_token = function
| Div -> "/"
| DivEqual -> "/="
+(* redefining the module Stream allows us to 'override' the syntax of streams:
+ * the new peek, junk and empty functions look at the first non-newline token
+ * (which allows us to write almost the whole parser while implicitly
+ * discarding newlines) *)
module Stream =
struct
+
type 'a t = 'a Stream.t
exception Failure = Stream.Failure
exception Error = Stream.Error
let from = Stream.from
let junk_no_newline = Stream.junk
+ let peek_no_newline = Stream.peek
+
let junk stream =
+ (* this function is symmetric with peek below *)
(match Stream.peek stream with
| Some LT -> Stream.junk stream
| _ -> ());
Stream.junk stream
- let peek_no_newline = Stream.peek
+
(*let peek stream =
match Stream.npeek 2 stream with
| [LT; a] -> Some a
@@ -145,16 +155,22 @@ struct
let peek stream = (* this Stream.peek makes the parsing really faster *)
match Stream.peek stream with
| Some LT ->
+      (* using the invariant that says that you never have two consecutive
+       * newlines in the token stream *)
(match Stream.npeek 2 stream with
| _ :: t :: _ -> Some t
| _ -> None)
| o -> o
+
+ (* redefining empty because a stream with only a newline must be considered
+ * as empty *)
let empty s =
match peek s with
| None -> ()
| Some _ -> raise Stream.Failure
end
+(* a handful of parser combinators *)
let rev_list0_aux acc parser_ stream =
let rec aux acc = parser
| [< e = parser_; stream >] -> aux (e :: acc) stream
@@ -209,6 +225,12 @@ let option parser_ = parser
let option_default default parser_ = parser
| [< r = parser_ >] -> r
| [< >] -> default
+
+(* tries to parse using [parser_] but only when there is no newline
+ * in the input stream
+ * for cases such as [return
+ * 2]
+ * which is parsed as [return; 2] and not [return 2] *)
let option_no_newline parser_ stream =
match Stream.peek_no_newline stream with
| Some LT -> None
View
23 jslang/jsParse.mli
@@ -15,20 +15,43 @@
You should have received a copy of the GNU Affero General Public License
along with OPA. If not, see <http://www.gnu.org/licenses/>.
*)
+
+(**
+ The javascript parser
+ (implemented by roughly following the ecmascript specification
+ http://www.ecma-international.org/publications/files/ECMA-ST/Ecma-262.pdf)
+*)
+
+(**
+ Low level parsing functions, taking streams as input
+*)
+
val code : JsLex.token Stream.t -> JsAst.code
val expr : JsLex.token Stream.t -> JsAst.expr
val stm : JsLex.token Stream.t -> JsAst.statement
+(**
+ The exception that is thrown when the corresponding argument
+ is given to the functions below and a syntax error occurs
+*)
+
type error
exception Exception of error
val pp : Format.formatter -> error -> unit
+(**
+ The high level parsing functions interface
+ By default [throw_exn] is false, and the parser exits when faced
+ with a parse error
+*)
+
module String :
sig
val code : ?throw_exn:bool -> string -> JsAst.code
val expr : ?throw_exn:bool -> ?globalize:bool -> string -> JsAst.expr
val stm : ?throw_exn:bool -> string -> JsAst.statement
end
+
module File :
sig
val code : ?throw_exn:bool -> string -> JsAst.code
Please sign in to comment.
Something went wrong with that request. Please try again.