Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

[enhance] optim,bslString: html string escaping

The function doing escaping was sadly quadratic in the length of successive character that are converted to htmlentities (now it's linear).
Avoid to construct intermediary list for block of non escaped and escaped chars
Avoid to construct list of escaped chars
Speep-up the most frequent path by a factor 3 (block of non escaped chars)

CHANGELOG optimisation of html escaping (x3 speed and use less memory)
  • Loading branch information...
commit e51ec5d223ac44a64bbc2354487b5466f317f1d7 1 parent f94cd81
@OpaOnWindowsNow OpaOnWindowsNow authored
View
6 libbase/baseChar.ml
@@ -1,5 +1,5 @@
(*
- Copyright © 2011 MLstate
+ Copyright © 2011, 2012 MLstate
This file is part of OPA.
@@ -81,3 +81,7 @@ let hexa_value c =
else if c >= 'a' && c <= 'f' then 87 (* int_of_char 'a' - 10 *)
else assert false
)
+
+let cache fun_charset =
+ let table = Array.init 256 (fun code -> fun_charset (Char.unsafe_chr code)) in
+ fun c -> Array.unsafe_get table (Char.code c)
View
3  libbase/baseChar.mli
@@ -1,5 +1,5 @@
(*
- Copyright © 2011 MLstate
+ Copyright © 2011, 2012 MLstate
This file is part of OPA.
@@ -43,3 +43,4 @@ val is_alpha : char -> bool
*)
val is_space : char -> bool
val hexa_value : char -> int
+val cache : (char->bool) -> (char->bool)
View
12 libbase/baseString.ml
@@ -624,14 +624,14 @@ let last_char s =
if len = 0 then invalid_arg "String.last_char" else
unsafe_get s (pred len)
+let rec aux_len_from func str len i =
+ if i<>len && func (unsafe_get str i) then aux_len_from func str len (i+1)
+ else i
+
let len_from func str index =
let len = length str in
- let max = len - index in
- let rec aux cur =
- if cur < max && func str.[index + cur] then aux (succ cur)
- else cur
- in
- aux 0
+ if 0 <= index && index < len then (aux_len_from func str len index)-index
+ else invalid_arg (Printf.sprintf "String.len_from(_,len=%d,%d)" len index)
let hash = Hashtbl.hash
View
2  libbase/baseString.mli
@@ -1,5 +1,5 @@
(*
- Copyright © 2011 MLstate
+ Copyright © 2011, 2012 MLstate
This file is part of OPA.
View
58 libbase/baseUtf8.ml
@@ -1,5 +1,5 @@
(*
- Copyright © 2011 MLstate
+ Copyright © 2011, 2012 MLstate
This file is part of OPA.
@@ -20,8 +20,13 @@ module String = BaseString
(* -- *)
-let except_html_char = "\"&<>" (* normal characters that *should* be escaped in html *)
-let allowed_special_char = "\t\n\r" (* special characters that *should not* be escaped in html *)
+(* normal characters that *should* be escaped in html *)
+let except_html_char = "\"&<>"
+let fexcept_html_char = BaseChar.cache (String.contains except_html_char)
+
+(* special characters that *should not* be escaped in html *)
+let allowed_special_char = "\t\n\r"
+let fallowed_special_char = BaseChar.cache (String.contains allowed_special_char)
let string_of_int i =
let r1 i = Char.chr (((i mod 64) + 128)) in
@@ -39,30 +44,25 @@ let string_of_int i =
else assert false
(* takes any utf8 string and converts it into html entities (escape *all* characters, which is not avised in general; please perform checks before calling it) *)
-let htmlentities src =
- let rec pow n p = if p = 0 then 1 else (pow n (p - 1)) * n in
- let len = String.length src in
- let soi = Pervasives.string_of_int in
- let rec aux nbr pos i lst =
- if i = len then lst
- else
- let chr = Char.code src.[i] in
- if pos = (-1) then
- if chr >= 240 then aux ((chr mod 16) * 262144) 2 (i + 1) lst
- else if chr >= 224 then aux ((chr mod 32) * 4096) 1 (i + 1) lst
- else if chr >= 192 then aux ((chr mod 64) * 64) 0 (i + 1) lst
- else if (chr < 128 && (chr >= 32 || String.contains allowed_special_char src.[i])) || String.contains except_html_char src.[i]
- then aux 0 (-1) (i + 1) (chr::lst)
- else
- (* between 128 and 192: malformed UTF-8; we should absolutely not fail, but return the usual black question mark for invalid symbols *)
- (* between 0 and 31, except allowed_special_char, the entities seem to be illegal, so we project again to the question mark *)
- begin
- (* Journal.Interface.warning (Printf.sprintf "Warning: htmlentities: invalid UTF-8: in string %s at position %d on character of code %d" src i chr); *)
- aux 0 (-1) (i + 1) (65533::lst)
- end
- else
- let nbr = nbr + ((chr mod 64) * (pow 64 pos)) in
- if pos = 0 then aux 0 (-1) (i + 1) (nbr::lst)
- else aux nbr (pos - 1) (i + 1) lst
+let htmlentities_append buf src start len =
+ let unknown = 65533 in
+ let last = start+len in
+ let rec aux i =
+ if i=last then () else (
+ let char_code = Cactutf.look src i in
+ let next_i = try Cactutf.next src i with _ -> (* bad utf8 *) i+1 in
+ let char_ok = (char_code >= 32 || fallowed_special_char src.[i]) (* control char *)
+ (*&& (char_code <= 128 || fexcept_html_char src.[i]) *) (* Useless *)
+ in
+ Buffer.add_string buf "&#";
+ Buffer.add_string buf (Pervasives.string_of_int (if char_ok then char_code else unknown));
+ Buffer.add_char buf ';';
+ aux next_i
+ )
in
- List.fold_right (fun item acc -> acc^"&#"^(soi item)^";") (aux 0 (-1) 0 []) ""
+ aux start
+
+let htmlentities src =
+ let buf = Buffer.create ((String.length src) * ( (*String.length "&#00;"*)3 + 2 )) in
+ htmlentities_append buf src 0 (String.length src);
+ Buffer.contents buf
View
5 libbase/baseUtf8.mli
@@ -1,5 +1,5 @@
(*
- Copyright © 2011 MLstate
+ Copyright © 2011, 2012 MLstate
This file is part of OPA.
@@ -18,4 +18,7 @@
val except_html_char : string
val allowed_special_char : string
val string_of_int : int -> string
+(* escape all chars to html entities using a formater taking starting position and number of char to process *)
+val htmlentities_append : Buffer.t -> string -> int -> int -> unit
+(* escape all chars to html entities *)
val htmlentities : string -> string
View
71 opabsl/mlbsl/bslString.ml
@@ -69,37 +69,64 @@ let have_to_be_escaped_table =
let code = Char.code chr in
code >= 128 || String.contains Base.Utf8.except_html_char chr || (code < 32 && not (String.contains Base.Utf8.allowed_special_char chr)) in
Array.init 256 (fun code -> have_to_be_escaped (Char.unsafe_chr code))
-let have_to_be_escaped (c:char) = have_to_be_escaped_table.(Char.code c)
-let not_have_to_be_escaped (c:char) = not (have_to_be_escaped_table.(Char.code c))
+let have_to_be_escaped (c:char) = Array.unsafe_get have_to_be_escaped_table (Char.code c)
+let not_have_to_be_escaped (c:char) = not (have_to_be_escaped c)
-
-let utf8_byte_have_to_be_escaped = function
+(* too slow
+let _utf8_byte_have_to_be_escaped = function
| '"' | '<' | '>' | '&' -> true
-(* | '\'' -> true *)
+(* | '\'' -> true *)
| _ -> false
+*)
+(* Don't understand how using an array is faster in this case, but it is *)
+let table_utf8_byte_have_to_be_escaped = Array.init 256 (fun code -> String.contains Base.Utf8.except_html_char (Char.unsafe_chr code))
+let utf8_byte_have_to_be_escaped c = Array.unsafe_get table_utf8_byte_have_to_be_escaped (Char.code c)
+
+
+let utf8_byte_not_have_to_be_escaped c = not (utf8_byte_have_to_be_escaped c)
+
+(* fast path for non escaped part x2 speed-up *)
+(* same as BaseString.len_from but avoiding second order call and hopefully benefit from inlining *)
+let rec from_to_utf8_nbhtbe src i len =
+ if (i<>len) && (utf8_byte_not_have_to_be_escaped (String.unsafe_get src i)) then from_to_utf8_nbhtbe src (i+1) len else i
+let rec from_to_nhtbe src i len =
+ if (i<>len) && (not_have_to_be_escaped (String.unsafe_get src i)) then from_to_nhtbe src (i+1) len else i
+
+let empty_buf = Buffer.create 0
(* This thing works with utf-8 because
- if utf8 encoding is ok, no 'one byte utf8 char' needs to be escaped if the html has utf-8 encoding,
- if utf8 is not ok, all byte of longer than on byte character are seen as needing escaping *)
##register escapeHTML : bool, string -> string
let escapeHTML utf8 src =
+ let len = String.length src in
+ if len=0 then "" else
let have_to_be_escaped = if utf8 then utf8_byte_have_to_be_escaped else have_to_be_escaped in
- if BaseString.exists have_to_be_escaped src then
- let len = String.length src in
- let rec aux pos acc =
- if pos < len then
- if not (have_to_be_escaped src.[pos]) then
- let to_push = String.sub src pos (BaseString.len_from (fun c -> not (have_to_be_escaped c)) src pos) in
- aux (pos + (String.length to_push)) (to_push::acc)
- else
- let to_push = String.sub src pos (BaseString.len_from have_to_be_escaped src pos) in
- aux (pos + (String.length to_push)) ((Base.Utf8.htmlentities to_push)::acc)
- else acc
- in
- BaseString.rev_sconcat "" (aux 0 [])
- else
- src
-
+ let not_have_to_be_escaped = if utf8 then utf8_byte_not_have_to_be_escaped else not_have_to_be_escaped in
+ let rec aux pos buf =
+ let need_escaping = have_to_be_escaped src.[pos] in
+ let len_to_push =
+ if utf8 && not(need_escaping) then (from_to_utf8_nbhtbe src pos len)-pos
+ else if not(need_escaping) then (from_to_nhtbe src pos len)-pos
+ else (
+ let case = if need_escaping then have_to_be_escaped else not_have_to_be_escaped in
+ BaseString.len_from case src pos
+ )
+ in
+ let new_pos = pos + len_to_push in
+ if new_pos<len then (
+ let buf = if buf == empty_buf then Buffer.create (len+(4*len/(new_pos+1))) else buf in
+ if need_escaping then
+ Base.Utf8.htmlentities_append buf src pos len_to_push
+ else
+ Buffer.add_substring buf src pos len_to_push;
+ aux new_pos buf
+ ) else (* term *)
+ if buf==empty_buf then (
+ if need_escaping then Base.Utf8.htmlentities src else src
+ ) else Buffer.contents buf
+ in
+ aux 0 empty_buf
##register to_character \ `Base.Utf8.string_of_int` : int -> string
@@ -118,8 +145,6 @@ let of_byte_unsafe i =
##register byte_at_unsafe : int, string -> int
let byte_at_unsafe n s = Base.Char.code s.[n]
-
-
(* special function for TRX *)
(* TODO write it in C for better performance (on pointers)?
we could then even use some bit-level magic cleverlness to compare word-by-word instead
Please sign in to comment.
Something went wrong with that request. Please try again.