-
Notifications
You must be signed in to change notification settings - Fork 125
/
baseUtf8.ml
68 lines (60 loc) · 3.03 KB
/
baseUtf8.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
(*
Copyright © 2011 MLstate
This file is part of OPA.
OPA is free software: you can redistribute it and/or modify it under the
terms of the GNU Affero General Public License, version 3, as published by
the Free Software Foundation.
OPA is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for
more details.
You should have received a copy of the GNU Affero General Public License
along with OPA. If not, see <http://www.gnu.org/licenses/>.
*)
(* depends *)
module String = BaseString
(* -- *)
let except_html_char = "\"&<>" (* normal characters that *should* be escaped in html *)
let allowed_special_char = "\t\n\r" (* special characters that *should not* be escaped in html *)
let string_of_int i =
let r1 i = Char.chr (((i mod 64) + 128)) in
let r2 i = Char.chr (((i / 64) mod 64) + 128) in
if i < 128 then String.make 1 (Char.chr i)
else if i < 2048 then
let t = [| Char.chr ((i / 64) + 192); r1 i |] in
String.init 2 (fun n -> t.(n))
else if i < 65536 then
let t = [| Char.chr (((i / 4096) mod 16) + 224); r2 i; r1 i |] in
String.init 3 (fun n -> t.(n))
else if i < 2097152 then
let t = [| Char.chr (((i / 262144) mod 8) + 240); Char.chr (((i / 4096) mod 64) + 128); r2 i; r1 i |] in
String.init 4 (fun n -> t.(n))
else assert false
(* takes any utf8 string and converts it into html entities (escape *all* characters, which is not avised in general; please perform checks before calling it) *)
let htmlentities src =
let rec pow n p = if p = 0 then 1 else (pow n (p - 1)) * n in
let len = String.length src in
let soi = Pervasives.string_of_int in
let rec aux nbr pos i lst =
if i = len then lst
else
let chr = Char.code src.[i] in
if pos = (-1) then
if chr >= 240 then aux ((chr mod 16) * 262144) 2 (i + 1) lst
else if chr >= 224 then aux ((chr mod 32) * 4096) 1 (i + 1) lst
else if chr >= 192 then aux ((chr mod 64) * 64) 0 (i + 1) lst
else if (chr < 128 && (chr >= 32 || String.contains allowed_special_char src.[i])) || String.contains except_html_char src.[i]
then aux 0 (-1) (i + 1) (chr::lst)
else
(* between 128 and 192: malformed UTF-8; we should absolutely not fail, but return the usual black question mark for invalid symbols *)
(* between 0 and 31, except allowed_special_char, the entities seem to be illegal, so we project again to the question mark *)
begin
(* Journal.Interface.warning (Printf.sprintf "Warning: htmlentities: invalid UTF-8: in string %s at position %d on character of code %d" src i chr); *)
aux 0 (-1) (i + 1) (65533::lst)
end
else
let nbr = nbr + ((chr mod 64) * (pow 64 pos)) in
if pos = 0 then aux 0 (-1) (i + 1) (nbr::lst)
else aux nbr (pos - 1) (i + 1) lst
in
List.fold_right (fun item acc -> acc^"&#"^(soi item)^";") (aux 0 (-1) 0 []) ""