Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 552 lines (512 sloc) 18.654 kB
fccc685 Initial open-source release
MLstate authored
1 (*
70d9436 @Aqua-Ye [fix] cactutf.ml: cons function : if negative character, consider it …
Aqua-Ye authored
2 Copyright © 2011, 2012 MLstate
fccc685 Initial open-source release
MLstate authored
3
4 This file is part of OPA.
5
6 OPA is free software: you can redistribute it and/or modify it under the
7 terms of the GNU Affero General Public License, version 3, as published by
8 the Free Software Foundation.
9
10 OPA is distributed in the hope that it will be useful, but WITHOUT ANY
11 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12 FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for
13 more details.
14
15 You should have received a copy of the GNU Affero General Public License
16 along with OPA. If not, see <http://www.gnu.org/licenses/>.
17 *)
18 (*
19 -------------------
20 | cactUTF section |
21 -------------------
22 *)
23
24 (*
25 * Note of the authors:
26 *
27 * Beware of the stings !
28 **)
29
30 type unicode = int
31 type unicode_index = int
32 type bytes_index = int
33
34 let (@>) s pos =
35 int_of_char(s.[pos])
36
37 (*
38 FIXME: wild error management, incorrect
39 *)
40 exception Error of string
41 let myerror s = raise (Error s)
42 let warning_or_error s =
43 if true then prerr_endline s else myerror s
44 (* END OF FIXME *)
45
46 exception Lenbytes of int
47 let pre_lenbytes i =
48 if (i < 128) then
49 1
50 else if (i < 192) then
51 raise (Lenbytes i)
52 else if (i < 224) then
53 2
54 else if (i < 240) then
55 3
56 else
57 4
58 (*
59 For one Unicode code, return the number of bytes needed for
60 a representation in UTF-8.
61 *)
62 let lenbytes i =
63 try
64 pre_lenbytes i
65 with
66 Lenbytes i ->
67 warning_or_error (Printf.sprintf "bslCactutf.lenbytes : invalid UTF8 opcode: %d" i);
68 1
69
70 (* determine if a bytechar is the first of a utf-8 char *)
71 let is_first_char i = i < 128 || 192 <= i
72 let is_middle_char i = 128 <= i && i < 192
73
74 (*
75 ** prev()
76 ** Return the index of the first utf8-char before or at the given char.
77 *)
78 let prev_first_char str i =
79 let len = String.length str in
80 if i>= len then myerror (Printf.sprintf "prev_first_char : index too big (i=%d, len=%d)" i len);
81 let rec aux i =
82 if i<0 then myerror "prev_first_char : reach the begin of the string"
83 else (if is_middle_char (str @> i) then aux (i-1) else i)
84 in aux i
85 (*
86 (*register prev : string -> int -> int*)
87 let prev_first_char s n =
88 let i4 = s @> (n - 1)
89 in
90 if (i4 < 128) then (* one byte *)
91 n - 1
92 else if (i4 >= 192) then (* 0b11xxxxxx *)
93 myerror "cactutf.opa : prev : error in UTF8 [1]"
94 else
95 let i3 = s @> (n - 2)
96 in
97 if (i3 < 128) then (* 0b0xxxxxxx *)
98 myerror "cactutf.opa : prev : error in UTF8 [2]"
99 else if (i3 >= 224) then (* 0b111xxxxx *)
100 myerror "cactutf.opa : prev : error in UTF8 [3]"
101 else if (i3 < 192) then (* 0b110xxxxx *)
102 n - 2
103 else
104 let i2 = s @> (n - 3)
105 in
106 if (i2 < 128) then
107 myerror "cactutf.opa : prev : error in UTF8 [4]"
108 else if (i2 >= 224) then (* three bytes *)
109 n - 3
110 else if (i2 >= 192) then
111 myerror "cactutf.opa : prev : error in UTF8 [5]"
112 else
113 let i1 = s @> (n - 4)
114 in
115 if (i1 >= 224) then (* four bytes *)
116 n - 4
117 else
118 myerror "cactutf.opa : prev : error in UTF8 [6]"
119 *)
120
121
122 (* a light version of next *)
123 let pre_next str n = n + lenbytes (str @> n)
124
125 let check str =
126 let len = String.length str in
127 let rec aux i =
128 if i<len then aux (pre_next str i)
129 else if i=len then true
130 else false
131 in
132 try aux 0 with Error _ -> false
133
134 let mess_invalid_char str n =
135 let v = str @> n in Printf.sprintf "the char at raw position %d is invalid [code is %d] : <<%c>> , global string validity is %b" n v (str.[n]) (check str)
136
137 let length_until s pos =
138 let rec aux s c i len =
139 if (i >= len) then
140 c
141 else
142 let n = s @> i
143 in
144 let k = lenbytes n
145 in
146 aux s (c + 1) (i + k) len
147 in
148 aux s 0 0 pos
149
150 let length s =
151 length_until s (String.length s)
152
153 (*
154 ** next()
155 ** Return the index of the next Unicode character.
156 *)
157 (*##register next : string -> int -> int*)
158 let next str n =
159 try
160 n + pre_lenbytes (str @> n)
161 with Lenbytes _ -> warning_or_error ("bslCactutf.next : "^(mess_invalid_char str n)); n+1
162
163 (*
164 ** prev()
165 ** Return the index of the previous Unicode character.
166 *)
167 (*##register prev : string -> int -> int*)
168 let prev str n =
169 (* FIND THE PREVIOUS CHAR *)
170 let i = prev_first_char str (n-1) in
171 if i<0 then ( warning_or_error ("bslCactutf.prev"); i )
172 else i
173
174
175 (*
176 ** nth()
177 ** Return the index of the n-th Unicode character.
178 ** use a cache to speed-up similar calls
179 ** memoize last n-th caracter position and restart from it
180 *)
181 let nth =
182 let cache_s = ref "" in
183 let cache_th= ref 0 in
184 let cache_i = ref 0 in
185 fun str th ->
186 try
187 let len = String.length str in
188 if not(!cache_s == str) then begin
189 (* if str change then start from begining
190 TODO could start from begin or end to be at least 2x faster
191 *)
192 cache_th:= 0;
193 cache_i := 0;
194 cache_s := str;
195 end;
196 (* TODO could start from begin or end to be faster if previous cache is not adapted *)
197 (* disabled this warning since it floods TT and is useless if nobody works on improving it
198 * if !cache_th < th-100 || !cache_th > th+100 then (Printf.printf "bslCactutf.nth may slow you"; flush stdout);*)
199 if !cache_th < th then (
200 while !cache_th<th && (!cache_i) < len do
201 cache_th := !cache_th +1;
202 cache_i := next str !cache_i;
203 done;
204 !cache_i
205 ) else (
206
207 while !cache_th > th && (!cache_i)>= 0 do
208 cache_th := !cache_th -1;
209 cache_i := prev str !cache_i;
210 done;
211 !cache_i
212 )
213 with _ ->
214 warning_or_error (Printf.sprintf "bslCactutf.nth : utf-8 position %d[=>%d], global string validity is %b" (!cache_th) (!cache_i) (check str));
215 !cache_i
216
217 type range_validity =
218 | Invalid_range of string (* an error message is given *)
219 | Valid_range (* the requested substring *)
220
221 (*
222 * Return a pair
223 * - a flag saying if the requested substring was valid
224 * - the unicode substring (clipped to the size of the string if needed)
225 *)
226 let sub_no_failure s i n =
227 if n = 0 then (Valid_range, "") else (* used to work that way, should be improved
228 * bacause n = 0 and i < 0 is not valid
229 * but when n = 0, and i = 0, we say
230 * [pj = nth s (-1)] which prints an unwanted
231 * error ! *)
232 let len = String.length s in
233 let pi = nth s i in
234 let pj = nth s (i+n-1) in
235 let pi' = max pi 0 in
236 let pj' =
237 if pj >= len then
238 len - pi'
239 else
240 min (pj-pi'+lenbytes (s @> pj)) (len-pi') in
241 let substring = String.sub s pi' pj' in
242 let validity =
243 if i < 0 then
244 Invalid_range "cactutf.opa : sub(_, i<0 ,_) : index is negative"
245 else if n < 0 then
246 Invalid_range "cactutf.opa : sub(_, n<0 ,_) : index is negative"
247 else if n = 0 then
248 Valid_range
249 else if pi >= len then
250 Invalid_range (Printf.sprintf "cactutf.opa : sub(s, i=%d>utf_length(s)=%d ,_) : the index is too big, cryptic info=(%d,%d)" i (length s) pi len)
251 else if pj >= len then
252 Invalid_range (Printf.sprintf "cactutf.opa : sub(s, i=%d ,n=%d | i+n=%d>utf_length(s)=%d ) : the required length is too big, cryptic info=(%d,%d)" i n (i+n) (length s) pj len)
253 else
254 Valid_range in
255 (validity, substring)
256
257 (*
258 ** sub()
259 ** Return an Unicode sub-string.
260 *)
261 (*##register sub : string -> int -> int -> string*)
262 let sub s i n =
263 let validity, substring = sub_no_failure s i n in
264 ( match validity with
265 | Invalid_range s -> warning_or_error s
266 | Valid_range -> ()
267 );
268 substring
269
270 (*##register sub_opt : string -> int -> int -> string option*)
271 let sub_opt s i n =
272 match sub_no_failure s i n with
273 | (Invalid_range _, _) -> None
274 | (Valid_range, s) -> Some s
275
276
277 let one_byte b1 = b1
278 (*
279 ** two_bytes()
280 ** Encode two bytes into one Unicode character.
281 ** 0b110xxxx 0b10xxxxxx
282 *)
283 let two_bytes b1 b2 =
284 (((b1 - 192) * 64) + (b2 - 128))
285
286 (*
287 ** three bytes()
288 *)
289 let three_bytes b1 b2 b3 =
290 (((b1 - 224) * 4096) + ((b2 - 128) * 64) + (b3 - 128))
291
292 (*
293 ** four bytes()
294 *)
295 let four_bytes b1 b2 b3 b4 =
296 (((b1 - 240) * 262144) + ((b2 - 128) * 4096) + ((b3 - 128) * 64) + (b4 - 128))
297
298 (*
299 ** charutf8()
300 ** Return the Unicode code at the index in a string.
301 *)
302 (*register charutf8 : string -> int -> int*)
303 let charutf8 str pos =
304 let i = str @> pos
305 in
306 let len = lenbytes i
307 in
308 if (len = 1) then
309 i
310 else if (len = 2) then
311 two_bytes i (str @> (pos + 1))
312 else if (len = 3) then
313 three_bytes i (str @> (pos + 1)) (str @> (pos + 2))
314 else
315 four_bytes i (str @> (pos + 1)) (str @> (pos + 2)) (str @> (pos + 3))
316
317 (*
318 ** get()
319 ** Return the Unicode code of the nth Unicode character.
320 *)
321 (*##register get : string -> int -> int*)
322 let get str n =
323 charutf8 str (nth str n)
324
325 (*
326 ** look()
327 ** Return the Unicode code using the index (and not the nth).
328 ** A lot faster, but only when using index instead of position.
329 *)
330 (*##register look : string -> int -> int*)
331 let look str i =
332 charutf8 str i
333
334 (*
335 *)
336 let csize n =
337 if (n < 128) then
338 1
339 else if (n < 2048) then
340 2
341 else if (n < 65536) then
342 3
343 else
344 4
345
346 (*
347 ** cons()
348 ** Build a new string from a character.
349 *)
350 (*##register cons : int -> string*)
351 let cons c =
70d9436 @Aqua-Ye [fix] cactutf.ml: cons function : if negative character, consider it …
Aqua-Ye authored
352 let c = if c < 0 then 0 else c in
fccc685 Initial open-source release
MLstate authored
353 let s = csize c in
354 let str = String.create s in
355 if (s = 1) then
356 (str.[0] <- char_of_int(c);
357 str)
358 else if (s = 2) then
359 let n1 = c / 64 in
360 let n2 = c - (n1 * 64) in
361 str.[0] <- char_of_int(n1 + 192);
362 str.[1] <- char_of_int(n2 + 128);
363 str
364 else if (s = 3) then
365 let n1 = c / 4096 in
366 let n2 = (c - (n1 * 4096)) / 64 in
367 let n3 = (c - (n1 * 4096) - (n2 * 64)) in
368 str.[0] <- char_of_int(n1 + 224);
369 str.[1] <- char_of_int(n2 + 128);
370 str.[2] <- char_of_int(n3 + 128);
371 str
372 else
373 let n1 = c / 262144 in
374 let n2 = (c - (n1 * 262144)) / 4096 in
375 let n3 = (c - (n1 * 262144) - (n2 * 4096)) / 64 in
376 let n4 = (c - (n1 * 262144) - (n2 * 4096)) - (n3 * 64) in
377 str.[0] <- char_of_int(n1 + 240);
378 str.[1] <- char_of_int(n2 + 128);
379 str.[2] <- char_of_int(n3 + 128);
380 str.[3] <- char_of_int(n4 + 128);
381 str
382
383 (*
384 ** uppercase()
385 ** Return an Uppercase version of the string.
386 ** Beware of the Braille and some greeks caracters.
387 *)
388 (*##register uppercase : string -> string*)
389 let uppercase str =
390 let myupp i =
391 if ((i >= 97) && (i <= 123)) (* US-ASCII *)
392 || ((i >= 224) && (i <= 246)) (* ISO-8859-1 (latin-1) v *)
393 || ((i >= 248) && (i <= 254)) (* ISO-8859-1 (latin-1) ^ *)
394 || ((i >= 65345) && (i <= 65370)) (* caracteres demi/pleine chasse *)
395 || ((i >= 944) && (i <= 974)) (* grec *)
396 || ((i >= 1072) && (i <= 1103)) then (* cyrillique *)
397 i - 32
398 else if ((i >= 257) && (i <= 319) && ((i mod 2) = 1)) (* latin étendu A v *)
399 || ((i >= 314) && (i <= 328) && ((i mod 2) = 0))
400 || ((i >= 331) && (i <= 375) && ((i mod 2) = 1))
401 || (i = 378) || (i = 380) || (i = 382) (* latin étendu A ^ *)
402 || (i = 389) || (i = 392) || (i = 396) || (i = 402) (* latin étendu B v *)
403 || (i = 409) || (i = 417) || (i = 419) || (i = 421)
404 || (i = 424) || (i = 429) || (i = 432) || (i = 434)
405 || (i = 436) || (i = 438) || (i = 453) || (i = 456)
406 || (i = 459)
407 || ((i >= 462) && (i <= 476) && ((i mod 2) = 0))
408 || ((i >= 479) && (i <= 495) && ((i mod 2) = 1))
409 || (i = 498)
410 || ((i >= 501) && (i <= 563) && ((i mod 2) = 1))
411 || (i = 572) || (i = 578) || (i = 585) || (i = 587)
412 || (i = 589) || (i = 591) (* latin étendu B ^ *)
413 || ((i >= 7680) && (i <= 7935) && ((i mod 2) = 1)) (* latin étendu additionnel *)
414 || (i = 8580) (* nombre latin : facteur 10 *)
415 || ((i >= 977) && (i <= 1007) && ((i mod 2) = 1)) (* grec avec accents *)
416 || ((i >= 1120) && (i <= 1153) && ((i mod 2) = 1)) (* cyrillique v *)
417 || ((i >= 1163) && (i <= 1215) && ((i mod 2) = 1))
418 || ((i >= 1217) && (i <= 1230) && ((i mod 2) = 0))
419 || ((i >= 1232) && (i <= 1279) && ((i mod 2) = 1)) (* cyrillique ^ *)
420 || ((i >= 1280) && (i <= 1299) && ((i mod 2) = 1)) then (* cyrillique additionnel *)
421 i - 1
422 else if (i = 454) || (i = 457) || (i = 460) || (i = 499) then (* latin étendu B doubles lettres *)
423 i - 2
424 else if (i = 255) then (* special case : ÿ. Latin 1&A.*)
425 376
426 else if ((i >= 9424) && (i <= 9449)) then (* lettres pastilles *)
427 i - 26
428 else if ((i >= 1104) && (i <= 1119)) then (* cyrillique *)
429 i - 80
430 else if ((i >= 7936) && (i <= 8047) && ((i mod 16) <= 7)) (* grec polytonique v *)
431 || ((i >= 8064) && (i <= 8111) && ((i mod 16) <= 7)) then (* grev polytonique ^ *)
432 i + 8
433 else if ((i >= 1377) && (i <= 1414)) then (* arménien *)
434 i - 48
435 else if ((i >= 8560) && (i <= 8575)) then (* nombres latins *)
436 i - 16
437 else
438 i
439 in
440 let rec aux len pos accu =
441 if (len = 0) then
442 accu
443 else
444 aux (len - 1) (next str pos) (accu ^ cons (myupp (look str pos)))
445 in
446 aux (length str) 0 ""
447
448 (*
449 ** lowercase()
450 ** See uppercase().
451 *)
452 (*##register lowercase : string -> string*)
453 let lowercase str =
454 let mylow i =
455 if ((i >= 65) && (i <= 91)) (* US-ASCII *)
456 || ((i >= 192) && (i <= 214)) (* ISO-8859-1 (latin-1) v *)
457 || ((i >= 216) && (i <= 222)) (* ISO-8859-1 (latin-1) ^ *)
458 || ((i >= 65313) && (i <= 65338)) (* caracteres demi/pleine chasse *)
459 || ((i >= 912) && (i <= 942)) (* grec *)
460 || ((i >= 1040) && (i <= 1071)) then (* cyrillique *)
461 i + 32
462 else if ((i >= 256) && (i <= 319) && ((i mod 2) = 0)) (* latin étendu A v *)
463 || ((i >= 313) && (i <= 328) && ((i mod 2) = 1))
464 || ((i >= 330) && (i <= 375) && ((i mod 2) = 0))
465 || (i = 377) || (i = 379) || (i = 381) (* latin étendu A ^ *)
466 || (i = 388) || (i = 391) || (i = 395) || (i = 401) (* latin étendu B v *)
467 || (i = 408) || (i = 416) || (i = 418) || (i = 420)
468 || (i = 423) || (i = 428) || (i = 431) || (i = 433)
469 || (i = 435) || (i = 437) || (i = 453) || (i = 456)
470 || (i = 459)
471 || ((i >= 461) && (i <= 476) && ((i mod 2) = 1))
472 || ((i >= 478) && (i <= 495) && ((i mod 2) = 0))
473 || (i = 498)
474 || ((i >= 500) && (i <= 563) && ((i mod 2) = 0))
475 || (i = 571) || (i = 577) || (i = 584) || (i = 586)
476 || (i = 588) || (i = 590) (* latin étendu B ^ *)
477 || ((i >= 7680) && (i <= 7935) && ((i mod 2) = 0)) (* latin étendu additionnel *)
478 || (i = 8579) (* nombre latin : facteur 10 *)
479 || ((i >= 976) && (i <= 1007) && ((i mod 2) = 0)) (* grec avec accents *)
480 || ((i >= 1120) && (i <= 1153) && ((i mod 2) = 0)) (* cyrillique v *)
481 || ((i >= 1162) && (i <= 1215) && ((i mod 2) = 0))
482 || ((i >= 1217) && (i <= 1230) && ((i mod 2) = 1))
483 || ((i >= 1232) && (i <= 1279) && ((i mod 2) = 0)) (* cyrillique ^ *)
484 || ((i >= 1280) && (i <= 1299) && ((i mod 2) = 0)) then (* cyrillique additionnel *)
485 i + 1
486 else if (i = 452) || (i = 455) || (i = 458) || (i = 497) then (* latin étendu B doubles lettres *)
487 i + 2
488 else if (i = 376) then (* special case : ÿ. Latin 1&A.*)
489 255
490 else if ((i >= 9398) && (i <= 9423)) then (* lettres pastilles *)
491 i + 26
492 else if ((i >= 1024) && (i <= 1039)) then (* cyrillique *)
493 i + 80
494 else if ((i >= 7936) && (i <= 8047) && ((i mod 16) > 7)) (* grec polytonique v *)
495 || ((i >= 8064) && (i <= 8111) && ((i mod 16) > 7)) then (* grev polytonique ^ *)
496 i - 8
497 else if ((i >= 1329) && (i <= 1366)) then (* arménien *)
498 i + 48
499 else if ((i >= 8544) && (i <= 8559)) then (* nombres latins *)
500 i + 16
501 else
502 i
503 in
504 let rec aux len pos accu =
505 if (len = 0) then
506 accu
507 else
508 aux (len - 1) (next str pos) (accu ^ cons (mylow (look str pos)))
509 in
510 aux (length str) 0 ""
511
512
513 exception Done
514 (*
515 let remove_accents s =
516 let buffer = Buffer.create (String.length s) in
517 let add = Buffer.add_char buffer in
518 let add_array a ~start ~length=
519 for i = 0 to length - 1 do
520 Utf8.store buffer a.(start + i)
521 done
522 in
523 let lex_one_char = lexer
524 | [ 192 193 194 195 196 197 65 (*ÀÁÂÃÄÅA*) ] -> add 'A'
525 | [ 224 225 226 227 228 229 97 (*àáâãäåa*) ] -> add 'a'
526 | [ 210 211 212 213 214 216 79 (*ÒÓÔÕÖØO*) ] -> add 'O'
527 | [ 242 243 244 245 246 248 111 (*òóôõöøo*) ]-> add 'o'
528 | [ 200 201 202 203 69 (*ÈÉÊËE*) ] -> add 'E'
529 | [ 232 233 234 235 101 (*èéêëe*) ] -> add 'e'
530 | [ 199 67 (*ÇC*) ] -> add 'C'
531 | [ 231 99 (*çc*) ] -> add 'c'
532 | [ 204 205 206 207 73 (*ÌÍÎÏI*) ] -> add 'I'
533 | [ 236 237 238 239 105 (*ìíîïi*) ] -> add 'i'
534 | [ 217 218 219 220 85 (*ÙÚÛÜU*) ] -> add 'U'
535 | [ 249 250 251 252 117 (*ùúûüu*) ] -> add 'u'
536 | [ 255 121 (*ÿy*) ] -> add 'y'
537 | [ 209 78 (*ÑN*) ] -> add 'N'
538 | [ 241 110 (*ñn*) ] -> add 'n'
539 | eof -> raise Done
540 | _ -> add_array (Ulexing.get_buf lexbuf) ~start:(Ulexing.get_start lexbuf) ~length:(Ulexing.lexeme_length lexbuf)
541 in
542 try
543 let lexbuf = Ulexing.from_utf8_string s in
544 while true do
545 lex_one_char lexbuf
546 done;
547 assert false
548 with
549 Done -> Buffer.contents buffer
550 *)
551 (*##endmodule*)
Something went wrong with that request. Please try again.