Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 793 lines (660 sloc) 30.118 kb
fccc685 Initial open-source release
MLstate authored
1 (*----------------------------------------------------------------------------
2 Copyright (c) 2007-2009, Daniel C. Bünzli. All rights reserved.
3 Distributed under a BSD license, see license at the end of the file.
4 Xmlm version 1.0.2
5 ----------------------------------------------------------------------------*)
6
7 (** Streaming XML IO.
8
9 A well-formed sequence of {{:#TYPEsignal}signals} represents an
10 {{:http://www.w3.org/TR/REC-xml}XML} document tree traversal in
11 depth first order (this has nothing to do with XML
12 well-formedness). Input pulls a well-formed sequence of signals
13 from a data source and output pushes a well-formed sequence of
14 signals to a data destination. Functions are provided to easily
15 transform sequences of signals to/from arborescent data structures.
16
17 Consult the {{:#io}features and limitations} and {{:#ex}examples}
18 of use.
19
20 {e Version 1.0.2 - daniel.buenzl i\@erratique.ch }
21
22 {b References.}
23
24 Tim Bray.
25 {e {{:http://www.xml.com/axml/axml.html}The annotated XML Specification}},
26 1998.
27
28 Tim Bray et al.
29 {e {{:http://www.w3.org/TR/xml-names11}Namespaces in XML 1.1 (2nd ed.)}},
30 2006.
31
32 {1 Basic types and values} *)
33
34 (** The type for character encodings. For [`UTF_16], endianness is
35 determined from the
36 {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *)
37 type encoding = [
38 | `UTF_8
39 | `UTF_16
40 (** Endianness determined from the
41 {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM}. *)
42 | `UTF_16BE
43 | `UTF_16LE
44 | `ISO_8859_1
45 | `US_ASCII ]
46
47 type dtd = string option
48 (** The type for the optional
49 {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}. *)
50
51 type name = string * string
52 (** The type for attribute and element's
53 {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}
54 [(uri,local)]. An empty [uri] represents a name without a
55 namespace name, i.e. an unprefixed name
56 that is not under the scope of a default namespace. *)
57
58 type attribute = name * string
59 (** The type for attributes. Name and attribute data. *)
60
61 type tag = name * attribute list
62 (** The type for an element tag. Tag name and attribute list. *)
63
64 type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
65 (** The type for signals. A {e well-formed} sequence of signals belongs
66 to the language of the [doc] grammar :
67 {[doc ::= `Dtd tree
68 tree ::= `El_start child `El_end
69 child ::= `Data | tree | epsilon ]}
70 Input and output deal only with well-formed sequences or
71 exceptions are raised.
72 *)
73
74 val ns_xml : string
75 (** Namespace name {{:http://www.w3.org/XML/1998/namespace}value} bound to the
76 reserved ["xml"] prefix. *)
77
78 val ns_xmlns : string
79 (** Namespace name {{:http://www.w3.org/2000/xmlns/}value} bound to the
80 reserved ["xmlns"] prefix. *)
81
82 (** {1 Input} *)
83
84 type pos = int * int
85 (** The type for input positions. Line and column number, both start
86 with 1. *)
87
88 (** The type for input errors. *)
89 type error = [
90 | `Max_buffer_size
91 (** Maximal buffer size exceeded ([Sys.max_string_length]). *)
92 | `Unexpected_eoi
93 (** Unexpected end of input. *)
94 | `Malformed_char_stream
95 (** Malformed underlying character stream. *)
96 | `Unknown_encoding of string
97 (** Unknown encoding. *)
98 | `Unknown_entity_ref of string
99 (** Unknown entity reference, {{:#inentity} details}. *)
100 | `Unknown_ns_prefix of string
101 (** Unknown namespace prefix {{:#inns} details} *)
102 | `Illegal_char_ref of string
103 (** Illegal character reference. *)
104 | `Illegal_char_seq of string
105 (** Illegal character sequence. *)
106 | `Expected_char_seqs of string list * string
107 (** Expected one of the character sequences in the list but found another. *)
108 | `Expected_root_element
109 (** Expected the document's root element. *) ]
110
111 val error_message : error -> string
112 (** Converts the error to an english error message. *)
113
114 exception Error of pos * error
115 (** Raised on input errors. *)
116
117 type source = [
118 | `Channel of in_channel | `String of int * string | `Fun of (unit -> int) ]
119 (** The type for input sources. For [`String] starts reading at the
120 given integer position. For [`Fun] the function must return the
121 next {e byte} as an [int] and raise [End_of_file] if there is no
122 such byte. *)
123
124 type input
125 (** The type for input abstractions. *)
126
127 val make_input : ?enc:encoding option -> ?strip:bool ->
128 ?ns:(string -> string option) ->
129 ?entity: (string -> string option) -> source -> input
130 (** Returns a new input abstraction reading from the given source.
131 {ul
132 {- [enc], character encoding of the document, {{:#inenc} details}.
133 Defaults to [None].}
134 {- [strip], strips whitespace in character data, {{:#inwspace} details}.
135 Defaults to [false].}
136 {- [ns] is called to bind undeclared namespace prefixes,
137 {{:#inns} details}. Default returns always [None].}
138 {- [entity] is called to resolve non predefined entity references,
139 {{:#inentity} details}. Default returns always [None].}} *)
140
141 val input : input -> signal
142 (** Inputs a signal. Repeated invocation of the function with the same
143 input abstraction will generate a {{:#TYPEsignal}well-formed} sequence
144 of signals or an {!Error} is raised. Furthermore there will be no
145 two consecutive [`Data] signals in the sequence and their string
146 is always non empty. After a well-formed sequence was input another may
147 be input, see {!eoi} and {{:#iseq}details}.
148
149 {b Raises} {!Error} on input errors. *)
150
151 val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
152 input -> 'a
153 (** If the next signal is a :
154 {ul
155 {- [`Data] signal, inputs it and invokes [data] with the character data.}
156 {- [`El_start] signal, inputs the sequence of signals until its
157 matching [`El_end] and invokes [el] and [data] as follows
158 {ul
159 {- [el], is called on each [`El_end] signals with the corresponding
160 [`El_start] tag and the result of the callback invocation for the
161 element's children.}
162 {- [data], is called on each [`Data] signals with the character data.
163 This function won't be called twice consecutively or with the empty
164 string.}}}
165 {- Other signals, raises [Invalid_argument].}}
166
167 {b Raises} {!Error} on input errors and [Invalid_argument]
168 if the next signal is not [`El_start] or [`Data]. *)
169
170 val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
171 input -> (dtd * 'a)
172 (** Same as {!input_tree} but reads a complete {{:#TYPEsignal}well-formed}
173 sequence of signals.
174
175 {b Raises} {!Error} on input errors and [Invalid_argument]
176 if the next signal is not [`Dtd]. *)
177
178 val peek : input -> signal
179 (** Same as {!input} but doesn't remove the signal from the sequence.
180
181 {b Raises} {!Error} on input errors. *)
182
183 val eoi : input -> bool
184 (** Returns [true] if the end of input is reached. See {{:#iseq}details}.
185
186 {b Raises} {!Error} on input errors. *)
187
188 val pos : input -> pos
189 (** Current position in the input abstraction. *)
190
191 (** {1 Output} *)
192
193 type 'a frag = [ `El of tag * 'a list | `Data of string ]
194 (** The type for deconstructing data structures of type ['a]. *)
195
196 type dest = [ `Channel of out_channel | `Buffer of Buffer.t |
197 `Fun of (int -> unit) ]
198 (** The type for output destinations. For [`Buffer], the buffer won't
199 be cleared. For [`Fun] the function is called with the output {e
200 bytes} as [int]s. *)
201
202 type output
203 (** The type for output abstractions. *)
204
205 val make_output : ?nl:bool -> ?indent:int option ->
206 ?ns_prefix:(string -> string option) -> dest -> output
207 (** Returns a new output abstraction writing to the given destination.
208 {ul
209 {- [nl], if [true] a newline is output when the root's element [`El_end]
210 signal is output.
211 Defaults to [false].}
212 {- [indent], identation behaviour, see {{:#outindent} details}. Defaults to
213 [None].}
214 {- [ns_prefix], undeclared namespace prefix bindings,
215 see {{:#outns}details}. Default returns always [None].}} *)
216
217
218 val output : output -> signal -> unit
219 (** Outputs a signal. After a well-formed sequence of signals was
220 output a new well-formed sequence can be output.
221
222 {b Raises} [Invalid_argument] if the resulting signal sequence on
223 the output abstraction is not {{:#TYPEsignal}well-formed} or if a
224 namespace name could not be bound to a prefix. *)
225
226 val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
227 (** Outputs signals corresponding to a value by recursively
228 applying the given value deconstructor.
229
230 {b Raises} see {!output}. *)
231
232 val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
233 (** Same as {!output_tree} but outputs a complete {{:#TYPEsignal}well-formed}
234 sequence of signals.
235
236 {b Raises} see {!output}. *)
237
238 (** {1:sto Functorial interface}
239
240 {!Make} allows client to specify types for strings and internal
241 buffers. Among other things this can be used to perform
242 hash-consing or to process the character stream, e.g. to normalize
243 unicode characters or to convert to a custom encoding. *)
244
245 type std_string = string
246 type std_buffer = Buffer.t
247
248 (** Input signature for strings. *)
249 module type String = sig
250
251 type t
252 (** The type for strings. *)
253
254 val empty : t
255 (** The empty string. *)
256
257 val length : t -> int
258 (** Returns the length of the string. *)
259
260 val append : t -> t -> t
261 (** Concatenates two strings. *)
262
263 val lowercase : t -> t
264 (** New string with uppercase letter translated
265 to lowercase (correctness is only needed for ASCII
266 {{:http://www.unicode.org/glossary/#code_point}code point}). *)
267
268 val iter : (int -> unit) -> t -> unit
269 (** Iterates over the unicode
270 {{:http://www.unicode.org/glossary/#code_point}code point}
271 of the given string. *)
272
273 val of_string : std_string -> t
274 (** String from an OCaml string. *)
275
276 val to_utf_8 : ('a -> std_string -> 'a) -> 'a -> t -> 'a
277 (** [to_utf_8 f v s], is [f (... (f (f v s1) s2) ...) sn]. Where the
278 concatenation of [s1], [s2], ... [sn] is [s] as an UTF-8 stream. *)
279
280 val compare : t -> t -> int
281 (** String comparison. Binary comparison is sufficent. *)
282 end
283
284 (** Input signature for internal buffers. *)
285 module type Buffer = sig
286
287 type string
288 (** The type for strings. *)
289
290 type t
291 (** The type for buffers. *)
292
293 exception Full
294 (** Raised if the buffer cannot be grown. *)
295
296 val create : int -> t
297 (** Creates a buffer of the given size. *)
298
299 val add_uchar : t -> int -> unit
300 (** Adds the given (guaranteed valid) unicode
301 {{:http://www.unicode.org/glossary/#code_point}code point} to a
302 buffer.
303
304 {b Raises} {!Full} if the buffer cannot be grown. *)
305
306 val clear : t -> unit
307 (** Clears the buffer. *)
308
309 val contents : t -> string
310 (** Returns the buffer contents. *)
311
312 val length : t -> int
313 (** Returns the number of characters contained in the buffer. *)
314 end
315
316 (** Output signature of {!Make}. *)
317 module type S = sig
318
319 (** {1 Basic types and values} *)
320
321 type string
322
323 type encoding = [
324 | `UTF_8 | `UTF_16 | `UTF_16BE | `UTF_16LE | `ISO_8859_1| `US_ASCII ]
325 type dtd = string option
326 type name = string * string
327 type attribute = name * string
328 type tag = name * attribute list
329 type signal = [ `Dtd of dtd | `El_start of tag | `El_end | `Data of string ]
330
331 val ns_xml : string
332 val ns_xmlns : string
333
334 (** {1 Input} *)
335
336 type pos = int * int
337 type error = [
338 | `Max_buffer_size
339 | `Unexpected_eoi
340 | `Malformed_char_stream
341 | `Unknown_encoding of string
342 | `Unknown_entity_ref of string
343 | `Unknown_ns_prefix of string
344 | `Illegal_char_ref of string
345 | `Illegal_char_seq of string
346 | `Expected_char_seqs of string list * string
347 | `Expected_root_element ]
348
349 exception Error of pos * error
350 val error_message : error -> string
351
352 type source = [
353 | `Channel of in_channel
354 | `String of int * std_string
355 | `Fun of (unit -> int) ]
356
357 type input
358
359 val make_input : ?enc:encoding option -> ?strip:bool ->
360 ?ns:(string -> string option) ->
361 ?entity: (string -> string option) -> source -> input
362
363 val input : input -> signal
364
365 val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
366 input -> 'a
367
368 val input_doc_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) ->
369 input -> (dtd * 'a)
370
371 val peek : input -> signal
372 val eoi : input -> bool
373 val pos : input -> pos
374
375 (** {1 Output} *)
376
377 type 'a frag = [ `El of tag * 'a list | `Data of string ]
378 type dest = [
379 | `Channel of out_channel | `Buffer of std_buffer | `Fun of (int -> unit) ]
380
381 type output
382 val make_output : ?nl:bool -> ?indent:int option ->
383 ?ns_prefix:(string -> string option) -> dest -> output
384
385 val output : output -> signal -> unit
386 val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
387 val output_doc_tree : ('a -> 'a frag) -> output -> (dtd * 'a) -> unit
388 end
389
390 (** Functor building streaming XML IO with the given strings and buffers. *)
391 module Make (String : String) (Buffer : Buffer with type string = String.t) : S
392 with type string = String.t
393
394 (** {1:io Features and limitations}
395
396 The module assumes strings are immutable, thus strings
397 the client gives or receives {e during} the input and output process
398 must not be modified.
399 {2:input Input}
400 {3:inenc Encoding}
401
402 The parser supports ASCII, US-ASCII,
403 {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8},
404 {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16},
405 {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16LE},
406 {{:http://www.faqs.org/rfcs/rfc2781.html} UTF-16BE} and
407 {{:http://anubis.dkuug.dk/JTC1/SC2/WG3/docs/n411.pdf}ISO-8559-1}
408 (Latin-1) encoded documents. But strings returned by
409 the library are {b always} UTF-8 encoded (unless you use the functor).
410
411 The encoding can be specified explicitly using the optional
412 argument [enc]. Otherwise the parser uses UTF-16 or UTF-8 if there is a
413 {{:http://www.unicode.org/unicode/faq/utf_bom.html#BOM}BOM} at the
414 beginning of the document. If there is no BOM it uses the encoding
415 specified in the {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML
416 declaration}. Finally, if there is no XML declaration UTF-8 is assumed.
417 {3:inwspace White space handling}
418
419 The parser performs
420 {{:http://www.w3.org/TR/REC-xml/#AVNormalize}attribute data
421 normalization} on {e every} attribute data. This means that
422 attribute data does not have leading and trailling white space and that
423 any white space is collapsed and transformed to a single space
424 character ([U+0020]).
425
426 White space handling of character data depends on the [strip]
427 argument. If [strip] is [true], character data is treated like
428 attribute data, white space before and after elements is removed
429 and any white space is collapsed and transformed to a single
430 space character ([U+0020]), except if the data is under the scope of a {e
431 xml:space} attribute whose value is {e preserve}. If [strip] is
432 [false] all white space data is preserved as present in the
433 document (however all kinds of
434 {{:http://www.w3.org/TR/REC-xml/#sec-line-ends}line ends} are
435 translated to the newline character ([U+000A]). {3:inns Namespaces}
436
437 Xmlm's {{:#TYPEname}names} are
438 {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}.
439 The parser automatically handles the document's namespace
440 declarations. Undeclared namespace prefixes can be bound via the
441 callback [ns], which must return a namespace name. If [ns] returns
442 [None] an [`Unknown_ns_prefix] error is raised.
443
444 Attributes used for namespace declarations are preserved by the
445 parser. They are in the {!ns_xmlns} namespace. Default namespace
446 declarations made with {i xmlns} have the attribute name
447 [(Xmlm.ns_xmlns, "xmlns")]. Prefix declarations have the prefix as
448 the local name, for example {i xmlns:ex} results in the attribute name
449 [(Xmlm.ns_xmlns, "ex")].
450
451 Regarding constraints on the usage of the {i xml} and {i xmlns}
452 prefixes by documents, the parser does not report errors on violations
453 of the {i must} constraints listed in
454 {{:http://www.w3.org/TR/xml-names11/#xmlReserved}this paragraph}.
455 {3:inentity Character and entity references}
456
457 {{:http://www.w3.org/TR/REC-xml/#dt-charref}Character references}
458 and {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined
459 entities} are automatically resolved. Other entity references can
460 be resolved by the callback [entity], which must return an UTF-8
461 (unless you use the functor) string corresponding to the
462 replacement character data. The replacement data is {e not}
463 analysed for further references, it is added to the data as such
464 modulo white space stripping. If [entity] returns [None] the error
465 [`Unknown_entity_ref] is returned.
466 {3:iseq Sequences of documents}
467
468 When a well-formed sequence of signals is input, no data is consumed beyond
469 the closing ['>'] of the document's root element.
470
471 If you want to parse a document as
472 {{:http://www.w3.org/TR/REC-xml/#NT-document}defined} in the XML
473 specification, call {!eoi} after a well-formed sequence of
474 signals, it must return [true]. If you expect another document on
475 the same input abstraction a new well-formed sequence of signals
476 can be {!input}. Use {!eoi} to check if a document follows (this
477 may consume data).
478
479 Invoking {!eoi} after a well-formed sequence of signals skips
480 whitespaces, comments and processing instructions until it gets to
481 either an {{:http://www.w3.org/TR/REC-xml/#NT-XMLDecl} XML
482 declaration} or a {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD}
483 or the start of a new element or the end of input (in which case
484 {!eoi} returns [true]). If there is a new document but there is no
485 XML declaration or the declaration specifies UTF-16, the same
486 encoding as for the previous document is used.
487
488 {3:inmisc Miscellaneous}
489 {ul
490 {- Parses the more liberal and simpler XML 1.1
491 {{:http://www.w3.org/TR/xml11/#NT-Name}Name} definition (minus [':'] because
492 of namespaces).}
493 {- The {{:http://www.w3.org/TR/REC-xml/#dt-doctype}DTD} is parsed
494 roughly (no guarantee it is well formed) and its information is ignored.}
495 {- The parser drops
496 {{:http://www.w3.org/TR/REC-xml/#dt-comment}comments},
497 {{:http://www.w3.org/TR/REC-xml/#dt-pi}processing instructions}, and
498 {{:http://www.w3.org/TR/REC-xml/#sec-rmd}standalone declaration}.}
499 {- Element attributes are not checked for uniqueness.}
500 {- Attribute and character data chunks are limited by
501 [Sys.max_string_length] (unless you use the functor).
502 The error [`Max_buffer_size] is raised if the limit is hit.}
503 {- Tail recursive.}
504 {- Non validating.}
505 }
506
507
508 {2:output Output}
509 {3:outenc Encoding}
510
511 Outputs only {{:http://www.faqs.org/rfcs/rfc3629.html} UTF-8}
512 encoded documents (even if you use the functor). Strings given to
513 output functions {b must be} UTF-8 encoded (unless you use the
514 functor, but you need to provide a translation), no checks are
515 performed. {3:outns Namespaces}
516
517 Xmlm's {{:#TYPEname}names} are
518 {{:http://www.w3.org/TR/xml-names11/#dt-expname}expanded names}.
519 Expanded names are automatically converted to
520 {{:http://www.w3.org/TR/xml-names11/#dt-qualname}qualified
521 names} by the output abstraction. There is no particular api to specify
522 prefixes and default namespaces,
523 the actual result depends solely on the output
524 of attributes belonging to the {!ns_xmlns} namespace. For example to set
525 the default namespace of an element to {i http://example.org/myns},
526 use the following attribute :
527 {[(* xmlns='http://example.org/myns' *)
528 let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"]}
529 To bind the prefix ["ex"] to {i http://example.org/ex}, use the
530 following attribute :
531 {[(* xmlns:ex='http://example.org/ex' *)
532 let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"]}
533 Note that outputing input signals without
534 touching namespace declaration attributes will preserve existing
535 prefixes and bindings provided the same namespace name is not
536 bound to different prefixes in a given context.
537
538 The callback [ns_prefix] of an output abstraction can be used to
539 give a prefix to a namespace name lacking a prefix binding in the
540 current output scope. Given a namespace name the function must return
541 the prefix to use. Note that this
542 will {b not} add any namespace declaration attribute to the
543 output. If the function returns [None], {!output} will raise
544 [Invalid_argument]. The default function returns always [None].
545 {3:outindent Indentation}
546
547 Output can be indented by specifying the [indent] argument when an
548 output abstraction is created. If [indent] is [None] (default)
549 signal output does not introduce any extra white space. If
550 [ident] is [Some c], each {!signal} is output on its own line
551 (for empty elements [`El_start] and [`El_end] are collapsed on a single
552 line) and nested elements are indented with [c] space
553 characters.
554 {3:oseq Sequences of documents}
555
556 After a well-formed sequence of signals was output, the output
557 abstraction can be reused to output a new well-formed sequence of
558 signals.
559
560 {3:outmisc Miscellaneous}
561 {ul
562 {- Output on a channel does not flush it.}
563 {- In attribute and character data you provide, markup
564 delimiters ['<'],['>'],['&'], and ['\"'] are
565 automatically escaped to
566 {{:http://www.w3.org/TR/REC-xml/#sec-predefined-ent}predefined
567 entities}.}
568 {- No checks are peformed on the prefix and local part of output
569 names to verify they are
570 {{:http://www.w3.org/TR/xml-names11/#NT-NCName}NCName}s.
571 For example using the tag name [("","dip d")] will produce
572 a non well-formed document because of the space character.}
573 {- Tail recursive.}}
574
575 {2 Tips}
576 {ul
577 {- The best options to do an input/output round trip
578 and preserve as much information as possible is to
579 input with [strip = false] and output with [indent = None].}
580 {- Complete whitespace control on output is achieved
581 with [indent = None] and suitable [`Data] signals}}
582 *)
583
584 (** {1:ex Examples}
585
586 {2:exseq Sequential processing}
587
588 Sequential processing has the advantage that you don't need to get
589 the whole document tree in memory to process it.
590
591 The following function reads a {e single} document on an
592 input channel and outputs it.
593 {[let id ic oc =
594 let i = Xmlm.make_input (`Channel ic) in
595 let o = Xmlm.make_output (`Channel oc) in
596 let rec pull i o depth =
597 Xmlm.output o (Xmlm.peek i);
598 match Xmlm.input i with
599 | `El_start _ -> pull i o (depth + 1)
600 | `El_end -> if depth = 1 then () else pull i o (depth - 1)
601 | `Data _ -> pull i o depth
602 | `Dtd _ -> assert false
603 in
604 Xmlm.output o (Xmlm.input i); (* `Dtd *)
605 pull i o 0;
606 if not (Xmlm.eoi i) then invalid_arg "document not well-formed"]}
607 The following function reads a {e sequence} of documents on an
608 input channel and outputs it.
609 {[let id_seq ic oc =
610 let i = Xmlm.make_input (`Channel ic) in
611 let o = Xmlm.make_output ~nl:true (`Channel oc) in
612 while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done]}
613 The following function reads a {e sequence} of documents on the
614 input channel. In each document's tree it prunes non root elements
615 whose name belongs to [prune_list].
616 {[let prune_docs prune_list ic oc =
617 let i = Xmlm.make_input (`Channel ic) in
618 let o = Xmlm.make_output ~nl:true (`Channel oc) in
619 let copy i o = Xmlm.output o (Xmlm.input i) in
620 let prune (name, _) = List.mem name prune_list in
621 let rec process i o d =
622 let rec skip i d = match Xmlm.input i with
623 | `El_start _ -> skip i (d + 1)
624 | `El_end -> if d = 1 then () else skip i (d - 1)
625 | s -> skip i d
626 in
627 match Xmlm.peek i with
628 | `El_start tag when prune tag -> skip i 0; process i o d
629 | `El_start _ -> copy i o; process i o (d + 1)
630 | `El_end -> copy i o; if d = 0 then () else process i o (d - 1)
631 | `Data _ -> copy i o; process i o d
632 | `Dtd _ -> assert false
633 in
634 let rec docs i o =
635 copy i o; (* `Dtd *)
636 copy i o; (* root start *)
637 process i o 0;
638 if Xmlm.eoi i then () else docs i o
639 in
640 docs i o]}
641
642 {2:extree Tree processing}
643
644 A document's sequence of signals can be easily converted
645 to an arborescent data structure. Assume your trees are defined by :
646 {[type tree = E of Xmlm.tag * tree list | D of string]}
647 The following functions input/output xml documents from/to abstractions
648 as value of type [tree].
649 {[let in_tree i =
650 let el tag childs = E (tag, childs) in
651 let data d = D d in
652 Xmlm.input_doc_tree ~el ~data i
653
654 let out_tree o t =
655 let frag = function
656 | E (tag, childs) -> `El (tag, childs)
657 | D d -> `Data d
658 in
659 Xmlm.output_doc_tree frag o t]}
660
661 {2:exrow Tabular data processing}
662
663 We show how to process XML data that represents tabular data (some
664 people like do that).
665
666 The file we need to deal with represents nominal data about
667 {{:http://www.w3.org/}W3C bureaucrats}. There are no namespaces
668 and attributes are ignored. The element structure of the document
669 is :
670 {ul {- <list>
671 {ul {- <bureaucrat> represents a W3C bureaucrat
672 (zero or more).
673
674 A bureaucrat contains the following elements, in order.
675 {ul {- <name> its name (mandatory, string).}
676 {- <surname> its surname (mandatory, string).}
677 {- <honest> present iff he implemented one of its spec
678 (optional, empty).}
679 {- <obfuscation_level> its grade on the
680 open scale of obfuscation (mandatory, float).}
681 {- <tr> (zero or more, string), technical reports he
682 worked on.}}}}}}
683
684 In OCaml we represent a W3C bureaucrat by this type :
685 {[type w3c_bureaucrat = {
686 name : string;
687 surname : string;
688 honest : bool;
689 obfuscation_level : float;
690 trs : string list; }]}
691 The following functions input and output W3C bureaucrats as lists
692 of values of type [w3c_bureaucrat].
693 {[let in_w3c_bureaucrats src =
694 let i = Xmlm.make_input ~strip:true src in
695 let tag n = ("", n), [] in
696 let error () = invalid_arg "parse error" in
697 let accept s i = if Xmlm.input i = s then () else error () in
698 let rec i_seq el acc i = match Xmlm.peek i with
699 | `El_start _ -> i_seq el ((el i) :: acc) i
700 | `El_end -> List.rev acc
701 | _ -> error ()
702 in
703 let i_el n i =
704 accept (`El_start (tag n)) i;
705 let d = match Xmlm.peek i with
706 | `Data d -> ignore (Xmlm.input i); d
707 | `El_end -> ""
708 | _ -> error ()
709 in
710 accept (`El_end) i;
711 d
712 in
713 let i_bureaucrat i =
714 try
715 accept (`El_start (tag "bureaucrat")) i;
716 let name = i_el "name" i in
717 let surname = i_el "surname" i in
718 let honest = match Xmlm.peek i with
719 | `El_start (("", "honest"), []) -> ignore (i_el "honest" i); true
720 | _ -> false
721 in
722 let obf = float_of_string (i_el "obfuscation_level" i) in
723 let trs = i_seq (i_el "tr") [] i in
724 accept (`El_end) i;
725 { name = name; surname = surname; honest = honest;
726 obfuscation_level = obf; trs = trs }
727 with
728 | Failure _ -> error () (* float_of_string *)
729 in
730 accept (`Dtd None) i;
731 accept (`El_start (tag "list")) i;
732 let bl = i_seq i_bureaucrat [] i in
733 accept (`El_end) i;
734 if not (Xmlm.eoi i) then invalid_arg "more than one document";
735 bl
736
737 let out_w3c_bureaucrats dst bl =
738 let tag n = ("", n), [] in
739 let o = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in
740 let out = Xmlm.output o in
741 let o_el n d =
742 out (`El_start (tag n));
743 if d <> "" then out (`Data d);
744 out `El_end
745 in
746 let o_bureaucrat b =
747 out (`El_start (tag "bureaucrat"));
748 o_el "name" b.name;
749 o_el "surname" b.surname;
750 if b.honest then o_el "honest" "";
751 o_el "obfuscation_level" (string_of_float b.obfuscation_level);
752 List.iter (o_el "tr") b.trs;
753 out `El_end
754 in
755 out (`Dtd None);
756 out (`El_start (tag "list"));
757 List.iter o_bureaucrat bl;
758 out (`El_end)]}
759 *)
760
761 (*----------------------------------------------------------------------------
762 Copyright (c) 2007-2009, Daniel C. Bünzli
763 All rights reserved.
764
765 Redistribution and use in source and binary forms, with or without
766 modification, are permitted provided that the following conditions are
767 met:
768
769 1. Redistributions of source code must retain the above copyright
770 notice, this list of conditions and the following disclaimer.
771
772 2. Redistributions in binary form must reproduce the above copyright
773 notice, this list of conditions and the following disclaimer in the
774 documentation and/or other materials provided with the
775 distribution.
776
777 3. Neither the name of the Daniel C. Bünzli nor the names of
778 contributors may be used to endorse or promote products derived
779 from this software without specific prior written permission.
780
781 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
782 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
783 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
784 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
785 OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
786 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
787 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
788 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
789 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
790 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
791 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
792 ----------------------------------------------------------------------------*)
Something went wrong with that request. Please try again.