/
uniset.h
1566 lines (1423 loc) · 58.9 KB
/
uniset.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*
***************************************************************************
* Copyright (C) 1999-2008, International Business Machines Corporation
* and others. All Rights Reserved.
***************************************************************************
* Date Name Description
* 10/20/99 alan Creation.
***************************************************************************
*/
#ifndef UNICODESET_H
#define UNICODESET_H
#include "unicode/unifilt.h"
#include "unicode/unistr.h"
#include "unicode/uset.h"
/**
* \file
* \brief C++ API: Unicode Set
*/
U_NAMESPACE_BEGIN
class BMPSet;
class ParsePosition;
class SymbolTable;
class UnicodeSetStringSpan;
class UVector;
class RuleCharacterIterator;
/**
* A mutable set of Unicode characters and multicharacter strings. Objects of this class
* represent <em>character classes</em> used in regular expressions.
* A character class specifies a subset of Unicode code points. Legal
* code points are U+0000 to U+10FFFF, inclusive.
*
* <p>The UnicodeSet class is not designed to be subclassed.
*
* <p><code>UnicodeSet</code> supports two APIs. The first is the
* <em>operand</em> API that allows the caller to modify the value of
* a <code>UnicodeSet</code> object. It conforms to Java 2's
* <code>java.util.Set</code> interface, although
* <code>UnicodeSet</code> does not actually implement that
* interface. All methods of <code>Set</code> are supported, with the
* modification that they take a character range or single character
* instead of an <code>Object</code>, and they take a
* <code>UnicodeSet</code> instead of a <code>Collection</code>. The
* operand API may be thought of in terms of boolean logic: a boolean
* OR is implemented by <code>add</code>, a boolean AND is implemented
* by <code>retain</code>, a boolean XOR is implemented by
* <code>complement</code> taking an argument, and a boolean NOT is
* implemented by <code>complement</code> with no argument. In terms
* of traditional set theory function names, <code>add</code> is a
* union, <code>retain</code> is an intersection, <code>remove</code>
* is an asymmetric difference, and <code>complement</code> with no
* argument is a set complement with respect to the superset range
* <code>MIN_VALUE-MAX_VALUE</code>
*
* <p>The second API is the
* <code>applyPattern()</code>/<code>toPattern()</code> API from the
* <code>java.text.Format</code>-derived classes. Unlike the
* methods that add characters, add categories, and control the logic
* of the set, the method <code>applyPattern()</code> sets all
* attributes of a <code>UnicodeSet</code> at once, based on a
* string pattern.
*
* <p><b>Pattern syntax</b></p>
*
* Patterns are accepted by the constructors and the
* <code>applyPattern()</code> methods and returned by the
* <code>toPattern()</code> method. These patterns follow a syntax
* similar to that employed by version 8 regular expression character
* classes. Here are some simple examples:
*
* \htmlonly<blockquote>\endhtmlonly
* <table>
* <tr align="top">
* <td nowrap valign="top" align="left"><code>[]</code></td>
* <td valign="top">No characters</td>
* </tr><tr align="top">
* <td nowrap valign="top" align="left"><code>[a]</code></td>
* <td valign="top">The character 'a'</td>
* </tr><tr align="top">
* <td nowrap valign="top" align="left"><code>[ae]</code></td>
* <td valign="top">The characters 'a' and 'e'</td>
* </tr>
* <tr>
* <td nowrap valign="top" align="left"><code>[a-e]</code></td>
* <td valign="top">The characters 'a' through 'e' inclusive, in Unicode code
* point order</td>
* </tr>
* <tr>
* <td nowrap valign="top" align="left"><code>[\\u4E01]</code></td>
* <td valign="top">The character U+4E01</td>
* </tr>
* <tr>
* <td nowrap valign="top" align="left"><code>[a{ab}{ac}]</code></td>
* <td valign="top">The character 'a' and the multicharacter strings "ab" and
* "ac"</td>
* </tr>
* <tr>
* <td nowrap valign="top" align="left"><code>[\\p{Lu}]</code></td>
* <td valign="top">All characters in the general category Uppercase Letter</td>
* </tr>
* </table>
* \htmlonly</blockquote>\endhtmlonly
*
* Any character may be preceded by a backslash in order to remove any special
* meaning. White space characters, as defined by UCharacter.isWhitespace(), are
* ignored, unless they are escaped.
*
* <p>Property patterns specify a set of characters having a certain
* property as defined by the Unicode standard. Both the POSIX-like
* "[:Lu:]" and the Perl-like syntax "\\p{Lu}" are recognized. For a
* complete list of supported property patterns, see the User's Guide
* for UnicodeSet at
* <a href="http://icu-project.org/userguide/unicodeSet.html">
* http://icu-project.org/userguide/unicodeSet.html</a>.
* Actual determination of property data is defined by the underlying
* Unicode database as implemented by UCharacter.
*
* <p>Patterns specify individual characters, ranges of characters, and
* Unicode property sets. When elements are concatenated, they
* specify their union. To complement a set, place a '^' immediately
* after the opening '['. Property patterns are inverted by modifying
* their delimiters; "[:^foo:]" and "\\P{foo}". In any other location,
* '^' has no special meaning.
*
* <p>Ranges are indicated by placing a '-' between two
* characters, as in "a-z". This specifies the range of all
* characters from the left to the right, in Unicode order. If the
* left character is greater than or equal to the
* right character it is a syntax error. If a '-' occurs as the first
* character after the opening '[' or '[^', or if it occurs as the
* last character before the closing ']', then it is taken as a
* literal. Thus "[a\-b]", "[-ab]", and "[ab-]" all indicate the same
* set of three characters, 'a', 'b', and '-'.
*
* <p>Sets may be intersected using the '&' operator or the asymmetric
* set difference may be taken using the '-' operator, for example,
* "[[:L:]&[\\u0000-\\u0FFF]]" indicates the set of all Unicode letters
* with values less than 4096. Operators ('&' and '-') have equal
* precedence and bind left-to-right. Thus
* "[[:L:]-[a-z]-[\\u0100-\\u01FF]]" is equivalent to
* "[[[:L:]-[a-z]]-[\\u0100-\\u01FF]]". This only really matters for
* difference; intersection is commutative.
*
* <table>
* <tr valign=top><td nowrap><code>[a]</code><td>The set containing 'a'
* <tr valign=top><td nowrap><code>[a-z]</code><td>The set containing 'a'
* through 'z' and all letters in between, in Unicode order
* <tr valign=top><td nowrap><code>[^a-z]</code><td>The set containing
* all characters but 'a' through 'z',
* that is, U+0000 through 'a'-1 and 'z'+1 through U+10FFFF
* <tr valign=top><td nowrap><code>[[<em>pat1</em>][<em>pat2</em>]]</code>
* <td>The union of sets specified by <em>pat1</em> and <em>pat2</em>
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]&[<em>pat2</em>]]</code>
* <td>The intersection of sets specified by <em>pat1</em> and <em>pat2</em>
* <tr valign=top><td nowrap><code>[[<em>pat1</em>]-[<em>pat2</em>]]</code>
* <td>The asymmetric difference of sets specified by <em>pat1</em> and
* <em>pat2</em>
* <tr valign=top><td nowrap><code>[:Lu:] or \\p{Lu}</code>
* <td>The set of characters having the specified
* Unicode property; in
* this case, Unicode uppercase letters
* <tr valign=top><td nowrap><code>[:^Lu:] or \\P{Lu}</code>
* <td>The set of characters <em>not</em> having the given
* Unicode property
* </table>
*
* <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
*
* <p><b>Formal syntax</b></p>
*
* \htmlonly<blockquote>\endhtmlonly
* <table>
* <tr align="top">
* <td nowrap valign="top" align="right"><code>pattern := </code></td>
* <td valign="top"><code>('[' '^'? item* ']') |
* property</code></td>
* </tr>
* <tr align="top">
* <td nowrap valign="top" align="right"><code>item := </code></td>
* <td valign="top"><code>char | (char '-' char) | pattern-expr<br>
* </code></td>
* </tr>
* <tr align="top">
* <td nowrap valign="top" align="right"><code>pattern-expr := </code></td>
* <td valign="top"><code>pattern | pattern-expr pattern |
* pattern-expr op pattern<br>
* </code></td>
* </tr>
* <tr align="top">
* <td nowrap valign="top" align="right"><code>op := </code></td>
* <td valign="top"><code>'&' | '-'<br>
* </code></td>
* </tr>
* <tr align="top">
* <td nowrap valign="top" align="right"><code>special := </code></td>
* <td valign="top"><code>'[' | ']' | '-'<br>
* </code></td>
* </tr>
* <tr align="top">
* <td nowrap valign="top" align="right"><code>char := </code></td>
* <td valign="top"><em>any character that is not</em><code> special<br>
* | ('\' </code><em>any character</em><code>)<br>
* | ('\\u' hex hex hex hex)<br>
* </code></td>
* </tr>
* <tr align="top">
* <td nowrap valign="top" align="right"><code>hex := </code></td>
* <td valign="top"><em>any character for which
* </em><code>Character.digit(c, 16)</code><em>
* returns a non-negative result</em></td>
* </tr>
* <tr>
* <td nowrap valign="top" align="right"><code>property := </code></td>
* <td valign="top"><em>a Unicode property set pattern</em></td>
* </tr>
* </table>
* <br>
* <table border="1">
* <tr>
* <td>Legend: <table>
* <tr>
* <td nowrap valign="top"><code>a := b</code></td>
* <td width="20" valign="top"> </td>
* <td valign="top"><code>a</code> may be replaced by <code>b</code> </td>
* </tr>
* <tr>
* <td nowrap valign="top"><code>a?</code></td>
* <td valign="top"></td>
* <td valign="top">zero or one instance of <code>a</code><br>
* </td>
* </tr>
* <tr>
* <td nowrap valign="top"><code>a*</code></td>
* <td valign="top"></td>
* <td valign="top">zero or more instances of <code>a</code><br>
* </td>
* </tr>
* <tr>
* <td nowrap valign="top"><code>a | b</code></td>
* <td valign="top"></td>
* <td valign="top">either <code>a</code> or <code>b</code><br>
* </td>
* </tr>
* <tr>
* <td nowrap valign="top"><code>'a'</code></td>
* <td valign="top"></td>
* <td valign="top">the literal string between the quotes </td>
* </tr>
* </table>
* </td>
* </tr>
* </table>
* \htmlonly</blockquote>\endhtmlonly
*
* <p>Note:
* - Most UnicodeSet methods do not take a UErrorCode parameter because
* there are usually very few opportunities for failure other than a shortage
* of memory, error codes in low-level C++ string methods would be inconvenient,
* and the error code as the last parameter (ICU convention) would prevent
* the use of default parameter values.
* Instead, such methods set the UnicodeSet into a "bogus" state
* (see isBogus()) if an error occurs.
*
* @author Alan Liu
* @stable ICU 2.0
*/
class U_COMMON_API UnicodeSet : public UnicodeFilter {
int32_t len; // length of list used; 0 <= len <= capacity
int32_t capacity; // capacity of list
UChar32* list; // MUST be terminated with HIGH
BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
UChar32* buffer; // internal buffer, may be NULL
int32_t bufferCapacity; // capacity of buffer
int32_t patLen;
/**
* The pattern representation of this set. This may not be the
* most economical pattern. It is the pattern supplied to
* applyPattern(), with variables substituted and whitespace
* removed. For sets constructed without applyPattern(), or
* modified using the non-pattern API, this string will be empty,
* indicating that toPattern() must generate a pattern
* representation from the inversion list.
*/
UChar *pat;
UVector* strings; // maintained in sorted order
UnicodeSetStringSpan *stringSpan;
private:
enum { // constants
kIsBogus = 1 // This set is bogus (i.e. not valid)
};
uint8_t fFlags; // Bit flag (see constants above)
public:
/**
* Determine if this object contains a valid set.
* A bogus set has no value. It is different from an empty set.
* It can be used to indicate that no set value is available.
*
* @return TRUE if the set is bogus (invalid), FALSE otherwise
* @see setToBogus()
* @draft ICU 4.0
*/
inline UBool isBogus(void) const;
/**
* Make this UnicodeSet object invalid.
* The set will test TRUE with isBogus().
*
* A bogus set has no value. It is different from an empty set.
* It can be used to indicate that no set value is available.
*
* This utility function is used throughout the UnicodeSet
* implementation to indicate that a UnicodeSet operation failed,
* and may be used in other functions,
* especially but not exclusively when such functions do not
* take a UErrorCode for simplicity.
*
* @see isBogus()
* @draft ICU 4.0
*/
void setToBogus();
public:
enum {
/**
* Minimum value that can be stored in a UnicodeSet.
* @stable ICU 2.4
*/
MIN_VALUE = 0,
/**
* Maximum value that can be stored in a UnicodeSet.
* @stable ICU 2.4
*/
MAX_VALUE = 0x10ffff
};
//----------------------------------------------------------------
// Constructors &c
//----------------------------------------------------------------
public:
/**
* Constructs an empty set.
* @stable ICU 2.0
*/
UnicodeSet();
/**
* Constructs a set containing the given range. If <code>start >
* end</code> then an empty set is created.
*
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
* @stable ICU 2.4
*/
UnicodeSet(UChar32 start, UChar32 end);
/**
* Constructs a set from the given pattern. See the class
* description for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
* contains a syntax error.
* @stable ICU 2.0
*/
UnicodeSet(const UnicodeString& pattern,
UErrorCode& status);
/**
* Constructs a set from the given pattern. See the class
* description for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* @param symbols a symbol table mapping variable names to values
* and stand-in characters to UnicodeSets; may be NULL
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
* contains a syntax error.
* @internal
*/
UnicodeSet(const UnicodeString& pattern,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status);
/**
* Constructs a set from the given pattern. See the class description
* for the syntax of the pattern language.
* @param pattern a string specifying what characters are in the set
* @param pos on input, the position in pattern at which to start parsing.
* On output, the position after the last character parsed.
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* @param symbols a symbol table mapping variable names to values
* and stand-in characters to UnicodeSets; may be NULL
* @param status input-output error code
* @stable ICU 2.8
*/
UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status);
/**
* Constructs a set that is identical to the given UnicodeSet.
* @stable ICU 2.0
*/
UnicodeSet(const UnicodeSet& o);
/**
* Destructs the set.
* @stable ICU 2.0
*/
virtual ~UnicodeSet();
/**
* Assigns this object to be a copy of another.
* A frozen set will not be modified.
* @stable ICU 2.0
*/
UnicodeSet& operator=(const UnicodeSet& o);
/**
* Compares the specified object with this set for equality. Returns
* <tt>true</tt> if the two sets
* have the same size, and every member of the specified set is
* contained in this set (or equivalently, every member of this set is
* contained in the specified set).
*
* @param o set to be compared for equality with this set.
* @return <tt>true</tt> if the specified set is equal to this set.
* @stable ICU 2.0
*/
virtual UBool operator==(const UnicodeSet& o) const;
/**
* Compares the specified object with this set for inequality. Returns
* <tt>true</tt> if the specified set is not equal to this set.
* @stable ICU 2.0
*/
UBool operator!=(const UnicodeSet& o) const;
/**
* Returns a copy of this object. All UnicodeFunctor objects have
* to support cloning in order to allow classes using
* UnicodeFunctors, such as Transliterator, to implement cloning.
* If this set is frozen, then the clone will be frozen as well.
* Use cloneAsThawed() for a mutable clone of a frozen set.
* @see cloneAsThawed
* @stable ICU 2.0
*/
virtual UnicodeFunctor* clone() const;
/**
* Returns the hash code value for this set.
*
* @return the hash code value for this set.
* @see Object#hashCode()
* @stable ICU 2.0
*/
virtual int32_t hashCode(void) const;
//----------------------------------------------------------------
// Freezable API
//----------------------------------------------------------------
/**
* Determines whether the set has been frozen (made immutable) or not.
* See the ICU4J Freezable interface for details.
* @return TRUE/FALSE for whether the set has been frozen
* @see freeze
* @see cloneAsThawed
* @stable ICU 4.0
*/
inline UBool isFrozen() const;
/**
* Freeze the set (make it immutable).
* Once frozen, it cannot be unfrozen and is therefore thread-safe
* until it is deleted.
* See the ICU4J Freezable interface for details.
* Freezing the set may also make some operations faster, for example
* contains() and span().
* A frozen set will not be modified. (It remains frozen.)
* @return this set.
* @see isFrozen
* @see cloneAsThawed
* @stable ICU 4.0
*/
UnicodeFunctor *freeze();
/**
* Clone the set and make the clone mutable.
* See the ICU4J Freezable interface for details.
* @return the mutable clone
* @see freeze
* @see isFrozen
* @stable ICU 4.0
*/
UnicodeFunctor *cloneAsThawed() const;
//----------------------------------------------------------------
// Public API
//----------------------------------------------------------------
/**
* Make this object represent the range <code>start - end</code>.
* If <code>start > end</code> then this object is set to an
* empty range.
* A frozen set will not be modified.
*
* @param start first character in the set, inclusive
* @param end last character in the set, inclusive
* @stable ICU 2.4
*/
UnicodeSet& set(UChar32 start, UChar32 end);
/**
* Return true if the given position, in the given pattern, appears
* to be the start of a UnicodeSet pattern.
* @stable ICU 2.4
*/
static UBool resemblesPattern(const UnicodeString& pattern,
int32_t pos);
/**
* Modifies this set to represent the set specified by the given
* pattern, optionally ignoring white space. See the class
* description for the syntax of the pattern language.
* A frozen set will not be modified.
* @param pattern a string specifying what characters are in the set
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
* contains a syntax error.
* <em> Empties the set passed before applying the pattern.</em>
* @return a reference to this
* @stable ICU 2.0
*/
UnicodeSet& applyPattern(const UnicodeString& pattern,
UErrorCode& status);
/**
* Modifies this set to represent the set specified by the given
* pattern, optionally ignoring white space. See the class
* description for the syntax of the pattern language.
* A frozen set will not be modified.
* @param pattern a string specifying what characters are in the set
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* @param symbols a symbol table mapping variable names to
* values and stand-ins to UnicodeSets; may be NULL
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
* contains a syntax error.
*<em> Empties the set passed before applying the pattern.</em>
* @return a reference to this
* @internal
*/
UnicodeSet& applyPattern(const UnicodeString& pattern,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status);
/**
* Parses the given pattern, starting at the given position. The
* character at pattern.charAt(pos.getIndex()) must be '[', or the
* parse fails. Parsing continues until the corresponding closing
* ']'. If a syntax error is encountered between the opening and
* closing bracket, the parse fails. Upon return from a successful
* parse, the ParsePosition is updated to point to the character
* following the closing ']', and this set represents the set
* specified by the parsed pattern. This method calls
* itself recursively to parse embedded subpatterns.
*<em> Empties the set passed before applying the pattern.</em>
* A frozen set will not be modified.
*
* @param pattern the string containing the pattern to be parsed.
* The portion of the string from pos.getIndex(), which must be a
* '[', to the corresponding closing ']', is parsed.
* @param pos upon entry, the position at which to begin parsing.
* The character at pattern.charAt(pos.getIndex()) must be a '['.
* Upon return from a successful parse, pos.getIndex() is either
* the character after the closing ']' of the parsed pattern, or
* pattern.length() if the closing ']' is the last character of
* the pattern string.
* @param options bitmask for options to apply to the pattern.
* Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
* @param symbols a symbol table mapping variable names to
* values and stand-ins to UnicodeSets; may be NULL
* @param status returns <code>U_ILLEGAL_ARGUMENT_ERROR</code> if the pattern
* contains a syntax error.
* @return a reference to this
* @stable ICU 2.8
*/
UnicodeSet& applyPattern(const UnicodeString& pattern,
ParsePosition& pos,
uint32_t options,
const SymbolTable* symbols,
UErrorCode& status);
/**
* Returns a string representation of this set. If the result of
* calling this function is passed to a UnicodeSet constructor, it
* will produce another set that is equal to this one.
* A frozen set will not be modified.
* @param result the string to receive the rules. Previous
* contents will be deleted.
* @param escapeUnprintable if TRUE then convert unprintable
* character to their hex escape representations, \\uxxxx or
* \\Uxxxxxxxx. Unprintable characters are those other than
* U+000A, U+0020..U+007E.
* @stable ICU 2.0
*/
virtual UnicodeString& toPattern(UnicodeString& result,
UBool escapeUnprintable = FALSE) const;
/**
* Modifies this set to contain those code points which have the given value
* for the given binary or enumerated property, as returned by
* u_getIntPropertyValue. Prior contents of this set are lost.
* A frozen set will not be modified.
*
* @param prop a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1
* or UCHAR_INT_START..UCHAR_INT_LIMIT-1
* or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
*
* @param value a value in the range u_getIntPropertyMinValue(prop)..
* u_getIntPropertyMaxValue(prop), with one exception. If prop is
* UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but
* rather a mask value produced by U_GET_GC_MASK(). This allows grouped
* categories such as [:L:] to be represented.
*
* @param ec error code input/output parameter
*
* @return a reference to this set
*
* @stable ICU 2.4
*/
UnicodeSet& applyIntPropertyValue(UProperty prop,
int32_t value,
UErrorCode& ec);
/**
* Modifies this set to contain those code points which have the
* given value for the given property. Prior contents of this
* set are lost.
* A frozen set will not be modified.
*
* @param prop a property alias, either short or long. The name is matched
* loosely. See PropertyAliases.txt for names and a description of loose
* matching. If the value string is empty, then this string is interpreted
* as either a General_Category value alias, a Script value alias, a binary
* property alias, or a special ID. Special IDs are matched loosely and
* correspond to the following sets:
*
* "ANY" = [\\u0000-\\U0010FFFF],
* "ASCII" = [\\u0000-\\u007F],
* "Assigned" = [:^Cn:].
*
* @param value a value alias, either short or long. The name is matched
* loosely. See PropertyValueAliases.txt for names and a description of
* loose matching. In addition to aliases listed, numeric values and
* canonical combining classes may be expressed numerically, e.g., ("nv",
* "0.5") or ("ccc", "220"). The value string may also be empty.
*
* @param ec error code input/output parameter
*
* @return a reference to this set
*
* @stable ICU 2.4
*/
UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
const UnicodeString& value,
UErrorCode& ec);
/**
* Returns the number of elements in this set (its cardinality).
* Note that the elements of a set may include both individual
* codepoints and strings.
*
* @return the number of elements in this set (its cardinality).
* @stable ICU 2.0
*/
virtual int32_t size(void) const;
/**
* Returns <tt>true</tt> if this set contains no elements.
*
* @return <tt>true</tt> if this set contains no elements.
* @stable ICU 2.0
*/
virtual UBool isEmpty(void) const;
/**
* Returns true if this set contains the given character.
* This function works faster with a frozen set.
* @param c character to be checked for containment
* @return true if the test condition is met
* @stable ICU 2.0
*/
virtual UBool contains(UChar32 c) const;
/**
* Returns true if this set contains every character
* of the given range.
* @param start first character, inclusive, of the range
* @param end last character, inclusive, of the range
* @return true if the test condition is met
* @stable ICU 2.0
*/
virtual UBool contains(UChar32 start, UChar32 end) const;
/**
* Returns <tt>true</tt> if this set contains the given
* multicharacter string.
* @param s string to be checked for containment
* @return <tt>true</tt> if this set contains the specified string
* @stable ICU 2.4
*/
UBool contains(const UnicodeString& s) const;
/**
* Returns true if this set contains all the characters and strings
* of the given set.
* @param c set to be checked for containment
* @return true if the test condition is met
* @stable ICU 2.4
*/
virtual UBool containsAll(const UnicodeSet& c) const;
/**
* Returns true if this set contains all the characters
* of the given string.
* @param s string containing characters to be checked for containment
* @return true if the test condition is met
* @stable ICU 2.4
*/
UBool containsAll(const UnicodeString& s) const;
/**
* Returns true if this set contains none of the characters
* of the given range.
* @param start first character, inclusive, of the range
* @param end last character, inclusive, of the range
* @return true if the test condition is met
* @stable ICU 2.4
*/
UBool containsNone(UChar32 start, UChar32 end) const;
/**
* Returns true if this set contains none of the characters and strings
* of the given set.
* @param c set to be checked for containment
* @return true if the test condition is met
* @stable ICU 2.4
*/
UBool containsNone(const UnicodeSet& c) const;
/**
* Returns true if this set contains none of the characters
* of the given string.
* @param s string containing characters to be checked for containment
* @return true if the test condition is met
* @stable ICU 2.4
*/
UBool containsNone(const UnicodeString& s) const;
/**
* Returns true if this set contains one or more of the characters
* in the given range.
* @param start first character, inclusive, of the range
* @param end last character, inclusive, of the range
* @return true if the condition is met
* @stable ICU 2.4
*/
inline UBool containsSome(UChar32 start, UChar32 end) const;
/**
* Returns true if this set contains one or more of the characters
* and strings of the given set.
* @param s The set to be checked for containment
* @return true if the condition is met
* @stable ICU 2.4
*/
inline UBool containsSome(const UnicodeSet& s) const;
/**
* Returns true if this set contains one or more of the characters
* of the given string.
* @param s string containing characters to be checked for containment
* @return true if the condition is met
* @stable ICU 2.4
*/
inline UBool containsSome(const UnicodeString& s) const;
/**
* Returns the length of the initial substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Similar to the strspn() C library function.
* Unpaired surrogates are treated according to contains() of their surrogate code points.
* This function works faster with a frozen set and with a non-negative string length argument.
* @param s start of the string
* @param length the length of the string s; can be -1 if s is NUL-terminated
* @param spanCondition specifies the containment condition
* @return the length of the initial substring according to the spanCondition;
* 0 if the start of the string does not fit the spanCondition
* @stable ICU 4.0
* @see USetSpanCondition
*/
int32_t span(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Returns the start of the trailing substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Unpaired surrogates are treated according to contains() of their surrogate code points.
* This function works faster with a frozen set and with a non-negative string length argument.
* @param s start of the string
* @param length the length of the string s; can be -1 if s is NUL-terminated
* @param spanCondition specifies the containment condition
* @return the start of the trailing substring according to the spanCondition;
* the string length if the end of the string does not fit the spanCondition
* @stable ICU 4.0
* @see USetSpanCondition
*/
int32_t spanBack(const UChar *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Returns the length of the initial substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Similar to the strspn() C library function.
* Malformed byte sequences are treated according to contains(0xfffd).
* This function works faster with a frozen set and with a non-negative string length argument.
* @param s start of the string (UTF-8)
* @param length the length of the string s; can be -1 if s is NUL-terminated
* @param spanCondition specifies the containment condition
* @return the length of the initial substring according to the spanCondition;
* 0 if the start of the string does not fit the spanCondition
* @stable ICU 4.0
* @see USetSpanCondition
*/
int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Returns the start of the trailing substring of the input string which
* consists only of characters and strings that are contained in this set
* (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE),
* or only of characters and strings that are not contained
* in this set (USET_SPAN_NOT_CONTAINED).
* See USetSpanCondition for details.
* Malformed byte sequences are treated according to contains(0xfffd).
* This function works faster with a frozen set and with a non-negative string length argument.
* @param s start of the string (UTF-8)
* @param length the length of the string s; can be -1 if s is NUL-terminated
* @param spanCondition specifies the containment condition
* @return the start of the trailing substring according to the spanCondition;
* the string length if the end of the string does not fit the spanCondition
* @stable ICU 4.0
* @see USetSpanCondition
*/
int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
/**
* Implements UnicodeMatcher::matches().
* See the documentation of UnicodeMatcher::matches() for the meaning
* of the parameters and of the returned UMatchDegree value.
* @stable ICU 2.4
*/
virtual UMatchDegree matches(const Replaceable& text,
int32_t& offset,
int32_t limit,
UBool incremental);
private:
/**
* Returns the longest match for s in text at the given position.
* If limit > start then match forward from start+1 to limit
* matching all characters except s.charAt(0). If limit < start,
* go backward starting from start-1 matching all characters
* except s.charAt(s.length()-1). This method assumes that the
* first character, text.charAt(start), matches s, so it does not
* check it.
* @param text the text to match
* @param start the offset in text of the first character to match.
* In the forward direction, text.charAt(start) is matched against
* s.charAt(0). In the reverse direction, it is matched against
* s.charAt(s.length()-1).
* @param limit the limit offset for matching, either last+1 in
* the forward direction, or last-1 in the reverse direction,
* where last is the index of the last character to match.
* @param s the string to be matched against text
* @return If part of s matches up to the limit, return |limit -
* start|. If all of s matches before reaching the limit, return
* s.length(). If there is a mismatch between s and text, return
* 0.
*/
static int32_t matchRest(const Replaceable& text,
int32_t start, int32_t limit,
const UnicodeString& s);
/**
* Returns the smallest value i such that c < list[i]. This method
* performs a binary search.
* The caller must ensure that c is a legal value; otherwise this
* method will enter an infinite loop.
* @param c a character in the range MIN_VALUE..MAX_VALUE
* inclusive
* @return the smallest integer i in the range 0..len-1,
* inclusive, such that c < list[i]
*/
int32_t findCodePoint(UChar32 c) const;
public:
/**
* Implementation of UnicodeMatcher API. Unions the set of all
* characters that may be matched by this object into the given
* set.
* @param toUnionTo the set into which to union the source characters
* @stable ICU 2.4
*/
virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
/**
* Returns the index of the given character within this set, where
* the set is ordered by ascending code point. If the character
* is not in this set, return -1. The inverse of this method is
* <code>charAt()</code>.
* @param c the character whose index within this set is requested
* @return an index from 0..size()-1, or -1
* @stable ICU 2.4
*/
int32_t indexOf(UChar32 c) const;
/**
* Returns the character at the given index within this set, where
* the set is ordered by ascending code point. If the index is
* out of range, returns (UChar32)-1. The inverse of this method is
* <code>indexOf()</code>.
* @param index an index from 0..size()-1
* @return the character at the given index, or (UChar32)-1
* if the index is out of range.
* @stable ICU 2.4
*/
UChar32 charAt(int32_t index) const;
/**
* Adds the specified range to this set if it is not already
* present. If this set already contains the specified range,
* the call leaves this set unchanged. If <code>start > end</code>
* then an empty range is added, leaving the set unchanged.
* This is equivalent to a boolean logic OR, or a set UNION.
* A frozen set will not be modified.
*
* @param start first character, inclusive, of range to be added
* to this set.
* @param end last character, inclusive, of range to be added
* to this set.
* @stable ICU 2.0
*/
virtual UnicodeSet& add(UChar32 start, UChar32 end);
/**
* Adds the specified character to this set if it is not already
* present. If this set already contains the specified character,
* the call leaves this set unchanged.
* A frozen set will not be modified.
* @param c the character to be added to this set
* @stable ICU 2.0
*/
UnicodeSet& add(UChar32 c);
/**
* Adds the specified multicharacter string to this set if it is not
* already present. If this set already contains the multicharacter
* string, the call leaves this set unchanged.
* Thus "ch" => {"ch"}
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
* A frozen set will not be modified.
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.4
*/
UnicodeSet& add(const UnicodeString& s);
private:
/**
* @return a code point IF the string consists of a single one.