@@ -11,6 +11,13 @@ typedef enum {
11
11
MVM_NORMALIZE_NFG = 6
12
12
} MVMNormalization ;
13
13
14
+ /* First codepoint where we have to actually do a real check and maybe some
15
+ * work when normalizing. */
16
+ #define MVM_NORMALIZE_FIRST_SIG_NFD 0x00C0
17
+ #define MVM_NORMALIZE_FIRST_SIG_NFC 0x0300
18
+ #define MVM_NORMALIZE_FIRST_SIG_NFKD 0x00A0
19
+ #define MVM_NORMALIZE_FIRST_SIG_NFKC 0x00A0
20
+
14
21
/* Streaming Unicode normalizer structure. */
15
22
struct MVMNormalizer {
16
23
/* What form of normalization are we doing? */
@@ -30,6 +37,11 @@ struct MVMNormalizer {
30
37
31
38
/* End offset in the buffer for things we've normalized and so can return. */
32
39
MVMint32 buffer_norm_end ;
40
+
41
+ /* The first significant codepoint in this normalization form, i.e. the
42
+ * first one that we may have to do something with. If we see two codepoints
43
+ * in a row that are both below this limit, the first of them is good to emit. */
44
+ MVMCodepoint first_significant ;
33
45
};
34
46
35
47
/* Takes a codepoint to process for normalization as the "in" parameter. If we
@@ -38,9 +50,17 @@ struct MVMNormalizer {
38
50
* codepoints now available including the one we just passed out. If we can't
39
51
* produce a normalized codepoint right now, we return a 0. */
40
52
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint (MVMThreadContext * tc , MVMNormalizer * n , MVMCodepoint in , MVMCodepoint * out ) {
41
- /* TODO: Implement normalization! */
42
- * out = in ;
43
- return 1 ;
53
+ /* Fast-path when it's one-in-one-out. */
54
+ if (n -> buffer_end - n -> buffer_start == 1 && in < n -> first_significant ) {
55
+ if (n -> buffer [n -> buffer_start ] < n -> first_significant ) {
56
+ * out = n -> buffer [n -> buffer_start ];
57
+ n -> buffer [n -> buffer_start ] = in ;
58
+ return 1 ;
59
+ }
60
+ }
61
+
62
+ /* Fall back to slow path. */
63
+ return MVM_unicode_normalizer_process_codepoint_full (tc , n , in , out );
44
64
}
45
65
46
66
/* TODO: grapheme version of the above. */
@@ -75,4 +95,4 @@ void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n);
75
95
void MVM_unicode_normalize_codepoints (MVMThreadContext * tc , MVMObject * in , MVMObject * out , MVMNormalization form );
76
96
77
97
/* Guts-y functions, called by the API level ones above. */
78
- /* TODO: many of these. */
98
+ MVMint32 MVM_unicode_normalizer_process_codepoint_full ( MVMThreadContext * tc , MVMNormalizer * n , MVMCodepoint in , MVMCodepoint * out );
0 commit comments