Skip to content

Commit 79b6087

Browse files
committed
Implement fast "nothing to do" normalization.
When two codepoints in a row are below the threshold of significance for the target normalization form, we can immediately hand the first one back. For now, the slow path doing the full check does exactly the same; that's where we'll add the interesting bits soon.
1 parent 2bcdf64 commit 79b6087

File tree

2 files changed

+59
-4
lines changed

2 files changed

+59
-4
lines changed

src/strings/normalize.c

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,13 +86,48 @@ void MVM_unicode_normalizer_init(MVMThreadContext *tc, MVMNormalizer *n, MVMNorm
8686
n->buffer_start = 0;
8787
n->buffer_end = 0;
8888
n->buffer_norm_end = 0;
89+
switch (n->form) {
90+
case MVM_NORMALIZE_NFD: n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFD; break;
91+
case MVM_NORMALIZE_NFKD: n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKD; break;
92+
case MVM_NORMALIZE_NFC: n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC; break;
93+
case MVM_NORMALIZE_NFKC: n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKC; break;
94+
case MVM_NORMALIZE_NFG: n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC; break;
95+
}
8996
}
9097

9198
/* Clean up an MVMNormalizer once we're done normalizing. */
9299
void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n) {
93100
free(n->buffer);
94101
}
95102

103+
/* Adds a codepoint into the buffer, making sure there's space. */
104+
static void add_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
105+
if (n->buffer_end == n->buffer_size) {
106+
MVM_panic(1, "Resize of codepoint buffer NYI");
107+
}
108+
n->buffer[n->buffer_end++] = cp;
109+
}
110+
111+
/* Called when the very fast case of normalization fails (that is, when we get
112+
* any two codepoints in a row where at least one is at or above the first
113+
* significant codepoint identified by a quick check for the target form). We
114+
* may find the quick check itself is enough; if not, we have to do real work
115+
* to compute the normalization. */
116+
MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
117+
/* If our buffer is empty, we can only add to it and we're done. */
118+
if (n->buffer_start == n->buffer_end) {
119+
add_codepoint_to_buffer(tc, n, in);
120+
return 0;
121+
}
122+
123+
/* TODO: actually normalize. */
124+
*out = n->buffer[n->buffer_start];
125+
n->buffer[n->buffer_start] = in;
126+
return 1;
127+
}
128+
96129
/* Called when we are expecting no more codepoints. */
97130
void MVM_unicode_normalizer_eof(MVMThreadContext *tc, MVMNormalizer *n) {
131+
/* TODO: actually normalize. */
132+
n->buffer_norm_end = n->buffer_end;
98133
}

src/strings/normalize.h

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@ typedef enum {
1111
MVM_NORMALIZE_NFG = 6
1212
} MVMNormalization;
1313

14+
/* First codepoint where we have to actually do a real check and maybe some
15+
* work when normalizing. */
16+
#define MVM_NORMALIZE_FIRST_SIG_NFD 0x00C0
17+
#define MVM_NORMALIZE_FIRST_SIG_NFC 0x0300
18+
#define MVM_NORMALIZE_FIRST_SIG_NFKD 0x00A0
19+
#define MVM_NORMALIZE_FIRST_SIG_NFKC 0x00A0
20+
1421
/* Streaming Unicode normalizer structure. */
1522
struct MVMNormalizer {
1623
/* What form of normalization are we doing? */
@@ -30,6 +37,11 @@ struct MVMNormalizer {
3037

3138
/* End offset in the buffer for things we've normalized and so can return. */
3239
MVMint32 buffer_norm_end;
40+
41+
/* The first significant codepoint in this normalization form that we may
42+
* have to do something with. If we see two things beneath the limit in a
43+
* row then we know the first one below it is good to spit out. */
44+
MVMCodepoint first_significant;
3345
};
3446

3547
/* Takes a codepoint to process for normalization as the "in" parameter. If we
@@ -38,9 +50,17 @@ struct MVMNormalizer {
3850
* codepoints now available including the one we just passed out. If we can't
3951
* produce a normalized codepoint right now, we return a 0. */
4052
MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
41-
/* TODO: Implement normalization! */
42-
*out = in;
43-
return 1;
53+
/* Fast-path when it's one-in-one-out. */
54+
if (n->buffer_end - n->buffer_start == 1 && in < n->first_significant) {
55+
if (n->buffer[n->buffer_start] < n->first_significant) {
56+
*out = n->buffer[n->buffer_start];
57+
n->buffer[n->buffer_start] = in;
58+
return 1;
59+
}
60+
}
61+
62+
/* Fall back to slow path. */
63+
return MVM_unicode_normalizer_process_codepoint_full(tc, n, in, out);
4464
}
4565

4666
/* TODO: grapheme version of the above. */
@@ -75,4 +95,4 @@ void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n);
7595
void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, MVMObject *in, MVMObject *out, MVMNormalization form);
7696

7797
/* Guts-y functions, called by the API level ones above. */
78-
/* TODO: many of these. */
98+
MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out);

0 commit comments

Comments
 (0)