Implement fast "nothing to do" normalization.

jnthn · jnthn · commit 79b6087dbd02 · 2015-04-07T18:04:18.000+02:00
When two codepoints in a row are below the threshold of significance
for the target normalization form, we can immediately hand the first one
back. For now, the slow path doing the full check does exactly the
same; that's where we'll add the interesting bits soon.
diff --git a/src/strings/normalize.c b/src/strings/normalize.c
@@ -86,13 +86,48 @@ void MVM_unicode_normalizer_init(MVMThreadContext *tc, MVMNormalizer *n, MVMNorm
     n->buffer_start    = 0;
     n->buffer_end      = 0;
     n->buffer_norm_end = 0;
+    switch (n->form) {
+        case MVM_NORMALIZE_NFD:  n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFD;  break;
+        case MVM_NORMALIZE_NFKD: n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKD; break;
+        case MVM_NORMALIZE_NFC:  n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC;  break;
+        case MVM_NORMALIZE_NFKC: n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFKC; break;
+        case MVM_NORMALIZE_NFG:  n->first_significant = MVM_NORMALIZE_FIRST_SIG_NFC;  break;
+    }
 }
 
 /* Cleanup an MVMNormalization once we're done normalizing. */
 void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n) {
     free(n->buffer);
 }
 
+/* Appends a codepoint at buffer_end; panics if the buffer is full (resizing is NYI). */
+static void add_codepoint_to_buffer(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint cp) {
+    if (n->buffer_end == n->buffer_size) {
+        MVM_panic(1, "Resize of codepoint buffer NYI");
+    }
+    n->buffer[n->buffer_end++] = cp;
+}
+
+/* Called when the very fast case of normalization fails (that is, when the
+ * buffered and incoming codepoints are not both below the first significant
+ * codepoint for the target form, or the buffer does not hold exactly one). We
+ * may find the quick check itself is enough; if not, we have to do real work
+ * to compute the normalization. */
+MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
+    /* If our buffer is empty, we can only add to it; return 0 as nothing is output yet. */
+    if (n->buffer_start == n->buffer_end) {
+        add_codepoint_to_buffer(tc, n, in);
+        return 0;
+    }
+
+    /* TODO: actually normalize. For now, output the buffered codepoint, keep the new one in its slot, and return 1. */
+    *out = n->buffer[n->buffer_start];
+    n->buffer[n->buffer_start] = in;
+    return 1;
+}
+
 /* Called when we are expecting no more codepoints. */
 void MVM_unicode_normalizer_eof(MVMThreadContext *tc, MVMNormalizer *n) {
+    /* TODO: actually normalize. For now, mark everything buffered as returnable. */
+    n->buffer_norm_end = n->buffer_end;
 }
diff --git a/src/strings/normalize.h b/src/strings/normalize.h
@@ -11,6 +11,13 @@ typedef enum {
     MVM_NORMALIZE_NFG   = 6
 } MVMNormalization;
 
+/* First codepoint where we have to actually do a real check and maybe some
+ * work when normalizing. */
+#define MVM_NORMALIZE_FIRST_SIG_NFD     0x00C0
+#define MVM_NORMALIZE_FIRST_SIG_NFC     0x0300
+#define MVM_NORMALIZE_FIRST_SIG_NFKD    0x00A0
+#define MVM_NORMALIZE_FIRST_SIG_NFKC    0x00A0
+
 /* Streaming Unicode normalizer structure. */
 struct MVMNormalizer {
     /* What form of normalization are we doing? */
@@ -30,6 +37,11 @@ struct MVMNormalizer {
 
     /* End offset in the buffer for things we've normalized and so can return. */
     MVMint32 buffer_norm_end;
+
+    /* The first significant codepoint in this normalization form that we may
+     * have to do something with. If we see two things beneath the limit in a
+     * row then we know the first one below it is good to spit out. */
+    MVMCodepoint first_significant;
 };
 
 /* Takes a codepoint to process for normalization as the "in" parameter. If we
@@ -38,9 +50,17 @@ struct MVMNormalizer {
  * codepoints now available including the one we just passed out. If we can't
  * produce a normalized codepoint right now, we return a 0. */
 MVM_STATIC_INLINE MVMint32 MVM_unicode_normalizer_process_codepoint(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out) {
-    /* TODO: Implement normalization! */
-    *out = in;
-    return 1;
+    /* Fast path (one in, one out): one codepoint buffered, and both it and the new one are below first_significant. */
+    if (n->buffer_end - n->buffer_start == 1 && in < n->first_significant) {
+        if (n->buffer[n->buffer_start] < n->first_significant) {
+            *out = n->buffer[n->buffer_start];
+            n->buffer[n->buffer_start] = in;
+            return 1;
+        }
+    }
+
+    /* Otherwise fall back to the slow path, which does the full check. */
+    return MVM_unicode_normalizer_process_codepoint_full(tc, n, in, out);
 }
 
 /* TODO: grapheme version of the above. */
@@ -75,4 +95,4 @@ void MVM_unicode_normalizer_cleanup(MVMThreadContext *tc, MVMNormalizer *n);
 void MVM_unicode_normalize_codepoints(MVMThreadContext *tc, MVMObject *in, MVMObject *out, MVMNormalization form);
 
 /* Guts-y functions, called by the API level ones above. */
-/* TODO: many of these. */
+MVMint32 MVM_unicode_normalizer_process_codepoint_full(MVMThreadContext *tc, MVMNormalizer *n, MVMCodepoint in, MVMCodepoint *out);