Skip to content

Commit

Permalink
Speed up MVM_string_find_cclass and cclass
Browse files Browse the repository at this point in the history
Speeds up .words in rakudo a fair amount by speeding up WhiteSpace
search. Also convert code that tested several General Category values
one at a time so that it tests a single combined property (L or P) instead.

Add code to ucd2c.pl that generates macros for testing White_Space, Zl and
Zp codepoints directly, so these checks no longer have to query the Unicode
database. The macros can be used in conditionals and improve performance.
  • Loading branch information
samcv committed May 26, 2018
1 parent a8ceea0 commit 6bcd795
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 51 deletions.
93 changes: 44 additions & 49 deletions src/strings/ops.c
Expand Up @@ -2582,12 +2582,12 @@ void MVM_string_cclass_init(MVMThreadContext *tc) {
UPV_Po = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Po"));
}

#include "strings/unicode_prop_macros.h"
/* Checks if the specified grapheme is in the given character class. */
static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g) {
/* If it's a synthetic, then grab the base codepoint. */
MVMCodepoint cp;
if (MVM_LIKELY(g >= 0))
if (0 <= g)
cp = (MVMCodepoint)g;
else
cp = MVM_nfg_get_synthetic_info(tc, g)->codes[0];
Expand Down Expand Up @@ -2628,17 +2628,10 @@ static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGra
else
return 0;
}
return
MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lo) /* lots of CJK chars */
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll) /* (ascii handled above) */
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lt)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lm);
/* Property L covers Lo, Ll, Lu, Lt, Lm */
return !!MVM_unicode_codepoint_get_property_int(tc, cp,
MVM_UNICODE_PROPERTY_L);
/* TODO: Maybe we want MVM_UNICODE_PROPERTY_ALPHABETIC instead? */

case MVM_CCLASS_NUMERIC:
if (cp <= '9' && cp >= '0') /* short circuit common case */
Expand All @@ -2651,14 +2644,7 @@ static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGra
MVM_UNICODE_PROPERTY_ASCII_HEX_DIGIT, 1);

case MVM_CCLASS_WHITESPACE:
if (cp <= '~') {
if (cp == ' ' || (cp <= 13 && cp >= 9))
return 1;
else
return 0;
}
return MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_WHITE_SPACE, 1);
return MVM_CP_is_White_Space(cp);

case MVM_CCLASS_BLANK:
if (cp == '\t')
Expand All @@ -2675,28 +2661,12 @@ static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGra
}

case MVM_CCLASS_PUNCTUATION:
return
MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pc)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pd)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ps)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pe)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pi)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pf)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Po);
return !!MVM_unicode_codepoint_get_property_int(tc, cp,
MVM_UNICODE_PROPERTY_P);

case MVM_CCLASS_NEWLINE: {
/* TODO maybe we should edit ucd2c.pl to give us all Zl and Zp
* characters. ATM and maybe forever the only Zp is U+2029
* and the only Zl is U+2028 */
if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
cp == 0x85 || cp == 0x2028 || cp == 0x2029)
cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp))
return 1;
}

Expand Down Expand Up @@ -2732,15 +2702,20 @@ MVMint64 MVM_string_find_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString
MVM_string_gi_init(tc, &gi, s);
MVM_string_gi_move_to(tc, &gi, offset);
switch (cclass) {
case MVM_CCLASS_WHITESPACE:
for (pos = offset; pos < end; pos++) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
if (MVM_CP_is_White_Space(cp))
return pos;
}
break;
case MVM_CCLASS_NEWLINE:
for (pos = offset; pos < end; pos++) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
MVMCodepoint cp = g >= 0 ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
/* TODO maybe we should edit ucd2c.pl to give us all Zl and Zp
* characters. ATM and maybe forever the only Zp is U+2029
* and the only Zl is U+2028 */
MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
cp == 0x85 || cp == 0x2028 || cp == 0x2029)
cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp))
return pos;
}
break;
Expand Down Expand Up @@ -2770,10 +2745,30 @@ MVMint64 MVM_string_find_not_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMSt

MVM_string_gi_init(tc, &gi, s);
MVM_string_gi_move_to(tc, &gi, offset);
for (pos = offset; pos < end; pos++) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
if (grapheme_is_cclass(tc, cclass, g) == 0)
return pos;
switch (cclass) {
case MVM_CCLASS_WHITESPACE:
for (pos = offset; pos < end; pos++) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
if (!MVM_CP_is_White_Space(cp))
return pos;
}
break;
case MVM_CCLASS_NEWLINE:
for (pos = offset; pos < end; pos++) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
MVMCodepoint cp = 0 <= g ? g : MVM_nfg_get_synthetic_info(tc, g)->codes[0];
if (!(cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
cp == 0x85 || MVM_CP_is_gencat_name_Zl(cp) || MVM_CP_is_gencat_name_Zp(cp)))
return pos;
}
break;
default:
for (pos = offset; pos < end; pos++) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
if (!grapheme_is_cclass(tc, cclass, g))
return pos;
}
}

return end;
Expand Down
3 changes: 3 additions & 0 deletions src/strings/unicode_prop_macros.h
@@ -0,0 +1,3 @@
#define MVM_CP_is_White_Space(cp) (((cp) == 9) || ((cp) == 10) || ((cp) == 11) || ((cp) == 12) || ((cp) == 13) || ((cp) == 32) || ((cp) == 133) || ((cp) == 160) || ((cp) == 5760) || ((cp) == 8192) || ((cp) == 8193) || ((cp) == 8194) || ((cp) == 8195) || ((cp) == 8196) || ((cp) == 8197) || ((cp) == 8198) || ((cp) == 8199) || ((cp) == 8200) || ((cp) == 8201) || ((cp) == 8202) || ((cp) == 8232) || ((cp) == 8233) || ((cp) == 8239) || ((cp) == 8287) || ((cp) == 12288))
#define MVM_CP_is_gencat_name_Zl(cp) (((cp) == 8232))
#define MVM_CP_is_gencat_name_Zp(cp) (((cp) == 8233))
36 changes: 34 additions & 2 deletions tools/ucd2c.pl
Expand Up @@ -162,7 +162,7 @@ sub main {
skip_most:
break_property('Word', 'Word_Break');
tweak_nfg_qc();

find_quick_prop_data();
# Allocate all the things
progress("done.\nsetting next_point for codepoints");
my $first_point = set_next_points();
Expand Down Expand Up @@ -201,7 +201,39 @@ sub main {
print "\nDONE!!!\n\n";
return 1;
}

# Generates src/strings/unicode_prop_macros.h.
#
# For every wanted property (boolean) or property/value pair (string valued)
# this collects the matching codepoints from $POINTS_BY_CODE and emits one C
# macro of the form
#
#     #define MVM_CP_is_<name>(cp) (((cp) == N) || ((cp) == M) ...)
#
# where <name> is the property name for boolean properties and
# "<property>_<value>" for string-valued ones. Codepoints appear in ascending
# numeric order and the macros are sorted by name. A wanted entry that matches
# no codepoint produces no macro at all.
#
# Takes no arguments; the assembled text is handed to write_file().
sub find_quick_prop_data {
    # String-valued properties: [ property name, wanted value, ... ].
    my @wanted_val_str = (
        [ 'gencat_name', 'Zl', 'Zp' ],
    );
    # Boolean properties: [ property name, 1 ]. Only the name is consulted;
    # any true value stored for the codepoint counts as a match.
    my @wanted_val_bool = (
        [ 'White_Space', 1 ]
    );

    # Macro name suffix => codepoints having that property, in ascending order.
    my %codes_for;
    for my $code (sort { $a <=> $b } keys %{$POINTS_BY_CODE}) {
        my $point = $POINTS_BY_CODE->{$code};
        for my $cat_data (@wanted_val_str) {
            my ($propname, @pvals) = @$cat_data;
            my $have = $point->{$propname};
            # A codepoint without this property cannot match any wanted value;
            # skipping it also avoids "uninitialized" warnings from eq.
            next unless defined $have;
            for my $pval (@pvals) {
                push @{ $codes_for{"${propname}_$pval"} }, $code
                    if $have eq $pval;
            }
        }
        for my $cat_data (@wanted_val_bool) {
            my $propname = $cat_data->[0];
            push @{ $codes_for{$propname} }, $code
                if $point->{$propname};
        }
    }

    my @result;
    for my $pname (sort keys %codes_for) {
        my $checks = join ' || ', map { "((cp) == $_)" } @{ $codes_for{$pname} };
        push @result, "#define MVM_CP_is_$pname(cp) ($checks)";
    }
    write_file("src/strings/unicode_prop_macros.h", join("\n", @result) . "\n");
}
sub thousands {
my $in = shift;
$in = reverse "$in"; # stringify or copy the string
Expand Down

0 comments on commit 6bcd795

Please sign in to comment.