Skip to content

Commit

Permalink
Merge pull request #756 from samcv/ucd2c.pl
Browse files Browse the repository at this point in the history
ucd2c.pl Improvements
  • Loading branch information
samcv committed Nov 25, 2017
2 parents 7e69859 + 8b785c3 commit c64d9d9
Showing 1 changed file with 60 additions and 53 deletions.
113 changes: 60 additions & 53 deletions tools/ucd2c.pl
Expand Up @@ -30,7 +30,6 @@
my $hout = "";
my $h_sections = {};
my $planes = [];
my $points_by_hex = {};
my $points_by_code = {};
my $enumerated_properties = {};
my $binary_properties = {};
Expand Down Expand Up @@ -114,39 +113,39 @@ sub main {
goto skip_most if $skip_most_mode;
binary_props('extracted/DerivedBinaryProperties');
binary_props("emoji-$highest_emoji_version/emoji-data");
enumerated_property('ArabicShaping', 'Joining_Group', {}, 0, 3);
enumerated_property('Blocks', 'Block', { No_Block => 0 }, 1, 1);
enumerated_property('ArabicShaping', 'Joining_Group', {}, 3);
enumerated_property('Blocks', 'Block', { No_Block => 0 }, 1);
# disabled because of sub Jamo
#enumerated_property('Jamo', 'Jamo_Short_Name', { }, 1, 1);
enumerated_property('extracted/DerivedDecompositionType', 'Decomposition_Type', { None => 0 }, 1, 1);
enumerated_property('extracted/DerivedEastAsianWidth', 'East_Asian_Width', {}, 0, 1);
enumerated_property('ArabicShaping', 'Joining_Type', {}, 0, 2);
enumerated_property('extracted/DerivedDecompositionType', 'Decomposition_Type', { None => 0 }, 1);
enumerated_property('extracted/DerivedEastAsianWidth', 'East_Asian_Width', {}, 1);
enumerated_property('ArabicShaping', 'Joining_Type', {}, 2);
CaseFolding();
SpecialCasing();
enumerated_property('DerivedAge',
'Age', { Unassigned => 0 }, 1, 1);
'Age', { Unassigned => 0 }, 1);
binary_props('DerivedCoreProperties');
DerivedNormalizationProps();
enumerated_property('extracted/DerivedNumericValues',
'Numeric_Value', { NaN => 0 }, 1, 1);
'Numeric_Value', { NaN => 0 }, 1);
enumerated_property('extracted/DerivedNumericValues',
'Numeric_Value_Numerator', { NaN => 0 }, 1, sub {
'Numeric_Value_Numerator', { NaN => 0 }, sub {
my @fraction = split('/', (shift->[3]));
return $fraction[0];
});
enumerated_property('extracted/DerivedNumericValues',
'Numeric_Value_Denominator', { NaN => 0 }, 1, sub {
'Numeric_Value_Denominator', { NaN => 0 }, sub {
my @fraction = split('/', (shift->[3]));
return $fraction[1] || '1';
});
enumerated_property('extracted/DerivedNumericType',
'Numeric_Type', { None => 0 }, 1, 1);
'Numeric_Type', { None => 0 }, 1);
enumerated_property('HangulSyllableType',
'Hangul_Syllable_Type', { Not_Applicable => 0 }, 1, 1);
'Hangul_Syllable_Type', { Not_Applicable => 0 }, 1);
LineBreak();
NamedSequences();
binary_props('PropList');
enumerated_property('Scripts', 'Script', { Unknown => 0 }, 1, 1);
enumerated_property('Scripts', 'Script', { Unknown => 0 }, 1);
# XXX StandardizedVariants.txt # no clue what this is
grapheme_cluster_break('Grapheme', 'Grapheme_Cluster_Break');
break_property('Sentence', 'Sentence_Break');
Expand Down Expand Up @@ -244,13 +243,14 @@ sub apply_to_range {
cluck "Did not get any range in apply_to_range";
}
my $fn = shift;
my ($first, $last) = split '\\.\\.', $range;
$first ||= $range;
$last ||= $first;
my $point = $points_by_hex->{$first};
my ($first_str, $last_str) = split '\\.\\.', $range;
$first_str ||= $range;
$last_str ||= $first_str;
my ($first_code, $last_code) = (hex $first_str, hex $last_str);
my $point = $points_by_code->{$first_code};
if (!$point) { # go backwards to find the last one
# (much faster than going forwards for some reason)
my $code = hex($first) - 1;
my $code = $first_code - 1;
$code-- until ($point = $points_by_code->{$code});
$point = $point->{next_point};
}
Expand All @@ -259,10 +259,10 @@ sub apply_to_range {
$fn->($point);
$last_point = $point;
$point = $point->{next_point};
} while ($point && $point->{code} <= hex $last);
} while ($point && $point->{code} <= $last_code);
#croak "couldn't find code ".sprintf('%x', $last_point->{code} + 1).
# " got ".$point->{code_str}." for range $first..$last"
# unless $last_point->{code} == hex $last;
# " got ".$point->{code_str}." for range $first_str..$last_str"
# unless $last_point->{code} == hex $last_str;
# can't croak there because some ranges end on points that don't exist (Blocks)
}
Expand All @@ -288,7 +288,7 @@ sub binary_props {
sub break_property {
my ($fname, $pname) = @_;
enumerated_property("auxiliary/${fname}BreakProperty",
$pname, { Other => 0 }, 1, 1);
$pname, { Other => 0 }, 1);
}
sub grapheme_cluster_break {
my ($fname, $pname) = @_;
Expand All @@ -297,17 +297,30 @@ sub grapheme_cluster_break {
# Should not be set to Other for this one ?
Other => 0,
}, 1, 1);
}, 1);
}
# Make sure we don't assign twice to the same pvalue code
sub check_base_for_duplicates {
my ($base) = @_;
my %seen;
for my $key (keys %{$base->{enum}}) {
if ($seen{ $base->{enum}->{$key} }) {
die "\nError: assigned twice to the same property value code. Both $key and "
. $seen{ $base->{enum}->{$key} }
. " are assigned to pvalue code "
. $base->{enum}->{$key};
}
$seen{ ($base->{enum}->{$key}) } = $key;
}
}
sub derived_property {
# filename, property name, property object
my ($fname, $pname, $base) = @_;
my $j = 0;
# wrap the provided object as the enum key in a new one
$base = { enum => $base };
# If we provided some property values already, add that number to the counter
$j += (scalar keys %{$base->{enum}});
$j += scalar keys %{$base->{enum}};
each_line("extracted/Derived$fname", sub { $_ = shift;
my ($range, $class) = split /\s*[;#]\s*/;
unless (exists $base->{enum}->{$class}) {
Expand All @@ -324,22 +337,13 @@ sub derived_property {
}
$base->{keys} = \@keys;
$base->{bit_width} = least_int_ge_lg2($j);
my %seen;
# Make sure we don't assign twice to the same pvalue code
for my $key (keys %{$base->{enum}}) {
if ($seen{ $base->{enum}->{$key} }) {
say "\nError: assigned twice to the same property value code. Both $key and "
. $seen{ $base->{enum}->{$key} }
. " are assigned to pvalue code "
. $base->{enum}->{$key};
}
$seen{ ($base->{enum}->{$key}) } = $key;
}
register_enumerated_property($pname, $base);
}
sub enumerated_property {
my ($fname, $pname, $base, $j, $value_index) = @_;
my ($fname, $pname, $base, $value_index) = @_;
my $j = 0;
$j += scalar keys %{$base->{enum}};
$base = { enum => $base };
each_line($fname, sub { $_ = shift;
my @vals = split /\s*[#;]\s*/;
Expand Down Expand Up @@ -1470,9 +1474,9 @@ sub emit_composition_lookup {
# first codepoint of the decomposition of a primary composite, mapped to
# an array of [second codepoint, primary composite].
my @lookup;
for my $point_hex (sort keys %$points_by_hex) {
for my $point_code (sort { $a <=> $b } keys %$points_by_code) {
# Not interested in anything in the set of full composition exclusions.
my $point = $points_by_hex->{$point_hex};
my $point = $points_by_code->{$point_code};
next if $point->{Full_Composition_Exclusion};
# Only interested in things that have a decomposition spec.
Expand All @@ -1495,7 +1499,7 @@ sub emit_composition_lookup {
croak "Invalid codepoint " . $decomp[0]
}
my ($upper, $lower) = (hex(substr($decomp[0], 0, 2)), hex(substr($decomp[0], 2, 2)));
push @{$lookup[$plane]->[$upper]->[$lower]}, hex($decomp[1]), hex($point_hex);
push @{$lookup[$plane]->[$upper]->[$lower]}, hex($decomp[1]), $point_code;
}
# Produce sparse lookup tables.
Expand Down Expand Up @@ -1745,14 +1749,14 @@ sub UnicodeData {
$code_str = uc(sprintf '%04x', $new->{code});
$new->{code_str} = $code_str;
push @{$plane->{points}}, $new;
$points_by_hex->{$new->{code_str}} = $points_by_code->{$new->{code}} =
$points_by_code->{$new->{code}} =
$current = $current->{next_point} = $new;
}
$last_point = $current;
$ideograph_start = 0;
}
push @{$plane->{points}}, $point;
$points_by_hex->{$code_str} = $points_by_code->{$code} = $point;
$points_by_code->{$code} = $point;
if ($last_point) {
$last_point = $last_point->{next_point} = $point;
Expand All @@ -1779,18 +1783,19 @@ sub CaseFolding {
my @simple;
my @grows;
each_line('CaseFolding', sub { $_ = shift;
my ($left, $type, $right) = split /\s*;\s*/;
my ($left_str, $type, $right) = split /\s*;\s*/;
my $left_code = hex $left_str;
return if $type eq 'S' || $type eq 'T';
if ($type eq 'C') {
push @simple, $right;
$points_by_hex->{$left}->{Case_Folding} = $simple_count;
$points_by_code->{$left_code}->{Case_Folding} = $simple_count;
$simple_count++;
$points_by_hex->{$left}->{Case_Folding_simple} = 1;
$points_by_code->{$left_code}->{Case_Folding_simple} = 1;
}
else {
my @parts = split ' ', $right;
push @grows, "{0x".($parts[0]).",0x".($parts[1] || 0).",0x".($parts[2] || 0)."}";
$points_by_hex->{$left}->{Case_Folding} = $grows_count;
$points_by_code->{$left_code}->{Case_Folding} = $grows_count;
$grows_count++;
}
});
Expand All @@ -1812,7 +1817,8 @@ sub SpecialCasing {
my @entries;
each_line('SpecialCasing', sub { $_ = shift;
s/#.+//;
my ($code, $lower, $title, $upper, $cond) = split /\s*;\s*/;
my ($code_str, $lower, $title, $upper, $cond) = split /\s*;\s*/;
my $code = hex $code_str;
return if $cond;
sub threesome {
my @things = split ' ', shift;
Expand All @@ -1823,7 +1829,7 @@ sub SpecialCasing {
" }, { " . threesome($lower) .
" }, { " . threesome($title) .
" } }";
$points_by_hex->{$code}->{Special_Casing} = $count;
$points_by_code->{$code}->{Special_Casing} = $count;
$count++;
});
my $out = "static const MVMint32 SpecialCasing_table[$count][3][3] = {\n {0x0,0x0,0x0},\n "
Expand Down Expand Up @@ -2145,12 +2151,13 @@ sub register_int_property {
}
sub register_enumerated_property {
my ($pname, $obj) = @_;
my ($pname, $base) = @_;
check_base_for_duplicates($base);
croak if exists $enumerated_properties->{$pname};
$all_properties->{$pname} = $enumerated_properties->{$pname} = $obj;
$obj->{name} = $pname;
$obj->{property_index} = $property_index++;
$obj
$all_properties->{$pname} = $enumerated_properties->{$pname} = $base;
$base->{name} = $pname;
$base->{property_index} = $property_index++;
$base
}
main();
Expand Down

0 comments on commit c64d9d9

Please sign in to comment.