Skip to content

Commit

Permalink
Fix incorrect range which causes incorrect matches. Updated range and…
Browse files Browse the repository at this point in the history
… added regression tests.

Relevant lines of the UnicodeData.txt for Unicode 8.0 [1][2]:

01DE;LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON;Lu;0;L;00C4 0304;;;;N;LATIN CAPITAL LETTER A DIAERESIS MACRON;;;01DF;
01DF;LATIN SMALL LETTER A WITH DIAERESIS AND MACRON;Ll;0;L;00E4 0304;;;;N;LATIN SMALL LETTER A DIAERESIS MACRON;;01DE;;01DE
01E0;LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON;Lu;0;L;0226 0304;;;;N;LATIN CAPITAL LETTER A DOT MACRON;;;01E1;
01E1;LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON;Ll;0;L;0227 0304;;;;N;LATIN SMALL LETTER A DOT MACRON;;01E0;;01E0
01E2;LATIN CAPITAL LETTER AE WITH MACRON;Lu;0;L;00C6 0304;;;;N;LATIN CAPITAL LETTER A E MACRON;;;01E3;
01E3;LATIN SMALL LETTER AE WITH MACRON;Ll;0;L;00E6 0304;;;;N;LATIN SMALL LETTER A E MACRON;;01E2;;01E2
01E4;LATIN CAPITAL LETTER G WITH STROKE;Lu;0;L;;;;;N;LATIN CAPITAL LETTER G BAR;;;01E5;
01E5;LATIN SMALL LETTER G WITH STROKE;Ll;0;L;;;;;N;LATIN SMALL LETTER G BAR;;01E4;;01E4
01E6;LATIN CAPITAL LETTER G WITH CARON;Lu;0;L;0047 030C;;;;N;LATIN CAPITAL LETTER G HACEK;;;01E7;
01E7;LATIN SMALL LETTER G WITH CARON;Ll;0;L;0067 030C;;;;N;LATIN SMALL LETTER G HACEK;;01E6;;01E6
01E8;LATIN CAPITAL LETTER K WITH CARON;Lu;0;L;004B 030C;;;;N;LATIN CAPITAL LETTER K HACEK;;;01E9;
01E9;LATIN SMALL LETTER K WITH CARON;Ll;0;L;006B 030C;;;;N;LATIN SMALL LETTER K HACEK;;01E8;;01E8
01EA;LATIN CAPITAL LETTER O WITH OGONEK;Lu;0;L;004F 0328;;;;N;LATIN CAPITAL LETTER O OGONEK;;;01EB;
01EB;LATIN SMALL LETTER O WITH OGONEK;Ll;0;L;006F 0328;;;;N;LATIN SMALL LETTER O OGONEK;;01EA;;01EA
01EC;LATIN CAPITAL LETTER O WITH OGONEK AND MACRON;Lu;0;L;01EA 0304;;;;N;LATIN CAPITAL LETTER O OGONEK MACRON;;;01ED;
01ED;LATIN SMALL LETTER O WITH OGONEK AND MACRON;Ll;0;L;01EB 0304;;;;N;LATIN SMALL LETTER O OGONEK MACRON;;01EC;;01EC
01EE;LATIN CAPITAL LETTER EZH WITH CARON;Lu;0;L;01B7 030C;;;;N;LATIN CAPITAL LETTER YOGH HACEK;;;01EF;
01EF;LATIN SMALL LETTER EZH WITH CARON;Ll;0;L;0292 030C;;;;N;LATIN SMALL LETTER YOGH HACEK;;01EE;;01EE

01F0;LATIN SMALL LETTER J WITH CARON;Ll;0;L;006A 030C;;;;N;LATIN SMALL LETTER J HACEK;;;;               <NO MAPPING>

01F1;LATIN CAPITAL LETTER DZ;Lu;0;L;<compat> 0044 005A;;;;N;;;;01F3;01F2                                DZ (uppercase)
01F2;LATIN CAPITAL LETTER D WITH SMALL LETTER Z;Lt;0;L;<compat> 0044 007A;;;;N;;;01F1;01F3;01F2         DZ (titlecase)
01F3;LATIN SMALL LETTER DZ;Ll;0;L;<compat> 0064 007A;;;;N;;;01F1;;01F2                                  DZ (lowercase)

01F4;LATIN CAPITAL LETTER G WITH ACUTE;Lu;0;L;0047 0301;;;;N;;;;01F5;                                   [3]
01F5;LATIN SMALL LETTER G WITH ACUTE;Ll;0;L;0067 0301;;;;N;;;01F4;;01F4                                 [3]

--

[1] Currently fixing bugs in Unicode 8.0 because the source code claims compliance with Unicode 8.0 at the moment. Will update to Unicode 9.0 later.
[2] These lines in Unicode 8.0 are equivalent to the lines in Unicode 9.0.
[3] Already included in the table as a pair mapping.
  • Loading branch information
dilijev committed Jan 12, 2017
1 parent 4894d24 commit 25049de
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 1 deletion.
2 changes: 1 addition & 1 deletion lib/Parser/CaseInsensitive.cpp
Expand Up @@ -295,7 +295,7 @@ END {
1, MappingSource::UnicodeData, 0x01cc, 0x01cc, -2, -1, 0, 0,
2, MappingSource::UnicodeData, 0x01cd, 0x01dc, -1, 1, 1, 1,
1, MappingSource::UnicodeData, 0x01dd, 0x01dd, -79, 0, 0, 0,
2, MappingSource::UnicodeData, 0x01de, 0x01f5, -1, 1, 1, 1,
2, MappingSource::UnicodeData, 0x01de, 0x01ef, -1, 1, 1, 1,
1, MappingSource::UnicodeData, 0x01f1, 0x01f1, 0, 1, 2, 2,
1, MappingSource::UnicodeData, 0x01f2, 0x01f2, -1, 0, 1, 1,
1, MappingSource::UnicodeData, 0x01f3, 0x01f3, -2, -1, 0, 0,
Expand Down
78 changes: 78 additions & 0 deletions test/es6/regex-unicode-CaseInsensitive.js
@@ -0,0 +1,78 @@
//-------------------------------------------------------------------------------------------------------
// Copyright (C) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.txt file in the project root for full license information.
//-------------------------------------------------------------------------------------------------------

// Checks that `re` matches `str`; on a miss, logs a FAILED line to the console.
//   re        - regular expression under test
//   codepoint - number printed (in hex) in the failure message
//   str       - input string handed to re.test()
// Nothing is returned and nothing is thrown: failures are only reported via console.log.
function assertMatches(re, codepoint, str) {
    if (re.test(str)) {
        return;
    }

    const pattern = re.toString();
    const hex = codepoint.toString(16);
    console.log(`FAILED -- regex: ${pattern} should match codepoint: ${hex}`);
}

// Checks that `re` does NOT match `str`; on an unexpected match, logs a FAILED line.
//   re        - regular expression under test
//   codepoint - number printed (in hex) in the failure message
//   str       - input string handed to re.test()
// Nothing is returned and nothing is thrown: failures are only reported via console.log.
function assertDoesNotMatch(re, codepoint, str) {
    const matched = re.test(str);
    if (!matched) {
        return;
    }

    const pattern = re.toString();
    const hex = codepoint.toString(16);
    console.log(`FAILED -- regex: ${pattern} should not match codepoint: ${hex}`);
}

// Detect regressions in the CaseInsensitive table.
//
// Every row in the tables below is [regex, codepoint, input]. The codepoint is the
// code point held by the input string and is only used to label a failure message.
// Rows are checked in the order they are listed.

// 01BA and 01BB must not match each other under /i.
for (const [re, codepoint, str] of [
    [/\u{01ba}/iu, 0x01bb, "\u01bb"],
    [/\u{01bb}/iu, 0x01ba, "\u01ba"],
]) {
    assertDoesNotMatch(re, codepoint, str);
}

// 01F0 and its neighbor 01F1 must not match each other under /i.
for (const [re, codepoint, str] of [
    [/\u{01f0}/iu, 0x01f1, "\u01f1"],
    [/\u{01f1}/iu, 0x01f0, "\u01f0"],
]) {
    assertDoesNotMatch(re, codepoint, str);
}

// 01F4 and 01F5 (G WITH ACUTE) match each other under /i.
for (const [re, codepoint, str] of [
    [/\u{01f4}/iu, 0x01f5, "\u01f5"],
    [/\u{01f5}/iu, 0x01f4, "\u01f4"],
]) {
    assertMatches(re, codepoint, str);
}

//
// Latin ligature triples DZ WITH CARON, LJ, NJ (01C4-01CC); DZ (01F1-3).
// Within a triple, every member must match every member (itself included) under /i.
//
for (const [re, codepoint, str] of [
    // 01C4-01C6
    [/\u{01c4}/iu, 0x01c4, '\u{01c4}'],
    [/\u{01c4}/iu, 0x01c5, '\u{01c5}'],
    [/\u{01c4}/iu, 0x01c6, '\u{01c6}'],
    [/\u{01c5}/iu, 0x01c4, '\u{01c4}'],
    [/\u{01c5}/iu, 0x01c5, '\u{01c5}'],
    [/\u{01c5}/iu, 0x01c6, '\u{01c6}'],
    [/\u{01c6}/iu, 0x01c4, '\u{01c4}'],
    [/\u{01c6}/iu, 0x01c5, '\u{01c5}'],
    [/\u{01c6}/iu, 0x01c6, '\u{01c6}'],

    // 01C7-01C9
    [/\u{01c7}/iu, 0x01c7, '\u{01c7}'],
    [/\u{01c7}/iu, 0x01c8, '\u{01c8}'],
    [/\u{01c7}/iu, 0x01c9, '\u{01c9}'],
    [/\u{01c9}/iu, 0x01c7, '\u{01c7}'],
    [/\u{01c9}/iu, 0x01c8, '\u{01c8}'],
    [/\u{01c9}/iu, 0x01c9, '\u{01c9}'],
    [/\u{01c8}/iu, 0x01c7, '\u{01c7}'],
    [/\u{01c8}/iu, 0x01c8, '\u{01c8}'],
    [/\u{01c8}/iu, 0x01c9, '\u{01c9}'],

    // 01CA-01CC
    [/\u{01ca}/iu, 0x01ca, '\u{01ca}'],
    [/\u{01ca}/iu, 0x01cb, '\u{01cb}'],
    [/\u{01ca}/iu, 0x01cc, '\u{01cc}'],
    [/\u{01cb}/iu, 0x01ca, '\u{01ca}'],
    [/\u{01cb}/iu, 0x01cb, '\u{01cb}'],
    [/\u{01cb}/iu, 0x01cc, '\u{01cc}'],
    [/\u{01cc}/iu, 0x01ca, '\u{01ca}'],
    [/\u{01cc}/iu, 0x01cb, '\u{01cb}'],
    [/\u{01cc}/iu, 0x01cc, '\u{01cc}'],

    // 01F1-01F3
    [/\u{01f1}/iu, 0x01f1, '\u{01f1}'],
    [/\u{01f1}/iu, 0x01f2, '\u{01f2}'],
    [/\u{01f1}/iu, 0x01f3, '\u{01f3}'],
    [/\u{01f2}/iu, 0x01f2, '\u{01f2}'],
    [/\u{01f2}/iu, 0x01f1, '\u{01f1}'],
    [/\u{01f2}/iu, 0x01f3, '\u{01f3}'],
    [/\u{01f3}/iu, 0x01f1, '\u{01f1}'],
    [/\u{01f3}/iu, 0x01f2, '\u{01f2}'],
    [/\u{01f3}/iu, 0x01f3, '\u{01f3}'],
]) {
    assertMatches(re, codepoint, str);
}

// Printed unconditionally once every check has run; failures appear as FAILED lines above it.
console.log("PASS");
5 changes: 5 additions & 0 deletions test/es6/rlexe.xml
Expand Up @@ -997,6 +997,11 @@
<compile-flags>-args summary -endargs</compile-flags>
</default>
</test>
<test>
<default>
<files>regex-unicode-CaseInsensitive.js</files>
</default>
</test>
<test>
<default>
<files>regex-set.js</files>
Expand Down

0 comments on commit 25049de

Please sign in to comment.