From 7c599bc53b7a3cb74e4e2a87b9bed2929789e145 Mon Sep 17 00:00:00 2001 From: Orgad Shaneh Date: Thu, 7 Nov 2019 15:19:38 +0200 Subject: [PATCH] Change regexp to match the XML spec, supporting Unicode Replace the regexp with one that matches the XML spec[1] (except [\u10000-\uEFFFF]: without the `u` flag a JavaScript regexp parses \u10000 as \u1000 followed by '0', so the class would contain the range 0-\uEFFF and therefore also match digits). Remove the localeRange option, which is no longer needed. [1] https://www.w3.org/TR/xml/#NT-NameStartChar --- spec/validator_utf8_with_BOM_spec.js | 4 +--- spec/x_cyrillic_2j_str_spec.js | 20 +------------------ src/parser.d.ts | 2 -- src/util.js | 17 ++++++++-------- src/validator.js | 30 ++++++++++------------------ src/xmlstr2xmlnode.js | 16 +++++++-------- 6 files changed, 28 insertions(+), 61 deletions(-) diff --git a/spec/validator_utf8_with_BOM_spec.js b/spec/validator_utf8_with_BOM_spec.js index 59b9ad25..e8e52025 100644 --- a/spec/validator_utf8_with_BOM_spec.js +++ b/spec/validator_utf8_with_BOM_spec.js @@ -3,12 +3,10 @@ const validator = require("../src/validator"); describe("XMLParser", function() { - it("should validate xml string with cyrillic characters", function() { const BOM = "\ufeff"; - const options = {localeRange: "a-zA-Zа-яёА-ЯЁ"} let xmlData = BOM + "<КорневаяЗапись><Тэг>ЗначениеValue53456"; - let result = validator.validate(xmlData, options); + let result = validator.validate(xmlData); expect(result).toBe(true); }); diff --git a/spec/x_cyrillic_2j_str_spec.js b/spec/x_cyrillic_2j_str_spec.js index 26d7ba41..8f114c0d 100644 --- a/spec/x_cyrillic_2j_str_spec.js +++ b/spec/x_cyrillic_2j_str_spec.js @@ -1,7 +1,6 @@ "use strict"; const parser = require("../src/parser"); -const validator = require("../src/validator"); describe("XMLParser", function() { @@ -13,29 +12,12 @@ describe("XMLParser", function() { } }; const options = { - localeRange: "а-яёА-ЯЁa-zA-Z", attributeNamePrefix : "@_" } - const result = parser.parse(xmlData, options, { localeRange: "а-яёА-ЯЁa-zA-Z" }); + const result = parser.parse(xmlData, options); 
expect(result).toEqual(expected); // console.log({ expected}) // console.log({ result }) }); - - it("should invalid XML with invalid localRange", function() { - const xmlData = `<КорневаяЗапись><Тэг>ЗначениеValue53456`; - - const expected = { - "code": "InvalidOptions", - "msg": "Invalid localeRange", - "line": 1 - }; - - const result = validator.validate(xmlData , { localeRange: "а-яёА-ЯЁa-zA-Z<" }).err - expect(result).toEqual(expected); - // console.log({ expected}) - // console.log({ result }) - }); - }); diff --git a/src/parser.d.ts b/src/parser.d.ts index ff688847..3ba98910 100644 --- a/src/parser.d.ts +++ b/src/parser.d.ts @@ -11,7 +11,6 @@ type X2jOptions = { trimValues: boolean; cdataTagName: false | string; cdataPositionChar: string; - localeRange: string; parseTrueNumberOnly: boolean; tagValueProcessor: (tagValue: string, tagName: string) => string; attrValueProcessor: (attrValue: string, attrName: string) => string; @@ -20,7 +19,6 @@ type X2jOptions = { type X2jOptionsOptional = Partial<X2jOptions>; type validationOptions = { allowBooleanAttributes: boolean; - localeRange: string; }; type validationOptionsOptional = Partial<validationOptions>; type J2xOptions = { diff --git a/src/util.js b/src/util.js index 97766e4d..a092f247 100644 --- a/src/util.js +++ b/src/util.js @@ -1,5 +1,10 @@ 'use strict'; +const nameStartChar = ':A-Za-z_\\u00C0-\\u00D6\\u00D8-\\u00F6\\u00F8-\\u02FF\\u0370-\\u037D\\u037F-\\u1FFF\\u200C-\\u200D\\u2070-\\u218F\\u2C00-\\u2FEF\\u3001-\\uD7FF\\uF900-\\uFDCF\\uFDF0-\\uFFFD'; +const nameChar = nameStartChar + '\\-.\\d\\u00B7\\u0300-\\u036F\\u203F-\\u2040'; +const nameRegexp = '[' + nameStartChar + '][' + nameChar + ']*' +const regexName = new RegExp('^' + nameRegexp + '$'); + const getAllMatches = function(string, regex) { const matches = []; let match = regex.exec(string); @@ -15,15 +20,11 @@ const getAllMatches = function(string, regex) { return matches; }; -const doesMatch = function(string, regex) { - const match = regex.exec(string); +const isName = 
function(string) { + const match = regexName.exec(string); return !(match === null || typeof match === 'undefined'); }; -const doesNotMatch = function(string, regex) { - return !doesMatch(string, regex); -}; - exports.isExist = function(v) { return typeof v !== 'undefined'; }; @@ -81,6 +82,6 @@ exports.buildOptions = function(options, defaultOptions, props) { return newOptions; }; -exports.doesMatch = doesMatch; -exports.doesNotMatch = doesNotMatch; +exports.isName = isName; exports.getAllMatches = getAllMatches; +exports.nameRegexp = nameRegexp; diff --git a/src/validator.js b/src/validator.js index 0ea6c0f7..d3406f26 100644 --- a/src/validator.js +++ b/src/validator.js @@ -4,10 +4,9 @@ const util = require('./util'); const defaultOptions = { allowBooleanAttributes: false, //A tag can have attributes without any value - localeRange: 'a-zA-Z', }; -const props = ['allowBooleanAttributes', 'localeRange']; +const props = ['allowBooleanAttributes']; //const tagsPattern = new RegExp("<\\/?([\\w:\\-_\.]+)\\s*\/?>","g"); exports.validate = function (xmlData, options) { @@ -16,12 +15,6 @@ exports.validate = function (xmlData, options) { //xmlData = xmlData.replace(/(\r\n|\n|\r)/gm,"");//make it single line //xmlData = xmlData.replace(/(^\s*<\?xml.*?\?>)/g,"");//Remove XML starting tag //xmlData = xmlData.replace(/()/g,"");//Remove DOCTYPE - const localRangeRegex = new RegExp(`[${options.localeRange}]`); - - if (localRangeRegex.test("<#$'\"\\\/:0")) { - return getErrorObject('InvalidOptions', 'Invalid localeRange', 1); - } - const tags = []; let tagFound = false; @@ -32,8 +25,6 @@ exports.validate = function (xmlData, options) { // check for byte order mark (BOM) xmlData = xmlData.substr(1); } - const regxAttrName = new RegExp(`^[${options.localeRange}_][${options.localeRange}0-9\\-\\.:]*$`); - const regxTagName = new RegExp(`^([${options.localeRange}_])[${options.localeRange}0-9\\.\\-_:]*$`); for (let i = 0; i < xmlData.length; i++) { if (xmlData[i] === '<') { //starting 
of tag @@ -78,7 +69,7 @@ exports.validate = function (xmlData, options) { //continue; i--; } - if (!validateTagName(tagName, regxTagName)) { + if (!validateTagName(tagName)) { let msg; if(tagName.trim().length === 0) { msg = "There is an unnecessary space between tag name and backward slash '))|(([\\w:\\-._]*:)?([\\w:\\-._]+))([^>]*)>|((\\/)(([\\w:\\-._]*:)?([\\w:\\-._]+))\\s*>))([^<]*)'; +const regx = + '<((!\\[CDATA\\[([\\s\\S]*?)(]]>))|((NAME:)?(NAME))([^>]*)>|((\\/)(NAME)\\s*>))([^<]*)' + .replace(/NAME/g, util.nameRegexp); //const tagsRegx = new RegExp("<(\\/?[\\w:\\-\._]+)([^>]*)>(\\s*"+cdataRegx+")*([^<]+)?","g"); //const tagsRegx = new RegExp("<(\\/?)((\\w*:)?([\\w:\\-\._]+))([^>]*)>([^<]*)("+cdataRegx+"([^<]*))*([^<]+)?","g"); @@ -32,7 +33,6 @@ const defaultOptions = { trimValues: true, //Trim string values of tag and attributes cdataTagName: false, cdataPositionChar: '\\c', - localeRange: '', tagValueProcessor: function(a, tagName) { return a; }, @@ -58,7 +58,6 @@ const props = [ 'trimValues', 'cdataTagName', 'cdataPositionChar', - 'localeRange', 'tagValueProcessor', 'attrValueProcessor', 'parseTrueNumberOnly', @@ -74,7 +73,6 @@ const getTraversalObj = function(xmlData, options) { const xmlObj = new xmlNode('!xml'); let currentNode = xmlObj; - regx = regx.replace(/\[\\w/g, '[' + options.localeRange + '\\w'); const tagsRegx = new RegExp(regx, 'g'); let tag = tagsRegx.exec(xmlData); let nextTag = tagsRegx.exec(xmlData); @@ -83,7 +81,7 @@ const getTraversalObj = function(xmlData, options) { if (tagType === TagType.CLOSING) { //add parsed data to parent node - if (currentNode.parent && tag[14]) { + if (currentNode.parent && tag[12]) { currentNode.parent.val = util.getValue(currentNode.parent.val) + '' + processTagValue(tag, options, currentNode.parent.tagname); } if (options.stopNodes.length && options.stopNodes.includes(currentNode.tagname)) { @@ -101,14 +99,14 @@ const getTraversalObj = function(xmlData, options) { //for backtracking currentNode.val = 
util.getValue(currentNode.val) + options.cdataPositionChar; //add rest value to parent node - if (tag[14]) { + if (tag[12]) { currentNode.val += processTagValue(tag, options); } } else { currentNode.val = (currentNode.val || '') + (tag[3] || '') + processTagValue(tag, options); } } else if (tagType === TagType.SELF) { - if (currentNode && tag[14]) { + if (currentNode && tag[12]) { currentNode.val = util.getValue(currentNode.val) + '' + processTagValue(tag, options); } @@ -142,7 +140,7 @@ const getTraversalObj = function(xmlData, options) { function processTagValue(parsedTags, options, parentTagName) { const tagName = parsedTags[7] || parentTagName; - let val = parsedTags[14]; + let val = parsedTags[12]; if (val) { if (options.trimValues) { val = val.trim();