From 7ffae077283a5b88ca9ed3fa864fd403c1d22b22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Harb=C3=B6ck?= Date: Tue, 2 May 2023 20:03:22 +0200 Subject: [PATCH] Fix character reference parsing Ignore leading zeros and case-insensitive hexadecimal characters. Fixes: https://github.com/NaturalIntelligence/fast-xml-parser/issues/568 --- spec/entities_spec.js | 28 ++++++++++++++++++++++++ src/xmlparser/OrderedObjParser.js | 36 +++++++++++++++---------------- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/spec/entities_spec.js b/spec/entities_spec.js index 41c12f6f..8629f8dd 100644 --- a/spec/entities_spec.js +++ b/spec/entities_spec.js @@ -46,6 +46,34 @@ describe("XMLParser Entities", function() { expect(result).toEqual(expected); }); + it("should parse different entity character reference variants", function() { + const xmlData = ` + + < + < + < + < + < + < + < + < + < + < + `; + + const expected = { + "?xml": "", + "tests": { + "test": ["<", "<", "<", "<", "<", "<", "<", "<", "<", "<"] + } + }; + + const parser = new XMLParser(); + let result = parser.parse(xmlData, true); + + expect(result).toEqual(expected); + }); + it("should parse XML with DOCTYPE without internal DTD", function() { const xmlData = "test"; const expected = { diff --git a/src/xmlparser/OrderedObjParser.js b/src/xmlparser/OrderedObjParser.js index db11a8c0..b5f3961b 100644 --- a/src/xmlparser/OrderedObjParser.js +++ b/src/xmlparser/OrderedObjParser.js @@ -20,26 +20,26 @@ class OrderedObjParser{ this.tagsNodeStack = []; this.docTypeEntities = {}; this.lastEntities = { - "apos" : { regex: /&(apos|#39|#x27);/g, val : "'"}, - "gt" : { regex: /&(gt|#62|#x3E);/g, val : ">"}, - "lt" : { regex: /&(lt|#60|#x3C);/g, val : "<"}, - "quot" : { regex: /&(quot|#34|#x22);/g, val : "\""}, + "apos" : { regex: /&(apos|#0*39|#x0*27);/gi, val : "'"}, + "gt" : { regex: /&(gt|#0*62|#x0*3E);/gi, val : ">"}, + "lt" : { regex: /&(lt|#0*60|#x0*3C);/gi, val : "<"}, + "quot" : { regex: /&(quot|#0*34|#x0*22);/gi, val : "\""}, }; - this.ampEntity = { regex: /&(amp|#38|#x26);/g, val : "&"}; + this.ampEntity = { regex: /&(amp|#0*38|#x0*26);/gi, val : "&"}; this.htmlEntities = { - "space": { regex: /&(nbsp|#160);/g, val: " " }, - // "lt" : { regex: /&(lt|#60);/g, val: "<" }, - // "gt" : { regex: /&(gt|#62);/g, val: ">" }, - // "amp" : { regex: /&(amp|#38);/g, val: "&" }, - // "quot" : { regex: /&(quot|#34);/g, val: "\"" }, - // "apos" : { regex: /&(apos|#39);/g, val: "'" }, - "cent" : { regex: /&(cent|#162);/g, val: "¢" }, - "pound" : { regex: /&(pound|#163);/g, val: "£" }, - "yen" : { regex: /&(yen|#165);/g, val: "¥" }, - "euro" : { regex: /&(euro|#8364);/g, val: "€" }, - "copyright" : { regex: /&(copy|#169);/g, val: "©" }, - "reg" : { regex: /&(reg|#174);/g, val: "®" }, - "inr" : { regex: /&(inr|#8377);/g, val: "₹" }, + "space": { regex: /&(nbsp|#0*160);/gi, val: " " }, + // "lt" : { regex: /&(lt|#0*60);/gi, val: "<" }, + // "gt" : { regex: /&(gt|#0*62);/gi, val: ">" }, + // "amp" : { regex: /&(amp|#0*38);/gi, val: "&" }, + // "quot" : { regex: /&(quot|#0*34);/gi, val: "\"" }, + // "apos" : { regex: /&(apos|#0*39);/gi, val: "'" }, + "cent" : { regex: /&(cent|#0*162);/gi, val: "¢" }, + "pound" : { regex: /&(pound|#0*163);/gi, val: "£" }, + "yen" : { regex: /&(yen|#0*165);/gi, val: "¥" }, + "euro" : { regex: /&(euro|#0*8364);/gi, val: "€" }, + "copyright" : { regex: /&(copy|#0*169);/gi, val: "©" }, + "reg" : { regex: /&(reg|#0*174);/gi, val: "®" }, + "inr" : { regex: /&(inr|#0*8377);/gi, val: "₹" }, }; this.addExternalEntities = addExternalEntities; this.parseXml = parseXml;