Skip to content

Commit

Permalink
rewrite StringUtil::isValidUTF8()
Browse files Browse the repository at this point in the history
The Byte-Order Mark is irrelevant for UTF-8, so I don't know why that comment was added.
  • Loading branch information
ulmus-scott authored and linuxdude42 committed Apr 10, 2022
1 parent 8f4d409 commit b7759e7
Showing 1 changed file with 123 additions and 19 deletions.
142 changes: 123 additions & 19 deletions mythtv/libs/libmythbase/stringutil.cpp
@@ -1,30 +1,134 @@
#include "stringutil.h"

#if QT_VERSION < QT_VERSION_CHECK(6,0,0)
#include <QTextCodec>
#if __has_include(<bit>) // C++20
#include <bit>
#endif

#include <climits> // for CHAR_BIT

#if defined(__cpp_lib_bitops) && __cpp_lib_bitops >= 201907L
using std::countl_one;
#else
#include <QStringDecoder>
// 8 bit LUT based count leading ones
static int countl_one(unsigned char x)
{
#if CHAR_BIT != 8
if (x > 256)
return 8; // works for our purposes even if not correct
#endif
static constexpr int leading_ones[256] =
{
#define REPEAT_4(x) (x), (x), (x), (x)
#define REPEAT_8(x) REPEAT_4(x), REPEAT_4(x)
#define REPEAT_16(x) REPEAT_8(x), REPEAT_8(x)
#define REPEAT_32(x) REPEAT_16(x), REPEAT_16(x)
#define REPEAT_64(x) REPEAT_32(x), REPEAT_32(x)
#define REPEAT_128(x) REPEAT_64(x), REPEAT_64(x)
REPEAT_128(0), REPEAT_64(1), REPEAT_32(2), REPEAT_16(3), REPEAT_8(4), REPEAT_4(5), 6, 6, 7, 8
#undef REPEAT_4
#undef REPEAT_8
#undef REPEAT_16
#undef REPEAT_32
#undef REPEAT_64
#undef REPEAT_128
};
return leading_ones[x];
}
#endif // C++20 feature test macro

bool StringUtil::isValidUTF8(const QByteArray& data)
{
// NOTE: If you have a better way to determine this, then please update this
// Any chosen method MUST be able to identify UTF-8 encoded text without
// using the BOM Byte-Order Mark as that will probably not be present.
#if QT_VERSION < QT_VERSION_CHECK(6,0,0)
QTextCodec::ConverterState state;
QTextCodec *codec = QTextCodec::codecForName("UTF-8");
const QString text = codec->toUnicode(data.constData(), data.size(), &state);
if (state.invalidChars > 0)
return false;
#else
auto toUtf16 = QStringDecoder(QStringDecoder::Utf8);
QString text = toUtf16.decode(data);
if (toUtf16.hasError())
return false;
#endif
const unsigned char* p = (const unsigned char*)data.data();
const unsigned char* const end = p + data.size();
while (p < end)
{
int code_point_length = countl_one(*p);

switch (code_point_length)
{
case 0: p++; continue; // ASCII
case 1: return false; // invalid, continuation byte
case 2:
case 3:
case 4: break;
default: return false;
/* the variable length code is limited to 4 bytes by RFC3629 §3 to match
the range of UTF-16, i.e. the maximum code point is U+10FFFF
*/
}

if (end < code_point_length + p)
{
return false; // truncated codepoint at end
}

// verify each starting byte is followed by the correct number of continuation bytes
switch (code_point_length)
{
case 4:
if (countl_one(p[3]) != 1)
{
return false;
}
[[fallthrough]];
case 3:
if (countl_one(p[2]) != 1)
{
return false;
}
[[fallthrough]];
case 2:
if (countl_one(p[1]) != 1)
{
return false;
}
break;
default: break; // should never be reached
}

// all continuation bytes are in the range 0x80 to 0xBF
switch (code_point_length)
{
case 2:
// overlong encoding of single byte character
if (*p == 0xC0 || *p == 0xC1)
{
return false;
}
break;
case 3:
// U+D800–U+DFFF are invalid; UTF-16 surrogate halves
// 0xED'A0'80 to 0xED'BF'BF
if (p[0] == 0xED && p[1] >= 0xA0)
{
return false;
}
// overlong encoding of 2 byte character
if (p[0] == 0xE0 && p[1] < 0xA0)
{
return false;
}
break;
case 4:
// code points > U+10FFFF are invalid
// U+10FFFF in UTF-8 is 0xF4'8F'BF'BF
// U+110000 in UTF-8 is 0xF4'90'80'80
if (*p > 0xF4 || (p[0] == 0xF4 && p[1] >= 0x90))
{
return false;
}
// overlong encoding of 3 byte character
if (p[0] == 0xF0 && p[1] < 0x90)
{
return false;
}
break;
default: break; // should never be reached
}

p += code_point_length;
}

Q_UNUSED(text);
return true;
}

Expand Down

0 comments on commit b7759e7

Please sign in to comment.