Skip to content

Commit

Permalink
Add support for utf16le and utf16be decodestream
Browse files Browse the repository at this point in the history
There are some issues with takechars, but .lines and .slurp work.
It omits the byte order mark if and only if it is at position 0 in the
file.
  • Loading branch information
samcv committed Sep 17, 2018
1 parent ddde095 commit 792cdd5
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 12 deletions.
6 changes: 6 additions & 0 deletions src/strings/decode_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,12 @@ static MVMuint32 run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVM
case MVM_encoding_type_utf16:
reached_stopper = MVM_string_utf16_decodestream(tc, ds, stopper_chars, sep_spec);
break;
case MVM_encoding_type_utf16be:
reached_stopper = MVM_string_utf16be_decodestream(tc, ds, stopper_chars, sep_spec);
break;
case MVM_encoding_type_utf16le:
reached_stopper = MVM_string_utf16le_decodestream(tc, ds, stopper_chars, sep_spec);
break;
default:
if (ds->encoding < MVM_encoding_type_MIN || MVM_encoding_type_MAX < ds->encoding)
MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRIi32, ds->encoding);
Expand Down
60 changes: 48 additions & 12 deletions src/strings/utf16.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,39 @@

/* Two-byte byte order marks, written as the raw byte sequences that are
 * compared (via memcmp) against the start of a UTF-16 buffer. */
#define BOM_UTF16LE "\xff\xfe"
#define BOM_UTF16BE "\xfe\xff"

/* Values passed as the `endianess` argument of the UTF-16 decode helpers in
 * this file. AUTO is what the plain "utf16" decodestream entry point passes;
 * the utf16le/utf16be entry points pass the matching fixed value. */
#define UTF16_DECODE_BIG_ENDIAN 1
#define UTF16_DECODE_LITTLE_ENDIAN 2
#define UTF16_DECODE_AUTO_ENDIAN 4
MVM_STATIC_INLINE int is_little_endian (MVMuint8 *buf8) {
return memcmp(buf8, BOM_UTF16LE, 2) == 0;
}
MVM_STATIC_INLINE int is_big_endian (MVMuint8 *buf8) {
return memcmp(buf8, BOM_UTF16BE, 2) == 0;
}
/* Forward declaration of the shared decodestream implementation, which is
 * defined further down in this file. The public utf16 / utf16le / utf16be
 * decodestream entry points all forward to it and differ only in the
 * `endianess` value (one of the UTF16_DECODE_* constants) they pass. */
MVMuint32 MVM_string_utf16_decodestream_main(MVMThreadContext *tc, MVMDecodeStream *ds,
const MVMint32 *stopper_chars,
MVMDecodeStreamSeparators *seps, int endianess);
/* Decodestream entry point for the plain "utf16" encoding.
 * Forwards every argument unchanged to the shared implementation, selecting
 * UTF16_DECODE_AUTO_ENDIAN as the mode, and returns that call's result. */
MVMuint32 MVM_string_utf16_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                        const MVMint32 *stopper_chars,
                                        MVMDecodeStreamSeparators *seps) {
    const int mode = UTF16_DECODE_AUTO_ENDIAN;
    MVMuint32 reached_stopper =
        MVM_string_utf16_decodestream_main(tc, ds, stopper_chars, seps, mode);
    return reached_stopper;
}
/* Decodestream entry point for the "utf16le" encoding.
 * Forwards every argument unchanged to the shared implementation, selecting
 * UTF16_DECODE_LITTLE_ENDIAN as the mode, and returns that call's result. */
MVMuint32 MVM_string_utf16le_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                          const MVMint32 *stopper_chars,
                                          MVMDecodeStreamSeparators *seps) {
    const int mode = UTF16_DECODE_LITTLE_ENDIAN;
    MVMuint32 reached_stopper =
        MVM_string_utf16_decodestream_main(tc, ds, stopper_chars, seps, mode);
    return reached_stopper;
}
/* Decodestream entry point for the "utf16be" encoding.
 * Forwards every argument unchanged to the shared implementation, selecting
 * UTF16_DECODE_BIG_ENDIAN as the mode, and returns that call's result. */
MVMuint32 MVM_string_utf16be_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                          const MVMint32 *stopper_chars,
                                          MVMDecodeStreamSeparators *seps) {
    const int mode = UTF16_DECODE_BIG_ENDIAN;
    MVMuint32 reached_stopper =
        MVM_string_utf16_decodestream_main(tc, ds, stopper_chars, seps, mode);
    return reached_stopper;
}
/* mostly from YAML-LibYAML */
/* Decodes using a decodestream. Decodes as far as it can with the input
* buffers, or until a stopper is reached. */
MVMuint32 MVM_string_utf16_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
MVMuint32 MVM_string_utf16_decodestream_main(MVMThreadContext *tc, MVMDecodeStream *ds,
const MVMint32 *stopper_chars,
MVMDecodeStreamSeparators *seps) {
MVMDecodeStreamSeparators *seps, int endianess) {
MVMint32 count = 0, total = 0;
MVMint32 bufsize;
MVMGrapheme32 *buffer;
Expand Down Expand Up @@ -43,9 +69,22 @@ MVMuint32 MVM_string_utf16_decodestream(MVMThreadContext *tc, MVMDecodeStream *d
while (cur_bytes) {
/* Process this buffer. */
MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
unsigned char *bytes = (unsigned char *)cur_bytes->bytes;

for (; pos+1 < cur_bytes->length; pos += 2) {
MVMuint8 *bytes = (unsigned char *)cur_bytes->bytes;
if (ds->abs_byte_pos == 0 && pos + 1 < cur_bytes->length) {
if (is_little_endian(bytes + pos)) {
low = 0;
high = 1;
last_accept_pos = pos;
pos += 2;
}
else if (is_big_endian(bytes + pos)) {
low = 1;
high = 0;
last_accept_pos = pos;
pos += 2;
}
}
for (; pos + 1 < cur_bytes->length; pos += 2) {
MVMuint32 value = (bytes[pos+high] << 8) + bytes[pos+low];
MVMuint32 value2;
MVMGrapheme32 g;
Expand Down Expand Up @@ -101,9 +140,6 @@ MVMuint32 MVM_string_utf16_decodestream(MVMThreadContext *tc, MVMDecodeStream *d

return reached_stopper;
}
#define UTF16_DECODE_BIG_ENDIAN 1
#define UTF16_DECODE_LITTLE_ENDIAN 2
#define UTF16_DECODE_AUTO_ENDIAN 4
static MVMString * MVM_string_utf16_decode_main(MVMThreadContext *tc,
const MVMObject *result_type, char *utf16_chars, size_t bytes, int endianess);
MVMString * MVM_string_utf16be_decode(MVMThreadContext *tc,
Expand All @@ -122,13 +158,13 @@ MVMString * MVM_string_utf16_decode(MVMThreadContext *tc,
int mode = UTF16_DECODE_LITTLE_ENDIAN;
#endif
/* set the byte order if there's a BOM */
if (bytes >= 2) {
if (!memcmp(utf16_chars, BOM_UTF16LE, 2)) {
if (2 <= bytes) {
if (is_little_endian(utf16_chars)) {
mode = UTF16_DECODE_LITTLE_ENDIAN;
utf16_chars += 2;
bytes -= 2;
}
else if (!memcmp(utf16_chars, BOM_UTF16BE, 2)) {
else if (is_big_endian(utf16_chars)) {
mode = UTF16_DECODE_BIG_ENDIAN;
utf16_chars += 2;
bytes -= 2;
Expand Down

0 comments on commit 792cdd5

Please sign in to comment.