Skip to content


Merge pull request #1160 from ZhongnianTao/master
Browse files Browse the repository at this point in the history
Add GB18030 encode, decode, and decodestream support
  • Loading branch information
samcv committed Aug 25, 2019
2 parents 3e715b4 + d13db41 commit ae5a385
Show file tree
Hide file tree
Showing 8 changed files with 11,276 additions and 2 deletions.
3 changes: 3 additions & 0 deletions build/Makefile.in
Expand Up @@ -229,6 +229,7 @@ OBJECTS2 = src/6model/reprs/MVMDLLSym@obj@ \
src/strings/shiftjis@obj@ \
src/strings/shiftjis_codeindex@obj@ \
src/strings/gb2312@obj@ \
src/strings/gb18030@obj@ \
src/math/bigintops@obj@ \
src/profiler/instrument@obj@ \
src/profiler/log@obj@ \
Expand Down Expand Up @@ -395,6 +396,8 @@ HEADERS = src/moar.h \
src/strings/shiftjis_codeindex.h \
src/strings/gb2312.h \
src/strings/gb2312_codeindex.h \
src/strings/gb18030.h \
src/strings/gb18030_codeindex.h \
src/math/bigintops.h \
src/profiler/instrument.h \
src/profiler/log.h \
Expand Down
1 change: 1 addition & 0 deletions src/moar.h
Expand Up @@ -184,6 +184,7 @@ MVM_PUBLIC const MVMint32 MVM_jit_support(void);
#include "strings/shiftjis.h"
#include "strings/unicode_ops.h"
#include "strings/gb2312.h"
#include "strings/gb18030.h"
#include "io/io.h"
#include "io/eventloop.h"
#include "io/syncfile.h"
Expand Down
3 changes: 3 additions & 0 deletions src/strings/decode_stream.c
Expand Up @@ -150,6 +150,9 @@ static MVMuint32 run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVM
case MVM_encoding_type_gb2312:
reached_stopper = MVM_string_gb2312_decodestream(tc, ds, stopper_chars, sep_spec);
case MVM_encoding_type_gb18030:
reached_stopper = MVM_string_gb18030_decodestream(tc, ds, stopper_chars, sep_spec);
if (ds->encoding < MVM_encoding_type_MIN || MVM_encoding_type_MAX < ds->encoding)
MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRIi32, ds->encoding);
Expand Down
346 changes: 346 additions & 0 deletions src/strings/gb18030.c
@@ -0,0 +1,346 @@
#include "moar.h"
#include "gb18030_codeindex.h"

/* Information about GB18030, and the GB18030 to Unicode mapping used here
   (the mapping is the one included in the GNU libiconv package). */

const MVMint32 gb18030_two_byte_lower_bound[126] = {

const MVMint32 gb18030_two_byte_upper_bound[126] = {

/* First-stage plausibility check for a two-byte GB18030 sequence.
 *
 * c_1 is the lead byte and c_2 the trail byte. The lead byte must lie in
 * 0x81..0xFE; the trail byte must lie inside the per-lead-byte window given
 * by gb18030_two_byte_lower_bound / gb18030_two_byte_upper_bound.
 *
 * This only weeds out most invalid pairs. To keep the code simple, the
 * remaining invalid combinations are left to the conversion table lookup.
 *
 * Returns 1 when the pair passes the check, 0 otherwise. */
MVMint32 gb18030_valid_check_len2(MVMint32 c_1, MVMint32 c_2) {
    MVMint32 row;

    /* Anything outside 0x81..0xFE cannot start a two-byte sequence. */
    if (!(0x81 <= c_1 && c_1 <= 0xfe))
        return 0;

    /* The bound tables are indexed from the first valid lead byte. */
    row = c_1 - 0x81;
    if (c_2 < gb18030_two_byte_lower_bound[row])
        return 0;
    return c_2 <= gb18030_two_byte_upper_bound[row];
}

/* Range check for a complete four-byte GB18030 sequence c_1 c_2 c_3 c_4.
 *
 * Two families of sequences are accepted:
 *   - lead 0x81..0x83 (any digit second byte) or lead 0x84 with second byte
 *     0x30: third byte 0x81..0xFE, fourth byte 0x30..0x39;
 *   - lead 0x84 with second byte 0x31: third byte 0x81..0xA4, fourth byte
 *     0x30..0x39.
 *
 * Returns 1 when the four bytes fall in one of those ranges, 0 otherwise. */
MVMint32 gb18030_valid_check_len4(MVMint32 c_1, MVMint32 c_2, MVMint32 c_3, MVMint32 c_4) {
    /* Both accepted families require the last byte to be an ASCII digit. */
    const MVMint32 tail_is_digit = 0x30 <= c_4 && c_4 <= 0x39;

    /* The 0x84 0x31 row has a shorter third-byte range than the others. */
    if (c_1 == 0x84 && c_2 == 0x31)
        return 0x81 <= c_3 && c_3 <= 0xa4 && tail_is_digit;

    if ((0x81 <= c_1 && c_1 <= 0x83) || (c_1 == 0x84 && c_2 == 0x30))
        return 0x30 <= c_2 && c_2 <= 0x39
            && 0x81 <= c_3 && c_3 <= 0xfe
            && tail_is_digit;

    return 0;
}

/* Tells whether c_1 c_2 can be the first two bytes of a four-byte GB18030
 * sequence accepted by gb18030_valid_check_len4: lead 0x81..0x83 followed by
 * a byte in 0x30..0x39, or lead 0x84 followed by 0x30 or 0x31.
 *
 * Returns 1 if so, 0 otherwise. */
MVMint32 gb18030_valid_check_len4_first2(MVMint32 c_1, MVMint32 c_2) {
    /* Lead byte 0x84 is only accepted with second byte 0x30 or 0x31. */
    if (c_1 == 0x84)
        return c_2 == 0x30 || c_2 == 0x31;

    if (0x81 <= c_1 && c_1 <= 0x83)
        return 0x30 <= c_2 && c_2 <= 0x39;

    return 0;
}

/* Decodes a complete in-memory GB18030 byte sequence into a new string.
 *
 * tc          - thread context, passed on to the allocator and helpers.
 * result_type - type object whose REPR allocates the result string.
 * gb18030     - the input bytes.
 * bytes       - number of input bytes.
 *
 * Returns the newly allocated string, marked as 32-bit grapheme storage,
 * with num_graphs set to the number of graphemes written.
 *
 * NOTE(review): this listing looks lossy. Several `result->` accesses have no
 * member name, no closing braces are present, and the error-message string
 * near the end has no visible call around it. Confirm the exact control flow
 * against the upstream file before relying on it. */
MVMString * MVM_string_gb18030_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *gb18030, size_t bytes) {
size_t i, result_graphs;

MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));

result->body.storage_type = MVM_STRING_GRAPHEME_32;
/* Room for one grapheme per input byte. */
result-> = MVM_malloc(sizeof(MVMGrapheme32) * bytes);

result_graphs = 0;

for (i = 0; i < bytes; i++) {
/* Bytes in 0..127 are stored as-is, except that a "\r\n" pair is stored as
 * the single value returned by MVM_nfg_crlf_grapheme(). */
if (0 <= gb18030[i] && gb18030[i] <= 127) {
if (gb18030[i] == '\r' && i + 1 < bytes && gb18030[i + 1] == '\n') {
result->[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
else {
result->[result_graphs++] = gb18030[i];
else {
/* The two-byte form is tried whenever one more byte is available. */
if (i + 1 < bytes) {
/* GB18030 codepoint of length 2 */
MVMuint8 byte1 = gb18030[i];
MVMuint8 byte2 = gb18030[i + 1];
if (gb18030_valid_check_len2(byte1, byte2)) {
MVMGrapheme32 index = gb18030_index_to_cp_len2(byte1, byte2);
/* GB18030_NULL from the table lookup means "no mapping". */
if (index != GB18030_NULL) {
result->[result_graphs++] = index;
/* The four-byte form needs three more bytes after the current one. */
if (i + 3 < bytes) {
/* GB18030 codepoint of length 4 */
MVMuint8 byte1 = gb18030[i];
MVMuint8 byte2 = gb18030[i + 1];
MVMuint8 byte3 = gb18030[i + 2];
MVMuint8 byte4 = gb18030[i + 3];
if (gb18030_valid_check_len4(byte1, byte2, byte3, byte4)) {
MVMGrapheme32 index = gb18030_index_to_cp_len4(byte1, byte2, byte3, byte4);
if (index != GB18030_NULL) {
result->[result_graphs++] = index;
/* Skip the three trailing bytes; the loop's own i++ covers the first. */
i += 3;

/* Message for undecodable input; the call that uses it is not visible in
 * this listing. */
"Error decoding gb18030 string: invalid gb18030 format. Last byte seen was 0x%hhX\n",

result->body.num_graphs = result_graphs;

return result;

/* Incrementally decodes the byte buffers queued on a decode stream as
 * GB18030, appending the decoded graphemes to the stream.
 *
 * tc            - thread context.
 * ds            - decode stream; its byte buffers are read starting at
 *                 bytes_head / bytes_head_pos.
 * stopper_chars - optional pointer to a character count to stop at.
 * seps          - separator spec handed to
 *                 MVM_string_decode_stream_maybe_sep().
 *
 * Returns 0 at once when the stream has no byte buffers, 1 at once when
 * *stopper_chars is 0, and otherwise reached_stopper, which is set to 1 when
 * the separator check matches or *stopper_chars equals total.
 *
 * NOTE(review): this listing looks lossy. No closing braces are present,
 * `goto done` has no visible label, and neither `total` nor `len4_cnt` is
 * ever changed after initialisation in the visible text. Confirm against the
 * upstream file before relying on the exact control flow. */
MVMuint32 MVM_string_gb18030_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *seps) {
MVMint32 count = 0, total = 0;
MVMint32 bufsize;
MVMGrapheme32 *buffer = NULL;
MVMDecodeStreamBytes *cur_bytes = NULL;
MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
MVMint32 last_accept_pos, last_was_cr;
MVMuint32 reached_stopper;

/* Multi-byte state carried from one input byte to the next:
 *   last_was_first_byte / last_codepoint - a lead byte is being held until
 *       its follow-up byte arrives;
 *   is_len4, len4_cnt, len4_byte1..4 - a four-byte sequence is being
 *       collected; len4_cnt is set to 2 once the first two bytes are known. */
MVMint32 last_was_first_byte, is_len4;
MVMint32 last_codepoint;
MVMint32 len4_cnt,len4_byte1, len4_byte2, len4_byte3, len4_byte4;

/* If there's no buffers, we're done. */
if (!ds->bytes_head)
return 0;
last_accept_pos = ds->bytes_head_pos;

/* If we're asked for zero chars, also done. */
if (stopper_chars && *stopper_chars == 0)
return 1;

/* The output chunk size comes from the stream's own size guess. */
bufsize = ds->result_size_guess;
buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));

/* Decode each of the buffers. */
cur_bytes = ds->bytes_head;
last_was_cr = 0;
reached_stopper = 0;

last_was_first_byte = 0;
last_codepoint = 0;

is_len4 = 0;
len4_cnt = 0;
len4_byte1 = 0;
len4_byte2 = 0;
len4_byte3 = 0;
len4_byte4 = 0;

while (cur_bytes) {
/* Process this buffer. */
/* Only the head buffer starts part-way through; later ones start at 0. */
MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
MVMuint8 *bytes = (MVMuint8 *)cur_bytes->bytes;

while (pos < cur_bytes->length) {
MVMGrapheme32 graph;
/* Despite its name, `codepoint` holds one raw input byte here. */
MVMint32 codepoint = (MVMint32) bytes[pos++];

/* Four-byte sequence in progress: store byte 3, then byte 4, then validate
 * all four and map them through the four-byte table. */
if (is_len4) {
if (len4_cnt == 2) {
len4_byte3 = codepoint;
if (len4_cnt == 3) {
len4_byte4 = codepoint;
if (gb18030_valid_check_len4(len4_byte1, len4_byte2, len4_byte3, len4_byte4)) {
graph = gb18030_index_to_cp_len4(len4_byte1, len4_byte2, len4_byte3, len4_byte4);
is_len4 = 0;
} else {
"Error decoding gb18030 string: invalid gb18030 format. Last four bytes seen was 0x%hhX, 0x%hhX, 0x%hhX, 0x%hhX\n",
len4_byte1, len4_byte2, len4_byte3, len4_byte4);
/* Single-byte path, taken only when no lead byte is being held. */
else if (codepoint <= 127 && !last_was_first_byte) {
/* A held-back '\r' followed by '\n' becomes the translated CRLF value;
 * followed by anything else it is emitted as a plain '\r'. */
if (last_was_cr) {
if (codepoint == '\n') {
graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
else {
graph = '\r';
last_was_cr = 0;
else if (codepoint == '\r') {
last_was_cr = 1;
else {
graph = codepoint;
else {
/* Follow-up byte after a held lead byte: either the pair opens a
 * four-byte sequence, or it is looked up in the two-byte table. */
if (last_was_first_byte) {
if (gb18030_valid_check_len4_first2(last_codepoint, codepoint)) {
is_len4 = 1;
len4_byte1 = last_codepoint;
len4_byte2 = codepoint;
len4_cnt = 2;
last_was_first_byte = 0;
graph = gb18030_index_to_cp_len2(last_codepoint, codepoint);
if (graph == GB18030_NULL) {
"Error decoding gb18030 string: invalid gb18030 format. Last two bytes seen was 0x%hhX, 0x%hhX\n",
last_codepoint, codepoint);
last_was_first_byte = 0;
else {
/* Hold this byte as a lead byte until the next byte arrives. */
last_was_first_byte = 1;
last_codepoint = codepoint;

if (count == bufsize) {
/* We filled the buffer. Attach this one to the buffers
* linked list, and continue with a new one. */
MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
count = 0;

/* Record the grapheme and remember how far the input was consumed. */
buffer[count++] = graph;
last_accept_bytes = cur_bytes;
last_accept_pos = pos;

/* NOTE(review): the separator check receives the last raw byte
 * (`codepoint`), not the decoded `graph` -- confirm this is intended. */
if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint) ||
(stopper_chars && *stopper_chars == total)) {
reached_stopper = 1;
goto done;

cur_bytes = cur_bytes -> next;


/* Attach what we successfully parsed as a result buffer, and trim away
* what we chewed through. */
if (count) {
MVM_string_decodestream_add_chars(tc, ds, buffer, count);
else {
MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);

return reached_stopper;

/* Encodes a string as GB18030.
 *
 * tc                 - thread context.
 * str                - string to encode.
 * output_size        - if non-NULL, receives the number of bytes produced
 *                      (the terminating NUL is not counted).
 * start              - first grapheme of the requested range; must lie in
 *                      0..graphs(str).
 * length             - number of graphemes, or -1 for "through the end".
 * replacement        - if non-NULL, its GB18030 encoding is emitted in place
 *                      of every codepoint that has no GB18030 mapping; if
 *                      NULL such a codepoint raises an exception.
 * translate_newlines - passed through to the codepoint iterator.
 *
 * Returns a NUL-terminated buffer obtained from MVM_malloc / MVM_realloc.
 *
 * Throws "start out of range" / "length out of range" for a bad range, and
 * "could not encode codepoint" for an unmappable codepoint when no
 * replacement was supplied.
 *
 * Buffer invariant: the allocation is always result_alloc + 1 bytes, and
 * before anything is written out_pos + <bytes to write> <= result_alloc is
 * ensured, so the terminating NUL always fits.
 *
 * NOTE(review): as in the code this replaces, the fast path copies from the
 * beginning of the string's storage and the iterator is initialised over the
 * whole of `str`; `start` is only used for validation and sizing. Confirm
 * with callers whether real substring support is expected. */
char * MVM_string_gb18030_encode_substr(MVMThreadContext *tc, MVMString *str,
        MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
        MVMint32 translate_newlines) {

    MVMuint32 startu = (MVMuint32)start;
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length);
    MVMuint8 *result = NULL;
    size_t result_alloc;
    MVMuint8 *repl_bytes = NULL;
    MVMuint64 repl_length = 0;

    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length < -1 || start + lengthu > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    /* Encode the replacement once up front. It is encoded without a
     * replacement of its own, so an unencodable replacement throws here. */
    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_gb18030_encode_substr(tc,
            replacement, &repl_length, 0, -1, NULL, translate_newlines);

    /* Initial guess of one byte per grapheme, plus one for the NUL. */
    result_alloc = lengthu;
    result = MVM_malloc(result_alloc + 1);

    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
        /* No encoding needed; directly copy. */
        memcpy(result, str->body.storage.blob_ascii, lengthu);
        result[lengthu] = 0;
        if (output_size)
            *output_size = lengthu;
    }
    else {
        MVMuint32 out_pos = 0;
        MVMCodepointIter ci;
        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);

        while (MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint codepoint = MVM_string_ci_get_codepoint(tc, &ci);
            MVMint64 gb18030_cp = GB18030_NULL;
            size_t need;

            /* Work out how many bytes this codepoint needs before touching
             * the buffer, so one growth check covers every case. */
            if (codepoint <= 0x7F) {
                need = 1;
            }
            else {
                gb18030_cp = gb18030_cp_to_index(codepoint);
                if (gb18030_cp == GB18030_NULL) {
                    if (!replacement) {
                        /* Do not leak the partial output when bailing out. */
                        MVM_free(result);
                        /* %X rather than %hhX: the latter would print only
                         * the low byte of the codepoint. */
                        MVM_exception_throw_adhoc(tc,
                            "Error encoding gb18030 string: could not encode codepoint 0x%X",
                            (unsigned int)codepoint);
                    }
                    need = (size_t)repl_length;
                }
                else {
                    need = gb18030_cp <= 0xffff ? 2 : 4;
                }
            }

            /* Grow with a little headroom. The "+ 1" keeps the NUL slot. */
            if (result_alloc < out_pos + need) {
                result_alloc = out_pos + need + 8;
                result = MVM_realloc(result, result_alloc + 1);
            }

            if (codepoint <= 0x7F) {
                result[out_pos++] = codepoint;
            }
            else if (gb18030_cp == GB18030_NULL) {
                /* Unmappable codepoint: emit the pre-encoded replacement. */
                size_t i;
                for (i = 0; i < repl_length; i++)
                    result[out_pos++] = repl_bytes[i];
            }
            else if (gb18030_cp <= 0xffff) {
                /* Length = 2 */
                result[out_pos++] = gb18030_cp / 256;
                result[out_pos++] = gb18030_cp % 256;
            }
            else {
                /* Length = 4, most significant byte first. */
                result[out_pos++] = (gb18030_cp / 16777216) % 256;
                result[out_pos++] = (gb18030_cp / 65536) % 256;
                result[out_pos++] = (gb18030_cp / 256) % 256;
                result[out_pos++] = gb18030_cp % 256;
            }
        }

        result[out_pos] = 0;
        if (output_size)
            *output_size = out_pos;
    }

    if (repl_bytes) MVM_free(repl_bytes);
    return (char *)result;
}
5 changes: 5 additions & 0 deletions src/strings/gb18030.h
@@ -0,0 +1,5 @@
/* Decodes `bytes` bytes of GB18030 data at `gb18030` into a new string
 * allocated through `result_type`. */
MVMString * MVM_string_gb18030_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *gb18030, size_t bytes);
/* Decodes the byte buffers queued on `ds` as GB18030 and appends the result
 * to the stream. Returns 0 if the stream has no buffers, 1 if *stopper_chars
 * is 0, and otherwise whether a separator or the *stopper_chars count was
 * reached. */
MVMuint32 MVM_string_gb18030_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *seps);
/* Encodes `str` as GB18030 and returns a NUL-terminated buffer obtained from
 * MVM_malloc / MVM_realloc; the byte count is stored in *output_size when
 * that pointer is non-NULL. A `length` of -1 means "through the end". The
 * encoding of `replacement`, when given, is emitted for codepoints that have
 * no GB18030 mapping; without it such a codepoint raises an exception. */
char * MVM_string_gb18030_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines);

0 comments on commit ae5a385

Please sign in to comment.