Skip to content

Commit

Permalink
Add GB2312 encode, decode and decodestream support
Browse files Browse the repository at this point in the history
  • Loading branch information
ZhongnianTao committed Jul 14, 2019
1 parent defe0a9 commit 73f28d3
Show file tree
Hide file tree
Showing 8 changed files with 3,149 additions and 2 deletions.
3 changes: 3 additions & 0 deletions build/Makefile.in
Expand Up @@ -226,6 +226,7 @@ OBJECTS2 = src/6model/reprs/MVMDLLSym@obj@ \
src/strings/windows1252@obj@ \
src/strings/shiftjis@obj@ \
src/strings/shiftjis_codeindex@obj@ \
src/strings/gb2312@obj@ \
src/math/bigintops@obj@ \
src/profiler/instrument@obj@ \
src/profiler/log@obj@ \
Expand Down Expand Up @@ -390,6 +391,8 @@ HEADERS = src/moar.h \
src/strings/utf16.h \
src/strings/windows1252.h \
src/strings/shiftjis_codeindex.h \
src/strings/gb2312.h \
src/strings/gb2312_codeindex.h \
src/math/bigintops.h \
src/profiler/instrument.h \
src/profiler/log.h \
Expand Down
2 changes: 2 additions & 0 deletions src/moar.h
Expand Up @@ -183,6 +183,8 @@ MVM_PUBLIC const MVMint32 MVM_jit_support(void);
#include "strings/windows1252.h"
#include "strings/shiftjis.h"
#include "strings/unicode_ops.h"
#include "strings/gb2312.h"
#include "strings/gb2312_codeindex.h"
#include "io/io.h"
#include "io/eventloop.h"
#include "io/syncfile.h"
Expand Down
3 changes: 3 additions & 0 deletions src/strings/decode_stream.c
Expand Up @@ -147,6 +147,9 @@ static MVMuint32 run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVM
case MVM_encoding_type_utf16le:
reached_stopper = MVM_string_utf16le_decodestream(tc, ds, stopper_chars, sep_spec);
break;
case MVM_encoding_type_gb2312:
reached_stopper = MVM_string_gb2312_decodestream(tc, ds, stopper_chars, sep_spec);
break;
default:
if (ds->encoding < MVM_encoding_type_MIN || MVM_encoding_type_MAX < ds->encoding)
MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRIi32, ds->encoding);
Expand Down
252 changes: 252 additions & 0 deletions src/strings/gb2312.c
@@ -0,0 +1,252 @@
#include "moar.h"
#include "gb2312_codeindex.h"

/* Decodes a complete buffer of GB2312 bytes into a newly allocated string.
 *
 *   tc           thread context, passed to allocation and exception helpers
 *   result_type  type object whose REPR allocates the resulting MVMString
 *   gb2312       input bytes
 *   bytes        number of input bytes
 *
 * Bytes 0x00-0x7F are taken as ASCII, with "\r\n" folded into the single
 * grapheme returned by MVM_nfg_crlf_grapheme. Any other byte must be followed
 * by a second byte above 0x7F; the pair is looked up with
 * gb2312_index_to_cp. Throws an adhoc exception when the pair has no mapping
 * or when a lead byte has no valid trail byte. */
MVMString * MVM_string_gb2312_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *gb2312, size_t bytes) {
    /* Read the input as unsigned bytes so the ASCII / high-byte tests do not
     * depend on whether plain `char` is signed on this platform. */
    const MVMuint8 *in = (const MVMuint8 *)gb2312;
    size_t i, result_graphs;

    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));

    result->body.storage_type = MVM_STRING_GRAPHEME_32;
    /* Every grapheme consumes at least one input byte, so `bytes` slots are
     * always enough. */
    result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);

    result_graphs = 0;

    for (i = 0; i < bytes; i++) {
        if (in[i] <= 127) {
            /* ASCII character. */
            if (in[i] == '\r' && i + 1 < bytes && in[i + 1] == '\n') {
                result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
                i++; /* the '\n' has been consumed as well */
            }
            else {
                result->body.storage.blob_32[result_graphs++] = in[i];
            }
        }
        else if (i + 1 < bytes && in[i + 1] > 127) {
            /* Two-byte sequence: lead byte in the high half, trail byte in
             * the low half of the 16-bit lookup key. */
            MVMuint16 codepoint = (MVMuint16)in[i] * 256 + in[i + 1];
            MVMGrapheme32 index = gb2312_index_to_cp(codepoint);
            if (index == GB2312_NULL) {
                /* %hX: the key is 16 bits wide; %hhX would print only its
                 * low byte. */
                MVM_exception_throw_adhoc(tc, "Error decoding gb2312 string: could not decode codepoint 0x%hX", codepoint);
            }
            result->body.storage.blob_32[result_graphs++] = index;
            i++; /* skip the trail byte */
        }
        else {
            /* Lead byte at end of input, or followed by an ASCII byte. */
            MVM_exception_throw_adhoc(tc,
                "Error decoding gb2312 string: invalid gb2312 format (two bytes for a gb2312 character). Last byte seen was 0x%hhX\n",
                (MVMuint8)in[i]);
        }
    }

    result->body.num_graphs = result_graphs;

    return result;
}

/* Decodes the GB2312 bytes currently buffered in a decode stream.
 *
 *   tc             thread context
 *   ds             decode stream; decoded graphemes are appended to it with
 *                  MVM_string_decodestream_add_chars and the consumed bytes
 *                  are dropped with MVM_string_decodestream_discard_to
 *   stopper_chars  if non-NULL, stop after this many graphemes
 *   seps           separator spec; decoding stops after a grapheme for which
 *                  MVM_string_decode_stream_maybe_sep returns true
 *
 * Returns 1 if a stopper (character count or possible separator) was reached,
 * 0 otherwise. Throws an adhoc exception on a lead byte followed by an ASCII
 * byte, or on a byte pair that gb2312_index_to_cp cannot map.
 *
 * A lead byte or a '\r' that is still pending when the buffered input runs
 * out is not accepted: last_accept_pos is not advanced past it, so it stays
 * in the stream. */
MVMuint32 MVM_string_gb2312_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                         const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *seps) {
    MVMint32 count = 0, total = 0;
    MVMint32 bufsize;
    MVMGrapheme32 *buffer = NULL;
    MVMDecodeStreamBytes *cur_bytes = NULL;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos;
    MVMint32 last_was_cr = 0;          /* a '\r' was read but not yet emitted */
    MVMuint32 reached_stopper = 0;

    MVMint32 last_was_first_byte = 0;  /* a lead byte is waiting for its trail byte */
    MVMuint16 last_codepoint = 0;      /* that lead byte */

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return 0;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    bufsize = ds->result_size_guess;
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    while (cur_bytes) {
        /* Process this buffer. */
        MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        MVMuint8 *bytes = (MVMuint8 *)cur_bytes->bytes;

        while (pos < cur_bytes->length) {
            MVMGrapheme32 graph;
            MVMuint16 codepoint = (MVMuint16) bytes[pos++];

            if (last_was_cr) {
                /* Resolve a pending CR before looking at anything else, so
                 * that it is emitted ahead of whatever follows it, including
                 * a double-byte character. The two flags are never set at the
                 * same time, so no lead byte can be pending here. */
                if (codepoint == '\n') {
                    graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
                }
                else {
                    graph = '\r';
                    pos--; /* re-read the current byte on the next iteration */
                }
                last_was_cr = 0;
            }
            else if (codepoint <= 127) {
                if (last_was_first_byte) {
                    /* The exception does not return here; release the
                     * working buffer first. */
                    MVM_free(buffer);
                    MVM_exception_throw_adhoc(tc,
                        "Error decoding gb2312 string: invalid gb2312 format (two bytes for a gb2312 character). Last byte seen was 0x%hhX\n",
                        last_codepoint);
                }

                /* Test the byte just read; `graph` has no value yet. */
                if (codepoint == '\r') {
                    last_was_cr = 1;
                    continue;
                }
                graph = codepoint;
            }
            else if (last_was_first_byte) {
                /* Trail byte: combine with the saved lead byte. */
                MVMuint16 combined_codepoint = last_codepoint * 256 + codepoint;
                graph = gb2312_index_to_cp(combined_codepoint);
                if (graph == GB2312_NULL) {
                    MVM_free(buffer);
                    /* %hX: the key is 16 bits wide. */
                    MVM_exception_throw_adhoc(tc, "Error decoding gb2312 string: could not decode codepoint 0x%hX",
                        combined_codepoint);
                }
                last_was_first_byte = 0;
            }
            else {
                /* Lead byte: remember it and wait for the trail byte. */
                last_was_first_byte = 1;
                last_codepoint = codepoint;
                continue;
            }

            if (count == bufsize) {
                /* We filled the buffer. Attach this one to the buffers
                 * linked list, and continue with a new one. */
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
                count = 0;
            }

            buffer[count++] = graph;
            last_accept_bytes = cur_bytes;
            last_accept_pos = pos;
            total++;

            /* Check the decoded grapheme, not the raw byte: for a
             * double-byte character the raw byte is only its trail byte. */
            if (MVM_string_decode_stream_maybe_sep(tc, seps, graph) ||
                    (stopper_chars && *stopper_chars == total)) {
                reached_stopper = 1;
                goto done;
            }
        }

        cur_bytes = cur_bytes->next;
    }

  done:

    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. */
    if (count) {
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
    else {
        MVM_free(buffer);
    }
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);

    return reached_stopper;
}

/* Encodes a string as GB2312 into a newly allocated, NUL-terminated buffer.
 *
 *   tc                  thread context
 *   str                 string to encode
 *   output_size         if non-NULL, receives the number of bytes written
 *                       (not counting the trailing NUL)
 *   start               first grapheme to encode; must be within the string
 *   length              number of graphemes, or -1 for "to the end"
 *   replacement         if non-NULL, its GB2312 encoding is emitted in place
 *                       of any codepoint that has no GB2312 mapping
 *   translate_newlines  passed through to the codepoint iterator
 *
 * Returns the buffer, allocated with MVM_malloc / MVM_realloc. Throws an
 * adhoc exception when start/length are out of range, or when a codepoint
 * cannot be encoded and no replacement was given. */
char * MVM_string_gb2312_encode_substr(MVMThreadContext *tc, MVMString *str,
                                       MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
                                       MVMint32 translate_newlines) {

    MVMuint32 startu = (MVMuint32)start;
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length);
    MVMuint8 *result = NULL;
    size_t result_alloc;
    MVMuint8 *repl_bytes = NULL;
    MVMuint64 repl_length = 0;

    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length < -1 || start + lengthu > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    /* Encode the replacement once up front. It is encoded without a
     * replacement of its own, so an unencodable replacement throws. */
    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_gb2312_encode_substr(tc,
            replacement, &repl_length, 0, -1, NULL, translate_newlines);

    /* Initial guess of one byte per grapheme, plus room for the NUL. */
    result_alloc = lengthu;
    result = MVM_malloc(result_alloc + 1);

    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
        /* No encoding needed; directly copy the requested window, starting
         * at `start` (not at the beginning of the string). */
        memcpy(result, str->body.storage.blob_ascii + startu, lengthu);
        result[lengthu] = 0;
        if (output_size)
            *output_size = lengthu;
    }
    else {
        MVMuint32 out_pos = 0;
        MVMCodepointIter ci;
        /* NOTE(review): the iterator is initialised on the whole of `str`
         * and is given neither `start` nor `length`, so this path appears to
         * encode the entire string - confirm whether callers rely on that. */
        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);

        while (MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint codepoint = MVM_string_ci_get_codepoint(tc, &ci);
            /* Keep room for a two-byte character plus the final NUL. */
            if (result_alloc <= out_pos + 1) {
                result_alloc += 8;
                result = MVM_realloc(result, result_alloc + 2);
            }
            if (codepoint <= 0x7F) {
                /* ASCII character */
                result[out_pos++] = codepoint;
            }
            else {
                MVMint32 gb2312_cp;
                gb2312_cp = gb2312_cp_to_index(codepoint);
                if (gb2312_cp == GB2312_NULL) {
                    if (replacement) {
                        size_t i;
                        if (result_alloc <= out_pos + repl_length) {
                            result_alloc += repl_length;
                            result = MVM_realloc(result, result_alloc + 1);
                        }
                        for (i = 0; i < repl_length; i++) {
                            result[out_pos++] = repl_bytes[i];
                        }
                        continue;
                    }
                    else {
                        MVM_free(result);
                        /* %X on the full value: a codepoint needs more than
                         * the 8 bits that %hhX would print. */
                        MVM_exception_throw_adhoc(tc, "Error encoding gb2312 string: could not encode codepoint 0x%X", (unsigned int)codepoint);
                    }
                }
                /* Emit lead byte, then trail byte. */
                result[out_pos++] = gb2312_cp / 256;
                result[out_pos++] = gb2312_cp % 256;
            }
        }
        result[out_pos] = 0;
        if (output_size)
            *output_size = out_pos;
    }
    if (repl_bytes) MVM_free(repl_bytes);
    return (char *)result;
}


5 changes: 5 additions & 0 deletions src/strings/gb2312.h
@@ -0,0 +1,5 @@
/* Decodes `bytes` bytes of GB2312 data at `gb2312` into a newly allocated
 * string whose type is `result_type`. Throws on malformed or unmappable input. */
MVMString * MVM_string_gb2312_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *gb2312, size_t bytes);
/* Decodes the GB2312 bytes buffered in `ds`, appending the decoded graphemes
 * to it. Returns non-zero once `*stopper_chars` graphemes have been produced
 * or a possible separator from `seps` has been seen. */
MVMuint32 MVM_string_gb2312_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *seps);
/* Encodes `str` as GB2312 into a newly allocated, NUL-terminated buffer and
 * stores the byte count in `*output_size` when it is non-NULL. `length` may be
 * -1 for "to the end"; a non-NULL `replacement` is substituted for codepoints
 * that have no GB2312 mapping, otherwise such codepoints throw. */
char * MVM_string_gb2312_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines);


0 comments on commit 73f28d3

Please sign in to comment.