forked from MoarVM/MoarVM
/
gb2312.c
273 lines (239 loc) · 9.7 KB
/
gb2312.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
#include "moar.h"
#include "gb2312_codeindex.h"
/* Decodes a complete GB2312 byte buffer into a newly allocated string.
 *
 *   tc          - current thread context
 *   result_type - type object whose REPR allocates the resulting string
 *   gb2312      - input bytes
 *   bytes       - number of input bytes
 *
 * Bytes 0..127 are taken as ASCII, with a "\r\n" pair collapsed into the
 * single grapheme returned by MVM_nfg_crlf_grapheme. Any other byte must be
 * followed by a second byte above 127; the pair is looked up with
 * gb2312_index_to_cp. Throws an adhoc exception when the second byte is
 * missing or not above 127, or when the pair has no mapping.
 *
 * Storage is sized at one MVMGrapheme32 per input byte, which is an upper
 * bound since every grapheme consumes at least one byte. */
MVMString * MVM_string_gb2312_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *gb2312, size_t bytes) {
    /* Unsigned view of the input so range checks do not depend on whether
     * plain char is signed. */
    const MVMuint8 *src = (const MVMuint8 *)gb2312;
    size_t pos = 0;
    size_t graphs = 0;
    MVMGrapheme32 *out;
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));

    result->body.storage_type = MVM_STRING_GRAPHEME_32;
    result->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
    out = result->body.storage.blob_32;

    while (pos < bytes) {
        const MVMuint8 lead = src[pos];
        MVMuint16 codepoint;
        MVMGrapheme32 mapped;

        if (lead <= 127) {
            /* Single-byte (ASCII) character, possibly the start of CRLF. */
            if (lead == '\r' && pos + 1 < bytes && src[pos + 1] == '\n') {
                out[graphs++] = MVM_nfg_crlf_grapheme(tc);
                pos += 2;
            }
            else {
                out[graphs++] = lead;
                pos += 1;
            }
            continue;
        }

        /* A double-byte character needs a second byte above 127. */
        if (pos + 1 >= bytes || src[pos + 1] <= 127) {
            MVM_exception_throw_adhoc(tc,
                "Error decoding gb2312 string: invalid gb2312 format (two bytes for a gb2312 character). Last byte seen was 0x%hhX\n",
                (MVMuint8)gb2312[pos]);
        }

        codepoint = (MVMuint16)lead * 256 + src[pos + 1];
        mapped = gb2312_index_to_cp(codepoint);
        if (mapped == GB2312_NULL) {
            MVM_exception_throw_adhoc(tc, "Error decoding gb2312 string: could not decode codepoint 0x%x", codepoint);
        }
        out[graphs++] = mapped;
        pos += 2;
    }

    result->body.num_graphs = graphs;
    return result;
}
/* Status codes returned by gb2312_decode_handler. */

/* An ASCII byte arrived while the second byte of a pair was expected. */
#define GB2312_DECODE_FORMAT_EXCEPTION    (-1)
/* The byte was a complete ASCII character; *out holds it. */
#define GB2312_DECODE_ASCII_CODEPOINT     (-2)
/* The byte is the first half of a pair; nothing was produced yet. */
#define GB2312_DECODE_CONTINUE            (-3)
/* The byte completed a pair that has no entry in the lookup table. */
#define GB2312_DECODE_CODEPOINT_EXCEPTION (-4)
/* The byte completed a pair that was mapped; *out holds the result. */
#define GB2312_DECODE_CHINESE_CODEPOINT   (-5)
/* Classifies one input byte for the streaming decoder.
 *
 *   tc                  - thread context (not used in this function)
 *   last_was_first_byte - non-zero when the previous byte was the first half
 *                         of a double-byte character
 *   codepoint           - the byte being examined
 *   last_codepoint      - the previous byte; only read when
 *                         last_was_first_byte is set
 *   out                 - receives the decoded value
 *
 * Returns one of the GB2312_DECODE_* status codes. *out is written for the
 * ASCII and double-byte outcomes; on GB2312_DECODE_CODEPOINT_EXCEPTION it has
 * also been written and holds GB2312_NULL. It is left untouched for
 * GB2312_DECODE_FORMAT_EXCEPTION and GB2312_DECODE_CONTINUE. */
int gb2312_decode_handler(MVMThreadContext *tc, MVMint32 last_was_first_byte,
                          MVMuint16 codepoint, MVMuint16 last_codepoint, MVMGrapheme32 *out) {
    MVMuint16 pair;
    MVMGrapheme32 mapped;

    if (codepoint <= 127) {
        /* An ASCII byte cannot be the second half of a pair. */
        if (last_was_first_byte)
            return GB2312_DECODE_FORMAT_EXCEPTION;
        *out = (MVMGrapheme32)codepoint;
        return GB2312_DECODE_ASCII_CODEPOINT;
    }

    /* High byte with nothing pending: it opens a new pair. */
    if (!last_was_first_byte)
        return GB2312_DECODE_CONTINUE;

    /* High byte completing a pair: combine both halves and look them up. */
    pair = last_codepoint * 256 + codepoint;
    mapped = gb2312_index_to_cp(pair);
    *out = mapped;
    return mapped == GB2312_NULL
        ? GB2312_DECODE_CODEPOINT_EXCEPTION
        : GB2312_DECODE_CHINESE_CODEPOINT;
}
/* Decodes GB2312 bytes queued on a decode stream into graphemes.
 *
 *   tc            - current thread context
 *   ds            - decode stream holding the pending byte buffers
 *   stopper_chars - if non-NULL, stop once this many graphemes were produced
 *   seps          - separator spec; decoding stops after a grapheme for which
 *                   MVM_string_decode_stream_maybe_sep reports a match
 *
 * Returns 0 if the stream has no byte buffers, 1 if zero characters were
 * requested, otherwise 1 when a stopper/separator ended decoding and 0 when
 * the input ran out first.
 *
 * Decoded graphemes are handed to the stream with
 * MVM_string_decodestream_add_chars, and bytes up to the last fully decoded
 * character are discarded from it. A trailing first byte of a pair or a
 * trailing '\r' is not accepted, so it stays in the stream.
 *
 * Throws an adhoc exception on a malformed pair or an unmapped codepoint;
 * the local result buffer is released before throwing. */
MVMuint32 MVM_string_gb2312_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                         const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *seps) {
    MVMint32 count = 0, total = 0;
    MVMint32 bufsize;
    MVMGrapheme32 *buffer = NULL;
    MVMDecodeStreamBytes *cur_bytes = NULL;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos, last_was_cr;
    MVMuint32 reached_stopper;
    MVMint32 last_was_first_byte;
    MVMuint16 last_codepoint;

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return 0;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    bufsize = ds->result_size_guess;
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    last_was_cr = 0;
    reached_stopper = 0;
    last_was_first_byte = 0;
    last_codepoint = 0;
    while (cur_bytes) {
        /* Process this buffer. */
        MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        MVMuint8 *bytes = (MVMuint8 *)cur_bytes->bytes;
        while (pos < cur_bytes->length) {
            MVMGrapheme32 graph;
            MVMuint16 codepoint = (MVMuint16) bytes[pos++];

            if (last_was_cr) {
                /* Resolve a pending '\r' BEFORE decoding the current byte.
                 * If the byte is not '\n' we emit the lone '\r' and step back
                 * exactly one byte so the current byte is decoded from
                 * scratch on the next iteration. Doing this after decoding
                 * would drop a double-byte character that follows '\r' and
                 * rewind into the middle of it. */
                last_was_cr = 0;
                if (codepoint == '\n') {
                    graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
                }
                else {
                    graph = '\r';
                    pos--;
                }
            }
            else {
                int handler_rtrn = gb2312_decode_handler(tc, last_was_first_byte, codepoint, last_codepoint, &graph);
                if (handler_rtrn == GB2312_DECODE_FORMAT_EXCEPTION) {
                    /* The throw does not return, so release our buffer first. */
                    MVM_free(buffer);
                    MVM_exception_throw_adhoc(tc,
                        "Error decoding gb2312 string: invalid gb2312 format (two bytes for a gb2312 character). Last byte seen was 0x%hhX\n",
                        last_codepoint);
                }
                else if (handler_rtrn == GB2312_DECODE_CODEPOINT_EXCEPTION) {
                    MVM_free(buffer);
                    MVM_exception_throw_adhoc(tc, "Error decoding gb2312 string: could not decode codepoint 0x%x",
                        last_codepoint * 256 + codepoint);
                }
                else if (handler_rtrn == GB2312_DECODE_CONTINUE) {
                    /* First half of a pair: remember it and wait for the
                     * second half (which may be in the next buffer). */
                    last_codepoint = codepoint;
                    last_was_first_byte = 1;
                    continue;
                }
                else if (handler_rtrn == GB2312_DECODE_CHINESE_CODEPOINT) {
                    last_was_first_byte = 0;
                }
                else if (codepoint == '\r') {
                    /* ASCII '\r': hold it back until we see whether a '\n'
                     * follows. */
                    last_was_cr = 1;
                    continue;
                }
            }

            if (count == bufsize) {
                /* We filled the buffer. Attach this one to the buffers
                 * linked list, and continue with a new one. */
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
                count = 0;
            }
            buffer[count++] = graph;
            last_accept_bytes = cur_bytes;
            last_accept_pos = pos;
            total++;
            /* Test the grapheme that was actually emitted, not the raw byte:
             * for a double-byte character the byte is only its second half,
             * and for a lone '\r' it is the not-yet-consumed next byte. */
            if (MVM_string_decode_stream_maybe_sep(tc, seps, graph) ||
                    (stopper_chars && *stopper_chars == total)) {
                reached_stopper = 1;
                goto done;
            }
        }
        cur_bytes = cur_bytes->next;
    }
  done:
    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. */
    if (count) {
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
    else {
        MVM_free(buffer);
    }
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);
    return reached_stopper;
}
/* Encodes (part of) a string as GB2312.
 *
 *   tc                 - current thread context
 *   str                - string to encode
 *   output_size        - if non-NULL, receives the number of bytes written,
 *                        not counting the terminating 0 byte
 *   start              - index of the first grapheme; must be within
 *                        0..graphs
 *   length             - number of graphemes, or -1 for "to the end"
 *   replacement        - if non-NULL, its GB2312 encoding is emitted in place
 *                        of every codepoint that has no GB2312 mapping
 *   translate_newlines - passed through to the codepoint iterator and to the
 *                        encoding of the replacement
 *
 * Returns a 0-terminated buffer obtained from MVM_malloc/MVM_realloc; it is
 * not freed here. Throws an adhoc exception for an out-of-range start or
 * length, and for an unencodable codepoint when no replacement was given (the
 * output buffer is freed first in that case). */
char * MVM_string_gb2312_encode_substr(MVMThreadContext *tc, MVMString *str,
                                       MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
                                       MVMint32 translate_newlines) {
    MVMuint32 startu = (MVMuint32)start;
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length);
    MVMuint8 *result = NULL;
    size_t result_alloc;
    MVMuint8 *repl_bytes = NULL;
    MVMuint64 repl_length;

    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length < -1 || start + lengthu > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    /* Encode the replacement once up front; it is itself encoded without a
     * replacement, so an unencodable replacement throws. */
    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_gb2312_encode_substr(tc,
            replacement, &repl_length, 0, -1, NULL, translate_newlines);

    result_alloc = lengthu;
    result = MVM_malloc(result_alloc + 1);

    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
        /* No encoding needed; directly copy. The copy has to begin at the
         * requested start offset: lengthu was computed relative to start and
         * the range check above guarantees start + lengthu is in bounds. */
        memcpy(result, str->body.storage.blob_ascii + startu, lengthu);
        result[lengthu] = 0;
        if (output_size)
            *output_size = lengthu;
    }
    else {
        MVMuint32 out_pos = 0;
        MVMCodepointIter ci;
        /* NOTE(review): this iterator is initialised over the whole string,
         * so start and length do not limit what is encoded on this path.
         * Confirm whether callers ever pass anything but 0 / -1 here. */
        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
        while (MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint codepoint = MVM_string_ci_get_codepoint(tc, &ci);
            /* Keep room for the up to two bytes written per codepoint plus
             * the terminating 0 byte. */
            if (result_alloc <= out_pos + 1) {
                result_alloc += 8;
                result = MVM_realloc(result, result_alloc + 2);
            }
            if (codepoint <= 0x7F) {
                /* ASCII character */
                result[out_pos++] = codepoint;
            }
            else {
                MVMint32 gb2312_cp;
                gb2312_cp = gb2312_cp_to_index(codepoint);
                if (gb2312_cp == GB2312_NULL) {
                    if (replacement) {
                        size_t i;
                        /* Grow by the replacement's size before copying it. */
                        if (result_alloc <= out_pos + repl_length) {
                            result_alloc += repl_length;
                            result = MVM_realloc(result, result_alloc + 1);
                        }
                        for (i = 0; i < repl_length; i++) {
                            result[out_pos++] = repl_bytes[i];
                        }
                        continue;
                    }
                    /* No replacement: the throw does not return, so free the
                     * output first (repl_bytes is NULL on this path). */
                    MVM_free(result);
                    MVM_exception_throw_adhoc(tc, "Error encoding gb2312 string: could not encode codepoint 0x%x", codepoint);
                }
                /* Emit the table value as two bytes, high byte first. */
                result[out_pos++] = gb2312_cp / 256;
                result[out_pos++] = gb2312_cp % 256;
            }
        }
        result[out_pos] = 0;
        if (output_size)
            *output_size = out_pos;
    }
    MVM_free(repl_bytes);
    return (char *)result;
}