forked from MoarVM/MoarVM
/
gb18030.c
346 lines (306 loc) · 13.2 KB
/
gb18030.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#include "moar.h"
#include "gb18030_codeindex.h"
/* Information about GB18030: http://www.gb18030.com/
GB18030 to Unicode Mapping used (included in GNU LIBICONV Package):
ftp://ftp.gnu.org/pub/gnu/libiconv/libiconv-1.7.tar.gz
*/
/* Smallest second byte accepted for a two-byte sequence, per lead byte.
 * Entry k belongs to lead byte 0x81 + k (126 lead bytes, 0x81..0xfe);
 * gb18030_valid_check_len2 reads it as an inclusive lower bound. */
const MVMint32 gb18030_two_byte_lower_bound[126] = {
64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,
64,64,64,64,64,64,161,161,161,161,161,161,161,64,64,64,64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,
64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64,64};
/* Largest second byte accepted for a two-byte sequence, per lead byte.
 * Entry k belongs to lead byte 0x81 + k (126 lead bytes, 0x81..0xfe);
 * gb18030_valid_check_len2 reads it as an inclusive upper bound. */
const MVMint32 gb18030_two_byte_upper_bound[126] = {
254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,
254,254,254,254,254,254,254,254,254,254,254,254,254,252,254,243,246,245,241,233,
239,160,160,160,160,160,160,254,254,254,254,254,254,254,254,254,254,254,254,254,
254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,
254,254,254,254,254,254,249,254,254,254,254,254,254,254,254,254,254,254,254,254,
254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,160,
160,160,160,160,160,159};
/* First-stage plausibility test for a two-byte sequence (c_1, c_2).
 *
 * Returns non-zero when c_1 is a lead byte in 0x81..0xfe and c_2 lies inside
 * the inclusive per-lead-byte window given by the two bound tables; returns
 * 0 otherwise.  The test is deliberately coarse to keep the code free of long
 * if-else chains: the remaining invalid combinations that slip through are
 * dealt with by the conversion table. */
MVMint32 gb18030_valid_check_len2(MVMint32 c_1, MVMint32 c_2) {
    MVMint32 row;

    if (c_1 < 0x81 || c_1 > 0xfe)
        return 0;

    /* Tables are indexed from the first legal lead byte. */
    row = c_1 - 0x81;
    return c_2 >= gb18030_two_byte_lower_bound[row]
        && c_2 <= gb18030_two_byte_upper_bound[row];
}
/* Range test for a four-byte sequence (c_1, c_2, c_3, c_4).
 *
 * Two families of sequences are accepted:
 *   - c_1 in 0x81..0x83, or c_1 == 0x84 with c_2 == 0x30:
 *       c_2 in 0x30..0x39, c_3 in 0x81..0xfe, c_4 in 0x30..0x39
 *   - c_1 == 0x84 with c_2 == 0x31:
 *       c_3 in 0x81..0xa4, c_4 in 0x30..0x39
 * Returns 1 for an accepted sequence and 0 for everything else. */
MVMint32 gb18030_valid_check_len4(MVMint32 c_1, MVMint32 c_2, MVMint32 c_3, MVMint32 c_4) {
    const MVMint32 fourth_ok = (0x30 <= c_4 && c_4 <= 0x39);

    /* Last, partially filled row: 0x84 0x31 .. .. */
    if (c_1 == 0x84 && c_2 == 0x31)
        return (0x81 <= c_3 && c_3 <= 0xa4) && fourth_ok;

    /* Fully populated rows. */
    if ((0x81 <= c_1 && c_1 <= 0x83) || (c_1 == 0x84 && c_2 == 0x30))
        return (0x30 <= c_2 && c_2 <= 0x39)
            && (0x81 <= c_3 && c_3 <= 0xfe)
            && fourth_ok;

    return 0;
}
/* Tells whether (c_1, c_2) can be the first two bytes of a four-byte
 * sequence: lead byte 0x81..0x83 followed by 0x30..0x39, or lead byte 0x84
 * followed by 0x30 or 0x31.  Returns 1 if so, 0 otherwise. */
MVMint32 gb18030_valid_check_len4_first2(MVMint32 c_1, MVMint32 c_2) {
    if (c_1 == 0x84)
        return c_2 == 0x30 || c_2 == 0x31;

    return (0x81 <= c_1 && c_1 <= 0x83) && (0x30 <= c_2 && c_2 <= 0x39);
}
/* Decodes a complete GB18030 byte buffer into a freshly allocated string of
 * result_type with 32-bit grapheme storage.
 *
 *   gb18030  the encoded input
 *   bytes    number of input bytes
 *
 * Bytes in 0..127 are copied through, except that "\r\n" becomes the single
 * CRLF grapheme.  Any other byte is tried first as the start of a two-byte
 * sequence and then as the start of a four-byte sequence; if neither yields a
 * mapping, an ad-hoc exception naming the offending byte is thrown. */
MVMString * MVM_string_gb18030_decode(MVMThreadContext *tc, const MVMObject *result_type, const char *gb18030, size_t bytes) {
    MVMString *str = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
    MVMGrapheme32 *out;
    size_t n_out = 0;
    size_t pos = 0;

    /* One output slot per input byte is always enough. */
    str->body.storage_type = MVM_STRING_GRAPHEME_32;
    str->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32) * bytes);
    out = str->body.storage.blob_32;

    while (pos < bytes) {
        const char cur = gb18030[pos];
        const size_t left = bytes - pos;   /* bytes available from pos on */

        /* Single-byte (ASCII) range. */
        if (0 <= cur && cur <= 127) {
            if (cur == '\r' && left > 1 && gb18030[pos + 1] == '\n') {
                out[n_out++] = MVM_nfg_crlf_grapheme(tc);
                pos += 2;
            }
            else {
                out[n_out++] = cur;
                pos += 1;
            }
            continue;
        }

        /* GB18030 codepoint of length 2 */
        if (left >= 2) {
            const MVMuint8 b1 = gb18030[pos];
            const MVMuint8 b2 = gb18030[pos + 1];
            if (gb18030_valid_check_len2(b1, b2)) {
                const MVMGrapheme32 cp = gb18030_index_to_cp_len2(b1, b2);
                if (cp != GB18030_NULL) {
                    out[n_out++] = cp;
                    pos += 2;
                    continue;
                }
            }
        }

        /* GB18030 codepoint of length 4 */
        if (left >= 4) {
            const MVMuint8 b1 = gb18030[pos];
            const MVMuint8 b2 = gb18030[pos + 1];
            const MVMuint8 b3 = gb18030[pos + 2];
            const MVMuint8 b4 = gb18030[pos + 3];
            if (gb18030_valid_check_len4(b1, b2, b3, b4)) {
                const MVMGrapheme32 cp = gb18030_index_to_cp_len4(b1, b2, b3, b4);
                if (cp != GB18030_NULL) {
                    out[n_out++] = cp;
                    pos += 4;
                    continue;
                }
            }
        }

        /* Neither form matched at this position. */
        MVM_exception_throw_adhoc(tc,
            "Error decoding gb18030 string: invalid gb18030 format. Last byte seen was 0x%hhX\n",
            (MVMuint8)gb18030[pos]);
    }

    str->body.num_graphs = n_out;
    return str;
}
/* Decodes GB18030 bytes queued on a decode stream into graphemes.
 *
 *   ds             stream whose byte buffers are consumed and which receives
 *                  the decoded grapheme buffers
 *   stopper_chars  optional; decoding stops once this many graphemes have
 *                  been produced
 *   seps           separators; decoding stops when
 *                  MVM_string_decode_stream_maybe_sep reports a match
 *
 * Returns 1 when decoding stopped because of a separator or the stopper
 * count (also when zero chars were requested), 0 otherwise.  Bytes belonging
 * to an unfinished sequence (a pending '\r', or a partial two-/four-byte
 * sequence) are not discarded from the stream.
 *
 * Throws an ad-hoc exception on malformed input; the scratch buffer is
 * released first so the error path does not leak. */
MVMuint32 MVM_string_gb18030_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                          const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *seps) {
    MVMint32 count = 0, total = 0;
    MVMint32 bufsize;
    MVMGrapheme32 *buffer = NULL;
    MVMDecodeStreamBytes *cur_bytes = NULL;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos, last_was_cr;
    MVMuint32 reached_stopper;
    MVMint32 last_was_first_byte, is_len4;
    MVMint32 last_codepoint;
    MVMint32 len4_cnt, len4_byte1, len4_byte2, len4_byte3, len4_byte4;

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return 0;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    bufsize = ds->result_size_guess;
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    last_was_cr = 0;
    reached_stopper = 0;
    last_was_first_byte = 0;
    last_codepoint = 0;
    is_len4 = 0;
    len4_cnt = 0;
    len4_byte1 = 0;
    len4_byte2 = 0;
    len4_byte3 = 0;
    len4_byte4 = 0;
    while (cur_bytes) {
        /* Process this buffer. */
        MVMint32 pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        MVMuint8 *bytes = (MVMuint8 *)cur_bytes->bytes;
        while (pos < cur_bytes->length) {
            MVMGrapheme32 graph;
            MVMint32 codepoint = (MVMint32) bytes[pos++];

            if (last_was_cr) {
                /* A '\r' is pending.  It can only have been recorded while no
                 * multi-byte sequence was in progress, so it must be resolved
                 * before this byte is interpreted -- whatever this byte is.
                 * Resolving it only in the ASCII branch would emit the '\r'
                 * after a following multi-byte character. */
                if (codepoint == '\n') {
                    graph = MVM_unicode_normalizer_translated_crlf(tc, &(ds->norm));
                }
                else {
                    /* Lone CR: emit it and re-read the current byte. */
                    graph = '\r';
                    pos--;
                }
                last_was_cr = 0;
            }
            else if (is_len4) {
                /* Collecting bytes three and four of a four-byte sequence. */
                if (len4_cnt == 2) {
                    len4_cnt++;
                    len4_byte3 = codepoint;
                    continue;
                }
                if (len4_cnt == 3) {
                    len4_byte4 = codepoint;
                    graph = gb18030_valid_check_len4(len4_byte1, len4_byte2, len4_byte3, len4_byte4)
                          ? gb18030_index_to_cp_len4(len4_byte1, len4_byte2, len4_byte3, len4_byte4)
                          : GB18030_NULL;
                    /* Same rule as the one-shot decoder: a sequence with no
                     * table mapping is an error, not a grapheme. */
                    if (graph == GB18030_NULL) {
                        MVM_free(buffer);
                        MVM_exception_throw_adhoc(tc,
                            "Error decoding gb18030 string: invalid gb18030 format. Last four bytes seen was 0x%x, 0x%x, 0x%x, 0x%x\n",
                            len4_byte1, len4_byte2, len4_byte3, len4_byte4);
                    }
                    is_len4 = 0;
                }
            }
            else if (codepoint <= 127 && !last_was_first_byte) {
                if (codepoint == '\r') {
                    /* Hold it back until the next byte shows whether this is CRLF. */
                    last_was_cr = 1;
                    continue;
                }
                graph = codepoint;
            }
            else {
                if (last_was_first_byte) {
                    if (gb18030_valid_check_len4_first2(last_codepoint, codepoint)) {
                        /* Start of a four-byte sequence; two more bytes to come. */
                        is_len4 = 1;
                        len4_byte1 = last_codepoint;
                        len4_byte2 = codepoint;
                        len4_cnt = 2;
                        last_was_first_byte = 0;
                        continue;
                    }
                    /* Range-check before touching the conversion table, as the
                     * one-shot decoder does. */
                    graph = gb18030_valid_check_len2(last_codepoint, codepoint)
                          ? gb18030_index_to_cp_len2(last_codepoint, codepoint)
                          : GB18030_NULL;
                    if (graph == GB18030_NULL) {
                        MVM_free(buffer);
                        MVM_exception_throw_adhoc(tc,
                            "Error decoding gb18030 string: invalid gb18030 format. Last two bytes seen was 0x%x, 0x%x\n",
                            last_codepoint, codepoint);
                    }
                    last_was_first_byte = 0;
                }
                else {
                    /* Lead byte of a multi-byte sequence; wait for the next byte. */
                    last_was_first_byte = 1;
                    last_codepoint = codepoint;
                    continue;
                }
            }

            if (count == bufsize) {
                /* We filled the buffer. Attach this one to the buffers
                 * linked list, and continue with a new one. */
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
                count = 0;
            }
            buffer[count++] = graph;
            last_accept_bytes = cur_bytes;
            last_accept_pos = pos;
            total++;
            /* NOTE(review): the separator test is fed the last raw byte
             * (`codepoint`), not the decoded `graph` -- confirm this is the
             * intended behaviour for multi-byte sequences. */
            if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint) ||
                    (stopper_chars && *stopper_chars == total)) {
                reached_stopper = 1;
                goto done;
            }
        }
        cur_bytes = cur_bytes -> next;
    }
  done:

    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. */
    if (count) {
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
    else {
        MVM_free(buffer);
    }
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);

    return reached_stopper;
}
/* Encodes a string as GB18030.
 *
 *   output_size         if non-NULL, receives the number of encoded bytes
 *                       (the terminating NUL is not counted)
 *   start, length       validated against the grapheme count; length == -1
 *                       means "to the end of the string"
 *   replacement         optional; its GB18030 encoding is emitted in place of
 *                       every codepoint that has no mapping
 *   translate_newlines  forwarded to the codepoint iterator
 *
 * Returns a NUL-terminated buffer obtained from MVM_malloc / MVM_realloc.
 * Throws an ad-hoc exception when start or length is out of range, or when a
 * codepoint cannot be encoded and no replacement was supplied.
 *
 * NOTE(review): the ASCII fast path copies from offset 0 of the blob and the
 * general path iterates over all of `str`, so `start` (and, in the general
 * path, `length`) only take part in validation and initial sizing -- confirm
 * against callers whether real substring encoding is expected here. */
char * MVM_string_gb18030_encode_substr(MVMThreadContext *tc, MVMString *str,
                                        MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement,
                                        MVMint32 translate_newlines) {
    MVMuint32 startu = (MVMuint32)start;
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length);
    MVMuint8 *result = NULL;
    size_t result_alloc;
    MVMuint8 *repl_bytes = NULL;
    MVMuint64 repl_length;

    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length < -1 || start + lengthu > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    if (replacement)
        repl_bytes = (MVMuint8 *) MVM_string_gb18030_encode_substr(tc,
            replacement, &repl_length, 0, -1, NULL, translate_newlines);

    /* Invariant from here on: `result` holds result_alloc + 1 bytes, i.e.
     * result_alloc payload bytes plus one for the terminating NUL. */
    result_alloc = lengthu;
    result = MVM_malloc(result_alloc + 1);
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
        /* No encoding needed; directly copy. */
        memcpy(result, str->body.storage.blob_ascii, lengthu);
        result[lengthu] = 0;
        if (output_size)
            *output_size = lengthu;
    }
    else {
        MVMuint32 out_pos = 0;
        MVMCodepointIter ci;
        MVM_string_ci_init(tc, &ci, str, translate_newlines, 0);
        while (MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint codepoint = MVM_string_ci_get_codepoint(tc, &ci);

            /* A codepoint encodes to at most four bytes; make sure all four
             * fit before writing.  Reserving fewer lets a four-byte sequence
             * run past the end of the allocation. */
            if (result_alloc < (size_t)out_pos + 4) {
                result_alloc = (size_t)out_pos + 4 + 8;
                result = MVM_realloc(result, result_alloc + 1);
            }

            if (codepoint <= 0x7F) {
                result[out_pos++] = codepoint;
            }
            else {
                MVMint64 gb18030_cp;
                gb18030_cp = gb18030_cp_to_index(codepoint);
                if (gb18030_cp == GB18030_NULL) {
                    if (replacement) {
                        /* Grow to fit the replacement bytes, keeping the
                         * one-byte NUL slack of the invariant. */
                        if (result_alloc < (size_t)out_pos + repl_length) {
                            result_alloc = (size_t)(out_pos + repl_length);
                            result = MVM_realloc(result, result_alloc + 1);
                        }
                        memcpy(result + out_pos, repl_bytes, repl_length);
                        out_pos += (MVMuint32)repl_length;
                        continue;
                    }
                    else {
                        /* repl_bytes is NULL on this path, so result is the
                         * only allocation to release. */
                        MVM_free(result);
                        MVM_exception_throw_adhoc(tc, "Error encoding gb18030 string: could not encode codepoint 0x%x", codepoint);
                    }
                }
                if (gb18030_cp <= 0xffff) {
                    /* Length = 2 */
                    result[out_pos++] = gb18030_cp / 256;
                    result[out_pos++] = gb18030_cp % 256;
                }
                else {
                    /* Length = 4 */
                    result[out_pos++] = (gb18030_cp / 16777216) % 256;
                    result[out_pos++] = (gb18030_cp / 65536) % 256;
                    result[out_pos++] = (gb18030_cp / 256) % 256;
                    result[out_pos++] = gb18030_cp % 256;
                }
            }
        }
        /* out_pos <= result_alloc always holds, so the NUL is in bounds. */
        result[out_pos] = 0;
        if (output_size)
            *output_size = out_pos;
    }
    if (repl_bytes) MVM_free(repl_bytes);
    return (char *)result;
}