-
Notifications
You must be signed in to change notification settings - Fork 193
/
transcode.c
450 lines (389 loc) · 17.6 KB
/
transcode.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/*
* MacRuby implementation of transcode.c.
*
* This file is covered by the Ruby license. See COPYING for more details.
*
* Copyright (C) 2007-2010, Apple Inc. All rights reserved.
* Copyright (C) 1993-2007 Yukihiro Matsumoto
* Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
* Copyright (C) 2000 Information-technology Promotion Agency, Japan
*/
// Notes:
// AFAICT, we need to add support for newline decorators.
#include "ruby/macruby.h"
#include "ruby/encoding.h"
#include "encoding.h"
// Symbols used when reading conversion option hashes: the keys :invalid,
// :undef, :replace and :xml, and the values :replace, :text and :attr.
// All six are interned once in Init_Transcode().
static VALUE sym_invalid;
static VALUE sym_undef;
static VALUE sym_replace;
static VALUE sym_xml;
static VALUE sym_text;
static VALUE sym_attr;
// State wrapped inside every Encoding::Converter instance
// (allocated by rb_econv_alloc, filled in by rb_econv_initialize).
typedef struct rb_econv_s {
// Encoding that input strings are converted from.
rb_encoding_t *source;
// Encoding that output strings are converted to.
rb_encoding_t *destination;
// Behavior selected by the :invalid option (defaults to raising).
transcode_behavior_t invalid_sequence_behavior;
// Behavior selected by the :undef / :xml options (defaults to raising).
transcode_behavior_t undefined_conversion_behavior;
// Zeroed at allocation; not read anywhere else in this file.
transcode_flags_t special_flags;
// Replacement string handed to str_transcode(); rb_econv_convert asserts
// that its encoding equals `destination`.
rb_str_t *replacement;
// Set by #finish; #convert raises once this is true.
bool finished;
} rb_econv_t;
VALUE rb_cEncodingConverter;
// Unwraps the rb_econv_t stored inside an Encoding::Converter object.
static rb_econv_t *
RConverter(VALUE self)
{
    rb_econv_t *unwrapped;

    Data_Get_Struct(self, rb_econv_t, unwrapped);
    return unwrapped;
}
// Allocator for Encoding::Converter: creates a blank converter state (no
// encodings, no replacement string, not finished) and wraps it in an
// instance of klass. The encodings and behaviors are filled in later by
// rb_econv_initialize.
static VALUE
rb_econv_alloc(VALUE klass, SEL sel)
{
    rb_econv_t *state = ALLOC(rb_econv_t);

    state->finished = false;
    state->special_flags = 0;
    state->replacement = NULL;
    state->destination = NULL;
    state->source = NULL;

    // No mark or free callbacks are registered for the wrapped pointer.
    return Data_Wrap_Struct(klass, 0, 0, state);
}
// Encoding::Converter.asciicompat_encoding(arg)
//
// arg is either an Encoding object or something convertible to a String
// naming an encoding. Returns nil when the encoding is unknown or already
// ASCII-compatible, UTF-8 for the UTF-16 / UTF-32 families, and raises
// ConverterNotFoundError for every other ASCII-incompatible encoding.
static VALUE
rb_econv_asciicompat_encoding(VALUE klass, SEL sel, VALUE arg)
{
    rb_encoding_t *candidate = NULL;

    if (CLASS_OF(arg) != rb_cEncoding) {
        // Not an Encoding object: treat it as an encoding name.
        StringValue(arg);
        candidate = rb_enc_find(RSTRING_PTR(arg));
    }
    else {
        candidate = rb_to_encoding(arg);
    }

    if (candidate == NULL || candidate->ascii_compatible) {
        return Qnil;
    }
    if (UTF16_ENC(candidate) || UTF32_ENC(candidate)) {
        return (VALUE)rb_utf8_encoding();
    }

    // TODO: Port MRI's table that maps ASCII-incompatible encodings to compatible ones.
    rb_raise(rb_eConverterNotFoundError, "could not find ASCII-compatible encoding for %s", candidate->public_name);
}
static VALUE rb_econv_convpath(VALUE self, SEL sel);
// Encoding::Converter.search_convpath(*args)
//
// Builds a throwaway converter from the given arguments and returns its
// conversion path.
static VALUE
rb_econv_search_convpath(VALUE klass, SEL sel, int argc, VALUE *argv)
{
    VALUE temporary_converter = rb_class_new_instance(argc, argv, klass);

    return rb_econv_convpath(temporary_converter, sel);
}
// Maps an option value (:replace, :attr, :text or nil) to a transcode
// behavior.
//
// given_symbol: the value read from the options hash.
// otherwise:    the behavior returned when given_symbol is nil.
// name:         option description used in the error message.
//
// Raises ArgumentError for any other non-nil value.
static transcode_behavior_t
symbol_option_with_default(VALUE given_symbol, transcode_behavior_t otherwise, const char *name)
{
    if (given_symbol == sym_replace) {
        return TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
    }
    if (given_symbol == sym_attr) {
        return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
    }
    if (given_symbol == sym_text) {
        return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
    }

    // Anything left that is not nil is an unrecognized option value.
    if (!NIL_P(given_symbol)) {
        rb_raise(rb_eArgError, "unknown value '%s' for option %s", StringValuePtr(given_symbol), name);
    }
    return otherwise;
}
// Reads the :invalid, :undef, :xml and :replace entries of an options hash.
//
// options:                the options Hash.
// behavior_for_invalid:   in/out. Overwritten only when :invalid is present;
//                         otherwise the caller's value is kept.
// behavior_for_undefined: in/out. Overwritten when :undef and/or :xml is
//                         present (with :xml taking precedence); otherwise
//                         the caller's value is kept.
// replacement_str:        out. Set to the :replace string transcoded to
//                         `destination` when :replace is present; left
//                         untouched otherwise.
// destination:            encoding the replacement string is transcoded to.
//
// Callers must initialize both behavior values to their desired defaults
// before calling. Keeping the incoming values (instead of resetting them to
// "raise") is what lets the options-only form of String#encode retain its
// replace-by-default behavior when only some options are supplied.
//
// Raises ArgumentError (via symbol_option_with_default) for unknown values.
static void
parse_conversion_options(VALUE options, transcode_behavior_t *behavior_for_invalid,
    transcode_behavior_t *behavior_for_undefined, rb_str_t **replacement_str,
    rb_encoding_t *destination)
{
    *behavior_for_invalid = symbol_option_with_default(rb_hash_aref(options, sym_invalid),
        *behavior_for_invalid, "invalid-character");

    *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_undef),
        *behavior_for_undefined, "undefined-conversion");

    // Because the API conflates the :xml and :undef options, we pass in the previous setting
    *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml),
        *behavior_for_undefined, "xml-replacement");

    VALUE replacement = rb_hash_aref(options, sym_replace);
    if (!NIL_P(replacement)) {
        *replacement_str = str_simple_transcode(str_need_string(replacement), destination);
    }
}
// Encoding::Converter#initialize(source, destination [, options])
//
// source / destination: anything rb_to_encoding() accepts.
// options:              optional Hash of conversion options (:invalid,
//                       :undef, :xml, :replace). Fixnum flag arguments are
//                       not supported yet.
//
// Raises ArgumentError when options is a Fixnum (unsupported) or is neither
// a Hash nor a Fixnum. Returns self.
static VALUE
rb_econv_initialize(VALUE self, SEL sel, int argc, VALUE *argv)
{
    rb_econv_t *conv = RConverter(self);
    VALUE sourceobj, destobj, options;
    // Two mandatory arguments followed by one optional argument.
    rb_scan_args(argc, argv, "21", &sourceobj, &destobj, &options);

    rb_encoding_t *source = rb_to_encoding(sourceobj);
    rb_encoding_t *destination = rb_to_encoding(destobj);
    rb_str_t *replacement_str = NULL;

    conv->source = source;
    conv->destination = destination;
    conv->invalid_sequence_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
    conv->undefined_conversion_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;

    // Extract the options. This is a hateful, hateful API.
    if (!NIL_P(options)) {
        if (FIXNUM_P(options)) {
            // A user-supplied argument must never abort the interpreter, so
            // this is reported as a regular, rescuable exception.
            rb_raise(rb_eArgError, "fixnum arguments are not supported yet.");
        }
        else if (TYPE(options) == T_HASH) {
            parse_conversion_options(options, &conv->invalid_sequence_behavior,
                &conv->undefined_conversion_behavior, &replacement_str, destination);
        }
        else {
            rb_raise(rb_eArgError, "expected either a hash or a fixnum as the last parameter");
        }
    }

    // Get the default replacement string. For UTF-[8, 16, 32] it's \uFFFD, and for others it's '?'
    if (replacement_str == NULL) {
        replacement_str = replacement_string_for_encoding(destination);
    }
    GC_WB(&conv->replacement, replacement_str);

    return self;
}
// Encoding::Converter#inspect
//
// Produces "#<ClassName: SOURCE to DESTINATION>" from the public names of
// the two encodings.
static VALUE
rb_econv_inspect(VALUE self, SEL sel)
{
    // TODO: make this comply with the MRI output when we add newline decorators
    rb_econv_t *state = RConverter(self);

    return rb_sprintf("#<%s: %s to %s>",
        rb_obj_classname(self),
        state->source->public_name,
        state->destination->public_name);
}
// Encoding::Converter#convpath
//
// In MacRuby every conversion goes through native UTF-16, so the path is
//   [[source_encoding, native UTF-16], [native UTF-16, dest_encoding]]
// with the first pair left out when the source already is native UTF-16.
static VALUE
rb_econv_convpath(VALUE self, SEL sel)
{
    rb_econv_t *state = RConverter(self);
    rb_encoding_t *pivot = rb_encodings[ENCODING_UTF16_NATIVE];
    VALUE path = rb_ary_new2(2);

    if (state->source != pivot) {
        VALUE first_hop = rb_assoc_new((VALUE)state->source, (VALUE)pivot);
        rb_ary_push(path, first_hop);
    }

    VALUE last_hop = rb_assoc_new((VALUE)pivot, (VALUE)state->destination);
    rb_ary_push(path, last_hop);

    return path;
}
// Encoding::Converter#source_encoding: the encoding converted from.
static VALUE
rb_econv_source_encoding(VALUE self, SEL sel)
{
    rb_econv_t *state = RConverter(self);

    return (VALUE)state->source;
}
// Encoding::Converter#destination_encoding: the encoding converted to.
static VALUE
rb_econv_destination_encoding(VALUE self, SEL sel)
{
    rb_econv_t *state = RConverter(self);

    return (VALUE)state->destination;
}
// Since our converter is basically a black box at this point, we'll leave
// the lower-level methods unimplemented.
#define rb_econv_primitive_convert rb_f_notimplement
// Encoding::Converter#convert(str)
//
// Transcodes str from the converter's source encoding to its destination
// encoding, applying the configured invalid/undefined behaviors and
// replacement string. Raises ArgumentError once #finish has been called.
static VALUE
rb_econv_convert(VALUE self, SEL sel, VALUE str)
{
    rb_econv_t *state = RConverter(self);

    if (state->finished) {
        rb_raise(rb_eArgError, "convert() called on a finished stream");
    }

    // The replacement string is expected to already be in the destination encoding.
    assert(state->replacement->encoding == state->destination);

    rb_str_t *input = str_need_string(str);
    rb_str_t *output = str_transcode(input,
        state->source,
        state->destination,
        state->invalid_sequence_behavior,
        state->undefined_conversion_behavior,
        state->replacement);

    return (VALUE)output;
}
// Encoding::Converter#finish
//
// Marks the converter as finished (further #convert calls raise) and
// returns an empty string.
static VALUE
rb_econv_finish(VALUE self, SEL sel)
{
    // TODO: Flesh this out later.
    rb_econv_t *state = RConverter(self);

    state->finished = true;
    return rb_str_new2("");
}
// Low-level converter methods that are not implemented here: each name is
// aliased to rb_f_notimplement and registered under its Ruby method name in
// Init_Transcode().
#define rb_econv_primitive_errinfo rb_f_notimplement
#define rb_econv_insert_output rb_f_notimplement
#define rb_econv_putback rb_f_notimplement
#define rb_econv_last_error rb_f_notimplement
// Encoding::Converter#replacement: the current replacement string.
static VALUE
rb_econv_replacement(VALUE self, SEL sel)
{
    rb_econv_t *state = RConverter(self);

    return (VALUE)state->replacement;
}
// Encoding::Converter#replacement=(str)
//
// Stores str as the converter's replacement string after forcing its
// encoding to the destination encoding. Raises TypeError unless str is a
// String. Returns str.
static VALUE
rb_econv_set_replacement(VALUE self, SEL sel, VALUE str)
{
    // TODO: Should we copy this string? Probably.
    // NOTE(review): the caller's string object is re-tagged in place and then
    // stored, rather than being copied or transcoded; confirm this is intended.
    rb_econv_t *state = RConverter(self);

    if (TYPE(str) != T_STRING) {
        rb_raise(rb_eTypeError, "wrong argument type %s (expected String)", rb_obj_classname(str));
    }

    rb_str_force_encoding(str, state->destination);

    rb_str_t *new_replacement = str_need_string(str);
    GC_WB(&state->replacement, new_replacement);

    return str;
}
/*
* call-seq:
* str.encode(encoding [, options] ) => str
* str.encode(dst_encoding, src_encoding [, options] ) => str
* str.encode([options]) => str
*
* The first form returns a copy of <i>str</i> transcoded
* to encoding +encoding+.
* The second form returns a copy of <i>str</i> transcoded
* from src_encoding to dst_encoding.
* The last form returns a copy of <i>str</i> transcoded to
* <code>Encoding.default_internal</code>.
* By default, the first and second form raise
* Encoding::UndefinedConversionError for characters that are
* undefined in the destination encoding, and
* Encoding::InvalidByteSequenceError for invalid byte sequences
* in the source encoding. The last form by default does not raise
* exceptions but uses replacement strings.
* The <code>options</code> Hash gives details for conversion.
*
* === options
* The hash <code>options</code> can have the following keys:
* :invalid ::
* If the value is <code>:replace</code>, <code>#encode</code> replaces
* invalid byte sequences in <code>str</code> with the replacement character.
* The default is to raise the exception
* :undef ::
* If the value is <code>:replace</code>, <code>#encode</code> replaces
* characters which are undefined in the destination encoding with
* the replacement character.
* :replace ::
* Sets the replacement string to the value. The default replacement
* string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
* :xml ::
* The value must be <code>:text</code> or <code>:attr</code>.
* If the value is <code>:text</code> <code>#encode</code> replaces
* undefined characters with their (upper-case hexadecimal) numeric
* character references. '&', '<', and '>' are converted to "&amp;",
* "&lt;", and "&gt;", respectively.
* If the value is <code>:attr</code>, <code>#encode</code> also quotes
* the replacement result (using '"'), and replaces '"' with "&quot;".
*/
extern rb_encoding_t *default_internal;
// Implementation of String#encode.
//
// argv holds zero, one (destination) or two (destination, source) encoding
// arguments, optionally followed by an options Hash. Returns the transcoded
// string; raises ArgumentError when more than two encoding arguments remain
// once the trailing options Hash has been removed.
static VALUE
rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
{
    // A trailing argument that converts to a Hash (via to_hash) is taken as
    // the options and is not counted among the encoding arguments.
    VALUE opt = Qnil;
    if (argc > 0) {
        opt = rb_check_convert_type(argv[argc - 1], T_HASH, "Hash", "to_hash");
        if (!NIL_P(opt)) {
            argc--;
        }
    }

    rb_str_t *self = RSTR(str);
    rb_str_t *substitute = NULL;
    rb_encoding_t *src_encoding = NULL;
    rb_encoding_t *dst_encoding = NULL;
    transcode_behavior_t on_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
    transcode_behavior_t on_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;

    switch (argc) {
        case 0:
            // No encodings given: convert to the default internal encoding,
            // substituting instead of raising.
            src_encoding = self->encoding;
            dst_encoding = default_internal;
            on_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
            on_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
            break;

        case 1:
            src_encoding = self->encoding;
            dst_encoding = rb_to_encoding(argv[0]);
            break;

        case 2:
            // Note the argument order: destination first, then source.
            dst_encoding = rb_to_encoding(argv[0]);
            src_encoding = rb_to_encoding(argv[1]);
            break;

        default:
            rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
    }

    if (!NIL_P(opt)) {
        parse_conversion_options(opt, &on_invalid, &on_undefined, &substitute, dst_encoding);

        // An explicit :replace string switches undefined conversions over to
        // replacement, unless invalid sequences are already being replaced or
        // a non-raising behavior was chosen for undefined conversions.
        if (substitute != NULL) {
            if (on_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING
                    && on_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION) {
                on_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
            }
        }
    }

    // Fall back to the destination encoding's default replacement string
    // when replacement is requested but no string was supplied.
    if (substitute == NULL) {
        if (on_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING
                || on_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING) {
            substitute = replacement_string_for_encoding(dst_encoding);
        }
    }

    return (VALUE)str_transcode(self, src_encoding, dst_encoding,
        on_invalid, on_undefined, substitute);
}
/*
* call-seq:
* str.encode!(encoding [, options] ) => str
* str.encode!(dst_encoding, src_encoding [, options] ) => str
*
* The first form transcodes the contents of <i>str</i> from
* str.encoding to +encoding+.
* The second form transcodes the contents of <i>str</i> from
* src_encoding to dst_encoding.
* The options Hash gives details for conversion. See String#encode
* for details.
* Returns the string even if no changes were made.
*/
// Implementation of String#encode!: transcodes like String#encode and then
// replaces the receiver's contents with the result. Returns the receiver.
static VALUE
rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
{
    // Check that the receiver may be modified before doing any work.
    rstr_modify(str);

    VALUE transcoded = rstr_encode(str, sel, argc, argv);
    rb_str_t *receiver = RSTR(str);
    str_replace_with_string(receiver, RSTR(transcoded));

    return str;
}
void
Init_Transcode(void)
{
rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1);
rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "alloc", rb_econv_alloc, 0);
rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "asciicompat_encoding", rb_econv_asciicompat_encoding, 1);
rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "search_convpath", rb_econv_search_convpath, -1);
rb_objc_define_method(rb_cEncodingConverter, "initialize", rb_econv_initialize, -1);
rb_objc_define_method(rb_cEncodingConverter, "inspect", rb_econv_inspect, 0);
rb_objc_define_method(rb_cEncodingConverter, "convpath", rb_econv_convpath, 0);
rb_objc_define_method(rb_cEncodingConverter, "source_encoding", rb_econv_source_encoding, 0);
rb_objc_define_method(rb_cEncodingConverter, "destination_encoding", rb_econv_destination_encoding, 0);
rb_objc_define_method(rb_cEncodingConverter, "primitive_convert", rb_econv_primitive_convert, -1);
rb_objc_define_method(rb_cEncodingConverter, "convert", rb_econv_convert, 1);
rb_objc_define_method(rb_cEncodingConverter, "finish", rb_econv_finish, 0);
rb_objc_define_method(rb_cEncodingConverter, "primitive_errinfo", rb_econv_primitive_errinfo, 0);
rb_objc_define_method(rb_cEncodingConverter, "insert_output", rb_econv_insert_output, 1);
rb_objc_define_method(rb_cEncodingConverter, "putback", rb_econv_putback, -1);
rb_objc_define_method(rb_cEncodingConverter, "last_error", rb_econv_last_error, 0);
rb_objc_define_method(rb_cEncodingConverter, "replacement", rb_econv_replacement, 0);
rb_objc_define_method(rb_cEncodingConverter, "replacement=", rb_econv_set_replacement, 1);
sym_invalid = ID2SYM(rb_intern("invalid"));
sym_undef = ID2SYM(rb_intern("undef"));
sym_replace = ID2SYM(rb_intern("replace"));
sym_attr = ID2SYM(rb_intern("attr"));
sym_text = ID2SYM(rb_intern("text"));
sym_xml = ID2SYM(rb_intern("xml"));
// If only these mapped to the internal enums...
rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
#if 0
rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
Init_newline();
#endif
}