Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 451 lines (389 sloc) 17.98 kb
ffe45d2 Add support for Encoding::Converter and move String#encode and String…
Patrick Thomson authored
1 /*
2 * MacRuby implementation of transcode.c.
3 *
4 * This file is covered by the Ruby license. See COPYING for more details.
5 *
6 * Copyright (C) 2007-2010, Apple Inc. All rights reserved.
7 * Copyright (C) 1993-2007 Yukihiro Matsumoto
8 * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
9 * Copyright (C) 2000 Information-technology Promotion Agency, Japan
10 */
11
12 // Notes:
13 // AFAICT, we need to add support for newline decorators.
14
468a2ea Move Obj-C related headers around.
Thibault Martin-Lagardette authored
15 #include "ruby/macruby.h"
ffe45d2 Add support for Encoding::Converter and move String#encode and String…
Patrick Thomson authored
16 #include "ruby/encoding.h"
17 #include "encoding.h"
18
19 static VALUE sym_invalid;
20 static VALUE sym_undef;
21 static VALUE sym_replace;
22 static VALUE sym_xml;
23 static VALUE sym_text;
24 static VALUE sym_attr;
25
26 typedef struct rb_econv_s {
27 rb_encoding_t *source;
28 rb_encoding_t *destination;
29 transcode_behavior_t invalid_sequence_behavior;
30 transcode_behavior_t undefined_conversion_behavior;
31 transcode_flags_t special_flags;
32 rb_str_t *replacement;
33 bool finished;
34 } rb_econv_t;
35
36 VALUE rb_cEncodingConverter;
37
38 static rb_econv_t* RConverter(VALUE self) {
39 rb_econv_t *conv;
40 Data_Get_Struct(self, rb_econv_t, conv);
41 return conv;
42 }
43
44 static VALUE
45 rb_econv_alloc(VALUE klass, SEL sel)
46 {
47 rb_econv_t *conv = ALLOC(rb_econv_t);
48 conv->source = NULL;
49 conv->destination = NULL;
50 conv->replacement = NULL;
51 conv->special_flags = 0;
52 conv->finished = false;
53 return Data_Wrap_Struct(klass, 0, 0, conv);
54 }
55
56 static VALUE
57 rb_econv_asciicompat_encoding(VALUE klass, SEL sel, VALUE arg)
58 {
59 rb_encoding_t *enc = NULL;
60 if (CLASS_OF(arg) == rb_cEncoding) {
61 enc = rb_to_encoding(arg);
62 }
63 else {
64 StringValue(arg);
65 enc = rb_enc_find(RSTRING_PTR(arg));
66 }
67
68 if ((enc == NULL) || (enc->ascii_compatible)) {
69 return Qnil;
70 }
71 else if (UTF16_ENC(enc) || UTF32_ENC(enc)) {
72 return (VALUE)rb_utf8_encoding();
73 }
74 // TODO: Port MRI's table that maps ASCII-incompatible encodings to compatible ones.
75 rb_raise(rb_eConverterNotFoundError, "could not find ASCII-compatible encoding for %s", enc->public_name);
76 }
77
78 static VALUE rb_econv_convpath(VALUE self, SEL sel);
79
80 static VALUE
81 rb_econv_search_convpath(VALUE klass, SEL sel, int argc, VALUE* argv)
82 {
83 return rb_econv_convpath(rb_class_new_instance(argc, argv, klass), sel);
84 }
85
86 static transcode_behavior_t
87 symbol_option_with_default(VALUE given_symbol, transcode_behavior_t otherwise, const char* name)
88 {
89 if (given_symbol == sym_replace) {
90 return TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
91 }
92 else if (given_symbol == sym_attr) {
93 return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
94 }
95 else if (given_symbol == sym_text) {
96 return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
97 }
98 else if (!NIL_P(given_symbol)) {
99 rb_raise(rb_eArgError, "unknown value '%s' for option %s", StringValuePtr(given_symbol), name);
100 }
101 return otherwise;
102 }
103
104 static void parse_conversion_options(VALUE options, transcode_behavior_t* behavior_for_invalid,
105 transcode_behavior_t* behavior_for_undefined, rb_str_t** replacement_str, rb_encoding_t* destination)
106 {
107
108 *behavior_for_invalid = symbol_option_with_default(rb_hash_aref(options, sym_invalid),
109 TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "invalid-character");
110
111 *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_undef),
112 TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "undefined-conversion");
113
114 // Because the API conflates the :xml and :undef options, we pass in the previous setting
115 *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml),
116 *behavior_for_undefined, "xml-replacement");
117
118 *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml),
119 *behavior_for_undefined, "xml-replacement");
120
121 VALUE replacement = rb_hash_aref(options, sym_replace);
122 if (!NIL_P(replacement)) {
123 *replacement_str = str_simple_transcode(str_need_string(replacement), destination);
124 }
125
126 }
127
128 static VALUE
129 rb_econv_initialize(VALUE self, SEL sel, int argc, VALUE* argv)
130 {
131 rb_econv_t *conv = RConverter(self);
132 VALUE sourceobj, destobj, options;
133 rb_scan_args(argc, argv, "21", &sourceobj, &destobj, &options);
134
135 rb_encoding_t* source = rb_to_encoding(sourceobj);
136 rb_encoding_t* destination = rb_to_encoding(destobj);
137 rb_str_t* replacement_str = NULL;
138
139 conv->source = source;
140 conv->destination = destination;
141
142 conv->invalid_sequence_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
143 conv->undefined_conversion_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
144
145 // Extract the options. This is a hateful, hateful API.
146 if (!NIL_P(options)) {
147
148 if (FIXNUM_P(options)) {
149 rb_bug("fixnum arguments are not supported yet.");
150 }
151 else if (TYPE(options) == T_HASH) {
152 parse_conversion_options(options, &conv->invalid_sequence_behavior,
153 &conv->undefined_conversion_behavior, &replacement_str, destination);
154 }
155 else {
156 rb_raise(rb_eArgError, "expected either a hash or a fixnum as the last parameter");
157 }
158 }
159
160 // Get the default replacement string. For UTF-[8, 16, 32] it's /uFFFD, and for others it's '?'
161 if (replacement_str == NULL) {
162 replacement_str = replacement_string_for_encoding(destination);
163 }
164 GC_WB(&conv->replacement, replacement_str);
165
166 return self;
167 }
168
169 static VALUE
170 rb_econv_inspect(VALUE self, SEL sel)
171 {
172 // TODO: make this comply with the MRI output when we add newline decorators
173 rb_econv_t *conv = RConverter(self);
174 return rb_sprintf("#<%s: %s to %s>", rb_obj_classname(self), conv->source->public_name,
175 conv->destination->public_name);
176 }
177
178 static VALUE
179 rb_econv_convpath(VALUE self, SEL sel)
180 {
181 // in MacRuby, the convpath always looks like this:
182 // [[source_encoding, native UTF-16], [native UTF-16, dest_encoding]]
183 // The first element is omitted if the source encoding is UTF-16, obviously.
184 rb_econv_t *conv = RConverter(self);
185 VALUE to_return = rb_ary_new2(2);
186 rb_encoding_t* nativeUTF16 = rb_encodings[ENCODING_UTF16_NATIVE];
187
188 if (conv->source != nativeUTF16) {
189 rb_ary_push(to_return, rb_assoc_new((VALUE)conv->source, (VALUE)nativeUTF16));
190 }
191
192 rb_ary_push(to_return, rb_assoc_new((VALUE)nativeUTF16, (VALUE)conv->destination));
193
194 return to_return;
195 }
196
197 static VALUE
198 rb_econv_source_encoding(VALUE self, SEL sel)
199 {
200 return (VALUE)(RConverter(self)->source);
201 }
202
203 static VALUE
204 rb_econv_destination_encoding(VALUE self, SEL sel)
205 {
206 return (VALUE)(RConverter(self)->destination);
207 }
208
209 // Since our converter is basically a black box at this point, we'll leave
210 // the lower-level methods unimplemented.
211 #define rb_econv_primitive_convert rb_f_notimplement
212
213 static VALUE
214 rb_econv_convert(VALUE self, SEL sel, VALUE str)
215 {
216 rb_econv_t *conv;
217 Data_Get_Struct(self, rb_econv_t, conv);
218
219 if (conv->finished) {
220 rb_raise(rb_eArgError, "convert() called on a finished stream");
221 }
222
223 assert(conv->replacement->encoding == conv->destination);
224 return (VALUE)str_transcode(str_need_string(str), conv->source, conv->destination, conv->invalid_sequence_behavior, conv->undefined_conversion_behavior, conv->replacement);
225 }
226
227 static VALUE
228 rb_econv_finish(VALUE self, SEL sel)
229 {
230 // TODO: Flesh this out later.
231 RConverter(self)->finished = true;
232 return rb_str_new2("");
233 }
234
235 #define rb_econv_primitive_errinfo rb_f_notimplement
236
237 #define rb_econv_insert_output rb_f_notimplement
238
239 #define rb_econv_putback rb_f_notimplement
240
241 #define rb_econv_last_error rb_f_notimplement
242
243 static VALUE
244 rb_econv_replacement(VALUE self, SEL sel)
245 {
246 return (VALUE)(RConverter(self)->replacement);
247 }
248
249 static VALUE
250 rb_econv_set_replacement(VALUE self, SEL sel, VALUE str)
251 {
252 // TODO: Should we copy this string? Probably.
253 rb_econv_t *conv = RConverter(self);
254 if (TYPE(str) != T_STRING) {
255 rb_raise(rb_eTypeError, "wrong argument type %s (expected String)", rb_obj_classname(str));
256 }
257 rb_str_force_encoding(str, conv->destination);
258 GC_WB(&conv->replacement, str_need_string(str));
259 return str;
260 }
261
262 /*
263 * call-seq:
264 * str.encode(encoding [, options] ) => str
265 * str.encode(dst_encoding, src_encoding [, options] ) => str
266 * str.encode([options]) => str
267 *
268 * The first form returns a copy of <i>str</i> transcoded
269 * to encoding +encoding+.
270 * The second form returns a copy of <i>str</i> transcoded
271 * from src_encoding to dst_encoding.
272 * The last form returns a copy of <i>str</i> transcoded to
273 * <code>Encoding.default_internal</code>.
274 * By default, the first and second form raise
275 * Encoding::UndefinedConversionError for characters that are
276 * undefined in the destination encoding, and
277 * Encoding::InvalidByteSequenceError for invalid byte sequences
278 * in the source encoding. The last form by default does not raise
279 * exceptions but uses replacement strings.
280 * The <code>options</code> Hash gives details for conversion.
281 *
282 * === options
283 * The hash <code>options</code> can have the following keys:
284 * :invalid ::
285 * If the value is <code>:replace</code>, <code>#encode</code> replaces
286 * invalid byte sequences in <code>str</code> with the replacement character.
287 * The default is to raise the exception
288 * :undef ::
289 * If the value is <code>:replace</code>, <code>#encode</code> replaces
290 * characters which are undefined in the destination encoding with
291 * the replacement character.
292 * :replace ::
293 * Sets the replacement string to the value. The default replacement
294 * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
295 * :xml ::
296 * The value must be <code>:text</code> or <code>:attr</code>.
297 * If the value is <code>:text</code> <code>#encode</code> replaces
298 * undefined characters with their (upper-case hexadecimal) numeric
299 * character references. '&', '<', and '>' are converted to "&amp;",
300 * "&lt;", and "&gt;", respectively.
301 * If the value is <code>:attr</code>, <code>#encode</code> also quotes
302 * the replacement result (using '"'), and replaces '"' with "&quot;".
303 */
304 extern rb_encoding_t *default_internal;
305 static VALUE
306 rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
307 {
308 VALUE opt = Qnil;
309 if (argc > 0) {
310 opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
311 if (!NIL_P(opt)) {
312 argc--;
313 }
314 }
315
316 rb_str_t *self = RSTR(str);
317 rb_str_t *replacement_str = NULL;
318 rb_encoding_t *src_encoding, *dst_encoding;
319 transcode_behavior_t behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
320 transcode_behavior_t behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
321 if (argc == 0) {
322 src_encoding = self->encoding;
323 dst_encoding = default_internal;
324 behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
325 behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
326 }
327 else if (argc == 1) {
328 src_encoding = self->encoding;
329 dst_encoding = rb_to_encoding(argv[0]);
330 }
331 else if (argc == 2) {
332 dst_encoding = rb_to_encoding(argv[0]);
333 src_encoding = rb_to_encoding(argv[1]);
334 }
335 else {
336 rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
337 }
338
339 if (!NIL_P(opt)) {
340 parse_conversion_options(opt, &behavior_for_invalid, &behavior_for_undefined, &replacement_str, dst_encoding);
341 if ((replacement_str != NULL)
342 && (behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
343 && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION)) {
344 behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
345 }
346 }
347
348 if ((replacement_str == NULL)
349 && ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
350 || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING))) {
351 replacement_str = replacement_string_for_encoding(dst_encoding);
352 }
353
354 return (VALUE)str_transcode(self, src_encoding, dst_encoding,
355 behavior_for_invalid, behavior_for_undefined, replacement_str);
356 }
357
358 /*
359 * call-seq:
360 * str.encode!(encoding [, options] ) => str
361 * str.encode!(dst_encoding, src_encoding [, options] ) => str
362 *
363 * The first form transcodes the contents of <i>str</i> from
364 * str.encoding to +encoding+.
365 * The second form transcodes the contents of <i>str</i> from
366 * src_encoding to dst_encoding.
367 * The options Hash gives details for conversion. See String#encode
368 * for details.
369 * Returns the string even if no changes were made.
370 */
371 static VALUE
372 rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
373 {
374 rstr_modify(str);
375
376 VALUE new_str = rstr_encode(str, sel, argc, argv);
377 str_replace_with_string(RSTR(str), RSTR(new_str));
378 return str;
379 }
380
381 void
382 Init_Transcode(void)
383 {
384 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
385 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
386 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
387
388 rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1);
389 rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
390
391 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
392 rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "alloc", rb_econv_alloc, 0);
393 rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "asciicompat_encoding", rb_econv_asciicompat_encoding, 1);
394 rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "search_convpath", rb_econv_search_convpath, -1);
395
396 rb_objc_define_method(rb_cEncodingConverter, "initialize", rb_econv_initialize, -1);
397 rb_objc_define_method(rb_cEncodingConverter, "inspect", rb_econv_inspect, 0);
398 rb_objc_define_method(rb_cEncodingConverter, "convpath", rb_econv_convpath, 0);
399 rb_objc_define_method(rb_cEncodingConverter, "source_encoding", rb_econv_source_encoding, 0);
400 rb_objc_define_method(rb_cEncodingConverter, "destination_encoding", rb_econv_destination_encoding, 0);
401 rb_objc_define_method(rb_cEncodingConverter, "primitive_convert", rb_econv_primitive_convert, -1);
402 rb_objc_define_method(rb_cEncodingConverter, "convert", rb_econv_convert, 1);
403 rb_objc_define_method(rb_cEncodingConverter, "finish", rb_econv_finish, 0);
404 rb_objc_define_method(rb_cEncodingConverter, "primitive_errinfo", rb_econv_primitive_errinfo, 0);
405 rb_objc_define_method(rb_cEncodingConverter, "insert_output", rb_econv_insert_output, 1);
406 rb_objc_define_method(rb_cEncodingConverter, "putback", rb_econv_putback, -1);
407 rb_objc_define_method(rb_cEncodingConverter, "last_error", rb_econv_last_error, 0);
408 rb_objc_define_method(rb_cEncodingConverter, "replacement", rb_econv_replacement, 0);
409 rb_objc_define_method(rb_cEncodingConverter, "replacement=", rb_econv_set_replacement, 1);
410
411 sym_invalid = ID2SYM(rb_intern("invalid"));
412 sym_undef = ID2SYM(rb_intern("undef"));
413 sym_replace = ID2SYM(rb_intern("replace"));
414 sym_attr = ID2SYM(rb_intern("attr"));
415 sym_text = ID2SYM(rb_intern("text"));
416 sym_xml = ID2SYM(rb_intern("xml"));
417
418 // If only these mapped to the internal enums...
419 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
420 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
421 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
422 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
423 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
424 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
425 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
426 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
427 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
428 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
429 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
430 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
431 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
432
433 #if 0
434 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
435 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
436 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
437 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
438 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
439
440 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
441 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
442 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
443 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
444 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
445 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
446 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
447
448 Init_newline();
449 #endif
450 }
Something went wrong with that request. Please try again.