Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 451 lines (389 sloc) 17.98 kb
ffe45d2 Add support for Encoding::Converter and move String#encode and String#en...
Patrick Thomson authored
1 /*
2 * MacRuby implementation of transcode.c.
3 *
4 * This file is covered by the Ruby license. See COPYING for more details.
5 *
6 * Copyright (C) 2007-2010, Apple Inc. All rights reserved.
7 * Copyright (C) 1993-2007 Yukihiro Matsumoto
8 * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
9 * Copyright (C) 2000 Information-technology Promotion Agency, Japan
10 */
11
12 // Notes:
13 // AFAICT, we need to add support for newline decorators.
14
468a2ea Move Obj-C related headers around.
Thibault Martin-Lagardette authored
15 #include "ruby/macruby.h"
ffe45d2 Add support for Encoding::Converter and move String#encode and String#en...
Patrick Thomson authored
16 #include "ruby/encoding.h"
17 #include "encoding.h"
18
19 static VALUE sym_invalid;
20 static VALUE sym_undef;
21 static VALUE sym_replace;
22 static VALUE sym_xml;
23 static VALUE sym_text;
24 static VALUE sym_attr;
25
26 typedef struct rb_econv_s {
27 rb_encoding_t *source;
28 rb_encoding_t *destination;
29 transcode_behavior_t invalid_sequence_behavior;
30 transcode_behavior_t undefined_conversion_behavior;
31 transcode_flags_t special_flags;
32 rb_str_t *replacement;
33 bool finished;
34 } rb_econv_t;
35
36 VALUE rb_cEncodingConverter;
37
38 static rb_econv_t* RConverter(VALUE self) {
39 rb_econv_t *conv;
40 Data_Get_Struct(self, rb_econv_t, conv);
41 return conv;
42 }
43
44 static VALUE
45 rb_econv_alloc(VALUE klass, SEL sel)
46 {
47 rb_econv_t *conv = ALLOC(rb_econv_t);
48 conv->source = NULL;
49 conv->destination = NULL;
50 conv->replacement = NULL;
51 conv->special_flags = 0;
52 conv->finished = false;
53 return Data_Wrap_Struct(klass, 0, 0, conv);
54 }
55
56 static VALUE
57 rb_econv_asciicompat_encoding(VALUE klass, SEL sel, VALUE arg)
58 {
59 rb_encoding_t *enc = NULL;
60 if (CLASS_OF(arg) == rb_cEncoding) {
61 enc = rb_to_encoding(arg);
62 }
63 else {
64 StringValue(arg);
65 enc = rb_enc_find(RSTRING_PTR(arg));
66 }
67
68 if ((enc == NULL) || (enc->ascii_compatible)) {
69 return Qnil;
70 }
71 else if (UTF16_ENC(enc) || UTF32_ENC(enc)) {
72 return (VALUE)rb_utf8_encoding();
73 }
74 // TODO: Port MRI's table that maps ASCII-incompatible encodings to compatible ones.
75 rb_raise(rb_eConverterNotFoundError, "could not find ASCII-compatible encoding for %s", enc->public_name);
76 }
77
78 static VALUE rb_econv_convpath(VALUE self, SEL sel);
79
80 static VALUE
81 rb_econv_search_convpath(VALUE klass, SEL sel, int argc, VALUE* argv)
82 {
83 return rb_econv_convpath(rb_class_new_instance(argc, argv, klass), sel);
84 }
85
86 static transcode_behavior_t
87 symbol_option_with_default(VALUE given_symbol, transcode_behavior_t otherwise, const char* name)
88 {
89 if (given_symbol == sym_replace) {
90 return TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
91 }
92 else if (given_symbol == sym_attr) {
93 return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
94 }
95 else if (given_symbol == sym_text) {
96 return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
97 }
98 else if (!NIL_P(given_symbol)) {
99 rb_raise(rb_eArgError, "unknown value '%s' for option %s", StringValuePtr(given_symbol), name);
100 }
101 return otherwise;
102 }
103
104 static void parse_conversion_options(VALUE options, transcode_behavior_t* behavior_for_invalid,
105 transcode_behavior_t* behavior_for_undefined, rb_str_t** replacement_str, rb_encoding_t* destination)
106 {
107
108 *behavior_for_invalid = symbol_option_with_default(rb_hash_aref(options, sym_invalid),
109 TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "invalid-character");
110
111 *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_undef),
112 TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "undefined-conversion");
113
114 // Because the API conflates the :xml and :undef options, we pass in the previous setting
115 *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml),
116 *behavior_for_undefined, "xml-replacement");
117
118 *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml),
119 *behavior_for_undefined, "xml-replacement");
120
121 VALUE replacement = rb_hash_aref(options, sym_replace);
122 if (!NIL_P(replacement)) {
123 *replacement_str = str_simple_transcode(str_need_string(replacement), destination);
124 }
125
126 }
127
128 static VALUE
129 rb_econv_initialize(VALUE self, SEL sel, int argc, VALUE* argv)
130 {
131 rb_econv_t *conv = RConverter(self);
132 VALUE sourceobj, destobj, options;
133 rb_scan_args(argc, argv, "21", &sourceobj, &destobj, &options);
134
135 rb_encoding_t* source = rb_to_encoding(sourceobj);
136 rb_encoding_t* destination = rb_to_encoding(destobj);
137 rb_str_t* replacement_str = NULL;
138
139 conv->source = source;
140 conv->destination = destination;
141
142 conv->invalid_sequence_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
143 conv->undefined_conversion_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
144
145 // Extract the options. This is a hateful, hateful API.
146 if (!NIL_P(options)) {
147
148 if (FIXNUM_P(options)) {
149 rb_bug("fixnum arguments are not supported yet.");
150 }
151 else if (TYPE(options) == T_HASH) {
152 parse_conversion_options(options, &conv->invalid_sequence_behavior,
153 &conv->undefined_conversion_behavior, &replacement_str, destination);
154 }
155 else {
156 rb_raise(rb_eArgError, "expected either a hash or a fixnum as the last parameter");
157 }
158 }
159
160 // Get the default replacement string. For UTF-[8, 16, 32] it's /uFFFD, and for others it's '?'
161 if (replacement_str == NULL) {
162 replacement_str = replacement_string_for_encoding(destination);
163 }
164 GC_WB(&conv->replacement, replacement_str);
165
166 return self;
167 }
168
169 static VALUE
170 rb_econv_inspect(VALUE self, SEL sel)
171 {
172 // TODO: make this comply with the MRI output when we add newline decorators
173 rb_econv_t *conv = RConverter(self);
174 return rb_sprintf("#<%s: %s to %s>", rb_obj_classname(self), conv->source->public_name,
175 conv->destination->public_name);
176 }
177
178 static VALUE
179 rb_econv_convpath(VALUE self, SEL sel)
180 {
181 // in MacRuby, the convpath always looks like this:
182 // [[source_encoding, native UTF-16], [native UTF-16, dest_encoding]]
183 // The first element is omitted if the source encoding is UTF-16, obviously.
184 rb_econv_t *conv = RConverter(self);
185 VALUE to_return = rb_ary_new2(2);
186 rb_encoding_t* nativeUTF16 = rb_encodings[ENCODING_UTF16_NATIVE];
187
188 if (conv->source != nativeUTF16) {
189 rb_ary_push(to_return, rb_assoc_new((VALUE)conv->source, (VALUE)nativeUTF16));
190 }
191
192 rb_ary_push(to_return, rb_assoc_new((VALUE)nativeUTF16, (VALUE)conv->destination));
193
194 return to_return;
195 }
196
197 static VALUE
198 rb_econv_source_encoding(VALUE self, SEL sel)
199 {
200 return (VALUE)(RConverter(self)->source);
201 }
202
203 static VALUE
204 rb_econv_destination_encoding(VALUE self, SEL sel)
205 {
206 return (VALUE)(RConverter(self)->destination);
207 }
208
// The converter is basically a black box at this point, so the lower-level
// methods are left unimplemented: the name is aliased to rb_f_notimplement.
#define rb_econv_primitive_convert rb_f_notimplement
212
213 static VALUE
214 rb_econv_convert(VALUE self, SEL sel, VALUE str)
215 {
216 rb_econv_t *conv;
217 Data_Get_Struct(self, rb_econv_t, conv);
218
219 if (conv->finished) {
220 rb_raise(rb_eArgError, "convert() called on a finished stream");
221 }
222
223 assert(conv->replacement->encoding == conv->destination);
224 return (VALUE)str_transcode(str_need_string(str), conv->source, conv->destination, conv->invalid_sequence_behavior, conv->undefined_conversion_behavior, conv->replacement);
225 }
226
227 static VALUE
228 rb_econv_finish(VALUE self, SEL sel)
229 {
230 // TODO: Flesh this out later.
231 RConverter(self)->finished = true;
232 return rb_str_new2("");
233 }
234
// Remaining low-level converter methods, all aliased to rb_f_notimplement.
#define rb_econv_primitive_errinfo rb_f_notimplement
#define rb_econv_insert_output rb_f_notimplement
#define rb_econv_putback rb_f_notimplement
#define rb_econv_last_error rb_f_notimplement
242
243 static VALUE
244 rb_econv_replacement(VALUE self, SEL sel)
245 {
246 return (VALUE)(RConverter(self)->replacement);
247 }
248
249 static VALUE
250 rb_econv_set_replacement(VALUE self, SEL sel, VALUE str)
251 {
252 // TODO: Should we copy this string? Probably.
253 rb_econv_t *conv = RConverter(self);
254 if (TYPE(str) != T_STRING) {
255 rb_raise(rb_eTypeError, "wrong argument type %s (expected String)", rb_obj_classname(str));
256 }
257 rb_str_force_encoding(str, conv->destination);
258 GC_WB(&conv->replacement, str_need_string(str));
259 return str;
260 }
261
262 /*
263 * call-seq:
264 * str.encode(encoding [, options] ) => str
265 * str.encode(dst_encoding, src_encoding [, options] ) => str
266 * str.encode([options]) => str
267 *
268 * The first form returns a copy of <i>str</i> transcoded
269 * to encoding +encoding+.
270 * The second form returns a copy of <i>str</i> transcoded
271 * from src_encoding to dst_encoding.
272 * The last form returns a copy of <i>str</i> transcoded to
273 * <code>Encoding.default_internal</code>.
274 * By default, the first and second form raise
275 * Encoding::UndefinedConversionError for characters that are
276 * undefined in the destination encoding, and
277 * Encoding::InvalidByteSequenceError for invalid byte sequences
278 * in the source encoding. The last form by default does not raise
279 * exceptions but uses replacement strings.
280 * The <code>options</code> Hash gives details for conversion.
281 *
282 * === options
283 * The hash <code>options</code> can have the following keys:
284 * :invalid ::
285 * If the value is <code>:replace</code>, <code>#encode</code> replaces
286 * invalid byte sequences in <code>str</code> with the replacement character.
287 * The default is to raise the exception
288 * :undef ::
289 * If the value is <code>:replace</code>, <code>#encode</code> replaces
290 * characters which are undefined in the destination encoding with
291 * the replacement character.
292 * :replace ::
293 * Sets the replacement string to the value. The default replacement
294 * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
295 * :xml ::
296 * The value must be <code>:text</code> or <code>:attr</code>.
297 * If the value is <code>:text</code> <code>#encode</code> replaces
298 * undefined characters with their (upper-case hexadecimal) numeric
299 * character references. '&', '<', and '>' are converted to "&amp;",
300 * "&lt;", and "&gt;", respectively.
301 * If the value is <code>:attr</code>, <code>#encode</code> also quotes
302 * the replacement result (using '"'), and replaces '"' with "&quot;".
303 */
304 extern rb_encoding_t *default_internal;
305 static VALUE
306 rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
307 {
308 VALUE opt = Qnil;
309 if (argc > 0) {
310 opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
311 if (!NIL_P(opt)) {
312 argc--;
313 }
314 }
315
316 rb_str_t *self = RSTR(str);
317 rb_str_t *replacement_str = NULL;
318 rb_encoding_t *src_encoding, *dst_encoding;
319 transcode_behavior_t behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
320 transcode_behavior_t behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
321 if (argc == 0) {
322 src_encoding = self->encoding;
323 dst_encoding = default_internal;
324 behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
325 behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
326 }
327 else if (argc == 1) {
328 src_encoding = self->encoding;
329 dst_encoding = rb_to_encoding(argv[0]);
330 }
331 else if (argc == 2) {
332 dst_encoding = rb_to_encoding(argv[0]);
333 src_encoding = rb_to_encoding(argv[1]);
334 }
335 else {
336 rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
337 }
338
339 if (!NIL_P(opt)) {
340 parse_conversion_options(opt, &behavior_for_invalid, &behavior_for_undefined, &replacement_str, dst_encoding);
341 if ((replacement_str != NULL)
342 && (behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
343 && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION)) {
344 behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
345 }
346 }
347
348 if ((replacement_str == NULL)
349 && ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
350 || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING))) {
351 replacement_str = replacement_string_for_encoding(dst_encoding);
352 }
353
354 return (VALUE)str_transcode(self, src_encoding, dst_encoding,
355 behavior_for_invalid, behavior_for_undefined, replacement_str);
356 }
357
358 /*
359 * call-seq:
360 * str.encode!(encoding [, options] ) => str
361 * str.encode!(dst_encoding, src_encoding [, options] ) => str
362 *
363 * The first form transcodes the contents of <i>str</i> from
364 * str.encoding to +encoding+.
365 * The second form transcodes the contents of <i>str</i> from
366 * src_encoding to dst_encoding.
367 * The options Hash gives details for conversion. See String#encode
368 * for details.
369 * Returns the string even if no changes were made.
370 */
371 static VALUE
372 rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
373 {
374 rstr_modify(str);
375
376 VALUE new_str = rstr_encode(str, sel, argc, argv);
377 str_replace_with_string(RSTR(str), RSTR(new_str));
378 return str;
379 }
380
381 void
382 Init_Transcode(void)
383 {
384 rb_eUndefinedConversionError = rb_define_class_under(rb_cEncoding, "UndefinedConversionError", rb_eEncodingError);
385 rb_eInvalidByteSequenceError = rb_define_class_under(rb_cEncoding, "InvalidByteSequenceError", rb_eEncodingError);
386 rb_eConverterNotFoundError = rb_define_class_under(rb_cEncoding, "ConverterNotFoundError", rb_eEncodingError);
387
388 rb_objc_define_method(rb_cRubyString, "encode", rstr_encode, -1);
389 rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
390
391 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
392 rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "alloc", rb_econv_alloc, 0);
393 rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "asciicompat_encoding", rb_econv_asciicompat_encoding, 1);
394 rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "search_convpath", rb_econv_search_convpath, -1);
395
396 rb_objc_define_method(rb_cEncodingConverter, "initialize", rb_econv_initialize, -1);
397 rb_objc_define_method(rb_cEncodingConverter, "inspect", rb_econv_inspect, 0);
398 rb_objc_define_method(rb_cEncodingConverter, "convpath", rb_econv_convpath, 0);
399 rb_objc_define_method(rb_cEncodingConverter, "source_encoding", rb_econv_source_encoding, 0);
400 rb_objc_define_method(rb_cEncodingConverter, "destination_encoding", rb_econv_destination_encoding, 0);
401 rb_objc_define_method(rb_cEncodingConverter, "primitive_convert", rb_econv_primitive_convert, -1);
402 rb_objc_define_method(rb_cEncodingConverter, "convert", rb_econv_convert, 1);
403 rb_objc_define_method(rb_cEncodingConverter, "finish", rb_econv_finish, 0);
404 rb_objc_define_method(rb_cEncodingConverter, "primitive_errinfo", rb_econv_primitive_errinfo, 0);
405 rb_objc_define_method(rb_cEncodingConverter, "insert_output", rb_econv_insert_output, 1);
406 rb_objc_define_method(rb_cEncodingConverter, "putback", rb_econv_putback, -1);
407 rb_objc_define_method(rb_cEncodingConverter, "last_error", rb_econv_last_error, 0);
408 rb_objc_define_method(rb_cEncodingConverter, "replacement", rb_econv_replacement, 0);
409 rb_objc_define_method(rb_cEncodingConverter, "replacement=", rb_econv_set_replacement, 1);
410
411 sym_invalid = ID2SYM(rb_intern("invalid"));
412 sym_undef = ID2SYM(rb_intern("undef"));
413 sym_replace = ID2SYM(rb_intern("replace"));
414 sym_attr = ID2SYM(rb_intern("attr"));
415 sym_text = ID2SYM(rb_intern("text"));
416 sym_xml = ID2SYM(rb_intern("xml"));
417
418 // If only these mapped to the internal enums...
419 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
420 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
421 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
422 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
423 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
424 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
425 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
426 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
427 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
428 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
429 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
430 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
431 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
432
433 #if 0
434 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
435 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
436 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
437 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
438 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
439
440 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
441 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
442 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
443 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
444 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
445 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
446 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
447
448 Init_newline();
449 #endif
450 }
Something went wrong with that request. Please try again.