Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 451 lines (390 sloc) 17.871 kb
7d7d3e8 @ferrous26 Change ownership to The MacRuby Team and update copyrights
ferrous26 authored
1 /*
ffe45d2 Add support for Encoding::Converter and move String#encode and String#en...
Patrick Thomson authored
2 * MacRuby implementation of transcode.c.
3 *
4 * This file is covered by the Ruby license. See COPYING for more details.
7d7d3e8 @ferrous26 Change ownership to The MacRuby Team and update copyrights
ferrous26 authored
5 *
6 * Copyright (C) 2012, The MacRuby Team. All rights reserved.
9595725 update copyrights to 2011
Laurent Sansonetti authored
7 * Copyright (C) 2007-2011, Apple Inc. All rights reserved.
ffe45d2 Add support for Encoding::Converter and move String#encode and String#en...
Patrick Thomson authored
8 * Copyright (C) 1993-2007 Yukihiro Matsumoto
9 * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 * Copyright (C) 2000 Information-technology Promotion Agency, Japan
11 */
12
13 // Notes:
14 // AFAICT, we need to add support for newline decorators.
15
d0898dd include/ruby/macruby.h -> macruby_internal.h
Laurent Sansonetti authored
16 #include "macruby_internal.h"
ffe45d2 Add support for Encoding::Converter and move String#encode and String#en...
Patrick Thomson authored
17 #include "ruby/encoding.h"
18 #include "encoding.h"
19
20 static VALUE sym_invalid;
21 static VALUE sym_undef;
22 static VALUE sym_replace;
23 static VALUE sym_xml;
24 static VALUE sym_text;
25 static VALUE sym_attr;
26
27 typedef struct rb_econv_s {
28 rb_encoding_t *source;
29 rb_encoding_t *destination;
30 transcode_behavior_t invalid_sequence_behavior;
31 transcode_behavior_t undefined_conversion_behavior;
32 transcode_flags_t special_flags;
33 rb_str_t *replacement;
34 bool finished;
35 } rb_econv_t;
36
37 VALUE rb_cEncodingConverter;
38
39 static rb_econv_t* RConverter(VALUE self) {
40 rb_econv_t *conv;
41 Data_Get_Struct(self, rb_econv_t, conv);
42 return conv;
43 }
44
45 static VALUE
46 rb_econv_alloc(VALUE klass, SEL sel)
47 {
48 rb_econv_t *conv = ALLOC(rb_econv_t);
49 conv->source = NULL;
50 conv->destination = NULL;
51 conv->replacement = NULL;
52 conv->special_flags = 0;
53 conv->finished = false;
54 return Data_Wrap_Struct(klass, 0, 0, conv);
55 }
56
57 static VALUE
58 rb_econv_asciicompat_encoding(VALUE klass, SEL sel, VALUE arg)
59 {
60 rb_encoding_t *enc = NULL;
61 if (CLASS_OF(arg) == rb_cEncoding) {
62 enc = rb_to_encoding(arg);
63 }
64 else {
65 StringValue(arg);
66 enc = rb_enc_find(RSTRING_PTR(arg));
67 }
68
69 if ((enc == NULL) || (enc->ascii_compatible)) {
70 return Qnil;
71 }
e22f26d @vincentisambart changed the internal representation of strings
vincentisambart authored
72 else if (IS_UTF16_ENC(enc) || IS_UTF32_ENC(enc)) {
ffe45d2 Add support for Encoding::Converter and move String#encode and String#en...
Patrick Thomson authored
73 return (VALUE)rb_utf8_encoding();
74 }
75 // TODO: Port MRI's table that maps ASCII-incompatible encodings to compatible ones.
76 rb_raise(rb_eConverterNotFoundError, "could not find ASCII-compatible encoding for %s", enc->public_name);
77 }
78
79 static VALUE rb_econv_convpath(VALUE self, SEL sel);
80
81 static VALUE
82 rb_econv_search_convpath(VALUE klass, SEL sel, int argc, VALUE* argv)
83 {
84 return rb_econv_convpath(rb_class_new_instance(argc, argv, klass), sel);
85 }
86
87 static transcode_behavior_t
88 symbol_option_with_default(VALUE given_symbol, transcode_behavior_t otherwise, const char* name)
89 {
90 if (given_symbol == sym_replace) {
91 return TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
92 }
93 else if (given_symbol == sym_attr) {
94 return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR;
95 }
96 else if (given_symbol == sym_text) {
97 return TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT;
98 }
99 else if (!NIL_P(given_symbol)) {
100 rb_raise(rb_eArgError, "unknown value '%s' for option %s", StringValuePtr(given_symbol), name);
101 }
102 return otherwise;
103 }
104
105 static void parse_conversion_options(VALUE options, transcode_behavior_t* behavior_for_invalid,
106 transcode_behavior_t* behavior_for_undefined, rb_str_t** replacement_str, rb_encoding_t* destination)
107 {
108
109 *behavior_for_invalid = symbol_option_with_default(rb_hash_aref(options, sym_invalid),
110 TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "invalid-character");
111
112 *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_undef),
113 TRANSCODE_BEHAVIOR_RAISE_EXCEPTION, "undefined-conversion");
114
115 // Because the API conflates the :xml and :undef options, we pass in the previous setting
116 *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml),
117 *behavior_for_undefined, "xml-replacement");
118
119 *behavior_for_undefined = symbol_option_with_default(rb_hash_aref(options, sym_xml),
120 *behavior_for_undefined, "xml-replacement");
121
122 VALUE replacement = rb_hash_aref(options, sym_replace);
123 if (!NIL_P(replacement)) {
124 *replacement_str = str_simple_transcode(str_need_string(replacement), destination);
125 }
126
127 }
128
129 static VALUE
130 rb_econv_initialize(VALUE self, SEL sel, int argc, VALUE* argv)
131 {
132 rb_econv_t *conv = RConverter(self);
133 VALUE sourceobj, destobj, options;
134 rb_scan_args(argc, argv, "21", &sourceobj, &destobj, &options);
135
136 rb_encoding_t* source = rb_to_encoding(sourceobj);
137 rb_encoding_t* destination = rb_to_encoding(destobj);
138 rb_str_t* replacement_str = NULL;
139
140 conv->source = source;
141 conv->destination = destination;
142
143 conv->invalid_sequence_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
144 conv->undefined_conversion_behavior = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
145
146 // Extract the options. This is a hateful, hateful API.
147 if (!NIL_P(options)) {
148
149 if (FIXNUM_P(options)) {
150 rb_bug("fixnum arguments are not supported yet.");
151 }
152 else if (TYPE(options) == T_HASH) {
153 parse_conversion_options(options, &conv->invalid_sequence_behavior,
154 &conv->undefined_conversion_behavior, &replacement_str, destination);
155 }
156 else {
157 rb_raise(rb_eArgError, "expected either a hash or a fixnum as the last parameter");
158 }
159 }
160
161 // Get the default replacement string. For UTF-[8, 16, 32] it's /uFFFD, and for others it's '?'
162 if (replacement_str == NULL) {
163 replacement_str = replacement_string_for_encoding(destination);
164 }
165 GC_WB(&conv->replacement, replacement_str);
166
167 return self;
168 }
169
170 static VALUE
171 rb_econv_inspect(VALUE self, SEL sel)
172 {
173 // TODO: make this comply with the MRI output when we add newline decorators
174 rb_econv_t *conv = RConverter(self);
175 return rb_sprintf("#<%s: %s to %s>", rb_obj_classname(self), conv->source->public_name,
176 conv->destination->public_name);
177 }
178
179 static VALUE
180 rb_econv_convpath(VALUE self, SEL sel)
181 {
182 // in MacRuby, the convpath always looks like this:
183 // [[source_encoding, native UTF-16], [native UTF-16, dest_encoding]]
184 // The first element is omitted if the source encoding is UTF-16, obviously.
185 rb_econv_t *conv = RConverter(self);
186 VALUE to_return = rb_ary_new2(2);
187 rb_encoding_t* nativeUTF16 = rb_encodings[ENCODING_UTF16_NATIVE];
188
189 if (conv->source != nativeUTF16) {
190 rb_ary_push(to_return, rb_assoc_new((VALUE)conv->source, (VALUE)nativeUTF16));
191 }
192
193 rb_ary_push(to_return, rb_assoc_new((VALUE)nativeUTF16, (VALUE)conv->destination));
194
195 return to_return;
196 }
197
198 static VALUE
199 rb_econv_source_encoding(VALUE self, SEL sel)
200 {
201 return (VALUE)(RConverter(self)->source);
202 }
203
204 static VALUE
205 rb_econv_destination_encoding(VALUE self, SEL sel)
206 {
207 return (VALUE)(RConverter(self)->destination);
208 }
209
// The converter is essentially a black box for now, so the lower-level
// methods are routed to rb_f_notimplement rather than implemented.
#define rb_econv_primitive_convert rb_f_notimplement
213
214 static VALUE
215 rb_econv_convert(VALUE self, SEL sel, VALUE str)
216 {
217 rb_econv_t *conv;
218 Data_Get_Struct(self, rb_econv_t, conv);
219
220 if (conv->finished) {
221 rb_raise(rb_eArgError, "convert() called on a finished stream");
222 }
223
224 assert(conv->replacement->encoding == conv->destination);
225 return (VALUE)str_transcode(str_need_string(str), conv->source, conv->destination, conv->invalid_sequence_behavior, conv->undefined_conversion_behavior, conv->replacement);
226 }
227
228 static VALUE
229 rb_econv_finish(VALUE self, SEL sel)
230 {
231 // TODO: Flesh this out later.
232 RConverter(self)->finished = true;
233 return rb_str_new2("");
234 }
235
// Remaining low-level converter methods that are not implemented yet; each
// one is an alias for rb_f_notimplement.
#define rb_econv_primitive_errinfo rb_f_notimplement
#define rb_econv_insert_output rb_f_notimplement
#define rb_econv_putback rb_f_notimplement
#define rb_econv_last_error rb_f_notimplement
243
244 static VALUE
245 rb_econv_replacement(VALUE self, SEL sel)
246 {
247 return (VALUE)(RConverter(self)->replacement);
248 }
249
250 static VALUE
251 rb_econv_set_replacement(VALUE self, SEL sel, VALUE str)
252 {
253 // TODO: Should we copy this string? Probably.
254 rb_econv_t *conv = RConverter(self);
255 if (TYPE(str) != T_STRING) {
256 rb_raise(rb_eTypeError, "wrong argument type %s (expected String)", rb_obj_classname(str));
257 }
258 rb_str_force_encoding(str, conv->destination);
259 GC_WB(&conv->replacement, str_need_string(str));
260 return str;
261 }
262
263 /*
264 * call-seq:
265 * str.encode(encoding [, options] ) => str
266 * str.encode(dst_encoding, src_encoding [, options] ) => str
267 * str.encode([options]) => str
268 *
269 * The first form returns a copy of <i>str</i> transcoded
270 * to encoding +encoding+.
271 * The second form returns a copy of <i>str</i> transcoded
272 * from src_encoding to dst_encoding.
273 * The last form returns a copy of <i>str</i> transcoded to
274 * <code>Encoding.default_internal</code>.
275 * By default, the first and second form raise
276 * Encoding::UndefinedConversionError for characters that are
277 * undefined in the destination encoding, and
278 * Encoding::InvalidByteSequenceError for invalid byte sequences
279 * in the source encoding. The last form by default does not raise
280 * exceptions but uses replacement strings.
281 * The <code>options</code> Hash gives details for conversion.
282 *
283 * === options
284 * The hash <code>options</code> can have the following keys:
285 * :invalid ::
286 * If the value is <code>:replace</code>, <code>#encode</code> replaces
287 * invalid byte sequences in <code>str</code> with the replacement character.
288 * The default is to raise the exception
289 * :undef ::
290 * If the value is <code>:replace</code>, <code>#encode</code> replaces
291 * characters which are undefined in the destination encoding with
292 * the replacement character.
293 * :replace ::
294 * Sets the replacement string to the value. The default replacement
295 * string is "\uFFFD" for Unicode encoding forms, and "?" otherwise.
296 * :xml ::
297 * The value must be <code>:text</code> or <code>:attr</code>.
298 * If the value is <code>:text</code> <code>#encode</code> replaces
299 * undefined characters with their (upper-case hexadecimal) numeric
300 * character references. '&', '<', and '>' are converted to "&amp;",
301 * "&lt;", and "&gt;", respectively.
302 * If the value is <code>:attr</code>, <code>#encode</code> also quotes
303 * the replacement result (using '"'), and replaces '"' with "&quot;".
304 */
305 extern rb_encoding_t *default_internal;
306 static VALUE
307 rstr_encode(VALUE str, SEL sel, int argc, VALUE *argv)
308 {
309 VALUE opt = Qnil;
310 if (argc > 0) {
311 opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
312 if (!NIL_P(opt)) {
313 argc--;
314 }
315 }
316
d178850 @lrz add NSString#encode which returns a properly encoded RubyString, and NSS...
lrz authored
317 rb_str_t *self = str_need_string(str);
ffe45d2 Add support for Encoding::Converter and move String#encode and String#en...
Patrick Thomson authored
318 rb_str_t *replacement_str = NULL;
319 rb_encoding_t *src_encoding, *dst_encoding;
320 transcode_behavior_t behavior_for_invalid = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
321 transcode_behavior_t behavior_for_undefined = TRANSCODE_BEHAVIOR_RAISE_EXCEPTION;
322 if (argc == 0) {
323 src_encoding = self->encoding;
324 dst_encoding = default_internal;
325 behavior_for_invalid = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
326 behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
327 }
328 else if (argc == 1) {
329 src_encoding = self->encoding;
330 dst_encoding = rb_to_encoding(argv[0]);
331 }
332 else if (argc == 2) {
333 dst_encoding = rb_to_encoding(argv[0]);
334 src_encoding = rb_to_encoding(argv[1]);
335 }
336 else {
337 rb_raise(rb_eArgError, "wrong number of arguments (%d for 0..2)", argc);
338 }
339
340 if (!NIL_P(opt)) {
341 parse_conversion_options(opt, &behavior_for_invalid, &behavior_for_undefined, &replacement_str, dst_encoding);
342 if ((replacement_str != NULL)
343 && (behavior_for_invalid != TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
344 && (behavior_for_undefined == TRANSCODE_BEHAVIOR_RAISE_EXCEPTION)) {
345 behavior_for_undefined = TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING;
346 }
347 }
348
349 if ((replacement_str == NULL)
350 && ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
351 || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING))) {
352 replacement_str = replacement_string_for_encoding(dst_encoding);
353 }
354
355 return (VALUE)str_transcode(self, src_encoding, dst_encoding,
356 behavior_for_invalid, behavior_for_undefined, replacement_str);
357 }
358
359 /*
360 * call-seq:
361 * str.encode!(encoding [, options] ) => str
362 * str.encode!(dst_encoding, src_encoding [, options] ) => str
363 *
364 * The first form transcodes the contents of <i>str</i> from
365 * str.encoding to +encoding+.
366 * The second form transcodes the contents of <i>str</i> from
367 * src_encoding to dst_encoding.
368 * The options Hash gives details for conversion. See String#encode
369 * for details.
370 * Returns the string even if no changes were made.
371 */
372 static VALUE
373 rstr_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
374 {
375 rstr_modify(str);
376
377 VALUE new_str = rstr_encode(str, sel, argc, argv);
378 str_replace_with_string(RSTR(str), RSTR(new_str));
379 return str;
380 }
381
382 void
383 Init_Transcode(void)
384 {
d178850 @lrz add NSString#encode which returns a properly encoded RubyString, and NSS...
lrz authored
385 // #encode works on both NSStrings and RubyStrings, #encode! only works
386 // on RubyStrings.
387 rb_objc_define_method(rb_cNSString, "encode", rstr_encode, -1);
ffe45d2 Add support for Encoding::Converter and move String#encode and String#en...
Patrick Thomson authored
388 rb_objc_define_method(rb_cRubyString, "encode!", rstr_encode_bang, -1);
d178850 @lrz add NSString#encode which returns a properly encoded RubyString, and NSS...
lrz authored
389 rb_objc_define_method(rb_cNSString, "encode!", rstr_only, -1);
ffe45d2 Add support for Encoding::Converter and move String#encode and String#en...
Patrick Thomson authored
390
391 rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cObject);
392 rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "alloc", rb_econv_alloc, 0);
393 rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "asciicompat_encoding", rb_econv_asciicompat_encoding, 1);
394 rb_objc_define_method(*(VALUE *)rb_cEncodingConverter, "search_convpath", rb_econv_search_convpath, -1);
395
396 rb_objc_define_method(rb_cEncodingConverter, "initialize", rb_econv_initialize, -1);
397 rb_objc_define_method(rb_cEncodingConverter, "inspect", rb_econv_inspect, 0);
398 rb_objc_define_method(rb_cEncodingConverter, "convpath", rb_econv_convpath, 0);
399 rb_objc_define_method(rb_cEncodingConverter, "source_encoding", rb_econv_source_encoding, 0);
400 rb_objc_define_method(rb_cEncodingConverter, "destination_encoding", rb_econv_destination_encoding, 0);
401 rb_objc_define_method(rb_cEncodingConverter, "primitive_convert", rb_econv_primitive_convert, -1);
402 rb_objc_define_method(rb_cEncodingConverter, "convert", rb_econv_convert, 1);
403 rb_objc_define_method(rb_cEncodingConverter, "finish", rb_econv_finish, 0);
404 rb_objc_define_method(rb_cEncodingConverter, "primitive_errinfo", rb_econv_primitive_errinfo, 0);
405 rb_objc_define_method(rb_cEncodingConverter, "insert_output", rb_econv_insert_output, 1);
406 rb_objc_define_method(rb_cEncodingConverter, "putback", rb_econv_putback, -1);
407 rb_objc_define_method(rb_cEncodingConverter, "last_error", rb_econv_last_error, 0);
408 rb_objc_define_method(rb_cEncodingConverter, "replacement", rb_econv_replacement, 0);
409 rb_objc_define_method(rb_cEncodingConverter, "replacement=", rb_econv_set_replacement, 1);
410
411 sym_invalid = ID2SYM(rb_intern("invalid"));
412 sym_undef = ID2SYM(rb_intern("undef"));
413 sym_replace = ID2SYM(rb_intern("replace"));
414 sym_attr = ID2SYM(rb_intern("attr"));
415 sym_text = ID2SYM(rb_intern("text"));
416 sym_xml = ID2SYM(rb_intern("xml"));
417
418 // If only these mapped to the internal enums...
419 rb_define_const(rb_cEncodingConverter, "INVALID_MASK", INT2FIX(ECONV_INVALID_MASK));
420 rb_define_const(rb_cEncodingConverter, "INVALID_REPLACE", INT2FIX(ECONV_INVALID_REPLACE));
421 rb_define_const(rb_cEncodingConverter, "UNDEF_MASK", INT2FIX(ECONV_UNDEF_MASK));
422 rb_define_const(rb_cEncodingConverter, "UNDEF_REPLACE", INT2FIX(ECONV_UNDEF_REPLACE));
423 rb_define_const(rb_cEncodingConverter, "UNDEF_HEX_CHARREF", INT2FIX(ECONV_UNDEF_HEX_CHARREF));
424 rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(ECONV_PARTIAL_INPUT));
425 rb_define_const(rb_cEncodingConverter, "AFTER_OUTPUT", INT2FIX(ECONV_AFTER_OUTPUT));
426 rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE_DECORATOR", INT2FIX(ECONV_UNIVERSAL_NEWLINE_DECORATOR));
427 rb_define_const(rb_cEncodingConverter, "CRLF_NEWLINE_DECORATOR", INT2FIX(ECONV_CRLF_NEWLINE_DECORATOR));
428 rb_define_const(rb_cEncodingConverter, "CR_NEWLINE_DECORATOR", INT2FIX(ECONV_CR_NEWLINE_DECORATOR));
429 rb_define_const(rb_cEncodingConverter, "XML_TEXT_DECORATOR", INT2FIX(ECONV_XML_TEXT_DECORATOR));
430 rb_define_const(rb_cEncodingConverter, "XML_ATTR_CONTENT_DECORATOR", INT2FIX(ECONV_XML_ATTR_CONTENT_DECORATOR));
431 rb_define_const(rb_cEncodingConverter, "XML_ATTR_QUOTE_DECORATOR", INT2FIX(ECONV_XML_ATTR_QUOTE_DECORATOR));
432
433 #if 0
434 rb_define_method(rb_eUndefinedConversionError, "source_encoding_name", ecerr_source_encoding_name, 0);
435 rb_define_method(rb_eUndefinedConversionError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
436 rb_define_method(rb_eUndefinedConversionError, "source_encoding", ecerr_source_encoding, 0);
437 rb_define_method(rb_eUndefinedConversionError, "destination_encoding", ecerr_destination_encoding, 0);
438 rb_define_method(rb_eUndefinedConversionError, "error_char", ecerr_error_char, 0);
439
440 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding_name", ecerr_source_encoding_name, 0);
441 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding_name", ecerr_destination_encoding_name, 0);
442 rb_define_method(rb_eInvalidByteSequenceError, "source_encoding", ecerr_source_encoding, 0);
443 rb_define_method(rb_eInvalidByteSequenceError, "destination_encoding", ecerr_destination_encoding, 0);
444 rb_define_method(rb_eInvalidByteSequenceError, "error_bytes", ecerr_error_bytes, 0);
445 rb_define_method(rb_eInvalidByteSequenceError, "readagain_bytes", ecerr_readagain_bytes, 0);
446 rb_define_method(rb_eInvalidByteSequenceError, "incomplete_input?", ecerr_incomplete_input, 0);
447
448 Init_newline();
449 #endif
450 }
Something went wrong with that request. Please try again.