Skip to content
This repository
Newer
Older
100644 599 lines (527 sloc) 14.24 kb
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
1 /*
2 * MacRuby implementation of Ruby 1.9's encoding.c.
3 *
4 * This file is covered by the Ruby license. See COPYING for more details.
5 *
53679566 » drernie
2010-01-21 Updated copyrights for 2010
6 * Copyright (C) 2007-2010, Apple Inc. All rights reserved.
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
7 * Copyright (C) 1993-2007 Yukihiro Matsumoto
8 * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
9 * Copyright (C) 2000 Information-technology Promotion Agency, Japan
10 */
11
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #include "regenc.h"
15 #include <ctype.h>
16 #ifdef HAVE_LANGINFO_H
17 #include <langinfo.h>
18 #endif
19
20 static ID id_encoding, id_base_encoding;
21 VALUE rb_cEncoding;
22
23 static CFMutableDictionaryRef __encodings = NULL;
24
25 static VALUE
26 enc_new(const CFStringEncoding *enc)
27 {
28 return Data_Wrap_Struct(rb_cEncoding, NULL, NULL, (void *)enc);
29 }
30
31 static void
32 enc_init_db(void)
33 {
34 const CFStringEncoding *e;
35
1a1a9354 » Laurent Sansonetti
2009-03-13 fixed IO regressions + other bugs
36 __encodings = CFDictionaryCreateMutable(NULL, 0, NULL, &kCFTypeDictionaryValueCallBacks);
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
37
38 /* XXX CFStringGetListOfAvailableEncodings() is a costly call and should
39 * be called on demand and not by default when the interpreter starts.
40 */
41 e = CFStringGetListOfAvailableEncodings();
42 while (e != NULL && *e != kCFStringEncodingInvalidId) {
43 VALUE iana;
44 VALUE encoding;
45
46 encoding = enc_new(e);
47
48 iana = (VALUE)CFStringConvertEncodingToIANACharSetName(*e);
49 if (iana != 0) {
50 const char *name;
51
52 name = RSTRING_PTR(iana);
077fd751 » Laurent Sansonetti
2009-06-08 better logic to rubyfy encoding names (was failing on 10.6)
53
54 // new_name = name.gsub(/-/, '_').upcase
55 char *new_name = alloca(strlen(name));
56 strcpy(new_name, name);
57 char *p = strchr(name, '-');
58 if (p != NULL) {
59 p = new_name + (p - name);
60 do {
61 *p = '_';
62 p++;
63 p = strchr(p, '-');
64 }
65 while (p != NULL);
66 }
67 p = new_name;
68 while (*p != '\0') {
69 if (islower(*p)) {
70 *p = toupper(*p);
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
71 }
077fd751 » Laurent Sansonetti
2009-06-08 better logic to rubyfy encoding names (was failing on 10.6)
72 p++;
73 }
74
75 ID encoding_id = rb_intern(new_name);
76 if (!rb_const_defined(rb_cEncoding, encoding_id)) {
77 rb_const_set(rb_cEncoding, encoding_id, encoding);
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
78 }
79 }
80 CFDictionarySetValue(__encodings, (const void *)iana,
81 (const void *)encoding);
82 e++;
83 }
84
85 assert(CFDictionaryGetCount((CFDictionaryRef)__encodings) > 0);
8b9745b6 » Laurent Sansonetti
2009-06-04 define Encoding::ASCII_8BIT as a shortcut to US_ASCII (for now)
86
87 // Define shortcuts.
88 rb_define_const(rb_cEncoding, "ASCII_8BIT",
89 rb_const_get(rb_cEncoding, rb_intern("US_ASCII")));
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
90 }
91
92 static VALUE
93 enc_make(const CFStringEncoding *enc)
94 {
95 VALUE iana, v;
96
97 assert(enc != NULL);
98 iana = (VALUE)CFStringConvertEncodingToIANACharSetName(*enc);
99 v = (VALUE)CFDictionaryGetValue((CFDictionaryRef)__encodings,
100 (const void *)iana);
101 assert(v != 0);
102 return v;
103 }
104
105 VALUE
106 rb_enc_from_encoding(rb_encoding *enc)
107 {
108 return enc_make(enc);
109 }
110
111 static inline CFStringEncoding
112 rb_enc_to_enc(VALUE v)
113 {
114 return *(CFStringEncoding *)DATA_PTR(v);
115 }
116
117 static inline CFStringEncoding *
118 rb_enc_to_enc_ptr(VALUE v)
119 {
120 return (CFStringEncoding *)DATA_PTR(v);
121 }
122
123 rb_encoding *
124 rb_to_encoding(VALUE v)
125 {
126 if (TYPE(v) == T_STRING)
127 return rb_enc_find2(v);
128 return rb_enc_to_enc_ptr(v);
129 }
130
131 /*
132 * call-seq:
133 * enc.dummy? => true or false
134 *
135 * Returns true for dummy encodings.
136 * A dummy encoding is an encoding for which character handling is not properly
137 * implemented.
138 * It is used for stateful encodings.
139 *
140 * Encoding::ISO_2022_JP.dummy? #=> true
141 * Encoding::UTF_8.dummy? #=> false
142 *
143 */
144 static VALUE
145 enc_dummy_p(VALUE enc, SEL sel)
146 {
147 return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse;
148 }
149
150 ID
151 rb_id_encoding(void)
152 {
153 if (!id_encoding) {
154 id_encoding = rb_intern("encoding");
155 }
156 return id_encoding;
157 }
158
159 rb_encoding*
160 rb_enc_compatible(VALUE str1, VALUE str2)
161 {
162 /* TODO */
163 rb_encoding *enc = rb_enc_get(str1);
164 if (enc == rb_enc_get(str2))
165 return enc;
166 return NULL;
167 }
168
169 /*
170 * call-seq:
171 * obj.encoding => encoding
172 *
173 * Returns the Encoding object that represents the encoding of obj.
174 */
175
176 VALUE
d5fb09e8 » Laurent Sansonetti
2009-03-22 fixed multiple assignment where the left side does not exist + misc f…
177 rb_obj_encoding(VALUE obj, SEL sel)
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
178 {
179 rb_encoding *enc = rb_enc_get(obj);
180 if (!enc) {
181 rb_raise(rb_eTypeError, "unknown encoding");
182 }
183 return rb_enc_from_encoding(enc);
184 }
185
186 /*
187 * call-seq:
188 * enc.inspect => string
189 *
190 * Returns a string which represents the encoding for programmers.
191 *
192 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
193 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
194 */
195 static VALUE
196 enc_inspect(VALUE self, SEL sel)
197 {
198 char buffer[512];
199 VALUE enc_name;
200 long n;
201
202 enc_name = (VALUE)CFStringGetNameOfEncoding(rb_enc_to_enc(self));
203
204 n = snprintf(buffer, sizeof buffer, "#<%s:%s>", rb_obj_classname(self),
205 RSTRING_PTR(enc_name));
206
207 return rb_str_new(buffer, n);
208 }
209
210 /*
211 * call-seq:
212 * enc.name => string
213 *
214 * Returns the name of the encoding.
215 *
216 * Encoding::UTF_8.name => "UTF-8"
217 */
218 static VALUE
219 enc_name(VALUE self, SEL sel)
220 {
023dd4df » Laurent Sansonetti
2009-06-08 fixed Encoding#name for 10.6
221 return rb_enc_name2(rb_enc_to_enc_ptr(self));
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
222 }
223
224 static VALUE
225 enc_base_encoding(VALUE self, SEL sel)
226 {
227 return rb_attr_get(self, id_base_encoding);
228 }
229
230 /*
231 * call-seq:
232 * Encoding.list => [enc1, enc2, ...]
233 *
234 * Returns the list of loaded encodings.
235 *
236 * Encoding.list
237 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
238 * #<Encoding:ISO-2022-JP (dummy)>]
239 *
240 * Encoding.find("US-ASCII")
241 * => #<Encoding:US-ASCII>
242 *
243 * Encoding.list
244 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
245 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
246 *
247 */
248 static VALUE
249 enc_list(VALUE klass, SEL sel)
250 {
251 VALUE ary;
252 const CFStringEncoding *e;
253
254 ary = rb_ary_new();
255 e = CFStringGetListOfAvailableEncodings();
256 while (e != NULL && *e != kCFStringEncodingInvalidId) {
257 rb_ary_push(ary, enc_make(e));
258 e++;
259 }
260 return ary;
261 }
262
263 /*
264 * call-seq:
265 * Encoding.find(string) => enc
266 * Encoding.find(symbol) => enc
267 *
268 * Search the encoding with specified <i>name</i>.
269 * <i>name</i> should be a string or symbol.
270 *
271 * Encoding.find("US-ASCII") => #<Encoding:US-ASCII>
272 * Encoding.find(:Shift_JIS) => #<Encoding:Shift_JIS>
273 *
274 */
275 static VALUE
276 enc_find2(VALUE enc)
277 {
278 CFStringRef str;
279 CFStringEncoding e;
280
281 str = (CFStringRef)StringValue(enc);
282 if (CFStringCompare(str, CFSTR("ASCII-8BIT"),
283 kCFCompareCaseInsensitive) == 0) {
284 str = CFSTR("ASCII");
285 }
286 else if (CFStringCompare(str, CFSTR("SJIS"),
287 kCFCompareCaseInsensitive) == 0) {
288 str = CFSTR("Shift-JIS");
289 }
290
291 e = CFStringConvertIANACharSetNameToEncoding(str);
292 if (e == kCFStringEncodingInvalidId)
293 return Qnil;
294 return enc_make(&e);
295 }
296
297 static VALUE
298 enc_find(VALUE klass, SEL sel, VALUE enc)
299 {
300 VALUE e = enc_find2(enc);
28f110be » Laurent Sansonetti
2009-03-16 removed the previous bytestring code and now use the new one
301 if (e == Qnil) {
302 rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc));
303 }
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
304 return e;
305 }
306
307 /*
308 * call-seq:
309 * Encoding.compatible?(str1, str2) => enc or nil
310 *
311 * Checks the compatibility of two strings.
312 * If they are compatible (i.e. they can be concatenated),
313 * returns the encoding that the concatenated string will have.
314 * If they are not compatible, nil is returned.
315 *
316 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
317 * => #<Encoding:ISO-8859-1>
318 *
319 * Encoding.compatible?(
320 * "\xa1".force_encoding("iso-8859-1"),
321 * "\xa1\xa1".force_encoding("euc-jp"))
322 * => nil
323 *
324 */
325 static VALUE
326 enc_compatible_p(VALUE klass, SEL sel, VALUE str1, VALUE str2)
327 {
328 rb_encoding *enc = rb_enc_compatible(str1, str2);
329 VALUE encoding = Qnil;
330 if (!enc || !(encoding = rb_enc_from_encoding(enc)))
331 encoding = Qnil;
332 return encoding;
333 }
334
335 /* :nodoc: */
336 static VALUE
337 enc_dump(VALUE self, SEL sel, int argc, VALUE *argv)
338 {
339 rb_scan_args(argc, argv, "01", 0);
340 return enc_name(self, 0);
341 }
342
343 /* :nodoc: */
344 static VALUE
345 enc_load(VALUE klass, SEL sel, VALUE str)
346 {
347 return enc_find(klass, 0, str);
348 }
349
350 static rb_encoding *default_external;
351
352 rb_encoding *
353 rb_default_external_encoding(void)
354 {
355 return default_external;
356 }
357
358 VALUE
359 rb_enc_default_external(void)
360 {
361 return enc_make(default_external);
362 }
363
364 /*
365 * call-seq:
366 * Encoding.default_external => enc
367 *
368 * Returns default external encoding.
369 *
370 * It is initialized by the locale or -E option.
371 */
372 static VALUE
373 get_default_external(VALUE klass, SEL sel)
374 {
375 return rb_enc_default_external();
376 }
377
16235323 » Laurent Sansonetti
2009-11-10 added Encoding#default_external= and Encoding#default_internal= which…
378 static VALUE
379 set_default_external(VALUE klass, SEL sel, VALUE enc)
380 {
381 // TODO
382 return enc;
383 }
384
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
385 void
386 rb_enc_set_default_external(VALUE encoding)
387 {
388 default_external = rb_enc_to_enc_ptr(encoding);
389 }
390
391 /*
392 * call-seq:
393 * Encoding.locale_charmap => string
394 *
395 * Returns the locale charmap name.
396 *
397 * Debian GNU/Linux
398 * LANG=C
399 * Encoding.locale_charmap => "ANSI_X3.4-1968"
400 * LANG=ja_JP.EUC-JP
401 * Encoding.locale_charmap => "EUC-JP"
402 *
403 * SunOS 5
404 * LANG=C
405 * Encoding.locale_charmap => "646"
406 * LANG=ja
407 * Encoding.locale_charmap => "eucJP"
408 *
409 */
410 static VALUE
411 rb_locale_charmap(VALUE klass, SEL sel)
412 {
413 CFStringEncoding enc = CFStringGetSystemEncoding();
414 return (VALUE)CFStringConvertEncodingToIANACharSetName(enc);
415 }
416
417 /*
418 * call-seq:
419 * Encoding.name_list => ["enc1", "enc2", ...]
420 *
421 * Returns the list of available encoding names.
422 *
423 * Encoding.name_list
424 * => ["US-ASCII", "ASCII-8BIT", "UTF-8",
425 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
426 * "Windows-31J",
427 * "BINARY", "CP932", "eucJP"]
428 *
429 * This list doesn't include dummy encodings.
430 *
431 */
432
433 static VALUE
434 rb_enc_name_list(VALUE klass, SEL sel)
435 {
436 VALUE ary, list;
437 long i, count;
438
439 ary = rb_ary_new();
440 list = enc_list(klass, 0);
441 for (i = 0, count = RARRAY_LEN(list); i < count; i++) {
442 rb_ary_push(ary, enc_name(RARRAY_AT(list, i), 0));
443 }
444 return ary;
445 }
446
447 /*
448 * call-seq:
449 * Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
450 *
451 * Returns the hash of available encoding alias and original encoding name.
452 *
453 * Encoding.aliases
454 * => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
455 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
456 *
457 */
458
459 static VALUE
460 rb_enc_aliases(VALUE klass, SEL sel)
461 {
462 /* TODO: the CFString IANA <-> charset code does support aliases, we should
463 * find a way to return them here.
464 */
465 return rb_hash_new();
466 }
467
468 VALUE
469 rb_enc_name2(rb_encoding *enc)
470 {
023dd4df » Laurent Sansonetti
2009-06-08 fixed Encoding#name for 10.6
471 if (enc != NULL) {
472 CFStringRef str = CFStringConvertEncodingToIANACharSetName(*enc);
473 if (str != NULL) {
474 VALUE name = rb_str_dup((VALUE)str);
475 CFStringUppercase((CFMutableStringRef)name, NULL);
476 return name;
477 }
478 }
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
479 return Qnil;
480 }
481
482 const char *
483 rb_enc_name(rb_encoding *enc)
484 {
485 VALUE str = rb_enc_name2(enc);
486 return str == Qnil ? NULL : RSTRING_PTR(str);
487 }
488
489 long
490 rb_enc_mbminlen(rb_encoding *enc)
491 {
492 return rb_enc_mbmaxlen(enc);
493 }
494
495 long
496 rb_enc_mbmaxlen(rb_encoding *enc)
497 {
498 return enc == NULL
499 ? 1 : CFStringGetMaximumSizeForEncoding(1, *enc);
500 }
501
502 rb_encoding *
503 rb_enc_find(const char *name)
504 {
505 return rb_enc_find2(rb_str_new2(name));
506 }
507
508 rb_encoding *
509 rb_enc_find2(VALUE name)
510 {
511 VALUE e = enc_find2(name);
512 return e == Qnil ? NULL : rb_enc_to_enc_ptr(e);
513 }
514
515 rb_encoding *
516 rb_enc_get(VALUE obj)
517 {
022cd7ca » Laurent Sansonetti
2009-06-05 fixed ByteString#encoding to always return US_ASCII (for now)
518 CFStringEncoding enc = kCFStringEncodingInvalidId;
519
520 switch (TYPE(obj)) {
521 case T_STRING:
522 enc = *(VALUE *)obj == rb_cByteString
523 ? kCFStringEncodingASCII
524 : CFStringGetFastestEncoding((CFStringRef)obj);
525 break;
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
526 }
022cd7ca » Laurent Sansonetti
2009-06-05 fixed ByteString#encoding to always return US_ASCII (for now)
527
528 if (enc == kCFStringEncodingInvalidId) {
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
529 return NULL;
530 }
022cd7ca » Laurent Sansonetti
2009-06-05 fixed ByteString#encoding to always return US_ASCII (for now)
531 return rb_enc_to_enc_ptr(enc_make(&enc));
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
532 }
533
534 rb_encoding *
535 rb_locale_encoding(void)
536 {
537 CFStringEncoding enc = CFStringGetSystemEncoding();
538 return rb_enc_to_enc_ptr(enc_make(&enc));
539 }
540
541 void
542 Init_Encoding(void)
543 {
544 id_base_encoding = rb_intern("#base_encoding");
545
546 rb_cEncoding = rb_define_class("Encoding", rb_cObject);
547 rb_undef_alloc_func(rb_cEncoding);
548 rb_objc_define_method(rb_cEncoding, "to_s", enc_name, 0);
549 rb_objc_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
550 rb_objc_define_method(rb_cEncoding, "name", enc_name, 0);
551 rb_objc_define_method(rb_cEncoding, "base_encoding", enc_base_encoding, 0);
552 rb_objc_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
553 rb_objc_define_method(*(VALUE *)rb_cEncoding, "list", enc_list, 0);
554 rb_objc_define_method(*(VALUE *)rb_cEncoding, "name_list", rb_enc_name_list, 0);
555 rb_objc_define_method(*(VALUE *)rb_cEncoding, "aliases", rb_enc_aliases, 0);
556 rb_objc_define_method(*(VALUE *)rb_cEncoding, "find", enc_find, 1);
557 rb_objc_define_method(*(VALUE *)rb_cEncoding, "compatible?", enc_compatible_p, 2);
558
559 rb_objc_define_method(rb_cEncoding, "_dump", enc_dump, -1);
560 rb_objc_define_method(*(VALUE *)rb_cEncoding, "_load", enc_load, 1);
561
562 rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_external", get_default_external, 0);
16235323 » Laurent Sansonetti
2009-11-10 added Encoding#default_external= and Encoding#default_internal= which…
563 rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_external=", set_default_external, 1);
e7fabe72 » Laurent Sansonetti
2009-08-29 creates Encoding.default_internal as a temporary shortcut to Encoding…
564 rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_internal", get_default_external, 0); // TODO
16235323 » Laurent Sansonetti
2009-11-10 added Encoding#default_external= and Encoding#default_internal= which…
565 rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_internal=", set_default_external, 1); // TODO
9c1d2307 » Laurent Sansonetti
2009-03-11 committing experimental branch content
566 rb_objc_define_method(*(VALUE *)rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
567
568 enc_init_db();
569 }
570
571 /* locale insensitive functions */
572
573 #define ctype_test(c, ctype) \
574 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
575
576 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); }
577 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); }
578 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); }
579 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); }
580 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); }
581 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); }
582 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); }
583 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); }
584 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); }
585 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); }
586 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); }
587 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); }
588
/* Lowercases `c` via the ASCII table; values that fail rb_isascii() are
 * returned unchanged. */
int
rb_tolower(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
}
594
/* Uppercases `c` via the ASCII table; values that fail rb_isascii() are
 * returned unchanged. */
int
rb_toupper(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_UPPER_CASE(c);
}
600
Something went wrong with that request. Please try again.