Skip to content
Newer
Older
100644 574 lines (506 sloc) 13.1 KB
9c1d230 committing experimental branch content
Laurent Sansonetti authored Mar 11, 2009
1 /*
2 * MacRuby implementation of Ruby 1.9's encoding.c.
3 *
4 * This file is covered by the Ruby license. See COPYING for more details.
5 *
6 * Copyright (C) 2007-2008, Apple Inc. All rights reserved.
7 * Copyright (C) 1993-2007 Yukihiro Matsumoto
8 * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
9 * Copyright (C) 2000 Information-technology Promotion Agency, Japan
10 */
11
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #include "regenc.h"
15 #include <ctype.h>
16 #ifdef HAVE_LANGINFO_H
17 #include <langinfo.h>
18 #endif
19
20 static ID id_encoding, id_base_encoding;
21 VALUE rb_cEncoding;
22
23 static CFMutableDictionaryRef __encodings = NULL;
24
25 static VALUE
26 enc_new(const CFStringEncoding *enc)
27 {
28 return Data_Wrap_Struct(rb_cEncoding, NULL, NULL, (void *)enc);
29 }
30
31 static void
32 enc_init_db(void)
33 {
34 const CFStringEncoding *e;
35
1a1a935 fixed IO regressions + other bugs
Laurent Sansonetti authored Mar 13, 2009
36 __encodings = CFDictionaryCreateMutable(NULL, 0, NULL, &kCFTypeDictionaryValueCallBacks);
9c1d230 committing experimental branch content
Laurent Sansonetti authored Mar 11, 2009
37
38 /* XXX CFStringGetListOfAvailableEncodings() is a costly call and should
39 * be called on demand and not by default when the interpreter starts.
40 */
41 e = CFStringGetListOfAvailableEncodings();
42 while (e != NULL && *e != kCFStringEncodingInvalidId) {
43 VALUE iana;
44 VALUE encoding;
45
46 encoding = enc_new(e);
47
48 iana = (VALUE)CFStringConvertEncodingToIANACharSetName(*e);
49 if (iana != 0) {
50 const char *name;
51 char *p;
52
53 name = RSTRING_PTR(iana);
54 p = strchr(name, '-');
55 if ((p = strchr(name, '-')) != NULL
56 || islower(*name)) {
57 char *tmp = alloca(strlen(name));
58 strcpy(tmp, name);
59 if (p != NULL) {
60 p = tmp + (p - name);
61 do {
62 *p = '_';
63 p++;
64 p = strchr(p, '-');
65 }
66 while (p != NULL);
67 }
68 if (islower(*tmp))
69 *tmp = toupper(*tmp);
70 name = tmp;
71 }
72 rb_define_const(rb_cEncoding, name, encoding);
73 }
74 CFDictionarySetValue(__encodings, (const void *)iana,
75 (const void *)encoding);
76 e++;
77 }
78
79 assert(CFDictionaryGetCount((CFDictionaryRef)__encodings) > 0);
80 }
81
82 static VALUE
83 enc_make(const CFStringEncoding *enc)
84 {
85 VALUE iana, v;
86
87 assert(enc != NULL);
88 iana = (VALUE)CFStringConvertEncodingToIANACharSetName(*enc);
89 v = (VALUE)CFDictionaryGetValue((CFDictionaryRef)__encodings,
90 (const void *)iana);
91 assert(v != 0);
92 return v;
93 }
94
95 VALUE
96 rb_enc_from_encoding(rb_encoding *enc)
97 {
98 return enc_make(enc);
99 }
100
101 static inline CFStringEncoding
102 rb_enc_to_enc(VALUE v)
103 {
104 return *(CFStringEncoding *)DATA_PTR(v);
105 }
106
107 static inline CFStringEncoding *
108 rb_enc_to_enc_ptr(VALUE v)
109 {
110 return (CFStringEncoding *)DATA_PTR(v);
111 }
112
113 rb_encoding *
114 rb_to_encoding(VALUE v)
115 {
116 if (TYPE(v) == T_STRING)
117 return rb_enc_find2(v);
118 return rb_enc_to_enc_ptr(v);
119 }
120
121 /*
122 * call-seq:
123 * enc.dummy? => true or false
124 *
125 * Returns true for dummy encodings.
126 * A dummy encoding is an encoding for which character handling is not properly
127 * implemented.
128 * It is used for stateful encodings.
129 *
130 * Encoding::ISO_2022_JP.dummy? #=> true
131 * Encoding::UTF_8.dummy? #=> false
132 *
133 */
134 static VALUE
135 enc_dummy_p(VALUE enc, SEL sel)
136 {
137 return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse;
138 }
139
140 ID
141 rb_id_encoding(void)
142 {
143 if (!id_encoding) {
144 id_encoding = rb_intern("encoding");
145 }
146 return id_encoding;
147 }
148
149 rb_encoding*
150 rb_enc_compatible(VALUE str1, VALUE str2)
151 {
152 /* TODO */
153 rb_encoding *enc = rb_enc_get(str1);
154 if (enc == rb_enc_get(str2))
155 return enc;
156 return NULL;
157 }
158
159 /*
160 * call-seq:
161 * obj.encoding => encoding
162 *
163 * Returns the Encoding object that represents the encoding of obj.
164 */
165
166 VALUE
d5fb09e fixed multiple assignment where the left side does not exist + misc f…
Laurent Sansonetti authored Mar 22, 2009
167 rb_obj_encoding(VALUE obj, SEL sel)
9c1d230 committing experimental branch content
Laurent Sansonetti authored Mar 11, 2009
168 {
169 rb_encoding *enc = rb_enc_get(obj);
170 if (!enc) {
171 rb_raise(rb_eTypeError, "unknown encoding");
172 }
173 return rb_enc_from_encoding(enc);
174 }
175
176 /*
177 * call-seq:
178 * enc.inspect => string
179 *
180 * Returns a string which represents the encoding for programmers.
181 *
182 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
183 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
184 */
185 static VALUE
186 enc_inspect(VALUE self, SEL sel)
187 {
188 char buffer[512];
189 VALUE enc_name;
190 long n;
191
192 enc_name = (VALUE)CFStringGetNameOfEncoding(rb_enc_to_enc(self));
193
194 n = snprintf(buffer, sizeof buffer, "#<%s:%s>", rb_obj_classname(self),
195 RSTRING_PTR(enc_name));
196
197 return rb_str_new(buffer, n);
198 }
199
200 /*
201 * call-seq:
202 * enc.name => string
203 *
204 * Returns the name of the encoding.
205 *
206 * Encoding::UTF_8.name => "UTF-8"
207 */
208 static VALUE
209 enc_name(VALUE self, SEL sel)
210 {
211 return (VALUE)CFStringConvertEncodingToIANACharSetName(rb_enc_to_enc(self));
212 }
213
214 static VALUE
215 enc_base_encoding(VALUE self, SEL sel)
216 {
217 return rb_attr_get(self, id_base_encoding);
218 }
219
220 /*
221 * call-seq:
222 * Encoding.list => [enc1, enc2, ...]
223 *
224 * Returns the list of loaded encodings.
225 *
226 * Encoding.list
227 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
228 * #<Encoding:ISO-2022-JP (dummy)>]
229 *
230 * Encoding.find("US-ASCII")
231 * => #<Encoding:US-ASCII>
232 *
233 * Encoding.list
234 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
235 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
236 *
237 */
238 static VALUE
239 enc_list(VALUE klass, SEL sel)
240 {
241 VALUE ary;
242 const CFStringEncoding *e;
243
244 ary = rb_ary_new();
245 e = CFStringGetListOfAvailableEncodings();
246 while (e != NULL && *e != kCFStringEncodingInvalidId) {
247 rb_ary_push(ary, enc_make(e));
248 e++;
249 }
250 return ary;
251 }
252
253 /*
254 * call-seq:
255 * Encoding.find(string) => enc
256 * Encoding.find(symbol) => enc
257 *
258 * Search the encoding with specified <i>name</i>.
259 * <i>name</i> should be a string or symbol.
260 *
261 * Encoding.find("US-ASCII") => #<Encoding:US-ASCII>
262 * Encoding.find(:Shift_JIS) => #<Encoding:Shift_JIS>
263 *
264 */
265 static VALUE
266 enc_find2(VALUE enc)
267 {
268 CFStringRef str;
269 CFStringEncoding e;
270
271 str = (CFStringRef)StringValue(enc);
272 if (CFStringCompare(str, CFSTR("ASCII-8BIT"),
273 kCFCompareCaseInsensitive) == 0) {
274 str = CFSTR("ASCII");
275 }
276 else if (CFStringCompare(str, CFSTR("SJIS"),
277 kCFCompareCaseInsensitive) == 0) {
278 str = CFSTR("Shift-JIS");
279 }
280
281 e = CFStringConvertIANACharSetNameToEncoding(str);
282 if (e == kCFStringEncodingInvalidId)
283 return Qnil;
284 return enc_make(&e);
285 }
286
287 static VALUE
288 enc_find(VALUE klass, SEL sel, VALUE enc)
289 {
290 VALUE e = enc_find2(enc);
28f110b removed the previous bytestring code and now use the new one
Laurent Sansonetti authored Mar 16, 2009
291 if (e == Qnil) {
292 rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc));
293 }
9c1d230 committing experimental branch content
Laurent Sansonetti authored Mar 11, 2009
294 return e;
295 }
296
297 /*
298 * call-seq:
299 * Encoding.compatible?(str1, str2) => enc or nil
300 *
301 * Checks the compatibility of two strings.
302 * If they are compatible, means concatenatable,
303 * returns an encoding which the concatinated string will be.
304 * If they are not compatible, nil is returned.
305 *
306 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
307 * => #<Encoding:ISO-8859-1>
308 *
309 * Encoding.compatible?(
310 * "\xa1".force_encoding("iso-8859-1"),
311 * "\xa1\xa1".force_encoding("euc-jp"))
312 * => nil
313 *
314 */
315 static VALUE
316 enc_compatible_p(VALUE klass, SEL sel, VALUE str1, VALUE str2)
317 {
318 rb_encoding *enc = rb_enc_compatible(str1, str2);
319 VALUE encoding = Qnil;
320 if (!enc || !(encoding = rb_enc_from_encoding(enc)))
321 encoding = Qnil;
322 return encoding;
323 }
324
325 /* :nodoc: */
326 static VALUE
327 enc_dump(VALUE self, SEL sel, int argc, VALUE *argv)
328 {
329 rb_scan_args(argc, argv, "01", 0);
330 return enc_name(self, 0);
331 }
332
333 /* :nodoc: */
334 static VALUE
335 enc_load(VALUE klass, SEL sel, VALUE str)
336 {
337 return enc_find(klass, 0, str);
338 }
339
340 static rb_encoding *default_external;
341
342 rb_encoding *
343 rb_default_external_encoding(void)
344 {
345 return default_external;
346 }
347
348 VALUE
349 rb_enc_default_external(void)
350 {
351 return enc_make(default_external);
352 }
353
354 /*
355 * call-seq:
356 * Encoding.default_external => enc
357 *
358 * Returns default external encoding.
359 *
360 * It is initialized by the locale or -E option.
361 */
362 static VALUE
363 get_default_external(VALUE klass, SEL sel)
364 {
365 return rb_enc_default_external();
366 }
367
368 void
369 rb_enc_set_default_external(VALUE encoding)
370 {
371 default_external = rb_enc_to_enc_ptr(encoding);
372 }
373
374 /*
375 * call-seq:
376 * Encoding.locale_charmap => string
377 *
378 * Returns the locale charmap name.
379 *
380 * Debian GNU/Linux
381 * LANG=C
382 * Encoding.locale_charmap => "ANSI_X3.4-1968"
383 * LANG=ja_JP.EUC-JP
384 * Encoding.locale_charmap => "EUC-JP"
385 *
386 * SunOS 5
387 * LANG=C
388 * Encoding.locale_charmap => "646"
389 * LANG=ja
390 * Encoding.locale_charmap => "eucJP"
391 *
392 */
393 static VALUE
394 rb_locale_charmap(VALUE klass, SEL sel)
395 {
396 CFStringEncoding enc = CFStringGetSystemEncoding();
397 return (VALUE)CFStringConvertEncodingToIANACharSetName(enc);
398 }
399
400 /*
401 * call-seq:
402 * Encoding.name_list => ["enc1", "enc2", ...]
403 *
404 * Returns the list of available encoding names.
405 *
406 * Encoding.name_list
407 * => ["US-ASCII", "ASCII-8BIT", "UTF-8",
408 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
409 * "Windows-31J",
410 * "BINARY", "CP932", "eucJP"]
411 *
412 * This list doesn't include dummy encodings.
413 *
414 */
415
416 static VALUE
417 rb_enc_name_list(VALUE klass, SEL sel)
418 {
419 VALUE ary, list;
420 long i, count;
421
422 ary = rb_ary_new();
423 list = enc_list(klass, 0);
424 for (i = 0, count = RARRAY_LEN(list); i < count; i++) {
425 rb_ary_push(ary, enc_name(RARRAY_AT(list, i), 0));
426 }
427 return ary;
428 }
429
430 /*
431 * call-seq:
432 * Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
433 *
434 * Returns the hash of available encoding alias and original encoding name.
435 *
436 * Encoding.aliases
437 * => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
438 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
439 *
440 */
441
442 static VALUE
443 rb_enc_aliases(VALUE klass, SEL sel)
444 {
445 /* TODO: the CFString IANA <-> charset code does support aliases, we should
446 * find a way to return them here.
447 */
448 return rb_hash_new();
449 }
450
451 VALUE
452 rb_enc_name2(rb_encoding *enc)
453 {
454 CFStringRef str;
455 if (enc != NULL
456 && (str = CFStringConvertEncodingToIANACharSetName(*enc)) != NULL)
457 return (VALUE)str;
458 return Qnil;
459 }
460
461 const char *
462 rb_enc_name(rb_encoding *enc)
463 {
464 VALUE str = rb_enc_name2(enc);
465 return str == Qnil ? NULL : RSTRING_PTR(str);
466 }
467
468 long
469 rb_enc_mbminlen(rb_encoding *enc)
470 {
471 return rb_enc_mbmaxlen(enc);
472 }
473
474 long
475 rb_enc_mbmaxlen(rb_encoding *enc)
476 {
477 return enc == NULL
478 ? 1 : CFStringGetMaximumSizeForEncoding(1, *enc);
479 }
480
481 rb_encoding *
482 rb_enc_find(const char *name)
483 {
484 return rb_enc_find2(rb_str_new2(name));
485 }
486
487 rb_encoding *
488 rb_enc_find2(VALUE name)
489 {
490 VALUE e = enc_find2(name);
491 return e == Qnil ? NULL : rb_enc_to_enc_ptr(e);
492 }
493
494 rb_encoding *
495 rb_enc_get(VALUE obj)
496 {
497 int type = TYPE(obj);
498 if (type == T_STRING) {
499 CFStringEncoding enc = CFStringGetFastestEncoding((CFStringRef)obj);
500 if (enc == kCFStringEncodingInvalidId)
501 return NULL;
502 return rb_enc_to_enc_ptr(enc_make(&enc));
503 }
504 else {
505 /* TODO */
506 return NULL;
507 }
508 }
509
510 rb_encoding *
511 rb_locale_encoding(void)
512 {
513 CFStringEncoding enc = CFStringGetSystemEncoding();
514 return rb_enc_to_enc_ptr(enc_make(&enc));
515 }
516
517 void
518 Init_Encoding(void)
519 {
520 id_base_encoding = rb_intern("#base_encoding");
521
522 rb_cEncoding = rb_define_class("Encoding", rb_cObject);
523 rb_undef_alloc_func(rb_cEncoding);
524 rb_objc_define_method(rb_cEncoding, "to_s", enc_name, 0);
525 rb_objc_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
526 rb_objc_define_method(rb_cEncoding, "name", enc_name, 0);
527 rb_objc_define_method(rb_cEncoding, "base_encoding", enc_base_encoding, 0);
528 rb_objc_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
529 rb_objc_define_method(*(VALUE *)rb_cEncoding, "list", enc_list, 0);
530 rb_objc_define_method(*(VALUE *)rb_cEncoding, "name_list", rb_enc_name_list, 0);
531 rb_objc_define_method(*(VALUE *)rb_cEncoding, "aliases", rb_enc_aliases, 0);
532 rb_objc_define_method(*(VALUE *)rb_cEncoding, "find", enc_find, 1);
533 rb_objc_define_method(*(VALUE *)rb_cEncoding, "compatible?", enc_compatible_p, 2);
534
535 rb_objc_define_method(rb_cEncoding, "_dump", enc_dump, -1);
536 rb_objc_define_method(*(VALUE *)rb_cEncoding, "_load", enc_load, 1);
537
538 rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_external", get_default_external, 0);
539 rb_objc_define_method(*(VALUE *)rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
540
541 enc_init_db();
542 }
543
544 /* locale insensitive functions */
545
546 #define ctype_test(c, ctype) \
547 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
548
549 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); }
550 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); }
551 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); }
552 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); }
553 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); }
554 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); }
555 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); }
556 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); }
557 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); }
558 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); }
559 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); }
560 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); }
561
/* Locale-insensitive tolower(): non-ASCII values are returned unchanged. */
int
rb_tolower(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
}
567
/* Locale-insensitive toupper(): non-ASCII values are returned unchanged. */
int
rb_toupper(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_UPPER_CASE(c);
}
573
Something went wrong with that request. Please try again.