Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 573 lines (505 sloc) 13.401 kb
9c1d230 committing experimental branch content
Laurent Sansonetti authored
1 /*
2 * MacRuby implementation of Ruby 1.9's encoding.c.
3 *
4 * This file is covered by the Ruby license. See COPYING for more details.
5 *
6 * Copyright (C) 2007-2008, Apple Inc. All rights reserved.
7 * Copyright (C) 1993-2007 Yukihiro Matsumoto
8 * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
9 * Copyright (C) 2000 Information-technology Promotion Agency, Japan
10 */
11
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #include "regenc.h"
15 #include <ctype.h>
16 #ifdef HAVE_LANGINFO_H
17 #include <langinfo.h>
18 #endif
19
20 static ID id_encoding, id_base_encoding;
21 VALUE rb_cEncoding;
22
23 static CFMutableDictionaryRef __encodings = NULL;
24
25 static VALUE
26 enc_new(const CFStringEncoding *enc)
27 {
28 return Data_Wrap_Struct(rb_cEncoding, NULL, NULL, (void *)enc);
29 }
30
31 static void
32 enc_init_db(void)
33 {
34 const CFStringEncoding *e;
35
1a1a935 fixed IO regressions + other bugs
Laurent Sansonetti authored
36 __encodings = CFDictionaryCreateMutable(NULL, 0, NULL, &kCFTypeDictionaryValueCallBacks);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
37
38 /* XXX CFStringGetListOfAvailableEncodings() is a costly call and should
39 * be called on demand and not by default when the interpreter starts.
40 */
41 e = CFStringGetListOfAvailableEncodings();
42 while (e != NULL && *e != kCFStringEncodingInvalidId) {
43 VALUE iana;
44 VALUE encoding;
45
46 encoding = enc_new(e);
47
48 iana = (VALUE)CFStringConvertEncodingToIANACharSetName(*e);
49 if (iana != 0) {
50 const char *name;
51 char *p;
52
53 name = RSTRING_PTR(iana);
54 p = strchr(name, '-');
55 if ((p = strchr(name, '-')) != NULL
56 || islower(*name)) {
57 char *tmp = alloca(strlen(name));
58 strcpy(tmp, name);
59 if (p != NULL) {
60 p = tmp + (p - name);
61 do {
62 *p = '_';
63 p++;
64 p = strchr(p, '-');
65 }
66 while (p != NULL);
67 }
68 if (islower(*tmp))
69 *tmp = toupper(*tmp);
70 name = tmp;
71 }
72 rb_define_const(rb_cEncoding, name, encoding);
73 }
74 CFDictionarySetValue(__encodings, (const void *)iana,
75 (const void *)encoding);
76 e++;
77 }
78
79 assert(CFDictionaryGetCount((CFDictionaryRef)__encodings) > 0);
80 }
81
82 static VALUE
83 enc_make(const CFStringEncoding *enc)
84 {
85 VALUE iana, v;
86
87 assert(enc != NULL);
88 iana = (VALUE)CFStringConvertEncodingToIANACharSetName(*enc);
89 v = (VALUE)CFDictionaryGetValue((CFDictionaryRef)__encodings,
90 (const void *)iana);
91 assert(v != 0);
92 return v;
93 }
94
95 VALUE
96 rb_enc_from_encoding(rb_encoding *enc)
97 {
98 return enc_make(enc);
99 }
100
101 static inline CFStringEncoding
102 rb_enc_to_enc(VALUE v)
103 {
104 return *(CFStringEncoding *)DATA_PTR(v);
105 }
106
107 static inline CFStringEncoding *
108 rb_enc_to_enc_ptr(VALUE v)
109 {
110 return (CFStringEncoding *)DATA_PTR(v);
111 }
112
113 rb_encoding *
114 rb_to_encoding(VALUE v)
115 {
116 if (TYPE(v) == T_STRING)
117 return rb_enc_find2(v);
118 return rb_enc_to_enc_ptr(v);
119 }
120
121 /*
122 * call-seq:
123 * enc.dummy? => true or false
124 *
125 * Returns true for dummy encodings.
126 * A dummy encoding is an encoding for which character handling is not properly
127 * implemented.
128 * It is used for stateful encodings.
129 *
130 * Encoding::ISO_2022_JP.dummy? #=> true
131 * Encoding::UTF_8.dummy? #=> false
132 *
133 */
134 static VALUE
135 enc_dummy_p(VALUE enc, SEL sel)
136 {
137 return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse;
138 }
139
140 ID
141 rb_id_encoding(void)
142 {
143 if (!id_encoding) {
144 id_encoding = rb_intern("encoding");
145 }
146 return id_encoding;
147 }
148
149 rb_encoding*
150 rb_enc_compatible(VALUE str1, VALUE str2)
151 {
152 /* TODO */
153 rb_encoding *enc = rb_enc_get(str1);
154 if (enc == rb_enc_get(str2))
155 return enc;
156 return NULL;
157 }
158
159 /*
160 * call-seq:
161 * obj.encoding => encoding
162 *
163 * Returns the Encoding object that represents the encoding of obj.
164 */
165
166 VALUE
167 rb_obj_encoding(VALUE obj)
168 {
169 rb_encoding *enc = rb_enc_get(obj);
170 if (!enc) {
171 rb_raise(rb_eTypeError, "unknown encoding");
172 }
173 return rb_enc_from_encoding(enc);
174 }
175
176 /*
177 * call-seq:
178 * enc.inspect => string
179 *
180 * Returns a string which represents the encoding for programmers.
181 *
182 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
183 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
184 */
185 static VALUE
186 enc_inspect(VALUE self, SEL sel)
187 {
188 char buffer[512];
189 VALUE enc_name;
190 long n;
191
192 enc_name = (VALUE)CFStringGetNameOfEncoding(rb_enc_to_enc(self));
193
194 n = snprintf(buffer, sizeof buffer, "#<%s:%s>", rb_obj_classname(self),
195 RSTRING_PTR(enc_name));
196
197 return rb_str_new(buffer, n);
198 }
199
200 /*
201 * call-seq:
202 * enc.name => string
203 *
204 * Returns the name of the encoding.
205 *
206 * Encoding::UTF_8.name => "UTF-8"
207 */
208 static VALUE
209 enc_name(VALUE self, SEL sel)
210 {
211 return (VALUE)CFStringConvertEncodingToIANACharSetName(rb_enc_to_enc(self));
212 }
213
214 static VALUE
215 enc_base_encoding(VALUE self, SEL sel)
216 {
217 return rb_attr_get(self, id_base_encoding);
218 }
219
220 /*
221 * call-seq:
222 * Encoding.list => [enc1, enc2, ...]
223 *
224 * Returns the list of loaded encodings.
225 *
226 * Encoding.list
227 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
228 * #<Encoding:ISO-2022-JP (dummy)>]
229 *
230 * Encoding.find("US-ASCII")
231 * => #<Encoding:US-ASCII>
232 *
233 * Encoding.list
234 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
235 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
236 *
237 */
238 static VALUE
239 enc_list(VALUE klass, SEL sel)
240 {
241 VALUE ary;
242 const CFStringEncoding *e;
243
244 ary = rb_ary_new();
245 e = CFStringGetListOfAvailableEncodings();
246 while (e != NULL && *e != kCFStringEncodingInvalidId) {
247 rb_ary_push(ary, enc_make(e));
248 e++;
249 }
250 return ary;
251 }
252
253 /*
254 * call-seq:
255 * Encoding.find(string) => enc
256 * Encoding.find(symbol) => enc
257 *
258 * Search the encoding with specified <i>name</i>.
259 * <i>name</i> should be a string or symbol.
260 *
261 * Encoding.find("US-ASCII") => #<Encoding:US-ASCII>
262 * Encoding.find(:Shift_JIS) => #<Encoding:Shift_JIS>
263 *
264 */
265 static VALUE
266 enc_find2(VALUE enc)
267 {
268 CFStringRef str;
269 CFStringEncoding e;
270
271 str = (CFStringRef)StringValue(enc);
272 if (CFStringCompare(str, CFSTR("ASCII-8BIT"),
273 kCFCompareCaseInsensitive) == 0) {
274 str = CFSTR("ASCII");
275 }
276 else if (CFStringCompare(str, CFSTR("SJIS"),
277 kCFCompareCaseInsensitive) == 0) {
278 str = CFSTR("Shift-JIS");
279 }
280
281 e = CFStringConvertIANACharSetNameToEncoding(str);
282 if (e == kCFStringEncodingInvalidId)
283 return Qnil;
284 return enc_make(&e);
285 }
286
287 static VALUE
288 enc_find(VALUE klass, SEL sel, VALUE enc)
289 {
290 VALUE e = enc_find2(enc);
291 if (e == Qnil)
292 rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_BYTEPTR(enc));
293 return e;
294 }
295
296 /*
297 * call-seq:
298 * Encoding.compatible?(str1, str2) => enc or nil
299 *
300 * Checks the compatibility of two strings.
301 * If they are compatible, means concatenatable,
302 * returns an encoding which the concatinated string will be.
303 * If they are not compatible, nil is returned.
304 *
305 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
306 * => #<Encoding:ISO-8859-1>
307 *
308 * Encoding.compatible?(
309 * "\xa1".force_encoding("iso-8859-1"),
310 * "\xa1\xa1".force_encoding("euc-jp"))
311 * => nil
312 *
313 */
314 static VALUE
315 enc_compatible_p(VALUE klass, SEL sel, VALUE str1, VALUE str2)
316 {
317 rb_encoding *enc = rb_enc_compatible(str1, str2);
318 VALUE encoding = Qnil;
319 if (!enc || !(encoding = rb_enc_from_encoding(enc)))
320 encoding = Qnil;
321 return encoding;
322 }
323
324 /* :nodoc: */
325 static VALUE
326 enc_dump(VALUE self, SEL sel, int argc, VALUE *argv)
327 {
328 rb_scan_args(argc, argv, "01", 0);
329 return enc_name(self, 0);
330 }
331
332 /* :nodoc: */
333 static VALUE
334 enc_load(VALUE klass, SEL sel, VALUE str)
335 {
336 return enc_find(klass, 0, str);
337 }
338
339 static rb_encoding *default_external;
340
341 rb_encoding *
342 rb_default_external_encoding(void)
343 {
344 return default_external;
345 }
346
347 VALUE
348 rb_enc_default_external(void)
349 {
350 return enc_make(default_external);
351 }
352
353 /*
354 * call-seq:
355 * Encoding.default_external => enc
356 *
357 * Returns default external encoding.
358 *
359 * It is initialized by the locale or -E option.
360 */
361 static VALUE
362 get_default_external(VALUE klass, SEL sel)
363 {
364 return rb_enc_default_external();
365 }
366
367 void
368 rb_enc_set_default_external(VALUE encoding)
369 {
370 default_external = rb_enc_to_enc_ptr(encoding);
371 }
372
373 /*
374 * call-seq:
375 * Encoding.locale_charmap => string
376 *
377 * Returns the locale charmap name.
378 *
379 * Debian GNU/Linux
380 * LANG=C
381 * Encoding.locale_charmap => "ANSI_X3.4-1968"
382 * LANG=ja_JP.EUC-JP
383 * Encoding.locale_charmap => "EUC-JP"
384 *
385 * SunOS 5
386 * LANG=C
387 * Encoding.locale_charmap => "646"
388 * LANG=ja
389 * Encoding.locale_charmap => "eucJP"
390 *
391 */
392 static VALUE
393 rb_locale_charmap(VALUE klass, SEL sel)
394 {
395 CFStringEncoding enc = CFStringGetSystemEncoding();
396 return (VALUE)CFStringConvertEncodingToIANACharSetName(enc);
397 }
398
399 /*
400 * call-seq:
401 * Encoding.name_list => ["enc1", "enc2", ...]
402 *
403 * Returns the list of available encoding names.
404 *
405 * Encoding.name_list
406 * => ["US-ASCII", "ASCII-8BIT", "UTF-8",
407 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
408 * "Windows-31J",
409 * "BINARY", "CP932", "eucJP"]
410 *
411 * This list doesn't include dummy encodings.
412 *
413 */
414
415 static VALUE
416 rb_enc_name_list(VALUE klass, SEL sel)
417 {
418 VALUE ary, list;
419 long i, count;
420
421 ary = rb_ary_new();
422 list = enc_list(klass, 0);
423 for (i = 0, count = RARRAY_LEN(list); i < count; i++) {
424 rb_ary_push(ary, enc_name(RARRAY_AT(list, i), 0));
425 }
426 return ary;
427 }
428
429 /*
430 * call-seq:
431 * Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
432 *
433 * Returns the hash of available encoding alias and original encoding name.
434 *
435 * Encoding.aliases
436 * => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
437 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
438 *
439 */
440
441 static VALUE
442 rb_enc_aliases(VALUE klass, SEL sel)
443 {
444 /* TODO: the CFString IANA <-> charset code does support aliases, we should
445 * find a way to return them here.
446 */
447 return rb_hash_new();
448 }
449
450 VALUE
451 rb_enc_name2(rb_encoding *enc)
452 {
453 CFStringRef str;
454 if (enc != NULL
455 && (str = CFStringConvertEncodingToIANACharSetName(*enc)) != NULL)
456 return (VALUE)str;
457 return Qnil;
458 }
459
460 const char *
461 rb_enc_name(rb_encoding *enc)
462 {
463 VALUE str = rb_enc_name2(enc);
464 return str == Qnil ? NULL : RSTRING_PTR(str);
465 }
466
467 long
468 rb_enc_mbminlen(rb_encoding *enc)
469 {
470 return rb_enc_mbmaxlen(enc);
471 }
472
473 long
474 rb_enc_mbmaxlen(rb_encoding *enc)
475 {
476 return enc == NULL
477 ? 1 : CFStringGetMaximumSizeForEncoding(1, *enc);
478 }
479
480 rb_encoding *
481 rb_enc_find(const char *name)
482 {
483 return rb_enc_find2(rb_str_new2(name));
484 }
485
486 rb_encoding *
487 rb_enc_find2(VALUE name)
488 {
489 VALUE e = enc_find2(name);
490 return e == Qnil ? NULL : rb_enc_to_enc_ptr(e);
491 }
492
493 rb_encoding *
494 rb_enc_get(VALUE obj)
495 {
496 int type = TYPE(obj);
497 if (type == T_STRING) {
498 CFStringEncoding enc = CFStringGetFastestEncoding((CFStringRef)obj);
499 if (enc == kCFStringEncodingInvalidId)
500 return NULL;
501 return rb_enc_to_enc_ptr(enc_make(&enc));
502 }
503 else {
504 /* TODO */
505 return NULL;
506 }
507 }
508
509 rb_encoding *
510 rb_locale_encoding(void)
511 {
512 CFStringEncoding enc = CFStringGetSystemEncoding();
513 return rb_enc_to_enc_ptr(enc_make(&enc));
514 }
515
516 void
517 Init_Encoding(void)
518 {
519 id_base_encoding = rb_intern("#base_encoding");
520
521 rb_cEncoding = rb_define_class("Encoding", rb_cObject);
522 rb_undef_alloc_func(rb_cEncoding);
523 rb_objc_define_method(rb_cEncoding, "to_s", enc_name, 0);
524 rb_objc_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
525 rb_objc_define_method(rb_cEncoding, "name", enc_name, 0);
526 rb_objc_define_method(rb_cEncoding, "base_encoding", enc_base_encoding, 0);
527 rb_objc_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
528 rb_objc_define_method(*(VALUE *)rb_cEncoding, "list", enc_list, 0);
529 rb_objc_define_method(*(VALUE *)rb_cEncoding, "name_list", rb_enc_name_list, 0);
530 rb_objc_define_method(*(VALUE *)rb_cEncoding, "aliases", rb_enc_aliases, 0);
531 rb_objc_define_method(*(VALUE *)rb_cEncoding, "find", enc_find, 1);
532 rb_objc_define_method(*(VALUE *)rb_cEncoding, "compatible?", enc_compatible_p, 2);
533
534 rb_objc_define_method(rb_cEncoding, "_dump", enc_dump, -1);
535 rb_objc_define_method(*(VALUE *)rb_cEncoding, "_load", enc_load, 1);
536
537 rb_objc_define_method(*(VALUE *)rb_cEncoding, "default_external", get_default_external, 0);
538 rb_objc_define_method(*(VALUE *)rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
539
540 enc_init_db();
541 }
542
543 /* locale insensitive functions */
544
545 #define ctype_test(c, ctype) \
546 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
547
548 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); }
549 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); }
550 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); }
551 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); }
552 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); }
553 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); }
554 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); }
555 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); }
556 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); }
557 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); }
558 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); }
559 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); }
560
/* Lowercases `c` when rb_isascii() accepts it; other values pass through. */
int
rb_tolower(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
}
566
/* Uppercases `c` when rb_isascii() accepts it; other values pass through. */
int
rb_toupper(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_UPPER_CASE(c);
}
572
Something went wrong with that request. Please try again.