Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 567 lines (500 sloc) 12.929 kB
a25c222 merging from trunk
Laurent Sansonetti authored
1 /*
2 * MacRuby implementation of Ruby 1.9's encoding.c.
3 *
4 * This file is covered by the Ruby license. See COPYING for more details.
5 *
6 * Copyright (C) 2007-2008, Apple Inc. All rights reserved.
7 * Copyright (C) 1993-2007 Yukihiro Matsumoto
8 * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
9 * Copyright (C) 2000 Information-technology Promotion Agency, Japan
10 */
511dc44 initial import
Laurent Sansonetti authored
11
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #include "regenc.h"
15 #include <ctype.h>
16 #ifdef HAVE_LANGINFO_H
17 #include <langinfo.h>
18 #endif
19
20 static ID id_encoding, id_base_encoding;
4734ee1 merge with trunk
Laurent Sansonetti authored
21 VALUE rb_cEncoding;
511dc44 initial import
Laurent Sansonetti authored
22
fe192be merge from trunk
Laurent Sansonetti authored
23 static CFMutableDictionaryRef __encodings = NULL;
24
25 static VALUE
26 enc_new(const CFStringEncoding *enc)
27 {
28 return Data_Wrap_Struct(rb_cEncoding, NULL, NULL, (void *)enc);
29 }
30
31 static void
32 enc_init_db(void)
33 {
34 const CFStringEncoding *e;
35
36 __encodings = CFDictionaryCreateMutable(NULL, 0, NULL, NULL);
37
38 e = CFStringGetListOfAvailableEncodings();
39 while (e != NULL && *e != kCFStringEncodingInvalidId) {
40 VALUE iana;
41 VALUE encoding;
42
43 encoding = enc_new(e);
44
45 iana = (VALUE)CFStringConvertEncodingToIANACharSetName(*e);
46 if (iana != 0) {
47 const char *name;
48 char *p;
49
a25c222 merging from trunk
Laurent Sansonetti authored
50 name = RSTRING_PTR(iana);
fe192be merge from trunk
Laurent Sansonetti authored
51 p = strchr(name, '-');
52 if ((p = strchr(name, '-')) != NULL
53 || islower(*name)) {
54 char *tmp = alloca(strlen(name));
55 strcpy(tmp, name);
56 if (p != NULL) {
57 p = tmp + (p - name);
58 do {
59 *p = '_';
60 p++;
61 p = strchr(p, '-');
62 }
63 while (p != NULL);
64 }
65 if (islower(*tmp))
66 *tmp = toupper(*tmp);
67 name = tmp;
68 }
69 rb_define_const(rb_cEncoding, name, encoding);
70 }
71 CFDictionarySetValue(__encodings, (const void *)(*e),
72 (const void *)encoding);
73 e++;
74 }
75
76 assert(CFDictionaryGetCount((CFDictionaryRef)__encodings) > 0);
77 }
78
79 static VALUE
80 enc_make(const CFStringEncoding *enc)
81 {
82 VALUE v;
4734ee1 merge with trunk
Laurent Sansonetti authored
83 assert(enc != NULL);
fe192be merge from trunk
Laurent Sansonetti authored
84 v = (VALUE)CFDictionaryGetValue( (CFDictionaryRef)__encodings,
85 (const void *)(*enc));
86 assert(v != 0);
87 return v;
88 }
89
90 VALUE
91 rb_enc_from_encoding(rb_encoding *enc)
92 {
93 return enc_make(enc);
94 }
95
96 static inline CFStringEncoding
97 rb_enc_to_enc(VALUE v)
98 {
99 return *(CFStringEncoding *)DATA_PTR(v);
100 }
101
102 static inline CFStringEncoding *
103 rb_enc_to_enc_ptr(VALUE v)
104 {
105 return (CFStringEncoding *)DATA_PTR(v);
106 }
107
108 rb_encoding *
109 rb_to_encoding(VALUE v)
110 {
111 if (TYPE(v) == T_STRING)
112 return rb_enc_find2(v);
113 return rb_enc_to_enc_ptr(v);
114 }
115
511dc44 initial import
Laurent Sansonetti authored
116 /*
117 * call-seq:
118 * enc.dummy? => true or false
119 *
6537c7a merging with trunk
Laurent Sansonetti authored
120 * Returns true for dummy encodings.
121 * A dummy encoding is an encoding for which character handling is not properly
511dc44 initial import
Laurent Sansonetti authored
122 * implemented.
6537c7a merging with trunk
Laurent Sansonetti authored
123 * It is used for stateful encodings.
511dc44 initial import
Laurent Sansonetti authored
124 *
125 * Encoding::ISO_2022_JP.dummy? #=> true
126 * Encoding::UTF_8.dummy? #=> false
127 *
128 */
129 static VALUE
130 enc_dummy_p(VALUE enc)
131 {
132 return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse;
133 }
134
135 ID
136 rb_id_encoding(void)
137 {
138 if (!id_encoding) {
139 id_encoding = rb_intern("encoding");
140 }
141 return id_encoding;
142 }
143
144 rb_encoding*
145 rb_enc_compatible(VALUE str1, VALUE str2)
146 {
fe192be merge from trunk
Laurent Sansonetti authored
147 /* TODO */
148 rb_encoding *enc = rb_enc_get(str1);
149 if (enc == rb_enc_get(str2))
150 return enc;
151 return NULL;
511dc44 initial import
Laurent Sansonetti authored
152 }
153
154 /*
155 * call-seq:
156 * obj.encoding => encoding
157 *
158 * Returns the Encoding object that represents the encoding of obj.
159 */
160
161 VALUE
162 rb_obj_encoding(VALUE obj)
163 {
164 rb_encoding *enc = rb_enc_get(obj);
165 if (!enc) {
166 rb_raise(rb_eTypeError, "unknown encoding");
167 }
168 return rb_enc_from_encoding(enc);
169 }
170
171 /*
172 * call-seq:
173 * enc.inspect => string
174 *
175 * Returns a string which represents the encoding for programmers.
176 *
177 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
178 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
179 */
180 static VALUE
181 enc_inspect(VALUE self)
182 {
fe192be merge from trunk
Laurent Sansonetti authored
183 char buffer[512];
184 VALUE enc_name;
185 long n;
186
187 enc_name = (VALUE)CFStringGetNameOfEncoding(rb_enc_to_enc(self));
188
189 n = snprintf(buffer, sizeof buffer, "#<%s:%s>", rb_obj_classname(self),
a25c222 merging from trunk
Laurent Sansonetti authored
190 RSTRING_PTR(enc_name));
fe192be merge from trunk
Laurent Sansonetti authored
191
192 return rb_str_new(buffer, n);
511dc44 initial import
Laurent Sansonetti authored
193 }
194
195 /*
196 * call-seq:
197 * enc.name => string
198 *
199 * Returns the name of the encoding.
200 *
201 * Encoding::UTF_8.name => "UTF-8"
202 */
203 static VALUE
204 enc_name(VALUE self)
205 {
fe192be merge from trunk
Laurent Sansonetti authored
206 return (VALUE)CFStringConvertEncodingToIANACharSetName(rb_enc_to_enc(self));
511dc44 initial import
Laurent Sansonetti authored
207 }
208
209 static VALUE
210 enc_base_encoding(VALUE self)
211 {
212 return rb_attr_get(self, id_base_encoding);
213 }
214
215 /*
216 * call-seq:
217 * Encoding.list => [enc1, enc2, ...]
218 *
219 * Returns the list of loaded encodings.
220 *
221 * Encoding.list
222 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
223 * #<Encoding:ISO-2022-JP (dummy)>]
224 *
225 * Encoding.find("US-ASCII")
226 * => #<Encoding:US-ASCII>
227 *
228 * Encoding.list
229 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
230 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
231 *
232 */
233 static VALUE
234 enc_list(VALUE klass)
235 {
fe192be merge from trunk
Laurent Sansonetti authored
236 VALUE ary;
237 const CFStringEncoding *e;
238
239 ary = rb_ary_new();
240 e = CFStringGetListOfAvailableEncodings();
241 while (e != NULL && *e != kCFStringEncodingInvalidId) {
242 rb_ary_push(ary, enc_make(e));
243 e++;
244 }
511dc44 initial import
Laurent Sansonetti authored
245 return ary;
246 }
247
248 /*
249 * call-seq:
250 * Encoding.find(string) => enc
251 * Encoding.find(symbol) => enc
252 *
253 * Searches for the encoding with the specified <i>name</i>.
254 * <i>name</i> should be a string or symbol.
255 *
256 * Encoding.find("US-ASCII") => #<Encoding:US-ASCII>
257 * Encoding.find(:Shift_JIS) => #<Encoding:Shift_JIS>
258 *
259 */
260 static VALUE
fe192be merge from trunk
Laurent Sansonetti authored
261 enc_find2(VALUE enc)
262 {
263 CFStringRef str;
264 CFStringEncoding e;
265
266 str = (CFStringRef)StringValue(enc);
267 if (CFStringCompare(str, CFSTR("ASCII-8BIT"),
268 kCFCompareCaseInsensitive) == 0) {
269 str = CFSTR("ASCII");
270 }
271 else if (CFStringCompare(str, CFSTR("SJIS"),
272 kCFCompareCaseInsensitive) == 0) {
273 str = CFSTR("Shift-JIS");
274 }
275
276 e = CFStringConvertIANACharSetNameToEncoding(str);
277 if (e == kCFStringEncodingInvalidId)
278 return Qnil;
279 return enc_make(&e);
280 }
281
282 static VALUE
511dc44 initial import
Laurent Sansonetti authored
283 enc_find(VALUE klass, VALUE enc)
284 {
fe192be merge from trunk
Laurent Sansonetti authored
285 VALUE e = enc_find2(enc);
286 if (e == Qnil)
a25c222 merging from trunk
Laurent Sansonetti authored
287 rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_BYTEPTR(enc));
fe192be merge from trunk
Laurent Sansonetti authored
288 return e;
511dc44 initial import
Laurent Sansonetti authored
289 }
290
291 /*
292 * call-seq:
293 * Encoding.compatible?(str1, str2) => enc or nil
294 *
295 * Checks the compatibility of two strings.
296 * If they are compatible, meaning they can be concatenated,
297 * returns the encoding that the concatenated string will have.
298 * If they are not compatible, nil is returned.
299 *
300 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
301 * => #<Encoding:ISO-8859-1>
302 *
303 * Encoding.compatible?(
304 * "\xa1".force_encoding("iso-8859-1"),
305 * "\xa1\xa1".force_encoding("euc-jp"))
306 * => nil
307 *
308 */
309 static VALUE
310 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
311 {
312 rb_encoding *enc = rb_enc_compatible(str1, str2);
313 VALUE encoding = Qnil;
314 if (!enc || !(encoding = rb_enc_from_encoding(enc)))
315 encoding = Qnil;
316 return encoding;
317 }
318
319 /* :nodoc: */
320 static VALUE
321 enc_dump(int argc, VALUE *argv, VALUE self)
322 {
323 rb_scan_args(argc, argv, "01", 0);
324 return enc_name(self);
325 }
326
327 /* :nodoc: */
328 static VALUE
329 enc_load(VALUE klass, VALUE str)
330 {
331 return enc_find(klass, str);
332 }
333
fe192be merge from trunk
Laurent Sansonetti authored
334 static rb_encoding *default_external;
335
336 rb_encoding *
337 rb_default_external_encoding(void)
338 {
339 return default_external;
340 }
341
342 VALUE
343 rb_enc_default_external(void)
344 {
345 return enc_make(default_external);
346 }
511dc44 initial import
Laurent Sansonetti authored
347
348 /*
349 * call-seq:
350 * Encoding.default_external => enc
351 *
352 * Returns default external encoding.
353 *
354 * It is initialized by the locale or -E option.
355 */
356 static VALUE
357 get_default_external(VALUE klass)
358 {
359 return rb_enc_default_external();
360 }
361
362 void
363 rb_enc_set_default_external(VALUE encoding)
364 {
fe192be merge from trunk
Laurent Sansonetti authored
365 default_external = rb_enc_to_enc_ptr(encoding);
511dc44 initial import
Laurent Sansonetti authored
366 }
367
368 /*
369 * call-seq:
370 * Encoding.locale_charmap => string
371 *
372 * Returns the locale charmap name.
373 *
374 * Debian GNU/Linux
375 * LANG=C
376 * Encoding.locale_charmap => "ANSI_X3.4-1968"
377 * LANG=ja_JP.EUC-JP
378 * Encoding.locale_charmap => "EUC-JP"
379 *
380 * SunOS 5
381 * LANG=C
382 * Encoding.locale_charmap => "646"
383 * LANG=ja
384 * Encoding.locale_charmap => "eucJP"
385 *
386 */
387 VALUE
388 rb_locale_charmap(VALUE klass)
389 {
fe192be merge from trunk
Laurent Sansonetti authored
390 CFStringEncoding enc = CFStringGetSystemEncoding();
391 return (VALUE)CFStringConvertEncodingToIANACharSetName(enc);
511dc44 initial import
Laurent Sansonetti authored
392 }
393
394 /*
395 * call-seq:
396 * Encoding.name_list => ["enc1", "enc2", ...]
397 *
398 * Returns the list of available encoding names.
399 *
400 * Encoding.name_list
401 * => ["US-ASCII", "ASCII-8BIT", "UTF-8",
402 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
403 * "Windows-31J",
404 * "BINARY", "CP932", "eucJP"]
405 *
406 * This list doesn't include dummy encodings.
407 *
408 */
409
410 static VALUE
411 rb_enc_name_list(VALUE klass)
412 {
fe192be merge from trunk
Laurent Sansonetti authored
413 VALUE ary, list;
414 long i, count;
415
416 ary = rb_ary_new();
417 list = enc_list(klass);
418 for (i = 0, count = RARRAY_LEN(list); i < count; i++)
419 rb_ary_push(ary, enc_name(RARRAY_AT(list, i)));
511dc44 initial import
Laurent Sansonetti authored
420 return ary;
421 }
422
423 /*
424 * call-seq:
425 * Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
426 *
427 * Returns the hash of available encoding alias and original encoding name.
428 *
429 * Encoding.aliases
430 * => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
431 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
432 *
433 */
434
435 static VALUE
436 rb_enc_aliases(VALUE klass)
437 {
fe192be merge from trunk
Laurent Sansonetti authored
438 /* TODO: the CFString IANA <-> charset code does support aliases, we should
439 * find a way to return them here.
440 */
441 return rb_hash_new();
442 }
443
444 VALUE
445 rb_enc_name2(rb_encoding *enc)
446 {
447 CFStringRef str;
448 if (enc != NULL
449 && (str = CFStringConvertEncodingToIANACharSetName(*enc)) != NULL)
450 return (VALUE)str;
451 return Qnil;
452 }
453
454 const char *
455 rb_enc_name(rb_encoding *enc)
456 {
457 VALUE str = rb_enc_name2(enc);
a25c222 merging from trunk
Laurent Sansonetti authored
458 return str == Qnil ? NULL : RSTRING_PTR(str);
fe192be merge from trunk
Laurent Sansonetti authored
459 }
460
461 long
462 rb_enc_mbminlen(rb_encoding *enc)
463 {
464 return rb_enc_mbmaxlen(enc);
465 }
466
467 long
468 rb_enc_mbmaxlen(rb_encoding *enc)
469 {
470 return enc == NULL
471 ? 1 : CFStringGetMaximumSizeForEncoding(1, *enc);
472 }
473
474 rb_encoding *
475 rb_enc_find(const char *name)
476 {
477 return rb_enc_find2(rb_str_new2(name));
478 }
479
480 rb_encoding *
481 rb_enc_find2(VALUE name)
482 {
483 VALUE e = enc_find2(name);
484 return e == Qnil ? NULL : rb_enc_to_enc_ptr(e);
485 }
486
487 rb_encoding *
488 rb_enc_get(VALUE obj)
489 {
490 int type = TYPE(obj);
491 if (type == T_STRING) {
492 CFStringEncoding enc = CFStringGetFastestEncoding((CFStringRef)obj);
493 if (enc == kCFStringEncodingInvalidId)
494 return NULL;
495 return rb_enc_to_enc_ptr(enc_make(&enc));
496 }
497 else {
498 /* TODO */
499 return NULL;
500 }
501 }
502
503 rb_encoding *
504 rb_locale_encoding(void)
505 {
506 CFStringEncoding enc = CFStringGetSystemEncoding();
507 return rb_enc_to_enc_ptr(enc_make(&enc));
511dc44 initial import
Laurent Sansonetti authored
508 }
509
510 void
511 Init_Encoding(void)
512 {
513 id_base_encoding = rb_intern("#base_encoding");
514
515 rb_cEncoding = rb_define_class("Encoding", rb_cObject);
516 rb_undef_alloc_func(rb_cEncoding);
517 rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
518 rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
519 rb_define_method(rb_cEncoding, "name", enc_name, 0);
520 rb_define_method(rb_cEncoding, "base_encoding", enc_base_encoding, 0);
521 rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
522 rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
523 rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
524 rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
525 rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
526 rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
527
528 rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
529 rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
530
531 rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
532 rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
533
534 enc_init_db();
535 }
536
537 /* locale insensitive functions */
538
539 #define ctype_test(c, ctype) \
540 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
541
542 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); }
543 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); }
544 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); }
545 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); }
546 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); }
547 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); }
548 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); }
549 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); }
550 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); }
551 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); }
552 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); }
553 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); }
554
/* Lowercases ASCII characters only; any other value is returned as is. */
int
rb_tolower(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
}
560
/* Uppercases ASCII characters only; any other value is returned as is. */
int
rb_toupper(int c)
{
    if (!rb_isascii(c)) {
        return c;
    }
    return ONIGENC_ASCII_CODE_TO_UPPER_CASE(c);
}
566
Something went wrong with that request. Please try again.