Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Newer
Older
100644 1264 lines (1123 sloc) 28.622 kB
511dc44 initial import
Laurent Sansonetti authored
1 /**********************************************************************
2
3 encoding.c -
4
6537c7a merging with trunk
Laurent Sansonetti authored
5 $Author: nobu $
511dc44 initial import
Laurent Sansonetti authored
6 created at: Thu May 24 17:23:27 JST 2007
7
8 Copyright (C) 2007 Yukihiro Matsumoto
9
10 **********************************************************************/
11
12 #include "ruby/ruby.h"
13 #include "ruby/encoding.h"
14 #include "regenc.h"
15 #include <ctype.h>
16 #ifdef HAVE_LANGINFO_H
17 #include <langinfo.h>
18 #endif
19
20 static ID id_encoding, id_base_encoding;
21 static VALUE rb_cEncoding;
22
23 struct rb_encoding_entry {
24 const char *name;
25 rb_encoding *enc;
26 };
27
28 static struct {
29 struct rb_encoding_entry *list;
30 int count;
31 int size;
32 st_table *names;
33 } enc_table;
34
35 void rb_enc_init(void);
36
#ifndef NO_ENCDB_H
/*
 * Route the ENC_* macros to the static helpers defined later in this file.
 * Presumably these macros are what encdb.h (included from enc_init_db)
 * expands -- confirm against encdb.h.
 */
static void encdb_declare(const char *name);
static int encdb_replicate(const char *alias, const char *orig);
static int encdb_alias(const char *alias, const char *orig);
static int encdb_dummy(const char *name);

#undef ENC_REPLICATE
#undef ENC_ALIAS
#undef ENC_DUMMY
#define ENC_DEFINE(name)          encdb_declare(name)
#define ENC_REPLICATE(name, orig) encdb_replicate(name, orig)
#define ENC_ALIAS(name, orig)     encdb_alias(name, orig)
#define ENC_DUMMY(name)           encdb_dummy(name)
#endif
50
51 static void
52 enc_init_db(void)
53 {
54 #ifdef NO_ENCDB_H
55 #define ENCODING_COUNT ENCINDEX_BUILTIN_MAX
56 #else
57 #include "encdb.h"
58 #endif
59 }
60
/* True while rb_enc_mbmaxlen(enc) is zero, i.e. the slot holds only a
 * zero-filled placeholder (see enc_register_at with a null encoding). */
#define enc_autoload_p(enc) (!rb_enc_mbmaxlen(enc))

/* auxiliary_data holds the wrapping Encoding object; the address of
 * rb_cEncoding is the sentinel meaning "no object created yet". */
#define ENC_UNINITIALIZED (&rb_cEncoding)
#define enc_initialized_p(enc) ((enc)->auxiliary_data != ENC_UNINITIALIZED)
#define ENC_FROM_ENCODING(enc) ((VALUE)(enc)->auxiliary_data)

/* The dummy marker is a user flag on the Encoding object itself. */
#define ENC_DUMMY_FLAG FL_USER2
#define ENC_SET_DUMMY(enc) (RBASIC(enc)->flags |= ENC_DUMMY_FLAG)
#define ENC_DUMMY_P(enc) (RBASIC(enc)->flags & ENC_DUMMY_FLAG)
70
71 static int load_encoding(const char *name);
72 static VALUE enc_base_encoding(VALUE self);
73
/*
 * Mark callback installed on every wrapped encoding.  It marks nothing;
 * enc_check_encoding and enc_capable compare a T_DATA object's dmark
 * against this function to recognize Encoding objects.
 */
static void
enc_mark(void *ptr)
{
    /* intentionally empty */
}
78
79 static VALUE
80 enc_new(rb_encoding *encoding)
81 {
82 VALUE enc = Data_Wrap_Struct(rb_cEncoding, enc_mark, 0, encoding);
83 encoding->auxiliary_data = (void *)enc;
84 #if WITH_OBJC
85 rb_objc_retain(enc);
86 #endif
87 return enc;
88 }
89
90 VALUE
91 rb_enc_from_encoding(rb_encoding *encoding)
92 {
93 if (!encoding) return Qnil;
94 if (enc_initialized_p(encoding))
95 return ENC_FROM_ENCODING(encoding);
96 return enc_new(encoding);
97 }
98
99 static int
100 enc_check_encoding(VALUE obj)
101 {
102 int index;
103 rb_encoding *enc;
104
105 if (SPECIAL_CONST_P(obj) || BUILTIN_TYPE(obj) != T_DATA ||
106 RDATA(obj)->dmark != enc_mark) {
107 return -1;
108 }
109 enc = (rb_encoding*)RDATA(obj)->data;
110 index = rb_enc_to_index(enc);
111 if (rb_enc_from_index(index) != enc)
112 return -1;
113 if (enc_autoload_p(enc)) {
114 index = load_encoding(enc->name);
115 }
116 return index;
117 }
118
119 int
120 rb_to_encoding_index(VALUE enc)
121 {
122 int idx;
123
124 idx = enc_check_encoding(enc);
125 if (idx >= 0) {
126 return idx;
127 }
128 else if (NIL_P(enc = rb_check_string_type(enc))) {
129 return -1;
130 }
131 else {
132 return rb_enc_find_index(StringValueCStr(enc));
133 }
134 }
135
136 rb_encoding *
137 rb_to_encoding(VALUE enc)
138 {
139 int idx;
140
141 idx = enc_check_encoding(enc);
142 if (idx >= 0) return RDATA(enc)->data;
143 if ((idx = rb_enc_find_index(StringValueCStr(enc))) < 0) {
144 rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc));
145 }
146 return rb_enc_from_index(idx);
147 }
148
149 void
150 rb_gc_mark_encodings(void)
151 {
152 int i;
153 for (i = 0; i < enc_table.count; ++i) {
154 rb_encoding *enc = enc_table.list[i].enc;
155 if (enc && enc_initialized_p(enc)) {
156 rb_gc_mark(ENC_FROM_ENCODING(enc));
157 }
158 }
159 }
160
161 static int
162 enc_table_expand(int newsize)
163 {
164 struct rb_encoding_entry *ent;
165 int count = newsize;
166
167 if (enc_table.size >= newsize) return newsize;
168 newsize = (newsize + 7) / 8 * 8;
169 ent = realloc(enc_table.list, sizeof(*enc_table.list) * newsize);
170 if (!ent) return -1;
171 memset(ent + enc_table.size, 0, sizeof(*ent)*(newsize - enc_table.size));
172 enc_table.list = ent;
173 enc_table.size = newsize;
174 return count;
175 }
176
177 static int
178 enc_register_at(int index, const char *name, rb_encoding *encoding)
179 {
180 struct rb_encoding_entry *ent = &enc_table.list[index];
181 void *obj = ENC_UNINITIALIZED;
182
183 if (!ent->name) {
184 ent->name = name = strdup(name);
185 }
186 else if (STRCASECMP(name, ent->name)) {
187 return -1;
188 }
189 if (!ent->enc) {
190 ent->enc = malloc(sizeof(rb_encoding));
191 }
192 else {
193 obj = ent->enc->auxiliary_data;
194 }
195 if (encoding) {
196 *ent->enc = *encoding;
197 }
198 else {
199 memset(ent->enc, 0, sizeof(*ent->enc));
200 }
201 encoding = ent->enc;
202 encoding->name = name;
203 encoding->ruby_encoding_index = index;
204 st_insert(enc_table.names, (st_data_t)name, (st_data_t)index);
205 if (obj != ENC_UNINITIALIZED) {
206 encoding->auxiliary_data = obj;
207 }
208 else if (rb_cEncoding) {
209 /* initialize encoding data */
210 enc_new(encoding);
211 }
212 else {
213 encoding->auxiliary_data = ENC_UNINITIALIZED;
214 }
215 return index;
216 }
217
218 static int
219 enc_register(const char *name, rb_encoding *encoding)
220 {
221 int index = enc_table.count;
222
223 if ((index = enc_table_expand(index + 1)) < 0) return -1;
224 enc_table.count = index;
225 return enc_register_at(index - 1, name, encoding);
226 }
227
228 static void set_encoding_const(const char *, rb_encoding *);
229 int rb_enc_registered(const char *name);
230
231 int
232 rb_enc_register(const char *name, rb_encoding *encoding)
233 {
234 int index = rb_enc_registered(name);
235
236 if (index >= 0) {
237 rb_encoding *oldenc = rb_enc_from_index(index);
238 if (STRCASECMP(name, rb_enc_name(oldenc))) {
239 index = enc_register(name, encoding);
240 }
241 else if (!enc_autoload_p(oldenc) ||
242 (enc_initialized_p(oldenc) &&
243 !ENC_DUMMY_P(ENC_FROM_ENCODING(oldenc)))) {
244 enc_register_at(index, name, encoding);
245 }
246 else {
247 rb_raise(rb_eArgError, "encoding %s is already registered", name);
248 }
249 }
250 else {
251 index = enc_register(name, encoding);
252 set_encoding_const(name, rb_enc_from_index(index));
253 }
254 return index;
255 }
256
#ifndef NO_ENCDB_H
/*
 * Makes sure +name+ has a table slot (a placeholder is registered when it
 * has none) and defines the matching Encoding constant.
 */
static void
encdb_declare(const char *name)
{
    int slot = rb_enc_registered(name);

    if (slot < 0) {
        slot = enc_register(name, 0);
    }
    set_encoding_const(name, rb_enc_from_index(slot));
}
#endif
268
269 static void
270 enc_check_duplication(const char *name)
271 {
272 if (rb_enc_registered(name) >= 0) {
273 rb_raise(rb_eArgError, "encoding %s is already registered", name);
274 }
275 }
276
277 static VALUE
278 set_base_encoding(int index, rb_encoding *base)
279 {
280 VALUE enc = rb_enc_from_encoding(enc_table.list[index].enc);
281
282 rb_ivar_set(enc, id_base_encoding, rb_enc_from_encoding(base));
283 if (rb_enc_dummy_p(base)) ENC_SET_DUMMY(enc);
284 return enc;
285 }
286
287 int
288 rb_enc_replicate(const char *name, rb_encoding *encoding)
289 {
290 int idx;
291
292 enc_check_duplication(name);
293 idx = enc_register(name, encoding);
294 set_base_encoding(idx, encoding);
295 set_encoding_const(name, rb_enc_from_index(idx));
296 return idx;
297 }
298
299 #ifndef NO_ENCDB_H
300 static int
301 enc_replicate(int idx, const char *name, rb_encoding *origenc)
302 {
303 if (idx < 0) {
304 idx = enc_register(name, origenc);
305 }
306 else {
307 idx = enc_register_at(idx, name, origenc);
308 }
309 if (idx >= 0) {
310 set_base_encoding(idx, origenc);
311 set_encoding_const(name, rb_enc_from_index(idx));
312 }
313 return idx;
314 }
315
/*
 * Declares +name+ as a replica of +orig+.  +orig+ gets a placeholder slot
 * if it is not registered yet.
 */
static int
encdb_replicate(const char *name, const char *orig)
{
    int base = rb_enc_registered(orig);
    const int self = rb_enc_registered(name);   /* looked up before orig is added */

    if (base < 0) {
        base = enc_register(orig, 0);
    }
    return enc_replicate(self, name, rb_enc_from_index(base));
}
327 #endif
328
329 int
330 rb_define_dummy_encoding(const char *name)
331 {
332 int index = rb_enc_replicate(name, rb_ascii8bit_encoding());
333 VALUE enc = rb_enc_from_encoding(enc_table.list[index].enc);
334
335 ENC_SET_DUMMY(enc);
336 return index;
337 }
338
339 #ifndef NO_ENCDB_H
340 static int
341 encdb_dummy(const char *name)
342 {
343 int index = enc_replicate(rb_enc_registered(name), name,
344 rb_ascii8bit_encoding());
345 VALUE enc = rb_enc_from_encoding(enc_table.list[index].enc);
346
347 ENC_SET_DUMMY(enc);
348 return index;
349 }
350 #endif
351
352 int
353 rb_enc_dummy_p(rb_encoding *enc)
354 {
355 VALUE encoding = rb_enc_from_encoding(enc);
356 return ENC_DUMMY_P(encoding);
357 }
358
359 /*
360 * call-seq:
361 * enc.dummy? => true or false
362 *
6537c7a merging with trunk
Laurent Sansonetti authored
363 * Returns true for dummy encodings.
364 * A dummy encoding is an encoding for which character handling is not properly
511dc44 initial import
Laurent Sansonetti authored
365 * implemented.
6537c7a merging with trunk
Laurent Sansonetti authored
366 * It is used for stateful encodings.
511dc44 initial import
Laurent Sansonetti authored
367 *
368 * Encoding::ISO_2022_JP.dummy? #=> true
369 * Encoding::UTF_8.dummy? #=> false
370 *
371 */
372 static VALUE
373 enc_dummy_p(VALUE enc)
374 {
375 return rb_enc_dummy_p(rb_to_encoding(enc)) ? Qtrue : Qfalse;
376 }
377
378 static int
379 enc_alias(const char *alias, int idx)
380 {
381 alias = strdup(alias);
382 st_insert(enc_table.names, (st_data_t)alias, (st_data_t)idx);
383 set_encoding_const(alias, rb_enc_from_index(idx));
384 return idx;
385 }
386
387 int
388 rb_enc_alias(const char *alias, const char *orig)
389 {
390 int idx;
391
392 enc_check_duplication(alias);
393 if (!enc_table.list) {
394 rb_enc_init();
395 }
396 if ((idx = rb_enc_find_index(orig)) < 0) {
397 return -1;
398 }
399 return enc_alias(alias, idx);
400 }
401
#ifndef NO_ENCDB_H
/*
 * Declares +alias+ as another name for +orig+.  +orig+ gets a placeholder
 * slot if it is not registered yet.
 *
 * Returns the index of +orig+, or -1 if the placeholder could not be
 * registered; a negative index is never stored in the name table.
 */
static int
encdb_alias(const char *alias, const char *orig)
{
    int idx = rb_enc_registered(orig);

    if (idx < 0) {
        idx = enc_register(orig, 0);
        if (idx < 0) {
            return -1;
        }
    }
    return enc_alias(alias, idx);
}
#endif
414
/* Table indexes of the encodings that rb_enc_init registers itself. */
enum {
    ENCINDEX_ASCII = 0,
    ENCINDEX_UTF_8 = 1,
    ENCINDEX_US_ASCII = 2,
    ENCINDEX_BUILTIN_MAX = 3    /* number of built-in encodings */
};
421
422 extern rb_encoding OnigEncodingUTF_8;
423 extern rb_encoding OnigEncodingUS_ASCII;
424
425 void
426 rb_enc_init(void)
427 {
428 enc_table_expand(ENCODING_COUNT + 1);
429 if (!enc_table.names) {
430 enc_table.names = st_init_strcasetable();
431 GC_ROOT(&enc_table.names);
432 }
433 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(&OnigEncoding##enc), &OnigEncoding##enc)
434 ENC_REGISTER(ASCII);
435 ENC_REGISTER(UTF_8);
436 ENC_REGISTER(US_ASCII);
437 #undef ENC_REGISTER
438 enc_table.count = ENCINDEX_BUILTIN_MAX;
439 }
440
441 rb_encoding *
442 rb_enc_from_index(int index)
443 {
444 if (!enc_table.list) {
445 rb_enc_init();
446 }
447 if (index < 0 || enc_table.count <= index) {
448 return 0;
449 }
450 return enc_table.list[index].enc;
451 }
452
453 int
454 rb_enc_registered(const char *name)
455 {
456 st_data_t idx = 0;
457
458 if (!name) return -1;
459 if (!enc_table.list) return -1;
460 if (st_lookup(enc_table.names, (st_data_t)name, &idx)) {
461 return (int)idx;
462 }
463 return -1;
464 }
465
466 static VALUE
467 require_enc(VALUE enclib)
468 {
469 return rb_require_safe(enclib, rb_safe_level());
470 }
471
472 static int
473 load_encoding(const char *name)
474 {
475 VALUE enclib = rb_sprintf("enc/%s", name);
476 VALUE verbose = ruby_verbose;
477 VALUE debug = ruby_debug;
478 VALUE loaded;
479 char *s = RSTRING_PTR(enclib) + 4, *e = RSTRING_END(enclib);
480 int idx;
481
482 while (s < e) {
483 if (!ISALNUM(*s)) *s = '_';
484 else if (ISUPPER(*s)) *s = TOLOWER(*s);
485 ++s;
486 }
487 OBJ_FREEZE(enclib);
488 ruby_verbose = Qfalse;
489 ruby_debug = Qfalse;
490 loaded = rb_protect(require_enc, enclib, 0);
491 ruby_verbose = verbose;
492 ruby_debug = debug;
493 rb_set_errinfo(Qnil);
494 if (NIL_P(loaded)) return -1;
495 if ((idx = rb_enc_registered(name)) < 0) return -1;
496 if (enc_autoload_p(enc_table.list[idx].enc)) return -1;
497 return idx;
498 }
499
500 int
501 rb_enc_find_index(const char *name)
502 {
503 int i = rb_enc_registered(name), b;
504 rb_encoding *enc;
505 VALUE base;
506
507 if (i < 0) {
508 i = load_encoding(name);
509 }
510 else if (enc_autoload_p(enc = rb_enc_from_index(i))) {
511 if (enc_initialized_p(enc) &&
512 (base = enc_base_encoding(ENC_FROM_ENCODING(enc)), !NIL_P(base))) {
513 if ((b = enc_check_encoding(base)) < 0) {
514 goto failed;
515 }
516 enc_register_at(i, rb_enc_name(enc), rb_enc_from_index(b));
517 }
518 else {
519 i = load_encoding(rb_enc_name(enc));
520 if (i < 0) {
521 failed:
522 rb_warn("failed to load encoding (%s); use ASCII-8BIT instead",
523 name);
524 return 0;
525 }
526 }
527 }
528 return i;
529 }
530
531 rb_encoding *
532 rb_enc_find(const char *name)
533 {
534 int idx = rb_enc_find_index(name);
535 if (idx < 0) idx = 0;
536 return rb_enc_from_index(idx);
537 }
538
539 static inline int
540 enc_capable(VALUE obj)
541 {
6537c7a merging with trunk
Laurent Sansonetti authored
542 if (SPECIAL_CONST_P(obj)) return Qfalse;
511dc44 initial import
Laurent Sansonetti authored
543 switch (BUILTIN_TYPE(obj)) {
544 case T_STRING:
545 case T_REGEXP:
546 case T_FILE:
547 return Qtrue;
548 case T_DATA:
549 if (RDATA(obj)->dmark == enc_mark) return Qtrue;
550 default:
551 return Qfalse;
552 }
553 }
554
555 static void
556 enc_check_capable(VALUE x)
557 {
558 if (!enc_capable(x)) {
559 const char *etype;
560
561 if (NIL_P(x)) {
562 etype = "nil";
563 }
564 else if (FIXNUM_P(x)) {
565 etype = "Fixnum";
566 }
567 else if (SYMBOL_P(x)) {
568 etype = "Symbol";
569 }
570 else if (rb_special_const_p(x)) {
571 etype = RSTRING_PTR(rb_obj_as_string(x));
572 }
573 else {
574 etype = rb_obj_classname(x);
575 }
576 rb_raise(rb_eTypeError, "wrong argument type %s (not encode capable)", etype);
577 }
578 }
579
580 ID
581 rb_id_encoding(void)
582 {
583 if (!id_encoding) {
584 id_encoding = rb_intern("encoding");
585 }
586 return id_encoding;
587 }
588
589 int
590 rb_enc_internal_get_index(VALUE obj)
591 {
592 int i;
593
594 i = ENCODING_GET_INLINED(obj);
595 if (i == ENCODING_INLINE_MAX) {
596 VALUE iv;
597
598 iv = rb_ivar_get(obj, rb_id_encoding());
599 i = NUM2INT(iv);
600 }
601 return i;
602 }
603
604 void
605 rb_enc_internal_set_index(VALUE obj, int idx)
606 {
607 if (idx < ENCODING_INLINE_MAX) {
608 ENCODING_SET_INLINED(obj, idx);
609 return;
610 }
611 ENCODING_SET_INLINED(obj, ENCODING_INLINE_MAX);
612 rb_ivar_set(obj, rb_id_encoding(), INT2NUM(idx));
613 return;
614 }
615
616 void
617 rb_enc_associate_index(VALUE obj, int idx)
618 {
619 enc_check_capable(obj);
6537c7a merging with trunk
Laurent Sansonetti authored
620 if (rb_enc_get_index(obj) == idx)
621 return;
511dc44 initial import
Laurent Sansonetti authored
622 if (!ENC_CODERANGE_ASCIIONLY(obj) ||
623 !rb_enc_asciicompat(rb_enc_from_index(idx))) {
624 ENC_CODERANGE_CLEAR(obj);
625 }
626 rb_enc_internal_set_index(obj, idx);
627 }
628
629 void
630 rb_enc_associate(VALUE obj, rb_encoding *enc)
631 {
632 rb_enc_associate_index(obj, rb_enc_to_index(enc));
633 }
634
635 int
636 rb_enc_get_index(VALUE obj)
637 {
638 if (!enc_capable(obj)) return -1;
639 return rb_enc_internal_get_index(obj);
640 }
641
642 rb_encoding*
643 rb_enc_get(VALUE obj)
644 {
645 return rb_enc_from_index(rb_enc_get_index(obj));
646 }
647
648 rb_encoding*
649 rb_enc_check(VALUE str1, VALUE str2)
650 {
651 rb_encoding *enc = rb_enc_compatible(str1, str2);
652 if (!enc)
653 rb_raise(rb_eArgError, "character encodings differ: %s and %s",
654 rb_enc_name(rb_enc_get(str1)),
655 rb_enc_name(rb_enc_get(str2)));
656 return enc;
657 }
658
659 rb_encoding*
660 rb_enc_compatible(VALUE str1, VALUE str2)
661 {
662 int idx1, idx2;
663 rb_encoding *enc1, *enc2;
664
665 idx1 = rb_enc_get_index(str1);
666 idx2 = rb_enc_get_index(str2);
667
668 if (idx1 < 0 || idx2 < 0)
669 return 0;
670
671 if (idx1 == idx2) {
672 return rb_enc_from_index(idx1);
673 }
674 enc1 = rb_enc_from_index(idx1);
675 enc2 = rb_enc_from_index(idx2);
676
6537c7a merging with trunk
Laurent Sansonetti authored
677 if (TYPE(str2) == T_STRING && RSTRING_LEN(str2) == 0)
678 return enc1;
679 if (TYPE(str1) == T_STRING && RSTRING_LEN(str1) == 0)
680 return enc2;
511dc44 initial import
Laurent Sansonetti authored
681 if (!rb_enc_asciicompat(enc1) || !rb_enc_asciicompat(enc2)) {
682 return 0;
683 }
684
685 if (BUILTIN_TYPE(str1) != T_STRING) {
686 VALUE tmp = str1;
687 int idx0 = idx1;
688 str1 = str2;
689 str2 = tmp;
690 idx1 = idx2;
691 idx2 = idx0;
692 }
693 if (BUILTIN_TYPE(str1) == T_STRING) {
694 int cr1, cr2;
695
696 cr1 = rb_enc_str_coderange(str1);
697 if (BUILTIN_TYPE(str2) == T_STRING) {
698 cr2 = rb_enc_str_coderange(str2);
699 if (cr1 != cr2) {
700 /* may need to handle ENC_CODERANGE_BROKEN */
701 if (cr1 == ENC_CODERANGE_7BIT) return enc2;
702 if (cr2 == ENC_CODERANGE_7BIT) return enc1;
703 }
704 if (cr2 == ENC_CODERANGE_7BIT) {
705 if (idx1 == 0) return enc2;
706 return enc1;
707 }
708 }
709 if (cr1 == ENC_CODERANGE_7BIT)
710 return enc2;
711 }
712 return 0;
713 }
714
715 void
716 rb_enc_copy(VALUE obj1, VALUE obj2)
717 {
718 rb_enc_associate_index(obj1, rb_enc_get_index(obj2));
719 }
720
721
722 /*
723 * call-seq:
724 * obj.encoding => encoding
725 *
726 * Returns the Encoding object that represents the encoding of obj.
727 */
728
729 VALUE
730 rb_obj_encoding(VALUE obj)
731 {
732 rb_encoding *enc = rb_enc_get(obj);
733 if (!enc) {
734 rb_raise(rb_eTypeError, "unknown encoding");
735 }
736 return rb_enc_from_encoding(enc);
737 }
738
739 int
740 rb_enc_mbclen(const char *p, const char *e, rb_encoding *enc)
741 {
742 int n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
743 if (MBCLEN_CHARFOUND_P(n) && MBCLEN_CHARFOUND_LEN(n) <= e-p)
744 return MBCLEN_CHARFOUND_LEN(n);
6537c7a merging with trunk
Laurent Sansonetti authored
745 else {
746 int min = rb_enc_mbminlen(enc);
747 return min <= e-p ? min : e-p;
748 }
511dc44 initial import
Laurent Sansonetti authored
749 }
750
751 int
752 rb_enc_precise_mbclen(const char *p, const char *e, rb_encoding *enc)
753 {
754 int n;
755 if (e <= p)
756 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(1);
757 n = ONIGENC_PRECISE_MBC_ENC_LEN(enc, (UChar*)p, (UChar*)e);
758 if (e-p < n)
759 return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(n-(e-p));
760 return n;
761 }
762
763 int
764 rb_enc_ascget(const char *p, const char *e, int *len, rb_encoding *enc)
765 {
766 int c, l;
767 if (e <= p)
768 return -1;
769 if (rb_enc_asciicompat(enc)) {
770 c = (unsigned char)*p;
771 if (!ISASCII(c))
772 return -1;
773 if (len) *len = 1;
774 return c;
775 }
776 l = rb_enc_precise_mbclen(p, e, enc);
777 if (!MBCLEN_CHARFOUND_P(l))
778 return -1;
6537c7a merging with trunk
Laurent Sansonetti authored
779 c = rb_enc_mbc_to_codepoint(p, e, enc);
511dc44 initial import
Laurent Sansonetti authored
780 if (!rb_enc_isascii(c, enc))
781 return -1;
782 if (len) *len = l;
783 return c;
784 }
785
786 int
787 rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
788 {
789 int r;
790 if (e <= p)
791 rb_raise(rb_eArgError, "empty string");
792 r = rb_enc_precise_mbclen(p, e, enc);
793 if (MBCLEN_CHARFOUND_P(r))
794 return rb_enc_mbc_to_codepoint(p, e, enc);
795 else
796 rb_raise(rb_eArgError, "invalid mbstring sequence");
797 }
798
799 int
800 rb_enc_codelen(int c, rb_encoding *enc)
801 {
802 int n = ONIGENC_CODE_TO_MBCLEN(enc,c);
803 if (n == 0) {
804 rb_raise(rb_eArgError, "invalid codepoint 0x%x", c);
805 }
806 return n;
807 }
808
809 int
810 rb_enc_toupper(int c, rb_encoding *enc)
811 {
812 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_UPPER_CASE(c):(c));
813 }
814
815 int
816 rb_enc_tolower(int c, rb_encoding *enc)
817 {
818 return (ONIGENC_IS_ASCII_CODE(c)?ONIGENC_ASCII_CODE_TO_LOWER_CASE(c):(c));
819 }
820
821 /*
822 * call-seq:
823 * enc.inspect => string
824 *
825 * Returns a string which represents the encoding for programmers.
826 *
827 * Encoding::UTF_8.inspect #=> "#<Encoding:UTF-8>"
828 * Encoding::ISO_2022_JP.inspect #=> "#<Encoding:ISO-2022-JP (dummy)>"
829 */
830 static VALUE
831 enc_inspect(VALUE self)
832 {
833 VALUE str = rb_sprintf("#<%s:%s%s>", rb_obj_classname(self),
834 rb_enc_name((rb_encoding*)DATA_PTR(self)),
835 (ENC_DUMMY_P(self) ? " (dummy)" : ""));
836 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
837 return str;
838 }
839
840 /*
841 * call-seq:
842 * enc.name => string
843 *
844 * Returns the name of the encoding.
845 *
846 * Encoding::UTF_8.name => "UTF-8"
847 */
848 static VALUE
849 enc_name(VALUE self)
850 {
851 return rb_usascii_str_new2(rb_enc_name((rb_encoding*)DATA_PTR(self)));
852 }
853
854 static VALUE
855 enc_base_encoding(VALUE self)
856 {
857 return rb_attr_get(self, id_base_encoding);
858 }
859
860 /*
861 * call-seq:
862 * Encoding.list => [enc1, enc2, ...]
863 *
864 * Returns the list of loaded encodings.
865 *
866 * Encoding.list
867 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
868 * #<Encoding:ISO-2022-JP (dummy)>]
869 *
870 * Encoding.find("US-ASCII")
871 * => #<Encoding:US-ASCII>
872 *
873 * Encoding.list
874 * => [#<Encoding:ASCII-8BIT>, #<Encoding:UTF-8>,
875 * #<Encoding:US-ASCII>, #<Encoding:ISO-2022-JP (dummy)>]
876 *
877 */
878 static VALUE
879 enc_list(VALUE klass)
880 {
881 VALUE ary = rb_ary_new2(enc_table.count);
882 int i;
883 for (i = 0; i < enc_table.count; ++i) {
884 rb_encoding *enc = enc_table.list[i].enc;
885 if (enc) {
886 rb_ary_push(ary, rb_enc_from_encoding(enc));
887 }
888 }
889 return ary;
890 }
891
892 /*
893 * call-seq:
894 * Encoding.find(string) => enc
895 * Encoding.find(symbol) => enc
896 *
897 * Search the encoding with specified <i>name</i>.
898 * <i>name</i> should be a string or symbol.
899 *
900 * Encoding.find("US-ASCII") => #<Encoding:US-ASCII>
901 * Encoding.find(:Shift_JIS) => #<Encoding:Shift_JIS>
902 *
903 */
904 static VALUE
905 enc_find(VALUE klass, VALUE enc)
906 {
907 int idx;
908
6537c7a merging with trunk
Laurent Sansonetti authored
909 StringValue(enc);
511dc44 initial import
Laurent Sansonetti authored
910 if (!rb_enc_asciicompat(rb_enc_get(enc))) {
911 rb_raise(rb_eArgError, "invalid name encoding (non ASCII)");
912 }
913 idx = rb_enc_find_index(StringValueCStr(enc));
914 if (idx < 0) {
915 rb_raise(rb_eArgError, "unknown encoding name - %s", RSTRING_PTR(enc));
916 }
917 return rb_enc_from_encoding(rb_enc_from_index(idx));
918 }
919
920 /*
921 * call-seq:
922 * Encoding.compatible?(str1, str2) => enc or nil
923 *
924 * Checks the compatibility of two strings.
925 * If they are compatible, means concatenatable,
926 * returns an encoding which the concatinated string will be.
927 * If they are not compatible, nil is returned.
928 *
929 * Encoding.compatible?("\xa1".force_encoding("iso-8859-1"), "b")
930 * => #<Encoding:ISO-8859-1>
931 *
932 * Encoding.compatible?(
933 * "\xa1".force_encoding("iso-8859-1"),
934 * "\xa1\xa1".force_encoding("euc-jp"))
935 * => nil
936 *
937 */
938 static VALUE
939 enc_compatible_p(VALUE klass, VALUE str1, VALUE str2)
940 {
941 rb_encoding *enc = rb_enc_compatible(str1, str2);
942 VALUE encoding = Qnil;
943 if (!enc || !(encoding = rb_enc_from_encoding(enc)))
944 encoding = Qnil;
945 return encoding;
946 }
947
948 /* :nodoc: */
949 static VALUE
950 enc_dump(int argc, VALUE *argv, VALUE self)
951 {
952 rb_scan_args(argc, argv, "01", 0);
953 return enc_name(self);
954 }
955
956 /* :nodoc: */
957 static VALUE
958 enc_load(VALUE klass, VALUE str)
959 {
960 return enc_find(klass, str);
961 }
962
963 rb_encoding *
964 rb_ascii8bit_encoding(void)
965 {
966 if (!enc_table.list) {
967 rb_enc_init();
968 }
969 return enc_table.list[0].enc;
970 }
971
972 rb_encoding *
973 rb_utf8_encoding(void)
974 {
975 if (!enc_table.list) {
976 rb_enc_init();
977 }
978 return enc_table.list[ENCINDEX_UTF_8].enc;
979 }
980
981 rb_encoding *
982 rb_usascii_encoding(void)
983 {
984 if (!enc_table.list) {
985 rb_enc_init();
986 }
987 return enc_table.list[ENCINDEX_US_ASCII].enc;
988 }
989
990 int
991 rb_usascii_encindex(void)
992 {
993 return ENCINDEX_US_ASCII;
994 }
995
996 rb_encoding *
997 rb_locale_encoding(void)
998 {
999 VALUE charmap = rb_locale_charmap(rb_cEncoding);
1000 int idx;
1001
1002 if (NIL_P(charmap))
1003 idx = rb_enc_find_index("US-ASCII");
1004 else
1005 idx = rb_enc_find_index(StringValueCStr(charmap));
1006 if (idx < 0)
1007 return rb_ascii8bit_encoding();
1008
1009 return rb_enc_from_index(idx);
1010 }
1011
1012 static int default_external_index;
1013
1014 rb_encoding *
1015 rb_default_external_encoding(void)
1016 {
1017 return rb_enc_from_index(default_external_index);
1018 }
1019
1020 VALUE
1021 rb_enc_default_external(void)
1022 {
1023 return rb_enc_from_encoding(rb_default_external_encoding());
1024 }
1025
1026 /*
1027 * call-seq:
1028 * Encoding.default_external => enc
1029 *
1030 * Returns default external encoding.
1031 *
1032 * It is initialized by the locale or -E option.
1033 */
1034 static VALUE
1035 get_default_external(VALUE klass)
1036 {
1037 return rb_enc_default_external();
1038 }
1039
1040 void
1041 rb_enc_set_default_external(VALUE encoding)
1042 {
1043 default_external_index = rb_enc_to_index(rb_to_encoding(encoding));
1044 }
1045
1046 /*
1047 * call-seq:
1048 * Encoding.locale_charmap => string
1049 *
1050 * Returns the locale charmap name.
1051 *
1052 * Debian GNU/Linux
1053 * LANG=C
1054 * Encoding.locale_charmap => "ANSI_X3.4-1968"
1055 * LANG=ja_JP.EUC-JP
1056 * Encoding.locale_charmap => "EUC-JP"
1057 *
1058 * SunOS 5
1059 * LANG=C
1060 * Encoding.locale_charmap => "646"
1061 * LANG=ja
1062 * Encoding.locale_charmap => "eucJP"
1063 *
1064 */
1065 VALUE
1066 rb_locale_charmap(VALUE klass)
1067 {
1068 #if defined NO_LOCALE_CHARMAP
1069 return rb_usascii_str_new2("ASCII-8BIT");
1070 #elif defined HAVE_LANGINFO_H
1071 char *codeset;
1072 codeset = nl_langinfo(CODESET);
1073 return rb_usascii_str_new2(codeset);
1074 #elif defined _WIN32
1075 return rb_sprintf("CP%d", GetACP());
1076 #else
1077 return Qnil;
1078 #endif
1079 }
1080
1081 static void
1082 set_encoding_const(const char *name, rb_encoding *enc)
1083 {
1084 VALUE encoding = rb_enc_from_encoding(enc);
1085 char *s = (char *)name;
1086 int haslower = 0, hasupper = 0, valid = 0;
1087
1088 if (ISDIGIT(*s)) return;
1089 if (ISUPPER(*s)) {
1090 hasupper = 1;
1091 while (*++s && (ISALNUM(*s) || *s == '_')) {
1092 if (ISLOWER(*s)) haslower = 1;
1093 }
1094 }
1095 if (!*s) {
1096 valid = 1;
1097 rb_define_const(rb_cEncoding, name, encoding);
1098 }
1099 if (!valid || haslower) {
1100 int len = strlen(name) + 1;
1101 if (!haslower || !hasupper) {
1102 do {
1103 if (ISLOWER(*s)) haslower = 1;
1104 if (ISUPPER(*s)) hasupper = 1;
1105 } while (*++s && (!haslower || !hasupper));
1106 }
1107 MEMCPY(s = ALLOCA_N(char, len), name, char, len);
1108 name = s;
1109 if (!valid) {
1110 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1111 for (; *s; ++s) {
1112 if (!ISALNUM(*s)) *s = '_';
1113 }
1114 if (hasupper) {
1115 rb_define_const(rb_cEncoding, name, encoding);
1116 }
1117 }
1118 if (haslower) {
1119 for (s = (char *)name; *s; ++s) {
1120 if (ISLOWER(*s)) *s = ONIGENC_ASCII_CODE_TO_UPPER_CASE((int)*s);
1121 }
1122 rb_define_const(rb_cEncoding, name, encoding);
1123 }
1124 }
1125 }
1126
1127 static int
1128 rb_enc_name_list_i(st_data_t name, st_data_t idx, st_data_t arg)
1129 {
1130 VALUE ary = (VALUE)arg;
1131 VALUE str = rb_usascii_str_new2((char *)name);
1132 OBJ_FREEZE(str);
1133 rb_ary_push(ary, str);
1134 return ST_CONTINUE;
1135 }
1136
1137 /*
1138 * call-seq:
1139 * Encoding.name_list => ["enc1", "enc2", ...]
1140 *
1141 * Returns the list of available encoding names.
1142 *
1143 * Encoding.name_list
1144 * => ["US-ASCII", "ASCII-8BIT", "UTF-8",
1145 * "ISO-8859-1", "Shift_JIS", "EUC-JP",
1146 * "Windows-31J",
1147 * "BINARY", "CP932", "eucJP"]
1148 *
1149 * This list doesn't include dummy encodings.
1150 *
1151 */
1152
1153 static VALUE
1154 rb_enc_name_list(VALUE klass)
1155 {
1156 VALUE ary = rb_ary_new2(enc_table.names->num_entries);
1157 st_foreach(enc_table.names, rb_enc_name_list_i, (st_data_t)ary);
1158 return ary;
1159 }
1160
1161 static int
1162 rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
1163 {
1164 VALUE *p = (VALUE *)arg;
1165 VALUE aliases = p[0], ary = p[1];
1166 int idx = (int)orig;
1167 VALUE key, str = rb_ary_entry(ary, idx);
1168
1169 if (NIL_P(str)) {
1170 rb_encoding *enc = rb_enc_from_index(idx);
1171
1172 if (STRCASECMP((char*)name, rb_enc_name(enc)) == 0) {
1173 return ST_CONTINUE;
1174 }
1175 str = rb_usascii_str_new2(rb_enc_name(enc));
1176 OBJ_FREEZE(str);
1177 rb_ary_store(ary, idx, str);
1178 }
1179 key = rb_usascii_str_new2((char *)name);
1180 OBJ_FREEZE(key);
1181 rb_hash_aset(aliases, key, str);
1182 return ST_CONTINUE;
1183 }
1184
1185 /*
1186 * call-seq:
1187 * Encoding.aliases => {"alias1" => "orig1", "alias2" => "orig2", ...}
1188 *
1189 * Returns the hash of available encoding alias and original encoding name.
1190 *
1191 * Encoding.aliases
1192 * => {"BINARY"=>"ASCII-8BIT", "ASCII"=>"US-ASCII", "ANSI_X3.4-1986"=>"US-ASCII",
1193 * "SJIS"=>"Shift_JIS", "eucJP"=>"EUC-JP", "CP932"=>"Windows-31J"}
1194 *
1195 */
1196
1197 static VALUE
1198 rb_enc_aliases(VALUE klass)
1199 {
1200 VALUE aliases[2];
1201 aliases[0] = rb_hash_new();
1202 aliases[1] = rb_ary_new();
1203 st_foreach(enc_table.names, rb_enc_aliases_enc_i, (st_data_t)aliases);
1204 return aliases[0];
1205 }
1206
1207 void
1208 Init_Encoding(void)
1209 {
1210 id_base_encoding = rb_intern("#base_encoding");
1211
1212 rb_cEncoding = rb_define_class("Encoding", rb_cObject);
1213 rb_undef_alloc_func(rb_cEncoding);
1214 rb_define_method(rb_cEncoding, "to_s", enc_name, 0);
1215 rb_define_method(rb_cEncoding, "inspect", enc_inspect, 0);
1216 rb_define_method(rb_cEncoding, "name", enc_name, 0);
1217 rb_define_method(rb_cEncoding, "base_encoding", enc_base_encoding, 0);
1218 rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
1219 rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
1220 rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
1221 rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
1222 rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
1223 rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
1224
1225 rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
1226 rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
1227
1228 rb_define_singleton_method(rb_cEncoding, "default_external", get_default_external, 0);
1229 rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
1230
1231 enc_init_db();
1232 }
1233
1234 /* locale insensitive functions */
1235
1236 #define ctype_test(c, ctype) \
1237 (rb_isascii(c) && ONIGENC_IS_ASCII_CODE_CTYPE((c), ctype))
1238
1239 int rb_isalnum(int c) { return ctype_test(c, ONIGENC_CTYPE_ALNUM); }
1240 int rb_isalpha(int c) { return ctype_test(c, ONIGENC_CTYPE_ALPHA); }
1241 int rb_isblank(int c) { return ctype_test(c, ONIGENC_CTYPE_BLANK); }
1242 int rb_iscntrl(int c) { return ctype_test(c, ONIGENC_CTYPE_CNTRL); }
1243 int rb_isdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_DIGIT); }
1244 int rb_isgraph(int c) { return ctype_test(c, ONIGENC_CTYPE_GRAPH); }
1245 int rb_islower(int c) { return ctype_test(c, ONIGENC_CTYPE_LOWER); }
1246 int rb_isprint(int c) { return ctype_test(c, ONIGENC_CTYPE_PRINT); }
1247 int rb_ispunct(int c) { return ctype_test(c, ONIGENC_CTYPE_PUNCT); }
1248 int rb_isspace(int c) { return ctype_test(c, ONIGENC_CTYPE_SPACE); }
1249 int rb_isupper(int c) { return ctype_test(c, ONIGENC_CTYPE_UPPER); }
1250 int rb_isxdigit(int c) { return ctype_test(c, ONIGENC_CTYPE_XDIGIT); }
1251
/* Lower-cases +c+ if it is ASCII; other values are returned unchanged. */
int
rb_tolower(int c)
{
    if (rb_isascii(c)) {
        return ONIGENC_ASCII_CODE_TO_LOWER_CASE(c);
    }
    return c;
}
1257
/* Upper-cases +c+ if it is ASCII; other values are returned unchanged. */
int
rb_toupper(int c)
{
    if (rb_isascii(c)) {
        return ONIGENC_ASCII_CODE_TO_UPPER_CASE(c);
    }
    return c;
}
1263
Something went wrong with that request. Please try again.