Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 381 lines (333 sloc) 11.558 kb
96ab900 more work
Laurent Sansonetti authored
1 /*
2 * MacRuby implementation of Ruby 1.9 String.
3 *
4 * This file is covered by the Ruby license. See COPYING for more details.
5 *
6 * Copyright (C) 2007-2010, Apple Inc. All rights reserved.
7 * Copyright (C) 1993-2007 Yukihiro Matsumoto
8 * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
9 * Copyright (C) 2000 Information-technology Promotion Agency, Japan
10 */
11
2b7d5d5 import vincent's work
Laurent Sansonetti authored
12 #include "encoding.h"
13 #include <string.h>
9c1d230 committing experimental branch content
Laurent Sansonetti authored
14
96ab900 more work
Laurent Sansonetti authored
15 VALUE rb_cEncoding;
8b9745b define Encoding::ASCII_8BIT as a shortcut to US_ASCII (for now)
Laurent Sansonetti authored
16
96ab900 more work
Laurent Sansonetti authored
17 static rb_encoding_t *default_internal = NULL;
18 static rb_encoding_t *default_external = NULL;
19 rb_encoding_t *rb_encodings[ENCODINGS_COUNT];
9c1d230 committing experimental branch content
Laurent Sansonetti authored
20
96ab900 more work
Laurent Sansonetti authored
21 static void str_undefined_update_flags(rb_str_t *self) { abort(); }
22 static void str_undefined_make_data_binary(rb_str_t *self) { abort(); }
23 static bool str_undefined_try_making_data_uchars(rb_str_t *self) { abort(); }
24 static long str_undefined_length(rb_str_t *self, bool ucs2_mode) { abort(); }
25 static long str_undefined_bytesize(rb_str_t *self) { abort(); }
26 static character_boundaries_t str_undefined_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode) { abort(); }
27 static long str_undefined_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode) { abort(); }
9c1d230 committing experimental branch content
Laurent Sansonetti authored
28
29 static VALUE
2b7d5d5 import vincent's work
Laurent Sansonetti authored
30 mr_enc_s_list(VALUE klass, SEL sel)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
31 {
2b7d5d5 import vincent's work
Laurent Sansonetti authored
32 VALUE ary = rb_ary_new2(ENCODINGS_COUNT);
33 for (unsigned int i = 0; i < ENCODINGS_COUNT; ++i) {
96ab900 more work
Laurent Sansonetti authored
34 rb_ary_push(ary, (VALUE)rb_encodings[i]);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
35 }
2b7d5d5 import vincent's work
Laurent Sansonetti authored
36 return ary;
9c1d230 committing experimental branch content
Laurent Sansonetti authored
37 }
38
39 static VALUE
2b7d5d5 import vincent's work
Laurent Sansonetti authored
40 mr_enc_s_name_list(VALUE klass, SEL sel)
41 {
42 VALUE ary = rb_ary_new();
43 for (unsigned int i = 0; i < ENCODINGS_COUNT; ++i) {
96ab900 more work
Laurent Sansonetti authored
44 rb_encoding_t *encoding = RENC(rb_encodings[i]);
2b7d5d5 import vincent's work
Laurent Sansonetti authored
45 // TODO: use US-ASCII strings
96ab900 more work
Laurent Sansonetti authored
46 rb_ary_push(ary, rb_usascii_str_new2(encoding->public_name));
2b7d5d5 import vincent's work
Laurent Sansonetti authored
47 for (unsigned int j = 0; j < encoding->aliases_count; ++j) {
96ab900 more work
Laurent Sansonetti authored
48 rb_ary_push(ary, rb_usascii_str_new2(encoding->aliases[j]));
2b7d5d5 import vincent's work
Laurent Sansonetti authored
49 }
9c1d230 committing experimental branch content
Laurent Sansonetti authored
50 }
51 return ary;
52 }
53
54 static VALUE
2b7d5d5 import vincent's work
Laurent Sansonetti authored
55 mr_enc_s_aliases(VALUE klass, SEL sel)
56 {
57 VALUE hash = rb_hash_new();
58 for (unsigned int i = 0; i < ENCODINGS_COUNT; ++i) {
96ab900 more work
Laurent Sansonetti authored
59 rb_encoding_t *encoding = RENC(rb_encodings[i]);
2b7d5d5 import vincent's work
Laurent Sansonetti authored
60 for (unsigned int j = 0; j < encoding->aliases_count; ++j) {
96ab900 more work
Laurent Sansonetti authored
61 rb_hash_aset(hash, rb_usascii_str_new2(encoding->aliases[j]),
62 rb_usascii_str_new2(encoding->public_name));
2b7d5d5 import vincent's work
Laurent Sansonetti authored
63 }
9c1d230 committing experimental branch content
Laurent Sansonetti authored
64 }
2b7d5d5 import vincent's work
Laurent Sansonetti authored
65 return hash;
9c1d230 committing experimental branch content
Laurent Sansonetti authored
66 }
67
68 static VALUE
2b7d5d5 import vincent's work
Laurent Sansonetti authored
69 mr_enc_s_default_internal(VALUE klass, SEL sel)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
70 {
2b7d5d5 import vincent's work
Laurent Sansonetti authored
71 return (VALUE)default_internal;
9c1d230 committing experimental branch content
Laurent Sansonetti authored
72 }
73
74 static VALUE
2b7d5d5 import vincent's work
Laurent Sansonetti authored
75 mr_enc_s_default_external(VALUE klass, SEL sel)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
76 {
2b7d5d5 import vincent's work
Laurent Sansonetti authored
77 return (VALUE)default_external;
9c1d230 committing experimental branch content
Laurent Sansonetti authored
78 }
79
80 static VALUE
2b7d5d5 import vincent's work
Laurent Sansonetti authored
81 mr_enc_name(VALUE self, SEL sel)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
82 {
96ab900 more work
Laurent Sansonetti authored
83 return rb_usascii_str_new2(RENC(self)->public_name);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
84 }
85
86 static VALUE
2b7d5d5 import vincent's work
Laurent Sansonetti authored
87 mr_enc_inspect(VALUE self, SEL sel)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
88 {
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
89 return rb_sprintf("#<%s:%s>", rb_obj_classname(self),
96ab900 more work
Laurent Sansonetti authored
90 RENC(self)->public_name);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
91 }
92
93 static VALUE
2b7d5d5 import vincent's work
Laurent Sansonetti authored
94 mr_enc_names(VALUE self, SEL sel)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
95 {
96ab900 more work
Laurent Sansonetti authored
96 rb_encoding_t *encoding = RENC(self);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
97
2b7d5d5 import vincent's work
Laurent Sansonetti authored
98 VALUE ary = rb_ary_new2(encoding->aliases_count + 1);
96ab900 more work
Laurent Sansonetti authored
99 rb_ary_push(ary, rb_usascii_str_new2(encoding->public_name));
2b7d5d5 import vincent's work
Laurent Sansonetti authored
100 for (unsigned int i = 0; i < encoding->aliases_count; ++i) {
96ab900 more work
Laurent Sansonetti authored
101 rb_ary_push(ary, rb_usascii_str_new2(encoding->aliases[i]));
2b7d5d5 import vincent's work
Laurent Sansonetti authored
102 }
103 return ary;
1623532 added Encoding#default_external= and Encoding#default_internal= which do...
Laurent Sansonetti authored
104 }
105
2b7d5d5 import vincent's work
Laurent Sansonetti authored
106 static VALUE
107 mr_enc_ascii_compatible_p(VALUE self, SEL sel)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
108 {
96ab900 more work
Laurent Sansonetti authored
109 return RENC(self)->ascii_compatible ? Qtrue : Qfalse;
9c1d230 committing experimental branch content
Laurent Sansonetti authored
110 }
111
112 static VALUE
2b7d5d5 import vincent's work
Laurent Sansonetti authored
113 mr_enc_dummy_p(VALUE self, SEL sel)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
114 {
2b7d5d5 import vincent's work
Laurent Sansonetti authored
115 return Qfalse;
9c1d230 committing experimental branch content
Laurent Sansonetti authored
116 }
117
2b7d5d5 import vincent's work
Laurent Sansonetti authored
118 static void
96ab900 more work
Laurent Sansonetti authored
119 define_encoding_constant(const char *name, rb_encoding_t *encoding)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
120 {
2b7d5d5 import vincent's work
Laurent Sansonetti authored
121 char c = name[0];
122 if ((c >= '0') && (c <= '9')) {
123 // constants can't start with a number
124 return;
9c1d230 committing experimental branch content
Laurent Sansonetti authored
125 }
126
2b7d5d5 import vincent's work
Laurent Sansonetti authored
127 char *name_copy = strdup(name);
128 if ((c >= 'a') && (c <= 'z')) {
129 // the first character must be upper case
130 name_copy[0] = c - ('a' - 'A');
131 }
9c1d230 committing experimental branch content
Laurent Sansonetti authored
132
2b7d5d5 import vincent's work
Laurent Sansonetti authored
133 // '.' and '-' must be transformed into '_'
134 for (int i = 0; name_copy[i]; ++i) {
135 if ((name_copy[i] == '.') || (name_copy[i] == '-')) {
136 name_copy[i] = '_';
023dd4d fixed Encoding#name for 10.6
Laurent Sansonetti authored
137 }
138 }
b881853 s/MR//
Laurent Sansonetti authored
139 rb_define_const(rb_cEncoding, name_copy, (VALUE)encoding);
2b7d5d5 import vincent's work
Laurent Sansonetti authored
140 free(name_copy);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
141 }
142
96ab900 more work
Laurent Sansonetti authored
143 extern void enc_init_ucnv_encoding(rb_encoding_t *encoding);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
144
2b7d5d5 import vincent's work
Laurent Sansonetti authored
145 enum {
146 ENCODING_TYPE_SPECIAL = 0,
147 ENCODING_TYPE_UCNV
148 };
9c1d230 committing experimental branch content
Laurent Sansonetti authored
149
2b7d5d5 import vincent's work
Laurent Sansonetti authored
150 static void
151 add_encoding(
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
152 unsigned int encoding_index, // index of the encoding in the encodings
153 // array
96ab900 more work
Laurent Sansonetti authored
154 unsigned int rb_encoding_type,
2b7d5d5 import vincent's work
Laurent Sansonetti authored
155 const char *public_name, // public name for the encoding
156 unsigned char min_char_size,
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
157 bool single_byte_encoding, // in the encoding a character takes only
158 // one byte
2b7d5d5 import vincent's work
Laurent Sansonetti authored
159 bool ascii_compatible, // is the encoding ASCII compatible or not
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
160 ... // aliases for the encoding (should no include the public name)
161 // - must end with a NULL
2b7d5d5 import vincent's work
Laurent Sansonetti authored
162 )
163 {
164 assert(encoding_index < ENCODINGS_COUNT);
165
166 // create an array for the aliases
167 unsigned int aliases_count = 0;
168 va_list va_aliases;
169 va_start(va_aliases, ascii_compatible);
170 while (va_arg(va_aliases, const char *) != NULL) {
171 ++aliases_count;
172 }
173 va_end(va_aliases);
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
174 const char **aliases = (const char **)
175 malloc(sizeof(const char *) * aliases_count);
2b7d5d5 import vincent's work
Laurent Sansonetti authored
176 va_start(va_aliases, ascii_compatible);
177 for (unsigned int i = 0; i < aliases_count; ++i) {
178 aliases[i] = va_arg(va_aliases, const char *);
179 }
180 va_end(va_aliases);
181
182 // create the MacRuby object
96ab900 more work
Laurent Sansonetti authored
183 NEWOBJ(encoding, rb_encoding_t);
2b7d5d5 import vincent's work
Laurent Sansonetti authored
184 encoding->basic.flags = 0;
b881853 s/MR//
Laurent Sansonetti authored
185 encoding->basic.klass = rb_cEncoding;
96ab900 more work
Laurent Sansonetti authored
186 rb_encodings[encoding_index] = encoding;
187 GC_RETAIN(encoding); // it should never be deallocated
2b7d5d5 import vincent's work
Laurent Sansonetti authored
188
189 // fill the fields
190 encoding->index = encoding_index;
191 encoding->public_name = public_name;
192 encoding->min_char_size = min_char_size;
193 encoding->single_byte_encoding = single_byte_encoding;
194 encoding->ascii_compatible = ascii_compatible;
195 encoding->aliases_count = aliases_count;
196 encoding->aliases = aliases;
197
198 // fill the default implementations with aborts
199 encoding->methods.update_flags = str_undefined_update_flags;
200 encoding->methods.make_data_binary = str_undefined_make_data_binary;
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
201 encoding->methods.try_making_data_uchars =
202 str_undefined_try_making_data_uchars;
2b7d5d5 import vincent's work
Laurent Sansonetti authored
203 encoding->methods.length = str_undefined_length;
204 encoding->methods.bytesize = str_undefined_bytesize;
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
205 encoding->methods.get_character_boundaries =
206 str_undefined_get_character_boundaries;
207 encoding->methods.offset_in_bytes_to_index =
208 str_undefined_offset_in_bytes_to_index;
2b7d5d5 import vincent's work
Laurent Sansonetti authored
209
96ab900 more work
Laurent Sansonetti authored
210 switch (rb_encoding_type) {
2b7d5d5 import vincent's work
Laurent Sansonetti authored
211 case ENCODING_TYPE_SPECIAL:
022cd7c fixed ByteString#encoding to always return US_ASCII (for now)
Laurent Sansonetti authored
212 break;
2b7d5d5 import vincent's work
Laurent Sansonetti authored
213 case ENCODING_TYPE_UCNV:
214 enc_init_ucnv_encoding(encoding);
215 break;
216 default:
217 abort();
9c1d230 committing experimental branch content
Laurent Sansonetti authored
218 }
022cd7c fixed ByteString#encoding to always return US_ASCII (for now)
Laurent Sansonetti authored
219
2b7d5d5 import vincent's work
Laurent Sansonetti authored
220 // create constants
221 define_encoding_constant(public_name, encoding);
222 for (unsigned int i = 0; i < aliases_count; ++i) {
223 define_encoding_constant(aliases[i], encoding);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
224 }
225 }
226
2b7d5d5 import vincent's work
Laurent Sansonetti authored
227 static void
228 create_encodings(void)
229 {
230 add_encoding(ENCODING_BINARY, ENCODING_TYPE_SPECIAL, "ASCII-8BIT", 1, true, true, "BINARY", NULL);
231 add_encoding(ENCODING_ASCII, ENCODING_TYPE_UCNV, "US-ASCII", 1, true, true, "ASCII", "ANSI_X3.4-1968", "646", NULL);
232 add_encoding(ENCODING_UTF8, ENCODING_TYPE_UCNV, "UTF-8", 1, false, true, "CP65001", NULL);
233 add_encoding(ENCODING_UTF16BE, ENCODING_TYPE_UCNV, "UTF-16BE", 2, false, false, NULL);
234 add_encoding(ENCODING_UTF16LE, ENCODING_TYPE_UCNV, "UTF-16LE", 2, false, false, NULL);
235 add_encoding(ENCODING_UTF32BE, ENCODING_TYPE_UCNV, "UTF-32BE", 4, false, false, "UCS-4BE", NULL);
236 add_encoding(ENCODING_UTF32LE, ENCODING_TYPE_UCNV, "UTF-32LE", 4, false, false, "UCS-4LE", NULL);
237 add_encoding(ENCODING_ISO8859_1, ENCODING_TYPE_UCNV, "ISO-8859-1", 1, true, true, "ISO8859-1", NULL);
238 add_encoding(ENCODING_MACROMAN, ENCODING_TYPE_UCNV, "macRoman", 1, true, true, NULL);
239 // FIXME: the ICU conversion tables do not seem to match Ruby's Japanese conversion tables
240 //add_encoding(ENCODING_EUCJP, ENCODING_TYPE_RUBY, "EUC-JP", 1, false, true, "eucJP", NULL);
241 //add_encoding(ENCODING_SJIS, ENCODING_TYPE_RUBY, "Shift_JIS", 1, false, true, "SJIS", NULL);
242 //add_encoding(ENCODING_CP932, ENCODING_TYPE_RUBY, "Windows-31J", 1, false, true, "CP932", "csWindows31J", NULL);
243
96ab900 more work
Laurent Sansonetti authored
244 default_external = rb_encodings[ENCODING_UTF8];
245 default_internal = rb_encodings[ENCODING_UTF8];
9c1d230 committing experimental branch content
Laurent Sansonetti authored
246 }
247
2b7d5d5 import vincent's work
Laurent Sansonetti authored
248 VALUE
249 mr_enc_s_is_compatible(VALUE klass, SEL sel, VALUE str1, VALUE str2);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
250
2b7d5d5 import vincent's work
Laurent Sansonetti authored
251 void
b881853 s/MR//
Laurent Sansonetti authored
252 Init_Encoding(void)
2b7d5d5 import vincent's work
Laurent Sansonetti authored
253 {
b881853 s/MR//
Laurent Sansonetti authored
254 rb_cEncoding = rb_define_class("Encoding", rb_cObject);
255 rb_undef_alloc_func(rb_cEncoding);
256
257 rb_objc_define_method(rb_cEncoding, "to_s", mr_enc_name, 0);
258 rb_objc_define_method(rb_cEncoding, "inspect", mr_enc_inspect, 0);
259 rb_objc_define_method(rb_cEncoding, "name", mr_enc_name, 0);
260 rb_objc_define_method(rb_cEncoding, "names", mr_enc_names, 0);
261 rb_objc_define_method(rb_cEncoding, "dummy?", mr_enc_dummy_p, 0);
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
262 rb_objc_define_method(rb_cEncoding, "ascii_compatible?",
263 mr_enc_ascii_compatible_p, 0);
264 rb_objc_define_method(CLASS_OF(rb_cEncoding), "list", mr_enc_s_list, 0);
265 rb_objc_define_method(CLASS_OF(rb_cEncoding), "name_list",
266 mr_enc_s_name_list, 0);
267 rb_objc_define_method(CLASS_OF(rb_cEncoding), "aliases",
268 mr_enc_s_aliases, 0);
b881853 s/MR//
Laurent Sansonetti authored
269 //rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
270 // it's defined on Encoding, but it requires String's internals so it's
271 // defined with String
272 rb_objc_define_method(CLASS_OF(rb_cEncoding), "compatible?",
273 mr_enc_s_is_compatible, 2);
2b7d5d5 import vincent's work
Laurent Sansonetti authored
274
275 //rb_define_method(rb_cEncoding, "_dump", enc_dump, -1);
276 //rb_define_singleton_method(rb_cEncoding, "_load", enc_load, 1);
277
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
278 rb_objc_define_method(CLASS_OF(rb_cEncoding), "default_external",
279 mr_enc_s_default_external, 0);
b881853 s/MR//
Laurent Sansonetti authored
280 //rb_define_singleton_method(rb_cEncoding, "default_external=", set_default_external, 1);
0382b34 indented code, better type checking, removed rb_cCFString, started addin...
Laurent Sansonetti authored
281 rb_objc_define_method(CLASS_OF(rb_cEncoding), "default_internal",
282 mr_enc_s_default_internal, 0);
b881853 s/MR//
Laurent Sansonetti authored
283 //rb_define_singleton_method(rb_cEncoding, "default_internal=", set_default_internal, 1);
284 //rb_define_singleton_method(rb_cEncoding, "locale_charmap", rb_locale_charmap, 0);
2b7d5d5 import vincent's work
Laurent Sansonetti authored
285
286 create_encodings();
9c1d230 committing experimental branch content
Laurent Sansonetti authored
287 }
96ab900 more work
Laurent Sansonetti authored
288
289 // MRI C-API compatibility.
290
291 rb_encoding_t *
292 rb_enc_find(const char *name)
293 {
294 for (unsigned int i = 0; i < ENCODINGS_COUNT; i++) {
295 rb_encoding_t *enc = rb_encodings[i];
296 if (strcasecmp(enc->public_name, name) == 0) {
297 return enc;
298 }
299 for (unsigned int j = 0; j < enc->aliases_count; j++) {
300 const char *alias = enc->aliases[j];
301 if (strcasecmp(alias, name) == 0) {
302 return enc;
303 }
304 }
305 }
306 return NULL;
307 }
308
309 VALUE
310 rb_enc_from_encoding(rb_encoding_t *enc)
311 {
312 return (VALUE)enc;
313 }
314
315 rb_encoding_t *
316 rb_enc_get(VALUE obj)
317 {
318 if (IS_RSTR(obj)) {
319 return RSTR(obj)->encoding;
320 }
321 // TODO support symbols
322 return NULL;
323 }
324
325 rb_encoding_t *
326 rb_to_encoding(VALUE obj)
327 {
328 rb_encoding_t *enc;
329 if (CLASS_OF(obj) == rb_cEncoding) {
330 enc = RENC(obj);
331 }
332 else {
333 StringValue(obj);
334 enc = rb_enc_find(RSTRING_PTR(obj));
335 if (enc == NULL) {
336 rb_raise(rb_eArgError, "unknown encoding name - %s",
337 RSTRING_PTR(obj));
338 }
339 }
340 return enc;
341 }
342
343 const char *
344 rb_enc_name(rb_encoding_t *enc)
345 {
346 return RENC(enc)->public_name;
347 }
348
349 VALUE
350 rb_enc_name2(rb_encoding_t *enc)
351 {
352 return rb_usascii_str_new2(rb_enc_name(enc));
353 }
354
355 long
356 rb_enc_mbminlen(rb_encoding_t *enc)
357 {
358 return enc->min_char_size;
359 }
360
361 long
362 rb_enc_mbmaxlen(rb_encoding_t *enc)
363 {
364 return enc->single_byte_encoding ? 1 : 10; // XXX 10?
365 }
366
367 rb_encoding_t *
368 rb_locale_encoding(void)
369 {
370 // XXX
371 return rb_encodings[ENCODING_UTF8];
372 }
373
374 void
375 rb_enc_set_default_external(VALUE encoding)
376 {
377 assert(CLASS_OF(encoding) == rb_cEncoding);
378 default_external = RENC(encoding);
379 }
380
Something went wrong with that request. Please try again.