Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Newer
Older
100644 475 lines (423 sloc) 13.501 kb
9c1d230 committing experimental branch content
Laurent Sansonetti authored
1 /**********************************************************************
2
3 transcode.c -
4
5 $Author: naruse $
6 created at: Tue Oct 30 16:10:22 JST 2007
7
8 Copyright (C) 2007 Martin Duerst
9
10 **********************************************************************/
11
12 #include "ruby/ruby.h"
13
14 #if !WITH_OBJC
15
16 #include "ruby/encoding.h"
17 #define PType (int)
18 #include "transcode_data.h"
19 #include <ctype.h>
20
21 static VALUE sym_invalid, sym_ignore;
22 #define INVALID_IGNORE 0x1
23
24 /*
25 * Dispatch data and logic
26 */
27
28 static st_table *transcoder_table, *transcoder_lib_table;
29
30 #define TRANSCODER_INTERNAL_SEPARATOR '\t'
31
32 static char *
33 transcoder_key(const char *from_e, const char *to_e)
34 {
35 int to_len = strlen(to_e);
36 int from_len = strlen(from_e);
37 char *const key = xmalloc(to_len + from_len + 2);
38
39 memcpy(key, to_e, to_len);
40 memcpy(key + to_len + 1, from_e, from_len + 1);
41 key[to_len] = TRANSCODER_INTERNAL_SEPARATOR;
42 return key;
43 }
44
45 void
46 rb_register_transcoder(const rb_transcoder *tr)
47 {
48 st_data_t k, val = 0;
49 const char *const from_e = tr->from_encoding;
50 const char *const to_e = tr->to_encoding;
51 char *const key = transcoder_key(from_e, to_e);
52
53 if (st_lookup(transcoder_table, (st_data_t)key, &val)) {
54 xfree(key);
55 rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered",
56 from_e, to_e);
57 }
58 k = (st_data_t)key;
59 if (st_delete(transcoder_lib_table, &k, &val)) {
60 xfree((char *)k);
61 }
62 st_insert(transcoder_table, (st_data_t)key, (st_data_t)tr);
63 }
64
65 static void
66 declare_transcoder(const char *to, const char *from, const char *lib)
67 {
68 const char *const key = transcoder_key(to, from);
69 st_data_t k = (st_data_t)key, val;
70
71 if (st_delete(transcoder_lib_table, &k, &val)) {
72 xfree((char *)k);
73 }
74 st_insert(transcoder_lib_table, (st_data_t)key, (st_data_t)lib);
75 }
76
77 #define MAX_TRANSCODER_LIBNAME_LEN 64
78 static const char transcoder_lib_prefix[] = "enc/trans/";
79
80 void
81 rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
82 {
83 if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) {
84 rb_raise(rb_eArgError, "invalid library name - %s",
85 lib ? lib : "(null)");
86 }
87 declare_transcoder(enc1, enc2, lib);
88 declare_transcoder(enc2, enc1, lib);
89 }
90
91 #define encoding_equal(enc1, enc2) (STRCASECMP(enc1, enc2) == 0)
92
93 static const rb_transcoder *
94 transcode_dispatch(const char* from_encoding, const char* to_encoding)
95 {
96 char *const key = transcoder_key(from_encoding, to_encoding);
97 st_data_t k, val = 0;
98
99 while (!st_lookup(transcoder_table, (k = (st_data_t)key), &val) &&
100 st_delete(transcoder_lib_table, &k, &val)) {
101 const char *const lib = (const char *)val;
102 int len = strlen(lib);
103 char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN];
104
105 xfree((char *)k);
106 if (len > MAX_TRANSCODER_LIBNAME_LEN) return NULL;
107 memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1);
108 memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1);
109 if (!rb_require(path)) return NULL;
110 }
111 if (!val) {
112 if (!st_lookup(transcoder_table, (st_data_t)key, &val)) {
113 /* multistep logic, via UTF-8 */
114 if (!encoding_equal(from_encoding, "UTF-8") &&
115 !encoding_equal(to_encoding, "UTF-8") &&
116 transcode_dispatch("UTF-8", to_encoding)) { /* check that we have a second step */
117 return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */
118 }
119 return NULL;
120 }
121 }
122 return (rb_transcoder *)val;
123 }
124
125
126 /*
127 * Transcoding engine logic
128 */
129 static void
130 transcode_loop(unsigned char **in_pos, unsigned char **out_pos,
131 unsigned char *in_stop, unsigned char *out_stop,
132 const rb_transcoder *my_transcoder,
133 rb_transcoding *my_transcoding,
134 const int opt)
135 {
136 unsigned char *in_p = *in_pos, *out_p = *out_pos;
137 const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start;
138 const BYTE_LOOKUP *next_table;
139 unsigned char *char_start;
140 unsigned int next_offset;
141 VALUE next_info;
142 unsigned char next_byte;
143 int from_utf8 = my_transcoder->from_utf8;
144 unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
145 while (in_p < in_stop) {
146 char_start = in_p;
147 next_table = conv_tree_start;
148 if (out_p >= out_s) {
149 int len = (out_p - *out_pos);
150 int new_len = (len + my_transcoder->max_output) * 2;
151 *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
152 out_p = *out_pos + len;
153 out_s = *out_pos + new_len - my_transcoder->max_output;
154 }
155 next_byte = (unsigned char)*in_p++;
156 follow_byte:
157 next_offset = next_table->base[next_byte];
158 next_info = (VALUE)next_table->info[next_offset];
159 follow_info:
160 switch (next_info & 0x1F) {
161 case NOMAP:
162 *out_p++ = next_byte;
163 continue;
164 case 0x00: case 0x04: case 0x08: case 0x0C:
165 case 0x10: case 0x14: case 0x18: case 0x1C:
166 if (in_p >= in_stop) {
167 /* todo: deal with the case of backtracking */
168 /* todo: deal with incomplete input (streaming) */
169 goto invalid;
170 }
171 next_byte = (unsigned char)*in_p++;
172 if (from_utf8) {
173 if ((next_byte&0xC0) == 0x80)
174 next_byte -= 0x80;
175 else {
176 in_p--; /* may need to add more code later to revert other things */
177 goto invalid;
178 }
179 }
180 next_table = (const BYTE_LOOKUP *)next_info;
181 goto follow_byte;
182 /* maybe rewrite the following cases to use fallthrough???? */
183 case ZERObt: /* drop input */
184 continue;
185 case ONEbt:
186 *out_p++ = getBT1(next_info);
187 continue;
188 case TWObt:
189 *out_p++ = getBT1(next_info);
190 *out_p++ = getBT2(next_info);
191 continue;
192 case FOURbt:
193 *out_p++ = getBT0(next_info);
194 case THREEbt: /* fall through */
195 *out_p++ = getBT1(next_info);
196 *out_p++ = getBT2(next_info);
197 *out_p++ = getBT3(next_info);
198 continue;
199 case FUNii:
200 next_info = (VALUE)(*my_transcoder->func_ii)(next_info);
201 goto follow_info;
202 case FUNsi:
203 next_info = (VALUE)(*my_transcoder->func_si)(char_start);
204 goto follow_info;
205 break;
206 case FUNio:
207 out_p += (VALUE)(*my_transcoder->func_io)(next_info, out_p);
208 break;
209 case FUNso:
210 out_p += (VALUE)(*my_transcoder->func_so)(char_start, out_p);
211 break;
212 case INVALID:
213 goto invalid;
214 case UNDEF:
215 /* todo: add code for alternate behaviors */
216 rb_raise(rb_eRuntimeError /*@@@change exception*/, "conversion undefined for byte sequence (maybe invalid byte sequence)");
217 continue;
218 }
219 continue;
220 invalid:
221 /* deal with invalid byte sequence */
222 /* todo: add more alternative behaviors */
223 if (opt&INVALID_IGNORE) {
224 continue;
225 }
226 rb_raise(rb_eRuntimeError /*change exception*/, "invalid byte sequence");
227 continue;
228 }
229 /* cleanup */
230 *in_pos = in_p;
231 *out_pos = out_p;
232 }
233
234
235 /*
236 * String-specific code
237 */
238
239 static unsigned char *
240 str_transcoding_resize(rb_transcoding *my_transcoding, int len, int new_len)
241 {
242 VALUE dest_string = my_transcoding->ruby_string_dest;
243 rb_str_resize(dest_string, new_len);
244 return (unsigned char *)RSTRING_BYTEPTR(dest_string);
245 }
246
247 static int
248 str_transcode(int argc, VALUE *argv, VALUE *self)
249 {
250 VALUE dest;
251 VALUE str = *self;
252 long blen, slen;
253 unsigned char *buf, *bp, *sp, *fromp;
254 rb_encoding *from_enc, *to_enc;
255 const char *from_e, *to_e;
256 int from_encidx, to_encidx;
257 VALUE from_encval, to_encval;
258 const rb_transcoder *my_transcoder;
259 rb_transcoding my_transcoding;
260 int final_encoding = 0;
261 VALUE opt;
262 int options = 0;
263
264 opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
265 if (!NIL_P(opt)) {
266 VALUE v;
267
268 argc--;
269 v = rb_hash_aref(opt, sym_invalid);
270 if (NIL_P(v)) {
271 rb_raise(rb_eArgError, "unknown value for invalid: setting");
272 }
273 else if (v==sym_ignore) {
274 options |= INVALID_IGNORE;
275 }
276 }
277 if (argc < 1 || argc > 2) {
278 rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
279 }
280 if ((to_encidx = rb_to_encoding_index(to_encval = argv[0])) < 0) {
281 to_enc = 0;
282 to_encidx = 0;
283 to_e = StringValueCStr(to_encval);
284 }
285 else {
286 to_enc = rb_enc_from_index(to_encidx);
287 to_e = rb_enc_name(to_enc);
288 }
289 if (argc==1) {
290 from_encidx = rb_enc_get_index(str);
291 from_enc = rb_enc_from_index(from_encidx);
292 from_e = rb_enc_name(from_enc);
293 }
294 else if ((from_encidx = rb_to_encoding_index(from_encval = argv[1])) < 0) {
295 from_enc = 0;
296 from_e = StringValueCStr(from_encval);
297 }
298 else {
299 from_enc = rb_enc_from_index(from_encidx);
300 from_e = rb_enc_name(from_enc);
301 }
302
303 if (from_enc && from_enc == to_enc) {
304 return -1;
305 }
306 if (from_enc && to_enc && rb_enc_asciicompat(from_enc) && rb_enc_asciicompat(to_enc)) {
307 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
308 return to_encidx;
309 }
310 }
311 if (encoding_equal(from_e, to_e)) {
312 return -1;
313 }
314
315 while (!final_encoding) { /* loop for multistep transcoding */
316 /* later, maybe use smaller intermediate strings for very long strings */
317 if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
318 rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
319 }
320
321 my_transcoding.transcoder = my_transcoder;
322
323 if (my_transcoder->preprocessor) {
324 fromp = sp = (unsigned char *)RSTRING_BYTEPTR(str);
325 slen = RSTRING_BYTELEN(str);
326 blen = slen + 30; /* len + margin */
327 dest = rb_str_tmp_new(blen);
328 bp = (unsigned char *)RSTRING_BYTEPTR(dest);
329 my_transcoding.ruby_string_dest = dest;
330 (*my_transcoder->preprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding);
331 if (fromp != sp+slen) {
332 rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
333 }
334 buf = (unsigned char *)RSTRING_BYTEPTR(dest);
335 *bp = '\0';
336 rb_str_set_len(dest, bp - buf);
337 str = dest;
338 }
339 fromp = sp = (unsigned char *)RSTRING_BYTEPTR(str);
340 slen = RSTRING_BYTELEN(str);
341 blen = slen + 30; /* len + margin */
342 dest = rb_str_tmp_new(blen);
343 bp = (unsigned char *)RSTRING_BYTEPTR(dest);
344 my_transcoding.ruby_string_dest = dest;
345 my_transcoding.flush_func = str_transcoding_resize;
346
347 transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding, options);
348 if (fromp != sp+slen) {
349 rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
350 }
351 buf = (unsigned char *)RSTRING_BYTEPTR(dest);
352 *bp = '\0';
353 rb_str_set_len(dest, bp - buf);
354 if (my_transcoder->postprocessor) {
355 str = dest;
356 fromp = sp = (unsigned char *)RSTRING_BYTEPTR(str);
357 slen = RSTRING_BYTELEN(str);
358 blen = slen + 30; /* len + margin */
359 dest = rb_str_tmp_new(blen);
360 bp = (unsigned char *)RSTRING_BYTEPTR(dest);
361 my_transcoding.ruby_string_dest = dest;
362 (*my_transcoder->postprocessor)(&fromp, &bp, (sp+slen), (bp+blen), &my_transcoding);
363 if (fromp != sp+slen) {
364 rb_raise(rb_eArgError, "not fully converted, %td bytes left", sp+slen-fromp);
365 }
366 buf = (unsigned char *)RSTRING_BYTEPTR(dest);
367 *bp = '\0';
368 rb_str_set_len(dest, bp - buf);
369 }
370
371 if (encoding_equal(my_transcoder->to_encoding, to_e)) {
372 final_encoding = 1;
373 }
374 else {
375 from_e = my_transcoder->to_encoding;
376 str = dest;
377 }
378 }
379 /* set encoding */
380 if (!to_enc) {
381 to_encidx = rb_define_dummy_encoding(to_e);
382 }
383 *self = dest;
384
385 return to_encidx;
386 }
387
388 /*
389 * call-seq:
390 * str.encode!(encoding [, options] ) => str
391 * str.encode!(to_encoding, from_encoding [, options] ) => str
392 *
393 * The first form transcodes the contents of <i>str</i> from
394 * str.encoding to +encoding+.
395 * The second form transcodes the contents of <i>str</i> from
396 * from_encoding to to_encoding.
397 * The options Hash gives details for conversion. See String#encode
398 * for details.
399 * Returns the string even if no changes were made.
400 */
401
402 static VALUE
05695a6 ported to rb_objc_define_method()
Laurent Sansonetti authored
403 str_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
404 {
405 VALUE newstr = str;
406 int encidx = str_transcode(argc, argv, &newstr);
407 int cr = 0;
408
409 if (encidx < 0) return str;
410 rb_str_shared_replace(str, newstr);
411 rb_enc_associate_index(str, encidx);
412
413 /* transcoded string never be broken. */
414 if (rb_enc_asciicompat(rb_enc_from_index(encidx))) {
415 rb_str_coderange_scan_restartable(RSTRING_BYTEPTR(str), RSTRING_END(str), 0, &cr);
416 }
417 else {
418 cr = ENC_CODERANGE_VALID;
419 }
420 ENC_CODERANGE_SET(str, cr);
421 return str;
422 }
423
424 /*
425 * call-seq:
426 * str.encode(encoding [, options] ) => str
427 * str.encode(to_encoding, from_encoding [, options] ) => str
428 *
429 * The first form returns a copy of <i>str</i> transcoded
430 * to encoding +encoding+.
431 * The second form returns a copy of <i>str</i> transcoded
432 * from from_encoding to to_encoding.
433 * The options Hash gives details for conversion. Details
434 * to be added.
435 */
436
437 #else // WITH_OBJC
438
439 static VALUE
05695a6 ported to rb_objc_define_method()
Laurent Sansonetti authored
440 str_encode_bang(VALUE str, SEL sel, int argc, VALUE *argv)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
441 {
442 /* TODO */
a574509 fixed the build (sorry guys)
Laurent Sansonetti authored
443 return str;
9c1d230 committing experimental branch content
Laurent Sansonetti authored
444 }
445
446 #endif
447
448 static VALUE
05695a6 ported to rb_objc_define_method()
Laurent Sansonetti authored
449 str_encode(VALUE str, SEL sel, int argc, VALUE *argv)
9c1d230 committing experimental branch content
Laurent Sansonetti authored
450 {
451 str = rb_str_dup(str);
a574509 fixed the build (sorry guys)
Laurent Sansonetti authored
452 return str_encode_bang(str, 0, argc, argv);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
453 }
454
455 VALUE
456 rb_str_transcode(VALUE str, VALUE to)
457 {
a574509 fixed the build (sorry guys)
Laurent Sansonetti authored
458 return str_encode(str, 0, 1, &to);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
459 }
460
461 void
462 Init_transcode(void)
463 {
464 #if !WITH_OBJC
465 transcoder_table = st_init_strcasetable();
466 transcoder_lib_table = st_init_strcasetable();
467
468 sym_invalid = ID2SYM(rb_intern("invalid"));
469 sym_ignore = ID2SYM(rb_intern("ignore"));
470 #endif
471
05695a6 ported to rb_objc_define_method()
Laurent Sansonetti authored
472 rb_objc_define_method(rb_cString, "encode", str_encode, -1);
473 rb_objc_define_method(rb_cString, "encode!", str_encode_bang, -1);
9c1d230 committing experimental branch content
Laurent Sansonetti authored
474 }
Something went wrong with that request. Please try again.