Permalink
Browse files

Started using C blocks (note that they should not be used in
ObjC or sensitive places like dispatcher.cpp).

All this was to start cleaning up the string code.
I removed the per-encoding function pointers, as I'm pretty sure no
one will use them to extend the encoding handling and they make the code
harder to maintain.

Feature-wise currently the only change is that String#inspect is much
better when part of a string is invalid:
% ./miniruby -e 'p "あ\xFF"'
"あ\xFF"

git-svn-id: http://svn.macosforge.org/repository/ruby/MacRuby/trunk@5049 23306eb0-4c56-4727-a40e-e92c0eb68959
  • Loading branch information...
1 parent 895220f commit 21227436da733986dcbead973ab63f367c1da01b @vincentisambart vincentisambart committed Dec 18, 2010
Showing with 187 additions and 101 deletions.
  1. +0 −26 encoding.c
  2. +0 −13 encoding.h
  3. +38 −0 encoding_ucnv.h
  4. +2 −2 rakelib/builder/options.rb
  5. +71 −37 string.c
  6. +61 −23 ucnv.c
  7. +15 −0 vm.cpp
View
@@ -22,16 +22,6 @@ rb_encoding_t *default_internal = NULL;
static rb_encoding_t *default_external = NULL;
rb_encoding_t *rb_encodings[ENCODINGS_COUNT];
-static void str_undefined_update_flags(rb_str_t *self) { abort(); }
-static void str_undefined_make_data_binary(rb_str_t *self) { abort(); }
-static bool str_undefined_try_making_data_uchars(rb_str_t *self) { abort(); }
-static long str_undefined_length(rb_str_t *self, bool ucs2_mode) { abort(); }
-static long str_undefined_bytesize(rb_str_t *self) { abort(); }
-static character_boundaries_t str_undefined_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode) { abort(); }
-static long str_undefined_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode) { abort(); }
-static void str_undefined_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length) { abort(); }
-static void str_undefined_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *pos, char **bytes, long *bytes_length) { abort(); }
-
static VALUE
mr_enc_s_list(VALUE klass, SEL sel)
{
@@ -273,22 +263,6 @@ add_encoding(
encoding->aliases_count = aliases_count;
encoding->aliases = aliases;
- // fill the default implementations with aborts
- encoding->methods.update_flags = str_undefined_update_flags;
- encoding->methods.make_data_binary = str_undefined_make_data_binary;
- encoding->methods.try_making_data_uchars =
- str_undefined_try_making_data_uchars;
- encoding->methods.length = str_undefined_length;
- encoding->methods.bytesize = str_undefined_bytesize;
- encoding->methods.get_character_boundaries =
- str_undefined_get_character_boundaries;
- encoding->methods.offset_in_bytes_to_index =
- str_undefined_offset_in_bytes_to_index;
- encoding->methods.transcode_to_utf16 =
- str_undefined_transcode_to_utf16;
- encoding->methods.transcode_from_utf16 =
- str_undefined_transcode_from_utf16;
-
switch (rb_encoding_type) {
case ENCODING_TYPE_SPECIAL:
break;
View
@@ -110,18 +110,6 @@ typedef struct {
long end_offset_in_bytes;
} character_boundaries_t;
-typedef struct {
- void (*update_flags)(rb_str_t *);
- void (*make_data_binary)(rb_str_t *);
- bool (*try_making_data_uchars)(rb_str_t *);
- long (*length)(rb_str_t *, bool);
- long (*bytesize)(rb_str_t *);
- character_boundaries_t (*get_character_boundaries)(rb_str_t *, long, bool);
- long (*offset_in_bytes_to_index)(rb_str_t *, long, bool);
- void (*transcode_to_utf16)(struct rb_encoding *, rb_str_t *, long *, UChar **, long *);
- void (*transcode_from_utf16)(struct rb_encoding *, UChar *, long, long *, char **, long *);
-} encoding_methods_t;
-
typedef struct rb_encoding {
struct RBasic basic;
unsigned int index;
@@ -131,7 +119,6 @@ typedef struct rb_encoding {
unsigned char min_char_size;
bool single_byte_encoding : 1;
bool ascii_compatible : 1;
- encoding_methods_t methods;
void *private_data;
} rb_encoding_t;
View
@@ -0,0 +1,38 @@
+/*
+ * MacRuby implementation of Ruby 1.9 String.
+ *
+ * This file is covered by the Ruby license. See COPYING for more details.
+ *
+ * Copyright (C) 2007-2010, Apple Inc. All rights reserved.
+ * Copyright (C) 1993-2007 Yukihiro Matsumoto
+ * Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
+ * Copyright (C) 2000 Information-technology Promotion Agency, Japan
+ */
+
+// Declarations of the str_ucnv_* string primitives. string.c calls these
+// directly for encodings it does not handle itself; they replace the former
+// per-encoding function-pointer table.
+//
+// The guard name avoids a leading double underscore, which is reserved for
+// the implementation, and mirrors the file name.
+#ifndef ENCODING_UCNV_H_
+#define ENCODING_UCNV_H_
+
+#include "encoding.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// Block invoked once per character while iterating over a string.
+//   c                - the decoded code point; string.c passes U_SENTINEL
+//                      for bytes it cannot interpret as a character.
+//   character_start  - pointer to the first byte of the character inside
+//                      the string's own storage.
+//   character_length - number of bytes the character occupies, starting at
+//                      character_start.
+//   stop             - set *stop to true to end the iteration early.
+typedef void (^each_char_callback_t)(UChar32 c, const char* character_start, long character_length, bool *stop);
+
+void str_ucnv_update_flags(rb_str_t *self);
+void str_ucnv_make_data_binary(rb_str_t *self);
+bool str_ucnv_try_making_data_uchars(rb_str_t *self);
+long str_ucnv_length(rb_str_t *self, bool ucs2_mode);
+long str_ucnv_bytesize(rb_str_t *self);
+character_boundaries_t str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode);
+long str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode);
+void str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length);
+void str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *utf16_pos, char **bytes, long *bytes_length);
+void str_ucnv_each_char(rb_str_t *self, each_char_callback_t callback);
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif
+
+#endif /* ENCODING_UCNV_H_ */
@@ -152,8 +152,8 @@ def initialize(opt)
sdk = opt.delete(:sdk)
has_libauto = sdk ? File.exist?("#{sdk}/usr/lib/libauto.dylib") : true
archflags = archs.map { |x| "-arch #{x}" }.join(' ')
- @cflags = "-std=c99 -I. -I./include -fno-common -pipe -g -Wall -fexceptions -O#{OPTZ_LEVEL} -Wno-deprecated-declarations -Werror #{archflags}"
- @cxxflags = "-I. -I./include -g -Wall -Wno-deprecated-declarations -Werror #{archflags}"
+ @cflags = "-std=c99 -I. -I./include -pipe -fno-common -fexceptions -fblocks -g -O#{OPTZ_LEVEL} -Wall -Wno-deprecated-declarations -Werror #{archflags}"
+ @cxxflags = "-I. -I./include -fblocks -g -Wall -Wno-deprecated-declarations -Werror #{archflags}"
@ldflags = '-lpthread -ldl -lxml2 -lobjc -licucore -framework Foundation'
@ldflags << " -lauto" if has_libauto
if opt.delete(:static)
View
@@ -23,6 +23,7 @@
#include "ruby/node.h"
#include "vm.h"
#include "class.h"
+#include "encoding_ucnv.h"
#include <unicode/unum.h>
#include <unicode/utrans.h>
@@ -138,7 +139,7 @@ str_update_flags(rb_str_t *self)
str_update_flags_utf16(self);
}
else {
- self->encoding->methods.update_flags(self);
+ str_ucnv_update_flags(self);
}
}
@@ -387,7 +388,7 @@ str_make_data_binary(rb_str_t *self)
return;
}
- self->encoding->methods.make_data_binary(self);
+ str_ucnv_make_data_binary(self);
}
static bool
@@ -418,7 +419,7 @@ str_try_making_data_uchars(rb_str_t *self)
return false;
}
- return self->encoding->methods.try_making_data_uchars(self);
+ return str_ucnv_try_making_data_uchars(self);
}
static void
@@ -469,11 +470,52 @@ str_length(rb_str_t *self, bool ucs2_mode)
return div_round_up(self->length_in_bytes, 2);
}
else {
- return self->encoding->methods.length(self, ucs2_mode);
+ return str_ucnv_length(self, ucs2_mode);
}
}
}
+// Calls `callback` once for each character of `self`, in order, until the
+// string is exhausted or the callback sets its `stop` flag.
+//
+// The callback receives the code point, a pointer to the character's first
+// byte inside the string's storage, and the character's length in bytes.
+//
+// Three storage cases are handled:
+//   - data stored as UTF-16 units: decoded here with U16_NEXT;
+//   - binary or ASCII encodings: one byte per character, bytes above 127
+//     are reported as U_SENTINEL;
+//   - anything else: delegated to str_ucnv_each_char().
+static void
+str_each_char(rb_str_t *self, each_char_callback_t callback)
+{
+    if (str_is_stored_in_uchars(self)) {
+        bool stop = false;
+        long length = BYTES_TO_UCHARS(self->length_in_bytes);
+        for (long i = 0; i < length;) {
+            UChar32 c;
+            long old_i = i;
+            // U16_NEXT advances i past the code point it reads, so the
+            // character occupies the units [old_i, i).
+            U16_NEXT(self->data.uchars, i, length, c);
+            // The length must be i - old_i: the reverse is negative and
+            // makes callers that loop over the bytes see an empty character.
+            callback(c, (const char *)&self->data.uchars[old_i],
+                    UCHARS_TO_BYTES(i - old_i), &stop);
+            if (stop) {
+                return;
+            }
+        }
+    }
+    else if (BINARY_ENC(self->encoding)
+            || (self->encoding == rb_encodings[ENCODING_ASCII])) {
+        const uint8_t *pos = (uint8_t*)self->data.bytes;
+        const uint8_t *end = pos + self->length_in_bytes;
+        bool stop = false;
+        for (; pos < end; ++pos) {
+            UChar32 c;
+            if (*pos > 127) {
+                // Not a valid ASCII character: report it as invalid.
+                c = U_SENTINEL;
+            }
+            else {
+                c = *pos;
+            }
+            callback(c, (const char *)pos, 1, &stop);
+            if (stop) {
+                return;
+            }
+        }
+    }
+    else {
+        str_ucnv_each_char(self, callback);
+    }
+}
+
+
static UChar
str_get_uchar(rb_str_t *self, long pos, bool ucs2_mode)
{
@@ -494,7 +536,7 @@ str_bytesize(rb_str_t *self)
return self->length_in_bytes;
}
else {
- return self->encoding->methods.bytesize(self);
+ return str_ucnv_bytesize(self);
}
}
else {
@@ -654,7 +696,7 @@ str_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode)
+ 2;
}
else {
- boundaries = self->encoding->methods.get_character_boundaries(self,
+ boundaries = str_ucnv_get_character_boundaries(self,
index, ucs2_mode);
}
}
@@ -1032,7 +1074,7 @@ str_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes,
return BYTES_TO_UCHARS(offset_in_bytes);
}
else {
- return self->encoding->methods.offset_in_bytes_to_index(self,
+ return str_ucnv_offset_in_bytes_to_index(self,
offset_in_bytes, ucs2_mode);
}
}
@@ -1362,7 +1404,7 @@ str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_en
pos_in_src = self->length_in_bytes;
}
else {
- src_encoding_used->methods.transcode_to_utf16(src_encoding_used,
+ str_ucnv_transcode_to_utf16(src_encoding_used,
self, &pos_in_src, &utf16, &utf16_length);
}
@@ -1441,7 +1483,7 @@ str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_en
for (;;) {
long bytes_length;
char *bytes;
- dst_encoding_used->methods.transcode_from_utf16(dst_encoding_used,
+ str_ucnv_transcode_from_utf16(dst_encoding_used,
utf16, utf16_length, &utf16_pos, &bytes, &bytes_length);
if (bytes_length > 0) {
str_concat_bytes(dst_str, bytes, bytes_length);
@@ -2785,7 +2827,8 @@ str_inspect(rb_str_t *str, bool dump)
VALUE result;
if (len == 0) {
result = rb_str_new2("\"\"");
- goto bail;
+ OBJ_INFECT(result, str);
+ return result;
}
// Allocate an UTF-8 string with a good initial capacity.
@@ -2794,31 +2837,18 @@ str_inspect(rb_str_t *str, bool dump)
BINARY_ENC(str->encoding) ? (len * 5) + 2 : len + 2;
result = rb_unicode_str_new(NULL, result_init_len);
-#define GET_UCHAR(pos) \
- ((uchars \
- ? str->data.uchars[pos] : (unsigned char)str->data.bytes[pos]))
-
inspect_append(result, '"', false);
- for (long i = 0; i < len; i++) {
- const UChar c = GET_UCHAR(i);
-
- bool print;
- if (uchars) {
- print = iswprint(c);
- }
- else { // ASCII printable characters
- print = ((c >= 0x20) && (c <= 0x7E));
+ __block UChar32 prev = 0;
+ str_each_char(str, ^(UChar32 c, const char* char_start, long char_len, bool *stop) {
+ bool print = iswprint(c);
+ if (dump && prev == '#') {
+ inspect_append(result, prev, (c == '$' || c == '@' || c == '{'));
}
if (print) {
if (c == '"' || c == '\\') {
inspect_append(result, c, true);
}
- else if (dump && c == '#' && i + 1 < len) {
- const UChar c2 = GET_UCHAR(i + 1);
- const bool need_escape = c2 == '$' || c2 == '@' || c2 == '{';
- inspect_append(result, c, need_escape);
- }
- else {
+ else if (c != '#' || !dump) {
inspect_append(result, c, false);
}
}
@@ -2848,19 +2878,23 @@ str_inspect(rb_str_t *str, bool dump)
}
else {
char buf[10];
- snprintf(buf, sizeof buf, "\\x%02X", c);
- char *p = buf;
- while (*p != '\0') {
- inspect_append(result, *p, false);
- p++;
+ for (long i = 0; i < char_len; ++i) {
+ uint8_t byte = (uint8_t)char_start[i];
+ snprintf(buf, sizeof buf, "\\x%02X", byte);
+ char *p = buf;
+ while (*p != '\0') {
+ inspect_append(result, *p, false);
+ p++;
+ }
}
}
+ prev = c;
+ });
+ if (dump && prev == '#') {
+ inspect_append(result, prev, false);
}
inspect_append(result, '"', false);
-#undef GET_UCHAR
-
-bail:
OBJ_INFECT(result, str);
return result;
}
Oops, something went wrong. Retry.

0 comments on commit 2122743

Please sign in to comment.