Permalink
Browse files

changed the internal representation of strings

Strings could previously have 2 internal representations (UTF-16 or
binary); there is now only the binary one. It makes a few things harder,
but other things much simpler.

The main reason for doing it is that we could have problems in
multi-threaded applications when multiple threads are using the same
string at the same time, even without using any operation modifying the
string (as some operations preferred using the string in UTF-16 and
others in binary mode).

git-svn-id: http://svn.macosforge.org/repository/ruby/MacRuby/trunk@5054 23306eb0-4c56-4727-a40e-e92c0eb68959
  • Loading branch information...
vincentisambart committed Dec 19, 2010
1 parent 27f0b22 commit e22f26d87420f64fe876eab933de9bd826bf5505
Showing with 428 additions and 785 deletions.
  1. +17 −15 encoding.c
  2. +18 −62 encoding.h
  3. +2 −5 encoding_ucnv.h
  4. +375 −560 string.c
  5. +1 −1 transcode.c
  6. +15 −142 ucnv.c
View
@@ -225,6 +225,7 @@ add_encoding(
bool single_byte_encoding, // in the encoding a character takes only
// one byte
bool ascii_compatible, // is the encoding ASCII compatible or not
+ bool little_endian, // for UTF-16/32, if the encoding is little endian
... // aliases for the encoding (should no include the public name)
// - must end with a NULL
)
@@ -234,14 +235,14 @@ add_encoding(
// create an array for the aliases
unsigned int aliases_count = 0;
va_list va_aliases;
- va_start(va_aliases, ascii_compatible);
+ va_start(va_aliases, little_endian);
while (va_arg(va_aliases, const char *) != NULL) {
++aliases_count;
}
va_end(va_aliases);
const char **aliases = (const char **)
malloc(sizeof(const char *) * aliases_count);
- va_start(va_aliases, ascii_compatible);
+ va_start(va_aliases, little_endian);
for (unsigned int i = 0; i < aliases_count; ++i) {
aliases[i] = va_arg(va_aliases, const char *);
}
@@ -260,6 +261,7 @@ add_encoding(
encoding->min_char_size = min_char_size;
encoding->single_byte_encoding = single_byte_encoding;
encoding->ascii_compatible = ascii_compatible;
+ encoding->little_endian = little_endian;
encoding->aliases_count = aliases_count;
encoding->aliases = aliases;
@@ -279,20 +281,20 @@ add_encoding(
void
Init_PreEncoding(void)
{
- add_encoding(ENCODING_BINARY, ENCODING_TYPE_SPECIAL, "ASCII-8BIT", 1, true, true, "BINARY", NULL);
- add_encoding(ENCODING_ASCII, ENCODING_TYPE_UCNV, "US-ASCII", 1, true, true, "ASCII", "ANSI_X3.4-1968", "646", NULL);
- add_encoding(ENCODING_UTF8, ENCODING_TYPE_UCNV, "UTF-8", 1, false, true, "CP65001", "locale", NULL);
- add_encoding(ENCODING_UTF16BE, ENCODING_TYPE_UCNV, "UTF-16BE", 2, false, false, NULL);
- add_encoding(ENCODING_UTF16LE, ENCODING_TYPE_UCNV, "UTF-16LE", 2, false, false, NULL);
- add_encoding(ENCODING_UTF32BE, ENCODING_TYPE_UCNV, "UTF-32BE", 4, false, false, "UCS-4BE", NULL);
- add_encoding(ENCODING_UTF32LE, ENCODING_TYPE_UCNV, "UTF-32LE", 4, false, false, "UCS-4LE", NULL);
- add_encoding(ENCODING_ISO8859_1, ENCODING_TYPE_UCNV, "ISO-8859-1", 1, true, true, "ISO8859-1", NULL);
- add_encoding(ENCODING_MACROMAN, ENCODING_TYPE_UCNV, "macRoman", 1, true, true, NULL);
- add_encoding(ENCODING_MACCYRILLIC, ENCODING_TYPE_UCNV, "macCyrillic", 1, true, true, NULL);
- add_encoding(ENCODING_BIG5, ENCODING_TYPE_UCNV, "Big5", 1, false, true, "CP950", NULL);
+ add_encoding(ENCODING_BINARY, ENCODING_TYPE_SPECIAL, "ASCII-8BIT", 1, true, true, false, "BINARY", NULL);
+ add_encoding(ENCODING_ASCII, ENCODING_TYPE_UCNV, "US-ASCII", 1, true, true, false, "ASCII", "ANSI_X3.4-1968", "646", NULL);
+ add_encoding(ENCODING_UTF8, ENCODING_TYPE_UCNV, "UTF-8", 1, false, true, false, "CP65001", "locale", NULL);
+ add_encoding(ENCODING_UTF16BE, ENCODING_TYPE_UCNV, "UTF-16BE", 2, false, false, false, NULL);
+ add_encoding(ENCODING_UTF16LE, ENCODING_TYPE_UCNV, "UTF-16LE", 2, false, false, true, NULL);
+ add_encoding(ENCODING_UTF32BE, ENCODING_TYPE_UCNV, "UTF-32BE", 4, false, false, false, "UCS-4BE", NULL);
+ add_encoding(ENCODING_UTF32LE, ENCODING_TYPE_UCNV, "UTF-32LE", 4, false, false, true, "UCS-4LE", NULL);
+ add_encoding(ENCODING_ISO8859_1, ENCODING_TYPE_UCNV, "ISO-8859-1", 1, true, true, false, "ISO8859-1", NULL);
+ add_encoding(ENCODING_MACROMAN, ENCODING_TYPE_UCNV, "macRoman", 1, true, true, false, NULL);
+ add_encoding(ENCODING_MACCYRILLIC, ENCODING_TYPE_UCNV, "macCyrillic", 1, true, true, false, NULL);
+ add_encoding(ENCODING_BIG5, ENCODING_TYPE_UCNV, "Big5", 1, false, true, false, "CP950", NULL);
// FIXME: the ICU conversion tables do not seem to match Ruby's Japanese conversion tables
- add_encoding(ENCODING_EUCJP, ENCODING_TYPE_UCNV, "EUC-JP", 1, false, true, "eucJP", NULL);
- add_encoding(ENCODING_SJIS, ENCODING_TYPE_UCNV, "Shift_JIS", 1, false, true, "SJIS", NULL);
+ add_encoding(ENCODING_EUCJP, ENCODING_TYPE_UCNV, "EUC-JP", 1, false, true, false, "eucJP", NULL);
+ add_encoding(ENCODING_SJIS, ENCODING_TYPE_UCNV, "Shift_JIS", 1, false, true, false, "SJIS", NULL);
//add_encoding(ENCODING_EUCJP, ENCODING_TYPE_RUBY, "EUC-JP", 1, false, true, "eucJP", NULL);
//add_encoding(ENCODING_SJIS, ENCODING_TYPE_RUBY, "Shift_JIS", 1, false, true, "SJIS", NULL);
//add_encoding(ENCODING_CP932, ENCODING_TYPE_RUBY, "Windows-31J", 1, false, true, "CP932", "csWindows31J", NULL);
View
@@ -34,19 +34,21 @@ extern "C" {
#define ENCODING_UTF32_NON_NATIVE ENCODING_UTF32LE
#endif
-#define NATIVE_UTF16_ENC(encoding) \
+#define IS_NATIVE_UTF16_ENC(encoding) \
((encoding) == rb_encodings[ENCODING_UTF16_NATIVE])
-#define NON_NATIVE_UTF16_ENC(encoding) \
+#define IS_NON_NATIVE_UTF16_ENC(encoding) \
((encoding) == rb_encodings[ENCODING_UTF16_NON_NATIVE])
-#define UTF16_ENC(encoding) \
- (NATIVE_UTF16_ENC(encoding) || NON_NATIVE_UTF16_ENC(encoding))
-#define NATIVE_UTF32_ENC(encoding) \
+#define IS_UTF16_ENC(encoding) \
+ (IS_NATIVE_UTF16_ENC(encoding) || IS_NON_NATIVE_UTF16_ENC(encoding))
+#define IS_NATIVE_UTF32_ENC(encoding) \
((encoding) == rb_encodings[ENCODING_UTF32_NATIVE])
-#define NON_NATIVE_UTF32_ENC(encoding) \
+#define IS_NON_NATIVE_UTF32_ENC(encoding) \
((encoding) == rb_encodings[ENCODING_UTF32_NON_NATIVE])
-#define UTF32_ENC(encoding) \
- (NATIVE_UTF32_ENC(encoding) || NON_NATIVE_UTF32_ENC(encoding))
-#define BINARY_ENC(encoding) ((encoding) == rb_encodings[ENCODING_BINARY])
+#define IS_UTF32_ENC(encoding) \
+ (IS_NATIVE_UTF32_ENC(encoding) || IS_NON_NATIVE_UTF32_ENC(encoding))
+#define IS_UTF8_ENC(encoding) ((encoding) == rb_encodings[ENCODING_UTF8])
+#define IS_ASCII_ENC(encoding) ((encoding) == rb_encodings[ENCODING_ASCII])
+#define IS_BINARY_ENC(encoding) ((encoding) == rb_encodings[ENCODING_BINARY])
typedef uint8_t str_flag_t;
@@ -55,10 +57,7 @@ typedef struct {
struct rb_encoding *encoding;
long capacity_in_bytes;
long length_in_bytes;
- union {
- char *bytes;
- UChar *uchars;
- } data;
+ char *bytes;
str_flag_t flags;
} rb_str_t;
@@ -119,6 +118,7 @@ typedef struct rb_encoding {
unsigned char min_char_size;
bool single_byte_encoding : 1;
bool ascii_compatible : 1;
+ bool little_endian : 1; // only meaningful for UTF-16 or UTF-32
void *private_data;
} rb_encoding_t;
@@ -145,15 +145,10 @@ enum {
extern rb_encoding_t *rb_encodings[ENCODINGS_COUNT];
-#define STRING_HAS_SUPPLEMENTARY 0x020
-#define STRING_HAS_SUPPLEMENTARY_SET 0x010
#define STRING_ASCII_ONLY_SET 0x010
#define STRING_ASCII_ONLY 0x008
#define STRING_VALID_ENCODING_SET 0x004
#define STRING_VALID_ENCODING 0x002
-#define STRING_STORED_IN_UCHARS 0x001
-
-#define STRING_REQUIRED_FLAGS STRING_STORED_IN_UCHARS
#define BYTES_TO_UCHARS(len) ((len) / sizeof(UChar))
#define UCHARS_TO_BYTES(len) ((len) * sizeof(UChar))
@@ -169,10 +164,9 @@ div_round_up(long a, long b)
void str_update_flags(rb_str_t *self);
static inline void
-str_unset_facultative_flags(rb_str_t *self)
+str_reset_flags(rb_str_t *self)
{
- self->flags &= ~STRING_HAS_SUPPLEMENTARY_SET & ~STRING_ASCII_ONLY_SET
- & ~STRING_VALID_ENCODING_SET;
+ self->flags = 0;
}
static inline bool
@@ -182,13 +176,6 @@ str_known_to_have_an_invalid_encoding(rb_str_t *self)
| STRING_VALID_ENCODING)) == STRING_VALID_ENCODING_SET;
}
-static inline bool
-str_known_not_to_have_any_supplementary(rb_str_t *self)
-{
- return (self->flags & (STRING_HAS_SUPPLEMENTARY_SET
- | STRING_HAS_SUPPLEMENTARY)) == STRING_HAS_SUPPLEMENTARY_SET;
-}
-
static inline bool
str_check_flag_and_update_if_needed(rb_str_t *self, str_flag_t flag_set,
str_flag_t flag)
@@ -226,31 +213,8 @@ str_is_ruby_ascii_only(rb_str_t *self)
return str_is_ascii_only(self);
}
-static inline bool
-str_is_stored_in_uchars(rb_str_t *self)
-{
- return self->flags & STRING_STORED_IN_UCHARS;
-}
-
-static inline void
-str_negate_stored_in_uchars(rb_str_t *self)
-{
- self->flags ^= STRING_STORED_IN_UCHARS;
-}
-
static inline void
-str_set_stored_in_uchars(rb_str_t *self, bool status)
-{
- if (status) {
- self->flags |= STRING_STORED_IN_UCHARS;
- }
- else {
- self->flags &= ~STRING_STORED_IN_UCHARS;
- }
-}
-
-static inline void
-str_set_facultative_flag(rb_str_t *self, bool status, str_flag_t flag_set,
+str_set_flag(rb_str_t *self, bool status, str_flag_t flag_set,
str_flag_t flag)
{
if (status) {
@@ -261,24 +225,16 @@ str_set_facultative_flag(rb_str_t *self, bool status, str_flag_t flag_set,
}
}
-static inline void
-str_set_has_supplementary(rb_str_t *self, bool status)
-{
- str_set_facultative_flag(self, status, STRING_HAS_SUPPLEMENTARY_SET,
- STRING_HAS_SUPPLEMENTARY);
-}
-
static inline void
str_set_ascii_only(rb_str_t *self, bool status)
{
- str_set_facultative_flag(self, status, STRING_ASCII_ONLY_SET,
- STRING_ASCII_ONLY);
+ str_set_flag(self, status, STRING_ASCII_ONLY_SET, STRING_ASCII_ONLY);
}
static inline void
str_set_valid_encoding(rb_str_t *self, bool status)
{
- str_set_facultative_flag(self, status, STRING_VALID_ENCODING_SET,
+ str_set_flag(self, status, STRING_VALID_ENCODING_SET,
STRING_VALID_ENCODING);
}
View
@@ -18,18 +18,15 @@
extern "C" {
#endif
-typedef void (^each_char_callback_t)(UChar32 c, const char* character_start, long character_length, bool *stop);
+typedef void (^each_uchar32_callback_t)(UChar32 c, long start_index, long length, bool *stop);
void str_ucnv_update_flags(rb_str_t *self);
-void str_ucnv_make_data_binary(rb_str_t *self);
-bool str_ucnv_try_making_data_uchars(rb_str_t *self);
long str_ucnv_length(rb_str_t *self, bool ucs2_mode);
-long str_ucnv_bytesize(rb_str_t *self);
character_boundaries_t str_ucnv_get_character_boundaries(rb_str_t *self, long index, bool ucs2_mode);
long str_ucnv_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes, bool ucs2_mode);
void str_ucnv_transcode_to_utf16(struct rb_encoding *src_enc, rb_str_t *self, long *pos, UChar **utf16, long *utf16_length);
void str_ucnv_transcode_from_utf16(struct rb_encoding *dst_enc, UChar *utf16, long utf16_length, long *utf16_pos, char **bytes, long *bytes_length);
-void str_ucnv_each_char(rb_str_t *self, each_char_callback_t callback);
+void str_ucnv_each_uchar32(rb_str_t *self, each_uchar32_callback_t callback);
#if defined(__cplusplus)
} // extern "C"
Oops, something went wrong.

0 comments on commit e22f26d

Please sign in to comment.