Introduce a better unichar API, which should be as fast as before the recent string datastore change; this also fixes the rubygems crasher and various memory leaks.

git-svn-id: http://svn.macosforge.org/repository/ruby/MacRuby/trunk@5114 23306eb0-4c56-4727-a40e-e92c0eb68959
MacRuby · Jan 5, 2011 · d1673a2 · d1673a2
1 parent 0b4fbcc
commit d1673a2
Show file tree

Hide file tree

Showing 7 changed files with 152 additions and 257 deletions.
diff --git a/encoding.h b/encoding.h
@@ -295,9 +295,46 @@ VALUE rstr_concat(VALUE self, SEL sel, VALUE other);
 // The following functions should always be preferred over anything else,
 // especially if this "else" is RSTRING_PTR and RSTRING_LEN.
 // They also work on CFStrings.
+
 VALUE rb_unicode_str_new(const UniChar *ptr, const size_t len);
-void rb_str_get_uchars(VALUE str, UChar **chars_p, long *chars_len_p,
-	bool *need_free_p);
+
+#define STR_UCHARS_STATIC_BUFSIZE 35
+
+typedef struct {
+    UChar static_buf[STR_UCHARS_STATIC_BUFSIZE];
+    UChar *chars;
+    long len;
+} rb_str_uchars_buf_t;
+
+void rb_str_get_uchars_always(VALUE str, rb_str_uchars_buf_t *buf);
+
+static inline void
+rb_str_get_uchars(VALUE str, rb_str_uchars_buf_t *buf)
+{
+    if (IS_RSTR(str)) {
+	rb_str_t *rstr = RSTR(str);
+	if (rstr->encoding->ascii_compatible && str_is_ascii_only(rstr)
+		&& rstr->length_in_bytes < STR_UCHARS_STATIC_BUFSIZE) {
+	    // Fast path.
+	    for (long i = 0; i < rstr->length_in_bytes; i++) {
+		buf->static_buf[i] = rstr->bytes[i];
+	    }
+	    buf->chars = buf->static_buf;
+	    buf->len = rstr->length_in_bytes;
+	    return;
+	}
+    }
+    rb_str_get_uchars_always(str, buf);
+}
+
+UChar *rb_str_xcopy_uchars(VALUE str, long *len_p);
+
+#define RB_STR_GET_UCHARS(str, _chars, _len) \
+    rb_str_uchars_buf_t __buf; \
+    rb_str_get_uchars(str, &__buf); \
+    UChar *_chars = __buf.chars; \
+    long _len = __buf.len
+
 long rb_str_chars_len(VALUE str);
 UChar rb_str_get_uchar(VALUE str, long pos);
 void rb_str_append_uchar(VALUE str, UChar c);

diff --git a/parse.y b/parse.y
@@ -5094,15 +5094,13 @@ rb_parser_compile_string(VALUE vparser, const char *f, VALUE s, int line)
     struct parser_params *parser;
     Data_Get_Struct(vparser, struct parser_params, parser);
 
-    UChar *chars = NULL;
     long chars_len = 0;
-    bool need_free = false;
-    rb_str_get_uchars(s, &chars, &chars_len, &need_free);
+    UChar *chars = rb_str_xcopy_uchars(s, &chars_len); 
 
     struct lex_get_str_context *ctx = (struct lex_get_str_context *)
 	xmalloc(sizeof(struct lex_get_str_context));
     GC_WB(&ctx->str, s);
-    ctx->chars = chars;
+    GC_WB(&ctx->chars, chars);
     ctx->chars_len = chars_len;
 
     lex_gets = lex_get_str;
@@ -5111,14 +5109,7 @@ rb_parser_compile_string(VALUE vparser, const char *f, VALUE s, int line)
     lex_pbeg = lex_p = lex_pend = 0;
     compile_for_eval = rb_parse_in_eval();
 
-    NODE *node = yycompile(parser, f, line);
-
-    if (need_free && chars != NULL) {
-	orig_free(chars);
-	chars = NULL;
-    }
-
-    return node;
+    return yycompile(parser, f, line);
 }
 
 NODE*
@@ -9901,27 +9892,13 @@ ripper_initialize(VALUE self, SEL sel, int argc, VALUE *argv)
     Data_Get_Struct(self, struct parser_params, parser);
     rb_scan_args(argc, argv, "12", &src, &fname, &lineno);
 
-    UChar *chars = NULL;
     long chars_len = 0;
-    bool need_free = false;
-    rb_str_get_uchars(src, &chars, &chars_len, &need_free);
-
-    if (need_free) {
-	UChar *tmp = (UChar *)xmalloc(sizeof(UChar) * chars_len);
-	memcpy(tmp, chars, sizeof(UChar) * chars_len);
-	orig_free(chars);
-	chars = tmp;
-    }
+    UChar *chars = rb_str_xcopy_uchars(src, &chars_len);
 
     struct lex_get_str_context *ctx = (struct lex_get_str_context *)
 	xmalloc(sizeof(struct lex_get_str_context));
     GC_WB(&ctx->str, src);
-    if (need_free) {
-	GC_WB(&ctx->chars, chars);
-    }
-    else {
-	ctx->chars = chars;
-    }
+    GC_WB(&ctx->chars, chars);
     ctx->chars_len = chars_len;
 
     parser->parser_lex_gets = lex_get_str;

diff --git a/re.c b/re.c
@@ -91,20 +91,16 @@ regexp_finalize_imp(void *rcv, SEL sel)
 
 // Work around ICU limitations.
 static void
-sanitize_regexp_string(UChar **chars_p, long *chars_len_p, bool *need_free_p)
+sanitize_regexp_string(UChar **chars_p, long *chars_len_p)
 {
     UChar *chars = *chars_p;
     long chars_len = *chars_len_p;
-    bool need_free = *need_free_p;
 
 #define copy_if_needed() \
     do { \
-	if (!need_free) { \
-	    UChar *tmp = (UChar *)malloc(sizeof(UChar) * chars_len); \
-	    memcpy(tmp, chars, sizeof(UChar) * chars_len); \
-	    chars = tmp; \
-	    need_free = true; \
-	} \
+	UChar *tmp = (UChar *)xmalloc(sizeof(UChar) * chars_len); \
+	memcpy(tmp, chars, sizeof(UChar) * chars_len); \
+	chars = tmp; \
     } \
     while (0)
 
@@ -183,18 +179,14 @@ printf("\n");
 
     *chars_p = chars;
     *chars_len_p = chars_len;
-    *need_free_p = need_free;
 }
 
 static bool
 init_from_string(rb_regexp_t *regexp, VALUE str, int option, VALUE *excp)
 {
     option |= REGEXP_OPT_DEFAULT;
 
-    UChar *chars = NULL;
-    long chars_len = 0;
-    bool need_free = false;
-    rb_str_get_uchars(str, &chars, &chars_len, &need_free);
+    RB_STR_GET_UCHARS(str, chars, chars_len);
 
     UChar null_char = '\0';
     if (chars_len == 0) {
@@ -203,21 +195,16 @@ init_from_string(rb_regexp_t *regexp, VALUE str, int option, VALUE *excp)
 	// of -1 which indicates it's terminated by \0.
 	chars = &null_char;
 	chars_len = -1;
-	need_free = false;
     }
     else {
-	sanitize_regexp_string(&chars, &chars_len, &need_free);
+	sanitize_regexp_string(&chars, &chars_len);
     }
 
     UParseError pe;
     UErrorCode status = U_ZERO_ERROR;
     URegularExpression *pattern = uregex_open(chars, chars_len, option,
 	    &pe, &status);
 
-    if (need_free) {
-	free(chars);
-    }
-
     if (pattern == NULL) {
 	if (excp != NULL) {
 	    char error[1024];
@@ -669,7 +656,7 @@ regexp_equal(VALUE rcv, SEL sel, VALUE other)
 typedef struct rb_regexp_matcher {
     struct RBasic basic;
     URegularExpression *pattern;
-    UChar *text_to_free;
+    UChar *text_chars;
     rb_encoding_t *encoding;
     VALUE frozen_str;
 } rb_regexp_matcher_t;
@@ -681,10 +668,6 @@ reg_matcher_cleanup(rb_regexp_matcher_t *m)
 	uregex_close(m->pattern);
 	m->pattern = NULL;
     }
-    if (m->text_to_free != NULL) {
-	free(m->text_to_free);
-	m->text_to_free = NULL;
-    }
 }
 
 static IMP regexp_matcher_finalize_imp_super = NULL; 
@@ -697,6 +680,7 @@ regexp_matcher_finalize_imp(void *rcv, SEL sel)
 	((void(*)(void *, SEL))regexp_matcher_finalize_imp_super)(rcv, sel);
     }
 }
+
 VALUE
 rb_reg_matcher_new(VALUE re, VALUE str)
 {
@@ -712,28 +696,22 @@ rb_reg_matcher_new(VALUE re, VALUE str)
 		u_errorName(status));
     }
 
-    UChar *chars = NULL;
     long chars_len = 0;
-    bool need_free = false;
-    rb_str_get_uchars(str, &chars, &chars_len, &need_free);
+    UChar *chars = rb_str_xcopy_uchars(str, &chars_len);
 
-    UChar null_char = '\0';
     if (chars_len == 0) {
 	// uregex_setText() will complain if we pass a NULL pattern or a
 	// pattern length of 0, so we do pass an empty pattern with a length
 	// of -1 which indicates it's terminated by \0.
-	chars = &null_char;
+	chars = (UChar *)xmalloc(sizeof(UChar));
+	*chars = '\0';
 	chars_len = -1;
-	need_free = false;
     }
 
     uregex_setText(match_pattern, chars, chars_len, &status);
 
     if (status != U_ZERO_ERROR) {
 	uregex_close(match_pattern);
-	if (need_free) {
-	    free(chars);
-	}
 	rb_raise(rb_eRegexpError, "can't set pattern text: %s",
 		u_errorName(status));	
     }
@@ -744,7 +722,7 @@ rb_reg_matcher_new(VALUE re, VALUE str)
 
     // Apparently uregex_setText doesn't copy the given string, so we need
     // to keep it around until we finally destroy the matcher object.
-    matcher->text_to_free = need_free ? chars : NULL;
+    GC_WB(&matcher->text_chars, chars);
 
     return (VALUE)matcher;
 }
@@ -756,7 +734,7 @@ rb_reg_matcher_destroy(VALUE matcher)
     xfree((void *)matcher);
 }
 
-static int
+int
 rb_reg_matcher_search_find(VALUE re, VALUE matcher, int pos, bool reverse,
 	bool findFirst)
 {
@@ -857,18 +835,6 @@ rb_reg_matcher_search_find(VALUE re, VALUE matcher, int pos, bool reverse,
     return res[0].beg;
 }
 
-int
-rb_reg_matcher_search_first(VALUE re, VALUE matcher, int pos, bool reverse)
-{
-    return rb_reg_matcher_search_find(re, matcher, pos, reverse, true);
-}
-
-int
-rb_reg_matcher_search_next(VALUE re, VALUE matcher, int pos, bool reverse)
-{
-    return rb_reg_matcher_search_find(re, matcher, pos, reverse, false);
-}
-
 static long
 reg_match_pos(VALUE re, VALUE *strp, long pos)
 {
@@ -975,7 +941,6 @@ regexp_match3(VALUE rcv, SEL sel)
 	rb_backref_set(Qnil);
 	return Qnil;
     }
-
     const long start = rb_reg_search(rcv, line, 0, 0);
     if (start < 0) {
 	return Qnil;
@@ -2124,12 +2089,9 @@ rb_reg_new(const char *cstr, long len, int options)
 VALUE
 rb_reg_quote(VALUE pat)
 {
-    UChar *chars = NULL;
-    long chars_len = 0;
-    bool need_free = false;
     VALUE result;
 
-    rb_str_get_uchars(pat, &chars, &chars_len, &need_free);
+    RB_STR_GET_UCHARS(pat, chars, chars_len);
 
     long pos = 0;
     for (; pos < chars_len; pos++) {
@@ -2197,9 +2159,6 @@ rb_reg_quote(VALUE pat)
     }
 
 bail:
-    if (need_free) {
-	free(chars);
-    }
     return result;
 }
 

diff --git a/re.h b/re.h
@@ -24,10 +24,27 @@ VALUE rb_reg_regcomp(VALUE str);
 VALUE rb_regexp_source(VALUE re);
 
 VALUE rb_reg_matcher_new(VALUE re, VALUE str);
+int rb_reg_matcher_search_find(VALUE re, VALUE matcher, int pos, bool reverse,
+	bool findFirst);
 void rb_reg_matcher_destroy(VALUE matcher);
-int rb_reg_matcher_search_first(VALUE re, VALUE matcher, int pos, bool reverse);
-int rb_reg_matcher_search_next(VALUE re, VALUE matcher, int pos, bool reverse);
-#define rb_reg_matcher_search rb_reg_matcher_search_next
+
+static inline int
+rb_reg_matcher_search_first(VALUE re, VALUE matcher, int pos, bool reverse)
+{
+    return rb_reg_matcher_search_find(re, matcher, pos, reverse, true);
+}
+
+static inline int
+rb_reg_matcher_search_next(VALUE re, VALUE matcher, int pos, bool reverse)
+{
+    return rb_reg_matcher_search_find(re, matcher, pos, reverse, false);
+}
+
+static inline int
+rb_reg_matcher_search(VALUE re, VALUE matcher, int pos, bool reverse)
+{
+    return rb_reg_matcher_search_next(re, matcher, pos, reverse);
+}
 
 static inline int
 rb_reg_search(VALUE re, VALUE str, int pos, bool reverse)

diff --git a/sprintf.c b/sprintf.c
@@ -372,14 +372,9 @@ cstr_update(UChar **str, long *str_len, long start, long num, VALUE replace)
 		sizeof(UChar) * (len - start - num));
     }
     if (replace_len > 0) {
-	UChar *replace_chars = NULL;
-	bool need_free = false;
-	rb_str_get_uchars(replace, &replace_chars, &replace_len, &need_free);
-	assert(replace_len > 0);
+	RB_STR_GET_UCHARS(replace, replace_chars, replace_len2);
+	assert(replace_len2 == replace_len);
 	bcopy(replace_chars, *str + start, sizeof(UChar) * replace_len);
-	if (need_free) {
-	    free(replace_chars);
-	}
     }
     return replace_len - num;
 }
@@ -413,19 +408,11 @@ rb_str_format(int argc, const VALUE *argv, VALUE fmt)
 {
     bool tainted = OBJ_TAINTED(fmt);
 
-    UChar *format_str = NULL;
     long format_len = 0;
-    bool need_free = false;
-    rb_str_get_uchars(fmt, &format_str, &format_len, &need_free);
+    UChar *format_str = rb_str_xcopy_uchars(fmt, &format_len);
     if (format_len == 0) {
 	goto bail;
     }
-    UChar *tmp = (UChar *)xmalloc(format_len * sizeof(UChar));
-    memcpy(tmp, format_str, format_len * sizeof(UChar));
-    if (need_free) {
-	free(format_str);
-    }
-    format_str = tmp;
 
     long num, pos;
     int j = 0;