Skip to content

Commit

Permalink
introduce a better unichar API, which should be as fast as before the…
Browse files Browse the repository at this point in the history
… recent string datastore change, also fixing the rubygems crasher and also various memory leaks

git-svn-id: http://svn.macosforge.org/repository/ruby/MacRuby/trunk@5114 23306eb0-4c56-4727-a40e-e92c0eb68959
  • Loading branch information
lrz committed Jan 5, 2011
1 parent 0b4fbcc commit d1673a2
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 257 deletions.
41 changes: 39 additions & 2 deletions encoding.h
Expand Up @@ -295,9 +295,46 @@ VALUE rstr_concat(VALUE self, SEL sel, VALUE other);
// The following functions should always be preferred over anything else,
// especially if this "else" is RSTRING_PTR and RSTRING_LEN.
// They also work on CFStrings.

VALUE rb_unicode_str_new(const UniChar *ptr, const size_t len);
void rb_str_get_uchars(VALUE str, UChar **chars_p, long *chars_len_p,
bool *need_free_p);

#define STR_UCHARS_STATIC_BUFSIZE 35

// Output buffer filled by rb_str_get_uchars() and rb_str_get_uchars_always().
typedef struct {
// Inline storage of STR_UCHARS_STATIC_BUFSIZE UChars; rb_str_get_uchars()
// widens short ASCII-only strings into it on its fast path.
UChar static_buf[STR_UCHARS_STATIC_BUFSIZE];
// Start of the characters. On the rb_str_get_uchars() fast path this points
// at static_buf, so it must not outlive this structure.
// NOTE(review): presumably rb_str_get_uchars_always() may point it at other
// storage -- confirm against its implementation.
UChar *chars;
// Number of UChars available at `chars`.
long len;
} rb_str_uchars_buf_t;

void rb_str_get_uchars_always(VALUE str, rb_str_uchars_buf_t *buf);

// Fill `buf` with the UChar characters of `str`.
//
// When `str` is an RSTR whose encoding is ASCII-compatible, whose content is
// ASCII-only and whose byte length is below STR_UCHARS_STATIC_BUFSIZE, the
// bytes are widened one by one into buf->static_buf. Every other case is
// delegated to rb_str_get_uchars_always().
static inline void
rb_str_get_uchars(VALUE str, rb_str_uchars_buf_t *buf)
{
    if (!IS_RSTR(str)) {
        rb_str_get_uchars_always(str, buf);
        return;
    }

    rb_str_t *rstr = RSTR(str);
    if (!rstr->encoding->ascii_compatible || !str_is_ascii_only(rstr)
            || rstr->length_in_bytes >= STR_UCHARS_STATIC_BUFSIZE) {
        // Slow path: not eligible for the inline buffer.
        rb_str_get_uchars_always(str, buf);
        return;
    }

    // Fast path: each ASCII byte maps directly to one UChar.
    const long count = rstr->length_in_bytes;
    UChar *dst = buf->static_buf;
    for (long pos = 0; pos < count; pos++) {
        dst[pos] = rstr->bytes[pos];
    }
    buf->chars = dst;
    buf->len = count;
}

UChar *rb_str_xcopy_uchars(VALUE str, long *len_p);

// Declare `UChar *_chars` and `long _len` in the current scope and fill them
// with the characters of `str` through rb_str_get_uchars().
//
// The backing rb_str_uchars_buf_t is declared in the current scope as well;
// `_chars` may point into it, so it must not be used once that scope exits.
// The buffer's name is derived from `_chars`, which allows the macro to be
// used several times in one scope (with distinct `_chars` names) and avoids
// declaring an identifier containing a double underscore, which is reserved
// for the implementation.
//
// This expands to several declarations: use it as a statement of its own,
// never as the unbraced body of an if/for/while.
#define RB_STR_GET_UCHARS(str, _chars, _len) \
    rb_str_uchars_buf_t _chars##_uchars_buf; \
    rb_str_get_uchars((str), &_chars##_uchars_buf); \
    UChar *_chars = _chars##_uchars_buf.chars; \
    long _len = _chars##_uchars_buf.len

long rb_str_chars_len(VALUE str);
UChar rb_str_get_uchar(VALUE str, long pos);
void rb_str_append_uchar(VALUE str, UChar c);
Expand Down
33 changes: 5 additions & 28 deletions parse.y
Expand Up @@ -5094,15 +5094,13 @@ rb_parser_compile_string(VALUE vparser, const char *f, VALUE s, int line)
struct parser_params *parser;
Data_Get_Struct(vparser, struct parser_params, parser);

UChar *chars = NULL;
long chars_len = 0;
bool need_free = false;
rb_str_get_uchars(s, &chars, &chars_len, &need_free);
UChar *chars = rb_str_xcopy_uchars(s, &chars_len);

struct lex_get_str_context *ctx = (struct lex_get_str_context *)
xmalloc(sizeof(struct lex_get_str_context));
GC_WB(&ctx->str, s);
ctx->chars = chars;
GC_WB(&ctx->chars, chars);
ctx->chars_len = chars_len;

lex_gets = lex_get_str;
Expand All @@ -5111,14 +5109,7 @@ rb_parser_compile_string(VALUE vparser, const char *f, VALUE s, int line)
lex_pbeg = lex_p = lex_pend = 0;
compile_for_eval = rb_parse_in_eval();

NODE *node = yycompile(parser, f, line);

if (need_free && chars != NULL) {
orig_free(chars);
chars = NULL;
}

return node;
return yycompile(parser, f, line);
}

NODE*
Expand Down Expand Up @@ -9901,27 +9892,13 @@ ripper_initialize(VALUE self, SEL sel, int argc, VALUE *argv)
Data_Get_Struct(self, struct parser_params, parser);
rb_scan_args(argc, argv, "12", &src, &fname, &lineno);

UChar *chars = NULL;
long chars_len = 0;
bool need_free = false;
rb_str_get_uchars(src, &chars, &chars_len, &need_free);

if (need_free) {
UChar *tmp = (UChar *)xmalloc(sizeof(UChar) * chars_len);
memcpy(tmp, chars, sizeof(UChar) * chars_len);
orig_free(chars);
chars = tmp;
}
UChar *chars = rb_str_xcopy_uchars(src, &chars_len);

struct lex_get_str_context *ctx = (struct lex_get_str_context *)
xmalloc(sizeof(struct lex_get_str_context));
GC_WB(&ctx->str, src);
if (need_free) {
GC_WB(&ctx->chars, chars);
}
else {
ctx->chars = chars;
}
GC_WB(&ctx->chars, chars);
ctx->chars_len = chars_len;

parser->parser_lex_gets = lex_get_str;
Expand Down
69 changes: 14 additions & 55 deletions re.c
Expand Up @@ -91,20 +91,16 @@ regexp_finalize_imp(void *rcv, SEL sel)

// Work around ICU limitations.
static void
sanitize_regexp_string(UChar **chars_p, long *chars_len_p, bool *need_free_p)
sanitize_regexp_string(UChar **chars_p, long *chars_len_p)
{
UChar *chars = *chars_p;
long chars_len = *chars_len_p;
bool need_free = *need_free_p;

#define copy_if_needed() \
do { \
if (!need_free) { \
UChar *tmp = (UChar *)malloc(sizeof(UChar) * chars_len); \
memcpy(tmp, chars, sizeof(UChar) * chars_len); \
chars = tmp; \
need_free = true; \
} \
UChar *tmp = (UChar *)xmalloc(sizeof(UChar) * chars_len); \
memcpy(tmp, chars, sizeof(UChar) * chars_len); \
chars = tmp; \
} \
while (0)

Expand Down Expand Up @@ -183,18 +179,14 @@ printf("\n");

*chars_p = chars;
*chars_len_p = chars_len;
*need_free_p = need_free;
}

static bool
init_from_string(rb_regexp_t *regexp, VALUE str, int option, VALUE *excp)
{
option |= REGEXP_OPT_DEFAULT;

UChar *chars = NULL;
long chars_len = 0;
bool need_free = false;
rb_str_get_uchars(str, &chars, &chars_len, &need_free);
RB_STR_GET_UCHARS(str, chars, chars_len);

UChar null_char = '\0';
if (chars_len == 0) {
Expand All @@ -203,21 +195,16 @@ init_from_string(rb_regexp_t *regexp, VALUE str, int option, VALUE *excp)
// of -1 which indicates it's terminated by \0.
chars = &null_char;
chars_len = -1;
need_free = false;
}
else {
sanitize_regexp_string(&chars, &chars_len, &need_free);
sanitize_regexp_string(&chars, &chars_len);
}

UParseError pe;
UErrorCode status = U_ZERO_ERROR;
URegularExpression *pattern = uregex_open(chars, chars_len, option,
&pe, &status);

if (need_free) {
free(chars);
}

if (pattern == NULL) {
if (excp != NULL) {
char error[1024];
Expand Down Expand Up @@ -669,7 +656,7 @@ regexp_equal(VALUE rcv, SEL sel, VALUE other)
typedef struct rb_regexp_matcher {
struct RBasic basic;
URegularExpression *pattern;
UChar *text_to_free;
UChar *text_chars;
rb_encoding_t *encoding;
VALUE frozen_str;
} rb_regexp_matcher_t;
Expand All @@ -681,10 +668,6 @@ reg_matcher_cleanup(rb_regexp_matcher_t *m)
uregex_close(m->pattern);
m->pattern = NULL;
}
if (m->text_to_free != NULL) {
free(m->text_to_free);
m->text_to_free = NULL;
}
}

static IMP regexp_matcher_finalize_imp_super = NULL;
Expand All @@ -697,6 +680,7 @@ regexp_matcher_finalize_imp(void *rcv, SEL sel)
((void(*)(void *, SEL))regexp_matcher_finalize_imp_super)(rcv, sel);
}
}

VALUE
rb_reg_matcher_new(VALUE re, VALUE str)
{
Expand All @@ -712,28 +696,22 @@ rb_reg_matcher_new(VALUE re, VALUE str)
u_errorName(status));
}

UChar *chars = NULL;
long chars_len = 0;
bool need_free = false;
rb_str_get_uchars(str, &chars, &chars_len, &need_free);
UChar *chars = rb_str_xcopy_uchars(str, &chars_len);

UChar null_char = '\0';
if (chars_len == 0) {
// uregex_setText() will complain if we pass a NULL pattern or a
// pattern length of 0, so we do pass an empty pattern with a length
// of -1 which indicates it's terminated by \0.
chars = &null_char;
chars = (UChar *)xmalloc(sizeof(UChar));
*chars = '\0';
chars_len = -1;
need_free = false;
}

uregex_setText(match_pattern, chars, chars_len, &status);

if (status != U_ZERO_ERROR) {
uregex_close(match_pattern);
if (need_free) {
free(chars);
}
rb_raise(rb_eRegexpError, "can't set pattern text: %s",
u_errorName(status));
}
Expand All @@ -744,7 +722,7 @@ rb_reg_matcher_new(VALUE re, VALUE str)

// Apparently uregex_setText doesn't copy the given string, so we need
// to keep it around until we finally destroy the matcher object.
matcher->text_to_free = need_free ? chars : NULL;
GC_WB(&matcher->text_chars, chars);

return (VALUE)matcher;
}
Expand All @@ -756,7 +734,7 @@ rb_reg_matcher_destroy(VALUE matcher)
xfree((void *)matcher);
}

static int
int
rb_reg_matcher_search_find(VALUE re, VALUE matcher, int pos, bool reverse,
bool findFirst)
{
Expand Down Expand Up @@ -857,18 +835,6 @@ rb_reg_matcher_search_find(VALUE re, VALUE matcher, int pos, bool reverse,
return res[0].beg;
}

// Search with `matcher` from `pos`, forwarding to
// rb_reg_matcher_search_find() with findFirst set to true.
int
rb_reg_matcher_search_first(VALUE re, VALUE matcher, int pos, bool reverse)
{
    const bool find_first = true;
    return rb_reg_matcher_search_find(re, matcher, pos, reverse, find_first);
}

// Search with `matcher` from `pos`, forwarding to
// rb_reg_matcher_search_find() with findFirst set to false.
int
rb_reg_matcher_search_next(VALUE re, VALUE matcher, int pos, bool reverse)
{
    const bool find_first = false;
    return rb_reg_matcher_search_find(re, matcher, pos, reverse, find_first);
}

static long
reg_match_pos(VALUE re, VALUE *strp, long pos)
{
Expand Down Expand Up @@ -975,7 +941,6 @@ regexp_match3(VALUE rcv, SEL sel)
rb_backref_set(Qnil);
return Qnil;
}

const long start = rb_reg_search(rcv, line, 0, 0);
if (start < 0) {
return Qnil;
Expand Down Expand Up @@ -2124,12 +2089,9 @@ rb_reg_new(const char *cstr, long len, int options)
VALUE
rb_reg_quote(VALUE pat)
{
UChar *chars = NULL;
long chars_len = 0;
bool need_free = false;
VALUE result;

rb_str_get_uchars(pat, &chars, &chars_len, &need_free);
RB_STR_GET_UCHARS(pat, chars, chars_len);

long pos = 0;
for (; pos < chars_len; pos++) {
Expand Down Expand Up @@ -2197,9 +2159,6 @@ rb_reg_quote(VALUE pat)
}

bail:
if (need_free) {
free(chars);
}
return result;
}

Expand Down
23 changes: 20 additions & 3 deletions re.h
Expand Up @@ -24,10 +24,27 @@ VALUE rb_reg_regcomp(VALUE str);
VALUE rb_regexp_source(VALUE re);

VALUE rb_reg_matcher_new(VALUE re, VALUE str);
int rb_reg_matcher_search_find(VALUE re, VALUE matcher, int pos, bool reverse,
bool findFirst);
void rb_reg_matcher_destroy(VALUE matcher);
int rb_reg_matcher_search_first(VALUE re, VALUE matcher, int pos, bool reverse);
int rb_reg_matcher_search_next(VALUE re, VALUE matcher, int pos, bool reverse);
#define rb_reg_matcher_search rb_reg_matcher_search_next

// Search with `matcher` from `pos`, forwarding to
// rb_reg_matcher_search_find() with findFirst set to true.
static inline int
rb_reg_matcher_search_first(VALUE re, VALUE matcher, int pos, bool reverse)
{
    const bool find_first = true;
    return rb_reg_matcher_search_find(re, matcher, pos, reverse, find_first);
}

// Search with `matcher` from `pos`, forwarding to
// rb_reg_matcher_search_find() with findFirst set to false.
static inline int
rb_reg_matcher_search_next(VALUE re, VALUE matcher, int pos, bool reverse)
{
    const bool find_first = false;
    return rb_reg_matcher_search_find(re, matcher, pos, reverse, find_first);
}

// Default search entry point: same as rb_reg_matcher_search_next().
static inline int
rb_reg_matcher_search(VALUE re, VALUE matcher, int pos, bool reverse)
{
    const int found = rb_reg_matcher_search_next(re, matcher, pos, reverse);
    return found;
}

static inline int
rb_reg_search(VALUE re, VALUE str, int pos, bool reverse)
Expand Down
19 changes: 3 additions & 16 deletions sprintf.c
Expand Up @@ -372,14 +372,9 @@ cstr_update(UChar **str, long *str_len, long start, long num, VALUE replace)
sizeof(UChar) * (len - start - num));
}
if (replace_len > 0) {
UChar *replace_chars = NULL;
bool need_free = false;
rb_str_get_uchars(replace, &replace_chars, &replace_len, &need_free);
assert(replace_len > 0);
RB_STR_GET_UCHARS(replace, replace_chars, replace_len2);
assert(replace_len2 == replace_len);
bcopy(replace_chars, *str + start, sizeof(UChar) * replace_len);
if (need_free) {
free(replace_chars);
}
}
return replace_len - num;
}
Expand Down Expand Up @@ -413,19 +408,11 @@ rb_str_format(int argc, const VALUE *argv, VALUE fmt)
{
bool tainted = OBJ_TAINTED(fmt);

UChar *format_str = NULL;
long format_len = 0;
bool need_free = false;
rb_str_get_uchars(fmt, &format_str, &format_len, &need_free);
UChar *format_str = rb_str_xcopy_uchars(fmt, &format_len);
if (format_len == 0) {
goto bail;
}
UChar *tmp = (UChar *)xmalloc(format_len * sizeof(UChar));
memcpy(tmp, format_str, format_len * sizeof(UChar));
if (need_free) {
free(format_str);
}
format_str = tmp;

long num, pos;
int j = 0;
Expand Down

0 comments on commit d1673a2

Please sign in to comment.