Skip to content

Commit

Permalink
improve the performance of String#length with multi-byte character string
Browse files Browse the repository at this point in the history

* before
                user     system      total        real
ascii       0.000000   0.000000   0.000000 (  0.002556)
utf8        2.340000   0.000000   2.340000 (  2.344955)
utf8 (dup)  3.580000   0.640000   4.220000 (  3.584336)

* after
                user     system      total        real
ascii       0.000000   0.000000   0.000000 (  0.002540)
utf8        0.010000   0.000000   0.010000 (  0.003444)
utf8 (dup)  0.600000   0.780000   1.380000 (  1.324915)

Test Script
----
# -*- coding: utf-8 -*-
# Times String#length on an ASCII string, on a string of multi-byte
# characters, and on a fresh copy of that multi-byte string.
require 'benchmark'

# Ten ASCII characters repeated 100000 times.
ascii_text = "abcdefghij" * 100000
# Five multi-byte characters repeated 100000 times.
utf8_text = "あいうえお" * 100000

Benchmark.bm(10) { |bench|
  # Length of the ASCII string, queried 1000 times.
  bench.report("ascii") { 1000.times { ascii_text.length } }

  # Length of the same multi-byte string object, queried 1000 times.
  bench.report("utf8") { 1000.times { utf8_text.length } }

  # Each iteration duplicates the multi-byte string before asking its length.
  bench.report("utf8 (dup)") { 1000.times { utf8_text.dup.length } }
}
  • Loading branch information
Watson1978 committed Jun 7, 2012
1 parent 9c78016 commit 67e47b9
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
8 changes: 8 additions & 0 deletions encoding.h
Expand Up @@ -58,6 +58,7 @@ typedef struct {
struct rb_encoding *encoding;
long capacity_in_bytes;
long length_in_bytes;
long cached_length;
char *bytes;
str_flag_t flags;
} rb_str_t;
Expand Down Expand Up @@ -192,9 +193,16 @@ div_round_up(long a, long b)

void str_update_flags(rb_str_t *self);

// Forget the length stored in self->cached_length.
// str_length_with_cache() only returns cached_length when it is non-zero,
// so storing 0 here makes that shortcut not apply.
static inline void
str_reset_cache(rb_str_t *self)
{
    self->cached_length = 0L;
}

// Zero the string's flag bits and invalidate its cached length
// (the latter through str_reset_cache()).
static inline void
str_reset_flags(rb_str_t *self)
{
    self->flags = 0;
    str_reset_cache(self);
}

Expand Down
11 changes: 11 additions & 0 deletions string.c
Expand Up @@ -289,6 +289,7 @@ str_replace_with_string(rb_str_t *self, rb_str_t *source)
str_update_flags(source);
}
self->flags = source->flags;
self->cached_length = source->cached_length;
}

static void
Expand Down Expand Up @@ -419,6 +420,11 @@ str_length_with_cache(rb_str_t *self, character_boundaries_cache_t *cache)
return cache->cached_length;
}

// TODO: might not need character_boundaries_cache_t *cache in above
if (self->cached_length != 0) {
return self->cached_length;
}

// slow paths
long length = 0;
if (IS_UTF8_ENC(self->encoding)) {
Expand All @@ -445,6 +451,8 @@ str_length_with_cache(rb_str_t *self, character_boundaries_cache_t *cache)
if (cache != NULL) {
cache->cached_length = length;
}

self->cached_length = length;
return length;
}

Expand Down Expand Up @@ -855,6 +863,7 @@ str_splice(rb_str_t *self, long pos, long len, rb_str_t *str)
const long bytes_to_splice = end.end_offset_in_bytes
- beg.start_offset_in_bytes;

str_reset_cache(self);
long bytes_to_add = 0;
if (str != NULL) {
if (!str->flags) {
Expand Down Expand Up @@ -898,6 +907,7 @@ str_delete(rb_str_t *self, long pos, long len)
{
if (str_is_ruby_ascii_only(self) &&
self->length_in_bytes <= pos + len) {
str_reset_cache(self);
self->length_in_bytes = pos;
return;
}
Expand All @@ -917,6 +927,7 @@ str_concat_bytes(rb_str_t *self, const char *bytes, long len)

const long new_length_in_bytes = self->length_in_bytes + len;

str_reset_cache(self);
str_resize_bytes(self, new_length_in_bytes);
memcpy(self->bytes + self->length_in_bytes, bytes, len);
self->length_in_bytes = new_length_in_bytes;
Expand Down

0 comments on commit 67e47b9

Please sign in to comment.