Permalink
Browse files

honor the original string encoding when generating substrings out of a regexp matcher + misc fixes (patch by Vincent Isambart)

git-svn-id: http://svn.macosforge.org/repository/ruby/MacRuby/trunk@4401 23306eb0-4c56-4727-a40e-e92c0eb68959
  • Loading branch information...
1 parent c57d706 commit f7384831ba636354ad21c2e51f49d5821a49c7d1 @lrz lrz committed Aug 3, 2010
Showing with 35 additions and 5 deletions.
  1. +10 −3 encoding.c
  2. +4 −0 re.cpp
  3. +13 −2 string.c
  4. +6 −0 symbol.c
  5. +2 −0 symbol.h
View
13 encoding.c
@@ -14,6 +14,7 @@
#include "ruby/macruby.h"
#include "ruby/encoding.h"
#include "encoding.h"
+#include "symbol.h"
VALUE rb_cEncoding;
@@ -403,10 +404,16 @@ rb_enc_from_encoding(rb_encoding_t *enc)
// Diff hunk for rb_enc_get(): lines starting with '-' are removed by this
// commit, lines starting with '+' are added, unmarked lines are unchanged.
//
// Post-patch behavior: returns the encoding associated with `obj`, chosen
// by TYPE(obj):
//   * T_STRING: RSTR(obj)->encoding when IS_RSTR(obj) is true, otherwise
//     rb_encodings[ENCODING_UTF8].
//   * T_SYMBOL: the result of calling rb_enc_get() again on rb_sym_str(obj).
//   * anything else: no case matches and the trailing `return NULL` is used.
// Pre-patch behavior: only the IS_RSTR(obj) case returned an encoding;
// everything else (including symbols, per the removed TODO) returned NULL.
rb_encoding_t *
rb_enc_get(VALUE obj)
{
- if (IS_RSTR(obj)) {
- return RSTR(obj)->encoding;
+ switch (TYPE(obj)) {
+ case T_STRING:
+ if (IS_RSTR(obj)) {
+ return RSTR(obj)->encoding;
+ }
+ return rb_encodings[ENCODING_UTF8];
+
+ case T_SYMBOL:
+ return rb_enc_get(rb_sym_str(obj));
}
- // TODO support symbols
// NOTE(review): the switch has no default case, so callers can still get
// NULL for objects that are neither strings nor symbols.
return NULL;
}
View
4 re.cpp
@@ -10,6 +10,7 @@
#include "unicode/regex.h"
#include "unicode/unistr.h"
#include "ruby/macruby.h"
+#include "ruby/encoding.h"
#include "encoding.h"
#include "objc.h"
#include "re.h"
@@ -586,6 +587,7 @@ typedef struct rb_regexp_matcher {
struct RBasic basic;
UnicodeString *unistr;
RegexMatcher *matcher;
+ rb_encoding_t *str_enc;
} rb_regexp_matcher_t;
static IMP regexp_matcher_finalize_imp_super = NULL;
@@ -628,6 +630,7 @@ rb_reg_matcher_new(VALUE re, VALUE str)
matcher->matcher = regexp_matcher;
matcher->unistr = unistr;
+ matcher->str_enc = rb_enc_get(str);
return (VALUE)matcher;
}
@@ -718,6 +721,7 @@ rb_reg_matcher_search(VALUE re, VALUE matcher, int pos, bool reverse)
}
rb_str_set_len(RMATCH(match)->str, 0);
+ rb_str_force_encoding(RMATCH(match)->str, re_matcher->str_enc);
rb_str_append_uchars(RMATCH(match)->str, re_matcher->unistr->getBuffer(),
re_matcher->unistr->length());
View
15 string.c
@@ -828,9 +828,20 @@ str_concat_bytes(rb_str_t *self, const char *bytes, long len)
// Diff hunk for str_concat_uchars(): lines starting with '-' are removed by
// this commit, lines starting with '+' are added, unmarked lines are
// unchanged.
//
// Appends `len` UChar units from `chars` to `self`.
//
// Post-patch behavior:
//   * If str_try_making_data_uchars(self) returns true, the units are
//     appended as raw bytes through str_concat_bytes(), with the byte count
//     computed by UCHARS_TO_BYTES(len).
//   * Otherwise the string's encoding is asserted to be binary
//     (BINARY_ENC), the string is resized to its current length_in_bytes
//     plus `len`, and each UChar is stored into a single char slot.
// Pre-patch behavior: the call to str_try_making_data_uchars() lived inside
// an assert(), and the units were always appended via str_concat_bytes().
static void
str_concat_uchars(rb_str_t *self, const UChar *chars, long len)
{
- assert(str_try_making_data_uchars(self));
+ if (str_try_making_data_uchars(self)) {
+ str_concat_bytes(self, (const char *)chars, UCHARS_TO_BYTES(len));
+ }
+ else {
+ assert(BINARY_ENC(RSTR(self)->encoding));
+ const long new_length_in_bytes = RSTR(self)->length_in_bytes + len;
- str_concat_bytes(self, (const char *)chars, UCHARS_TO_BYTES(len));
+ str_resize_bytes(self, new_length_in_bytes);
// length_in_bytes has not been updated yet at this point, so `ptr` is the
// data pointer offset by the pre-append length.
+ char *ptr = (RSTR(self)->data.bytes + RSTR(self)->length_in_bytes);
// NOTE(review): `i` is an int while `len` is a long -- confirm that len
// can never exceed INT_MAX here.
+ for (int i = 0; i < len; ++i) {
// Implicit UChar -> char conversion; presumably any value that does not
// fit in a char is narrowed -- TODO confirm this is intended for binary
// strings.
+ ptr[i] = chars[i];
+ }
+ self->length_in_bytes = new_length_in_bytes;
+ }
}
static void
View
6 symbol.c
@@ -837,3 +837,9 @@ rb_id_attrset(ID id)
id |= ID_ATTRSET;
return id;
}
+
// Function added by this commit (every line carries the '+' diff marker).
// Returns the `str` member of the symbol structure obtained from `sym`
// via RSYM().
+VALUE
+rb_sym_str(VALUE sym)
+{
+ return RSYM(sym)->str;
+}
View
2 symbol.h
@@ -47,6 +47,8 @@ struct rb_op_tbl_entry {
// Defined in parse.y.
extern struct rb_op_tbl_entry rb_op_tbl[];
+VALUE rb_sym_str(VALUE sym);
+
#if defined(__cplusplus)
} // extern "C"
#endif

0 comments on commit f738483

Please sign in to comment.