Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Fetching contributors…

Cannot retrieve contributors at this time

7100 lines (6364 sloc) 176.533 kb
/*
* MacRuby Strings.
*
* This file is covered by the Ruby license. See COPYING for more details.
*
* Copyright (C) 2012, The MacRuby Team. All rights reserved.
* Copyright (C) 2007-2011, Apple Inc. All rights reserved.
* Copyright (C) 1993-2007 Yukihiro Matsumoto
* Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
* Copyright (C) 2000 Information-technology Promotion Agency, Japan
*/
#include <stdio.h>
#include <stdarg.h>
#include <wctype.h>
#include <unistd.h>
#include "macruby_internal.h"
#include "ruby/encoding.h"
#include "encoding.h"
#include "re.h"
#include "objc.h"
#include "id.h"
#include "ruby/node.h"
#include "vm.h"
#include "class.h"
#include "encoding_ucnv.h"
#include <unicode/unum.h>
#include <unicode/utrans.h>
#include <unicode/uchar.h>
// Number of elements of a statically sized array, cast to int.
// Only meaningful when `array` is a real array (not a pointer).
#define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
// Copies the class pointer of `src` onto `dst`. The assignment is only
// performed when RSTR() yields a non-NULL pointer for both arguments.
// Wrapped in do/while(0) so it can be used as a single statement.
#define SET_CLASS(dst, src) \
do{ \
if (RSTR(dst) != NULL && RSTR(src) != NULL) { \
RBASIC(dst)->klass = RBASIC(src)->klass; \
} \
} while(0)
// Class object passed to str_alloc() by the helpers in this file that
// create new strings without a model object.
VALUE rb_cRubyString;
// NOTE(review): presumably the default field separator ($;) -- confirm.
VALUE rb_fs;
// NOTE(review): presumably a cached selector for the match operator -- confirm
// where it is initialized.
static SEL selMATCH;
// rb_str_t primitives.
// Recomputes the "valid encoding" and "ASCII only" flags of a UTF-16 string
// (native or swapped byte order) by scanning its 16-bit code units.
//
// The string is flagged invalid when:
//   - its byte length is odd,
//   - a lead surrogate is followed by anything but a trail surrogate,
//   - a trail surrogate is not preceded by a lead surrogate,
//   - a decoded code point is not a Unicode character (U_IS_UNICODE_CHAR),
//   - the string ends on a lead surrogate.
// The "ASCII only" flag is only set to true when the string is valid and
// every code unit is <= 127; an invalid string is never ASCII only.
static void
str_update_flags_utf16(rb_str_t *self)
{
    assert(IS_UTF16_ENC(self->encoding));

    bool ascii_only = true;
    bool valid_encoding = true;
    // if the length is an odd number, it can't be valid UTF-16
    if (ODD_NUMBER(self->length_in_bytes)) {
        valid_encoding = false;
    }

    UChar *uchars = (UChar *)self->bytes;
    long uchars_count = BYTES_TO_UCHARS(self->length_in_bytes);
    bool native_byte_order = IS_NATIVE_UTF16_ENC(self->encoding);
    UChar32 lead = 0;
    // the index has the same type as the count so that it cannot overflow
    for (long i = 0; i < uchars_count; ++i) {
        UChar32 c;
        if (native_byte_order) {
            c = uchars[i];
        }
        else {
            // assemble the code unit by hand: the first byte is used as
            // the high-order byte, the second one as the low-order byte
            uint8_t *bytes = (uint8_t *)&uchars[i];
            c = (uint16_t)bytes[0] << 8 | (uint16_t)bytes[1];
        }

        if (U16_IS_SURROGATE(c)) { // surrogate
            // a surrogate code unit is never an ASCII character, whether
            // it is part of a well-formed pair or not
            ascii_only = false;
            if (U16_IS_SURROGATE_LEAD(c)) { // lead surrogate
                // a lead surrogate should not be
                // after another lead surrogate
                if (lead != 0) {
                    valid_encoding = false;
                }
                lead = c;
            }
            else { // trail surrogate
                // a trail surrogate must follow a lead surrogate
                if (lead == 0) {
                    valid_encoding = false;
                }
                else {
                    c = U16_GET_SUPPLEMENTARY(lead, c);
                    if (!U_IS_UNICODE_CHAR(c)) {
                        valid_encoding = false;
                    }
                }
                lead = 0;
            }
        }
        else { // not a surrogate
            // a non-surrogate character should not be after a lead surrogate
            // and it should be a valid Unicode character
            // Warning: Ruby 1.9 does not do the IS_UNICODE_CHAR check
            // (for 1.9, 0xffff is valid though it's not a Unicode character)
            if ((lead != 0) || !U_IS_UNICODE_CHAR(c)) {
                valid_encoding = false;
            }
            if (c > 127) {
                ascii_only = false;
            }
        }
    }
    // the last character should not be a lead surrogate
    if (lead != 0) {
        valid_encoding = false;
    }

    if (valid_encoding) {
        str_set_valid_encoding(self, true);
        str_set_ascii_only(self, ascii_only);
    }
    else {
        str_set_valid_encoding(self, false);
        str_set_ascii_only(self, false);
    }
}
// Recomputes the "valid encoding" and "ASCII only" flags of a string from
// its bytes, choosing the scanning strategy from the string's encoding.
void
str_update_flags(rb_str_t *self)
{
    // An empty string is flagged both valid and ASCII only.
    if (self->length_in_bytes == 0) {
        str_set_valid_encoding(self, true);
        str_set_ascii_only(self, true);
        return;
    }

    // Binary: always valid, ASCII only when no byte has its high bit set.
    if (IS_BINARY_ENC(self->encoding)) {
        str_set_valid_encoding(self, true);
        bool seven_bit = true;
        long pos = 0;
        while (pos < self->length_in_bytes) {
            if ((uint8_t)self->bytes[pos] > 127) {
                seven_bit = false;
                break;
            }
            ++pos;
        }
        str_set_ascii_only(self, seven_bit);
        return;
    }

    // UTF-8: decode code point by code point; the first malformed sequence
    // makes the string both invalid and not ASCII only.
    if (IS_UTF8_ENC(self->encoding)) {
        bool seven_bit = true;
        bool well_formed = true;
        int offset = 0; // kept as int, like the other U8_NEXT loops here
        while (offset < self->length_in_bytes) {
            UChar32 code_point;
            U8_NEXT(self->bytes, offset, self->length_in_bytes, code_point);
            if (code_point == U_SENTINEL) {
                well_formed = false;
                seven_bit = false;
                break;
            }
            if (code_point > 127) {
                seven_bit = false;
            }
        }
        str_set_valid_encoding(self, well_formed);
        str_set_ascii_only(self, seven_bit);
        return;
    }

    // UTF-16 has a dedicated scanner; everything else goes through the
    // converter-based helper.
    if (IS_UTF16_ENC(self->encoding)) {
        str_update_flags_utf16(self);
    }
    else {
        str_ucnv_update_flags(self);
    }
}
// Returns the encoding that can hold the contents of both strings, or NULL
// when there is none.
//
// Rules, in order:
//   - same encoding, or str2 empty: str1's encoding;
//   - str1 empty: str2's encoding;
//   - either encoding not ASCII compatible: NULL;
//   - str2 ASCII only: str1's encoding;
//   - str1 ASCII only: str2's encoding;
//   - otherwise NULL.
static rb_encoding_t *
str_compatible_encoding(rb_str_t *str1, rb_str_t *str2)
{
    rb_encoding_t *first_enc = str1->encoding;
    rb_encoding_t *second_enc = str2->encoding;

    if (first_enc == second_enc || str2->length_in_bytes == 0) {
        return first_enc;
    }
    if (str1->length_in_bytes == 0) {
        return second_enc;
    }

    // From here on both strings have content in different encodings:
    // they can only be mixed through their shared ASCII subset.
    if (!(first_enc->ascii_compatible && second_enc->ascii_compatible)) {
        return NULL;
    }
    if (str_is_ruby_ascii_only(str2)) {
        return str1->encoding;
    }
    return str_is_ruby_ascii_only(str1) ? str2->encoding : NULL;
}
// Same as str_compatible_encoding() but raises EncodingCompatibilityError
// (rb_eEncCompatError) instead of returning NULL.
static rb_encoding_t *
str_must_have_compatible_encoding(rb_str_t *str1, rb_str_t *str2)
{
    rb_encoding_t *common = str_compatible_encoding(str1, str2);
    if (common != NULL) {
        return common;
    }
    rb_raise(rb_eEncCompatError,
            "incompatible character encodings: %s and %s",
            str1->encoding->public_name, str2->encoding->public_name);
    return common;
}
// Checks that `str` may be modified. Reports a frozen-object error for a
// frozen string, and raises SecurityError when the string is not untrusted
// and the safe level is 4 or more.
static void
str_modifiable(VALUE str)
{
    if (OBJ_FROZEN(str)) {
        rb_error_frozen("string");
    }
    if (!OBJ_UNTRUSTED(str)) {
        if (rb_safe_level() >= 4) {
            rb_raise(rb_eSecurityError, "Insecure: can't modify string");
        }
    }
}
// Allocates a new, empty string object of class `klass`.
// The new string has no byte buffer, a zero length and capacity, and is
// created with the UTF-8 encoding.
static rb_str_t *
str_alloc(VALUE klass)
{
    assert(rb_klass_is_rstr(klass));
    assert(klass != 0);

    NEWOBJ(fresh, rb_str_t);

    // object header
    fresh->basic.klass = klass;
    fresh->basic.flags = 0;

    // string payload: nothing allocated yet
    fresh->bytes = NULL;
    fresh->length_in_bytes = 0;
    fresh->capacity_in_bytes = 0;
    fresh->encoding = rb_encodings[ENCODING_UTF8];
    str_reset_flags(fresh);

    return fresh;
}
// Creates an empty string through rb_str_new5() using `str` as the model,
// and propagates the infection state of `str` to the result (OBJ_INFECT).
static VALUE
str_new_empty(VALUE str)
{
    VALUE empty = rb_str_new5(str, 0, 0);
    OBJ_INFECT(empty, str);
    return empty;
}
// Allocates an empty string whose class is the class of `obj`.
static VALUE
str_new_like(VALUE obj)
{
    const VALUE klass = rb_obj_class(obj);
    rb_str_t *fresh = str_alloc(klass);
    return (VALUE)fresh;
}
static void str_resize_bytes(rb_str_t *self, long new_capacity);
static void str_concat_bytes(rb_str_t *self, const char *bytes, long len);

// Replaces the content of `self` with `len` bytes taken from `bytes`,
// interpreted in encoding `enc`.
//
// When `bytes` is NULL (and len > 0) the buffer is only grown to hold `len`
// bytes and the string is left with a length of 0.
static void
str_replace_with_bytes(rb_str_t *self, const char *bytes, long len,
        rb_encoding_t *enc)
{
    assert(len >= 0);
    assert(enc != NULL);

    str_reset_flags(self);
    self->encoding = enc;

    long new_length = 0;
    if (len > 0) {
        str_resize_bytes(self, len);
        if (bytes != NULL) {
            memcpy(self->bytes, bytes, len);
            new_length = len;
        }
    }
    self->length_in_bytes = new_length;
}
// Makes `self` a copy of `source`: same bytes, encoding, flags and cached
// length. Replacing a string with itself is a no-op.
void
str_replace_with_string(rb_str_t *self, rb_str_t *source)
{
    if (self != source) {
        str_replace_with_bytes(self, source->bytes, source->length_in_bytes,
                source->encoding);
        // Make sure the source flags are computed before copying them.
        if (source->flags == 0) {
            str_update_flags(source);
        }
        self->cached_length = source->cached_length;
        self->flags = source->flags;
    }
}
// Appends the code point `c` to `self`, encoded in the string's encoding.
//
// Fast paths exist for 7-bit values in ASCII-compatible encodings, UTF-8,
// native UTF-32 and BMP code points in native UTF-16; every other case is
// handled by transcoding a temporary native UTF-32 string.
static void
str_append_uchar32(rb_str_t *self, UChar32 c)
{
    str_reset_flags(self);

    // 7-bit value in an ASCII-compatible encoding: one byte, stored as is
    if ((c <= 127) && self->encoding->ascii_compatible) {
        str_resize_bytes(self, self->length_in_bytes + 1);
        self->bytes[self->length_in_bytes] = c;
        self->length_in_bytes++;
        return;
    }

    if (IS_UTF8_ENC(self->encoding)) {
        const long encoded_size = U8_LENGTH(c);
        // nothing is appended when U8_LENGTH reports 0 bytes for `c`
        if (encoded_size > 0) {
            str_resize_bytes(self, self->length_in_bytes + encoded_size);
            // U8_APPEND_UNSAFE advances length_in_bytes itself
            // by the number of bytes it writes
            U8_APPEND_UNSAFE(self->bytes, self->length_in_bytes, c);
        }
        return;
    }

    if (IS_NATIVE_UTF32_ENC(self->encoding)) {
        str_concat_bytes(self, (char *)&c, 4);
        return;
    }

    if (IS_NATIVE_UTF16_ENC(self->encoding) && U_IS_BMP(c)) {
        UChar unit = c;
        str_concat_bytes(self, (char *)&unit, 2);
        return;
    }

    // generic path: go through a native UTF-32 string and transcode it
    rb_str_t *converted = RSTR(rb_enc_str_new((char *)&c, 4,
                rb_encodings[ENCODING_UTF32_NATIVE]));
    converted = str_simple_transcode(converted, self->encoding);
    str_concat_bytes(self, converted->bytes, converted->length_in_bytes);
}
static void str_concat_uchars(rb_str_t *self, const UChar *chars, long len);

// Replaces the content of `self` with `len` UTF-16 code units from `chars`.
// The string is emptied and switched to the UTF-8 encoding first.
//
//   - chars == NULL: only the buffer is grown (to `len` bytes), the string
//     stays empty;
//   - chars is the string's own buffer: only the byte length is set, to
//     UCHARS_TO_BYTES(len);
//   - otherwise the code units are appended through str_concat_uchars().
// NOTE(review): the second case keeps UTF-16 sized data in a string flagged
// UTF-8 -- confirm that callers rely on this.
static void
str_replace_with_uchars(rb_str_t *self, const UChar *chars, long len)
{
    assert(len >= 0);

    str_reset_flags(self);
    self->length_in_bytes = 0;
    self->encoding = rb_encodings[ENCODING_UTF8];

    if (len <= 0) {
        return;
    }
    if (chars == NULL) {
        str_resize_bytes(self, len);
        return;
    }
    if ((char *)chars == self->bytes) {
        self->length_in_bytes = UCHARS_TO_BYTES(len);
        return;
    }
    str_concat_uchars(self, chars, len);
}
// Replaces the content of `self` with the characters of the CFString
// `source`. The characters are copied to a temporary heap buffer which is
// released before returning.
static void
str_replace_with_cfstring(rb_str_t *self, CFStringRef source)
{
    const long unit_count = CFStringGetLength(source);
    UniChar *buffer = NULL;

    if (unit_count > 0) {
        buffer = (UniChar *)malloc(sizeof(UniChar) * unit_count);
        assert(buffer != NULL);
        CFStringGetCharacters(source, CFRangeMake(0, unit_count), buffer);
    }

    str_replace_with_uchars(self, buffer, unit_count);

    // free(NULL) is a no-op, so the empty-string case needs no guard
    free(buffer);
}
// Replaces the content of `self` with the content of `arg`.
// Non-string arguments are converted with rb_str_to_str() first; string
// arguments are copied either as native strings or as CFStrings.
static void
str_replace(rb_str_t *self, VALUE arg)
{
    if (TYPE(arg) != T_STRING) {
        // convert, then start over with the resulting string
        str_replace(self, rb_str_to_str(arg));
        return;
    }

    if (IS_RSTR(arg)) {
        str_replace_with_string(self, RSTR(arg));
    }
    else {
        str_replace_with_cfstring(self, (CFStringRef)arg);
    }
}
// Returns a new string of class rb_cRubyString holding a copy of `source`.
static rb_str_t *
str_dup(rb_str_t *source)
{
    rb_str_t *copy = str_alloc(rb_cRubyString);
    str_replace_with_string(copy, source);

    // carry over the ASCII-only information when the source has it
    const bool ascii_flag_known = (source->flags & STRING_ASCII_ONLY_SET) != 0;
    if (ascii_flag_known) {
        str_set_ascii_only(copy, str_is_ascii_only(source));
    }
    return copy;
}
// Returns a new string of class rb_cRubyString filled with the characters
// of the CFString `source`.
rb_str_t *
str_new_from_cfstring(CFStringRef source)
{
    rb_str_t *converted = str_alloc(rb_cRubyString);
    str_replace_with_cfstring(converted, source);
    return converted;
}
// Returns the length of `self` in UTF-16 code units: a BMP character counts
// for 1, a non-BMP character for 2, and (in UTF-8) every byte of a malformed
// sequence for 1.
//
// `cache` may be NULL. A non-negative cache->cached_length, or a non-zero
// self->cached_length, is returned as is; a freshly computed length is
// stored in both.
static long
str_length_with_cache(rb_str_t *self, character_boundaries_cache_t *cache)
{
    // fast paths
    if (self->length_in_bytes == 0) {
        return 0;
    }
    if (self->encoding->single_byte_encoding
            || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
        return self->length_in_bytes;
    }
    if (IS_UTF16_ENC(self->encoding)) {
        return div_round_up(self->length_in_bytes, 2);
    }

    // cached answers
    if (cache != NULL && cache->cached_length >= 0) {
        return cache->cached_length;
    }
    // TODO: might not need character_boundaries_cache_t *cache in above
    if (self->cached_length != 0) {
        return self->cached_length;
    }

    // slow paths
    long unit_count = 0;
    if (!IS_UTF8_ENC(self->encoding)) {
        unit_count = str_ucnv_length(self, true);
    }
    else {
        int offset = 0;
        while (offset < self->length_in_bytes) {
            UChar32 code_point;
            const int sequence_start = offset;
            U8_NEXT(self->bytes, offset, self->length_in_bytes, code_point);
            if (code_point == U_SENTINEL) {
                // every byte of a malformed sequence counts as one unit
                unit_count += offset - sequence_start;
            }
            else {
                unit_count += U_IS_BMP(code_point) ? 1 : 2;
            }
        }
    }

    if (cache != NULL) {
        cache->cached_length = unit_count;
    }
    self->cached_length = unit_count;
    return unit_count;
}
// Length of `self` in UTF-16 code units, computed without an external
// character boundaries cache.
static long
str_length(rb_str_t *self)
{
    character_boundaries_cache_t *const no_cache = NULL;
    return str_length_with_cache(self, no_cache);
}
// Raises an ArgumentError (rb_eArgError) whose message names the public
// name of the string's encoding. The declaration is wrapped in NORETURN.
NORETURN(static void
str_invalid_byte_sequence(rb_str_t *str))
{
rb_raise(rb_eArgError, "invalid byte sequence in %s", str->encoding->public_name);
}
// Calls `callback` for every Unicode character of `self`, starting at the
// byte offset `start_offset_in_bytes`.
//
// Note that each_uchar32 iterates on Unicode characters
// With a character not in the BMP the callback will only be called once!
// start_offset_in_bytes MUST be at a character boundary
//
// The callback receives the code point (U_SENTINEL for invalid data), the
// byte offset of the character, its length in bytes, and a `stop` flag it
// can set to true to end the iteration.
static void
str_each_uchar32_starting_from(rb_str_t *self,
        long start_offset_in_bytes,
        each_uchar32_callback_t callback)
{
    if (IS_BINARY_ENC(self->encoding) || IS_ASCII_ENC(self->encoding)) {
        bool stop = false;
        for (long i = start_offset_in_bytes; i < self->length_in_bytes; ++i) {
            UChar32 c = (uint8_t)self->bytes[i];
            // in US-ASCII (but not in binary) a byte > 127 is invalid
            if (!IS_BINARY_ENC(self->encoding) && c > 127) {
                c = U_SENTINEL;
            }
            callback(c, i, 1, &stop);
            if (stop) {
                return;
            }
        }
    }
    else if (IS_UTF8_ENC(self->encoding)) {
        bool stop = false;
        for (int i = start_offset_in_bytes; i < self->length_in_bytes; ) {
            UChar32 c;
            int old_i = i;
            U8_NEXT(self->bytes, i, self->length_in_bytes, c);
            int char_length = i - old_i;
            if (c == U_SENTINEL) {
                // report every byte of a malformed sequence separately
                for (long j = 0; j < char_length; ++j) {
                    callback(c, old_i+j, 1, &stop);
                    if (stop) {
                        return;
                    }
                }
            }
            else {
                callback(c, old_i, char_length, &stop);
                if (stop) {
                    return;
                }
            }
        }
    }
    else if (IS_NATIVE_UTF16_ENC(self->encoding)) {
        assert(!ODD_NUMBER(start_offset_in_bytes));
        bool stop = false;
        long length = BYTES_TO_UCHARS(self->length_in_bytes);
        UChar *uchars = (UChar *)self->bytes;
        // start at the requested offset (converted to code units), like
        // the other branches do, instead of at the beginning of the string
        for (long i = BYTES_TO_UCHARS(start_offset_in_bytes); i < length;) {
            UChar32 c;
            long old_i = i;
            U16_NEXT(uchars, i, length, c);
            callback(c, UCHARS_TO_BYTES(old_i),
                    UCHARS_TO_BYTES(i-old_i), &stop);
            if (stop) {
                return;
            }
            // in case the length changed
            // (it should not happen but you never know)
            length = BYTES_TO_UCHARS(self->length_in_bytes);
        }
    }
    else {
        str_ucnv_each_uchar32_starting_from(self,
                start_offset_in_bytes, callback);
    }
}
// Calls `callback` for every Unicode character of `self`, from the very
// first byte of the string.
static void
str_each_uchar32(rb_str_t *self, each_uchar32_callback_t callback)
{
    const long from_the_beginning = 0;
    str_each_uchar32_starting_from(self, from_the_beginning, callback);
}
// Returns the UTF-16 code unit at position `pos` of `self`, where positions
// are counted in UTF-16 code units (a non-BMP character occupies two
// positions: its lead and its trail surrogate).
//
// `pos` must satisfy 0 <= pos < str_length(self) (asserted).
// For invalid data (U_SENTINEL) the returned value is U_SENTINEL truncated
// to a UChar.
static UChar
str_get_uchar(rb_str_t *self, long pos)
{
    assert(pos >= 0 && pos < str_length(self));

    // fast path: in an ASCII only string, position == byte offset
    UChar c = self->bytes[pos];
    if ((c <= 127) && str_is_ruby_ascii_only(self)) {
        return c;
    }
    if (IS_NATIVE_UTF16_ENC(self->encoding)) {
        // the buffer is an array of 16-bit code units: index it as such
        // instead of returning the single byte found at offset `pos`
        return ((const UChar *)self->bytes)[pos];
    }

    // slow path: walk the characters, counting UTF-16 code units
    __block UChar return_value = 0;
    __block long i = 0;
    str_each_uchar32(self, ^(UChar32 c, long start_index, long char_len, bool *stop) {
        if (c == U_SENTINEL || U_IS_BMP(c)) {
            // one code unit
            if (i == pos) {
                return_value = c;
                *stop = true;
            }
            else {
                ++i;
            }
        }
        else {
            // two code units: lead surrogate then trail surrogate
            if (i == pos) {
                return_value = U16_LEAD(c);
                *stop = true;
            }
            else if (i+1 == pos) {
                return_value = U16_TRAIL(c);
                *stop = true;
            }
            else {
                i += 2;
            }
        }
    });
    return return_value;
}
// Size of the string content, in bytes.
static long
str_bytesize(rb_str_t *self)
{
    const long size_in_bytes = self->length_in_bytes;
    return size_in_bytes;
}
// Returns a new empty string with the same class and encoding as `self`.
static rb_str_t *
str_new_similar_empty_string(rb_str_t *self)
{
    const VALUE klass = rb_obj_class((VALUE)self);
    rb_str_t *empty = str_alloc(klass);
    empty->encoding = self->encoding;
    return empty;
}
// Returns a new string (same class and encoding as `self`) holding a copy
// of `length_in_bytes` bytes of `self` starting at `offset_in_bytes`.
// `length_in_bytes` must be strictly positive (asserted).
static rb_str_t *
str_new_copy_of_part(rb_str_t *self, long offset_in_bytes,
        long length_in_bytes)
{
    assert(length_in_bytes > 0);

    rb_str_t *part = str_alloc(rb_obj_class((VALUE)self));
    part->encoding = self->encoding;
    part->length_in_bytes = length_in_bytes;
    part->capacity_in_bytes = length_in_bytes;
    GC_WB(&part->bytes, xmalloc(length_in_bytes));
    memcpy(part->bytes, self->bytes + offset_in_bytes, length_in_bytes);

    // any part of a string known to be ASCII only is ASCII only too
    if (self->flags & STRING_ASCII_ONLY_SET) {
        if (self->flags & STRING_ASCII_ONLY) {
            str_set_ascii_only(part, true);
        }
    }
    return part;
}
// Raises an IndexError (rb_eIndexError); the declaration is wrapped in
// NORETURN. Called by the code in this file when a requested position
// falls between the two UTF-16 halves of a character.
//
// you cannot cut a surrogate in an encoding that is not UTF-16
// (it's in theory possible to store the surrogate in
// UTF-8 or UTF-32 but that would be incorrect Unicode)
NORETURN(static void
str_cannot_cut_surrogate(void))
{
rb_raise(rb_eIndexError, "You can't cut a surrogate in two in an encoding that is not UTF-16");
}
// Returns the byte offsets delimiting the UTF-16 code unit at position
// `index` of `self` (start offset inclusive, end offset exclusive).
//
// A negative `index` counts from the end of the string; {-1, -1} is
// returned when it is still negative after adding the length.
// In the UTF-8 path, when `index` designates one half of a non-BMP
// character only one offset is filled in: the start offset for the lead
// half, the end offset for the trail half. The other one stays at -1.
// The fast paths do not check `index` against the length of the string, so
// the returned offsets may lie past the end of the buffer.
//
// `cache` may be NULL. When given, it is consulted and updated by the slow
// paths only.
static character_boundaries_t
str_get_character_boundaries(rb_str_t *self, long index,
character_boundaries_cache_t *cache)
{
character_boundaries_t boundaries = {-1, -1};
// fast paths
// one byte per character: the index is the byte offset
if (self->encoding->single_byte_encoding
|| (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
if (index < 0) {
index += self->length_in_bytes;
if (index < 0) {
return boundaries;
}
}
boundaries.start_offset_in_bytes = index;
boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 1;
return boundaries; // getting the offset is fast so no use caching it
}
// UTF-16: every position is one 2-byte code unit
else if (IS_UTF16_ENC(self->encoding)) {
if (index < 0) {
index += div_round_up(self->length_in_bytes, 2);
if (index < 0) {
return boundaries;
}
}
boundaries.start_offset_in_bytes = UCHARS_TO_BYTES(index);
boundaries.end_offset_in_bytes = boundaries.start_offset_in_bytes + 2;
return boundaries; // getting the offset is fast so no use caching it
}
// slow path
if (index < 0) {
index += str_length_with_cache(self, cache);
if (index < 0) {
return boundaries;
}
}
// the cache is usable only if it already holds a position
bool can_use_cache = (cache != NULL
&& cache->cached_boundaries_index >= 0);
if (can_use_cache && cache->cached_boundaries_index == index) {
return cache->cached_boundaries;
}
if (IS_UTF8_ENC(self->encoding)) {
// pos: position (in UTF-16 code units) of the character that starts
// at byte offset index_in_bytes
long pos = 0;
int index_in_bytes = 0;
// resume scanning from the cached position when it is before `index`
if (can_use_cache && cache->cached_boundaries_index < index) {
// if we are in the middle of a non-BMP character,
// end_offset_in_bytes or start_offset_in_bytes might be -1
if (cache->cached_boundaries.end_offset_in_bytes == -1) {
index_in_bytes = cache->cached_boundaries.start_offset_in_bytes;
pos = cache->cached_boundaries_index;
}
else {
index_in_bytes = cache->cached_boundaries.end_offset_in_bytes;
pos = cache->cached_boundaries_index + 1;
}
}
while (index_in_bytes < self->length_in_bytes) {
UChar32 c;
int old_index_in_bytes = index_in_bytes;
long new_pos = pos;
U8_NEXT(self->bytes, index_in_bytes, self->length_in_bytes, c);
if (c == U_SENTINEL) {
// malformed sequence: each of its bytes counts as one position
new_pos += index_in_bytes - old_index_in_bytes;
if (new_pos > index) {
boundaries.start_offset_in_bytes =
old_index_in_bytes + (index - pos);
boundaries.end_offset_in_bytes =
boundaries.start_offset_in_bytes + 1;
break;
}
}
else if (U_IS_BMP(c)) {
new_pos++;
if (new_pos > index) {
boundaries.start_offset_in_bytes = old_index_in_bytes;
boundaries.end_offset_in_bytes = index_in_bytes;
break;
}
}
else {
// non-BMP character: two positions (lead and trail halves)
new_pos += 2;
if (new_pos > index) {
if (index == pos) {
// lead half: only the start offset is known
boundaries.start_offset_in_bytes = old_index_in_bytes;
}
else {
// trail half: only the end offset is known
assert(index == pos + 1);
boundaries.end_offset_in_bytes = index_in_bytes;
}
break;
}
}
pos = new_pos;
}
}
else {
boundaries = str_ucnv_get_character_boundaries(self, index, true);
}
// remember the result for the next lookup
if (cache != NULL) {
cache->cached_boundaries_index = index;
cache->cached_boundaries = boundaries;
}
return boundaries;
}
// Returns a new string made of the characters of `self` from position
// `first` to position `last`, both inclusive.
//
// Returns NULL when `first` lies beyond the end of the string, and an empty
// string when `first` is exactly at the end. Raises (through
// str_cannot_cut_surrogate) when a boundary cannot be resolved.
// `cache` may be NULL, in which case a local cache is used.
static rb_str_t *
str_get_characters(rb_str_t *self, long first, long last,
        character_boundaries_cache_t *cache)
{
    if (self->length_in_bytes == 0) {
        return (first == 0) ? str_new_similar_empty_string(self) : NULL;
    }

    character_boundaries_cache_t local_cache;
    if (cache == NULL) {
        reset_character_boundaries_cache(&local_cache);
        cache = &local_cache;
    }

    const character_boundaries_t head =
        str_get_character_boundaries(self, first, cache);
    const character_boundaries_t tail =
        str_get_character_boundaries(self, last, cache);

    if (head.start_offset_in_bytes == -1 || tail.end_offset_in_bytes == -1) {
        // you cannot cut a surrogate in an encoding that is not UTF-16
        str_cannot_cut_surrogate();
    }

    if (head.start_offset_in_bytes == self->length_in_bytes) {
        return str_new_similar_empty_string(self);
    }
    if (head.start_offset_in_bytes > self->length_in_bytes) {
        return NULL;
    }

    // clamp the end of the range to the end of the string
    long end_offset = tail.end_offset_in_bytes;
    if (end_offset >= self->length_in_bytes) {
        end_offset = self->length_in_bytes;
    }
    return str_new_copy_of_part(self, head.start_offset_in_bytes,
            end_offset - head.start_offset_in_bytes);
}
// Makes sure the byte buffer of `self` can hold at least `new_capacity`
// bytes. The buffer is never shrunk. When it has to grow, the requested
// capacity is multiplied by 1.2 to leave room for later appends.
// Raises ArgumentError for a negative capacity.
static void
str_resize_bytes(rb_str_t *self, long new_capacity)
{
    if (new_capacity < 0) {
        rb_raise(rb_eArgError, "negative string size (or size too big)");
    }
    if (self->capacity_in_bytes >= new_capacity) {
        // already large enough
        return;
    }

    size_t padded_capacity = new_capacity * 1.2;
    if (padded_capacity > 0){
        new_capacity = padded_capacity;
    }

    if (self->bytes != NULL) {
        char *resized = xrealloc(self->bytes, new_capacity);
        // only go through the write barrier when the buffer moved
        if (resized != self->bytes) {
            GC_WB(&self->bytes, resized);
        }
    }
    else {
        GC_WB(&self->bytes, xmalloc(new_capacity));
    }
    self->capacity_in_bytes = new_capacity;
}
// Makes sure a non-empty string is followed by a '\0' byte in its buffer
// (the terminator is not counted in the length). Empty strings are left
// untouched.
static void
str_ensure_null_terminator(rb_str_t *self)
{
    const long len = self->length_in_bytes;
    if (len <= 0) {
        return;
    }
    // the byte after the content may only be read when the buffer has
    // room for it
    if (self->capacity_in_bytes != len && self->bytes[len] == '\0') {
        return;
    }
    str_resize_bytes(self, len + 1);
    self->bytes[len] = '\0';
}
// Replaces the `len` characters of `self` starting at character position
// `pos` with the content of `str`. A NULL `str` just removes the range.
// `pos` and `len` must be non-negative (asserted). Raises when `str` has
// an incompatible encoding or when a range boundary cannot be resolved
// (str_cannot_cut_surrogate).
static void
str_splice(rb_str_t *self, long pos, long len, rb_str_t *str)
{
// self[pos..pos+len] = str
assert(pos >= 0 && len >= 0);
if (str != NULL) {
str_must_have_compatible_encoding(self, str);
}
// byte offsets of the first and of the last replaced character
character_boundaries_t beg, end;
if (pos + len == 0) {
// Positioning before the string.
const long offset = 0;
beg.start_offset_in_bytes = beg.end_offset_in_bytes = offset;
end.start_offset_in_bytes = end.end_offset_in_bytes = offset;
}
else if (len == 0 && str_length(self) == pos) {
// Positioning after the string.
const long offset = self->length_in_bytes;
beg.start_offset_in_bytes = beg.end_offset_in_bytes = offset;
end.start_offset_in_bytes = end.end_offset_in_bytes = offset;
}
else {
character_boundaries_cache_t local_cache;
reset_character_boundaries_cache(&local_cache);
// Positioning in the string.
beg = str_get_character_boundaries(self, pos, &local_cache);
end = str_get_character_boundaries(self, pos + len - 1, &local_cache);
if ((beg.start_offset_in_bytes == -1) ||
(end.end_offset_in_bytes == -1)) {
// you cannot cut a surrogate in an encoding that is not UTF-16
str_cannot_cut_surrogate();
}
}
// number of bytes of self that get replaced
const long bytes_to_splice = end.end_offset_in_bytes
- beg.start_offset_in_bytes;
str_reset_cache(self);
long bytes_to_add = 0;
if (str != NULL) {
// make sure the flags of str are computed before comparing them
if (!str->flags) {
str_update_flags(str);
}
// the flags of self are only kept when str has the very same flags
if (str->length_in_bytes > 0 && self->flags != str->flags) {
str_reset_flags(self);
}
// grow the buffer only when the result is longer than self
if (str->length_in_bytes > bytes_to_splice) {
str_resize_bytes(self, self->length_in_bytes
+ (str->length_in_bytes - bytes_to_splice));
}
bytes_to_add = str->length_in_bytes;
}
if (beg.start_offset_in_bytes == end.end_offset_in_bytes
&& end.end_offset_in_bytes == self->length_in_bytes) {
if (bytes_to_add > 0) {
// We are splicing at the very end.
memcpy(self->bytes + self->length_in_bytes, str->bytes,
bytes_to_add);
}
}
else {
// We are splicing in the middle.
// first move the tail of self to its final place (the regions may
// overlap, hence memmove), then copy the new bytes into the gap
memmove(self->bytes + beg.start_offset_in_bytes + bytes_to_add,
self->bytes + end.end_offset_in_bytes,
self->length_in_bytes - end.end_offset_in_bytes);
if (bytes_to_add > 0) {
memcpy(self->bytes + beg.start_offset_in_bytes,
str->bytes, bytes_to_add);
}
}
self->length_in_bytes = self->length_in_bytes - bytes_to_splice
+ bytes_to_add;
}
// Removes `len` characters of `self` starting at character position `pos`.
// When the string is ASCII only and the range reaches the end of the
// string, the string is simply truncated at `pos`; every other case goes
// through str_splice().
static void
str_delete(rb_str_t *self, long pos, long len)
{
    const bool can_truncate = str_is_ruby_ascii_only(self)
        && self->length_in_bytes <= pos + len;

    if (!can_truncate) {
        str_splice(self, pos, len, NULL);
        return;
    }
    str_reset_cache(self);
    self->length_in_bytes = pos;
}
// Inserts `str` into `self` at character position `pos`, without removing
// anything (a splice that replaces zero characters).
static void
str_insert(rb_str_t *self, long pos, rb_str_t *str)
{
    const long replaced_characters = 0;
    str_splice(self, pos, replaced_characters, str);
}
// Appends `len` raw bytes to `self`, growing the buffer as needed.
// `bytes` must not be NULL and `len` must not be negative (asserted).
// Only the cache is reset: the flags and the encoding are left to the
// caller.
static void
str_concat_bytes(rb_str_t *self, const char *bytes, long len)
{
    assert(bytes != NULL && len >= 0);

    const long grown_length = self->length_in_bytes + len;
    str_reset_cache(self);
    str_resize_bytes(self, grown_length);

    char *write_pos = self->bytes + self->length_in_bytes;
    memcpy(write_pos, bytes, len);
    self->length_in_bytes = grown_length;
}
// Appends `len` UTF-16 code units from `chars` to `self`, converting them
// to the encoding of the string.
//
//   - UTF-8: the needed size is computed first, then the characters are
//     encoded in place;
//   - native UTF-16: the code units are appended as raw bytes;
//   - binary / US-ASCII: every code unit is stored in one byte;
//   - anything else: a temporary native UTF-16 string is transcoded.
static void
str_concat_uchars(rb_str_t *self, const UChar *chars, long len)
{
    if (len == 0) {
        return;
    }
    str_reset_flags(self);

    if (IS_UTF8_ENC(self->encoding)) {
        // first pass: size of the result
        long total_bytes = self->length_in_bytes;
        long scan = 0;
        while (scan < len) {
            UChar32 code_point;
            U16_NEXT(chars, scan, len, code_point);
            total_bytes += U8_LENGTH(code_point);
        }
        str_resize_bytes(self, total_bytes);

        // second pass: encode after the existing content
        long src = 0;
        long dst = self->length_in_bytes;
        while (src < len) {
            UChar32 code_point;
            UBool failed; // set by U8_APPEND, not inspected
            U16_NEXT(chars, src, len, code_point);
            U8_APPEND((uint8_t *)self->bytes, dst,
                    total_bytes, code_point, failed);
        }
        self->length_in_bytes = total_bytes;
        return;
    }

    if (IS_NATIVE_UTF16_ENC(self->encoding)) {
        str_concat_bytes(self, (char *)chars, UCHARS_TO_BYTES(len));
        return;
    }

    if (IS_BINARY_ENC(self->encoding) || IS_ASCII_ENC(self->encoding)) {
        const long total_bytes = RSTR(self)->length_in_bytes + len;
        str_resize_bytes(self, total_bytes);
        char *out = (RSTR(self)->bytes + RSTR(self)->length_in_bytes);
        // each code unit is narrowed to a single byte
        for (int k = 0; k < len; ++k) {
            out[k] = chars[k];
        }
        self->length_in_bytes = total_bytes;
        return;
    }

    // generic path: transcode from native UTF-16
    rb_str_t *converted = RSTR(rb_enc_str_new((char *)chars,
                UCHARS_TO_BYTES(len),
                rb_encodings[ENCODING_UTF16_NATIVE]));
    converted = str_simple_transcode(converted, self->encoding);
    str_concat_bytes(self, converted->bytes, converted->length_in_bytes);
}
// Appends the whole content of `str` to `self`. Appending an empty string
// does nothing. Raises when the encodings are incompatible; otherwise
// `self` takes the common encoding.
static void
str_concat_string(rb_str_t *self, rb_str_t *str)
{
    if (str->length_in_bytes == 0) {
        return;
    }

    rb_encoding_t *common_enc = str_must_have_compatible_encoding(self, str);

    // the flags of self survive only if str has exactly the same flags
    if (str->flags == 0) {
        str_update_flags(str);
    }
    if (self->flags != str->flags) {
        str_reset_flags(self);
    }

    self->encoding = common_enc;
    str_concat_bytes(self, str->bytes, str->length_in_bytes);
}
// Appends to `self` the `len` characters of `str` starting at character
// position `start`. `cache_for_str` is handed to the boundary lookups made
// on `str`. Raises when the encodings are incompatible or when a boundary
// cannot be resolved (str_cannot_cut_surrogate).
static void
str_concat_string_part(rb_str_t *self, rb_str_t *str, long start, long len,
        character_boundaries_cache_t *cache_for_str)
{
    assert(len >= 0 && start >= 0);
    if (len == 0) {
        return;
    }

    rb_encoding_t *common_enc = str_must_have_compatible_encoding(self, str);
    str_reset_flags(self);
    self->encoding = common_enc;

    const character_boundaries_t head =
        str_get_character_boundaries(str, start, cache_for_str);
    // with a single character the first and last boundaries are the same
    const character_boundaries_t tail = (len == 1)
        ? head
        : str_get_character_boundaries(str, start+len-1, cache_for_str);

    if (head.start_offset_in_bytes == -1 || tail.end_offset_in_bytes == -1) {
        // you cannot cut a surrogate in an encoding that is not UTF-16
        str_cannot_cut_surrogate();
    }

    str_concat_bytes(self, &str->bytes[head.start_offset_in_bytes],
            tail.end_offset_in_bytes - head.start_offset_in_bytes);
}
// Compares two strings byte by byte.
// Returns 0 when they are equal, 1 when `self` sorts after `str`, and -1
// when it sorts before. -1 is also returned when the encodings of the two
// strings are incompatible.
static int
str_compare(rb_str_t *self, rb_str_t *str)
{
    if (self == str) {
        return 0;
    }
    if (self->length_in_bytes == 0 && str->length_in_bytes == 0) {
        // both strings are empty
        return 0;
    }
    if (str_compatible_encoding(self, str) == NULL) {
        // incompatible encodings
        return -1;
    }

    const long self_len = self->length_in_bytes;
    const long other_len = str->length_in_bytes;
    const long shared_len = (self_len < other_len) ? self_len : other_len;

    const int order = memcmp(self->bytes, str->bytes, shared_len);
    if (order != 0) {
        return order > 0 ? 1 : -1;
    }
    // same prefix: the longer string sorts last
    if (self_len == other_len) {
        return 0;
    }
    return self_len > other_len ? 1 : -1;
}
// Exported entry point for str_compare(): returns the same -1 / 0 / 1
// result.
int
rstr_compare(rb_str_t *str1, rb_str_t *str2)
{
    const int order = str_compare(str1, str2);
    return order;
}
// Compares two strings byte by byte, ignoring the case of ASCII letters.
// Returns 0 when they are equal, 1 when `self` sorts after `str`, and -1
// when it sorts before. -1 is also returned when the encodings of the two
// strings are incompatible.
//
// Bytes are compared as unsigned values, so that the ordering of bytes
// >= 0x80 does not depend on the signedness of `char` and agrees with the
// memcmp()-based ordering of str_compare().
static int
str_case_compare(rb_str_t *self, rb_str_t *str)
{
    if (self == str) {
        return 0;
    }

    if (self->length_in_bytes == 0 && str->length_in_bytes == 0) {
        // both strings are empty
        return 0;
    }

    if (str_compatible_encoding(self, str) == NULL) {
        // incompatible encodings
        return -1;
    }

    const long min_length = self->length_in_bytes < str->length_in_bytes
        ? self->length_in_bytes : str->length_in_bytes;

    for (long i = 0; i < min_length; i++) {
        unsigned char c1 = (unsigned char)self->bytes[i];
        unsigned char c2 = (unsigned char)str->bytes[i];

        if (c1 != c2) {
            // only ASCII bytes are case folded; others are compared as is
            c1 = isascii(c1) ? (unsigned char)toupper(c1) : c1;
            c2 = isascii(c2) ? (unsigned char)toupper(c2) : c2;
            if (c1 != c2) {
                return c1 < c2 ? -1 : 1;
            }
        }
    }

    // same prefix: the longer string sorts last
    if (self->length_in_bytes == str->length_in_bytes) {
        return 0;
    }
    return self->length_in_bytes > str->length_in_bytes ? 1 : -1;
}
// Converts a byte offset in `self` to a character index (counted in UTF-16
// code units).
// Returns -1 when the offset is outside of the string, or, for UTF-16, when
// it falls in the middle of a code unit.
static long
str_offset_in_bytes_to_index(rb_str_t *self, long offset_in_bytes)
{
    if ((offset_in_bytes >= self->length_in_bytes) || (offset_in_bytes < 0)) {
        return -1;
    }
    if (offset_in_bytes == 0) {
        return 0;
    }

    if (self->encoding->single_byte_encoding
            || (self->encoding->ascii_compatible && str_is_ascii_only(self))) {
        // one byte per character
        return offset_in_bytes;
    }
    else if (IS_UTF16_ENC(self->encoding)) {
        if (ODD_NUMBER(offset_in_bytes)) {
            // an odd offset is in the middle of a 2-byte code unit:
            // there's no valid index
            return -1;
        }
        return BYTES_TO_UCHARS(offset_in_bytes);
    }
    else {
        return str_ucnv_offset_in_bytes_to_index(self, offset_in_bytes, true);
    }
}
// Searches `searched` in `self` and returns the character index (in UTF-16
// code units) of the match, or -1 when there is none.
//
// start_index     - first character position that can start a match.
// end_index       - last character position that can start a match; a
//                   negative value means "up to the end of the string".
// backward_search - false: return the first match at or after start_index;
//                   true: return the last match at or before end_index.
// cache           - boundaries cache for `self`; may be NULL.
//
// Raises when the encodings are incompatible, or when start_index /
// end_index designate the trail half of a non-BMP character
// (str_cannot_cut_surrogate).
static long
str_index_for_string_with_cache(rb_str_t *self, rb_str_t *searched,
        long start_index, long end_index, bool backward_search,
        character_boundaries_cache_t *cache)
{
    str_must_have_compatible_encoding(self, searched);

    if (searched->length_in_bytes == 0 && self->length_in_bytes == 0) {
        return start_index;
    }
    if (searched->length_in_bytes > self->length_in_bytes) {
        return -1;
    }

    // byte offset where the search starts
    long start_offset_in_bytes;
    if (start_index == 0) {
        start_offset_in_bytes = 0;
    }
    else {
        character_boundaries_t boundaries = str_get_character_boundaries(self,
                start_index, cache);
        if (boundaries.start_offset_in_bytes == -1) {
            if (boundaries.end_offset_in_bytes == -1) {
                return -1;
            }
            else {
                // you cannot cut a surrogate in an encoding that is not UTF-16
                str_cannot_cut_surrogate();
            }
        }
        start_offset_in_bytes = boundaries.start_offset_in_bytes;
    }

    // a string can only be found in itself at its very beginning
    if (self == searched) {
        if (start_offset_in_bytes == 0) {
            return 0;
        }
        else {
            return -1;
        }
    }
    if (start_offset_in_bytes >= self->length_in_bytes) {
        return -1;
    }

    // byte offset of the last position where a match can start
    long last_offset_in_bytes;
    if (end_index < 0
            || end_index == str_length_with_cache(self, cache)) {
        last_offset_in_bytes = self->length_in_bytes;
    }
    else {
        character_boundaries_t boundaries = str_get_character_boundaries(self,
                end_index, cache);
        if (boundaries.start_offset_in_bytes == -1) {
            if (boundaries.end_offset_in_bytes == -1) {
                return -1;
            }
            else {
                // you cannot cut a surrogate in an encoding that is not UTF-16
                str_cannot_cut_surrogate();
            }
        }
        last_offset_in_bytes = boundaries.start_offset_in_bytes;
    }

    // a match cannot start so late that `searched` would run past the end
    // of `self`; this also keeps the memcmp() calls below inside the buffer
    long min_end_offset = self->length_in_bytes - searched->length_in_bytes;
    if (last_offset_in_bytes > min_end_offset) {
        last_offset_in_bytes = min_end_offset;
    }

    if (!backward_search) {
        if (searched->length_in_bytes == 0) {
            assert(start_index >= 0);
            return start_index;
        }

        __block long returned_index = -1;
        __block long current_index = start_index;
        str_each_uchar32_starting_from(self, start_offset_in_bytes,
                ^(UChar32 c, long character_start_offset, long char_len, bool *stop) {
            if (character_start_offset > last_offset_in_bytes) {
                // not enough characters left: we could not find the string
                *stop = true;
                return;
            }
            if (memcmp(self->bytes + character_start_offset,
                        searched->bytes, searched->length_in_bytes) == 0) {
                returned_index = current_index;
                *stop = true;
                return;
            }
            // an invalid byte (U_SENTINEL) occupies one position, like a
            // BMP character; only non-BMP characters occupy two positions
            if (c == U_SENTINEL || U_IS_BMP(c)) {
                ++current_index;
            }
            else {
                current_index += 2;
            }
        });
        return returned_index;
    }

    // backward search
    if (searched->length_in_bytes == 0) {
        if (end_index < 0) {
            return str_length_with_cache(self, cache);
        }
        else {
            return end_index;
        }
    }

    for (;;) {
        // look for the last byte-wise match at or before last_offset_in_bytes
        long offset_found = -1;
        for (long offset = last_offset_in_bytes;
                offset >= start_offset_in_bytes;
                --offset) {
            if (memcmp(self->bytes + offset, searched->bytes,
                        searched->length_in_bytes) == 0) {
                offset_found = offset;
                break;
            }
        }
        if (offset_found < 0) {
            // not found
            return -1;
        }

        long index = str_offset_in_bytes_to_index(RSTR(self), offset_found);
        if (index != -1) {
            // the offset was valid, at the start of a character
            return index;
        }
        // the match was not at a character boundary: keep searching
        // before it
        last_offset_in_bytes = offset_found - 1;
    }
}
// Same as str_index_for_string_with_cache(), using a fresh boundaries
// cache that only lives for the duration of the call.
static long
str_index_for_string(rb_str_t *self, rb_str_t *searched, long start_index,
        long end_index, bool backward_search)
{
    character_boundaries_cache_t scratch_cache;
    reset_character_boundaries_cache(&scratch_cache);

    const long found_index = str_index_for_string_with_cache(self, searched,
            start_index, end_index, backward_search, &scratch_cache);
    return found_index;
}
// Returns true when `searched` occurs anywhere in `self` (forward search
// over the whole string, without a boundaries cache).
static bool
str_include_string(rb_str_t *self, rb_str_t *searched)
{
    const long found_index = str_index_for_string_with_cache(self, searched,
            0, -1, false, NULL);
    return found_index != -1;
}
// Returns `str` as an rb_str_t.
// Symbols are converted with rb_sym_to_s(), other non-string objects with
// rb_str_to_str(). A string that is not a native one (IS_RSTR) is treated
// as a CFString and copied into a new native string.
rb_str_t *
str_need_string(VALUE str)
{
    const int type = TYPE(str);
    if (type == T_SYMBOL) {
        str = rb_sym_to_s(str);
    }
    else if (type != T_STRING) {
        str = rb_str_to_str(str);
    }

    if (IS_RSTR(str)) {
        return (rb_str_t *)str;
    }
    return str_new_from_cfstring((CFStringRef)str);
}
// Copies `range_length_in_uchars` UTF-16 code units of `self`, starting at
// code unit position `range_start_offset_in_uchars`, into `buffer`.
//
// `buffer` must have room for range_length_in_uchars code units. Nothing is
// written when the length is zero or negative. A non-BMP character is
// written as a lead/trail surrogate pair; when the range starts or ends in
// the middle of such a character, only the half inside the range is
// written. Invalid bytes are copied as individual code units.
static void
str_extract_uchars_range(rb_str_t *self, long range_start_offset_in_uchars,
        long range_length_in_uchars, UChar *buffer)
{
    if (range_length_in_uchars <= 0) {
        return;
    }

    if (self->encoding->ascii_compatible && str_is_ascii_only(self)) {
        // one byte per code unit: widen the bytes
        char *source_bytes = &self->bytes[range_start_offset_in_uchars];
        for (long i = 0; i < range_length_in_uchars; ++i) {
            buffer[i] = source_bytes[i];
        }
    }
    else if (IS_UTF8_ENC(self->encoding)) {
        // pos_in_src: code unit position of the character being decoded
        // pos_in_dst: number of code units already written to buffer
        long pos_in_src = 0;
        long pos_in_dst = 0;
        for (int i = 0; i < self->length_in_bytes; ) {
            UChar32 c;
            int old_i = i;
            U8_NEXT(self->bytes, i, self->length_in_bytes, c);
            if (c == U_SENTINEL) {
                // malformed sequence: one code unit per byte
                int diff = i - old_i;
                if (pos_in_src + diff > range_start_offset_in_uchars) {
                    int start = range_start_offset_in_uchars - pos_in_src;
                    if (start < 0) {
                        start = 0;
                    }
                    for (int j = start; j < diff && pos_in_dst < range_length_in_uchars; ++j) {
                        buffer[pos_in_dst++] = self->bytes[old_i+j];
                    }
                }
                pos_in_src += diff;
            }
            else if (U_IS_BMP(c)) {
                if (pos_in_src >= range_start_offset_in_uchars) {
                    buffer[pos_in_dst++] = c;
                }
                ++pos_in_src;
            }
            else {
                if (pos_in_src >= range_start_offset_in_uchars) {
                    buffer[pos_in_dst++] = U16_LEAD(c);
                    if (pos_in_dst < range_length_in_uchars) {
                        buffer[pos_in_dst++] = U16_TRAIL(c);
                    }
                }
                else if (pos_in_src + 1 == range_start_offset_in_uchars) {
                    // the range starts on the trail half of this character:
                    // the position must be compared with the start of the
                    // range (as the generic branch below does)
                    buffer[pos_in_dst++] = U16_TRAIL(c);
                }
                pos_in_src += 2;
            }
            if (pos_in_dst >= range_length_in_uchars) {
                break;
            }
        }
    }
    else if (IS_NATIVE_UTF16_ENC(self->encoding)) {
        // the buffer already holds native code units: plain copy
        memcpy(buffer,
                &self->bytes[UCHARS_TO_BYTES(range_start_offset_in_uchars)],
                UCHARS_TO_BYTES(range_length_in_uchars));
    }
    else {
        __block long pos_in_src = 0;
        __block long pos_in_dst = 0;
        str_each_uchar32(self, ^(UChar32 c, long start_index, long char_len, bool *stop) {
            if (pos_in_src >= range_start_offset_in_uchars) {
                if (c == U_SENTINEL) {
                    if (char_len == 1) {
                        buffer[pos_in_dst++] = self->bytes[start_index];
                    }
                    else {
                        // fold the bytes of the invalid character into one
                        // code unit, following the encoding's byte order
                        UChar accumulator = 0;
                        if (self->encoding->little_endian) {
                            for (long i = char_len-1; i >= 0; --i) {
                                accumulator = accumulator << 8
                                    | self->bytes[start_index+i];
                            }
                        }
                        else {
                            for (long i = 0; i < char_len; ++i) {
                                accumulator = accumulator << 8
                                    | self->bytes[start_index+i];
                            }
                        }
                        buffer[pos_in_dst++] = accumulator;
                    }
                }
                else if (U_IS_BMP(c)) {
                    buffer[pos_in_dst++] = c;
                }
                else {
                    buffer[pos_in_dst++] = U16_LEAD(c);
                    if (pos_in_dst < range_length_in_uchars) {
                        buffer[pos_in_dst++] = U16_TRAIL(c);
                    }
                }
            }
            if ((c == U_SENTINEL) || U_IS_BMP(c)) {
                pos_in_src++;
            }
            else {
                // the range starts on the trail half of this character
                if (pos_in_src + 1 == range_start_offset_in_uchars) {
                    buffer[pos_in_dst++] = U16_TRAIL(c);
                }
                pos_in_src += 2;
            }
            if (pos_in_dst >= range_length_in_uchars) {
                *stop = true;
            }
        });
        assert(pos_in_dst == range_length_in_uchars);
    }
}
/*
 * Fills `buf` with the UTF-16 code units of `str` (an RSTR or a CFString).
 *
 * buf->chars ends up NULL for an empty string, pointing straight at the
 * string bytes for a native UTF-16 RSTR, at buf->static_buf when the content
 * fits in STR_UCHARS_STATIC_BUFSIZE units, or at an xmalloc'ed buffer
 * otherwise. buf->len receives the number of units.
 */
void
rb_str_get_uchars_always(VALUE str, rb_str_uchars_buf_t *buf)
{
    long count = 0;
    buf->chars = NULL;

    if (!IS_RSTR(str)) {
        CFStringRef cfstr = (CFStringRef)str;
        count = CFStringGetLength(cfstr);
        if (count > 0) {
            if (count <= STR_UCHARS_STATIC_BUFSIZE) {
                buf->chars = buf->static_buf;
            }
            else {
                buf->chars = (UChar *)xmalloc(sizeof(UChar) * count);
                assert(buf->chars != NULL);
            }
            CFStringGetCharacters(cfstr, CFRangeMake(0, count), buf->chars);
        }
    }
    else {
        rb_str_t *rstr = RSTR(str);
        if (rstr->length_in_bytes > 0) {
            count = str_length(rstr);
            if (IS_NATIVE_UTF16_ENC(rstr->encoding)) {
                // the bytes already are UTF-16 units, no copy needed
                buf->chars = (UChar *)rstr->bytes;
            }
            else {
                if (count <= STR_UCHARS_STATIC_BUFSIZE) {
                    buf->chars = buf->static_buf;
                }
                else {
                    buf->chars = (UChar *)xmalloc(sizeof(UChar) * count);
                    assert(buf->chars != NULL);
                }
                str_extract_uchars_range(rstr, 0, count, buf->chars);
            }
        }
    }

    buf->len = count;
}
/*
 * Returns an xmalloc'ed copy of the UTF-16 code units of `str` (an RSTR or a
 * CFString), or NULL when the string is empty. When `len_p` is not NULL it
 * receives the number of units.
 */
UChar *
rb_str_xcopy_uchars(VALUE str, long *len_p)
{
    UChar *copy = NULL;
    long count = 0;

    if (!IS_RSTR(str)) {
        CFStringRef cfstr = (CFStringRef)str;
        count = CFStringGetLength(cfstr);
        if (count > 0) {
            copy = (UChar *)xmalloc(sizeof(UChar) * count);
            CFStringGetCharacters(cfstr, CFRangeMake(0, count), copy);
        }
    }
    else {
        rb_str_t *rstr = RSTR(str);
        count = str_length(rstr);
        if (count > 0) {
            copy = (UChar *)xmalloc(sizeof(UChar) * count);
            str_extract_uchars_range(rstr, 0, count, copy);
        }
    }

    if (len_p != NULL) {
        *len_p = count;
    }
    return copy;
}
/*
 * Returns the substring of `str` made of `len` characters starting at
 * character index `beg` (a negative `beg` counts from the end).
 *
 * Returns Qnil when `len` is negative, when `beg` falls outside the string, or
 * when the characters could not be extracted. Returns an empty string when
 * `len` is 0 or `beg` is exactly the string length. `len` is clamped to the
 * number of characters available after `beg`.
 *
 * `cache` is handed to the length / extraction helpers as-is.
 */
static VALUE
rstr_substr_with_cache(VALUE str, long beg, long len,
        character_boundaries_cache_t *cache)
{
    if (len < 0) {
        return Qnil;
    }
    const long n = str_length_with_cache(RSTR(str), cache);
    if (beg < 0) {
        beg += n;
    }
    if (beg > n || beg < 0) {
        return Qnil;
    }
    if (len == 0 || beg == n) {
        return str_new_empty(str);
    }
    // here 0 <= beg < n; compare against the remaining length instead of
    // computing beg + len, which overflows for a very large len
    if (len > n - beg) {
        len = n - beg;
    }
    rb_str_t *substr = str_get_characters(RSTR(str), beg, beg + len - 1, cache);
    if (substr == NULL) {
        // nothing to propagate the taint to
        return Qnil;
    }
    OBJ_INFECT(substr, str);
    return (VALUE)substr;
}
/*
 * rstr_substr_with_cache() without a character boundaries cache.
 */
static VALUE
rstr_substr(VALUE str, long beg, long len)
{
    character_boundaries_cache_t *no_cache = NULL;
    return rstr_substr_with_cache(str, beg, len, no_cache);
}
/*
 * Replaces `len` characters of `self`, starting at character index `beg`
 * (negative values count from the end), with `str`.
 *
 * Raises IndexError for a negative length or a start index outside the
 * string; a length running past the end of the string is clamped.
 */
static void
rstr_splice(VALUE self, long beg, long len, VALUE str)
{
    rb_str_t *replacement = str_need_string(str);

    if (len < 0) {
        rb_raise(rb_eIndexError, "negative length %ld", len);
    }

    const long total = str_length(RSTR(self));
    if (total < beg || (beg < 0 && -beg > total)) {
        rb_raise(rb_eIndexError, "index %ld out of string", beg);
    }
    if (beg < 0) {
        beg += total;
    }

    if (total < len || total < beg + len) {
        len = total - beg;
    }

    rstr_modify(self);
    str_splice(RSTR(self), beg, len, replacement);
    OBJ_INFECT(self, replacement);
}
/*
 * Appends `substr` (converted with str_need_string()) to `str` and propagates
 * its taint. No modification check is done here.
 */
static void
rstr_append(VALUE str, VALUE substr)
{
    rb_str_t *tail = str_need_string(substr);
    str_concat_string(RSTR(str), tail);
    OBJ_INFECT(str, substr);
}
/*
 * Appends the NUL-terminated ASCII string `cstr` to `self`. The bytes are
 * appended directly when the encoding of `self` is ASCII-compatible; otherwise
 * they are first transcoded to that encoding.
 */
static void inline
str_concat_ascii_cstr(rb_str_t *self, char *cstr)
{
    str_reset_flags(self);
    const long cstr_len = strlen(cstr);

    if (!self->encoding->ascii_compatible) {
        rb_str_t *ascii = RSTR(rb_enc_str_new(cstr, cstr_len, rb_encodings[ENCODING_ASCII]));
        rb_str_t *converted = str_simple_transcode(ascii, self->encoding);
        str_concat_bytes(self, converted->bytes, converted->length_in_bytes);
        return;
    }
    str_concat_bytes(self, cstr, cstr_len);
}
/*
 * Transcodes `self`, interpreted as `src_encoding`, into a newly allocated
 * string in `dst_encoding`. `self` is not modified.
 *
 * The conversion goes through UTF-16: chunks of valid source bytes are turned
 * into UTF-16, optionally XML-escaped, then converted to the destination
 * encoding. A binary source or destination encoding is treated as ASCII for
 * the conversion itself.
 *
 * behavior_for_invalid decides what happens on bytes that are invalid in the
 * source encoding (raise InvalidByteSequenceError or insert replacement_str;
 * any other value aborts). behavior_for_undefined decides what happens on
 * characters that do not exist in the destination encoding (raise
 * UndefinedConversionError, insert replacement_str, or insert an XML numeric
 * character reference). With the XML behaviors, `&`, `<` and `>` are escaped
 * too; with the XML_ATTR behavior `"` is escaped as well and the result is
 * wrapped in double quotes.
 *
 * replacement_str is only required when one of the behaviors is
 * TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING; it must then be empty or already in
 * dst_encoding.
 */
rb_str_t *
str_transcode(rb_str_t *self, rb_encoding_t *src_encoding, rb_encoding_t *dst_encoding,
        int behavior_for_invalid, int behavior_for_undefined, rb_str_t *replacement_str)
{
    if ((behavior_for_invalid == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)
            || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING)) {
        assert(replacement_str != NULL);
        assert(replacement_str->encoding != NULL);
        assert((replacement_str->length_in_bytes == 0) || (replacement_str->encoding == dst_encoding));
    }

    rb_str_t *dst_str = str_alloc(rb_cRubyString);
    dst_str->encoding = dst_encoding;

    // an empty source gives an empty result, except for XML_ATTR where the
    // surrounding quotes still have to be emitted
    if ((self->length_in_bytes == 0) &&
            (behavior_for_undefined != TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR)) {
        return dst_str;
    }

    rb_encoding_t *src_encoding_used;
    rb_encoding_t *dst_encoding_used;
    if (IS_BINARY_ENC(dst_encoding)) {
        dst_encoding_used = rb_encodings[ENCODING_ASCII];
    }
    else {
        dst_encoding_used = dst_encoding;
    }
    if (IS_BINARY_ENC(src_encoding)) {
        src_encoding_used = rb_encodings[ENCODING_ASCII];
    }
    else {
        src_encoding_used = src_encoding;
    }

    if (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR) {
        str_concat_ascii_cstr(dst_str, "\"");
    }

    long pos_in_src = 0;
    for (;;) {
        UChar *utf16;
        long utf16_length;

        // we need to transcode even if the source encoding is native UTF-16
        // because it could contain invalid bytes
        str_ucnv_transcode_to_utf16(src_encoding_used,
                self, &pos_in_src, &utf16, &utf16_length);

        if (utf16_length > 0) {
            if ((behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT)
                    || (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR)) {
                // first pass: compute the length once the entities are expanded
                long new_utf16_length = 0;
                for (long i = 0; i < utf16_length; ++i) {
                    switch (utf16[i]) {
                        case '&':
                            new_utf16_length += 5;
                            break;
                        case '<':
                        case '>':
                            new_utf16_length += 4;
                            break;
                        case '"':
                            if (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR) {
                                new_utf16_length += 6;
                            }
                            else {
                                ++new_utf16_length;
                            }
                            break;
                        default:
                            ++new_utf16_length;
                    }
                }
                // second pass, only when something actually has to be escaped
                if (new_utf16_length != utf16_length) {
                    UChar *new_utf16 = xmalloc(UCHARS_TO_BYTES(new_utf16_length));
                    long new_utf16_pos = 0;
                    for (long i = 0; i < utf16_length; ++i) {
                        switch (utf16[i]) {
                            case '&':
                                new_utf16[new_utf16_pos++] = '&';
                                new_utf16[new_utf16_pos++] = 'a';
                                new_utf16[new_utf16_pos++] = 'm';
                                new_utf16[new_utf16_pos++] = 'p';
                                new_utf16[new_utf16_pos++] = ';';
                                break;
                            case '<':
                                new_utf16[new_utf16_pos++] = '&';
                                new_utf16[new_utf16_pos++] = 'l';
                                new_utf16[new_utf16_pos++] = 't';
                                new_utf16[new_utf16_pos++] = ';';
                                break;
                            case '>':
                                new_utf16[new_utf16_pos++] = '&';
                                new_utf16[new_utf16_pos++] = 'g';
                                new_utf16[new_utf16_pos++] = 't';
                                new_utf16[new_utf16_pos++] = ';';
                                break;
                            case '"':
                                if (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR) {
                                    new_utf16[new_utf16_pos++] = '&';
                                    new_utf16[new_utf16_pos++] = 'q';
                                    new_utf16[new_utf16_pos++] = 'u';
                                    new_utf16[new_utf16_pos++] = 'o';
                                    new_utf16[new_utf16_pos++] = 't';
                                    new_utf16[new_utf16_pos++] = ';';
                                }
                                else {
                                    new_utf16[new_utf16_pos++] = utf16[i];
                                }
                                break;
                            default:
                                new_utf16[new_utf16_pos++] = utf16[i];
                        }
                    }
                    utf16_length = new_utf16_length;
                    utf16 = new_utf16;
                }
            }

            long utf16_pos = 0;
            for (;;) {
                long bytes_length;
                char *bytes;
                str_ucnv_transcode_from_utf16(dst_encoding_used,
                        utf16, utf16_length, &utf16_pos, &bytes, &bytes_length);
                if (bytes_length > 0) {
                    str_concat_bytes(dst_str, bytes, bytes_length);
                }

                if (utf16_pos < utf16_length) {
                    // undefined char
                    UChar32 c;
                    U16_NEXT(utf16, utf16_pos, utf16_length, c);
                    switch (behavior_for_undefined) {
                        case TRANSCODE_BEHAVIOR_RAISE_EXCEPTION:
                            rb_raise(rb_eUndefinedConversionError, "U+%04X from %s to %s", c, src_encoding->public_name, dst_encoding->public_name);
                            break;
                        case TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING:
                            if (replacement_str->length_in_bytes > 0) {
                                str_concat_bytes(dst_str, replacement_str->bytes, replacement_str->length_in_bytes);
                            }
                            break;
                        case TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_TEXT:
                        case TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR:
                            {
                                // the longest reference, "&#x10FFFF;", needs
                                // 10 characters plus the terminating NUL
                                char xml[16];
                                snprintf(xml, sizeof(xml), "&#x%X;", (unsigned int)c);
                                str_concat_ascii_cstr(dst_str, xml);
                            }
                            break;
                        default:
                            abort();
                    }
                }
                if (utf16_pos == utf16_length) {
                    break;
                }
            }
        }

        if (pos_in_src < self->length_in_bytes) {
            // invalid bytes
            long invalid_bytes_length = src_encoding->min_char_size;
            if (invalid_bytes_length + pos_in_src > self->length_in_bytes) {
                invalid_bytes_length = self->length_in_bytes - pos_in_src;
            }
            switch (behavior_for_invalid) {
                case TRANSCODE_BEHAVIOR_RAISE_EXCEPTION:
                    {
                        // 4 characters ("\xNN") per byte, plus the final NUL
                        const size_t bytes_list_size =
                            (size_t)invalid_bytes_length * 4 + 1;
                        char *bytes_list = xmalloc(bytes_list_size);
                        bytes_list[0] = '\0';
                        for (long i = 0; i < invalid_bytes_length; ++i) {
                            snprintf(bytes_list + i * 4,
                                    bytes_list_size - (size_t)i * 4,
                                    "\\x%02X",
                                    (unsigned char)self->bytes[pos_in_src+i]);
                        }
                        rb_raise(rb_eInvalidByteSequenceError, "\"%s\" on %s", bytes_list, src_encoding->public_name);
                    }
                    break;
                case TRANSCODE_BEHAVIOR_REPLACE_WITH_STRING:
                    if (replacement_str->length_in_bytes > 0) {
                        str_concat_bytes(dst_str, replacement_str->bytes, replacement_str->length_in_bytes);
                    }
                    break;
                default:
                    abort();
            }
            pos_in_src += invalid_bytes_length;
        }

        if (pos_in_src == self->length_in_bytes) {
            break;
        }
    }

    if (behavior_for_undefined == TRANSCODE_BEHAVIOR_REPLACE_WITH_XML_ATTR) {
        str_concat_ascii_cstr(dst_str, "\"");
    }

    return dst_str;
}
//----------------------------------------------
// Functions called by MacRuby
/*
 * Returns the encoding that str_compatible_encoding() reports for the two
 * arguments (converted with str_need_string()), or nil when it reports none.
 */
VALUE
mr_enc_s_is_compatible(VALUE klass, SEL sel, VALUE str1, VALUE str2)
{
    rb_str_t *first = str_need_string(str1);
    rb_str_t *second = str_need_string(str2);
    rb_encoding_t *common = str_compatible_encoding(first, second);
    return common != NULL ? (VALUE)common : Qnil;
}
/*
 * Allocator: returns a new string object of class `klass` from str_alloc().
 */
static VALUE
rstr_alloc(VALUE klass, SEL sel)
{
    rb_str_t *fresh = str_alloc(klass);
    return (VALUE)fresh;
}
/*
* call-seq:
* String.try_convert(obj) -> string or nil
*
* Try to convert <i>obj</i> into a String, using to_str method.
* Returns the converted string, or nil if <i>obj</i> cannot be converted
* for any reason.
*
* String.try_convert("str") # => str
* String.try_convert(/re/) # => nil
*/
static VALUE
rstr_try_convert(VALUE self, SEL sel, VALUE other)
{
    // the whole conversion is delegated to rb_check_string_type()
    VALUE converted = rb_check_string_type(other);
    return converted;
}
/*
* call-seq:
* str.replace(other_str) => str
*
* Replaces the contents and taintedness of <i>str</i> with the corresponding
* values in <i>other_str</i>.
*
* s = "hello" #=> "hello"
* s.replace "world" #=> "world"
*/
static VALUE
rstr_replace(VALUE self, SEL sel, VALUE arg)
{
    rstr_modify(self);
    if (self == arg) {
        // replacing a string with itself: nothing to copy
        return self;
    }
    str_replace(RSTR(self), arg);
    OBJ_INFECT(self, arg);
    return self;
}
/*
* call-seq:
* String.new(str="") => new_str
*
* Returns a new string object containing a copy of <i>str</i>.
*/
static VALUE
rstr_initialize(VALUE self, SEL sel, int argc, VALUE *argv)
{
    if (argc <= 0) {
        return self;
    }
    VALUE orig;
    if (rb_scan_args(argc, argv, "01", &orig) == 1) {
        // an initial value was given: take over its content
        rstr_replace(self, 0, orig);
    }
    return self;
}
/*
 * Duplicates `str`: allocates an instance of its first non-singleton class,
 * runs initialize_copy on it and propagates the taint.
 */
static VALUE
rstr_dup(VALUE str, SEL sel)
{
    VALUE klass;
    for (klass = CLASS_OF(str); RCLASS_SINGLETON(klass);
            klass = RCLASS_SUPER(klass)) {
        // skip singleton classes
    }
    assert(rb_klass_is_rstr(klass));

    VALUE copy = rstr_alloc(klass, 0);
    rb_obj_invoke_initialize_copy(copy, str);
    OBJ_INFECT(copy, str);
    return copy;
}
/*
* call-seq:
* string.clear -> string
*
* Makes string empty.
*
* a = "abcde"
* a.clear #=> ""
*/
static VALUE
rstr_clear(VALUE self, SEL sel)
{
    rstr_modify(self);
    // the content changes, so flags derived from the previous content must
    // not be kept (same as the other in-place mutators such as setbyte)
    str_reset_flags(RSTR(self));
    RSTR(self)->length_in_bytes = 0;
    return self;
}
/*
* call-seq:
* str.length => integer
* str.size => integer
*
* Returns the character length of <i>str</i>.
*/
static VALUE
rstr_length(VALUE self, SEL sel)
{
    // length in characters, as computed by str_length()
    const long char_count = str_length(RSTR(self));
    return INT2NUM(char_count);
}
/*
* call-seq:
* str.empty? => true or false
*
* Returns <code>true</code> if <i>str</i> has a length of zero.
*
* "hello".empty? #=> false
* "".empty? #=> true
*/
static VALUE
rstr_empty(VALUE self, SEL sel)
{
    // a string without any byte cannot contain any character
    if (RSTR(self)->length_in_bytes == 0) {
        return Qtrue;
    }
    return Qfalse;
}
/*
* call-seq:
* str.bytesize => integer
*
* Returns the length of <i>str</i> in bytes.
*/
static VALUE
rstr_bytesize(VALUE self, SEL sel)
{
    rb_str_t *rstr = RSTR(self);
    return INT2NUM(str_bytesize(rstr));
}
/*
 * Returns the encoding object stored in the string.
 */
static VALUE
rstr_encoding(VALUE self, SEL sel)
{
    rb_encoding_t *enc = RSTR(self)->encoding;
    return (VALUE)enc;
}
/*
* call-seq:
* str.getbyte(index) => 0 .. 255
*
* returns the <i>index</i>th byte as an integer.
*/
static VALUE
rstr_getbyte(VALUE self, SEL sel, VALUE index)
{
    long idx = NUM2LONG(index);
    rb_str_t *rstr = RSTR(self);
    const long nbytes = rstr->length_in_bytes;

    // negative indexes count from the end of the string
    if (idx < 0) {
        idx += nbytes;
    }
    if (idx < 0 || idx >= nbytes) {
        return Qnil;
    }

    const unsigned char byte = rstr->bytes[idx];
    return INT2FIX(byte);
}
/*
* call-seq:
* str.setbyte(index, int) => int
*
* modifies the <i>index</i>th byte as <i>int</i>.
*/
static VALUE
rstr_setbyte(VALUE self, SEL sel, VALUE idx, VALUE value)
{
    rstr_modify(self);
    long index = NUM2LONG(idx);
    const int byte = NUM2INT(value);

    rb_str_t *rstr = RSTR(self);
    const long nbytes = rstr->length_in_bytes;
    if ((index >= nbytes) || (index < -nbytes)) {
        rb_raise(rb_eIndexError, "index %ld out of string", index);
    }
    // negative indexes count from the end of the string
    if (index < 0) {
        index += nbytes;
    }

    // the new byte may change what the cached flags describe
    str_reset_flags(rstr);
    rstr->bytes[index] = byte;
    return value;
}
/*
* call-seq:
* str.to_data => NSData
*
* returns an NSData object wrapping the receiver's internal storage.
*
*/
static VALUE
rstr_to_data(VALUE self, SEL sel)
{
    rb_str_t *rstr = RSTR(self);
    const UInt8 *raw_bytes = (const UInt8 *)rstr->bytes;
    CFDataRef data = CFDataCreate(NULL, raw_bytes, rstr->length_in_bytes);
    // hand the new object over to the garbage collector
    CFMakeCollectable(data);
    return (VALUE)data;
}
/*
* call-seq:
* str.pointer => Pointer
*
* returns a Pointer object wrapping the receiver's internal storage (be
* very careful, changing the pointer will change the original string's
* content!).
*/
static VALUE
rstr_pointer(VALUE self, SEL sel)
{
    rb_str_t *rstr = RSTR(self);
    // "C" is the element type given to the Pointer; it wraps the string's
    // own bytes, not a copy
    return rb_pointer_new("C", rstr->bytes, rstr->length_in_bytes);
}
/*
* call-seq:
* str.force_encoding(encoding) => str
*
* Changes the encoding to +encoding+ and returns self.
*/
void
rb_str_force_encoding(VALUE str, rb_encoding_t *enc)
{
    assert(IS_RSTR(str));
    rb_str_t *rstr = RSTR(str);
    if (rstr->encoding == enc) {
        // same encoding: the cached flags stay valid
        return;
    }
    rstr->encoding = enc;
    str_reset_flags(rstr);
}
/*
 * String#force_encoding: checks that the receiver can be modified, then
 * retags it with the encoding designated by `encoding`.
 */
static VALUE
rstr_force_encoding(VALUE self, SEL sel, VALUE encoding)
{
    rstr_modify(self);
    rb_encoding_t *enc = rb_to_encoding(encoding);
    rb_str_force_encoding(self, enc);
    return self;
}
/*
* call-seq:
* str.valid_encoding? => true or false
*
* Returns true for a string which is encoded correctly.
*
* "\xc2\xa1".force_encoding("UTF-8").valid_encoding? => true
* "\xc2".force_encoding("UTF-8").valid_encoding? => false
* "\x80".force_encoding("UTF-8").valid_encoding? => false
*/
static VALUE
rstr_is_valid_encoding(VALUE self, SEL sel)
{
    if (str_is_valid_encoding(RSTR(self))) {
        return Qtrue;
    }
    return Qfalse;
}
/*
* call-seq:
* str.ascii_only? => true or false
*
* Returns true for a string which has only ASCII characters.
*
* "abc".force_encoding("UTF-8").ascii_only? => true
* "abc\u{6666}".force_encoding("UTF-8").ascii_only? => false
*/
static VALUE
rstr_is_ascii_only(VALUE self, SEL sel)
{
    if (str_is_ruby_ascii_only(RSTR(self))) {
        return Qtrue;
    }
    return Qfalse;
}
/*
* call-seq:
* str[fixnum] => new_str or nil
* str[fixnum, fixnum] => new_str or nil
* str[range] => new_str or nil
* str[regexp] => new_str or nil
* str[regexp, fixnum] => new_str or nil
* str[other_str] => new_str or nil
* str.slice(fixnum) => new_str or nil
* str.slice(fixnum, fixnum) => new_str or nil
* str.slice(range) => new_str or nil
* str.slice(regexp) => new_str or nil
* str.slice(regexp, fixnum) => new_str or nil
* str.slice(other_str) => new_str or nil
*
* Element Reference---If passed a single <code>Fixnum</code>, returns a
* substring of one character at that position. If passed two <code>Fixnum</code>
* objects, returns a substring starting at the offset given by the first, and
* a length given by the second. If given a range, a substring containing
* characters at offsets given by the range is returned. In all three cases, if
* an offset is negative, it is counted from the end of <i>str</i>. Returns
* <code>nil</code> if the initial offset falls outside the string, the length
* is negative, or the beginning of the range is greater than the end.
*
* If a <code>Regexp</code> is supplied, the matching portion of <i>str</i> is
* returned. If a numeric parameter follows the regular expression, that
* component of the <code>MatchData</code> is returned instead. If a
* <code>String</code> is given, that string is returned if it occurs in
* <i>str</i>. In both cases, <code>nil</code> is returned if there is no
* match.
*
* a = "hello there"
* a[1] #=> "e"
* a[1,3] #=> "ell"
* a[1..3] #=> "ell"
* a[-3,2] #=> "er"
* a[-4..-2] #=> "her"
* a[12..-1] #=> nil
* a[-2..-4] #=> ""
* a[/[aeiou](.)\1/] #=> "ell"
* a[/[aeiou](.)\1/, 0] #=> "ell"
* a[/[aeiou](.)\1/, 1] #=> "l"
* a[/[aeiou](.)\1/, 2] #=> nil
* a["lo"] #=> "lo"
* a["bye"] #=> nil
*/
static VALUE
rb_str_subpat(VALUE str, VALUE re, int nth)
{
    // search from the start of the string; on success return group `nth` of
    // the match that was just stored as the back reference
    const bool matched = rb_reg_search(re, str, 0, false) >= 0;
    if (!matched) {
        return Qnil;
    }
    return rb_reg_nth_match(nth, rb_backref_get());
}
// Implementation of String#[] / String#slice.
//
// With two arguments: (regexp, group number) or (start, length).
// With one argument: a fixnum index, a regexp, a string, a range, or any
// object convertible with NUM2LONG.
// Returns the selected substring, or Qnil when nothing is selected.
// Raises ArgumentError when argc is neither 1 nor 2.
VALUE
rstr_aref(VALUE str, SEL sel, int argc, VALUE *argv)
{
VALUE result = Qnil;
// by default the result inherits the taint of the receiver
bool tainted = OBJ_TAINTED(str);
if (argc == 2) {
if (TYPE(argv[0]) == T_REGEXP) {
result = rb_str_subpat(str, argv[0], NUM2INT(argv[1]));
}
else {
result = rstr_substr(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]));
}
goto bail;
}
if (argc != 1) {
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
}
VALUE indx = argv[0];
switch (TYPE(indx)) {
case T_FIXNUM:
result = rstr_substr(str, FIX2LONG(indx), 1);
// an empty result (index == length) is reported as nil
if (NIL_P(result) || rb_str_chars_len(result) == 0) {
return Qnil;
}
break;
case T_REGEXP:
result = rb_str_subpat(str, indx, 0);
break;
case T_STRING:
{
// the result is built from the argument, not from the receiver, so
// the receiver's taint is ignored (the arguments are still checked
// at the bail label)
tainted = false;
if (IS_RSTR(indx)) {
rb_str_t *searched = RSTR(indx);
if (str_include_string(RSTR(str), searched)) {
result = (VALUE)str_dup(searched);
SET_CLASS(result, searched);
goto bail;
}
}
else {
rb_str_t *searched =
str_new_from_cfstring((CFStringRef)indx);
if (str_include_string(RSTR(str), searched)) {
// no need to duplicate the string as we just
// created it
result = (VALUE)searched;
goto bail;
}
}
return Qnil;
}
default:
{
long beg = 0, len = 0;
// rb_range_beg_len() result: Qfalse falls through to the numeric
// index path below, Qnil makes the method return nil, anything else
// means beg/len were filled in
switch (rb_range_beg_len(indx, &beg, &len,
str_length(RSTR(str)), 0)) {
case Qfalse:
break;
case Qnil:
return Qnil;
default:
result = rstr_substr(str, beg, len);
goto bail;
}
result = rstr_substr(str, NUM2LONG(indx), 1);
if (NIL_P(result) || rb_str_chars_len(result) == 0) {
return Qnil;
}
break;
}
}
bail:
// the result is also tainted when any of the arguments is
if (!tainted) {
for (int i = 0; i < argc; i++) {
if (OBJ_TAINTED(argv[i])) {
tainted = true;
break;
}
}
}
if (tainted) {
OBJ_TAINT(result);
}
return result;
}
/*
* call-seq:
* str[fixnum] = new_str
* str[fixnum, fixnum] = new_str
* str[range] = aString
* str[regexp] = new_str
* str[regexp, fixnum] = new_str
* str[other_str] = new_str
*
* Element Assignment---Replaces some or all of the content of <i>str</i>. The
* portion of the string affected is determined using the same criteria as
* <code>String#[]</code>. If the replacement string is not the same length as
* the text it is replacing, the string will be adjusted accordingly. If the
* regular expression or string used as the index doesn't match a position
* in the string, <code>IndexError</code> is raised. If the regular expression
* form is used, the optional second <code>Fixnum</code> allows you to specify
* which portion of the match to replace (effectively using the
* <code>MatchData</code> indexing rules). The forms that take a
* <code>Fixnum</code> will raise an <code>IndexError</code> if the value is
* out of range; the <code>Range</code> form will raise a
* <code>RangeError</code>, and the <code>Regexp</code> and <code>String</code>
* forms will raise an <code>IndexError</code> when nothing matches.
*/
static void
rb_str_subpat_set(VALUE str, VALUE re, int nth, VALUE val)
{
    if (rb_reg_search(re, str, 0, false) < 0) {
        rb_raise(rb_eIndexError, "regexp not matched");
    }

    int count = 0;
    rb_match_result_t *results = rb_reg_match_results(rb_backref_get(), &count);
    assert(count > 0);

    // valid group numbers are -(count - 1) .. (count - 1)
    if (nth >= count || (nth < 0 && -nth >= count)) {
        rb_raise(rb_eIndexError, "index %d out of regexp", nth);
    }
    if (nth < 0) {
        nth += count;
    }

    const long start = results[nth].beg;
    if (start == -1) {
        rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
    }
    // replace exactly the portion covered by the group
    rstr_splice(str, start, results[nth].end - start, val);
}
/*
 * Implementation of String#[]=.
 *
 * With three arguments: (regexp, group number, value) or (start, length,
 * value). With two arguments the index may be a fixnum, a regexp, a string, a
 * range, or any object convertible with NUM2LONG.
 *
 * Returns the assigned value. Raises ArgumentError when argc is neither 2 nor
 * 3, and IndexError when a string index does not occur in the receiver (the
 * helpers called here raise for the other forms).
 */
static VALUE
rstr_aset(VALUE str, SEL sel, int argc, VALUE *argv)
{
    if (argc == 3) {
        if (TYPE(argv[0]) == T_REGEXP) {
            rb_str_subpat_set(str, argv[0], NUM2INT(argv[1]), argv[2]);
        }
        else {
            rstr_splice(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]),
                    argv[2]);
        }
        return argv[2];
    }
    if (argc != 2) {
        // this method takes an index (one or two arguments) plus the value
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 2..3)", argc);
    }

    VALUE indx = argv[0];
    VALUE val = argv[1];

    switch (TYPE(indx)) {
        case T_FIXNUM:
            rstr_splice(str, FIX2LONG(indx), 1, val);
            return val;

        case T_REGEXP:
            rb_str_subpat_set(str, indx, 0, val);
            return val;

        case T_STRING:
            {
                const long pos = str_index_for_string(RSTR(str),
                        str_need_string(indx), 0, -1, false);
                if (pos < 0) {
                    rb_raise(rb_eIndexError, "string not matched");
                }
                rstr_splice(str, pos, rb_str_chars_len(indx), val);
            }
            return val;

        default:
            break;
    }

    /* check if indx is Range */
    long beg, len;
    if (rb_range_beg_len(indx, &beg, &len, str_length(RSTR(str)), 2)) {
        rstr_splice(str, beg, len, val);
        return val;
    }

    // anything else is treated as a numeric index
    rstr_splice(str, NUM2LONG(indx), 1, val);
    return val;
}
/*
* call-seq:
* str.slice!(fixnum) => new_str or nil
* str.slice!(fixnum, fixnum) => new_str or nil
* str.slice!(range) => new_str or nil
* str.slice!(regexp) => new_str or nil
* str.slice!(other_str) => new_str or nil
*
* Deletes the specified portion from <i>str</i>, and returns the portion
* deleted.
*
* string = "this is a string"
* string.slice!(2) #=> "i"
* string.slice!(3..6) #=> " is "
* string.slice!(/s.*t/) #=> "sa st"
* string.slice!("r") #=> "r"
* string #=> "thing"
*/
static VALUE
rstr_slice_bang(VALUE str, SEL sel, int argc, VALUE *argv)
{
    if (argc < 1 || 2 < argc) {
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
    }
    rstr_modify(str);

    // same arguments, followed by an empty string used as the replacement
    VALUE args[3];
    int count = 0;
    while (count < argc) {
        args[count] = argv[count];
        count++;
    }
    args[count] = rb_str_new(NULL, 0);

    // fetch the portion first, then overwrite it with the empty string
    VALUE removed = rstr_aref(str, 0, argc, args);
    if (!NIL_P(removed)) {
        rstr_aset(str, 0, argc + 1, args);
    }
    return removed;
}
/*
* call-seq:
* str.insert(index, other_str) => str
*
* Inserts <i>other_str</i> before the character at the given
* <i>index</i>, modifying <i>str</i>. Negative indices count from the
* end of the string, and insert <em>after</em> the given character.
* The intent is to insert <i>other_str</i> so that it starts at the given
* <i>index</i>.
*
* "abcd".insert(0, 'X') #=> "Xabcd"
* "abcd".insert(3, 'X') #=> "abcXd"
* "abcd".insert(4, 'X') #=> "abcdX"
* "abcd".insert(-3, 'X') #=> "abXcd"
* "abcd".insert(-1, 'X') #=> "abcdX"
*/
static VALUE
rstr_insert(VALUE str, SEL sel, VALUE idx, VALUE substr)
{
    long pos = NUM2LONG(idx);
    if (pos == -1) {
        // rstr_append() does not check that the receiver can be modified
        // (rstr_splice() below does), so do it here
        rstr_modify(str);
        rstr_append(str, substr);
    }
    else {
        if (pos < 0) {
            // a negative index inserts after the designated character
            pos++;
        }
        rstr_splice(str, pos, 0, substr);
    }
    return str;
}
/*
* call-seq:
* str.index(substring [, offset]) => fixnum or nil
* str.index(fixnum [, offset]) => fixnum or nil
* str.index(regexp [, offset]) => fixnum or nil
*
* Returns the index of the first occurrence of the given <i>substring</i>,
* character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
* <code>nil</code> if not found. If the second parameter is present, it
* specifies the position in the string to begin the search.
*
* "hello".index('e') #=> 1
* "hello".index('lo') #=> 3
* "hello".index('a') #=> nil
* "hello".index(?e) #=> 1
* "hello".index(101) #=> 1
* "hello".index(/[aeiou]/, -3) #=> 4
*/
static VALUE
rstr_index(VALUE self, SEL sel, int argc, VALUE *argv)
{
    const long len = str_length(RSTR(self));
    VALUE sub, initpos;
    long pos = 0;

    if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
        pos = NUM2LONG(initpos);
        // a negative start position counts from the end
        if (pos < 0) {
            pos += len;
        }
        if (pos < 0) {
            if (TYPE(sub) == T_REGEXP) {
                rb_backref_set(Qnil);
            }
            return Qnil;
        }
    }

    const int sub_type = TYPE(sub);
    if (sub_type == T_REGEXP) {
        if (pos > len) {
            return Qnil;
        }
        pos = rb_reg_search(sub, self, pos, false);
    }
    else {
        if (sub_type != T_STRING) {
            StringValue(sub);
        }
        // an empty string searched at the very end keeps pos as the result
        // (RubySpec compliance)
        if (!(pos == len && rb_str_chars_len(sub) == 0)) {
            pos = str_index_for_string(RSTR(self), str_need_string(sub),
                    pos, -1, false);
        }
    }

    return pos >= 0 ? LONG2NUM(pos) : Qnil;
}
/*
* call-seq:
* str.rindex(substring [, fixnum]) => fixnum or nil
* str.rindex(fixnum [, fixnum]) => fixnum or nil
* str.rindex(regexp [, fixnum]) => fixnum or nil
*
* Returns the index of the last occurrence of the given <i>substring</i>,
* character (<i>fixnum</i>), or pattern (<i>regexp</i>) in <i>str</i>. Returns
* <code>nil</code> if not found. If the second parameter is present, it
* specifies the position in the string to end the search---characters beyond
* this point will not be considered.
*
* "hello".rindex('e') #=> 1
* "hello".rindex('l') #=> 3
* "hello".rindex('a') #=> nil
* "hello".rindex(?e) #=> 1
* "hello".rindex(101) #=> 1
* "hello".rindex(/[aeiou]/, -2) #=> 1
*/
static VALUE
rstr_rindex(VALUE self, SEL sel, int argc, VALUE *argv)
{
    const long len = str_length(RSTR(self));
    VALUE sub, initpos;
    long pos = len;

    if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
        pos = NUM2LONG(initpos);
        if (pos < 0) {
            // a negative end position counts from the end
            pos += len;
            if (pos < 0) {
                if (TYPE(sub) == T_REGEXP) {
                    rb_backref_set(Qnil);
                }
                return Qnil;
            }
        }
        // never look past the end of the string
        if (pos >= len) {
            pos = len;
        }
    }

    const int sub_type = TYPE(sub);
    if (sub_type == T_REGEXP) {
        pos = rb_reg_search(sub, self, pos, true);
    }
    else {
        if (sub_type != T_STRING) {
            StringValue(sub);
        }
        // for an empty string, pos itself is the answer
        if (rb_str_chars_len(sub) > 0) {
            pos = str_index_for_string(RSTR(self), str_need_string(sub),
                    0, pos, true);
        }
    }

    return pos >= 0 ? LONG2NUM(pos) : Qnil;
}
/*
* call-seq:
* str + other_str => new_str
*
* Concatenation---Returns a new <code>String</code> containing
* <i>other_str</i> concatenated to <i>str</i>.
*
* "Hello from " + self.to_s #=> "Hello from main"
*/
static VALUE
rstr_plus(VALUE self, SEL sel, VALUE other)
{
    StringValue(other);

    // work on a copy of the receiver, the operands are left untouched
    rb_str_t *sum = str_dup(RSTR(self));
    str_concat_string(sum, str_need_string(other));

    const bool tainted = OBJ_TAINTED(self) || OBJ_TAINTED(other);
    if (tainted) {
        OBJ_TAINT(sum);
    }
    return (VALUE)sum;
}
/*
* call-seq:
* str * integer => new_str
*
* Copy---Returns a new <code>String</code> containing <i>integer</i> copies of
* the receiver.
*
* "Ho! " * 3 #=> "Ho! Ho! Ho! "
*/
static VALUE
rstr_times(VALUE self, SEL sel, VALUE times)
{
    const long n = NUM2LONG(times);
    if (n < 0) {
        rb_raise(rb_eArgError, "negative argument");
    }
    const long unit = RSTR(self)->length_in_bytes;
    // refuse a total byte size that does not fit in a long
    if (n > 0 && LONG_MAX/n < unit) {
        rb_raise(rb_eArgError, "argument too big");
    }

    VALUE product = str_new_like(self);
    rb_str_t *result = RSTR(product);
    str_resize_bytes(result, n * unit);

    if (n != 0) {
        // start with one copy, then double the content while it fits
        str_concat_string(result, RSTR(self));
        long copies = 1;
        while (copies <= n/2) {
            str_concat_string(result, result);
            copies *= 2;
        }
        // fill in the remaining (n - copies) copies in one go
        memcpy(result->bytes + result->length_in_bytes,
                result->bytes,
                (n - copies) * unit);
        result->length_in_bytes = n * unit;
    }

    OBJ_INFECT(product, self);
    return product;
}
/*
* call-seq:
* str % arg => new_str
*
* Format---Uses <i>str</i> as a format specification, and returns the result
* of applying it to <i>arg</i>. If the format specification contains more than
* one substitution, then <i>arg</i> must be an <code>Array</code> containing
* the values to be substituted. See <code>Kernel::sprintf</code> for details
* of the format string.
*
* "%05d" % 123 #=> "00123"
* "%-5s: %08x" % [ "ID", self.object_id ] #=> "ID : 200e14d6"
*/
static VALUE
rstr_format(VALUE str, SEL sel, VALUE arg)
{
    VALUE ary = rb_check_array_type(arg);
    if (NIL_P(ary)) {
        // not an array: the argument is the only value to substitute
        return rb_str_format(1, &arg, str);
    }
    return rb_str_format(RARRAY_LENINT(ary), RARRAY_PTR(ary), str);
}
/*
* call-seq:
* str << fixnum => str
* str.concat(fixnum) => str
* str << obj => str
* str.concat(obj) => str
*
* Append---Concatenates the given object to <i>str</i>. If the object is a
* <code>Fixnum</code>, it is considered as a codepoint, and is converted
* to a character before concatenation.
*
* a = "hello "
* a << "world" #=> "hello world"
* a.concat(33) #=> "hello world!"
*/
VALUE
rstr_concat(VALUE self, SEL sel, VALUE other)
{
    rstr_modify(self);

    long codepoint = 0;
    switch (TYPE(other)) {
        case T_FIXNUM:
            codepoint = FIX2LONG(other);
            break;

        case T_BIGNUM:
            codepoint = rb_big2ulong(other);
            break;

        default:
            // not an integer: plain string concatenation
            rstr_append(self, other);
            return self;
    }

    if (IS_UTF8_ENC(RSTR(self)->encoding)) {
        // The ICU macros used below cast their argument to 32 bits, so a
        // wider value would silently be truncated to some other codepoint:
        // check the range explicitly first.
        if (codepoint < 0 || codepoint > 0x10FFFF) {
            goto out_of_range;
        }
        // U8_LENGTH gives 0 for values that cannot be encoded (surrogates)
        const int bytelen = U8_LENGTH(codepoint);
        if (bytelen <= 0) {
            goto out_of_range;
        }
        uint8_t buf[U8_MAX_LENGTH];
        int offset = 0;
        UBool error = false;
        U8_APPEND(buf, offset, bytelen, codepoint, error);
        if (error) {
            goto out_of_range;
        }
        str_reset_flags(RSTR(self));
        str_concat_bytes(RSTR(self), (const char *)buf, bytelen);
    }
    else {
        rb_raise(rb_eArgError,
                "receiver encoding `%s' not supported for codepoint insertion",
                RSTRING_PTR(rb_inspect((VALUE)RSTR(self)->encoding)));
    }
    return self;

out_of_range:
    rb_raise(rb_eArgError, "codepoint %ld out of range", codepoint);
}
/*
* call-seq:
* str == obj => true or false
*
* Equality---If <i>obj</i> is not a <code>String</code>, returns
* <code>false</code>. Otherwise, returns <code>true</code> if <i>str</i>
* <code><=></code> <i>obj</i> returns zero.
*/
static VALUE
rstr_equal(VALUE self, SEL sel, VALUE other)
{
    if (self == other) {
        return Qtrue;
    }
    if (TYPE(other) == T_STRING) {
        const bool same = str_compare(RSTR(self), str_need_string(other)) == 0;
        return same ? Qtrue : Qfalse;
    }
    // not a string: only objects responding to to_str can be equal, and it
    // is then up to them to decide
    if (!rb_respond_to(other, rb_intern("to_str"))) {
        return Qfalse;
    }
    return rb_equal(other, self);
}
/*
* call-seq:
* str <=> other_str => -1, 0, +1
*
* Comparison---Returns -1 if <i>str</i> is less than, 0 if <i>str</i> is
* equal to, and +1 if <i>str</i> is greater than <i>other_str</i>. If the
* strings are of different lengths, and the strings are
* equal when compared up to the shortest length, then the longer string is
* considered greater than the shorter one. In older versions of Ruby, setting
* <code>$=</code> allowed case-insensitive comparisons; this is now deprecated
* in favor of using <code>String#casecmp</code>.
*
* <code><=></code> is the basis for the methods <code><</code>,
* <code><=</code>, <code>></code>, <code>>=</code>, and <code>between?</code>,
* included from module <code>Comparable</code>. The method
* <code>String#==</code> does not use <code>Comparable#==</code>.
*
* "abcdef" <=> "abcde" #=> 1
* "abcdef" <=> "abcdef" #=> 0
* "abcdef" <=> "abcdefg" #=> -1
* "abcdef" <=> "ABCDEF" #=> 1
*/
static VALUE
rstr_cmp(VALUE self, SEL sel, VALUE other)
{
    if (TYPE(other) == T_STRING) {
        const long order = str_compare(RSTR(self), str_need_string(other));
        return LONG2NUM(order);
    }

    // not a string: it must respond to both to_str and <=>
    if (!rb_respond_to(other, rb_intern("to_str"))
            || !rb_vm_respond_to(other, selCmp, false)) {
        return Qnil;
    }

    // ask the other object to compare itself with us, then negate its answer
    VALUE tmp = rb_vm_call(other, selCmp, 1, &self);
    if (NIL_P(tmp)) {
        return Qnil;
    }
    if (!FIXNUM_P(tmp)) {
        return rb_vm_call(LONG2FIX(0), selMINUS, 1, &tmp);
    }
    const long negated = -FIX2LONG(tmp);
    return LONG2NUM(negated);
}
/*
* call-seq:
* str.casecmp(other_str) => -1, 0, +1 or nil
*
* Case-insensitive version of <code>String#<=></code>.
*
* "abcdef".casecmp("abcde") #=> 1
* "aBcDeF".casecmp("abcdef") #=> 0
* "abcdef".casecmp("abcdefg") #=> -1
* "abcdef".casecmp("ABCDEF") #=> 0
*/
static VALUE
rstr_casecmp(VALUE str, SEL sel, VALUE other)
{
    // other goes through str_need_string() before the case-insensitive
    // comparison is performed.
    rb_str_t *lhs = RSTR(str);
    rb_str_t *rhs = str_need_string(other);
    return INT2FIX(str_case_compare(lhs, rhs));
}
/*
* call-seq:
* str.eql?(other) => true or false
*
* Two strings are equal if they have the same length and content.
*/
static VALUE
rstr_eql(VALUE self, SEL sel, VALUE other)
{
    // The very same object is trivially eql?; otherwise other must be a
    // string that compares equal. No #to_str conversion is attempted here.
    if (self != other) {
        if (TYPE(other) != T_STRING) {
            return Qfalse;
        }
        if (str_compare(RSTR(self), str_need_string(other)) != 0) {
            return Qfalse;
        }
    }
    return Qtrue;
}
/*
* call-seq:
* str.include? other_str => true or false
* str.include? fixnum => true or false
*
* Returns <code>true</code> if <i>str</i> contains the given string or
* character.
*
* "hello".include? "lo" #=> true
* "hello".include? "ol" #=> false
* "hello".include? ?h #=> true
*/
static VALUE
rstr_includes(VALUE self, SEL sel, VALUE searched)
{
    // searched goes through str_need_string() before the lookup.
    if (str_include_string(RSTR(self), str_need_string(searched))) {
        return Qtrue;
    }
    return Qfalse;
}
/*
* call-seq:
* str.start_with?([prefix]+) => true or false
*
* Returns true if <i>str</i> starts with one of the prefixes given.
*/
static VALUE
rstr_start_with(VALUE str, SEL sel, int argc, VALUE *argv)
{
    int idx = 0;
    while (idx < argc) {
        // Arguments that cannot be converted to a string are ignored.
        VALUE prefix = rb_check_string_type(argv[idx++]);
        if (!NIL_P(prefix)) {
            // Search only within the first rb_str_chars_len(prefix)
            // characters; a hit at index 0 means str begins with prefix.
            const long prefix_len = rb_str_chars_len(prefix);
            const long found = str_index_for_string(RSTR(str),
                    str_need_string(prefix), 0, prefix_len, false);
            if (found == 0) {
                return Qtrue;
            }
        }
    }
    return Qfalse;
}
/*
* call-seq:
* str.end_with?([suffix]+) => true or false
*
* Returns true if <i>str</i> ends with one of the suffixes given.
*/
static VALUE
rstr_end_with(VALUE str, SEL sel, int argc, VALUE *argv)
{
    const long total = rb_str_chars_len(str);
    int idx = 0;
    while (idx < argc) {
        // Arguments that cannot be converted to a string are ignored.
        VALUE suffix = rb_check_string_type(argv[idx++]);
        if (NIL_P(suffix)) {
            continue;
        }
        // A suffix longer than the receiver can never match.
        const long suffix_len = rb_str_chars_len(suffix);
        if (suffix_len <= total) {
            // Look for the suffix in the tail of the receiver only.
            const long tail_start = total - suffix_len;
            const long found = str_index_for_string(RSTR(str),
                    str_need_string(suffix), tail_start, total, false);
            if (found == tail_start) {
                return Qtrue;
            }
        }
    }
    return Qfalse;
}
/*
* call-seq:
* str.to_s => str
* str.to_str => str
*
* Returns the receiver.
*/
static VALUE
rstr_to_s(VALUE self, SEL sel)
{
    // An instance of the base string class is returned untouched.
    if (CLASS_OF(self) == rb_cRubyString) {
        return self;
    }
    // Any other class gets a str_dup() copy that inherits the receiver's
    // taint/trust state through OBJ_INFECT.
    VALUE copy = (VALUE)str_dup(RSTR(self));
    OBJ_INFECT(copy, self);
    return copy;
}
/*
* call-seq:
* str.intern => symbol
* str.to_sym => symbol
*
* Returns the <code>Symbol</code> corresponding to <i>str</i>, creating the
* symbol if it did not previously exist. See <code>Symbol#id2name</code>.
*
* "Koala".intern #=> :Koala
* s = 'cat'.to_sym #=> :cat
* s == :cat #=> true
* s = '@cat'.to_sym #=> :@cat
* s == :@cat #=> true
*
* This can also be used to create symbols that cannot be represented using the
* <code>:xxx</code> notation.
*
* 'cat and dog'.to_sym #=> :"cat and dog"
*/
static VALUE
rstr_intern(VALUE self, SEL sel)
{
    // Tainted strings may not be interned once the safe level is 1 or more.
    const bool insecure = OBJ_TAINTED(self) && rb_safe_level() >= 1;
    if (insecure) {
        rb_raise(rb_eSecurityError, "Insecure: can't intern tainted string");
    }
    return rb_str_intern_fast(self);
}
// C-level entry point for String#intern: forwards to rstr_intern() with a
// null selector.
VALUE
rb_str_intern(VALUE self)
{
    VALUE symbol = rstr_intern(self, 0);
    return symbol;
}
/*
* call-seq:
* str.inspect => string
*
* Returns a printable version of _str_, surrounded by quote marks,
* with special characters escaped.
*
* str = "hello"
* str[3] = "\b"
* str.inspect #=> "\"hel\\bo\""
*/
// Appends the character c to result, preceded by a backslash when escape is
// true.
static void
inspect_append(VALUE result, UChar32 c, bool escape)
{
    rb_str_t *out = RSTR(result);
    if (escape) {
        str_append_uchar32(out, '\\');
    }
    str_append_uchar32(out, c);
}
// Builds the quoted, escaped representation of str. When dump is true, a '#'
// is additionally escaped if it is followed by '$', '@' or '{'.
static VALUE
str_inspect(rb_str_t *str, bool dump)
{
    // An empty string inspects as a bare pair of double quotes.
    if (str->length_in_bytes == 0) {
        VALUE empty = rb_str_new2("\"\"");
        OBJ_INFECT(empty, str);
        return empty;
    }

    // Length handed to rb_unicode_str_new(): 3/2 of the source byte length.
    const long initial_len = str->length_in_bytes * 3 / 2;
    VALUE result = rb_unicode_str_new(NULL, initial_len);
    inspect_append(result, '"', false);

    __block UChar32 prev = 0;
    str_each_uchar32(str, ^(UChar32 c, long start_index, long char_len, bool *stop) {
        // 1.9 considers U+00AD (soft-hyphen) printable whereas ICU does not.
        // In a binary string nothing above 127 is printable.
        const bool high_binary = IS_BINARY_ENC(str->encoding) && c > 127;
        const bool printable = !high_binary && (u_isprint(c) || c == 0xAD);

        // In dump mode a '#' is held back for one iteration so that it can
        // be escaped depending on the character that follows it.
        if (dump && prev == '#') {
            inspect_append(result, prev, (c == '$' || c == '@' || c == '{'));
        }

        if (printable) {
            // A '#' in dump mode is deferred (see above); quotes and
            // backslashes are always escaped.
            if (!(dump && c == '#')) {
                inspect_append(result, c, (c == '"' || c == '\\'));
            }
        }
        else {
            // Map well-known control characters onto their escape letter.
            UChar32 letter = 0;
            switch (c) {
                case '\n':   letter = 'n'; break;
                case '\r':   letter = 'r'; break;
                case '\t':   letter = 't'; break;
                case '\f':   letter = 'f'; break;
                case '\013': letter = 'v'; break;
                case '\010': letter = 'b'; break;
                case '\007': letter = 'a'; break;
                case 033:    letter = 'e'; break;
                default:     break;
            }
            if (letter != 0) {
                inspect_append(result, letter, true);
            }
            else {
                // Everything else is written byte by byte as \xNN.
                char hex[10];
                for (long off = 0; off < char_len; ++off) {
                    const uint8_t byte = (uint8_t)str->bytes[start_index + off];
                    snprintf(hex, sizeof hex, "\\x%02X", byte);
                    for (const char *p = hex; *p != '\0'; p++) {
                        inspect_append(result, *p, false);
                    }
                }
            }
        }
        prev = c;
    });

    // Flush a '#' that was still pending at the end of the string.
    if (dump && prev == '#') {
        inspect_append(result, prev, false);
    }
    inspect_append(result, '"', false);
    OBJ_INFECT(result, str);
    return result;
}
// String#inspect: str_inspect() without the extra dump-mode escaping.
static VALUE
rstr_inspect(VALUE self, SEL sel)
{
    const bool dump_mode = false;
    return str_inspect(RSTR(self), dump_mode);
}
/*
* call-seq:
* str.dump => new_str
*
* Produces a version of <i>str</i> with all nonprinting characters replaced by
* <code>\nnn</code> notation and all special characters escaped.
*/
static VALUE
rstr_dump(VALUE self, SEL sel)
{
    VALUE dumped = str_inspect(RSTR(self), true);
    // Overwrite the first word of the new object with the receiver's one.
    // NOTE(review): presumably this is the class pointer, so that the result
    // has the same class as the receiver -- confirm against the object layout.
    *(VALUE *)dumped = *(VALUE *)self;
    return dumped;
}
/*
* call-seq:
* str.match(pattern) => matchdata or nil
*
* Converts <i>pattern</i> to a <code>Regexp</code> (if it isn't already one),
* then invokes its <code>match</code> method on <i>str</i>. If the second
* parameter is present, it specifies the position in the string to begin the
* search.
*
* 'hello'.match('(.)\1') #=> #<MatchData "ll" 1:"l">
* 'hello'.match('(.)\1')[0] #=> "ll"
* 'hello'.match(/(.)\1/)[0] #=> "ll"
* 'hello'.match('xx') #=> nil
*
* If a block is given, it is invoked with the MatchData if the match succeeds, so
* that you can write
*
* str.match(pat) {|m| ...}
*
* instead of
*
* if m = str.match(pat)
* ...
* end
*
* The return value is a value from block execution in this case.
*/
// Turns pat into a regexp. A regexp is returned as is; anything else is
// converted to a string (a type error is raised through Check_Type when that
// conversion is not possible), optionally quoted with rb_reg_quote(), and
// compiled.
static VALUE
get_pat(VALUE pat, bool quote)
{
    const int type = TYPE(pat);
    if (type == T_REGEXP) {
        return pat;
    }
    if (type != T_STRING) {
        VALUE converted = rb_check_string_type(pat);
        if (NIL_P(converted)) {
            Check_Type(pat, T_REGEXP);
        }
        pat = converted;
    }
    VALUE source = quote ? rb_reg_quote(pat) : pat;
    return rb_reg_regcomp(source);
}
static VALUE
rstr_match2(VALUE self, SEL sel, int argc, VALUE *argv)
{
    if (argc < 1) {
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 1)", argc);
    }
    // The pattern is compiled without quoting. The receiver then takes the
    // pattern's slot in argv so the whole argument list can be forwarded.
    VALUE regexp = get_pat(argv[0], false);
    argv[0] = self;
    VALUE matchdata = rb_vm_call(regexp, selMATCH, argc, argv);

    // No match, or no block: hand back the result as is.
    if (NIL_P(matchdata) || !rb_block_given_p()) {
        return matchdata;
    }
    return rb_yield(matchdata);
}
/*
* call-seq:
* str =~ obj => fixnum or nil
*
* Match---If <i>obj</i> is a <code>Regexp</code>, use it as a pattern to match
* against <i>str</i>, and return the position where the match starts, or
* <code>nil</code> if there is no match. Otherwise, invokes
* <i>obj.=~</i>, passing <i>str</i> as an argument. The default
* <code>=~</code> in <code>Object</code> returns <code>nil</code>.
*
* "cat o' 9 tails" =~ /\d/ #=> 7
* "cat o' 9 tails" =~ 9 #=> nil
*/
static VALUE
rstr_match(VALUE self, SEL sel, VALUE other)
{
    const int type = TYPE(other);
    // Matching a string against a string is refused.
    if (type == T_STRING) {
        rb_raise(rb_eTypeError, "type mismatch: String given");
    }
    if (type == T_REGEXP) {
        return regexp_match(other, 0, self);
    }
    // Any other object gets its own =~ invoked with the receiver.
    return rb_vm_call(other, selEqTilde, 1, &self);
}
// C-level entry point for String#=~: forwards to rstr_match() with a null
// selector.
VALUE
rb_str_match(VALUE self, VALUE other)
{
    VALUE position = rstr_match(self, 0, other);
    return position;
}
/*
* call-seq:
* str.scan(pattern) => array
* str.scan(pattern) {|match, ...| block } => str
*
* Both forms iterate through <i>str</i>, matching the pattern (which may be a
* <code>Regexp</code> or a <code>String</code>). For each match, a result is
* generated and either added to the result array or passed to the block. If
* the pattern contains no groups, each individual result consists of the
* matched string, <code>$&</code>. If the pattern contains groups, each
* individual result is itself an array containing one entry per group.
*
* a = "cruel world"
* a.scan(/\w+/) #=> ["cruel", "world"]
* a.scan(/.../) #=> ["cru", "el ", "wor"]
* a.scan(/(...)/) #=> [["cru"], ["el "], ["wor"]]
* a.scan(/(..)(..)/) #=> [["cr", "ue"], ["l ", "wo"]]
*
* And the block form:
*
* a.scan(/\w+/) {|w| print "<<#{w}>> " }
* print "\n"
* a.scan(/(.)(.)/) {|x,y| print y, x }
* print "\n"
*
* <em>produces:</em>
*
* <<cruel>> <<world>>
* rceu lowlr
*/
static VALUE
rstr_scan(VALUE self, SEL sel, VALUE pat)
{
    // Record the call context and the safety flags of both operands before
    // pat is replaced by its compiled form.
    const bool block_given = rb_block_given_p();
    const bool untrusted = OBJ_UNTRUSTED(self) || OBJ_UNTRUSTED(pat);
    const bool tainted = OBJ_TAINTED(self) || OBJ_TAINTED(pat);
    pat = get_pat(pat, true);

    character_boundaries_cache_t local_cache;
    reset_character_boundaries_cache(&local_cache);

    // Results are collected in an array only when no block was given.
    VALUE ary = 0;
    if (!block_given) {
        ary = rb_ary_new();
    }

    VALUE matcher = rb_reg_matcher_new(pat, self);
    VALUE match = Qnil;
    long start = 0;
    for (;;) {
        if (rb_reg_matcher_search(pat, matcher, start, false) < 0) {
            break;
        }
        match = rb_backref_get();

        int count = 0;
        rb_match_result_t *results = rb_reg_match_results(match, &count);
        assert(count > 0);

        // Resume after the match; step one further past an empty match.
        start = results[0].end;
        if (results[0].beg == results[0].end) {
            start++;
        }

        VALUE scan_result;
        if (count != 1) {
            // With groups: the result is an array of the captured groups.
            scan_result = rb_ary_new2(count);
            int group = 1;
            while (group < count) {
                VALUE captured = rb_reg_nth_match_with_cache(group, match,
                        &local_cache);
                if (tainted) {
                    OBJ_TAINT(captured);
                }
                if (untrusted) {
                    OBJ_UNTRUST(captured);
                }
                rb_ary_push(scan_result, captured);
                group++;
            }
        }
        else {
            // No groups: the result is the whole matched text.
            scan_result = rb_reg_nth_match_with_cache(0, match, &local_cache);
            if (tainted) {
                OBJ_TAINT(scan_result);
            }
            if (untrusted) {
                OBJ_UNTRUST(scan_result);
            }
        }

        if (!block_given) {
            rb_ary_push(ary, scan_result);
            continue;
        }
        // Mark the match busy while the block runs, and put it back as the
        // current backref afterwards.
        rb_match_busy(match);
        rb_yield(scan_result);
        rb_backref_set(match);
        RETURN_IF_BROKEN();
    }

    // Leave the last match (or nil when nothing matched) as the backref.
    rb_backref_set(match);
    rb_reg_matcher_destroy(matcher);
    return block_given ? self : ary;
}
/*
* call-seq:
* str.split(pattern=$;, [limit]) => anArray
*
* Divides <i>str</i> into substrings based on a delimiter, returning an array
* of these substrings.
*
* If <i>pattern</i> is a <code>String</code>, then its contents are used as
* the delimiter when splitting <i>str</i>. If <i>pattern</i> is a single
* space, <i>str</i> is split on whitespace, with leading whitespace and runs
* of contiguous whitespace characters ignored.
*
* If <i>pattern</i> is a <code>Regexp</code>, <i>str</i> is divided where the
* pattern matches. Whenever the pattern matches a zero-length string,
* <i>str</i> is split into individual characters. If <i>pattern</i> contains
* groups, the respective matches will be returned in the array as well.
*
* If <i>pattern</i> is omitted, the value of <code>$;</code> is used. If
* <code>$;</code> is <code>nil</code> (which is the default), <i>str</i> is
* split on whitespace as if ` ' were specified.
*
* If the <i>limit</i> parameter is omitted, trailing null fields are
* suppressed. If <i>limit</i> is a positive number, at most that number of
* fields will be returned (if <i>limit</i> is <code>1</code>, the entire
* string is returned as the only entry in an array). If negative, there is no
* limit to the number of fields returned, and trailing null fields are not
* suppressed.
*
* " now's the time".split #=> ["now's", "the", "time"]
* " now's the time".split(' ') #=> ["now's", "the", "time"]
* " now's the time".split(/ /) #=> ["", "now's", "", "the", "time"]
* "1, 2.34,56, 7".split(%r{,\s*}) #=> ["1", "2.34", "56", "7"]
* "hello".split(//) #=> ["h", "e", "l", "l", "o"]
* "hello".split(//, 3) #=> ["h", "e", "llo"]
* "hi mom".split(%r{\s*}) #=> ["h", "i", "m", "o", "m"]
*
* "mellow yellow".split("ello") #=> ["m", "w y", "w"]
* "1,2,,3,4,,".split(',') #=> ["1", "2", "", "3", "4"]
* "1,2,,3,4,,".split(',', 4) #=> ["1", "2", "", "3,4,,"]
* "1,2,,3,4,,".split(',', -4) #=> ["1", "2", "", "3", "4", "", ""]
*/
// Forward declaration: rstr_split() calls str_strip() on the pieces it
// produces when splitting awk-style.
static VALUE str_strip(VALUE str, int direction);
// True when c is one of the separator characters recognized by awk-style
// splitting: space, tab, newline or vertical tab. The argument is
// parenthesized so that an expression (e.g. `x & 0xFF`) is compared as a
// whole; note that it is evaluated up to four times.
#define IS_SPLIT_AWK_SPACE(c) \
    ((c) == ' ' || (c) == '\t' || (c) == '\n' || (c) == '\v')
// Implementation of String#split. argv holds up to two optional arguments:
// the separator pattern (spat) and the limit. Returns a new array holding
// the pieces of str.
static VALUE
rstr_split(VALUE str, SEL sel, int argc, VALUE *argv)
{
character_boundaries_cache_t local_cache;
reset_character_boundaries_cache(&local_cache);
const long len = str_length_with_cache(RSTR(str), &local_cache);
int lim = 0;
VALUE spat, limit;
// A limit <= 0 means "no cap on the number of fields" (limit is reset to
// nil); a limit of 1 returns the whole string as the only field, or an
// empty array when the string is empty.
if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
lim = NUM2INT(limit);
if (lim <= 0) {
limit = Qnil;
}
else if (lim == 1) {
if (len == 0) {
return rb_ary_new2(0);
}
return rb_ary_new4(1, &str);
}
}
VALUE result = rb_ary_new();
// An empty receiver always yields an empty array.
if (len == 0) {
return result;
}
// Select the splitting strategy: awk-style (whitespace), literal string,
// or regexp.
bool awk_split = false, spat_string = false;
long spat_len = 0;
if (NIL_P(spat)) {
// No pattern given: use rb_fs when it is set, awk-style otherwise.
if (!NIL_P(rb_fs)) {
spat = rb_fs;
goto fs_set;
}
awk_split = true;
}
else {
fs_set:
if (TYPE(spat) == T_STRING) {
spat_string = true;
spat_len = rb_str_chars_len(spat);
// A pattern made of a single space also selects awk-style splitting.
if (spat_len == 1 && rb_str_get_uchar(spat, 0) == ' ') {
awk_split = true;
}
}
else {
// Anything that is not a string is turned into a regexp.
spat = get_pat(spat, true);
}
}
const int lim_orig = lim;
// beg is the index where the next piece starts.
long beg = 0;
if (awk_split) {
RB_STR_GET_UCHARS(str, chars, chars_len);
for (long i = 0; i < chars_len; i++) {
UChar c = chars[i];
if (IS_SPLIT_AWK_SPACE(c)) {
// Inside a run of separators, only the last one triggers a split.
if ((i + 1 < chars_len) && IS_SPLIT_AWK_SPACE(chars[i+1])) {
continue;
}
VALUE substr = rstr_substr_with_cache(str, beg, i - beg,
&local_cache);
// NOTE(review): direction 0 presumably strips both ends -- confirm
// against str_strip(). Pieces that end up empty are dropped.
str_strip(substr, 0);
if (rb_str_chars_len(substr) > 0) {
rb_ary_push(result, substr);
lim--;
}
beg = i + 1;
if (limit != Qnil && lim <= 1) {
break;
}
}
}
}
else if (spat_string) {
if (spat_len == 0) {
// Empty string separator: one piece per character. block_beg advances
// by one for a BMP character and by two for any other character.
__block int block_lim = lim;
__block long block_beg = 0;
str_each_uchar32(RSTR(str), ^(UChar32 c, long start_index, long char_len, bool *stop) {
VALUE substr = (VALUE)str_new_copy_of_part(RSTR(str), start_index, char_len);
rb_ary_push(result, substr);
if (U_IS_BMP(c)) {
++block_beg;
}
else {
block_beg += 2;
}
if (limit != Qnil && --block_lim <= 1) {
*stop = true;
}
});
lim = block_lim;
beg = block_beg;
}
else {
// Literal string separator: repeatedly look for its next occurrence,
// starting from beg, until it is no longer found or the limit is hit.
rb_str_t *spat_str = str_need_string(spat);
const long spat_len = str_length(spat_str);
do {
const long pos = str_index_for_string_with_cache(RSTR(str),
spat_str, beg, -1, false, &local_cache);
if (pos == -1) {
break;
}
rb_ary_push(result, rstr_substr_with_cache(str, beg, pos - beg,
&local_cache));
beg = pos + spat_len;
}
while (limit == Qnil || --lim > 1);
}
}
else {
// Regexp separator. start is where the next search begins; last_null
// records that the previous attempt was an empty match at start.
long start = beg;
bool last_null = false;
VALUE matcher = rb_reg_matcher_new(spat, str);
again:
do {
const long pos = rb_reg_matcher_search(spat, matcher, start, false);
if (pos < 0) {
break;
}
VALUE match = rb_backref_get();
int count = 0;
rb_match_result_t *results = rb_reg_match_results(match, &count);
assert(count > 0);
if (start == pos && results[0].beg == results[0].end) {
// Empty match right at the search position. The first time, retry
// one position further; if that retry is again an empty match at
// its search position, emit a one-character piece (or an empty
// one at the end of the string).
if (last_null) {
VALUE substr;
if (beg + 1 <= len) {
substr = rstr_substr_with_cache(str, beg, 1,
&local_cache);
}
else {
substr = str_new_empty(str);
}
rb_ary_push(result, substr);
beg = start;
}
else {
start++;
last_null = true;
goto again;
}
}
else {
// Regular match: emit what precedes it and resume after it.
VALUE substr = rstr_substr_with_cache(str, beg, pos - beg,
&local_cache);
rb_ary_push(result, substr);
beg = start = results[0].end;
}
last_null = false;
// Groups whose beg and end are both set are appended to the result as
// well (as an empty string when the group matched nothing).
for (int i = 1; i < count; i++) {
if (results[i].beg == -1 || results[i].end == -1) {
continue;
}
VALUE substr;
if (results[i].beg == results[i].end) {
substr = str_new_empty(str);
}
else {
substr = rstr_substr_with_cache(str, results[i].beg,
results[i].end - results[i].beg, &local_cache);
}
rb_ary_push(result, substr);
}
}
while (limit == Qnil || --lim > 1);
rb_reg_matcher_destroy(matcher);
}
// Append what is left after the last separator: always when a positive
// limit or a negative limit was given, otherwise only when something
// remains past beg.
if (len > 0 && (!NIL_P(limit) || len > beg || lim_orig < 0)) {
VALUE tmp;
if (beg >= len) {
tmp = str_new_empty(str);
}
else {
tmp = rb_str_substr(str, beg, len - beg);
}
rb_ary_push(result, tmp);
}
// Without a limit (or with a limit of 0), trailing empty pieces are
// removed.
if (NIL_P(limit) && lim_orig == 0) {
while (true) {
const long n = RARRAY_LEN(result);
if (n > 0 && rb_str_chars_len(RARRAY_AT(result, n - 1)) == 0) {
rb_ary_pop(result);
}
else {
break;
}
}
}
// Every piece of a tainted string is tainted too.
if (OBJ_TAINTED(str)) {
for (int i = 0, count = RARRAY_LEN(result); i < count; i++) {
OBJ_TAINT(RARRAY_AT(result, i));
}
}
return result;
}
/*
* call-seq:
* str.to_i(base=10) => integer
*
* Returns the result of interpreting leading characters in <i>str</i> as an
* integer base <i>base</i> (between 2 and 36). Extraneous characters past the
* end of a valid number are ignored. If there is not a valid number at the
* start of <i>str</i>, <code>0</code> is returned. This method never raises an
* exception when <i>base</i> is valid; a negative <i>base</i> raises an
* <code>ArgumentError</code>.
*
* "12345".to_i #=> 12345
* "99 red balloons".to_i #=> 99
* "0a".to_i #=> 0
* "0a".to_i(16) #=> 10
* "hello".to_i #=> 0
* "1100101".to_i(2) #=> 101
* "1100101".to_i(8) #=> 294977
* "1100101".to_i(10) #=> 1100101
* "1100101".to_i(16) #=> 17826049
*/
static VALUE
rstr_to_i(VALUE str, SEL sel, int argc, VALUE *argv)
{
    // Without an argument the string is read as a base 10 number.
    if (argc <= 0) {
        return rb_str_to_inum(str, 10, Qfalse);
    }

    VALUE radix_arg;
    rb_scan_args(argc, argv, "01", &radix_arg);
    const int radix = NUM2INT(radix_arg);
    // A negative radix is rejected here.
    if (radix < 0) {
        rb_raise(rb_eArgError, "invalid radix %d", radix);
    }
    return rb_str_to_inum(str, radix, Qfalse);
}
/*
* call-seq:
* str.hex => integer
*
* Treats leading characters from <i>str</i> as a string of hexadecimal digits
* (with an optional sign and an optional <code>0x</code>) and returns the
* corresponding number. Zero is returned on error.
*
* "0x0a".hex #=> 10
* "-1234".hex #=> -4660
* "0".hex #=> 0
* "wombat".hex #=> 0
*/
static VALUE
rstr_hex(VALUE str, SEL sel)
{
    // Non-strict (Qfalse) base 16 conversion.
    const int radix = 16;
    return rb_str_to_inum(str, radix, Qfalse);
}
/*
* call-seq:
* str.oct => integer
*
* Treats leading characters of <i>str</i> as a string of octal digits (with an
* optional sign) and returns the corresponding number. Returns 0 if the
* conversion fails.
*
* "123".oct #=> 83
* "-377".oct #=> -255
* "bad".oct #=> 0
* "0377bad".oct #=> 255
*/
static VALUE
rstr_oct(VALUE str, SEL sel)
{
    // The radix is deliberately passed as -8 to rb_str_to_inum().
    // NOTE(review): presumably a negative radix makes base 8 only a default
    // that a prefix in the string may override -- confirm in rb_str_to_inum.
    const int radix = -8;
    return rb_str_to_inum(str, radix, Qfalse);
}
/*
* call-seq:
* str.ord => integer
*
* Return the <code>Integer</code> ordinal of a one-character string.
*
* "a".ord #=> 97
*/
static VALUE
rstr_ord(VALUE str, SEL sel)
{
    // Raises ArgumentError on an empty string.
    if (RSTR(str)->length_in_bytes == 0) {
        rb_raise(rb_eArgError, "empty string");
    }
    UChar c = rb_str_get_uchar(str, 0);
    // The sentinel value signals that the first character could not be read.
    if (c == (UChar)U_SENTINEL) {
        str_invalid_byte_sequence(RSTR(str));
    }
    // A character outside of the BMP is stored as a lead/trail surrogate
    // pair: combine both units so that the ordinal of the whole character is
    // returned rather than the value of its lead surrogate alone.
    if (U16_IS_LEAD(c) && rb_str_chars_len(str) > 1) {
        const UChar trail = rb_str_get_uchar(str, 1);
        if (U16_IS_TRAIL(trail)) {
            const UChar32 code_point = U16_GET_SUPPLEMENTARY(c, trail);
            return INT2NUM(code_point);
        }
    }
    return INT2NUM(c);
}
/*
* call-seq:
* string.chr -> string
*
* Returns a one-character string at the beginning of the string.
*
* a = "abcde"
* a.chr #=> "a"
*/
static VALUE
rstr_chr(VALUE str, SEL sel)
{
    // The result is the substring of length 1 starting at index 0.
    const long first_index = 0;
    const long one_char = 1;
    return rstr_substr(str, first_index, one_char);
}
/*
* call-seq:
* str.to_f => float
*
* Returns the result of interpreting leading characters in <i>str</i> as a
* floating point number. Extraneous characters past the end of a valid number
* are ignored. If there is not a valid number at the start of <i>str</i>,
* <code>0.0</code> is returned. This method never raises an exception.
*
* "123.45e1".to_f #=> 1234.5
* "45.67 degrees".to_f #=> 45.67
* "thx1138".to_f #=> 0.0
*/
static VALUE
rstr_to_f(VALUE str, SEL sel)
{
    // The second argument of rb_str_to_dbl() is passed as 0.
    const double value = rb_str_to_dbl(str, 0);
    return DOUBLE2NUM(value);
}
/*
* call-seq:
* str.chomp!(separator=$/) => str or nil
*
* Modifies <i>str</i> in place as described for <code>String#chomp</code>,
* returning <i>str</i>, or <code>nil</code> if no modifications were made.
*/
static VALUE
rstr_chomp_bang(VALUE str, SEL sel, int argc, VALUE *argv)
{
    // The separator defaults to rb_rs when no argument is given.
    VALUE rs;
    if (rb_scan_args(argc, argv, "01", &rs) == 0) {
        rs = rb_rs;
    }
    rstr_modify(str);
    // A nil separator means there is nothing to remove.
    if (rs == Qnil) {
        return Qnil;
    }
    StringValue(rs);

    const long len = rb_str_chars_len(str);
    if (len == 0) {
        return Qnil;
    }
    const long rslen = rb_str_chars_len(rs);

    // Number of trailing characters to delete.
    long to_del = 0;
    if (rs == rb_default_rs
            || (rslen == 1 && rb_str_get_uchar(rs, 0) == '\n')) {
        // Default separator or "\n": remove one trailing "\n", "\r\n" or
        // "\r".
        UChar c = str_get_uchar(RSTR(str), len - 1);
        if (c == '\n') {
            to_del++;
            c = len > 1 ? str_get_uchar(RSTR(str), len - 2) : 0;
        }
        if (c == '\r' && (rslen > 0 || to_del != 0)) {
            to_del++;
        }
    }
    else if (rslen == 0) {
        // Empty separator: remove every trailing "\n" and "\r\n".
        // The index is a long, like len, so that it cannot be truncated.
        for (long i = len - 1; i >= 0; i--) {
            UChar c = str_get_uchar(RSTR(str), i);
            if (c != '\n') {
                break;
            }
            to_del++;
            if (i > 0 && str_get_uchar(RSTR(str), i - 1) == '\r') {
                to_del++;
                i--;
            }
        }
    }
    else if (rslen <= len) {
        // Any other separator: remove it when the string ends with it.
        if (str_index_for_string(RSTR(str), str_need_string(rs),
                    len - rslen, -1, false) >= 0) {
            to_del += rslen;
        }
    }

    // nil tells the caller that the string was left unchanged.
    if (to_del == 0) {
        return Qnil;
    }
    str_delete(RSTR(str), len - to_del, to_del);
    return str;
}
/*
* call-seq:
* str.chomp(separator=$/) => new_str
*
* Returns a new <code>String</code> with the given record separator removed
* from the end of <i>str</i> (if present). If <code>$/</code> has not been
* changed from the default Ruby record separator, then <code>chomp</code> also
* removes carriage return characters (that is it will remove <code>\n</code>,
* <code>\r</code>, and <code>\r\n</code>).
*
* "hello".chomp #=> "hello"
* "hello\n".chomp #=> "hello"
* "hello\r\n".chomp #=> "hello"
* "hello\n\r".chomp #=> "hello\n"
* "hello\r".chomp #=> "hello"
* "hello \n there".chomp #=> "hello \n there"
* "hello".chomp("llo") #=> "he"
*/
static VALUE
rstr_chomp(VALUE str, SEL sel, int argc, VALUE *argv)
{
    // Run the in-place variant on a new string built from the receiver. Its
    // return value (nil when nothing changed) is deliberately ignored.
    VALUE copy = rb_str_new3(str);
    rstr_chomp_bang(copy, 0, argc, argv);
    return copy;
}
/*
* call-seq:
* str.chop! => str or nil
*
* Processes <i>str</i> as for <code>String#chop</code>, returning <i>str</i>,
* or <code>nil</code> if <i>str</i> is the empty string. See also
* <code>String#chomp!</code>.
*/
static VALUE
rstr_chop_bang(VALUE str, SEL sel)
{
    rstr_modify(str);
    const long len = str_length(RSTR(str));
    if (len == 0) {
        return Qnil;
    }

    // One unit is removed, or two when the string ends with "\r\n" or with a
    // lead/trail surrogate pair (a character outside of the BMP).
    long to_del = 1;
    if (len >= 2) {
        const UChar last = rb_str_get_uchar(str, len - 1);
        const bool last_is_lf = (last == '\n');
        const bool last_is_trail = U16_IS_TRAIL(last);
        if (last_is_lf || last_is_trail) {
            const UChar before = rb_str_get_uchar(str, len - 2);
            const bool crlf = (before == '\r') && last_is_lf;
            const bool surrogate_pair = U16_IS_LEAD(before) && last_is_trail;
            if (crlf || surrogate_pair) {
                to_del = 2;
            }
        }
    }
    str_delete(RSTR(str), len - to_del, to_del);
    return str;
}
/*
* call-seq:
* str.chop => new_str
*
* Returns a new <code>String</code> with the last character removed. If the
* string ends with <code>\r\n</code>, both characters are removed. Applying
* <code>chop</code> to an empty string returns an empty
* string. <code>String#chomp</code> is often a safer alternative, as it leaves
* the string unchanged if it doesn't end in a record separator.
*
* "string\r\n".chop #=> "string"
* "string\n\r".chop #=> "string\n"
* "string\n".chop #=> "string"
* "string".chop #=> "strin"
* "x".chop.chop #=> ""
*/
static VALUE
rstr_chop(VALUE str, SEL sel)
{
    // Run the in-place variant on a new string built from the receiver. Its
    // return value (nil for an empty string) is deliberately ignored.
    VALUE copy = rb_str_new3(str);
    rstr_chop_bang(copy, 0);
    return copy;
}
/*
* call-seq:
* str.sub!(pattern, replacement) => str or nil
* str.sub!(pattern) {|match| block } => str or nil
*
* Performs the substitutions of <code>String#sub</code> in place,
* returning <i>str</i>, or <code>nil</code> if no substitutions were
* performed.
*/
// Expands the backslash sequences of a replacement string.
//
// str           - the replacement template.
// src           - the string the match was performed on.
// regexp        - the pattern (not used by this function).
// results       - match results; results[0] is the whole match, results[n]
//                 the n-th group (beg == -1 when the group did not match).
// results_count - number of entries in results.
// cache_for_src - optional (may be NULL) character boundaries cache for src.
//
// Recognized sequences: \1..\9 (group), \0 and \& (whole match), \` (text
// before the match), \' (text after the match), \+ (last group that matched)
// and \\ (a backslash). Any other escaped character is copied along with its
// backslash. Returns str itself when it contains no backslash to process,
// a new string otherwise.
static VALUE
rb_reg_regsub(VALUE str, VALUE src, VALUE regexp, rb_match_result_t *results,
        int results_count, character_boundaries_cache_t *cache_for_src)
{
    VALUE val = 0;

    RB_STR_GET_UCHARS(str, str_chars, str_chars_len);

    // Index of the first template character not yet copied to val.
    long pos = 0;

    // if we already have a cache, we will make a local copy just before
    // using it to be sure not to have to start from scratch later
    // (as for instance with "\\2\\1" would make us do).
    // All the extractions from src below go through that local copy, so the
    // cache of the caller is left where the caller put it.
    character_boundaries_cache_t local_cache_for_src;
    reset_character_boundaries_cache(&local_cache_for_src);

    for (long i = 0; i < str_chars_len; i++) {
        UChar c = str_chars[i];
        if (c != '\\') {
            continue;
        }
        // A backslash at the very end of the template is taken literally.
        if (i + 1 == str_chars_len) {
            break;
        }

        // The result string is only created once there is something to
        // expand.
        if (val == 0) {
            val = rb_unicode_str_new(NULL, 0);
        }
        // Copy the literal text that precedes the backslash.
        str_concat_uchars(RSTR(val), &str_chars[pos], i - pos);

        i++;
        pos = i + 1;

        // Index of the group to insert, or -1 when there is none.
        int no = -1;
        c = str_chars[i];
        switch (c) {
            case '1': case '2': case '3':
            case '4': case '5': case '6':
            case '7': case '8': case '9':
                no = c - '0';
                break;

            case '0':
            case '&':
                no = 0;
                break;

            case '`':
                // Everything that precedes the match.
                if (cache_for_src != NULL) {
                    local_cache_for_src = *cache_for_src;
                }
                str_concat_string_part(RSTR(val), RSTR(src),
                        0, results[0].beg, &local_cache_for_src);
                break;

            case '\'':
                // Everything that follows the match.
                {
                    long src_chars_len;
                    if (cache_for_src == NULL) {
                        src_chars_len = str_length_with_cache(RSTR(src),
                                &local_cache_for_src);
                    }
                    else {
                        src_chars_len = str_length_with_cache(RSTR(src),
                                cache_for_src);
                        local_cache_for_src = *cache_for_src;
                    }
                    str_concat_string_part(RSTR(val), RSTR(src),
                            results[0].end, src_chars_len - results[0].end,
                            &local_cache_for_src);
                }
                break;

            case '+':
                // The last group that took part in the match; nothing is
                // inserted when no group did.
                no = results_count - 1;
                while (results[no].beg == -1 && no > 0) {
                    no--;
                }
                if (no == 0) {
                    no = -1;
                }
                break;

            default:
                // Unknown sequence: keep the backslash and the character.
                str_append_uchar32(RSTR(val), '\\');
                // fall through
            case '\\':
                str_append_uchar32(RSTR(val), c);
                break;
        }

        if (no >= 0) {
            // Groups that do not exist or did not match expand to nothing.
            if (no >= results_count) {
                continue;
            }
            if (results[no].beg == -1) {
                continue;
            }
            if (cache_for_src != NULL) {
                local_cache_for_src = *cache_for_src;
            }
            str_concat_string_part(RSTR(val), RSTR(src),
                    results[no].beg, results[no].end - results[no].beg,
                    &local_cache_for_src);
        }
    }

    // Copy what remains of the template after the last sequence.
    if (val != 0 && pos < str_chars_len) {
        str_concat_uchars(RSTR(val), &str_chars[pos], str_chars_len - pos);
    }

    if (val == 0) {
        return str;
    }
    else {
        return val;
    }
}
static VALUE
rstr_sub_bang(VALUE str, SEL sel, int argc, VALUE *argv)
{
    VALUE repl = Qnil;
    VALUE hash = Qnil;
    bool block_given = false;
    bool tainted = false;
    bool untrusted = false;

    // Accepted forms: (pattern, replacement) or (pattern) with a block.
    if (argc == 2) {
        repl = argv[1];
        // The replacement is either a hash (or convertible to one) or
        // something that must be convertible to a string.
        hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
        if (NIL_P(hash)) {
            StringValue(repl);
        }
        tainted = OBJ_TAINTED(repl) ? true : false;
        untrusted = OBJ_UNTRUSTED(repl) ? true : false;
    }
    else if (argc == 1 && rb_block_given_p()) {
        block_given = true;
    }
    else {
        rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
    }

    if (!block_given) {
        // RubySpec compliance...
        rstr_modify(str);
    }

    VALUE pat = get_pat(argv[0], true);
    str_modifiable(str);

    // No match: nothing was substituted.
    if (rb_reg_search(pat, str, 0, false) < 0) {
        return Qnil;
    }

    VALUE match = rb_backref_get();
    int count = 0;
    rb_match_result_t *results = rb_reg_match_results(match, &count);
    assert(count > 0);

    if (block_given) {
        // The replacement is what the block returns for the matched text.
        // The receiver must not be altered by the block, which is detected
        // by comparing its hash before and after the call.
        rb_match_busy(match);
        const unsigned long hash_before = rb_str_hash(str);
        repl = rb_obj_as_string(rb_yield(rb_reg_nth_match(0, match)));
        if (rb_str_hash(str) != hash_before) {
            rb_raise(rb_eRuntimeError, "string modified");
        }
        rstr_frozen_check(str);
        rb_backref_set(match);
        RETURN_IF_BROKEN();
    }
    else if (!NIL_P(hash)) {
        // The replacement is looked up in the hash, keyed by the matched
        // text.
        repl = rb_hash_aref(hash, rstr_substr(str, results[0].beg,
                    results[0].end - results[0].beg));
        repl = rb_obj_as_string(repl);
        rstr_frozen_check(str);
    }
    else {
        // Plain replacement string: expand its backslash sequences.
        repl = rb_reg_regsub(repl, str, pat, results, count, NULL);
    }

    // Replace the matched range of the receiver with the replacement.
    rstr_modify(str);
    str_splice(RSTR(str), results[0].beg, results[0].end - results[0].beg,
            str_need_string(repl));

    // The receiver inherits the taint/trust state of the replacement.
    if (tainted || OBJ_TAINTED(repl)) {
        OBJ_TAINT(str);
    }
    if (untrusted || OBJ_UNTRUSTED(repl)) {
        OBJ_UNTRUST(str);
    }
    return str;
}
/*
* call-seq:
* str.sub(pattern, replacement) => new_str
* str.sub(pattern) {|match| block } => new_str
*
* Returns a copy of <i>str</i> with the <em>first</em> occurrence of
* <i>pattern</i> replaced with either <i>replacement</i> or the value of the
* block. The <i>pattern</i> will typically be a <code>Regexp</code>; if it is
* a <code>String</code> then no regular expression metacharacters will be
* interpreted (that is <code>/\d/</code> will match a digit, but
* <code>'\d'</code> will match a backslash followed by a 'd').
*
* If the method call specifies <i>replacement</i>, special variables such as
* <code>$&</code> will not be useful, as substitution into the string occurs
* before the pattern match starts. However, the sequences <code>\1</code>,
* <code>\2</code>, <code>\k<group_name></code>, etc., may be used.
*
* In the block form, the current match string is passed in as a parameter, and
* variables such as <code>$1</code>, <code>$2</code>, <code>$`</code>,
* <code>$&</code>, and <code>$'</code> will be set appropriately. The value
* returned by the block will be substituted for the match on each call.
*
* The result inherits any tainting in the original string or any supplied
* replacement string.
*
* "hello".sub(/[aeiou]/, '*') #=> "h*llo"
* "hello".sub(/([aeiou])/, '<\1>') #=> "h<e>llo"
* "hello".sub(/./) {|s| s[0].ord.to_s + ' ' } #=> "104 ello"
* "hello".sub(/(?<foo>[aeiou])/, '*\k<foo>*') #=> "h*e*llo"
*/
static VALUE
rstr_sub(VALUE str, SEL sel, int argc, VALUE *argv)
{
    // Run the in-place variant on a new string built from the receiver. Its
    // return value (nil when nothing was substituted) is deliberately
    // ignored.
    VALUE copy = rb_str_new3(str);
    rstr_sub_bang(copy, 0, argc, argv);
    return copy;
}
/*
* call-seq:
* str.gsub!(pattern, replacement) => str or nil
* str.gsub!(pattern) {|match| block } => str or nil
*
* Performs the substitutions of <code>String#gsub</code> in place, returning
* <i>str</i>, or <code>nil</code> if no substitutions were performed.
*/
static VALUE
str_gsub(SEL sel, int argc, VALUE *argv, VALUE str, bool bang)
{
bool block_given = false;
bool tainted = false;
bool untrusted = false;
VALUE hash = Qnil, repl = Qnil;
switch (argc) {
case 1:
RETURN_ENUMERATOR(str, argc, argv);
block_given = true;
break;
case 2:
repl = argv[1];
hash = rb_check_convert_type(argv[1], T_HASH, "Hash", "to_hash");
if (NIL_P(hash)) {
StringValue(repl);
}
if (OBJ_TAINTED(repl)) {
tainted = true;
}
if (OBJ_UNTRUSTED(repl)) {
untrusted = true;
}
break;
default:
rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)",
argc);
}
VALUE pat = get_pat(argv[0], 1);
VALUE dest = rb_str_new5(str, NULL, 0);
long offset = 0, last = 0;
bool changed = false;
const long len = str_length(RSTR(str));
VALUE match = Qnil;
if (bang) {
// RubySpec compliance...
rstr_modify(str);
}
VALUE matcher = rb_reg_matcher_new(pat, str);
character_boundaries_cache_t local_cache_for_str;
reset_character_boundaries_cache(&local_cache_for_str);
while (true) {
const long pos = rb_reg_matcher_search(pat, matcher, offset, false);
if (pos < 0) {
if (!changed) {
rb_reg_matcher_destroy(matcher);
return bang ? Qnil : rstr_dup(str, 0);
}
if (last < len) {
VALUE substr = rstr_substr_with_cache(str, last, len - last,
&local_cache_for_str);
if (substr != Qnil) {
str_concat_string(RSTR(dest), RSTR(substr));
}
}
break;
}
if (pos - last > 0) {
// this concatenation must be done before calling the block
// or doing the replacement or else the cache can't be used
str_concat_string_part(RSTR(dest), RSTR(str), last, pos - last,
&local_cache_for_str);
}