Permalink
Switch branches/tags
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
2253 lines (2014 sloc) 91.8 KB
#include "platform/memmem.h"
#include "moar.h"
#define MVM_DEBUG_STRANDS 0
/* Max value possible for MVMuint32 MVMStringBody.num_graphs */
#define MAX_GRAPHEMES 0xFFFFFFFFLL
#if MVM_DEBUG_STRANDS
static void check_strand_sanity(MVMThreadContext *tc, MVMString *s) {
MVMGraphemeIter gi;
MVMuint32 len;
MVM_string_gi_init(tc, &gi, s);
len = 0;
while (MVM_string_gi_has_more(tc, &gi)) {
MVM_string_gi_get_grapheme(tc, &gi);
len++;
}
if (len != MVM_string_graphs(tc, s))
MVM_exception_throw_adhoc(tc,
"Strand sanity check failed (strand length %d != num_graphs %d)",
len, MVM_string_graphs(tc, s));
}
#define STRAND_CHECK(tc, s) check_strand_sanity(tc, s);
#else
#define STRAND_CHECK(tc, s)
#endif
MVM_STATIC_INLINE MVMint64 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle_fc, MVMint64 H_start, MVMint64 H_graphs, MVMint64 n_fc_graphs, int ignoremark, int ignorecase);
/* Allocates strand storage. */
static MVMStringStrand * allocate_strands(MVMThreadContext *tc, MVMuint16 num_strands) {
return MVM_malloc(num_strands * sizeof(MVMStringStrand));
}
/* Copies strands from one strand string to another. */
static void copy_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset,
MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) {
assert(from->body.storage_type == MVM_STRING_STRAND);
assert(to->body.storage_type == MVM_STRING_STRAND);
memcpy(
to->body.storage.strands + to_offset,
from->body.storage.strands + from_offset,
num_strands * sizeof(MVMStringStrand));
}
/* Move strands inside the same strand string. */
static void move_strands(MVMThreadContext *tc, const MVMString *from, MVMuint16 from_offset,
MVMString *to, MVMuint16 to_offset, MVMuint16 num_strands) {
assert(from->body.storage_type == MVM_STRING_STRAND);
assert(to->body.storage_type == MVM_STRING_STRAND);
memmove(
to->body.storage.strands + to_offset,
from->body.storage.strands + from_offset,
num_strands * sizeof(MVMStringStrand));
}
MVM_STATIC_INLINE int can_fit_into_8bit (MVMGrapheme32 g) {
return -128 <= g && g <= 127;
}
MVM_STATIC_INLINE int can_fit_into_ascii (MVMGrapheme32 g) {
return 0 <= g && g <= 127;
}
/* If a string is currently using 32bit storage, turn it into using
* 8 bit storage. Doesn't do any checks at all. */
static void turn_32bit_into_8bit_unchecked(MVMThreadContext *tc, MVMString *str) {
MVMGrapheme32 *old_buf = str->body.storage.blob_32;
MVMStringIndex i;
str->body.storage_type = MVM_STRING_GRAPHEME_8;
str->body.storage.blob_8 = MVM_malloc(str->body.num_graphs * sizeof(MVMGrapheme8));
for (i = 0; i < str->body.num_graphs; i++) {
str->body.storage.blob_8[i] = old_buf[i];
}
MVM_free(old_buf);
}
/* Accepts an allocated string that should have body.num_graphs set but the blob
* unallocated. This function will allocate the space for the blob and iterate
* the supplied grapheme iterator for the length of body.num_graphs */
static void iterate_gi_into_string(MVMThreadContext *tc, MVMGraphemeIter *gi, MVMString *result) {
MVMuint64 i;
result->body.storage_type = MVM_STRING_GRAPHEME_8;
result->body.storage.blob_8 = MVM_malloc(result->body.num_graphs * sizeof(MVMGrapheme8));
for (i = 0; i < result->body.num_graphs; i++) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, gi);
result->body.storage.blob_8[i] = g;
if (!can_fit_into_8bit(g)) {
/* If we get here, we saw a codepoint lower than -127 or higher than 127
* so turn it into a 32 bit string instead */
/* Store the old string pointer and previous value of i */
MVMGrapheme8 *old_ref = result->body.storage.blob_8;
MVMuint64 prev_i = i;
/* Set up the string as 32bit now and allocate space for it */
result->body.storage_type = MVM_STRING_GRAPHEME_32;
result->body.storage.blob_32 = MVM_malloc(result->body.num_graphs * sizeof(MVMGrapheme32));
/* Copy the data so far copied from the 8bit blob since it's faster than
* setting up the grapheme iterator again */
for (i = 0; i < prev_i; i++) {
result->body.storage.blob_32[i] = old_ref[i];
}
MVM_free(old_ref);
/* Store the grapheme which interupted the sequence. After that we can
* continue from where we left off using the grapheme iterator */
result->body.storage.blob_32[prev_i] = g;
for (i = prev_i + 1; i < result->body.num_graphs; i++) {
result->body.storage.blob_32[i] = MVM_string_gi_get_grapheme(tc, gi);
}
}
}
}
/* Collapses a bunch of strands into a single blob string. */
static MVMString * collapse_strands(MVMThreadContext *tc, MVMString *orig) {
MVMString *result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
MVMGraphemeIter gi;
MVMROOT(tc, orig, {
MVM_string_gi_init(tc, &gi, orig);
result->body.num_graphs = MVM_string_graphs(tc, orig);
iterate_gi_into_string(tc, &gi, result);
});
return result;
}
/* Takes a string that is no longer in NFG form after some concatenation-style
* operation, and returns a new string that is in NFG. Note that we could do a
* much, much, smarter thing in the future that doesn't involve all of this
* copying and allocation and re-doing the whole string, but cases like this
* should be fairly rare anyway, so go with simplicity for now. */
static MVMString * re_nfg(MVMThreadContext *tc, MVMString *in) {
MVMNormalizer norm;
MVMCodepointIter ci;
MVMint32 ready;
MVMString *out;
MVMuint32 bufsize = in->body.num_graphs;
/* Create the output buffer. We used to believe it can't ever be bigger
* than the initial estimate, but utf8-c8 showed us otherwise. */
MVMGrapheme32 *out_buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
MVMint64 out_pos = 0;
/* Iterate codepoints and normalizer. */
MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
MVM_string_ci_init(tc, &ci, in, 0);
while (MVM_string_ci_has_more(tc, &ci)) {
MVMGrapheme32 g;
ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, MVM_string_ci_get_codepoint(tc, &ci), &g);
if (ready) {
if (out_pos + ready > bufsize) {
/* Doubling up the buffer size seems excessive, so just
* add a generous amount of storage */
bufsize += ready + 32;
out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32));
}
out_buffer[out_pos++] = g;
while (--ready > 0) {
out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
}
}
}
MVM_unicode_normalizer_eof(tc, &norm);
ready = MVM_unicode_normalizer_available(tc, &norm);
if (out_pos + ready > bufsize) {
bufsize += ready + 1;
out_buffer = MVM_realloc(out_buffer, bufsize * sizeof(MVMGrapheme32));
}
while (ready--) {
out_buffer[out_pos++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
}
MVM_unicode_normalizer_cleanup(tc, &norm);
/* Build result string. */
out = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
out->body.storage.blob_32 = out_buffer;
out->body.storage_type = MVM_STRING_GRAPHEME_32;
out->body.num_graphs = out_pos;
return out;
}
/* Returns nonzero if two substrings are equal, doesn't check bounds */
MVMint64 MVM_string_substrings_equal_nocheck(MVMThreadContext *tc, MVMString *a,
MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) {
MVMGraphemeIter gia;
MVMGraphemeIter gib;
MVMint64 i;
/* Fast paths when storage types are identical. */
switch (a->body.storage_type) {
case MVM_STRING_GRAPHEME_32:
if (b->body.storage_type == MVM_STRING_GRAPHEME_32)
return 0 == memcmp(
a->body.storage.blob_32 + starta,
b->body.storage.blob_32 + startb,
length * sizeof(MVMGrapheme32));
break;
case MVM_STRING_GRAPHEME_ASCII:
case MVM_STRING_GRAPHEME_8:
if (b->body.storage_type == MVM_STRING_GRAPHEME_ASCII ||
b->body.storage_type == MVM_STRING_GRAPHEME_8)
return 0 == memcmp(
a->body.storage.blob_8 + starta,
b->body.storage.blob_8 + startb,
length);
break;
}
/* Normal path, for the rest of the time. */
MVM_string_gi_init(tc, &gia, a);
MVM_string_gi_init(tc, &gib, b);
MVM_string_gi_move_to(tc, &gia, starta);
MVM_string_gi_move_to(tc, &gib, startb);
for (i = 0; i < length; i++)
if (MVM_string_gi_get_grapheme(tc, &gia) != MVM_string_gi_get_grapheme(tc, &gib))
return 0;
return 1;
}
/* Returns the codepoint without doing checks, for internal VM use only. */
MVMGrapheme32 MVM_string_get_grapheme_at_nocheck(MVMThreadContext *tc, MVMString *a, MVMint64 index) {
switch (a->body.storage_type) {
case MVM_STRING_GRAPHEME_32:
return a->body.storage.blob_32[index];
case MVM_STRING_GRAPHEME_ASCII:
return a->body.storage.blob_ascii[index];
case MVM_STRING_GRAPHEME_8:
return a->body.storage.blob_8[index];
case MVM_STRING_STRAND: {
MVMGraphemeIter gi;
MVM_string_gi_init(tc, &gi, a);
MVM_string_gi_move_to(tc, &gi, index);
return MVM_string_gi_get_grapheme(tc, &gi);
}
default:
MVM_exception_throw_adhoc(tc, "String corruption detected: bad storage type");
}
}
/* Returns the location of one string in another or -1 */
MVMint64 MVM_string_index(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
size_t index = (size_t)start;
MVMStringIndex H_graphs = MVM_string_graphs(tc, Haystack), n_graphs = MVM_string_graphs(tc, needle);
MVM_string_check_arg(tc, Haystack, "index search target");
MVM_string_check_arg(tc, needle, "index search term");
if (!n_graphs)
return start <= H_graphs ? start : -1; /* the empty string is in any other string */
if (!H_graphs)
return -1;
if (start < 0 || start >= H_graphs)
return -1;
if (n_graphs > H_graphs || n_graphs < 1)
return -1;
/* Fast paths when storage types are identical. Uses memmem function, which
* uses Knuth-Morris-Pratt algorithm on Linux and on others
* Crochemore+Perrin two-way string matching */
switch (Haystack->body.storage_type) {
case MVM_STRING_GRAPHEME_32:
if (needle->body.storage_type == MVM_STRING_GRAPHEME_32) {
void *start_ptr = Haystack->body.storage.blob_32 + start;
void *mm_return_32;
void *end_ptr = (char*)start_ptr + sizeof(MVMGrapheme32) * (H_graphs - start);
do {
/* Keep as void* to not lose precision */
mm_return_32 = MVM_memmem(
start_ptr, /* start position */
(char*)end_ptr - (char*)start_ptr, /* length of Haystack from start position to end */
needle->body.storage.blob_32, /* needle start */
n_graphs * sizeof(MVMGrapheme32) /* needle length */
);
if (mm_return_32 == NULL)
return -1;
} /* If we aren't on a 32 bit boundary then continue from where we left off (unlikely but possible) */
while ( ( (char*)mm_return_32 - (char*)Haystack->body.storage.blob_32) % sizeof(MVMGrapheme32)
&& ( start_ptr = mm_return_32 ) /* Set the new start pointer at where we left off */
&& ( end_ptr > start_ptr ) /* Check we aren't past the end of the string just in case */
);
return (MVMGrapheme32*)mm_return_32 - Haystack->body.storage.blob_32;
}
break;
case MVM_STRING_GRAPHEME_8:
if (needle->body.storage_type == MVM_STRING_GRAPHEME_8) {
void *mm_return_8 = MVM_memmem(
Haystack->body.storage.blob_8 + start, /* start position */
(H_graphs - start) * sizeof(MVMGrapheme8), /* length of Haystack from start position to end */
needle->body.storage.blob_8, /* needle start */
n_graphs * sizeof(MVMGrapheme8) /* needle length */
);
if (mm_return_8 == NULL)
return -1;
else
return (MVMGrapheme8*)mm_return_8 - Haystack->body.storage.blob_8;
}
break;
}
/* brute force for now. horrible, yes. halp. */
while (index <= H_graphs - n_graphs) {
if (string_equal_at_ignore_case_INTERNAL_loop(tc, Haystack, needle, index, H_graphs, n_graphs, 0, 0) != -1) {
return (MVMint64)index;
}
index++;
}
return -1;
}
/* Returns the location of one string in another or -1 */
MVMint64 MVM_string_index_from_end(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
MVMint64 result = -1;
size_t index;
MVMStringIndex H_graphs, n_graphs;
MVM_string_check_arg(tc, Haystack, "rindex search target");
MVM_string_check_arg(tc, needle, "rindex search term");
H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
n_graphs = MVM_string_graphs_nocheck(tc, needle);
if (!n_graphs) {
if (start >= 0)
return start <= H_graphs ? start : -1; /* the empty string is in any other string */
else
return H_graphs; /* no start, so return end */
}
if (!H_graphs)
return -1;
if (n_graphs > H_graphs || n_graphs < 1)
return -1;
if (start == -1)
start = H_graphs - n_graphs;
if (start < 0 || start >= H_graphs)
/* maybe return -1 instead? */
MVM_exception_throw_adhoc(tc, "index start offset out of range");
index = start;
if (index + n_graphs > H_graphs) {
index = H_graphs - n_graphs;
}
/* brute force for now. horrible, yes. halp. */
do {
if (MVM_string_substrings_equal_nocheck(tc, needle, 0, n_graphs, Haystack, index)) {
result = (MVMint64)index;
break;
}
} while (index-- > 0);
return result;
}
/* Returns a substring of the given string */
MVMString * MVM_string_substring(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMint64 length) {
MVMString *result;
MVMint64 start_pos, end_pos;
MVMint64 agraphs;
MVM_string_check_arg(tc, a, "substring");
/* convert to signed to avoid implicit arithmetic conversions */
agraphs = (MVMint64)MVM_string_graphs_nocheck(tc, a);
/* -1 signifies go to the end of the string; anything less is a bug */
if (length < -1)
MVM_exception_throw_adhoc(tc, "Substring length (%"PRId64") cannot be negative", length);
/* negative offsets count from the end */
start_pos = offset < 0 ? offset + agraphs : offset;
end_pos = length == -1 ? agraphs : start_pos + length;
/* return an empty string if start_pos is out of bounds but positive */
if (start_pos > agraphs) {
start_pos = 0;
end_pos = 0;
}
if (end_pos < 0)
MVM_exception_throw_adhoc(tc, "Substring end (%"PRId64") cannot be less than 0", end_pos);
/* Ensure we're within bounds. */
if (start_pos < 0)
start_pos = 0;
if (end_pos > agraphs)
end_pos = agraphs;
/* Check trivial cases: empty string and whole string. */
if (start_pos == end_pos)
return tc->instance->str_consts.empty;
if (start_pos == 0 && end_pos == agraphs)
return a;
/* Construct a result; how we efficiently do so will vary based on the
* input string. */
MVMROOT(tc, a, {
result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
result->body.num_graphs = end_pos - start_pos;
if (a->body.storage_type != MVM_STRING_STRAND) {
/* It's some kind of buffer. Construct a strand view into it. */
result->body.storage_type = MVM_STRING_STRAND;
result->body.storage.strands = allocate_strands(tc, 1);
result->body.num_strands = 1;
result->body.storage.strands[0].blob_string = a;
result->body.storage.strands[0].start = start_pos;
result->body.storage.strands[0].end = end_pos;
result->body.storage.strands[0].repetitions = 0;
}
else if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) {
/* Single strand string; quite possibly already a substring. We'll
* just produce an updated view. */
MVMStringStrand *orig_strand = &(a->body.storage.strands[0]);
result->body.storage_type = MVM_STRING_STRAND;
result->body.storage.strands = allocate_strands(tc, 1);
result->body.num_strands = 1;
result->body.storage.strands[0].blob_string = orig_strand->blob_string;
result->body.storage.strands[0].start = orig_strand->start + start_pos;
result->body.storage.strands[0].end = orig_strand->start + end_pos;
result->body.storage.strands[0].repetitions = 0;
}
else {
/* Produce a new blob string, collapsing the strands. */
MVMGraphemeIter gi;
MVM_string_gi_init(tc, &gi, a);
MVM_string_gi_move_to(tc, &gi, start_pos);
iterate_gi_into_string(tc, &gi, result);
}
});
STRAND_CHECK(tc, result);
return result;
}
MVMString * MVM_string_replace(MVMThreadContext *tc, MVMString *original, MVMint64 start, MVMint64 count, MVMString *replacement) {
/* XXX this could probably be done more efficiently directly. */
MVMString *first_part;
MVMString *rest_part;
MVMString *result;
MVM_gc_root_temp_push(tc, (MVMCollectable **)&replacement);
MVM_gc_root_temp_push(tc, (MVMCollectable **)&original);
first_part = MVM_string_substring(tc, original, 0, start);
MVM_gc_root_temp_push(tc, (MVMCollectable **)&first_part);
rest_part = MVM_string_substring(tc, original, start + count, -1);
rest_part = MVM_string_concatenate(tc, replacement, rest_part);
result = MVM_string_concatenate(tc, first_part, rest_part);
MVM_gc_root_temp_pop_n(tc, 3);
STRAND_CHECK(tc, result);
return result;
}
/* Append one string to another. */
static MVMint32 final_strand_matches(MVMThreadContext *tc, MVMString *a, MVMString *b) {
if (a->body.storage_type == MVM_STRING_STRAND) {
MVMStringStrand *ss = &(a->body.storage.strands[a->body.num_strands - 1]);
if (ss->end - ss->start == MVM_string_graphs(tc, b))
if (MVM_string_equal_at(tc, ss->blob_string, b, ss->start))
return 1;
}
return 0;
}
MVMString * MVM_string_concatenate(MVMThreadContext *tc, MVMString *a, MVMString *b) {
MVMString *result = NULL, *renormalized_section = NULL;
int renormalized_section_graphs = 0, consumed_a = 0, consumed_b = 0;
MVMuint32 agraphs, bgraphs;
MVMuint64 total_graphs;
int lost_strands = 0;
int is_concat_stable = 0;
int index_ss_b;
MVM_string_check_arg(tc, a, "concatenate");
MVM_string_check_arg(tc, b, "concatenate");
/* Simple empty-string cases. */
agraphs = MVM_string_graphs_nocheck(tc, a);
if (agraphs == 0)
return b;
bgraphs = MVM_string_graphs_nocheck(tc, b);
if (bgraphs == 0)
return a;
is_concat_stable = MVM_nfg_is_concat_stable(tc, a, b);
/* If is_concat_stable equals 0 and a and b are not repetitions. */
if (is_concat_stable == 0 && !(a->body.storage_type == MVM_STRING_STRAND && a->body.storage.strands[a->body.num_strands - 1].repetitions)
&& !(b->body.storage_type == MVM_STRING_STRAND && b->body.storage.strands[0].repetitions)) {
MVMCodepoint last_a_first_b[2] = {
MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1),
MVM_string_get_grapheme_at_nocheck(tc, b, 0)
};
MVMROOT(tc, a, {
MVMROOT(tc, b, {
/* Needing both to be CCC = 0 can probably be relaxed some, but be careful optimizing */
if (0 <= last_a_first_b[0] && 0 <= last_a_first_b[1]) {
renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b, 2);
consumed_a = 1; consumed_b = 1;
}
else {
MVMCodepointIter last_a_ci;
MVMCodepointIter first_b_ci;
MVMuint32 a_codes = MVM_string_grapheme_ci_init(tc, &last_a_ci, last_a_first_b[0]);
MVMuint32 b_codes = MVM_string_grapheme_ci_init(tc, &first_b_ci, last_a_first_b[1]);
/* MSVC doesn't allow variable length arrays so use alloca to allocate onto the stack */
MVMCodepoint *last_a_first_b_codes = alloca((a_codes + b_codes) * sizeof(MVMCodepoint));
MVMuint32 i = 0;
for (; MVM_string_grapheme_ci_has_more(tc, &last_a_ci); i++) {
last_a_first_b_codes[i] = MVM_string_grapheme_ci_get_codepoint(tc, &last_a_ci);
}
for (; MVM_string_grapheme_ci_has_more(tc, &first_b_ci); i++) {
last_a_first_b_codes[i] = MVM_string_grapheme_ci_get_codepoint(tc, &first_b_ci);
}
renormalized_section = MVM_unicode_codepoints_c_array_to_nfg_string(tc, last_a_first_b_codes, a_codes + b_codes);
consumed_a = 1; consumed_b = 1;
}
});
});
if (renormalized_section) {
if (agraphs == consumed_a && bgraphs == consumed_b)
return renormalized_section;
renormalized_section_graphs = MVM_string_graphs_nocheck(tc, renormalized_section);
}
}
/* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */
total_graphs = (MVMuint64)agraphs + (MVMuint64)bgraphs;
if (total_graphs > MAX_GRAPHEMES)
MVM_exception_throw_adhoc(tc,
"Can't concatenate strings, required number of graphemes %"PRIu64" > max allowed of %lld",
total_graphs, MAX_GRAPHEMES);
/* Otherwise, we'll assemble a result string. */
MVMROOT(tc, a, {
MVMROOT(tc, b, {
MVMROOT(tc, renormalized_section, {
/* Allocate it. */
result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
/* Total graphemes is trivial; just total up inputs. */
result->body.num_graphs = (MVMuint32)total_graphs;
/* Result string will be made of strands. */
result->body.storage_type = MVM_STRING_STRAND;
/* Detect the wonderful case where we're repeatedly concating the same
* string again and again, and thus can just bump a repetition. */
if (final_strand_matches(tc, a, b)) {
/* We have it; just copy the strands to a new string and bump the
* repetitions count of the last one. */
result->body.storage.strands = allocate_strands(tc, a->body.num_strands);
copy_strands(tc, a, 0, result, 0, a->body.num_strands);
result->body.storage.strands[a->body.num_strands - 1].repetitions++;
result->body.num_strands = a->body.num_strands;
}
/* Otherwise, construct a new strand string. */
else {
/* See if we have too many strands between the two. If so, we will
* collapse the biggest side. */
MVMuint16 strands_a = a->body.storage_type == MVM_STRING_STRAND
? a->body.num_strands
: 1;
MVMuint16 strands_b = b->body.storage_type == MVM_STRING_STRAND
? b->body.num_strands
: 1;
MVMString *effective_a = a;
MVMString *effective_b = b;
if (strands_a + strands_b > MVM_STRING_MAX_STRANDS) {
MVMROOT(tc, result, {
if (strands_a >= strands_b) {
effective_a = collapse_strands(tc, effective_a);
strands_a = 1;
}
else {
effective_b = collapse_strands(tc, effective_b);
strands_b = 1;
}
});
}
/* Assemble the result. */
result->body.num_strands = strands_a + strands_b + (renormalized_section_graphs ? 1 : 0);
result->body.storage.strands = allocate_strands(tc, result->body.num_strands);
/* START 1 */
if (effective_a->body.storage_type == MVM_STRING_STRAND) {
copy_strands(tc, effective_a, 0, result, 0, strands_a);
}
else {
int index_ss_a = 0;
MVMStringStrand *ss_a = &(result->body.storage.strands[index_ss_a]);
ss_a->blob_string = effective_a;
ss_a->start = 0;
ss_a->end = effective_a->body.num_graphs;
ss_a->repetitions = 0;
}
if (renormalized_section) {
int index_ss_re;
int index_ss_a = strands_a - 1;
/* Tweak the end of the last strand of string a. Since if a is made up of multiple strands, we can't just refer to index 0 and instead erfer to strands_a - 1 */
MVMStringStrand *ss_a = &(result->body.storage.strands[index_ss_a]);
MVMStringStrand *ss_re = NULL;
ss_a->end -= consumed_a;
/* If the strands ends up to be zero length we need to reduce the number of strand_index and also incease lost_strands so the next operation writes over it */
if (ss_a->start == ss_a->end)
lost_strands++;
/* END 1 */
/* START 1.5 (only triggered in some cases) */
index_ss_re = strands_a - lost_strands;
ss_re = &(result->body.storage.strands[index_ss_re]);
/* Add the renormalized section in as a strand */
ss_re->blob_string = renormalized_section;
ss_re->start = 0;
ss_re->end = renormalized_section->body.num_graphs;
ss_re->repetitions = 0;
if (ss_re->start == ss_re->end) {
MVM_exception_throw_adhoc(tc, "Unexpected error in concatenation: renormalized_section is 0 graphemes.\n");
/* renormalized_section should always be at least one grapheme
* in length so throw if it does not (zero length is an error
* we shouldn't lost_strands++ unlike the other strands */
}
/* END 1.5 */
}
/* START 2 */
index_ss_b = strands_a - lost_strands + (renormalized_section_graphs ? 1 : 0 );
if (effective_b->body.storage_type == MVM_STRING_STRAND) {
copy_strands(tc, effective_b, 0, result, index_ss_b, strands_b);
}
else {
MVMStringStrand *ss_b = &(result->body.storage.strands[index_ss_b]);
ss_b->blob_string = effective_b;
ss_b->start = 0;
ss_b->end = effective_b->body.num_graphs;
ss_b->repetitions = 0;
}
if (renormalized_section_graphs) {
/* Tweak the beginning of the first strand of string b */
MVMStringStrand *ss_b = &(result->body.storage.strands[index_ss_b]);
ss_b->start += consumed_b;
if (ss_b->start == ss_b->end) {
lost_strands++;
move_strands(tc, result, index_ss_b + 1, result, index_ss_b, strands_b - 1);
}
/* END 2 */
/* Adjust result->num_strands */
if (lost_strands)
result->body.num_strands -= lost_strands;
/* Adjust result->num_graphs */
result->body.num_graphs += renormalized_section_graphs - consumed_b - consumed_a;
}
}
});
});
});
STRAND_CHECK(tc, result);
if (is_concat_stable == 1) {
return result;
}
/* If it's regional indicator */
else if (is_concat_stable == 2) {
return re_nfg(tc, result);
}
else if (is_concat_stable == 0 && renormalized_section) {
return result;
}
/* We should have returned by now, but if we did not, return re_nfg */
return re_nfg(tc, result);
}
MVMString * MVM_string_repeat(MVMThreadContext *tc, MVMString *a, MVMint64 count) {
MVMString *result;
MVMuint32 agraphs;
MVMuint64 total_graphs;
MVM_string_check_arg(tc, a, "repeat");
/* Validate count; handle common cases. */
if (count == 0)
return tc->instance->str_consts.empty;
if (count == 1)
return a;
if (count < 0)
MVM_exception_throw_adhoc(tc, "Repeat count (%"PRId64") cannot be negative", count);
if (count > MAX_GRAPHEMES)
MVM_exception_throw_adhoc(tc, "Repeat count (%"PRId64") cannot be greater than max allowed number of graphemes %lld", count, MAX_GRAPHEMES);
/* If input string is empty, repeating it is empty. */
agraphs = MVM_string_graphs_nocheck(tc, a);
if (agraphs == 0)
return tc->instance->str_consts.empty;
/* Total size of the resulting string can't be bigger than an MVMString is allowed to be. */
total_graphs = (MVMuint64)agraphs * (MVMuint64)count;
if (total_graphs > MAX_GRAPHEMES)
MVM_exception_throw_adhoc(tc,
"Can't repeat string, required number of graphemes (%"PRIu32" * %"PRIu64") greater than max allowed of %lld",
agraphs, count, MAX_GRAPHEMES);
/* Now build a result string with the repetition set. */
MVMROOT(tc, a, {
result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
result->body.num_graphs = agraphs * count;
result->body.storage_type = MVM_STRING_STRAND;
result->body.storage.strands = allocate_strands(tc, 1);
if (a->body.storage_type == MVM_STRING_STRAND) {
if (a->body.num_strands == 1 && a->body.storage.strands[0].repetitions == 0) {
copy_strands(tc, a, 0, result, 0, 1);
}
else {
MVMROOT(tc, result, {
a = collapse_strands(tc, a);
});
result->body.storage.strands[0].blob_string = a;
result->body.storage.strands[0].start = 0;
result->body.storage.strands[0].end = agraphs;
}
}
else {
result->body.storage.strands[0].blob_string = a;
result->body.storage.strands[0].start = 0;
result->body.storage.strands[0].end = agraphs;
}
result->body.storage.strands[0].repetitions = count - 1;
result->body.num_strands = 1;
});
STRAND_CHECK(tc, result);
return result;
}
void MVM_string_say(MVMThreadContext *tc, MVMString *a) {
MVM_string_check_arg(tc, a, "say");
MVM_string_print(tc, MVM_string_concatenate(tc, a,
tc->instance->str_consts.platform_newline));
}
void MVM_string_print(MVMThreadContext *tc, MVMString *a) {
MVMOSHandle *handle = (MVMOSHandle *)tc->instance->stdout_handle;
MVMuint64 encoded_size;
char *encoded;
MVM_string_check_arg(tc, a, "print");
encoded = MVM_string_utf8_encode(tc, a, &encoded_size, MVM_TRANSLATE_NEWLINE_OUTPUT);
MVM_io_write_bytes_c(tc, tc->instance->stdout_handle, encoded, encoded_size);
MVM_free(encoded);
}
/* Meant to be pased in a MVMNormalizer of type MVM_NORMALIZE_NFD */
static MVMGrapheme32 ord_getbasechar (MVMThreadContext *tc, MVMGrapheme32 g) {
MVMGrapheme32 return_g;
MVMint32 ready;
MVMNormalizer norm;
MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFD);
ready = MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, g, &return_g);
MVM_unicode_normalizer_eof(tc, &norm);
if (!ready)
return_g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
MVM_unicode_normalizer_cleanup(tc, &norm);
return return_g;
}
/* Tests whether one string a has the other string b as a substring at that index */
MVMint64 MVM_string_equal_at(MVMThreadContext *tc, MVMString *a, MVMString *b, MVMint64 offset) {
MVMStringIndex agraphs, bgraphs;
MVM_string_check_arg(tc, a, "equal_at");
MVM_string_check_arg(tc, b, "equal_at");
agraphs = MVM_string_graphs_nocheck(tc, a);
bgraphs = MVM_string_graphs_nocheck(tc, b);
if (offset < 0) {
offset += agraphs;
if (offset < 0)
offset = 0; /* XXX I think this is the right behavior here */
}
if (agraphs - offset < bgraphs)
return 0;
return MVM_string_substrings_equal_nocheck(tc, a, offset, bgraphs, b, 0);
}
/* Ensure return value can hold numbers at least 3x higher than MVMStringIndex.
* Theoretically if the string has all ffi ligatures and 1/3 the max size of
* MVMStringIndex in length, we could have some weird results. */
/* ignoremark is 0 for normal operation and 1 for ignoring diacritics */
MVM_STATIC_INLINE MVMint64 string_equal_at_ignore_case_INTERNAL_loop(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle_fc, MVMint64 H_start, MVMint64 H_graphs, MVMint64 n_fc_graphs, int ignoremark, int ignorecase) {
MVMuint32 H_fc_cps;
/* An additional needle offset which is used only when codepoints expand
* when casefolded. The offset is the number of additional codepoints that
* have been seen so Haystack and needle stay aligned */
MVMint64 n_offset = 0;
MVMint64 i, j;
MVMGrapheme32 H_g, n_g;
for (i = 0; i + H_start < H_graphs && i + n_offset < n_fc_graphs; i++) {
const MVMCodepoint* H_result_cps;
H_g = MVM_string_get_grapheme_at_nocheck(tc, Haystack, H_start + i);
if (!ignorecase) {
H_fc_cps = 0;
}
else if (H_g >= 0) {
/* For codeponits we can get the case change directly */
H_fc_cps = MVM_unicode_get_case_change(tc, H_g, MVM_unicode_case_change_type_fold, &H_result_cps);
}
else {
/* Synthetics must use this function */
H_fc_cps = MVM_nfg_get_case_change(tc, H_g, MVM_unicode_case_change_type_fold, (MVMGrapheme32**) &H_result_cps);
}
/* If we get 0 for the number that means the cp doesn't change when casefolded */
if (H_fc_cps == 0) {
n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset);
if (ignoremark) {
H_g = ord_getbasechar(tc, H_g);
n_g = ord_getbasechar(tc, n_g);
}
if (H_g != n_g)
return -1;
}
else if (H_fc_cps >= 1) {
for (j = 0; j < H_fc_cps; j++) {
n_g = MVM_string_get_grapheme_at_nocheck(tc, needle_fc, i + n_offset);
H_g = H_result_cps[j];
if (ignoremark) {
H_g = ord_getbasechar(tc, H_g);
n_g = ord_getbasechar(tc, n_g);
}
if (H_g != n_g)
return -1;
n_offset++;
}
n_offset--;
}
}
return n_offset;
/* We return -1 if the strings are not equal and 0 or more if they are equal
* The return values from 0, 1 etc designate how many Haystack graphemes
* were expanded.
* This may seem like an odd arangement, but this extra information is needed
* to determine the length of the Haystack which was traversed, as it can
* differ from the length of the needle if there are expansions. */
}
/* Checks if needle exists at the offset, but ignores case.
* Sometimes there is a difference in length of a string before and after foldcase,
* because of this we must compare this differently than just foldcasing both
* strings to ensure the offset is correct */
static MVMint64 string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset, int ignoremark) {
/* Foldcase version of needle */
MVMString *needle_fc;
MVMStringIndex H_graphs = MVM_string_graphs(tc, Haystack);
MVMStringIndex n_graphs = MVM_string_graphs(tc, needle);
MVMStringIndex n_fc_graphs;
/* H_expansion must be able to hold integers 3x larger than MVMStringIndex */
MVMint64 H_expansion;
if (H_offset < 0) {
H_offset += H_graphs;
if (H_offset < 0)
H_offset = 0; /* XXX I think this is the right behavior here */
}
/* If the offset is greater or equal to the number of Haystack graphemes
* return 0. Since size of graphemes could change under casefolding, we
* can't assume too much. If optimizing this be careful */
if (H_offset >= H_graphs)
return 0;
MVMROOT(tc, Haystack, {
needle_fc = MVM_string_fc(tc, needle);
});
n_fc_graphs = MVM_string_graphs(tc, needle_fc);
H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, Haystack, needle_fc, H_offset, H_graphs, n_fc_graphs, ignoremark, 1);
if (H_expansion >= 0)
return H_graphs + H_expansion - H_offset >= n_fc_graphs ? 1 : 0;
return 0;
}
static MVMint64 string_index_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start, int ignoremark) {
/* Foldcase version of needle */
MVMString *needle_fc;
MVMStringIndex n_fc_graphs;
size_t index = (size_t)start;
MVMStringIndex H_graphs, n_graphs;
/* H_expansion must be able to hold integers 3x larger than MVMStringIndex */
MVMint64 H_expansion;
MVMint64 return_val = -1;
MVM_string_check_arg(tc, Haystack, ignoremark ? "index ignore case ignore mark search target" : "index ignore case search target");
MVM_string_check_arg(tc, needle, ignoremark ? "index ignore case ignore mark search term" : "index ignore case search term");
H_graphs = MVM_string_graphs_nocheck(tc, Haystack);
n_graphs = MVM_string_graphs_nocheck(tc, needle);
if (!n_graphs)
return start <= H_graphs ? start : -1; /* Empty string is in any other string */
if (!H_graphs)
return -1;
if (start < 0 || start >= H_graphs)
return -1;
/* Codepoints can expand into up to THREE codepoints (as of Unicode 9.0). The next check
* checks if it is at all possible for the needle grapheme number to be higher
* than the Haystack */
if (n_graphs > H_graphs * 3)
return -1;
if (n_graphs < 1)
return -1;
MVMROOT(tc, Haystack, {
needle_fc = MVM_string_fc(tc, needle);
});
n_fc_graphs = MVM_string_graphs(tc, needle_fc);
/* brute force for now. horrible, yes. halp. */
while (index <= H_graphs) {
H_expansion = string_equal_at_ignore_case_INTERNAL_loop(tc, Haystack, needle_fc, index, H_graphs, n_fc_graphs, ignoremark, 1);
if (H_expansion >= 0)
return H_graphs + H_expansion - index >= n_fc_graphs ? (MVMint64)index : -1;
index++;
}
return -1;
}
MVMint64 MVM_string_equal_at_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) {
return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 0);
}
MVMint64 MVM_string_equal_at_ignore_case_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 H_offset) {
return string_equal_at_ignore_case(tc, Haystack, needle, H_offset, 1);
}
MVMint64 MVM_string_index_ignore_case(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
return string_index_ignore_case(tc, Haystack, needle, start, 0);
}
MVMint64 MVM_string_index_ignore_case_ignore_mark(MVMThreadContext *tc, MVMString *Haystack, MVMString *needle, MVMint64 start) {
return string_index_ignore_case(tc, Haystack, needle, start, 1);
}
MVMGrapheme32 MVM_string_ord_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) {
MVMStringIndex agraphs;
MVMGrapheme32 g;
MVM_string_check_arg(tc, s, "grapheme_at");
agraphs = MVM_string_graphs(tc, s);
if (offset < 0 || offset >= agraphs)
return -1;
g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
return g >= 0 ? g : MVM_nfg_get_synthetic_info(tc, g)->base;
}
/* Gets the base character at a grapheme position, ignoring things like diacritics */
MVMGrapheme32 MVM_string_ord_basechar_at(MVMThreadContext *tc, MVMString *s, MVMint64 offset) {
MVMStringIndex agraphs;
MVMGrapheme32 g;
MVMint32 ready;
MVM_string_check_arg(tc, s, "ord_basechar_at");
agraphs = MVM_string_graphs_nocheck(tc, s);
if (offset < 0 || offset >= agraphs)
return -1; /* fixes RT #126771 */
g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
if (g < 0) {
MVMNFGSynthetic *si = MVM_nfg_get_synthetic_info(tc, g);
g = si->base;
}
else {
g = ord_getbasechar(tc, g);
}
return g;
}
/* Compares two strings for equality. */
MVMint64 MVM_string_equal(MVMThreadContext *tc, MVMString *a, MVMString *b) {
MVM_string_check_arg(tc, a, "equal");
MVM_string_check_arg(tc, b, "equal");
if (a == b)
return 1;
if (MVM_string_graphs_nocheck(tc, a) != MVM_string_graphs_nocheck(tc, b))
return 0;
return MVM_string_equal_at(tc, a, b, 0);
}
/* more general form of has_at; compares two substrings for equality */
MVMint64 MVM_string_have_at(MVMThreadContext *tc, MVMString *a,
MVMint64 starta, MVMint64 length, MVMString *b, MVMint64 startb) {
MVM_string_check_arg(tc, a, "have_at");
MVM_string_check_arg(tc, b, "have_at");
if (starta < 0 || startb < 0)
return 0;
if (length == 0)
return 1;
if (starta + length > MVM_string_graphs_nocheck(tc, a) || startb + length > MVM_string_graphs_nocheck(tc, b))
return 0;
return MVM_string_substrings_equal_nocheck(tc, a, starta, length, b, startb);
}
/* Returns the grapheme at a given index of the string */
MVMint64 MVM_string_get_grapheme_at(MVMThreadContext *tc, MVMString *a, MVMint64 index) {
MVMStringIndex agraphs;
MVM_string_check_arg(tc, a, "grapheme_at");
agraphs = MVM_string_graphs_nocheck(tc, a);
if (index < 0 || index >= agraphs)
MVM_exception_throw_adhoc(tc, "Invalid string index: max %"PRId32", got %"PRId64,
agraphs - 1, index);
return (MVMint64)MVM_string_get_grapheme_at_nocheck(tc, a, index);
}
/* Finds the location of a grapheme in a string. Useful for small character class lookup */
MVMint64 MVM_string_index_of_grapheme(MVMThreadContext *tc, MVMString *a, MVMGrapheme32 grapheme) {
size_t index = -1;
MVMGraphemeIter gi;
MVM_string_check_arg(tc, a, "string_index_of_grapheme");
MVM_string_gi_init(tc, &gi, a);
while (MVM_string_gi_has_more(tc, &gi)) {
index++;
if (MVM_string_gi_get_grapheme(tc, &gi) == grapheme)
return index;
}
return -1;
}
/* Case change functions. */
static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g);
static MVMString * do_case_change(MVMThreadContext *tc, MVMString *s, MVMint32 type, char *error) {
MVMint64 sgraphs;
MVM_string_check_arg(tc, s, error);
sgraphs = MVM_string_graphs_nocheck(tc, s);
if (sgraphs) {
MVMString *result;
MVMGraphemeIter gi;
MVMint64 result_graphs = sgraphs;
MVMGrapheme32 *result_buf = MVM_malloc(result_graphs * sizeof(MVMGrapheme32));
MVMint32 changed = 0;
MVMint64 i = 0;
MVM_string_gi_init(tc, &gi, s);
while (MVM_string_gi_has_more(tc, &gi)) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
peeked:
if (g == 0x03A3) {
/* Greek sigma needs special handling when lowercased. */
switch (type) {
case MVM_unicode_case_change_type_upper:
case MVM_unicode_case_change_type_title:
result_buf[i++] = g;
break;
case MVM_unicode_case_change_type_lower:
changed = 1;
if (i == 0) {
/* Start of string, so not final. */
result_buf[i++] = 0x03C3;
}
else if (!grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, result_buf[i - 1])) {
/* Previous char is not a letter; not final (as has
* to be at end of a word and not only thing in a
* word). */
result_buf[i++] = 0x03C3;
}
else if (!MVM_string_gi_has_more(tc, &gi)) {
/* End of string. We only reach here if we have a
* letter before us, so it must be final. */
result_buf[i++] = 0x03C2;
}
else {
/* Letter before us, something ahead of us. Need to
* peek ahead to see if it's a letter, to decide if
* we have final sigma or not. */
g = MVM_string_gi_get_grapheme(tc, &gi);
if (grapheme_is_cclass(tc, MVM_CCLASS_ALPHABETIC, g))
result_buf[i++] = 0x03C3;
else
result_buf[i++] = 0x03C2;
goto peeked;
}
break;
case MVM_unicode_case_change_type_fold:
result_buf[i++] = 0x03C3;
changed = 1;
break;
}
}
else if (g >= 0) {
const MVMCodepoint *result_cps;
MVMuint32 num_result_cps = MVM_unicode_get_case_change(tc,
g, type, &result_cps);
if (num_result_cps == 0) {
result_buf[i++] = g;
}
else if (num_result_cps == 1) {
result_buf[i++] = *result_cps;
changed = 1;
}
else {
/* To maintain NFG, we need to re-normalize when we get an
* expansion. */
MVMNormalizer norm;
MVMint32 num_result_graphs;
MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
MVM_unicode_normalizer_push_codepoints(tc, &norm, result_cps, num_result_cps);
MVM_unicode_normalizer_eof(tc, &norm);
num_result_graphs = MVM_unicode_normalizer_available(tc, &norm);
/* Make space for any extra graphemes. */
if (num_result_graphs > 1) {
result_graphs += num_result_graphs - 1;
result_buf = MVM_realloc(result_buf,
result_graphs * sizeof(MVMGrapheme32));
}
/* Copy resulting graphemes. */
while (num_result_graphs > 0) {
result_buf[i++] = MVM_unicode_normalizer_get_grapheme(tc, &norm);
num_result_graphs--;
}
changed = 1;
/* Clean up normalizer (we could init one per transform
* and keep it around in the future, if we find it's a
* worthwhile gain). */
MVM_unicode_normalizer_cleanup(tc, &norm);
}
}
else {
MVMGrapheme32 *transformed;
MVMint32 num_transformed = MVM_nfg_get_case_change(tc, g, type, &transformed);
if (num_transformed == 0) {
result_buf[i++] = g;
}
else if (num_transformed == 1) {
result_buf[i++] = *transformed;
changed = 1;
}
else {
MVMuint32 j;
result_graphs += num_transformed - 1;
result_buf = MVM_realloc(result_buf,
result_graphs * sizeof(MVMGrapheme32));
for (j = 0; j < num_transformed; j++)
result_buf[i++] = transformed[j];
changed = 1;
}
}
}
if (changed) {
result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
result->body.num_graphs = result_graphs;
result->body.storage_type = MVM_STRING_GRAPHEME_32;
result->body.storage.blob_32 = result_buf;
return result;
}
else {
MVM_free(result_buf);
}
}
STRAND_CHECK(tc, s);
return s;
}
MVMString * MVM_string_uc(MVMThreadContext *tc, MVMString *s) {
return do_case_change(tc, s, MVM_unicode_case_change_type_upper, "uc");
}
MVMString * MVM_string_lc(MVMThreadContext *tc, MVMString *s) {
return do_case_change(tc, s, MVM_unicode_case_change_type_lower, "lc");
}
MVMString * MVM_string_tc(MVMThreadContext *tc, MVMString *s) {
return do_case_change(tc, s, MVM_unicode_case_change_type_title, "tc");
}
MVMString * MVM_string_fc(MVMThreadContext *tc, MVMString *s) {
return do_case_change(tc, s, MVM_unicode_case_change_type_fold, "fc");
}
/* Decodes a C buffer to an MVMString, dependent on the encoding type flag. */
MVMString * MVM_string_decode(MVMThreadContext *tc,
const MVMObject *type_object, char *Cbuf, MVMint64 byte_length, MVMint64 encoding_flag) {
switch(encoding_flag) {
case MVM_encoding_type_utf8:
return MVM_string_utf8_decode_strip_bom(tc, type_object, Cbuf, byte_length);
case MVM_encoding_type_ascii:
return MVM_string_ascii_decode(tc, type_object, Cbuf, byte_length);
case MVM_encoding_type_latin1:
return MVM_string_latin1_decode(tc, type_object, Cbuf, byte_length);
case MVM_encoding_type_utf16:
return MVM_string_utf16_decode(tc, type_object, Cbuf, byte_length);
case MVM_encoding_type_windows1252:
return MVM_string_windows1252_decode(tc, type_object, Cbuf, byte_length);
case MVM_encoding_type_utf8_c8:
return MVM_string_utf8_c8_decode(tc, type_object, Cbuf, byte_length);
default:
MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag);
}
}
/* Encodes an MVMString to a C buffer, dependent on the encoding type flag */
char * MVM_string_encode(MVMThreadContext *tc, MVMString *s, MVMint64 start,
MVMint64 length, MVMuint64 *output_size, MVMint64 encoding_flag,
MVMString *replacement, MVMint32 translate_newlines) {
switch(encoding_flag) {
case MVM_encoding_type_utf8:
return MVM_string_utf8_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
case MVM_encoding_type_ascii:
return MVM_string_ascii_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
case MVM_encoding_type_latin1:
return MVM_string_latin1_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
case MVM_encoding_type_utf16:
return MVM_string_utf16_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
case MVM_encoding_type_windows1252:
return MVM_string_windows1252_encode_substr(tc, s, output_size, start, length, replacement, translate_newlines);
case MVM_encoding_type_utf8_c8:
return MVM_string_utf8_c8_encode_substr(tc, s, output_size, start, length, replacement);
default:
MVM_exception_throw_adhoc(tc, "invalid encoding type flag: %"PRId64, encoding_flag);
}
}
/* Encodes a string, and writes the encoding string into the supplied Buf
* instance, which should be an integer array with MVMArray REPR. */
MVMObject * MVM_string_encode_to_buf(MVMThreadContext *tc, MVMString *s, MVMString *enc_name,
MVMObject *buf, MVMString *replacement) {
MVMuint64 output_size;
MVMuint8 *encoded;
MVMArrayREPRData *buf_rd;
MVMuint8 elem_size = 0;
/* Ensure the target is in the correct form. */
MVM_string_check_arg(tc, s, "encode");
if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray)
MVM_exception_throw_adhoc(tc, "encode requires a native array to write into");
buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data;
if (buf_rd) {
switch (buf_rd->slot_type) {
case MVM_ARRAY_I64: elem_size = 8; break;
case MVM_ARRAY_I32: elem_size = 4; break;
case MVM_ARRAY_I16: elem_size = 2; break;
case MVM_ARRAY_I8: elem_size = 1; break;
case MVM_ARRAY_U64: elem_size = 8; break;
case MVM_ARRAY_U32: elem_size = 4; break;
case MVM_ARRAY_U16: elem_size = 2; break;
case MVM_ARRAY_U8: elem_size = 1; break;
}
}
if (!elem_size)
MVM_exception_throw_adhoc(tc, "encode requires a native int array");
if (((MVMArray *)buf)->body.slots.any)
MVM_exception_throw_adhoc(tc, "encode requires an empty array");
/* At least find_encoding may allocate on first call, so root just
* in case. */
MVMROOT(tc, buf, {
MVMROOT(tc, s, {
const MVMuint8 encoding_flag = MVM_string_find_encoding(tc, enc_name);
encoded = (MVMuint8 *)MVM_string_encode(tc, s, 0, MVM_string_graphs_nocheck(tc, s), &output_size,
encoding_flag, replacement, 0);
});
});
/* Stash the encoded data in the VMArray. */
((MVMArray *)buf)->body.slots.i8 = (MVMint8 *)encoded;
((MVMArray *)buf)->body.start = 0;
((MVMArray *)buf)->body.ssize = output_size / elem_size;
((MVMArray *)buf)->body.elems = output_size / elem_size;
return buf;
}
/* Decodes a string using the data from the specified Buf. */
MVMString * MVM_string_decode_from_buf(MVMThreadContext *tc, MVMObject *buf, MVMString *enc_name) {
MVMArrayREPRData *buf_rd;
MVMuint8 encoding_flag;
MVMuint8 elem_size = 0;
/* Ensure the source is in the correct form. */
if (!IS_CONCRETE(buf) || REPR(buf)->ID != MVM_REPR_ID_VMArray)
MVM_exception_throw_adhoc(tc, "decode requires a native array to read from");
buf_rd = (MVMArrayREPRData *)STABLE(buf)->REPR_data;
if (buf_rd) {
switch (buf_rd->slot_type) {
case MVM_ARRAY_I64: elem_size = 8; break;
case MVM_ARRAY_I32: elem_size = 4; break;
case MVM_ARRAY_I16: elem_size = 2; break;
case MVM_ARRAY_I8: elem_size = 1; break;
case MVM_ARRAY_U64: elem_size = 8; break;
case MVM_ARRAY_U32: elem_size = 4; break;
case MVM_ARRAY_U16: elem_size = 2; break;
case MVM_ARRAY_U8: elem_size = 1; break;
}
}
if (!elem_size)
MVM_exception_throw_adhoc(tc, "encode requires a native int array");
/* Decode. */
MVMROOT(tc, buf, {
encoding_flag = MVM_string_find_encoding(tc, enc_name);
});
return MVM_string_decode(tc, tc->instance->VMString,
(char *)(((MVMArray *)buf)->body.slots.i8 + ((MVMArray *)buf)->body.start),
((MVMArray *)buf)->body.elems * elem_size,
encoding_flag);
}
MVMObject * MVM_string_split(MVMThreadContext *tc, MVMString *separator, MVMString *input) {
MVMObject *result;
MVMStringIndex start, end, sep_length;
MVMHLLConfig *hll = MVM_hll_current(tc);
MVM_string_check_arg(tc, separator, "split separator");
MVM_string_check_arg(tc, input, "split input");
MVMROOT(tc, input, {
MVMROOT(tc, separator, {
result = MVM_repr_alloc_init(tc, hll->slurpy_array_type);
MVMROOT(tc, result, {
start = 0;
end = MVM_string_graphs_nocheck(tc, input);
sep_length = MVM_string_graphs_nocheck(tc, separator);
while (start < end) {
MVMString *portion;
MVMStringIndex index;
MVMStringIndex length;
/* XXX make this use the dual-traverse iterator, but such that it
can reset the index of what it's comparing... <!> */
index = MVM_string_index(tc, input, separator, start);
length = sep_length ? (index == -1 ? end : index) - start : 1;
if (length > 0 || (sep_length && length == 0)) {
portion = MVM_string_substring(tc, input, start, length);
MVMROOT(tc, portion, {
MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type);
MVM_repr_set_str(tc, pobj, portion);
MVM_repr_push_o(tc, result, pobj);
});
}
start += length + sep_length;
/* Gather an empty string if the delimiter is found at the end. */
if (sep_length && start == end) {
MVMObject *pobj = MVM_repr_alloc_init(tc, hll->str_box_type);
MVM_repr_set_str(tc, pobj, tc->instance->str_consts.empty);
MVM_repr_push_o(tc, result, pobj);
}
}
});
});
});
return result;
}
MVMString * MVM_string_join(MVMThreadContext *tc, MVMString *separator, MVMObject *input) {
MVMString *result;
MVMString **pieces;
MVMint64 elems, num_pieces, sgraphs, i, is_str_array, total_graphs;
MVMuint16 sstrands, total_strands;
MVMint32 concats_stable = 1;
MVM_string_check_arg(tc, separator, "join separator");
if (!IS_CONCRETE(input))
MVM_exception_throw_adhoc(tc, "join needs a concrete array to join");
/* See how many things we have to join; if the answer is "none" then we
* can make a hasty escape. */
elems = MVM_repr_elems(tc, input);
if (elems == 0)
return tc->instance->str_consts.empty;
is_str_array = REPR(input)->pos_funcs.get_elem_storage_spec(tc,
STABLE(input)).boxed_primitive == MVM_STORAGE_SPEC_BP_STR;
/* Allocate result. */
MVMROOT(tc, separator, {
MVMROOT(tc, input, {
result = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
});
});
/* Take a first pass through the string, counting up length and the total
* number of strands we encounter as well as building a flat array of the
* strings (to we only have to do the indirect calls once). */
sgraphs = MVM_string_graphs_nocheck(tc, separator);
if (sgraphs)
sstrands = separator->body.storage_type == MVM_STRING_STRAND
? separator->body.num_strands
: 1;
else
sstrands = 1;
pieces = MVM_malloc(elems * sizeof(MVMString *));
num_pieces = 0;
total_graphs = 0;
total_strands = 0;
for (i = 0; i < elems; i++) {
/* Get piece of the string. */
MVMString *piece;
MVMint64 piece_graphs;
if (is_str_array) {
piece = MVM_repr_at_pos_s(tc, input, i);
if (!piece)
continue;
}
else {
MVMObject *item = MVM_repr_at_pos_o(tc, input, i);
if (!item || !IS_CONCRETE(item))
continue;
piece = MVM_repr_get_str(tc, item);
}
/* If it wasn't the first piece, add separator here. */
if (num_pieces) {
total_strands += sstrands;
total_graphs += sgraphs;
}
/* Add on the piece's strands and graphs. */
piece_graphs = MVM_string_graphs(tc, piece);
if (piece_graphs) {
total_strands += piece->body.storage_type == MVM_STRING_STRAND
? piece->body.num_strands
: 1;
total_graphs += piece_graphs;
}
/* Store piece. */
pieces[num_pieces++] = piece;
}
/* We now know the total eventual number of graphemes. */
if (total_graphs == 0) {
free(pieces);
return tc->instance->str_consts.empty;
}
result->body.num_graphs = total_graphs;
/* If we just collect all the things as strands, are we within bounds, and
* will be come out ahead? */
if (total_strands < MVM_STRING_MAX_STRANDS && total_graphs / total_strands >= 16) {
/* XXX TODO: Implement this, conditionalize branch thing below. */
}
/*else {*/
if (1) {
/* We'll produce a single, flat string. */
MVMint64 position = 0;
MVMGraphemeIter gi;
result->body.storage_type = MVM_STRING_GRAPHEME_32;
result->body.storage.blob_32 = MVM_malloc(total_graphs * sizeof(MVMGrapheme32));
for (i = 0; i < num_pieces; i++) {
/* Get piece. */
MVMString *piece = pieces[i];
/* Add separator if needed. */
if (i > 0) {
/* If there's no separator and one piece is The Empty String we
* have to be extra careful about concat stability */
if (sgraphs == 0 && MVM_string_graphs_nocheck(tc, piece) == 0 && concats_stable
&& i + 1 < num_pieces
&& !MVM_nfg_is_concat_stable(tc, pieces[i - 1], pieces[i + 1])) {
concats_stable = 0;
}
if (sgraphs) {
if (!concats_stable)
/* Already unstable; no more checks. */;
else if (!MVM_nfg_is_concat_stable(tc, pieces[i - 1], separator))
concats_stable = 0;
else if (!MVM_nfg_is_concat_stable(tc, separator, piece))
concats_stable = 0;
switch (separator->body.storage_type) {
case MVM_STRING_GRAPHEME_32:
memcpy(
result->body.storage.blob_32 + position,
separator->body.storage.blob_32,
sgraphs * sizeof(MVMGrapheme32));
position += sgraphs;
break;
/* XXX Can special-case 8-bit NFG and ASCII here too. */
default:
MVM_string_gi_init(tc, &gi, separator);
while (MVM_string_gi_has_more(tc, &gi))
result->body.storage.blob_32[position++] =
MVM_string_gi_get_grapheme(tc, &gi);
break;
}
}
else {
/* Separator has no graphemes, so NFG stability check
* should consider pieces. */
if (!concats_stable)
/* Already stable; no more checks. */;
else if (!MVM_nfg_is_concat_stable(tc, pieces[i - 1], piece))
concats_stable = 0;
}
}
/* Add piece. */
switch (piece->body.storage_type) {
case MVM_STRING_GRAPHEME_32: {
MVMint64 pgraphs = MVM_string_graphs(tc, piece);
memcpy(
result->body.storage.blob_32 + position,
piece->body.storage.blob_32,
pgraphs * sizeof(MVMGrapheme32));
position += pgraphs;
break;
}
/* XXX Can special-case 8-bit NFG and ASCII here too. */
default:
MVM_string_gi_init(tc, &gi, piece);
while (MVM_string_gi_has_more(tc, &gi))
result->body.storage.blob_32[position++] =
MVM_string_gi_get_grapheme(tc, &gi);
break;
}
}
}
MVM_free(pieces);
STRAND_CHECK(tc, result);
return concats_stable ? result : re_nfg(tc, result);
}
/* Returning nonzero means it found the char at the position specified in 'a' in 'b'.
* For character enumerations in regexes. */
MVMint64 MVM_string_char_at_in_string(MVMThreadContext *tc, MVMString *a, MVMint64 offset, MVMString *b) {
MVMuint32 bgraphs;
MVMGrapheme32 search;
MVM_string_check_arg(tc, a, "char_at_in_string");
MVM_string_check_arg(tc, b, "char_at_in_string");
/* We return -2 here only to be able to distinguish between "out of bounds" and "not in string". */
if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, a))
return -2;
search = MVM_string_get_grapheme_at_nocheck(tc, a, offset);
bgraphs = MVM_string_graphs_nocheck(tc, b);
switch (b->body.storage_type) {
case MVM_STRING_GRAPHEME_32: {
MVMStringIndex i;
for (i = 0; i < bgraphs; i++)
if (b->body.storage.blob_32[i] == search)
return i;
break;
}
case MVM_STRING_GRAPHEME_ASCII:
if (can_fit_into_ascii(search)) {
MVMStringIndex i;
for (i = 0; i < bgraphs; i++)
if (b->body.storage.blob_ascii[i] == search)
return i;
}
break;
case MVM_STRING_GRAPHEME_8:
if (can_fit_into_8bit(search)) {
MVMStringIndex i;
for (i = 0; i < bgraphs; i++)
if (b->body.storage.blob_8[i] == search)
return i;
}
break;
case MVM_STRING_STRAND: {
MVMGraphemeIter gi;
MVMStringIndex i;
MVM_string_gi_init(tc, &gi, b);
for (i = 0; i < bgraphs; i++)
if (MVM_string_gi_get_grapheme(tc, &gi) == search)
return i;
}
}
return -1;
}
MVMint64 MVM_string_offset_has_unicode_property_value(MVMThreadContext *tc, MVMString *s, MVMint64 offset, MVMint64 property_code, MVMint64 property_value_code) {
MVMGrapheme32 g;
MVMCodepoint cp;
MVM_string_check_arg(tc, s, "uniprop");
if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, s))
return 0;
g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
if (g >= 0)
cp = (MVMCodepoint)g;
else
cp = MVM_nfg_get_synthetic_info(tc, g)->base;
return MVM_unicode_codepoint_has_property_value(tc, cp, property_code, property_value_code);
}
/* If the string is made up of strands, then produces a flattend string
* representing the exact same graphemes but without strands. Otherwise,
* returns the input string. Intended for strings that will be indexed
* into heavily (when evaluating regexes, for example). */
MVMString * MVM_string_indexing_optimized(MVMThreadContext *tc, MVMString *s) {
MVM_string_check_arg(tc, s, "indexingoptimized");
if (s->body.storage_type == MVM_STRING_STRAND)
return collapse_strands(tc, s);
else
return s;
}
/* Escapes a string, replacing various chars like \n with \\n. Can no doubt be
* further optimized. */
MVMString * MVM_string_escape(MVMThreadContext *tc, MVMString *s) {
MVMString *res = NULL;
MVMStringIndex spos = 0;
MVMStringIndex bpos = 0;
MVMStringIndex sgraphs, balloc;
MVMGrapheme32 *buffer;
MVMGrapheme32 crlf;
MVMint8 string_can_fit_into_8bit = 1;
MVM_string_check_arg(tc, s, "escape");
sgraphs = MVM_string_graphs_nocheck(tc, s);
balloc = sgraphs;
buffer = MVM_malloc(sizeof(MVMGrapheme32) * balloc);
crlf = MVM_nfg_crlf_grapheme(tc);
for (; spos < sgraphs; spos++) {
MVMGrapheme32 graph = MVM_string_get_grapheme_at_nocheck(tc, s, spos);
MVMGrapheme32 esc = 0;
switch (graph) {
case '\\': esc = '\\'; break;
case 7: esc = 'a'; break;
case '\b': esc = 'b'; break;
case '\n': esc = 'n'; break;
case '\r': esc = 'r'; break;
case '\t': esc = 't'; break;
case '\f': esc = 'f'; break;
case '"': esc = '"'; break;
case 27: esc = 'e'; break;
}
if (esc) {
if (bpos + 2 > balloc) {
balloc += 32;
buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
}
buffer[bpos++] = '\\';
buffer[bpos++] = esc;
}
else if (graph == crlf) {
if (bpos + 4 > balloc) {
balloc += 32;
buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
}
buffer[bpos++] = '\\';
buffer[bpos++] = 'r';
buffer[bpos++] = '\\';
buffer[bpos++] = 'n';
}
else {
if (bpos + 1 > balloc) {
balloc += 32;
buffer = MVM_realloc(buffer, sizeof(MVMGrapheme32) * balloc);
}
if (!can_fit_into_8bit(graph))
string_can_fit_into_8bit = 0;
buffer[bpos++] = graph;
}
}
res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
res->body.storage_type = MVM_STRING_GRAPHEME_32;
res->body.storage.blob_32 = buffer;
res->body.num_graphs = bpos;
if (string_can_fit_into_8bit)
turn_32bit_into_8bit_unchecked(tc, res);
STRAND_CHECK(tc, res);
return res;
}
/* Takes a string and reverses its characters. */
MVMString * MVM_string_flip(MVMThreadContext *tc, MVMString *s) {
MVMString *res = NULL;
MVMStringIndex spos = 0;
MVMStringIndex sgraphs;
MVMStringIndex rpos;
MVM_string_check_arg(tc, s, "flip");
sgraphs = MVM_string_graphs_nocheck(tc, s);
rpos = sgraphs;
if (s->body.storage_type == MVM_STRING_GRAPHEME_8) {
MVMGrapheme8 *rbuffer;
rbuffer = MVM_malloc(sizeof(MVMGrapheme8) * sgraphs);
for (; spos < sgraphs; spos++)
rbuffer[--rpos] = s->body.storage.blob_8[spos];
res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
res->body.storage_type = MVM_STRING_GRAPHEME_8;
res->body.storage.blob_8 = rbuffer;
} else {
MVMGrapheme32 *rbuffer;
rbuffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);
if (s->body.storage_type == MVM_STRING_GRAPHEME_32)
for (; spos < sgraphs; spos++)
rbuffer[--rpos] = s->body.storage.blob_32[spos];
else
for (; spos < sgraphs; spos++)
rbuffer[--rpos] = MVM_string_get_grapheme_at_nocheck(tc, s, spos);
res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
res->body.storage_type = MVM_STRING_GRAPHEME_32;
res->body.storage.blob_32 = rbuffer;
}
res->body.num_graphs = sgraphs;
STRAND_CHECK(tc, res);
return res;
}
/* Compares two strings, returning -1, 0 or 1 to indicate less than,
* equal or greater than. */
MVMint64 MVM_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b) {
MVMStringIndex alen, blen, i, scanlen;
MVM_string_check_arg(tc, a, "compare");
MVM_string_check_arg(tc, b, "compare");
/* Simple cases when one or both are zero length. */
alen = MVM_string_graphs_nocheck(tc, a);
blen = MVM_string_graphs_nocheck(tc, b);
if (alen == 0)
return blen == 0 ? 0 : -1;
if (blen == 0)
return 1;
/* Otherwise, need to scan them. */
scanlen = alen > blen ? blen : alen;
for (i = 0; i < scanlen; i++) {
MVMGrapheme32 ai = MVM_string_get_grapheme_at_nocheck(tc, a, i);
MVMGrapheme32 bi = MVM_string_get_grapheme_at_nocheck(tc, b, i);
if (ai != bi)
return ai < bi ? -1 : 1;
}
/* All shared chars equal, so go on length. */
return alen < blen ? -1 :
alen > blen ? 1 :
0 ;
}
/* Takes two strings and AND's their characters. */
MVMString * MVM_string_bitand(MVMThreadContext *tc, MVMString *a, MVMString *b) {
MVMString *res;
MVMGrapheme32 *buffer;
MVMStringIndex i, alen, blen, sgraphs;
MVM_string_check_arg(tc, a, "bitwise and");
MVM_string_check_arg(tc, b, "bitwise and");
alen = MVM_string_graphs_nocheck(tc, a);
blen = MVM_string_graphs_nocheck(tc, b);
sgraphs = alen < blen ? alen : blen;
buffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);
/* Binary-and up to the length of the shortest string. */
for (i = 0; i < sgraphs; i++)
buffer[i] = (MVM_string_get_grapheme_at_nocheck(tc, a, i)
& MVM_string_get_grapheme_at_nocheck(tc, b, i));
res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
res->body.storage_type = MVM_STRING_GRAPHEME_32;
res->body.storage.blob_32 = buffer;
res->body.num_graphs = sgraphs;
STRAND_CHECK(tc, res);
return res;
}
/* Takes two strings and OR's their characters. */
MVMString * MVM_string_bitor(MVMThreadContext *tc, MVMString *a, MVMString *b) {
MVMString *res;
MVMGrapheme32 *buffer;
MVMStringIndex alen, blen, sgraphs, i, scanlen;
MVM_string_check_arg(tc, a, "bitwise or");
MVM_string_check_arg(tc, b, "bitwise or");
alen = MVM_string_graphs_nocheck(tc, a);
blen = MVM_string_graphs_nocheck(tc, b);
sgraphs = (alen > blen ? alen : blen);
buffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);
/* First, binary-or up to the length of the shortest string. */
scanlen = alen > blen ? blen : alen;
for (i = 0; i < scanlen; i++)
buffer[i] = (MVM_string_get_grapheme_at_nocheck(tc, a, i)
| MVM_string_get_grapheme_at_nocheck(tc, b, i));
/* Second pass, fill with characters of the longest string. */
if (alen > blen)
for (; i < sgraphs; i++)
buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, a, i);
else
for (; i < sgraphs; i++)
buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, b, i);
res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
res->body.storage_type = MVM_STRING_GRAPHEME_32;
res->body.storage.blob_32 = buffer;
res->body.num_graphs = sgraphs;
STRAND_CHECK(tc, res);
return res;
}
/* Takes two strings and XOR's their characters. */
MVMString * MVM_string_bitxor(MVMThreadContext *tc, MVMString *a, MVMString *b) {
MVMString *res;
MVMGrapheme32 *buffer;
MVMStringIndex alen, blen, sgraphs, i, scanlen;
MVM_string_check_arg(tc, a, "bitwise xor");
MVM_string_check_arg(tc, b, "bitwise xor");
alen = MVM_string_graphs_nocheck(tc, a);
blen = MVM_string_graphs_nocheck(tc, b);
sgraphs = (alen > blen ? alen : blen);
buffer = MVM_malloc(sizeof(MVMGrapheme32) * sgraphs);
/* First, binary-xor up to the length of the shorter string. */
scanlen = alen > blen ? blen : alen;
for (i = 0; i < scanlen; i++)
buffer[i] = (MVM_string_get_grapheme_at_nocheck(tc, a, i)
^ MVM_string_get_grapheme_at_nocheck(tc, b, i));
/* Second pass, fill with characters of the longest string. */
if (alen > blen)
for (; i < sgraphs; i++)
buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, a, i);
else
for (; i < sgraphs; i++)
buffer[i] = MVM_string_get_grapheme_at_nocheck(tc, b, i);
res = (MVMString *)MVM_repr_alloc_init(tc, tc->instance->VMString);
res->body.storage_type = MVM_STRING_GRAPHEME_32;
res->body.storage.blob_32 = buffer;
res->body.num_graphs = sgraphs;
STRAND_CHECK(tc, res);
return res;
}
/* The following statics hold on to various unicode property values we will
* resolve once so we don't have to do it repeatedly. */
static MVMint64 UPV_Nd = 0;
static MVMint64 UPV_Lu = 0;
static MVMint64 UPV_Ll = 0;
static MVMint64 UPV_Lt = 0;
static MVMint64 UPV_Lm = 0;
static MVMint64 UPV_Lo = 0;
static MVMint64 UPV_Zs = 0;
static MVMint64 UPV_Zl = 0;
static MVMint64 UPV_Pc = 0;
static MVMint64 UPV_Pd = 0;
static MVMint64 UPV_Ps = 0;
static MVMint64 UPV_Pe = 0;
static MVMint64 UPV_Pi = 0;
static MVMint64 UPV_Pf = 0;
static MVMint64 UPV_Po = 0;
/* concatenating with "" ensures that only literal strings are accepted as argument. */
#define STR_WITH_LEN(str) ("" str ""), (sizeof(str) - 1)
/* Resolves various unicode property values that we'll need. */
void MVM_string_cclass_init(MVMThreadContext *tc) {
UPV_Nd = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Nd"));
UPV_Lu = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lu"));
UPV_Ll = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Ll"));
UPV_Lt = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lt"));
UPV_Lm = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lm"));
UPV_Lo = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Lo"));
UPV_Zs = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Zs"));
UPV_Zl = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Zl"));
UPV_Pc = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pc"));
UPV_Pd = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pd"));
UPV_Ps = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Ps"));
UPV_Pe = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pe"));
UPV_Pi = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pi"));
UPV_Pf = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Pf"));
UPV_Po = MVM_unicode_cname_to_property_value_code(tc,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, STR_WITH_LEN("Po"));
}
/* Checks if the specified grapheme is in the given character class. */
static MVMint64 grapheme_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMGrapheme32 g) {
/* If it's a synthetic, then grab the base codepoint. */
MVMCodepoint cp;
if (g >= 0)
cp = (MVMCodepoint)g;
else
cp = MVM_nfg_get_synthetic_info(tc, g)->base;
switch (cclass) {
case MVM_CCLASS_ANY:
return 1;
case MVM_CCLASS_UPPERCASE:
return MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu);
case MVM_CCLASS_LOWERCASE:
return MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll);
case MVM_CCLASS_WORD:
if (cp <= 'z') { /* short circuit common case */
if (cp >= 'a' || cp == '_' || (cp >= 'A' && cp <= 'Z') || (cp >= '0' && cp <= '9'))
return 1;
else
return 0;
}
/* Deliberate fall-through; word is _ or digit or alphabetic. */
case MVM_CCLASS_ALPHANUMERIC:
if (cp <= '9' && cp >= '0') /* short circuit common case */
return 1;
if (MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd))
return 1;
/* Deliberate fall-through; alphanumeric is digit or alphabetic. */
case MVM_CCLASS_ALPHABETIC:
if (cp <= 'z') { /* short circuit common case */
if (cp >= 'a' || (cp >= 'A' && cp <= 'Z'))
return 1;
else
return 0;
}
return
MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lo) /* lots of CJK chars */
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ll) /* (ascii handled above) */
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lu)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lt)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Lm);
case MVM_CCLASS_NUMERIC:
if (cp <= '9' && cp >= '0') /* short circuit common case */
return 1;
return MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Nd);
case MVM_CCLASS_HEXADECIMAL:
return MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_ASCII_HEX_DIGIT, 1);
case MVM_CCLASS_WHITESPACE:
if (cp <= '~') {
if (cp == ' ' || (cp <= 13 && cp >= 9))
return 1;
else
return 0;
}
return MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_WHITE_SPACE, 1);
case MVM_CCLASS_BLANK:
if (cp == '\t')
return 1;
return MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Zs);
case MVM_CCLASS_CONTROL: {
return (cp >= 0 && cp < 32) || (cp >= 127 && cp < 160);
}
case MVM_CCLASS_PRINTING: {
return !((cp >= 0 && cp < 32) || (cp >= 127 && cp < 160));
}
case MVM_CCLASS_PUNCTUATION:
return
MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pc)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pd)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Ps)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pe)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pi)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Pf)
|| MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Po);
case MVM_CCLASS_NEWLINE: {
if (cp == '\n' || cp == 0x0b || cp == 0x0c || cp == '\r' ||
cp == 0x85 || cp == 0x2028 || cp == 0x2029)
return 1;
return MVM_unicode_codepoint_has_property_value(tc, cp,
MVM_UNICODE_PROPERTY_GENERAL_CATEGORY, UPV_Zl);
}
default:
return 0;
}
}
/* Checks if the character at the specified offset is a member of the
* indicated character class. */
MVMint64 MVM_string_is_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset) {
MVMGrapheme32 g;
MVM_string_check_arg(tc, s, "is_cclass");
if (offset < 0 || offset >= MVM_string_graphs_nocheck(tc, s))
return 0;
g = MVM_string_get_grapheme_at_nocheck(tc, s, offset);
return grapheme_is_cclass(tc, cclass, g);
}
/* Searches for the next char that is in the specified character class. */
MVMint64 MVM_string_find_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) {
MVMGraphemeIter gi;
MVMint64 length, end, pos;
MVM_string_check_arg(tc, s, "find_cclass");
length = MVM_string_graphs_nocheck(tc, s);
end = offset + count;
end = length < end ? length : end;
if (offset < 0 || offset >= length)
return end;
MVM_string_gi_init(tc, &gi, s);
MVM_string_gi_move_to(tc, &gi, offset);
for (pos = offset; pos < end; pos++) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
if (grapheme_is_cclass(tc, cclass, g) > 0)
return pos;
}
return end;
}
/* Searches for the next char that is not in the specified character class. */
MVMint64 MVM_string_find_not_cclass(MVMThreadContext *tc, MVMint64 cclass, MVMString *s, MVMint64 offset, MVMint64 count) {
MVMGraphemeIter gi;
MVMint64 length, end, pos;
MVM_string_check_arg(tc, s, "find_not_cclass");
length = MVM_string_graphs_nocheck(tc, s);
end = offset + count;
end = length < end ? length : end;
if (offset < 0 || offset >= length)
return end;
MVM_string_gi_init(tc, &gi, s);
MVM_string_gi_move_to(tc, &gi, offset);
for (pos = offset; pos < end; pos++) {
MVMGrapheme32 g = MVM_string_gi_get_grapheme(tc, &gi);
if (grapheme_is_cclass(tc, cclass, g) == 0)
return pos;
}
return end;
}
static MVMint16 encoding_name_init = 0;
static MVMString *encoding_utf8_name = NULL;
static MVMString *encoding_ascii_name = NULL;
static MVMString *encoding_latin1_name = NULL;
static MVMString *encoding_utf16_name = NULL;
static MVMString *encoding_windows1252_name = NULL;
static MVMString *encoding_utf8_c8_name = NULL;
MVMuint8 MVM_string_find_encoding(MVMThreadContext *tc, MVMString *name) {
MVM_string_check_arg(tc, name, "find encoding");
if (!encoding_name_init) {
MVM_gc_allocate_gen2_default_set(tc);
encoding_utf8_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8");
MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_name, "Encoding name");
encoding_ascii_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "ascii");
MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_ascii_name, "Encoding name");
encoding_latin1_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "iso-8859-1");
MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_latin1_name, "Encoding name");
encoding_utf16_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf16");
MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf16_name, "Encoding name");
encoding_windows1252_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "windows-1252");
MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_windows1252_name, "Encoding name");
encoding_utf8_c8_name = MVM_string_ascii_decode_nt(tc, tc->instance->VMString, "utf8-c8");
MVM_gc_root_add_permanent_desc(tc, (MVMCollectable **)&encoding_utf8_c8_name, "Encoding name");
encoding_name_init = 1;
MVM_gc_allocate_gen2_default_clear(tc);
}
if (MVM_string_equal(tc, name, encoding_utf8_name)) {
return MVM_encoding_type_utf8;
}
else if (MVM_string_equal(tc, name, encoding_ascii_name)) {
return MVM_encoding_type_ascii;
}
else if (MVM_string_equal(tc, name, encoding_latin1_name)) {
return MVM_encoding_type_latin1;
}
else if (MVM_string_equal(tc, name, encoding_windows1252_name)) {
return MVM_encoding_type_windows1252;
}
else if (MVM_string_equal(tc, name, encoding_utf16_name)) {
return MVM_encoding_type_utf16;
}
else if (MVM_string_equal(tc, name, encoding_utf8_c8_name)) {
return MVM_encoding_type_utf8_c8;
}
else {
char *c_name = MVM_string_utf8_encode_C_string(tc, name);
char *waste[] = { c_name, NULL };
MVM_exception_throw_adhoc_free(tc, waste, "Unknown string encoding: '%s'",
c_name);
}
}
/* Turns a codepoint into a string. If required uses the normalizer to ensure
* that we get a valid NFG string (NFG is a superset of NFC, and singleton
* decompositions exist). */
MVMString * MVM_string_chr(MVMThreadContext *tc, MVMint64 cp) {
MVMString *s;
MVMGrapheme32 g;
if (cp < 0)
MVM_exception_throw_adhoc(tc, "chr codepoint cannot be negative");
/* If the codepoint decomposes we may need to normalize it.
* The first cp that decomposes is U+0340, but to be on the safe side
* for now we go with the first significant character which at the time
* of writing (Unicode 9.0) is COMBINING GRAVE ACCENT U+300 */
if (cp >= MVM_NORMALIZE_FIRST_SIG_NFC
&& MVM_unicode_codepoint_get_property_int(tc, cp, MVM_UNICODE_PROPERTY_DECOMPOSITION_TYPE)
!= MVM_UNICODE_PVALUE_DT_NONE) {
MVMNormalizer norm;
MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
if (!MVM_unicode_normalizer_process_codepoint_to_grapheme(tc, &norm, cp, &g)) {
MVM_unicode_normalizer_eof(tc, &norm);
g = MVM_unicode_normalizer_get_grapheme(tc, &norm);
}
MVM_unicode_normalizer_cleanup(tc, &norm);
}
else {
g = (MVMGrapheme32) cp;
}
s = (MVMString *)REPR(tc->instance->VMString)->allocate(tc, STABLE(tc->instance->VMString));
if (can_fit_into_8bit(g)) {
s->body.storage_type = MVM_STRING_GRAPHEME_8;
s->body.storage.blob_8 = MVM_malloc(sizeof(MVMGrapheme8));
s->body.storage.blob_8[0] = g;
} else {
s->body.storage_type = MVM_STRING_GRAPHEME_32;
s->body.storage.blob_32 = MVM_malloc(sizeof(MVMGrapheme32));
s->body.storage.blob_32[0] = g;
}
s->body.num_graphs = 1;
return s;
}
/* Takes a string and computes a hash code for it, storing it in the hash code
* cache field of the string. Hashing code is derived from the Jenkins hash
* implementation in uthash.h. */
typedef union {
MVMint32 graphs[3];
unsigned char bytes[12];
} MVMJenHashGraphemeView;
void MVM_string_compute_hash_code(MVMThreadContext *tc, MVMString *s) {
/* The hash algorithm works in bytes. Since we can represent strings in a
* number of ways, and we want consistent hashing, then we'll read the
* strings using the grapheme iterator in groups of 3, using 32-bit ints
* for the graphemes no matter what the string really holds them as. Then
* we'll use the bytes view of that in the hashing function. */
MVMJenHashGraphemeView hash_block;
MVMGraphemeIter gi;
MVMuint32 graphs_remaining = MVM_string_graphs(tc, s);
/* Initialize hash state. */
MVMuint32 hashv = 0xfeedbeef;
MVMuint32 _hj_i, _hj_j;
_hj_i = _hj_j = 0x9e3779b9;
/* Work through the string 3 graphemes at a time. */
MVM_string_gi_init(tc, &gi, s);
while (graphs_remaining >= 3) {
hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
hash_block.graphs[2] = MVM_string_gi_get_grapheme(tc, &gi);
_hj_i += (hash_block.bytes[0] + ( (unsigned)hash_block.bytes[1] << 8 )
+ ( (unsigned)hash_block.bytes[2] << 16 )
+ ( (unsigned)hash_block.bytes[3] << 24 ) );
_hj_j += (hash_block.bytes[4] + ( (unsigned)hash_block.bytes[5] << 8 )
+ ( (unsigned)hash_block.bytes[6] << 16 )
+ ( (unsigned)hash_block.bytes[7] << 24 ) );
hashv += (hash_block.bytes[8] + ( (unsigned)hash_block.bytes[9] << 8 )
+ ( (unsigned)hash_block.bytes[10] << 16 )
+ ( (unsigned)hash_block.bytes[11] << 24 ) );
HASH_JEN_MIX(_hj_i, _hj_j, hashv);
graphs_remaining -= 3;
}
/* Mix in key length (in bytes, not graphemes). */
hashv += MVM_string_graphs(tc, s) * sizeof(MVMGrapheme32);
/* Now handle trailing graphemes (must be 2, 1, or 0). */
switch (graphs_remaining) {
case 2:
hash_block.graphs[1] = MVM_string_gi_get_grapheme(tc, &gi);
_hj_j += ( (unsigned)hash_block.bytes[7] << 24 ) +
( (unsigned)hash_block.bytes[6] << 16 ) +
( (unsigned)hash_block.bytes[5] << 8 ) +
hash_block.bytes[4];
/* Fallthrough */
case 1:
hash_block.graphs[0] = MVM_string_gi_get_grapheme(tc, &gi);
_hj_i += ( (unsigned)hash_block.bytes[3] << 24 ) +
( (unsigned)hash_block.bytes[2] << 16 ) +
( (unsigned)hash_block.bytes[1] << 8 ) +
hash_block.bytes[0];
}
HASH_JEN_MIX(_hj_i, _hj_j, hashv);
/* Store computed hash value. */
s->body.cached_hash_code = hashv;
}