Skip to content

Commit

Permalink
Refactor handles for new separator data structure.
Browse files Browse the repository at this point in the history
Again, no intended semantics changes in this patch. However, we do now
preserve the full set of graphemes in multi-grapheme separators, to
later enable full handling of this case (for now, only the handling of
multiple separators is going to be used, as part of the \r\n grapheme
change).
  • Loading branch information
jnthn committed Oct 30, 2015
1 parent 65bcb12 commit 81fb7c1
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 37 deletions.
9 changes: 4 additions & 5 deletions src/io/syncfile.c
Expand Up @@ -37,8 +37,8 @@ typedef struct {
/* Decode stream, for turning bytes from disk into strings. */
MVMDecodeStream *ds;

/* Current separator codepoint. */
MVMGrapheme32 sep;
/* Current separator specification for line-by-line reading. */
MVMDecodeStreamSeparators sep_spec;
} MVMIOFileData;

/* Closes the file. */
Expand Down Expand Up @@ -93,8 +93,7 @@ static MVMint64 mvm_tell(MVMThreadContext *tc, MVMOSHandle *h) {
/* Set the line separator. */
static void set_separator(MVMThreadContext *tc, MVMOSHandle *h, MVMString *sep) {
MVMIOFileData *data = (MVMIOFileData *)h->body.data;
data->sep = (MVMGrapheme32)MVM_string_get_grapheme_at(tc, sep,
MVM_string_graphs(tc, sep) - 1);
MVM_string_decode_stream_maybe_sep_from_string(tc, &(data->sep_spec), sep);
}

/* Read a bunch of bytes into the current decode stream. */
Expand Down Expand Up @@ -126,7 +125,7 @@ static MVMString * read_line(MVMThreadContext *tc, MVMOSHandle *h) {

/* Pull data until we can read a line. */
do {
MVMString *line = MVM_string_decodestream_get_until_sep(tc, data->ds, data->sep);
MVMString *line = MVM_string_decodestream_get_until_sep(tc, data->ds, &(data->sep_spec));
if (line != NULL)
return line;
} while (read_to_buffer(tc, data, CHUNK_SIZE) > 0);
Expand Down
2 changes: 1 addition & 1 deletion src/io/syncpipe.c
Expand Up @@ -106,7 +106,7 @@ MVMObject * MVM_io_syncpipe(MVMThreadContext *tc) {
uv_pipe_init(tc->loop, handle, 0);
data->ss.handle = (uv_stream_t *)handle;
data->ss.encoding = MVM_encoding_type_utf8;
data->ss.sep = '\n';
MVM_string_decode_stream_sep_default(tc, &(data->ss.sep_spec));
result->body.ops = &op_table;
result->body.data = data;
return (MVMObject *)result;
Expand Down
4 changes: 2 additions & 2 deletions src/io/syncsocket.c
Expand Up @@ -208,7 +208,7 @@ static MVMObject * socket_accept(MVMThreadContext *tc, MVMOSHandle *h) {
MVMIOSyncSocketData * const data = MVM_calloc(1, sizeof(MVMIOSyncSocketData));
data->ss.handle = (uv_stream_t *)client;
data->ss.encoding = MVM_encoding_type_utf8;
data->ss.sep = '\n';
MVM_string_decode_stream_sep_default(tc, &(data->ss.sep_spec));
result->body.ops = &op_table;
result->body.data = data;
return (MVMObject *)result;
Expand All @@ -226,7 +226,7 @@ MVMObject * MVM_io_socket_create(MVMThreadContext *tc, MVMint64 listen) {
MVMIOSyncSocketData * const data = MVM_calloc(1, sizeof(MVMIOSyncSocketData));
data->ss.handle = NULL;
data->ss.encoding = MVM_encoding_type_utf8;
data->ss.sep = '\n';
MVM_string_decode_stream_sep_default(tc, &(data->ss.sep_spec));
result->body.ops = &op_table;
result->body.data = data;
return (MVMObject *)result;
Expand Down
8 changes: 3 additions & 5 deletions src/io/syncstream.c
Expand Up @@ -38,10 +38,8 @@ MVMint64 MVM_io_syncstream_tell(MVMThreadContext *tc, MVMOSHandle *h) {

/* Set the line separator. */
void MVM_io_syncstream_set_separator(MVMThreadContext *tc, MVMOSHandle *h, MVMString *sep) {
/* For now, take last character. */
MVMIOSyncStreamData *data = (MVMIOSyncStreamData *)h->body.data;
data->sep = (MVMGrapheme32)MVM_string_get_grapheme_at(tc, sep,
MVM_string_graphs(tc, sep) - 1);
MVM_string_decode_stream_maybe_sep_from_string(tc, &(data->sep_spec), sep);
}

/* Read a bunch of bytes into the current decode stream. Returns true if we
Expand Down Expand Up @@ -99,7 +97,7 @@ MVMString * MVM_io_syncstream_read_line(MVMThreadContext *tc, MVMOSHandle *h) {

/* Pull data until we can read a line. */
do {
MVMString *line = MVM_string_decodestream_get_until_sep(tc, data->ds, data->sep);
MVMString *line = MVM_string_decodestream_get_until_sep(tc, data->ds, &(data->sep_spec));
if (line != NULL)
return line;
} while (read_to_buffer(tc, data, CHUNK_SIZE) > 0);
Expand Down Expand Up @@ -316,7 +314,7 @@ MVMObject * MVM_io_syncstream_from_uvstream(MVMThreadContext *tc, uv_stream_t *h
MVMIOSyncStreamData * const data = MVM_calloc(1, sizeof(MVMIOSyncStreamData));
data->handle = handle;
data->encoding = MVM_encoding_type_utf8;
data->sep = '\n';
MVM_string_decode_stream_sep_default(tc, &(data->sep_spec));
result->body.ops = &op_table;
result->body.data = data;
return (MVMObject *)result;
Expand Down
4 changes: 2 additions & 2 deletions src/io/syncstream.h
Expand Up @@ -18,8 +18,8 @@ struct MVMIOSyncStreamData {
/* Total bytes we've written. */
MVMint64 total_bytes_written;

/* Current separator codepoint. */
MVMGrapheme32 sep;
/* Current separator specification for line-by-line reading. */
MVMDecodeStreamSeparators sep_spec;
};

void MVM_io_syncstream_set_encoding(MVMThreadContext *tc, MVMOSHandle *h, MVMint64 encoding);
Expand Down
63 changes: 42 additions & 21 deletions src/strings/decode_stream.c
Expand Up @@ -75,19 +75,19 @@ void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *d
}

/* Does a decode run, selected by encoding. */
static void run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *seps) {
static void run_decode(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMint32 *stopper_chars, MVMDecodeStreamSeparators *sep_spec) {
switch (ds->encoding) {
case MVM_encoding_type_utf8:
MVM_string_utf8_decodestream(tc, ds, stopper_chars, seps);
MVM_string_utf8_decodestream(tc, ds, stopper_chars, sep_spec);
break;
case MVM_encoding_type_ascii:
MVM_string_ascii_decodestream(tc, ds, stopper_chars, seps);
MVM_string_ascii_decodestream(tc, ds, stopper_chars, sep_spec);
break;
case MVM_encoding_type_latin1:
MVM_string_latin1_decodestream(tc, ds, stopper_chars, seps);
MVM_string_latin1_decodestream(tc, ds, stopper_chars, sep_spec);
break;
case MVM_encoding_type_windows1252:
MVM_string_windows1252_decodestream(tc, ds, stopper_chars, seps);
MVM_string_windows1252_decodestream(tc, ds, stopper_chars, sep_spec);
break;
default:
MVM_exception_throw_adhoc(tc, "Streaming decode NYI for encoding %d",
Expand Down Expand Up @@ -163,41 +163,33 @@ MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStr
return NULL;
}

/* Gets characters up until the specified string is encountered. If we do
* not encounter it, returns NULL. This may mean more input buffers are needed
/* Gets characters up until one of the specified separators is encountered. If
* we do not encounter it, returns 0. This may mean more input buffers are needed
* or that we reached the end of the stream. */
static MVMint32 find_separator(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMGrapheme32 sep) {
static MVMint32 find_separator(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMDecodeStreamSeparators *sep_spec) {
MVMint32 sep_loc = 0;
MVMDecodeStreamChars *cur_chars = ds->chars_head;
while (cur_chars) {
MVMint32 start = cur_chars == ds->chars_head ? ds->chars_head_pos : 0;
MVMint32 i;
for (i = start; i < cur_chars->length; i++) {
sep_loc++;
if (cur_chars->chars[i] == sep)
if (MVM_string_decode_stream_maybe_sep(tc, sep_spec, cur_chars->chars[i]))
return sep_loc;
}
cur_chars = cur_chars->next;
}
return 0;
}
MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 sep) {
MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamSeparators *sep_spec) {
MVMint32 sep_loc;

/* Look for separator, trying more decoding if it fails. We get the place
* just beyond the separator, so can use take_chars to get what's needed. */
sep_loc = find_separator(tc, ds, sep);
sep_loc = find_separator(tc, ds, sep_spec);
if (!sep_loc) {
/* XXX Temporarily set up the separator spec here; we'll request it be
* passed down to use in the future, and fully support multiple and
* multi-grapheme separators. */
MVMDecodeStreamSeparators sep_spec;
MVMint32 sep_length = 1;
sep_spec.num_seps = 1;
sep_spec.sep_lengths = &sep_length;
sep_spec.sep_graphemes = &sep;
run_decode(tc, ds, NULL, &sep_spec);
sep_loc = find_separator(tc, ds, sep);
run_decode(tc, ds, NULL, sep_spec);
sep_loc = find_separator(tc, ds, sep_spec);
}
if (sep_loc)
return take_chars(tc, ds, sep_loc);
Expand Down Expand Up @@ -359,3 +351,32 @@ void MVM_string_decodestream_destory(MVMThreadContext *tc, MVMDecodeStream *ds)
MVM_unicode_normalizer_cleanup(tc, &(ds->norm));
MVM_free(ds);
}

/* Initializes a separator specification to the default: exactly one
 * separator, one grapheme long, that grapheme being '\n'. Both arrays are
 * freshly allocated with MVM_malloc; whatever sep_spec held before is simply
 * overwritten and is not freed here. */
void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
    MVMint32      *lengths;
    MVMGrapheme32 *graphemes;

    /* One length entry, describing a single-grapheme separator. */
    lengths    = MVM_malloc(sizeof(MVMint32));
    lengths[0] = 1;

    /* One grapheme entry: the newline. */
    graphemes    = MVM_malloc(sizeof(MVMGrapheme32));
    graphemes[0] = '\n';

    sep_spec->num_seps      = 1;
    sep_spec->sep_lengths   = lengths;
    sep_spec->sep_graphemes = graphemes;
}

/* Replaces the contents of a separator specification with a single separator
 * consisting of every grapheme of the given string, in order. Throws an adhoc
 * exception if the string has more than 0xFFFF graphemes; in that case the
 * existing specification is left untouched. Otherwise the arrays currently
 * held by sep_spec are released with MVM_free before new ones are installed,
 * so on entry they must be pointers that are valid to pass to MVM_free. */
void MVM_string_decode_stream_maybe_sep_from_string(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec, MVMString *sep) {
    MVMGraphemeIter  iter;
    MVMint32         num_graphs;
    MVMint32         idx = 0;

    /* Check the length up front, before touching the existing spec. */
    if (MVM_string_graphs(tc, sep) > 0xFFFF)
        MVM_exception_throw_adhoc(tc, "Line separator too long");

    /* Drop the previous separator storage. */
    MVM_free(sep_spec->sep_lengths);
    MVM_free(sep_spec->sep_graphemes);

    /* A single separator whose length is the string's grapheme count. */
    sep_spec->num_seps       = 1;
    sep_spec->sep_lengths    = MVM_malloc(sizeof(MVMint32));
    sep_spec->sep_lengths[0] = MVM_string_graphs(tc, sep);
    num_graphs               = sep_spec->sep_lengths[0];

    /* Copy the graphemes out of the string one at a time. */
    sep_spec->sep_graphemes = MVM_malloc(num_graphs * sizeof(MVMGrapheme32));
    MVM_string_gi_init(tc, &iter, sep);
    while (idx < num_graphs) {
        sep_spec->sep_graphemes[idx] = MVM_string_gi_get_grapheme(tc, &iter);
        idx++;
    }
}
4 changes: 3 additions & 1 deletion src/strings/decode_stream.h
Expand Up @@ -75,10 +75,12 @@ void MVM_string_decodestream_add_bytes(MVMThreadContext *tc, MVMDecodeStream *ds
void MVM_string_decodestream_add_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 *chars, MVMint32 length);
void MVM_string_decodestream_discard_to(MVMThreadContext *tc, MVMDecodeStream *ds, const MVMDecodeStreamBytes *bytes, MVMint32 pos);
MVMString * MVM_string_decodestream_get_chars(MVMThreadContext *tc, MVMDecodeStream *ds, MVMint32 chars);
MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMGrapheme32 sep);
MVMString * MVM_string_decodestream_get_until_sep(MVMThreadContext *tc, MVMDecodeStream *ds, MVMDecodeStreamSeparators *seps);
MVMString * MVM_string_decodestream_get_all(MVMThreadContext *tc, MVMDecodeStream *ds);
MVMint64 MVM_string_decodestream_have_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds, MVMint32 bytes);
MVMint64 MVM_string_decodestream_bytes_to_buf(MVMThreadContext *tc, MVMDecodeStream *ds, char **buf, MVMint32 bytes);
MVMint64 MVM_string_decodestream_tell_bytes(MVMThreadContext *tc, const MVMDecodeStream *ds);
MVMint32 MVM_string_decodestream_is_empty(MVMThreadContext *tc, MVMDecodeStream *ds);
void MVM_string_decodestream_destory(MVMThreadContext *tc, MVMDecodeStream *ds);
void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec);
/* Sets sep_spec to a single separator built from the graphemes of str. The
 * name must match the definition in decode_stream.c and its callers. */
void MVM_string_decode_stream_maybe_sep_from_string(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec, MVMString *str);

0 comments on commit 81fb7c1

Please sign in to comment.