Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

replace instead of ignore so that we can update the offset #1335

Merged
merged 3 commits into from Nov 21, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -27,6 +27,8 @@

@Singleton
public class SandboxManager {
private static final String REPLACEMENT_CHARACTER = "\ufffd";
private static final String TWO_REPLACEMENT_CHARACTERS = REPLACEMENT_CHARACTER + REPLACEMENT_CHARACTER;

private final AsyncHttpClient asyncHttpClient;
private final ObjectMapper objectMapper;
Expand Down Expand Up @@ -115,17 +117,47 @@ public Optional<MesosFileChunkObject> read(String slaveHostname, String fullPath
}

/**
* this method will first sanitize the input by throwing away invalid UTF8 characters before sending it to
* Jackson for parsing. We can get invalid UTF8 data if there is a multibyte character at the boundary
* of our fetch
* This method will first sanitize the input by replacing invalid UTF8 characters with \ufffd (Unicode's "REPLACEMENT CHARACTER")
* before sending it to Jackson for parsing. We then strip the replacement characters characters from the beginning and end of the string
* and increment the offset field by how many characters were stripped from the beginning.
*/
@VisibleForTesting
MesosFileChunkObject parseResponseBody(Response response) throws IOException {
// not thread-safe, need to make a new one each time;
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.IGNORE);
CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPLACE);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe call replaceWith(REPLACEMENT_CHARACTER) to be more explicit?


ByteBuffer responseBuffer = response.getResponseBodyAsByteBuffer();
Reader sanitizedReader = CharSource.wrap(decoder.decode(responseBuffer)).openStream();
return objectMapper.readValue(sanitizedReader, MesosFileChunkObject.class);
final MesosFileChunkObject initialChunk = objectMapper.readValue(sanitizedReader, MesosFileChunkObject.class);

// bail early if no replacement characters
if (!initialChunk.getData().startsWith(REPLACEMENT_CHARACTER) && !initialChunk.getData().endsWith(REPLACEMENT_CHARACTER)) {
return initialChunk;
}

final String data = initialChunk.getData();

// if we requested data between two characters, return nothing and advance the offset to the end
if (data.length() <= 4 && data.replace(REPLACEMENT_CHARACTER, "").length() == 0) {
return new MesosFileChunkObject("", initialChunk.getOffset() + data.length(), Optional.<Long>absent());
}

// trim incomplete character at the beginning of the string
int startIndex = 0;
if (data.startsWith(TWO_REPLACEMENT_CHARACTERS)) {
startIndex = 2;
} else if (data.startsWith(REPLACEMENT_CHARACTER)) {
startIndex = 1;
}

// trim incomplete character at the end of the string
int endIndex = data.length();
if (data.endsWith(TWO_REPLACEMENT_CHARACTERS)) {
endIndex -= 2;
} else if (data.endsWith(REPLACEMENT_CHARACTER)) {
endIndex -= 1;
}

return new MesosFileChunkObject(data.substring(startIndex, endIndex), initialChunk.getOffset() + startIndex, Optional.<Long>absent());
}
}
Expand Up @@ -17,43 +17,99 @@
import com.ning.http.client.Response;

public class SandboxManagerTest extends SingularityTestBaseNoDb {
private static final byte[] SNOWMAN_BYTES = "☃".getBytes(StandardCharsets.UTF_8);
private static final byte FIRST_SNOWMAN_BYTE = SNOWMAN_BYTES[0];
private static final byte SECOND_SNOWMAN_BYTE = SNOWMAN_BYTES[1];
private static final int DEFAULT_OFFSET = 123;

private static final String JSON_START = "{\"data\":\"";
private static final String JSON_END = "\",\"offset\":" + DEFAULT_OFFSET + "}";

private static final String SNOWMAN = "☃";
private static final byte[] SNOWMAN_UTF8_BYTES = SNOWMAN.getBytes(StandardCharsets.UTF_8);
private static final byte FIRST_SNOWMAN_BYTE = SNOWMAN_UTF8_BYTES[0];
private static final byte SECOND_SNOWMAN_BYTE = SNOWMAN_UTF8_BYTES[1];

private static final String BALLOON = "\uD83C\uDF88";
private static final byte[] BALLOON_BYTES = BALLOON.getBytes(StandardCharsets.UTF_8);
private static final byte SECOND_BALLOON_BYTE = BALLOON_BYTES[1];
private static final byte THIRD_BALLOON_BYTE = BALLOON_BYTES[2];

@Inject
private SandboxManager sandboxManager;

@Test
public void testInvalidUtf8WithOneByteOfThreeByteCharacter() throws IOException {
// data contains a ☃ character and the first byte of another ☃ character
byte[] bytes = toBytes("{\"data\":\"", SNOWMAN_BYTES, FIRST_SNOWMAN_BYTE, "\",\"offset\":123}");
byte[] bytes = toBytes(JSON_START, SNOWMAN_UTF8_BYTES, FIRST_SNOWMAN_BYTE, JSON_END);

MesosFileChunkObject chunk = sandboxManager.parseResponseBody(response(bytes));
// the partial ☃ should be dropped
assertThat(chunk.getData()).isEqualTo("☃");
assertThat(chunk.getOffset()).isEqualTo(123);
assertThat(chunk.getData()).isEqualTo(SNOWMAN);
assertThat(chunk.getOffset()).isEqualTo(DEFAULT_OFFSET);
}

@Test
public void testInvalidUtf8WithTwoBytesOfThreeByteCharacter() throws IOException {
// data contains a ☃ character and the first two bytes of another ☃ character
byte[] bytes = toBytes("{\"data\":\"", SNOWMAN_BYTES, FIRST_SNOWMAN_BYTE, SECOND_SNOWMAN_BYTE, "\",\"offset\":123}");
byte[] bytes = toBytes(JSON_START, SNOWMAN_UTF8_BYTES, FIRST_SNOWMAN_BYTE, SECOND_SNOWMAN_BYTE, JSON_END);

MesosFileChunkObject chunk = sandboxManager.parseResponseBody(response(bytes));
// the partial ☃ should be dropped
assertThat(chunk.getData()).isEqualTo("☃");
assertThat(chunk.getOffset()).isEqualTo(123);
assertThat(chunk.getData()).isEqualTo(SNOWMAN);
assertThat(chunk.getOffset()).isEqualTo(DEFAULT_OFFSET);
}

@Test
public void testValidUtf8WithThreeByteCharacters() throws IOException {
// data contains two ☃ characters
byte[] bytes = toBytes("{\"data\":\"", SNOWMAN_BYTES, SNOWMAN_BYTES, "\",\"offset\":123}");
byte[] bytes = toBytes(JSON_START, SNOWMAN_UTF8_BYTES, SNOWMAN_UTF8_BYTES, JSON_END);

MesosFileChunkObject chunk = sandboxManager.parseResponseBody(response(bytes));
// nothing should be dropped
assertThat(chunk.getData()).isEqualTo(SNOWMAN + SNOWMAN);
assertThat(chunk.getOffset()).isEqualTo(DEFAULT_OFFSET);
}

@Test
public void testInvalidUtf8WithLastByte() throws IOException {
// data contains last byte of a fire character and a ☃ character
byte[] bytes = toBytes(JSON_START, THIRD_BALLOON_BYTE, SNOWMAN_UTF8_BYTES, JSON_END);

MesosFileChunkObject chunk = sandboxManager.parseResponseBody(response(bytes));
// the partial fire should be dropped and the offset should be advanced by one byte
assertThat(chunk.getData()).isEqualTo(SNOWMAN);
assertThat(chunk.getOffset()).isEqualTo(DEFAULT_OFFSET + 1);
}

@Test
public void testInvalidUtf8WithLastTwoBytes() throws IOException {
// data contains last two bytes of a fire character and a ☃ character
byte[] bytes = toBytes(JSON_START, SECOND_BALLOON_BYTE, THIRD_BALLOON_BYTE, SNOWMAN_UTF8_BYTES, JSON_END);

MesosFileChunkObject chunk = sandboxManager.parseResponseBody(response(bytes));
// the partial fire should be dropped and the offset should be advanced by two bytes
assertThat(chunk.getData()).isEqualTo(SNOWMAN);
assertThat(chunk.getOffset()).isEqualTo(DEFAULT_OFFSET + 2);
}

@Test
public void testInvalidUtf8WithOneByte() throws IOException {
// data contains the last middle byte of a fire character
byte[] bytes = toBytes(JSON_START, SECOND_BALLOON_BYTE, JSON_END);

MesosFileChunkObject chunk = sandboxManager.parseResponseBody(response(bytes));
// the partial fire should be dropped and the offset should be advanced by one byte
assertThat(chunk.getData()).isEqualTo("");
assertThat(chunk.getOffset()).isEqualTo(DEFAULT_OFFSET + 1);
}

@Test
public void testInvalidUtf8WithTwoBytes() throws IOException {
// data contains the last two bytes of a fire character
byte[] bytes = toBytes(JSON_START, SECOND_BALLOON_BYTE, THIRD_BALLOON_BYTE, JSON_END);

MesosFileChunkObject chunk = sandboxManager.parseResponseBody(response(bytes));
assertThat(chunk.getData()).isEqualTo("☃☃");
assertThat(chunk.getOffset()).isEqualTo(123);
// the partial fire should be dropped and the offset should be advanced by two bytes
assertThat(chunk.getData()).isEqualTo("");
assertThat(chunk.getOffset()).isEqualTo(DEFAULT_OFFSET + 2);
}

private static byte[] toBytes(Object... objects) throws IOException {
Expand Down