Skip to content

Commit

Permalink
Merge pull request #7503 from IQSS/7188-utf8-filenames
Browse files Browse the repository at this point in the history
support download of UTF-8 filenames #7188
  • Loading branch information
kcondon committed Jan 11, 2021
2 parents 382cba9 + 074ac6a commit 0bcdcfd
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 3 deletions.
9 changes: 9 additions & 0 deletions doc/release-notes/7188-utf-8-filenames.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
## Notes for Tool Developers and Integrators

### UTF-8 Characters and Spaces in File Names

UTF-8 characters in file names are now preserved when files are downloaded.

Dataverse installations will no longer replace spaces in the file names of downloaded files with the `+` character. If your tool or integration has any special handling around this, you may need to make further adjustments to maintain backwards compatibility while also supporting Dataverse installations on 5.4+.

Note that this follows a change in 5.1 that corrected the behavior only for installations running with S3 storage. The current change makes the behavior consistent across installations, regardless of the type of file storage they use.
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import java.io.FileInputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
Expand Down Expand Up @@ -301,9 +302,11 @@ public void writeTo(DownloadInstance di, Class<?> clazz, Type type, Annotation[]

// Provide both the "Content-disposition" and "Content-Type" headers,
// to satisfy the widest selection of browsers out there.

httpHeaders.add("Content-disposition", "attachment; filename=\"" + fileName + "\"");
httpHeaders.add("Content-Type", mimeType + "; name=\"" + fileName + "\"");
// Encode the filename as UTF-8, then deal with spaces. "encode" changes
// a space to + so we change it back to a space (%20).
String finalFileName = URLEncoder.encode(fileName, "UTF-8").replaceAll("\\+", "%20");
httpHeaders.add("Content-disposition", "attachment; filename=\"" + finalFileName + "\"");
httpHeaders.add("Content-Type", mimeType + "; name=\"" + finalFileName + "\"");

long contentSize;
boolean useChunkedTransfer = false;
Expand Down
83 changes: 83 additions & 0 deletions src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import com.jayway.restassured.RestAssured;
import com.jayway.restassured.path.json.JsonPath;
import com.jayway.restassured.response.Headers;
import com.jayway.restassured.response.Response;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
Expand All @@ -13,6 +15,7 @@
import java.util.HashSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;
import static javax.ws.rs.core.Response.Status.CREATED;
import static javax.ws.rs.core.Response.Status.FORBIDDEN;
import static javax.ws.rs.core.Response.Status.OK;
Expand Down Expand Up @@ -367,6 +370,86 @@ public void downloadAllFilesTabular() throws IOException {
Assert.assertEquals(new HashSet<>(Arrays.asList("50by1000.dta", "MANIFEST.TXT")), gatherFilenames(downloadFiles2.getBody().asInputStream()));
}

/**
 * Download a file whose name contains both a space and a non-ASCII UTF-8
 * character (an en-dash).
 *
 * The file is uploaded inside a zip via SWORD, then downloaded twice: once
 * individually (asserting on the percent-encoded file name in the response
 * headers) and once as part of the "download all files" zip (asserting on
 * the entry names found in that zip).
 *
 * @throws IOException if the temporary zip file cannot be created or
 * written, or if the downloaded zip cannot be read.
 */
@Test
public void downloadFilenameUtf8() throws IOException {

    Response createUser = UtilIT.createRandomUser();
    createUser.prettyPrint();
    createUser.then().assertThat()
            .statusCode(OK.getStatusCode());
    String username = UtilIT.getUsernameFromResponse(createUser);
    String apiToken = UtilIT.getApiTokenFromResponse(createUser);

    Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken);
    createDataverseResponse.prettyPrint();
    createDataverseResponse.then().assertThat()
            .statusCode(CREATED.getStatusCode());

    String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse);

    Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken);
    createDataset.prettyPrint();
    createDataset.then().assertThat()
            .statusCode(CREATED.getStatusCode());

    Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset);
    String datasetPid = UtilIT.getDatasetPersistentIdFromResponse(createDataset);

    // Put a filename with an en-dash ("MY READ–ME.md") into a zip file.
    Path pathtoTempDir = Files.createTempDirectory(null);
    String pathToZipFile = pathtoTempDir.resolve("test.zip").toString();
    // Pin the charset so the file content does not depend on the platform default.
    byte[] data = "This is my README.".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    // try-with-resources closes the zip (and the underlying file stream) even
    // if writing the entry fails part-way through.
    try (ZipOutputStream out = new ZipOutputStream(new FileOutputStream(pathToZipFile))) {
        out.putNextEntry(new ZipEntry("MY READ–ME.md"));
        out.write(data, 0, data.length);
        out.closeEntry();
    }

    // We upload via SWORD (as a zip) because the native API gives this error:
    // "Constraint violation found in FileMetadata. File Name cannot contain any
    // of the following characters: / : * ? " < > | ; # . The invalid value is "READ?ME.md"."
    // This error probably has something to do with the way REST Assured sends the filename
    // to the native API. The en-dash is turned into question mark, which is disallowed.
    Response uploadViaSword = UtilIT.uploadZipFileViaSword(datasetPid, pathToZipFile, apiToken);
    uploadViaSword.prettyPrint();
    uploadViaSword.then().assertThat()
            .statusCode(CREATED.getStatusCode());

    Response getDatasetJson = UtilIT.nativeGet(datasetId, apiToken);
    getDatasetJson.then().assertThat()
            .statusCode(OK.getStatusCode());

    int fileId = JsonPath.from(getDatasetJson.getBody().asString()).getInt("data.latestVersion.files[0].dataFile.id");

    // Download the file individually and assert READ–ME.md has an en-dash.
    // Integer.valueOf replaces the deprecated Integer(int) constructor.
    Response downloadFile = UtilIT.downloadFile(Integer.valueOf(fileId), apiToken);
    downloadFile.then().assertThat()
            .statusCode(OK.getStatusCode());
    Headers headers = downloadFile.getHeaders();
    // In "MY READ–ME.md" below the space is %20 and the en-dash ("–") is "%E2%80%93" (e2 80 93 in hex).
    Assert.assertEquals("attachment; filename=\"MY%20READ%E2%80%93ME.md\"", headers.getValue("Content-disposition"));
    Assert.assertEquals("text/markdown; name=\"MY%20READ%E2%80%93ME.md\";charset=UTF-8", headers.getValue("Content-Type"));

    // Download all files as a zip and assert "MY READ–ME.md" has an en-dash.
    Response downloadFiles = UtilIT.downloadFiles(datasetPid, apiToken);
    downloadFiles.then().assertThat()
            .statusCode(OK.getStatusCode());

    HashSet<String> filenamesFound = gatherFilenames(downloadFiles.getBody().asInputStream());

    // Note that a MANIFEST.TXT file is added.
    // "MY READ–ME.md" (with an en-dash) is correctly extracted from the downloaded zip
    HashSet<String> expectedFiles = new HashSet<>(Arrays.asList("MANIFEST.TXT", "MY READ–ME.md"));
    Assert.assertEquals(expectedFiles, filenamesFound);
}

private HashSet<String> gatherFilenames(InputStream inputStream) throws IOException {
HashSet<String> filenamesFound = new HashSet<>();
try (ZipInputStream zipStream = new ZipInputStream(inputStream)) {
Expand Down

0 comments on commit 0bcdcfd

Please sign in to comment.