diff --git a/src/main/webapp/filesFragment.xhtml b/src/main/webapp/filesFragment.xhtml
index a74ef7ddbbf..c91fb368f97 100644
--- a/src/main/webapp/filesFragment.xhtml
+++ b/src/main/webapp/filesFragment.xhtml
@@ -370,7 +370,7 @@
-
+
@@ -631,7 +631,7 @@
-
+
diff --git a/src/main/webapp/search-include-fragment.xhtml b/src/main/webapp/search-include-fragment.xhtml
index c07fd77ff7b..cdf0f1de1a7 100644
--- a/src/main/webapp/search-include-fragment.xhtml
+++ b/src/main/webapp/search-include-fragment.xhtml
@@ -581,7 +581,7 @@
diff --git a/src/test/java/edu/harvard/iq/dataverse/DataFileServiceBeanTest.java b/src/test/java/edu/harvard/iq/dataverse/DataFileServiceBeanTest.java
index 92a1f6a6b17..136916cf449 100644
--- a/src/test/java/edu/harvard/iq/dataverse/DataFileServiceBeanTest.java
+++ b/src/test/java/edu/harvard/iq/dataverse/DataFileServiceBeanTest.java
@@ -186,8 +186,8 @@ public void testIsThumbnailSupportedForSize() throws Exception {
*/
@Test
public void testGetFileClass() throws Exception {
- assertEquals("other", dataFileServiceBean.getFileClass(fileWoContentType));
- assertEquals("other", dataFileServiceBean.getFileClass(fileWithBogusContentType));
+ assertEquals("other", dataFileServiceBean.getFileThumbnailClass(fileWoContentType));
+ assertEquals("other", dataFileServiceBean.getFileThumbnailClass(fileWithBogusContentType));
}
/**
diff --git a/src/test/java/edu/harvard/iq/dataverse/api/FileTypeDetectionIT.java b/src/test/java/edu/harvard/iq/dataverse/api/FileTypeDetectionIT.java
new file mode 100644
index 00000000000..8e38a0da2f2
--- /dev/null
+++ b/src/test/java/edu/harvard/iq/dataverse/api/FileTypeDetectionIT.java
@@ -0,0 +1,207 @@
+package edu.harvard.iq.dataverse.api;
+
+import com.jayway.restassured.path.json.JsonPath;
+import com.jayway.restassured.response.Response;
+import javax.json.Json;
+import javax.json.JsonObjectBuilder;
+import static javax.ws.rs.core.Response.Status.CREATED;
+import static javax.ws.rs.core.Response.Status.OK;
+import static javax.ws.rs.core.Response.Status.UNAUTHORIZED;
+import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.nullValue;
+import org.junit.Test;
+
+public class FileTypeDetectionIT {
+    /** Checks the content type reported for three uploads; only README.md is sent with an explicit MIME type. */
+    @Test
+    public void testOverrideMimeType() {
+        Response createUser = UtilIT.createRandomUser();
+        createUser.prettyPrint();
+        createUser.then().assertThat()
+                .statusCode(OK.getStatusCode());
+        // The username is not needed: every later call authenticates with the API token.
+        String apiToken = UtilIT.getApiTokenFromResponse(createUser);
+
+        Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken);
+        createDataverseResponse.prettyPrint();
+        createDataverseResponse.then().assertThat()
+                .statusCode(CREATED.getStatusCode());
+
+        String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse);
+
+        Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken);
+        createDataset.prettyPrint();
+        createDataset.then().assertThat()
+                .statusCode(CREATED.getStatusCode());
+
+        Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset);
+
+        String readmeFile = "README.md";
+
+        JsonObjectBuilder readmeFileMetadata = Json.createObjectBuilder()
+                .add("description", "How to run the code on the data.")
+                .add("categories", Json.createArrayBuilder()
+                        .add("Documentation")
+                );
+
+        // Markdown media type: https://tools.ietf.org/html/rfc7763
+        String overrideMimeType = "text/markdown";
+        Response addReadme = UtilIT.uploadFileViaNative(datasetId.toString(), readmeFile, readmeFileMetadata.build().toString(), overrideMimeType, apiToken);
+        addReadme.prettyPrint();
+        addReadme.then().assertThat()
+                .statusCode(OK.getStatusCode())
+                .body("data.files[0].categories[0]", equalTo("Documentation"))
+                .body("data.files[0].dataFile.contentType", equalTo("text/markdown"))
+                .body("data.files[0].dataFile.description", equalTo("How to run the code on the data."))
+                .body("data.files[0].directoryLabel", nullValue())
+                .body("data.files[0].dataFile.tags", nullValue())
+                .body("data.files[0].dataFile.tabularTags", nullValue())
+                .body("data.files[0].label", equalTo("README.md"))
+                // the description is asserted at both JSON paths; why the API reports it twice is not established here
+                .body("data.files[0].description", equalTo("How to run the code on the data."));
+
+        String jupyterNotebook = "src/test/java/edu/harvard/iq/dataverse/util/irc-metrics.ipynb";
+
+        JsonObjectBuilder jupyterNotebookMetadata = Json.createObjectBuilder()
+                .add("description", "Jupyter Notebook showing IRC metrics.")
+                .add("directoryLabel", "code")
+                .add("categories", Json.createArrayBuilder()
+                        .add("Code")
+                );
+
+        Response addCode = UtilIT.uploadFileViaNative(datasetId.toString(), jupyterNotebook, jupyterNotebookMetadata.build(), apiToken);
+        addCode.prettyPrint();
+        addCode.then().assertThat()
+                .statusCode(OK.getStatusCode())
+                .body("data.files[0].categories[0]", equalTo("Code"))
+                .body("data.files[0].dataFile.contentType", equalTo("application/x-ipynb+json"))
+                .body("data.files[0].dataFile.description", equalTo("Jupyter Notebook showing IRC metrics."))
+                .body("data.files[0].directoryLabel", equalTo("code"))
+                .body("data.files[0].dataFile.tags", nullValue())
+                .body("data.files[0].dataFile.tabularTags", nullValue())
+                .body("data.files[0].label", equalTo("irc-metrics.ipynb"))
+                // the description is asserted at both JSON paths; why the API reports it twice is not established here
+                .body("data.files[0].description", equalTo("Jupyter Notebook showing IRC metrics."));
+
+        String tsvFile = "src/test/java/edu/harvard/iq/dataverse/util/irclog.tsv";
+
+        JsonObjectBuilder tsvFileMetadata = Json.createObjectBuilder()
+                .add("description", "TSV file of Dataverse IRC logs.")
+                .add("directoryLabel", "data")
+                .add("categories", Json.createArrayBuilder()
+                        .add("Data")
+                );
+
+        Response addData = UtilIT.uploadFileViaNative(datasetId.toString(), tsvFile, tsvFileMetadata.build(), apiToken);
+        addData.prettyPrint();
+        addData.then().assertThat()
+                .statusCode(OK.getStatusCode())
+                .body("data.files[0].categories[0]", equalTo("Data"))
+                .body("data.files[0].dataFile.contentType", equalTo("text/tsv"))
+                .body("data.files[0].dataFile.description", equalTo("TSV file of Dataverse IRC logs."))
+                .body("data.files[0].directoryLabel", equalTo("data"))
+                .body("data.files[0].dataFile.tags", nullValue())
+                .body("data.files[0].dataFile.tabularTags", nullValue())
+                .body("data.files[0].label", equalTo("irclog.tsv"))
+                // the description is asserted at both JSON paths; why the API reports it twice is not established here
+                .body("data.files[0].description", equalTo("TSV file of Dataverse IRC logs."));
+    }
+
+    /** Uploads a PDF under a bogus MIME type, then checks that the redetect API corrects it only when dryRun is false. */
+    @Test
+    public void testRedetectMimeType() {
+        Response createUser = UtilIT.createRandomUser();
+        createUser.prettyPrint();
+        createUser.then().assertThat()
+                .statusCode(OK.getStatusCode());
+        // The username is not needed: every later call authenticates with the API token.
+        String apiToken = UtilIT.getApiTokenFromResponse(createUser);
+
+        Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken);
+        createDataverseResponse.prettyPrint();
+        createDataverseResponse.then().assertThat()
+                .statusCode(CREATED.getStatusCode());
+
+        String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse);
+
+        Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken);
+        createDataset.prettyPrint();
+        createDataset.then().assertThat()
+                .statusCode(CREATED.getStatusCode());
+
+        Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset);
+
+        String filePath = "scripts/issues/1380/dvs.pdf";
+
+        JsonObjectBuilder readmeFileMetadata = Json.createObjectBuilder()
+                .add("description", "This is a PDF.")
+                .add("categories", Json.createArrayBuilder()
+                        .add("Documentation")
+                );
+
+        /*
+         * We are overriding the MIME type here because even though Dataverse
+         * knows how to figure out what a PDF is we want to pretend it doesn't
+         * so that we can later try the "redetect file type" API.
+         */
+        String overrideMimeType = "foo/bar";
+        Response addFileUnknownType = UtilIT.uploadFileViaNative(datasetId.toString(), filePath, readmeFileMetadata.build().toString(), overrideMimeType, apiToken);
+        addFileUnknownType.prettyPrint();
+        addFileUnknownType.then().assertThat()
+                .statusCode(OK.getStatusCode())
+                .body("data.files[0].categories[0]", equalTo("Documentation"))
+                .body("data.files[0].dataFile.contentType", equalTo("foo/bar"))
+                .body("data.files[0].dataFile.description", equalTo("This is a PDF."))
+                .body("data.files[0].directoryLabel", nullValue())
+                .body("data.files[0].dataFile.tags", nullValue())
+                .body("data.files[0].dataFile.tabularTags", nullValue())
+                .body("data.files[0].label", equalTo("dvs.pdf"))
+                // the description is asserted at both JSON paths; why the API reports it twice is not established here
+                .body("data.files[0].description", equalTo("This is a PDF."));
+
+        Long fileId = JsonPath.from(addFileUnknownType.asString()).getLong("data.files[0].dataFile.id");
+        System.out.println("file id: " + fileId);
+        boolean dryRunTrue = true;
+        Response redetectDryRun = UtilIT.redetectFileType(fileId.toString(), dryRunTrue, apiToken);
+        redetectDryRun.prettyPrint();
+        redetectDryRun.then().assertThat()
+                .statusCode(OK.getStatusCode())
+                .body("data.dryRun", equalTo(true))
+                .body("data.oldContentType", equalTo("foo/bar"))
+                .body("data.newContentType", equalTo("application/pdf"));
+
+        Response createNoPrivsUser = UtilIT.createRandomUser();
+        createNoPrivsUser.prettyPrint();
+        createNoPrivsUser.then().assertThat()
+                .statusCode(OK.getStatusCode());
+        // This second, freshly created user is expected to be refused by the redetect call below.
+        String noPrivsApiToken = UtilIT.getApiTokenFromResponse(createNoPrivsUser);
+
+        Response noPrivsAttempt = UtilIT.redetectFileType(fileId.toString(), dryRunTrue, noPrivsApiToken);
+        noPrivsAttempt.then().assertThat()
+                .statusCode(UNAUTHORIZED.getStatusCode());
+
+        Response noChange = UtilIT.nativeGet(datasetId, apiToken);
+        noChange.prettyPrint();
+        noChange.then().assertThat()
+                .statusCode(OK.getStatusCode())
+                .body("data.latestVersion.files[0].dataFile.contentType", equalTo("foo/bar"));
+
+        boolean dryRunFalse = false;
+        Response redetectAndChange = UtilIT.redetectFileType(fileId.toString(), dryRunFalse, apiToken);
+        redetectAndChange.prettyPrint();
+        redetectAndChange.then().assertThat()
+                .statusCode(OK.getStatusCode())
+                .body("data.dryRun", equalTo(false))
+                .body("data.oldContentType", equalTo("foo/bar"))
+                .body("data.newContentType", equalTo("application/pdf"));
+
+        Response databaseChanged = UtilIT.nativeGet(datasetId, apiToken);
+        databaseChanged.prettyPrint();
+        databaseChanged.then().assertThat()
+                .statusCode(OK.getStatusCode())
+                .body("data.latestVersion.files[0].dataFile.contentType", equalTo("application/pdf"));
+
+    }
+
+}
diff --git a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java
index 4487a0553ae..f23d480632e 100644
--- a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java
+++ b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java
@@ -558,10 +558,15 @@ static Response uploadFileViaNative(String datasetId, String pathToFile, JsonObj
}
static Response uploadFileViaNative(String datasetId, String pathToFile, String jsonAsString, String apiToken) {
+ String nullMimeType = null; // no MIME type override: null is handed to the five-argument overload as its mimeType
+ return uploadFileViaNative(datasetId, pathToFile, jsonAsString, nullMimeType, apiToken);
+ }
+
+ static Response uploadFileViaNative(String datasetId, String pathToFile, String jsonAsString, String mimeType, String apiToken) {
RequestSpecification requestSpecification = given()
.header(API_TOKEN_HTTP_HEADER, apiToken)
.multiPart("datasetId", datasetId)
- .multiPart("file", new File(pathToFile));
+ .multiPart("file", new File(pathToFile), mimeType);
if (jsonAsString != null) {
requestSpecification.multiPart("jsonData", jsonAsString);
}
@@ -701,6 +706,12 @@ static Response testIngest(String fileName, String fileType) {
.get("/api/ingest/test/file?fileName=" + fileName + "&fileType=" + fileType);
}
+    static Response redetectFileType(String fileId, boolean dryRun, String apiToken) {
+        // POSTs to the file's "redetect" endpoint, forwarding dryRun as a query parameter.
+        String redetectUrl = "/api/files/" + fileId + "/redetect?dryRun=" + dryRun;
+        return given().header(API_TOKEN_HTTP_HEADER, apiToken).post(redetectUrl);
+    }
+
static Response getSwordAtomEntry(String persistentId, String apiToken) {
Response response = given()
.auth().basic(apiToken, EMPTY_STRING)
@@ -822,7 +833,14 @@ public static Response deleteUser(String username) {
.delete("/api/admin/authenticatedUsers/" + username + "/");
return deleteUserResponse;
}
-
+
+    public static Response reingestFile(Long fileId, String apiToken) {
+        // POSTs to the file's "reingest" endpoint, sending the API token in the token header.
+        return given()
+                .header(API_TOKEN_HTTP_HEADER, apiToken)
+                .post("/api/files/" + fileId + "/reingest");
+    }
+
public static Response uningestFile(Long fileId, String apiToken) {
Response uningestFileResponse = given()
diff --git a/src/test/java/edu/harvard/iq/dataverse/util/BundleUtilTest.java b/src/test/java/edu/harvard/iq/dataverse/util/BundleUtilTest.java
index c34ab81c7f5..8889d492829 100644
--- a/src/test/java/edu/harvard/iq/dataverse/util/BundleUtilTest.java
+++ b/src/test/java/edu/harvard/iq/dataverse/util/BundleUtilTest.java
@@ -74,7 +74,7 @@ public void testGetStringFromBundleWithArgumentsAndSpecificBundle() {
@Test
public void testStringFromPropertyFile() {
- assertEquals("ZIP", BundleUtil.getStringFromPropertyFile("application/zip","MimeTypeFacets"));
+ assertEquals("Archive", BundleUtil.getStringFromPropertyFile("application/zip","MimeTypeFacets"));
}
//To assure that the MissingResourceException bubble up from this call
diff --git a/src/test/java/edu/harvard/iq/dataverse/util/FileTypeDetectionTest.java b/src/test/java/edu/harvard/iq/dataverse/util/FileTypeDetectionTest.java
new file mode 100644
index 00000000000..5d2b9b4d56a
--- /dev/null
+++ b/src/test/java/edu/harvard/iq/dataverse/util/FileTypeDetectionTest.java
@@ -0,0 +1,42 @@
+package edu.harvard.iq.dataverse.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.apache.commons.io.FileUtils;
+import org.junit.AfterClass;
+import static org.junit.Assert.assertEquals;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class FileTypeDetectionTest {
+    // Base directory that setUpClass() publishes as the com.sun.aas.instanceRoot system property.
+    static String baseDirForConfigFiles = "/tmp";
+    /** Sets com.sun.aas.instanceRoot and copies conf/jhove/jhove.conf to config/jhove.conf under the base directory. */
+    @BeforeClass
+    public static void setUpClass() {
+        System.setProperty("com.sun.aas.instanceRoot", baseDirForConfigFiles);
+        String testFile1Src = "conf/jhove/jhove.conf";
+        String testFile1Tmp = baseDirForConfigFiles + "/config/jhove.conf";
+        try {
+            FileUtils.copyFile(new File(testFile1Src), new File(testFile1Tmp));
+        } catch (IOException ex) {
+            Logger.getLogger(FileTypeDetectionTest.class.getName()).log(Level.SEVERE, "Unable to copy " + testFile1Src + " to " + testFile1Tmp, ex);
+        }
+    }
+
+    @AfterClass
+    public static void tearDownClass() {
+        // SiteMapUtilTest relies on com.sun.aas.instanceRoot being null.
+        System.clearProperty("com.sun.aas.instanceRoot");
+    }
+
+    @Test
+    public void testDetermineFileTypeJupyterNoteboook() throws Exception {
+        File file = new File("src/test/java/edu/harvard/iq/dataverse/util/irc-metrics.ipynb");
+        // Expected MIME type per https://jupyter.readthedocs.io/en/latest/reference/mimetype.html
+        assertEquals("application/x-ipynb+json", FileTypeDetection.determineFileType(file));
+    }
+
+}
diff --git a/src/test/java/edu/harvard/iq/dataverse/util/JhoveFileTypeTest.java b/src/test/java/edu/harvard/iq/dataverse/util/JhoveFileTypeTest.java
new file mode 100644
index 00000000000..88a8d24c772
--- /dev/null
+++ b/src/test/java/edu/harvard/iq/dataverse/util/JhoveFileTypeTest.java
@@ -0,0 +1,90 @@
+package edu.harvard.iq.dataverse.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.apache.commons.io.FileUtils;
+import org.junit.AfterClass;
+import static org.junit.Assert.assertEquals;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class JhoveFileTypeTest {
+    // Fixtures shared by all tests; assigned once in setUpClass().
+    static JhoveFileType jhoveFileType;
+    static String baseDirForConfigFiles = "/tmp";
+    static File png;
+    static File gif;
+    static File jpg;
+    static File pdf;
+    static File zip;
+    static File xml;
+    static File html;
+    static File ico;
+    static File ipynb;
+
+    @BeforeClass
+    public static void setUpClass() {
+        System.setProperty("com.sun.aas.instanceRoot", baseDirForConfigFiles);
+        jhoveFileType = new JhoveFileType();
+        copyConfigIntoPlace();
+
+        png = new File("src/test/resources/images/coffeeshop.png");
+        gif = new File("src/main/webapp/resources/images/ajax-loading.gif");
+        jpg = new File("src/main/webapp/resources/images/dataverseproject_logo.jpg");
+        pdf = new File("scripts/issues/1380/dvs.pdf");
+        zip = new File("src/test/resources/doi-10-5072-fk2hyixmyv1.0.zip");
+        xml = new File("pom.xml");
+        html = new File("src/main/webapp/mydata_templates/mydata.html");
+        ico = new File("src/main/webapp/resources/images/fav/favicon.ico");
+        ipynb = new File("src/test/java/edu/harvard/iq/dataverse/util/irc-metrics.ipynb");
+    }
+
+    @AfterClass
+    public static void tearDownClass() {
+        // SiteMapUtilTest relies on com.sun.aas.instanceRoot being null.
+        System.clearProperty("com.sun.aas.instanceRoot");
+    }
+
+    @Test
+    public void testGetFileMimeType() {
+        System.out.println("getFileMimeType");
+        // Detected precisely.
+        assertEquals("image/png", jhoveFileType.getFileMimeType(png));
+        assertEquals("image/gif", jhoveFileType.getFileMimeType(gif));
+        assertEquals("image/jpeg", jhoveFileType.getFileMimeType(jpg));
+        assertEquals("application/pdf", jhoveFileType.getFileMimeType(pdf));
+        // Not recognised: reported as the generic application/octet-stream.
+        assertEquals("application/octet-stream", jhoveFileType.getFileMimeType(zip));
+        assertEquals("application/octet-stream", jhoveFileType.getFileMimeType(ico));
+        // Only recognised as plain text, not as XML, HTML or a notebook.
+        assertEquals("text/plain; charset=US-ASCII", jhoveFileType.getFileMimeType(xml));
+        assertEquals("text/plain; charset=US-ASCII", jhoveFileType.getFileMimeType(html));
+        assertEquals("text/plain; charset=US-ASCII", jhoveFileType.getFileMimeType(ipynb));
+    }
+
+    @Test
+    public void testCheckFileType() {
+        System.out.println("checkFileType");
+        // Use a local instance rather than overwriting the static fixture shared by the other tests.
+        assertEquals(543938, new JhoveFileType().checkFileType(png).getSize());
+    }
+
+    @Test
+    public void testGetJhoveConfigFile() {
+        System.out.println("getJhoveConfigFile");
+        assertEquals(baseDirForConfigFiles + "/config/jhove.conf", JhoveFileType.getJhoveConfigFile());
+    }
+    /** Copies conf/jhove/jhove.conf to config/jhove.conf under the base directory; a failure is logged, not rethrown. */
+    private static void copyConfigIntoPlace() {
+        String testFile1Src = "conf/jhove/jhove.conf";
+        String testFile1Tmp = baseDirForConfigFiles + "/config/jhove.conf";
+        try {
+            FileUtils.copyFile(new File(testFile1Src), new File(testFile1Tmp));
+        } catch (IOException ex) {
+            Logger.getLogger(JhoveFileTypeTest.class.getName()).log(Level.SEVERE, "Unable to copy " + testFile1Src + " to " + testFile1Tmp, ex);
+        }
+    }
+
+}
diff --git a/src/test/java/edu/harvard/iq/dataverse/util/irc-metrics.ipynb b/src/test/java/edu/harvard/iq/dataverse/util/irc-metrics.ipynb
new file mode 100644
index 00000000000..13088234fcb
--- /dev/null
+++ b/src/test/java/edu/harvard/iq/dataverse/util/irc-metrics.ipynb
@@ -0,0 +1,251 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Pandas version 0.22.0\n",
+ "Numpy version 1.13.3\n"
+ ]
+ }
+ ],
+ "source": [
+ "%matplotlib inline\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "pd.set_option('display.max_columns', 100)\n",
+ "\n",
+ "print('Pandas version ' + pd.__version__)\n",
+ "print('Numpy version ' + np.__version__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "data = pd.read_table(\"irclog.tsv\", encoding = \"ISO-8859-1\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " channel | \n",
+ " day | \n",
+ " nick | \n",
+ " timestamp | \n",
+ " line | \n",
+ " spam | \n",
+ " in_summary | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " #dvn | \n",
+ " 2012-12-08 | \n",
+ " NaN | \n",
+ " 1355005146 | \n",
+ " iqlogbot joined #dvn | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " #dvn | \n",
+ " 2012-12-08 | \n",
+ " NaN | \n",
+ " 1355005248 | \n",
+ " Topic for #dvn is now http://thedata.org - The... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " #dvn | \n",
+ " 2012-12-08 | \n",
+ " pdurbin | \n",
+ " 1355005351 | \n",
+ " hello! welcome to #dvn, an IRC channel on Free... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " #dvn | \n",
+ " 2012-12-08 | \n",
+ " pdurbin | \n",
+ " 1355005459 | \n",
+ " our website is http://thedata.org and we're st... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " #dvn | \n",
+ " 2012-12-08 | \n",
+ " pdurbin | \n",
+ " 1355005517 | \n",
+ " we call our project DVN for short :) | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id channel day nick timestamp \\\n",
+ "0 1 #dvn 2012-12-08 NaN 1355005146 \n",
+ "1 2 #dvn 2012-12-08 NaN 1355005248 \n",
+ "2 3 #dvn 2012-12-08 pdurbin 1355005351 \n",
+ "3 4 #dvn 2012-12-08 pdurbin 1355005459 \n",
+ "4 5 #dvn 2012-12-08 pdurbin 1355005517 \n",
+ "\n",
+ " line spam in_summary \n",
+ "0 iqlogbot joined #dvn 0 0 \n",
+ "1 Topic for #dvn is now http://thedata.org - The... 0 0 \n",
+ "2 hello! welcome to #dvn, an IRC channel on Free... 0 0 \n",
+ "3 our website is http://thedata.org and we're st... 0 0 \n",
+ "4 we call our project DVN for short :) 0 0 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 92847 entries, 0 to 92846\n",
+ "Data columns (total 8 columns):\n",
+ "id 92847 non-null int64\n",
+ "channel 92847 non-null object\n",
+ "day 92847 non-null object\n",
+ "nick 60116 non-null object\n",
+ "timestamp 92847 non-null int64\n",
+ "line 92845 non-null object\n",
+ "spam 92847 non-null int64\n",
+ "in_summary 92847 non-null int64\n",
+ "dtypes: int64(4), object(4)\n",
+ "memory usage: 5.7+ MB\n"
+ ]
+ }
+ ],
+ "source": [
+ "data.info()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['id', 'channel', 'day', 'nick', 'timestamp', 'line', 'spam', 'in_summary']"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "list(data.columns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "#dataverse 82587\n",
+ "#dvn 10260\n",
+ "Name: channel, dtype: int64"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "data['channel'].value_counts()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/src/test/java/edu/harvard/iq/dataverse/util/irclog.tsv b/src/test/java/edu/harvard/iq/dataverse/util/irclog.tsv
new file mode 100644
index 00000000000..d0e22852965
--- /dev/null
+++ b/src/test/java/edu/harvard/iq/dataverse/util/irclog.tsv
@@ -0,0 +1,7 @@
+id channel day nick timestamp line spam in_summary
+10261 #dataverse 2014-06-24 1403620825 iqlogbot joined #dataverse 0 0
+10262 #dataverse 2014-06-24 1403620825 Topic for #dataverse is now Dataverse: http://dataverse.org | logs at http://irclog.iq.harvard.edu/dataverse/today 0 0
+10263 #dataverse 2014-06-24 pdurbin 1403620846 hello world! 0 0
+10264 #dataverse 2014-06-24 pdurbin 1403620958 for over a year I've been gathering people in #dvn to talk about Dataverse Network but as a bit of a rebranding effort, we're shortening the name to just "Dataverse" 0 0
+10265 #dataverse 2014-06-24 pdurbin 1403621058 we even have a fancy new domain: http://dataverse.org :) 0 0
+10266 #dataverse 2014-06-24 pdurbin 1403621094 once I get everyone who's in the old #dvn channel to join this new #dataverse channel we'll shut the old one down 0 0