Commit

merge-conflicts-v4.15 fixed
akio-sone committed Jun 24, 2019
2 parents 7023545 + f3be218 commit 86f3792
Showing 168 changed files with 4,689 additions and 1,701 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -53,3 +53,11 @@ scripts/installer/default.config
.idea
**/*.iml
/bin/

# do not track Visual Studio Code files
.vscode

# ignore UI testing related files
tests/node_modules
tests/package-lock.json
.factorypath
42 changes: 42 additions & 0 deletions .travis.yml.future
@@ -0,0 +1,42 @@
services:
- docker

jobs:
include:
# Execute java unit- and integration tests
- stage: test
language: java
jdk:
- oraclejdk8
script: mvn -DcompilerArgument=-Xlint:unchecked test -P all-unit-tests
after_success: mvn jacoco:report coveralls:report

# Execute Cypress for UI testing
# see https://docs.cypress.io/guides/guides/continuous-integration.html
- stage: test
language: node_js
node_js:
- "10"
addons:
apt:
packages:
# Ubuntu 16+ does not install this dependency by default, so we need to install it ourselves
- libgconf-2-4
cache:
# Caches $HOME/.npm when npm ci is default script command
# Caches node_modules in all other cases
npm: true
directories:
# we also need to cache folder with Cypress binary
- ~/.cache
# we want to cache the Glassfish and Solr dependencies as well
- conf/docker-aio/dv/deps
before_install:
- cd tests
install:
- npm ci
before_script:
- ./run_docker_dataverse.sh
script:
# --key needs to be injected using CYPRESS_RECORD_KEY to keep it secret
- $(npm bin)/cypress run --record
2 changes: 1 addition & 1 deletion conf/docker-aio/readme.md
@@ -34,7 +34,7 @@ Also usable for normal development and system evaluation; not intended for produ
- Installation (integration test): `docker exec dv /opt/dv/setupIT.bash`
(Note that it's possible to customize the installation by editing `conf/docker-aio/default.config` and running `docker exec dv /opt/dv/install.bash` but for the purposes of integration testing, the `setupIT.bash` script above works fine.)

- update `dataverse.siteUrl` (appears only necessary for `DatasetsIT.testPrivateUrl`): `docker exec -it dv /usr/local/glassfish4/bin/asadmin create-jvm-options "-Ddataverse.siteUrl=http\://localhost\:8084"` (or use the provided `seturl.bash`)
- update `dataverse.siteUrl` (appears only necessary for `DatasetsIT.testPrivateUrl`): `docker exec dv /usr/local/glassfish4/bin/asadmin create-jvm-options "-Ddataverse.siteUrl=http\://localhost\:8084"` (or use the provided `seturl.bash`)

#### Run integration tests:

2 changes: 1 addition & 1 deletion conf/docker-aio/run-test-suite.sh
@@ -8,4 +8,4 @@ fi

# Please note the "dataverse.test.baseurl" is set to run for "all-in-one" Docker environment.
# TODO: Rather than hard-coding the list of "IT" classes here, add a profile to pom.xml.
mvn test -Dtest=DataversesIT,DatasetsIT,SwordIT,AdminIT,BuiltinUsersIT,UsersIT,UtilIT,ConfirmEmailIT,FileMetadataIT,FilesIT,SearchIT,InReviewWorkflowIT,HarvestingServerIT,MoveIT,MakeDataCountApiIT -Ddataverse.test.baseurl=$dvurl
mvn test -Dtest=DataversesIT,DatasetsIT,SwordIT,AdminIT,BuiltinUsersIT,UsersIT,UtilIT,ConfirmEmailIT,FileMetadataIT,FilesIT,SearchIT,InReviewWorkflowIT,HarvestingServerIT,MoveIT,MakeDataCountApiIT,FileTypeDetectionIT -Ddataverse.test.baseurl=$dvurl
2 changes: 1 addition & 1 deletion conf/docker-aio/seturl.bash
@@ -1,3 +1,3 @@
#!/usr/bin/env bash

docker exec -it dv /usr/local/glassfish4/bin/asadmin create-jvm-options "\"-Ddataverse.siteUrl=http\://localhost\:8084\""
docker exec dv /usr/local/glassfish4/bin/asadmin create-jvm-options "\"-Ddataverse.siteUrl=http\://localhost\:8084\""
11 changes: 11 additions & 0 deletions conf/jhove/jhove.conf
@@ -40,4 +40,15 @@
<module>
<class>edu.harvard.hul.ois.jhove.module.Utf8Module</class>
</module>
<!-- New modules for application/gzip and application/warc: -->
<module>
<class>edu.harvard.hul.ois.jhove.module.GzipModule</class>
</module>
<module>
<class>edu.harvard.hul.ois.jhove.module.WarcModule</class>
</module>
<!-- A new 3rd-party module for image/png from mcgauth.com: -->
<module>
<class>com.mcgath.jhove.module.PngModule</class>
</module>
</jhoveConfig>
1 change: 1 addition & 0 deletions conf/solr/7.3.1/schema.xml
@@ -145,6 +145,7 @@
<field name="dvObjectType" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="metadataSource" type="string" stored="true" indexed="true" multiValued="false"/>
<field name="isHarvested" type="boolean" stored="true" indexed="true" multiValued="false"/>
<field name="fileDeleted" type="boolean" stored="true" indexed="true" multiValued="false"/>

<field name="dvName" type="text_en" stored="true" indexed="true" multiValued="false"/>
<field name="dvAlias" type="text_en" stored="true" indexed="true" multiValued="false"/>
5 changes: 5 additions & 0 deletions doc/release-notes/2202-improved-file-detection.md
@@ -0,0 +1,5 @@
Upgrade instructions:

A new version of the file type detection software, JHOVE, is included in this release. It requires an update of its configuration file, ``jhove.conf``. Download the new configuration file from the Dataverse release page on GitHub, or from the source tree at https://github.com/IQSS/dataverse/blob/master/conf/jhove/jhove.conf, and place it in ``<GLASSFISH_DOMAIN_DIRECTORY>/config/``. For example: ``/usr/local/glassfish4/glassfish/domains/domain1/config/jhove.conf``.

**Important:** If your Glassfish installation directory is different from ``/usr/local/glassfish4``, make sure to edit the header of the config file, to reflect the correct location.
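
For example, a minimal sketch of the download step, assuming the default Glassfish location and the raw GitHub URL corresponding to the source-tree link above (adjust both to match your installation):

``curl -L -o /usr/local/glassfish4/glassfish/domains/domain1/config/jhove.conf https://raw.githubusercontent.com/IQSS/dataverse/master/conf/jhove/jhove.conf``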
1 change: 1 addition & 0 deletions doc/release-notes/5584-dataset-page-solr.md
@@ -0,0 +1 @@
* Improved search on files within a dataset and search facets have been added to the dataset page. A Solr schema upgrade and a full reindex are required to take full advantage of this functionality.
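
As a sketch of the reindexing step, assuming a standard installation with the API listening on localhost:8080 (see the Solr Search Index section of the Admin Guide for the full procedure), a full reindex in place can be started with:

``curl http://localhost:8080/api/admin/index``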
2 changes: 1 addition & 1 deletion doc/sphinx-guides/source/admin/dataverses-datasets.rst
@@ -103,5 +103,5 @@ As a superuser, click "Update Current Version" when publishing. (This option is
Diagnose Constraint Violations Issues in Datasets
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

To identifiy invalid data values in specific datasets (if, for example, an attempt to edit a dataset results in a ConstraintViolationException in the server log), or to check all the datasets in the Dataverse for constraint violations, see :ref:`Dataset Validation <dataset-validation-api>` in the :doc:`/api/native-api` section of the User Guide.
To identify invalid data values in specific datasets (if, for example, an attempt to edit a dataset results in a ConstraintViolationException in the server log), or to check all the datasets in the Dataverse for constraint violations, see :ref:`Dataset Validation <dataset-validation-api>` in the :doc:`/api/native-api` section of the User Guide.

4 changes: 2 additions & 2 deletions doc/sphinx-guides/source/admin/solr-search-index.rst
@@ -1,7 +1,7 @@
Solr Search Index
=================

Dataverse requires Solr to be operational at all times. If you stop Solr, you should see an error about this on the home page, which is powered by the search index Solr provides. You can set up Solr by following the steps in our Installation Guide's :doc:`/installation/prerequisites` and :doc:`/installation/config` sections explaining how to configure it. This section you're reading now is about the care and feeding of the search index. PostgreSQL is the "source of truth" and the Dataverse application will copy data from PostgreSQL into Solr. For this reason, the search index can be rebuilt at any time. Depending on the amount of data you have, this can be a slow process. You are encouraged to experiment with production data to get a sense of how long a full reindexing will take.
Dataverse requires Solr to be operational at all times. If you stop Solr, you should see an error about this on the root dataverse page, which is powered by the search index Solr provides. You can set up Solr by following the steps in our Installation Guide's :doc:`/installation/prerequisites` and :doc:`/installation/config` sections explaining how to configure it. This section you're reading now is about the care and feeding of the search index. PostgreSQL is the "source of truth" and the Dataverse application will copy data from PostgreSQL into Solr. For this reason, the search index can be rebuilt at any time. Depending on the amount of data you have, this can be a slow process. You are encouraged to experiment with production data to get a sense of how long a full reindexing will take.

.. contents:: Contents:
:local:
@@ -17,7 +17,7 @@ Clear and Reindex
Clearing Data from Solr
~~~~~~~~~~~~~~~~~~~~~~~

Please note that the moment you issue this command, it will appear to end users looking at the home page that all data is gone! This is because the home page is powered by the search index.
Please note that the moment you issue this command, it will appear to end users looking at the root dataverse page that all data is gone! This is because the root dataverse page is powered by the search index.

``curl http://localhost:8080/api/admin/index/clear``

5 changes: 5 additions & 0 deletions doc/sphinx-guides/source/admin/troubleshooting.rst
@@ -71,3 +71,8 @@ In real life production use, it may be possible to end up in a situation where s
(contrary to what the message suggests, there are no specific "details" anywhere in the stack trace that would explain what values violate which constraints)

To identify the specific invalid values in the affected datasets, or to check all the datasets in the Dataverse for constraint violations, see :ref:`Dataset Validation <dataset-validation-api>` in the :doc:`/api/native-api` section of the User Guide.

Many Files with a File Type of "Unknown", "Application", or "Binary"
--------------------------------------------------------------------

From the home page of a Dataverse installation you can get a count of files by file type by clicking "Files" and then scrolling down to "File Type". If you see a lot of files that are "Unknown", "Application", or "Binary" you can have Dataverse attempt to redetect the file type by using the :ref:`Redetect File Type <redetect-file-type>` API endpoint.
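
For example, a minimal dry-run sketch against a single file, assuming its database id is 42 and ``$API_TOKEN`` holds a superuser API token:

``curl -H "X-Dataverse-key:$API_TOKEN" -X POST http://localhost:8080/api/files/42/redetect?dryRun=true``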
7 changes: 5 additions & 2 deletions doc/sphinx-guides/source/api/client-libraries.rst
@@ -11,9 +11,12 @@ Because Dataverse is a SWORD server, additional client libraries exist for Java,
Python
------

https://github.com/IQSS/dataverse-client-python is the offical Python package for Dataverse APIs.
There are two Python modules for interacting with Dataverse APIs.

`pyDataverse <https://github.com/AUSSDA/pyDataverse>`_ had its initial release in 2019 and can be installed with ``pip install pyDataverse``. The module is developed by `Stefan Kasberger <http://stefankasberger.at>`_ from `AUSSDA - The Austrian Social Science Data Archive <https://aussda.at>`_.

`dataverse-client-python <https://github.com/IQSS/dataverse-client-python>`_ had its initial release in 2015. `Robert Liebowitz <https://github.com/rliebz>`_ created this library while at the `Center for Open Science (COS) <https://centerforopenscience.org>`_ and the COS uses it to integrate the `Open Science Framework (OSF) <https://osf.io>`_ with Dataverse via an add-on which itself is open source and listed on the :doc:`/api/apps` page.

`Robert Liebowitz <https://github.com/rliebz>`_ created this library while at the `Center for Open Science (COS) <https://centerforopenscience.org>`_ and the COS uses it to integrate the `Open Science Framework (OSF) <https://osf.io>`_ with Dataverse via an add-on which itself is open source and listed on the :doc:`/api/apps` page.

R
-
24 changes: 23 additions & 1 deletion doc/sphinx-guides/source/api/dataaccess.rst
@@ -100,9 +100,18 @@ It returns a zipped bundle that contains the data in the following formats:
* Data (Variable) metadata record, in DDI XML;
* File citation, in Endnote and RIS formats.


Parameters:
~~~~~~~~~~~
none.

``fileMetadataId``

============== ===========
Value Description
============== ===========
ID Exports file with specific file metadata ``ID``.
============== ===========
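
For example, a sketch following the pattern of the DDI metadata example later in this section (the ids are illustrative), requesting the bundle for data file 6 with file metadata id 2:

``curl 'http://localhost:8080/api/access/datafile/bundle/6?fileMetadataId=2'``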


Data Variable Metadata Access
-----------------------------
@@ -177,6 +186,19 @@ Example:
</dataDscr>
</codeBook>
Parameters:
~~~~~~~~~~~

``fileMetadataId``

============== ===========
Value Description
============== ===========
ID Exports file with specific file metadata ``ID``. For example for data file with id 6 and file metadata id 2: ``curl 'http://localhost:8080/api/access/datafile/6/metadata/ddi?fileMetadataId=2'``
============== ===========


More information on DDI is available in the :doc:`/user/tabulardataingest/ingestprocess` section of the User Guide.

Advanced options/Parameters:
2 changes: 1 addition & 1 deletion doc/sphinx-guides/source/api/intro.rst
@@ -47,6 +47,6 @@ Rather than using a production installation of Dataverse, API users are welcome
Support
-------

If you are using the APIs for an installation of Dataverse hosted by your institution, you may want to reach out to the team that supports it. At the top of the Dataverse installation's home page, there should be a form you can fill out by clicking the "Support" link.
If you are using the APIs for an installation of Dataverse hosted by your institution, you may want to reach out to the team that supports it. In the header at the top of the site, there should be a form you can fill out by clicking the "Support" link.

If you are having trouble with http://demo.dataverse.org or have questions about the APIs, please feel free to reach out to the Dataverse community via https://groups.google.com/forum/#!forum/dataverse-community .
39 changes: 37 additions & 2 deletions doc/sphinx-guides/source/api/native-api.rst
@@ -21,9 +21,11 @@ Create a Dataverse
~~~~~~~~~~~~~~~~~~

Generates a new dataverse under ``$id``. Expects a JSON content describing the dataverse, as in the example below.
If ``$id`` is omitted, a root dataverse is created. ``$id`` can either be a dataverse id (long) or a dataverse alias (more robust). ::
If ``$id`` is omitted, a root dataverse is created. ``$id`` can either be a dataverse id (long) or a dataverse alias (more robust). In the example below, "root" is the id, which means that the dataverse will be created as a child of the root dataverse::

POST http://$SERVER/api/dataverses/$id?key=$apiKey
``export id=root``
``curl -H "X-Dataverse-key:$API_TOKEN" -X POST $SERVER_URL/api/dataverses/$id --upload-file dataverse-complete.json``

Download the :download:`JSON example <../_static/api/dataverse-complete.json>` file and modify it to create dataverses to suit your needs. The fields ``name``, ``alias``, and ``dataverseContacts`` are required. The controlled vocabulary for ``dataverseType`` is

@@ -64,6 +66,18 @@ Show Contents of a Dataverse

GET http://$SERVER/api/dataverses/$id/contents


Report the data (file) size of a Dataverse
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Shows the combined size in bytes of all the files uploaded into the dataverse ``id``. ::

GET http://$SERVER/api/dataverses/$id/storagesize

Both published and unpublished files are counted, in the specified dataverse and, recursively, in all of its sub-dataverses.
By default, only the archival files are counted - i.e., the files uploaded by users (plus the tab-delimited versions generated for tabular data files on ingest). If the optional argument ``includeCached=true`` is specified, the API will also add the sizes of all the extra files generated and cached by Dataverse - the resized thumbnail versions for image files, the metadata exports for published datasets, etc.
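
For example, a minimal sketch assuming a local installation and a dataverse with the alias "root" (depending on permissions, you may also need to pass an API token with ``-H "X-Dataverse-key:$API_TOKEN"``):

``curl http://localhost:8080/api/dataverses/root/storagesize``

``curl "http://localhost:8080/api/dataverses/root/storagesize?includeCached=true"``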


List Roles Defined in a Dataverse
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -444,6 +458,8 @@ A more detailed "add" example using curl::

curl -H "X-Dataverse-key:$API_TOKEN" -X POST -F 'file=@data.tsv' -F 'jsonData={"description":"My description.","directoryLabel":"data/subdir1","categories":["Data"], "restrict":"true"}' "https://example.dataverse.edu/api/datasets/:persistentId/add?persistentId=$PERSISTENT_ID"

Please note that it's possible to "trick" Dataverse into giving a file a content type (MIME type) of your choosing. For example, you can make a text file be treated like a video file with ``-F 'file=@README.txt;type=video/mpeg4'``. If Dataverse does not properly detect a file type, specifying the content type via API like this is a potential workaround.
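
A hypothetical end-to-end sketch combining the "add" call above with a forced content type (the file name and type are illustrative):

``curl -H "X-Dataverse-key:$API_TOKEN" -X POST -F 'file=@README.txt;type=video/mpeg4' "https://example.dataverse.edu/api/datasets/:persistentId/add?persistentId=$PERSISTENT_ID"``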

Example python code to add a file. This may be run by changing these parameters in the sample code:

* ``dataverse_server`` - e.g. https://demo.dataverse.org
@@ -738,6 +754,25 @@ Note that this requires "superuser" credentials::

Note: at present, the API cannot be used on a file that's already successfully ingested as tabular.

.. _redetect-file-type:

Redetect File Type
~~~~~~~~~~~~~~~~~~

Dataverse uses a variety of methods for determining file types (MIME types or content types) and these methods (listed below) are updated periodically. If you have files that have an unknown file type, you can have Dataverse attempt to redetect the file type.

When using the curl command below, you can pass ``dryRun=true`` if you don't want any changes to be saved to the database. Change this to ``dryRun=false`` (or omit it) to save the change. In the example below, the file is identified by database id "42".

``export FILE_ID=42``

``curl -H "X-Dataverse-key:$API_TOKEN" -X POST $SERVER_URL/api/files/$FILE_ID/redetect?dryRun=true``

Currently the following methods are used to detect file types:

- The file type detected by the browser (or sent via API).
- JHOVE: http://jhove.openpreservation.org
- As a last resort the file extension (e.g. ".ipynb") is used, defined in a file called ``MimeTypeDetectionByFileExtension.properties``.

Replacing Files
~~~~~~~~~~~~~~~

4 changes: 2 additions & 2 deletions doc/sphinx-guides/source/conf.py
@@ -65,9 +65,9 @@
# built documents.
#
# The short X.Y version.
version = '4.14'
version = '4.15'
# The full version, including alpha/beta/rc tags.
release = '4.14'
release = '4.15'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
17 changes: 7 additions & 10 deletions doc/sphinx-guides/source/developers/deployment.rst
@@ -80,19 +80,16 @@ You can skip this step if you're fine with the values in the "main.yml" file in
Download and Run the "Create Instance" Script
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Once you have done the configuration above, you are ready to try running the "create instance" script to spin up Dataverse in AWS.
Once you have done the configuration above, you are ready to try running the "ec2-create-instance.sh" script to spin up Dataverse in AWS.

Download :download:`ec2-create-instance.sh <../../../../scripts/installer/ec2-create-instance.sh>` and put it somewhere reasonable. For the purpose of these instructions we'll assume it's in the "Downloads" directory in your home directory.

You need to decide which branch you'd like to deploy to AWS. Select a branch from https://github.com/IQSS/dataverse/branches/all such as "develop" and pass it to the script with ``-b`` as in the following example. (Branches such as "master" and "develop" are described in the :doc:`version-control` section.)
ec2-create-instance.sh accepts a few command-line switches:

``bash ~/Downloads/ec2-create-instance.sh -b develop``

You must specify the branch with ``-b`` but you can also specify a non-IQSS git repo URL with ``-r`` as in the following example.

``bash ~/Downloads/ec2-create-instance.sh -b develop -r https://github.com/scholarsportal/dataverse.git``

If you configured an Ansible file above and want to make use of it, add ``-g main.yml`` (or whatever you named your file) as in the following example.
* -r: GitHub Repository URL (defaults to https://github.com/IQSS/dataverse.git)
* -b: branch to build (defaults to develop)
* -p: pemfile directory (defaults to $HOME)
* -g: Ansible GroupVars file (if you wish to override role defaults)

``bash ~/Downloads/ec2-create-instance.sh -b develop -r https://github.com/scholarsportal/dataverse.git -g main.yml``

@@ -101,7 +98,7 @@ Now you will need to wait around 15 minutes until the deployment is finished. Ev
Caveats
~~~~~~~

Please note that while the script should work fine on newish branches, older branches that have different dependencies such as an older version of Solr are now expected to yield a working Dataverse installation. Your mileage may vary.
Please note that while the script should work fine on newish branches, older branches that have different dependencies such as an older version of Solr may not produce a working Dataverse installation. Your mileage may vary.

----
