Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion dsc/workflows/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from dsc.workflows.base import Workflow, WorkflowEvents
from dsc.workflows.base.simple_csv import SimpleCSV
from dsc.workflows.demo import Demo
from dsc.workflows.opencourseware import OpenCourseWare
from dsc.workflows.sccs import SCCS

__all__ = ["SCCS", "Demo", "SimpleCSV", "Workflow", "WorkflowEvents"]
__all__ = ["SCCS", "Demo", "OpenCourseWare", "SimpleCSV", "Workflow", "WorkflowEvents"]
27 changes: 27 additions & 0 deletions dsc/workflows/metadata_mapping/opencourseware.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
{
"item_identifier": {
"source_field_name": "item_identifier",
"required": true
},
"dc.title": {
"source_field_name": "course_title",
"language": "en_US",
"required": true
},
"dc.description.abstract": {
"source_field_name": "course_description"
},
"dc.subject": {
"source_field_name": "topics"
},
"dc.date.issued": {
"source_field_name": "year"
},
"dc.identifier.other": {
"source_field_name": "primary_course_number"
},
"dc.contributor.author": {
"source_field_name": "instructors",
"delimiter": "|"
}
}
196 changes: 196 additions & 0 deletions dsc/workflows/opencourseware.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
import json
import logging
import zipfile
from collections.abc import Iterator
from typing import Any

import smart_open

from dsc.exceptions import ReconcileError
from dsc.utilities.aws.s3 import S3Client
from dsc.workflows.base import Workflow

logger = logging.getLogger(__name__)


class OpenCourseWare(Workflow):
    """Workflow for OpenCourseWare (OCW) deposits.

    The deposits managed by this workflow are requested by the
    Scholarly Communications and Collections Strategy (SCCS) department
    and were previously deposited into DSpace@MIT by Technical services staff.
    """

    # Registry key identifying this workflow (see dsc/workflows/__init__.py).
    workflow_name: str = "opencourseware"

    @property
    def metadata_mapping_path(self) -> str:
        """Path to the JSON mapping of source fields to DSpace metadata fields."""
        return "dsc/workflows/metadata_mapping/opencourseware.json"

    @property
    def s3_bucket(self) -> str:
        # NOTE(review): placeholder until AWS infrastructure is provisioned --
        # any S3 call made with this bucket name will fail. TODO: replace.
        return "awaiting AWS infrastructure"

    @property
    def output_queue(self) -> str:
        # NOTE(review): placeholder until AWS infrastructure is provisioned.
        return "awaiting AWS infrastructure"

def reconcile_bitstreams_and_metadata(self) -> None:
"""Reconcile bitstreams against item metadata.

Generate a list of bitstreams without item metadata.

For OpenCourseWare deposits, the zip files are the bitstreams to be deposited
into DSpace, but they also must contain a 'data.json' file, representing the
metadata. As such, the 'reconcile' method only determines whether there are any
bitstreams without metadata (any zip files without a 'data.json').
Metadata without bitstreams is not calculated as for a 'data.json' file to
exist, the zip file must also exist.
"""
item_identifiers = []
bitstreams_without_metadata = []
s3_client = S3Client()
for file in s3_client.files_iter(
bucket=self.s3_bucket, prefix=self.batch_path, file_type=".zip"
):
item_identifier = self.parse_item_identifier(file)
item_identifiers.append(item_identifier)
try:
self._extract_metadata_from_zip_file(file)
except FileNotFoundError:
bitstreams_without_metadata.append(item_identifier)

if any(bitstreams_without_metadata):
reconcile_error_message = {
"note": "Failed to reconcile bitstreams and metadata.",
"bitstreams_without_metadata": {
"count": len(bitstreams_without_metadata),
"identifiers": bitstreams_without_metadata,
},
}
logger.error(json.dumps(reconcile_error_message))
raise ReconcileError(json.dumps(reconcile_error_message))

logger.info(
"Successfully reconciled bitstreams and metadata for all "
f"items (n={len(item_identifiers)})."
)

def item_metadata_iter(self) -> Iterator[dict[str, Any]]:
"""Yield source metadata from metadata JSON file in the zip file.

The item identifiers are retrieved from the filenames of the zip
files, which follow the naming format "<item_identifier>.zip".
"""
s3_client = S3Client()
for file in s3_client.files_iter(
bucket=self.s3_bucket, prefix=self.batch_path, file_type=".zip"
):
yield {
"item_identifier": self.parse_item_identifier(file),
**self._extract_metadata_from_zip_file(file),
}

def _extract_metadata_from_zip_file(self, file: str) -> dict[str, str]:
"""Yield source metadata from metadata JSON file in zip archive.

This method expects a JSON file called "data.json" at the root
level of the the zip file.

Args:
file: Object prefix for bitstream zip file, formatted as the
path from the S3 bucket to the file.
Given an S3 URI "s3://dsc/opencourseware/batch-00/123.zip",
then file = "opencourseware/batch-00/123.zip".

item_identifier: Item identifier, used to find and read the metadata
JSON file for the associated bitstream zip file.
"""
zip_file_uri = f"s3://{self.s3_bucket}/{file}"
with smart_open.open(zip_file_uri, "rb") as file_input, zipfile.ZipFile(
file_input
) as zip_file:
for filename in zip_file.namelist():
if filename == "data.json":
return self._read_metadata_json_file(zip_file)
raise FileNotFoundError(
"The required file 'data.json' file was not found in the zip file: "
f"{file}"
)

def _read_metadata_json_file(self, zip_file: zipfile.ZipFile) -> dict[str, str]:
"""Read source metadata JSON file."""
with zip_file.open("data.json") as file:
source_metadata = json.load(file)
source_metadata["instructors"] = self._get_instructors_delimited_string(
source_metadata["instructors"]
)
return source_metadata

def _get_instructors_delimited_string(self, instructors: list[dict[str, str]]) -> str:
"""Get delimited string of 'instructors' from source metadata JSON file.

Source metadata JSON files stored in OCW zip files contain an 'instructors'
property, which contains an array of objects representing an instructor's
credentials:

[
{
"first_name": "Kerry",
"last_name": "Oki",
"middle_initial": "",
"salutation": "Prof.",
"title": "Prof. Kerry Oki"
},
{
"first_name": "Earl",
"last_name": "Bird",
"middle_initial": "E.",
"salutation": "Prof.",
"title": "Prof. Earl E. Bird"
}
]

Given these credentials, this method will construct a pipe-delimited ("|")
string with the following format: "<last_name>, <first_name> <middle_initial>".

Example output:
"Oki, Kerry|Bird, Earl E."

"""
return "|".join(
[
instructor_name
for instructor in instructors
if (instructor_name := self._construct_instructor_name(instructor))
]
).strip()

@staticmethod
def _construct_instructor_name(instructor: dict[str, str]) -> str:
"""Given a dictionary of name fields, derive instructor name."""
if not (last_name := instructor.get("last_name")) or not (
first_name := instructor.get("first_name")
):
return ""
return f"{last_name}, {first_name} {instructor.get("middle_initial", "")}".strip()
Comment on lines +170 to +176
Copy link
Contributor Author

@jonavellecuerdo jonavellecuerdo Jan 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While it is plausible that all the metadata in data.json will always be formatted as needed (i.e., all instructor name fields provided), it would be a good idea to check in with stakeholders (IN-1156) on the "minimum required instructor name fields" to construct an instructor name.

In this sample mapping file we received, ocw_json_to_dspace_mapping.xlsx, it indicates the instructor names must be formatted as:

<last_name>, <first_name> <middle_initial>

The code above will return an empty string if either the last_name or first_name is missing; it allows for missing middle_initial values.


def get_item_identifier(self, item_metadata: dict[str, Any]) -> str:
"""Get 'item_identifier' from item metadata entry."""
return item_metadata["item_identifier"]

def parse_item_identifier(self, file: str) -> str:
"""Parse item identifier from bitstream zip file."""
return file.split("/")[-1].removesuffix(".zip")

def get_bitstream_s3_uris(self, item_identifier: str) -> list[str]:
s3_client = S3Client()
return list(
s3_client.files_iter(
bucket=self.s3_bucket,
prefix=self.batch_path,
item_identifier=item_identifier,
file_type=".zip",
exclude_prefixes=["archived"],
)
)
42 changes: 42 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from dsc.utilities.aws.s3 import S3Client
from dsc.utilities.aws.ses import SESClient
from dsc.utilities.aws.sqs import SQSClient
from dsc.workflows import OpenCourseWare
from dsc.workflows.base import Workflow, WorkflowEvents
from dsc.workflows.base.simple_csv import SimpleCSV

Expand Down Expand Up @@ -83,6 +84,17 @@ def output_queue(self) -> str:
return "mock-output-queue"


class TestOpenCourseWare(OpenCourseWare):
    # Test double overriding the AWS resource properties with fixture-safe
    # values (the real class returns placeholder strings pending AWS infra).

    @property
    def s3_bucket(self) -> str:
        return "dsc"

    @property
    def output_queue(self) -> str:
        return "mock-output-queue"


@pytest.fixture(autouse=True)
def _test_env(monkeypatch):
monkeypatch.setenv("SENTRY_DSN", "None")
Expand All @@ -105,6 +117,36 @@ def simple_csv_workflow_instance(metadata_mapping):
return TestSimpleCSV(batch_id="batch-aaa")


@pytest.fixture
def opencourseware_workflow_instance():
    """Return an OpenCourseWare test workflow bound to batch 'batch-aaa'."""
    workflow = TestOpenCourseWare(batch_id="batch-aaa")
    return workflow


@pytest.fixture
def opencourseware_source_metadata():
    """Return a representative OCW 'data.json' payload for tests."""
    instructors = [
        {
            "first_name": "Alan",
            "last_name": "Edelman",
            "middle_initial": "",
            "salutation": "Prof.",
            "title": "Prof. Alan Edelman",
        },
        {
            "first_name": "Steven",
            "last_name": "Johnson",
            "middle_initial": "G.",
            "salutation": "Prof.",
            "title": "Prof. Steven G. Johnson",
        },
    ]
    return {
        "course_title": "Matrix Calculus for Machine Learning and Beyond",
        "course_description": "We all know that calculus courses.",
        "site_uid": "2318fd9f-1b5c-4a48-8a04-9c56d902a1f8",
        "instructors": instructors,
    }


@pytest.fixture
def config_instance():
return Config()
Expand Down
Binary file added tests/fixtures/opencourseware/123.zip
Binary file not shown.
Binary file added tests/fixtures/opencourseware/124.zip
Binary file not shown.
Loading