diff --git a/README.md b/README.md
index 9cb7ddb..11ad1ab 100644
--- a/README.md
+++ b/README.md
@@ -37,6 +37,10 @@ The test suite uses SQLite, so you can develop and test without connecting to th
 **Note**: As of this writing, Apple M1 Macs cannot run Oracle Instant Client.
 * If you are on a machine that can run Oracle Instant Client, follow the steps outlined in [Without Docker](#without-docker).
 
+The data retrieved by the Carbon application contains personally identifiable information (PII), so downloading the files is not recommended. However, if examining the files created by Carbon is **absolutely necessary** for testing purposes, this can be done on your local machine via a Docker container. For more information, please refer to the Confluence document: [How to download files from an application that connects to the Data Warehouse](https://mitlibraries.atlassian.net/wiki/spaces/~712020c6fcb37f39c94e54a6bfd09607f29eb4/pages/3469443097/Running+applications+in+a+local+Docker+Container).
+
+**Note:** Any downloaded files or `.env` files must be **immediately deleted** after testing is complete.
+
 #### With Docker
 
 1. Run `make dependencies` to download the Oracle Instant Client from S3.
@@ -47,6 +51,7 @@ The test suite uses SQLite, so you can develop and test without connecting to th
 4. Run any `make` commands for testing the application. In the Makefile, the names of relevant make commands will contain the suffix '-with-docker'.
 
 
+
 #### Without Docker
 
 1. Download [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client/downloads.html) (basiclite is sufficient) and set the `ORACLE_LIB_DIR` env variable.
diff --git a/carbon/app.py b/carbon/app.py
index 5879475..9d45cbb 100644
--- a/carbon/app.py
+++ b/carbon/app.py
@@ -46,26 +46,6 @@ def ntransfercmd(self, cmd: str, rest: str | int | None = None) -> tuple[socket,
         )
         return conn, size
 
-    def storbinary(
-        self,
-        cmd: str,
-        fp: IO,  # type: ignore[override]
-        blocksize: int = 8192,
-        callback: Callable | None = None,
-        rest: str | None = None,  # type: ignore[override]
-    ) -> str:
-        """Store a file in binary mode."""
-        self.voidcmd("TYPE I")
-        with self.transfercmd(cmd, rest) as conn:
-            while 1:
-                buf = fp.read(blocksize)
-                if not buf:
-                    break
-                conn.sendall(buf)
-                if callback:
-                    callback(buf)
-        return self.voidresp()
-
 
 class FileWriter:
     """A writer that outputs normalized XML strings to a specified file.
@@ -91,8 +71,11 @@ def write(self, feed_type: str) -> None:
         elif feed_type == "articles":
             xml_feed = ArticlesXmlFeed(engine=self.engine, output_file=self.output_file)
             xml_feed.run()
+
         logger.info(
-            "The feed has collected %s '%s' records", xml_feed.record_count, feed_type
+            "The '%s' feed has processed %s records.",
+            feed_type,
+            xml_feed.processed_record_count,
         )
 
 
@@ -164,8 +147,17 @@ def __call__(self) -> None:
         ftps.connect(host=self.host, port=self.port)
         ftps.login(user=self.user, passwd=self.password)
         ftps.prot_p()
-        ftps.storbinary(cmd=f"STOR {self.path}", fp=self.content_feed)
-        ftps.quit()
+        try:
+            ftps.storbinary(cmd=f"STOR {self.path}", fp=self.content_feed)
+        except TimeoutError:
+            logger.warning(
+                "Timeout occurred but XML file may have been uploaded anyway. \
+                Please check the Elements FTP server to confirm."
+            )
+        except Exception:
+            raise
+        else:
+            ftps.quit()
 
 
 class DatabaseToFilePipe:
diff --git a/carbon/cli.py b/carbon/cli.py
index e87680b..5e5e7f8 100644
--- a/carbon/cli.py
+++ b/carbon/cli.py
@@ -85,7 +85,9 @@ def main(*, output_file: IO, run_connection_tests: bool, use_sns_logging: bool)
         pipe.run_connection_test()
 
     if not run_connection_tests:
-        logger.info("Carbon run has started.")
+        logger.info(
+            "Carbon run for the '%s' feed has started.", config_values["FEED_TYPE"]
+        )
         if use_sns_logging:
             sns_log(config_values=config_values, status="start")
         try:
diff --git a/carbon/feed.py b/carbon/feed.py
index 8b1b6f8..93d67cb 100644
--- a/carbon/feed.py
+++ b/carbon/feed.py
@@ -35,7 +35,7 @@ class BaseXmlFeed(ABC):
 
     root_element_name: str = ""
    query: Select = select()
-    record_count: int = 0
+    processed_record_count: int = 0
 
     def __init__(self, engine: DatabaseEngine, output_file: IO):
        self.engine = engine
@@ -52,7 +52,6 @@ def records(self) -> Generator[dict[str, Any], Any, None]:
         with closing(self.engine().connect()) as connection:
             result = connection.execute(self.query)
             for row in result:
-                self.record_count += 1
                 yield dict(zip(result.keys(), row, strict=True))
 
     @abstractmethod
@@ -101,6 +100,7 @@ def run(self, **kwargs: dict[str, Any]) -> None:
         for record in self.records:
             element = self._add_element(record)
             xml_file.write(element)
+            self.processed_record_count += 1
 
 
 class ArticlesXmlFeed(BaseXmlFeed):