From 05b509fd4f00115ac931fdaed6a87baa5e70e336 Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Fri, 22 Sep 2023 13:30:24 -0400 Subject: [PATCH 1/2] IN-924 Investigate error with FTP file transfer Why these changes are being introduced: * The ftplib.storbinary method throws errors that suggest file transfer has failed even when the Carbon run successfully uploads an XML file to the Elements FTP server. For the 'people' feed, it would log: ftplib.error_temp: 425 Error while transfering data: ECONNABORTED - Connection aborted For the 'articles' feed, it would log: TimeoutError: The read operation timed out These changes introduce cleaner error handling that avoid the error for the 'people' feed and except' timeout errors that do not actually seem to have a negative impact. How this addresses that need: * Remove custom storbinary method to resolve errors for 'people' feed * Add try-except-finally code block to cleanly handle timeout errors for 'articles' feed Side effects of this change: * None Relevant ticket(s): * https://mitlibraries.atlassian.net/jira/software/c/projects/IN/boards/70?modal=detail&selectedIssue=IN-924 --- carbon/app.py | 38 +++++++++++++++----------------------- carbon/cli.py | 4 +++- carbon/feed.py | 4 ++-- 3 files changed, 20 insertions(+), 26 deletions(-) diff --git a/carbon/app.py b/carbon/app.py index 5879475..9d45cbb 100644 --- a/carbon/app.py +++ b/carbon/app.py @@ -46,26 +46,6 @@ def ntransfercmd(self, cmd: str, rest: str | int | None = None) -> tuple[socket, ) return conn, size - def storbinary( - self, - cmd: str, - fp: IO, # type: ignore[override] - blocksize: int = 8192, - callback: Callable | None = None, - rest: str | None = None, # type: ignore[override] - ) -> str: - """Store a file in binary mode.""" - self.voidcmd("TYPE I") - with self.transfercmd(cmd, rest) as conn: - while 1: - buf = fp.read(blocksize) - if not buf: - break - conn.sendall(buf) - if callback: - callback(buf) - return self.voidresp() - class FileWriter: """A writer that outputs normalized XML strings to a specified file. @@ -91,8 +71,11 @@ def write(self, feed_type: str) -> None: elif feed_type == "articles": xml_feed = ArticlesXmlFeed(engine=self.engine, output_file=self.output_file) xml_feed.run() + logger.info( - "The feed has collected %s '%s' records", xml_feed.record_count, feed_type + "The '%s' feed has processed %s records.", + feed_type, + xml_feed.processed_record_count, ) @@ -164,8 +147,17 @@ def __call__(self) -> None: ftps.connect(host=self.host, port=self.port) ftps.login(user=self.user, passwd=self.password) ftps.prot_p() - ftps.storbinary(cmd=f"STOR {self.path}", fp=self.content_feed) - ftps.quit() + try: + ftps.storbinary(cmd=f"STOR {self.path}", fp=self.content_feed) + except TimeoutError: + logger.warning( + "Timeout occurred but XML file may have been uploaded anyway. \ + Please check the Elements FTP server to confirm." + ) + except Exception: + raise + else: + ftps.quit() class DatabaseToFilePipe: diff --git a/carbon/cli.py b/carbon/cli.py index e87680b..5e5e7f8 100644 --- a/carbon/cli.py +++ b/carbon/cli.py @@ -85,7 +85,9 @@ def main(*, output_file: IO, run_connection_tests: bool, use_sns_logging: bool) pipe.run_connection_test() if not run_connection_tests: - logger.info("Carbon run has started.") + logger.info( + "Carbon run for the '%s' feed has started.", config_values["FEED_TYPE"] + ) if use_sns_logging: sns_log(config_values=config_values, status="start") try: diff --git a/carbon/feed.py b/carbon/feed.py index 8b1b6f8..93d67cb 100644 --- a/carbon/feed.py +++ b/carbon/feed.py @@ -35,7 +35,7 @@ class BaseXmlFeed(ABC): root_element_name: str = "" query: Select = select() - record_count: int = 0 + processed_record_count: int = 0 def __init__(self, engine: DatabaseEngine, output_file: IO): self.engine = engine @@ -52,7 +52,6 @@ def records(self) -> Generator[dict[str, Any], Any, None]: with closing(self.engine().connect()) as connection: result = connection.execute(self.query) for row in result: - self.record_count += 1 yield dict(zip(result.keys(), row, strict=True)) @abstractmethod @@ -101,6 +100,7 @@ def run(self, **kwargs: dict[str, Any]) -> None: for record in self.records: element = self._add_element(record) xml_file.write(element) + self.processed_record_count += 1 class ArticlesXmlFeed(BaseXmlFeed): From 91fce2c804fda3e4ffc9e279f5e0916b4d2da3cd Mon Sep 17 00:00:00 2001 From: jonavellecuerdo Date: Mon, 25 Sep 2023 13:16:29 -0400 Subject: [PATCH 2/2] Update README with instructions for downloading files --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 9cb7ddb..11ad1ab 100644 --- a/README.md +++ b/README.md @@ -37,6 +37,10 @@ The test suite uses SQLite, so you can develop and test without connecting to th **Note**: As of this writing, Apple M1 Macs cannot run Oracle Instant Client. * If you are on a machine that can run Oracle Instant Client, follow the steps outlined in [Without Docker](#without-docker). +The data retrieved by the Carbon application contains personally identifiable information (PII), so downloading the files is not recommended. However, if examining the files created by Carbon is **absolutely necessary** for testing purposes, this can be done on your local machine via a Docker container. For more information, please refer to the Confluence document: [How to download files from an application that connects to the Data Warehouse](https://mitlibraries.atlassian.net/wiki/spaces/~712020c6fcb37f39c94e54a6bfd09607f29eb4/pages/3469443097/Running+applications+in+a+local+Docker+Container). + +**Note:** Any downloaded files or `.env` files must be **immediately deleted** after testing is complete. + #### With Docker 1. Run `make dependencies` to download the Oracle Instant Client from S3. @@ -47,6 +51,7 @@ The test suite uses SQLite, so you can develop and test without connecting to th 4. Run any `make` commands for testing the application. In the Makefile, the names of relevant make commands will contain the suffix '-with-docker'. + #### Without Docker 1. Download [Oracle Instant Client](https://www.oracle.com/database/technologies/instant-client/downloads.html) (basiclite is sufficient) and set the `ORACLE_LIB_DIR` env variable.