Config class and S3 clients #6

Changes from all commits
dsc/config.py:

```diff
@@ -1,33 +1,63 @@
 import logging
 import os
+from collections.abc import Iterable
+from io import StringIO
 
 import sentry_sdk
 
 
-def configure_logger(logger: logging.Logger, *, verbose: bool) -> str:
-    if verbose:
-        logging.basicConfig(
-            format="%(asctime)s %(levelname)s %(name)s.%(funcName)s() line %(lineno)d: "
-            "%(message)s"
-        )
-        logger.setLevel(logging.DEBUG)
-        for handler in logging.root.handlers:
-            handler.addFilter(logging.Filter("dsc"))
-    else:
-        logging.basicConfig(
-            format="%(asctime)s %(levelname)s %(name)s.%(funcName)s(): %(message)s"
-        )
-        logger.setLevel(logging.INFO)
-    return (
-        f"Logger '{logger.name}' configured with level="
-        f"{logging.getLevelName(logger.getEffectiveLevel())}"
-    )
-
-
-def configure_sentry() -> str:
-    env = os.getenv("WORKSPACE")
-    sentry_dsn = os.getenv("SENTRY_DSN")
-    if sentry_dsn and sentry_dsn.lower() != "none":
-        sentry_sdk.init(sentry_dsn, environment=env)
-        return f"Sentry DSN found, exceptions will be sent to Sentry with env={env}"
-    return "No Sentry DSN found, exceptions will not be sent to Sentry"
+class Config:
+    REQUIRED_ENV_VARS: Iterable[str] = [
+        "WORKSPACE",
+        "SENTRY_DSN",
+        "AWS_REGION_NAME",
+    ]
+
+    OPTIONAL_ENV_VARS: Iterable[str] = ["LOG_LEVEL"]
+
+    def __getattr__(self, name: str) -> str | None:
+        """Provide dot notation access to configurations and env vars on this class."""
+        if name in self.REQUIRED_ENV_VARS or name in self.OPTIONAL_ENV_VARS:
+            return os.getenv(name)
+        message = f"'{name}' not a valid configuration variable"
+        raise AttributeError(message)
+
+    def check_required_env_vars(self) -> None:
+        """Raise an exception if required env vars are not set."""
+        missing_vars = [var for var in self.REQUIRED_ENV_VARS if not os.getenv(var)]
+        if missing_vars:
+            message = f"Missing required environment variables: {', '.join(missing_vars)}"
+            raise OSError(message)
+
+    def configure_logger(
+        self, logger: logging.Logger, stream: StringIO, *, verbose: bool
+    ) -> str:
+        logging_format_base = "%(asctime)s %(levelname)s %(name)s.%(funcName)s()"
+        logger.addHandler(logging.StreamHandler(stream))
+
+        if verbose:
+            log_method, log_level = logger.debug, logging.DEBUG
+            template = logging_format_base + " line %(lineno)d: %(message)s"
+            for handler in logging.root.handlers:
+                handler.addFilter(logging.Filter("dsc"))
+        else:
+            log_method, log_level = logger.info, logging.INFO
+            template = logging_format_base + ": %(message)s"
+
+        logger.setLevel(log_level)
+        logging.basicConfig(format=template)
+        logger.addHandler(logging.StreamHandler(stream))
+        log_method(f"{logging.getLevelName(logger.getEffectiveLevel())}")
+
+        return (
+            f"Logger '{logger.name}' configured with level="
+            f"{logging.getLevelName(logger.getEffectiveLevel())}"
+        )
+
+    def configure_sentry(self) -> str:
+        env = self.WORKSPACE
+        sentry_dsn = self.SENTRY_DSN
+        if sentry_dsn and sentry_dsn.lower() != "none":
+            sentry_sdk.init(sentry_dsn, environment=env)
+            return f"Sentry DSN found, exceptions will be sent to Sentry with env={env}"
+        return "No Sentry DSN found, exceptions will not be sent to Sentry"
```
dsc/s3.py (new file):

```python
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import boto3

if TYPE_CHECKING:
    from collections.abc import Iterator

    from mypy_boto3_s3.type_defs import PutObjectOutputTypeDef

logger = logging.getLogger(__name__)


class S3Client:
    """A class to perform common S3 operations for this application."""

    def __init__(self) -> None:
        self.client = boto3.client("s3")

    def archive_file_with_new_key(
        self, bucket: str, key: str, archived_key_prefix: str
    ) -> None:
        """Update the key of the specified file to archive it from processing.

        Args:
            bucket: The S3 bucket containing the files to be archived.
            key: The key of the file to archive.
            archived_key_prefix: The prefix to be applied to the archived file.
        """
        self.client.copy_object(
            Bucket=bucket,
            CopySource=f"{bucket}/{key}",
            Key=f"{archived_key_prefix}/{key}",
        )
        self.client.delete_object(
            Bucket=bucket,
            Key=key,
        )

    def put_file(
        self, file_content: str | bytes, bucket: str, key: str
    ) -> PutObjectOutputTypeDef:
        """Put a file in a specified S3 bucket with a specified key.

        Args:
            file_content: The content of the file to be uploaded.
            bucket: The S3 bucket where the file will be uploaded.
            key: The key to be used for the uploaded file.
        """
        response = self.client.put_object(
            Body=file_content,
            Bucket=bucket,
            Key=key,
        )
        logger.debug(f"'{key}' uploaded to S3")
        return response

    def get_files_iter(
        self, bucket: str, file_type: str, excluded_key_prefix: str
    ) -> Iterator[str]:
        """Retrieve files from a bucket by file type, skipping an excluded key prefix.

        Args:
            bucket: The S3 bucket to search.
            file_type: The file type to retrieve.
            excluded_key_prefix: Files with this key prefix will not be retrieved.
        """
        paginator = self.client.get_paginator("list_objects_v2")
        page_iterator = paginator.paginate(Bucket=bucket)

        for page in page_iterator:
            files = [
                content["Key"]
                for content in page["Contents"]
                if content["Key"].endswith(file_type)
                and excluded_key_prefix not in content["Key"]
            ]
            yield from files
```

Review thread on `archive_file_with_new_key`:

Contributor: Curious whether archiving the metadata file will be required for workflows other than Wiley deposits? 🤔 If an …

Author: I believe so, we probably want an …
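A sketch of how these methods compose, run under moto's `mock_aws` so no real AWS calls are made. The bucket and key names are made up for the example, and `AWS_DEFAULT_REGION` is set explicitly here because `boto3.client("s3")` needs a resolvable region:

```python
import os

import boto3
from moto import mock_aws

from dsc.s3 import S3Client

os.environ["AWS_DEFAULT_REGION"] = "us-east-1"  # so boto3 can resolve a region

with mock_aws():
    boto3.client("s3").create_bucket(Bucket="awd")
    s3_client = S3Client()

    # Upload a metadata file, then list .json files outside the archived prefix.
    s3_client.put_file('{"title": "Example"}', bucket="awd", key="example.json")
    assert list(s3_client.get_files_iter("awd", ".json", "archived")) == ["example.json"]

    # Archiving copies the object under the prefix and deletes the original key.
    s3_client.archive_file_with_new_key("awd", "example.json", "archived")
    assert list(s3_client.get_files_iter("awd", ".json", "archived")) == []
```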
tests/conftest.py:

```diff
@@ -1,13 +1,44 @@
+from io import StringIO
+
+import boto3
 import pytest
 from click.testing import CliRunner
+from moto import mock_aws
+
+from dsc.config import Config
+from dsc.s3 import S3Client
 
 
 @pytest.fixture(autouse=True)
 def _test_env(monkeypatch):
     monkeypatch.setenv("SENTRY_DSN", "None")
     monkeypatch.setenv("WORKSPACE", "test")
+    monkeypatch.setenv("AWS_REGION_NAME", "us-east-1")
+
+
+@pytest.fixture
+def config_instance() -> Config:
+    return Config()
+
+
+@pytest.fixture
+def mocked_s3(config_instance):
+    with mock_aws():
+        s3 = boto3.client("s3", region_name=config_instance.AWS_REGION_NAME)
+        s3.create_bucket(Bucket="awd")
+        yield s3
 
 
 @pytest.fixture
 def runner():
     return CliRunner()
+
+
+@pytest.fixture
+def s3_client():
+    return S3Client()
+
+
+@pytest.fixture
+def stream():
+    return StringIO()
```
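As a usage note, a hypothetical test (`test_put_file` is illustrative, not part of this PR) showing how the `mocked_s3` and `s3_client` fixtures compose. Requesting `mocked_s3` first keeps the moto context active when `S3Client` makes its calls, and this assumes a default AWS region is available so `boto3.client("s3")` can resolve one:

```python
def test_put_file(mocked_s3, s3_client):
    # put_file returns the raw put_object response from the mocked S3 backend.
    response = s3_client.put_file("test content", bucket="awd", key="test.csv")
    assert response["ResponseMetadata"]["HTTPStatusCode"] == 200

    # The uploaded key is visible through the mocked boto3 client from the fixture.
    keys = [obj["Key"] for obj in mocked_s3.list_objects_v2(Bucket="awd")["Contents"]]
    assert keys == ["test.csv"]
```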
tests/test_config.py:

```diff
@@ -1,37 +1,45 @@
 import logging
 
-from dsc.config import configure_logger, configure_sentry
+import pytest
 
 
-def test_configure_logger_not_verbose():
+def test_check_required_env_vars(monkeypatch, config_instance):
+    monkeypatch.delenv("WORKSPACE")
+    with pytest.raises(OSError, match="Missing required environment variables:"):
+        config_instance.check_required_env_vars()
+
+
+def test_configure_logger_not_verbose(config_instance, stream):
     logger = logging.getLogger(__name__)
-    result = configure_logger(logger, verbose=False)
-    info_log_level = 20
-    assert logger.getEffectiveLevel() == info_log_level
+    result = config_instance.configure_logger(logger, stream, verbose=False)
+    assert logger.getEffectiveLevel() == logging.INFO
     assert result == "Logger 'tests.test_config' configured with level=INFO"
+    stream.seek(0)
+    assert next(stream) == "INFO\n"
 
 
-def test_configure_logger_verbose():
+def test_configure_logger_verbose(config_instance, stream):
     logger = logging.getLogger(__name__)
-    result = configure_logger(logger, verbose=True)
-    debug_log_level = 10
-    assert logger.getEffectiveLevel() == debug_log_level
+    result = config_instance.configure_logger(logger, stream, verbose=True)
+    assert logger.getEffectiveLevel() == logging.DEBUG
     assert result == "Logger 'tests.test_config' configured with level=DEBUG"
+    stream.seek(0)
+    assert next(stream) == "DEBUG\n"
 
 
-def test_configure_sentry_no_env_variable(monkeypatch):
+def test_configure_sentry_no_env_variable(monkeypatch, config_instance):
     monkeypatch.delenv("SENTRY_DSN", raising=False)
-    result = configure_sentry()
+    result = config_instance.configure_sentry()
     assert result == "No Sentry DSN found, exceptions will not be sent to Sentry"
 
 
-def test_configure_sentry_env_variable_is_none(monkeypatch):
+def test_configure_sentry_env_variable_is_none(monkeypatch, config_instance):
     monkeypatch.setenv("SENTRY_DSN", "None")
-    result = configure_sentry()
+    result = config_instance.configure_sentry()
     assert result == "No Sentry DSN found, exceptions will not be sent to Sentry"
 
 
-def test_configure_sentry_env_variable_is_dsn(monkeypatch):
+def test_configure_sentry_env_variable_is_dsn(monkeypatch, config_instance):
     monkeypatch.setenv("SENTRY_DSN", "https://1234567890@00000.ingest.sentry.io/123456")
-    result = configure_sentry()
+    result = config_instance.configure_sentry()
     assert result == "Sentry DSN found, exceptions will be sent to Sentry with env=test"
```
Author (on `configure_logger` in `dsc/config.py`): This is a combination of `configure_logger` from `wiley-deposits` and our template config code. There's some repetition, and I have a gut feeling there's a better way to do this but didn't land on anything specific; curious about your thoughts.
Reviewer: Here's a possible refactor, removing some redundancies. At a glance, what makes this hard to refactor are the tests `test_configure_logger_not_verbose` and `test_configure_logger_verbose`, which attempt to read from the stream handler that is set up. That's what the `log_method, log_level = ...` assignments above do: they select the appropriate logging method to use for `log_method(f"{logging.getLevelName(logger.getEffectiveLevel())}")`, which satisfies these tests.

My feeling: good enough for now, but globally I think we could improve our logging setups. The presence of `logging.basicConfig()` is kind of a red flag unto itself. But I think this is mostly consistent with other projects, so there's value in that.

For example: I'm not sure that returning a string is ideal. Perhaps this method could just log what it has done and return `None`. But this would require updating the CLI, tests, etc. that are expecting a string response. And as mentioned, this is consistent with other apps.
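The reviewer's suggested code did not survive the page export. Purely as an illustration, not the reviewer's actual suggestion, here is one shape such a refactor could take under the constraints described above: a single stream handler, one branch on `verbose`, and the level name still written to the stream so the two tests can read it back:

```python
import logging
from io import StringIO


class Config:
    # ... env var handling as in the diff above ...

    def configure_logger(
        self, logger: logging.Logger, stream: StringIO, *, verbose: bool
    ) -> str:
        logging_format_base = "%(asctime)s %(levelname)s %(name)s.%(funcName)s()"
        if verbose:
            log_level = logging.DEBUG
            template = logging_format_base + " line %(lineno)d: %(message)s"
            for handler in logging.root.handlers:
                handler.addFilter(logging.Filter("dsc"))
        else:
            log_level = logging.INFO
            template = logging_format_base + ": %(message)s"

        logging.basicConfig(format=template)
        logger.addHandler(logging.StreamHandler(stream))  # attach the handler once
        logger.setLevel(log_level)

        # logger.log(log_level, ...) stands in for the log_method/log_level pairing:
        # it still writes the level name to `stream`, which the tests read back.
        level_name = logging.getLevelName(logger.getEffectiveLevel())
        logger.log(log_level, level_name)
        return f"Logger '{logger.name}' configured with level={level_name}"
```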
Contributor: Still need to review this at a deeper level, but figured I'd pass along a discussion @ghukill and I had on a previous PR I worked on re: excluding `configure_sentry` and `configure_logger` from the `Config` class. From @ghukill:
Author: Great refactor, thanks! I was hoping this would prompt a larger conversation about the template/config class, because I think it needs work as well. Maybe a future DataEng meeting topic?
Reviewer: I think a future DataEng topic could be good, for sure. But I think it could/should extend longer than a 30-minute meeting w/ CB.

I have two somewhat conflicting feelings at the moment re: config and logging: globally, I think we could improve our setups, and yet consistency with our other projects has value in itself. I think it'd be easy for any one project to grind to a halt trying to get it perfect, when we'll always have tension with, "but others don't do it that way..."

Until we do take a long and thoughtful look at our config + logging approach for Python projects, I'm pretty okay with allowing projects to try and strike a balance between feelings #1 and #2 above.