3 changes: 3 additions & 0 deletions Pipfile
@@ -4,12 +4,15 @@ verify_ssl = true
name = "pypi"

[packages]
boto3 = "*"
boto3-stubs = {extras = ["essential","ses"], version = "*"}
click = "*"
sentry-sdk = "*"

[dev-packages]
black = "*"
coveralls = "*"
moto = "*"
mypy = "*"
pre-commit = "*"
pytest = "*"
604 changes: 509 additions & 95 deletions Pipfile.lock

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions README.md
@@ -21,7 +21,13 @@ Description of the app
```shell
SENTRY_DSN=### If set to a valid Sentry DSN, enables Sentry exception monitoring. This is not needed for local development.
WORKSPACE=### Set to `dev` for local development, this will be set to `stage` and `prod` in those environments by Terraform.
AWS_REGION_NAME=### Default AWS region.
```

### Optional

```shell
LOG_LEVEL=### Logging level. Defaults to 'INFO'.
```


10 changes: 7 additions & 3 deletions dsc/cli.py
@@ -1,12 +1,14 @@
import logging
from datetime import timedelta
from io import StringIO
from time import perf_counter

import click

from dsc.config import configure_logger, configure_sentry
from dsc.config import Config

logger = logging.getLogger(__name__)
CONFIG = Config()


@click.command()
@@ -15,9 +17,11 @@
)
def main(*, verbose: bool) -> None:
    start_time = perf_counter()
    stream = StringIO()
    root_logger = logging.getLogger()
    logger.info(configure_logger(root_logger, verbose=verbose))
    logger.info(configure_sentry())
    logger.info(CONFIG.configure_logger(root_logger, stream, verbose=verbose))
    logger.info(CONFIG.configure_sentry())
    CONFIG.check_required_env_vars()
    logger.info("Running process")

    # Do things here!
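
For orientation, here is a minimal sketch of how this entrypoint could be exercised with Click's test runner. It is illustrative only and not part of the PR: it assumes the elided `@click.option` above wires up a `--verbose` flag, and it sets the required env vars itself so `CONFIG.check_required_env_vars()` passes.

```python
from click.testing import CliRunner

from dsc.cli import main


def test_main_smoke(monkeypatch):
    # Assumed values so CONFIG.check_required_env_vars() does not raise.
    monkeypatch.setenv("WORKSPACE", "test")
    monkeypatch.setenv("SENTRY_DSN", "None")
    monkeypatch.setenv("AWS_REGION_NAME", "us-east-1")

    result = CliRunner().invoke(main, ["--verbose"])
    assert result.exit_code == 0
```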
82 changes: 56 additions & 26 deletions dsc/config.py
@@ -1,33 +1,63 @@
import logging
import os
from collections.abc import Iterable
from io import StringIO

import sentry_sdk


def configure_logger(logger: logging.Logger, *, verbose: bool) -> str:
    if verbose:
        logging.basicConfig(
            format="%(asctime)s %(levelname)s %(name)s.%(funcName)s() line %(lineno)d: "
            "%(message)s"
        )
        logger.setLevel(logging.DEBUG)
        for handler in logging.root.handlers:
            handler.addFilter(logging.Filter("dsc"))
    else:
        logging.basicConfig(
            format="%(asctime)s %(levelname)s %(name)s.%(funcName)s(): %(message)s"
class Config:
    REQUIRED_ENV_VARS: Iterable[str] = [
        "WORKSPACE",
        "SENTRY_DSN",
        "AWS_REGION_NAME",
    ]

    OPTIONAL_ENV_VARS: Iterable[str] = ["LOG_LEVEL"]

    def __getattr__(self, name: str) -> str | None:
        """Provide dot notation access to configurations and env vars on this class."""
        if name in self.REQUIRED_ENV_VARS or name in self.OPTIONAL_ENV_VARS:
            return os.getenv(name)
        message = f"'{name}' not a valid configuration variable"
        raise AttributeError(message)

    def check_required_env_vars(self) -> None:
        """Method to raise exception if required env vars not set."""
        missing_vars = [var for var in self.REQUIRED_ENV_VARS if not os.getenv(var)]
        if missing_vars:
            message = f"Missing required environment variables: {', '.join(missing_vars)}"
            raise OSError(message)

    def configure_logger(
@ehanson8 (Contributor, Author) commented on Dec 5, 2024:

This is a combination of `configure_logger` from wiley-deposits and our template config code. There's some repetition, and I have a gut feeling there's a better way to do this but didn't land on anything specific; curious about your thoughts.

@ghukill commented on Dec 6, 2024:

Here's a possible refactor, removing some redundancies:

def configure_logger(
    self, logger: logging.Logger, stream: StringIO, *, verbose: bool
) -> str:
    logging_format_base = "%(asctime)s %(levelname)s %(name)s.%(funcName)s()"
    logger.addHandler(logging.StreamHandler(stream))

    if verbose:
        log_method, log_level = logger.debug, logging.DEBUG
        template = logging_format_base + " line %(lineno)d: %(message)s"
        for handler in logging.root.handlers:
            handler.addFilter(logging.Filter("dsc"))
    else:
        log_method, log_level = logger.info, logging.INFO
        template = logging_format_base + ": %(message)s"

    logger.setLevel(log_level)
    logging.basicConfig(format=template)
    logger.addHandler(logging.StreamHandler(stream))
    log_method(f"{logging.getLevelName(logger.getEffectiveLevel())}")

    return (
        f"Logger '{logger.name}' configured with level="
        f"{logging.getLevelName(logger.getEffectiveLevel())}"
    )

At a glance, what makes this hard to refactor are the tests:

  • test_configure_logger_not_verbose
  • test_configure_logger_verbose

which attempt to read from the stream handler that is set up. That's what the `log_method, log_level = ...` assignment above is for: it selects the appropriate logging method so that `log_method(f"{logging.getLevelName(logger.getEffectiveLevel())}")` writes the line those tests expect.

My feeling: good enough for now, but globally I think we could improve our logging setups. The presence of logging.basicConfig() is kind of a red flag unto itself. But I think this is mostly consistent with other projects, so there's value in that.

For example: I'm not sure that returning a string is ideal. Perhaps this method could just log what it has done, and return None. But this would require updating the CLI, tests, etc. that are expecting a string response. And as mentioned, this is consistent with other apps.

A contributor commented:

Still need to review this at a deeper level, but figured I'd pass along a discussion @ghukill and I had on a previous PR I worked on re: excluding configure_sentry and configure_logger from the Config class:

From @ghukill :

  • I like having a Config class that I can put complex behavior on for static-esque values I want to access through the application
  • I'm indifferent to configure_logger and configure_sentry on there / they feel somewhat disconnected from the essence of the application, which I attribute to the Config class. Especially sentry. Logging is a little more difficult for me... but then again logging is always kind of difficult. Of course, if those methods (configure_sentry and configure_logger) are on it, they don't need to be called... but if I'm not supposed to call configure_sentry from anywhere in the app, then maybe it doesn't belong on that global Config object?
    This is 5:00pm talking, not much reasoning, but just my knee-jerk thinking.

@ehanson8 (Contributor, Author) replied:

Great refactor, thanks! I was hoping this would prompt a larger conversation about the template/config class because I think it needs work as well, maybe a future DataEng meeting topic?

A follow-up reply:

I think a future DataEng topic could be good, for sure. But I think it could/should extend longer than a 30-minute meeting w/ CB.

I have two somewhat conflicting feelings at the moment re: config and logging:

  1. it's okay for projects to tweak our established approach as needed
  2. it's okay if slightly awkward or sub-optimal insofar as it's consistent with other repos

I think it'd be easy for any one project to grind to a halt trying to get it perfect, when we'll always have tension with, "but others don't do it that way..."

Until we do take a long and thoughtful look at our config + logging approach for python projects, I'm pretty okay with allowing projects to try and strike a balance between feelings #1 and #2 above.

        self, logger: logging.Logger, stream: StringIO, *, verbose: bool
    ) -> str:
        logging_format_base = "%(asctime)s %(levelname)s %(name)s.%(funcName)s()"
        logger.addHandler(logging.StreamHandler(stream))

        if verbose:
            log_method, log_level = logger.debug, logging.DEBUG
            template = logging_format_base + " line %(lineno)d: %(message)s"
            for handler in logging.root.handlers:
                handler.addFilter(logging.Filter("dsc"))
        else:
            log_method, log_level = logger.info, logging.INFO
            template = logging_format_base + ": %(message)s"

        logger.setLevel(log_level)
        logging.basicConfig(format=template)
        logger.addHandler(logging.StreamHandler(stream))
        log_method(f"{logging.getLevelName(logger.getEffectiveLevel())}")

        return (
            f"Logger '{logger.name}' configured with level="
            f"{logging.getLevelName(logger.getEffectiveLevel())}"
        )
        logger.setLevel(logging.INFO)
    return (
        f"Logger '{logger.name}' configured with level="
        f"{logging.getLevelName(logger.getEffectiveLevel())}"
    )


def configure_sentry() -> str:
    env = os.getenv("WORKSPACE")
    sentry_dsn = os.getenv("SENTRY_DSN")
    if sentry_dsn and sentry_dsn.lower() != "none":
        sentry_sdk.init(sentry_dsn, environment=env)
        return f"Sentry DSN found, exceptions will be sent to Sentry with env={env}"
    return "No Sentry DSN found, exceptions will not be sent to Sentry"

    def configure_sentry(self) -> str:
        env = self.WORKSPACE
        sentry_dsn = self.SENTRY_DSN
        if sentry_dsn and sentry_dsn.lower() != "none":
            sentry_sdk.init(sentry_dsn, environment=env)
            return f"Sentry DSN found, exceptions will be sent to Sentry with env={env}"
        return "No Sentry DSN found, exceptions will not be sent to Sentry"
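
For context, a minimal usage sketch of the new Config class as it appears in this diff; the env var values below are made up for illustration and are not part of the PR.

```python
import os

from dsc.config import Config

# Illustrative values only; in deployed environments Terraform supplies these.
os.environ["WORKSPACE"] = "dev"
os.environ["SENTRY_DSN"] = "None"
os.environ["AWS_REGION_NAME"] = "us-east-1"

config = Config()
config.check_required_env_vars()   # passes here; raises OSError if any required var is missing
print(config.WORKSPACE)            # "dev", resolved through __getattr__
print(config.LOG_LEVEL)            # None, since the optional var is unset
print(config.configure_sentry())   # "No Sentry DSN found, ..." because the DSN is "None"

try:
    config.UNKNOWN_SETTING         # not in REQUIRED_ENV_VARS or OPTIONAL_ENV_VARS
except AttributeError as exc:
    print(exc)                     # "'UNKNOWN_SETTING' not a valid configuration variable"
```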
80 changes: 80 additions & 0 deletions dsc/s3.py
@@ -0,0 +1,80 @@
from __future__ import annotations

import logging
from typing import TYPE_CHECKING

import boto3

if TYPE_CHECKING:
    from collections.abc import Iterator

    from mypy_boto3_s3.type_defs import PutObjectOutputTypeDef

logger = logging.getLogger(__name__)


class S3Client:
    """A class to perform common S3 operations for this application."""

    def __init__(self) -> None:
        self.client = boto3.client("s3")

    def archive_file_with_new_key(
A contributor commented:

Curious whether archiving the metadata file will be required for workflows other than Wiley deposits? 🤔 If an archive/ folder will exist for every workflow bucket, I think the function name is fine as-is; otherwise, it might be good to rename the function so it is more universal.

@ehanson8 (Contributor, Author) replied:

I believe so; we probably want an archived/processed path in all of the workflows to prevent re-processing of files.

        self, bucket: str, key: str, archived_key_prefix: str
    ) -> None:
        """Update the key of the specified file to archive it from processing.

        Args:
            bucket: The S3 bucket containing the files to be archived.
            key: The key of the file to archive.
            archived_key_prefix: The prefix to be applied to the archived file.
        """
        self.client.copy_object(
            Bucket=bucket,
            CopySource=f"{bucket}/{key}",
            Key=f"{archived_key_prefix}/{key}",
        )
        self.client.delete_object(
            Bucket=bucket,
            Key=key,
        )

    def put_file(
        self, file_content: str | bytes, bucket: str, key: str
    ) -> PutObjectOutputTypeDef:
        """Put a file in a specified S3 bucket with a specified key.

        Args:
            file_content: The content of the file to be uploaded.
            bucket: The S3 bucket where the file will be uploaded.
            key: The key to be used for the uploaded file.
        """
        response = self.client.put_object(
            Body=file_content,
            Bucket=bucket,
            Key=key,
        )
        logger.debug(f"'{key}' uploaded to S3")
        return response

    def get_files_iter(
        self, bucket: str, file_type: str, excluded_key_prefix: str
    ) -> Iterator[str]:
        """Retrieve file based on file type, bucket, and without excluded prefix.

        Args:
            bucket: The S3 bucket to search.
            file_type: The file type to retrieve.
            excluded_key_prefix: Files with this key prefix will not be retrieved.
        """
        paginator = self.client.get_paginator("list_objects_v2")
        page_iterator = paginator.paginate(Bucket=bucket)

        for page in page_iterator:
            files = [
                content["Key"]
                for content in page["Contents"]
                if content["Key"].endswith(file_type)
                and excluded_key_prefix not in content["Key"]
            ]
            yield from files
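
To make the intended flow concrete, here is a rough sketch of S3Client used against a moto-mocked bucket. The bucket name "awd" mirrors the conftest fixture below; the "archived" prefix, file names, and default-region setup are assumptions for illustration, not part of the PR.

```python
import os

import boto3
from moto import mock_aws

from dsc.s3 import S3Client

# Assumption: give boto3 a default region so S3Client() can build a client.
os.environ.setdefault("AWS_DEFAULT_REGION", "us-east-1")

with mock_aws():
    boto3.client("s3").create_bucket(Bucket="awd")
    s3_client = S3Client()

    s3_client.put_file("doi,title\n10.1/abc,Example", "awd", "batch-01.csv")
    print(list(s3_client.get_files_iter("awd", "csv", "archived")))  # ['batch-01.csv']

    # Move the file under the archived/ prefix so it is skipped on later runs.
    s3_client.archive_file_with_new_key("awd", "batch-01.csv", "archived")
    print(list(s3_client.get_files_iter("awd", "csv", "archived")))  # []
```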
11 changes: 5 additions & 6 deletions pyproject.toml
@@ -28,21 +28,20 @@ ignore = [
# default
"ANN101",
"ANN102",
"COM812",
"D107",
"N812",
"PTH",

# project-specific
"C90",
"COM812",
"D100",
"D101",
"D102",
"D103",
"D104",
"D107",
"G004",
"N812",
"PLR0912",
"PLR0913",
"PLR0915",
"PTH",
"S320",
"S321",
]
31 changes: 31 additions & 0 deletions tests/conftest.py
@@ -1,13 +1,44 @@
from io import StringIO

import boto3
import pytest
from click.testing import CliRunner
from moto import mock_aws

from dsc.config import Config
from dsc.s3 import S3Client


@pytest.fixture(autouse=True)
def _test_env(monkeypatch):
    monkeypatch.setenv("SENTRY_DSN", "None")
    monkeypatch.setenv("WORKSPACE", "test")
    monkeypatch.setenv("AWS_REGION_NAME", "us-east-1")


@pytest.fixture
def config_instance() -> Config:
    return Config()


@pytest.fixture
def mocked_s3(config_instance):
    with mock_aws():
        s3 = boto3.client("s3", region_name=config_instance.AWS_REGION_NAME)
        s3.create_bucket(Bucket="awd")
        yield s3


@pytest.fixture
def runner():
    return CliRunner()


@pytest.fixture
def s3_client():
    return S3Client()


@pytest.fixture
def stream():
    return StringIO()
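
No tests/test_s3.py appears in this diff, but as a sketch, the mocked_s3 and s3_client fixtures above could be combined roughly as follows. File and key names are hypothetical, and this assumes boto3 can resolve a default AWS region in the test environment (e.g. AWS_DEFAULT_REGION), which the s3_client fixture itself also requires.

```python
# Hypothetical tests/test_s3.py -- not part of this PR.
# Requesting mocked_s3 before s3_client ensures the moto mock is active
# when S3Client() creates its boto3 client.
def test_put_file_creates_object(mocked_s3, s3_client):
    s3_client.put_file("test content", "awd", "test.csv")
    keys = [obj["Key"] for obj in mocked_s3.list_objects_v2(Bucket="awd")["Contents"]]
    assert keys == ["test.csv"]


def test_archive_file_with_new_key_moves_object(mocked_s3, s3_client):
    s3_client.put_file("test content", "awd", "test.csv")
    s3_client.archive_file_with_new_key("awd", "test.csv", "archived")
    keys = [obj["Key"] for obj in mocked_s3.list_objects_v2(Bucket="awd")["Contents"]]
    assert keys == ["archived/test.csv"]
```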
38 changes: 23 additions & 15 deletions tests/test_config.py
@@ -1,37 +1,45 @@
import logging

from dsc.config import configure_logger, configure_sentry
import pytest


def test_configure_logger_not_verbose():
def test_check_required_env_vars(monkeypatch, config_instance):
    monkeypatch.delenv("WORKSPACE")
    with pytest.raises(OSError, match="Missing required environment variables:"):
        config_instance.check_required_env_vars()


def test_configure_logger_not_verbose(config_instance, stream):
    logger = logging.getLogger(__name__)
    result = configure_logger(logger, verbose=False)
    info_log_level = 20
    assert logger.getEffectiveLevel() == info_log_level
    result = config_instance.configure_logger(logger, stream, verbose=False)
    assert logger.getEffectiveLevel() == logging.INFO
    assert result == "Logger 'tests.test_config' configured with level=INFO"
    stream.seek(0)
    assert next(stream) == "INFO\n"


def test_configure_logger_verbose():
def test_configure_logger_verbose(config_instance, stream):
    logger = logging.getLogger(__name__)
    result = configure_logger(logger, verbose=True)
    debug_log_level = 10
    assert logger.getEffectiveLevel() == debug_log_level
    result = config_instance.configure_logger(logger, stream, verbose=True)
    assert logger.getEffectiveLevel() == logging.DEBUG
    assert result == "Logger 'tests.test_config' configured with level=DEBUG"
    stream.seek(0)
    assert next(stream) == "DEBUG\n"


def test_configure_sentry_no_env_variable(monkeypatch):
def test_configure_sentry_no_env_variable(monkeypatch, config_instance):
    monkeypatch.delenv("SENTRY_DSN", raising=False)
    result = configure_sentry()
    result = config_instance.configure_sentry()
    assert result == "No Sentry DSN found, exceptions will not be sent to Sentry"


def test_configure_sentry_env_variable_is_none(monkeypatch):
def test_configure_sentry_env_variable_is_none(monkeypatch, config_instance):
    monkeypatch.setenv("SENTRY_DSN", "None")
    result = configure_sentry()
    result = config_instance.configure_sentry()
    assert result == "No Sentry DSN found, exceptions will not be sent to Sentry"


def test_configure_sentry_env_variable_is_dsn(monkeypatch):
def test_configure_sentry_env_variable_is_dsn(monkeypatch, config_instance):
    monkeypatch.setenv("SENTRY_DSN", "https://1234567890@00000.ingest.sentry.io/123456")
    result = configure_sentry()
    result = config_instance.configure_sentry()
    assert result == "Sentry DSN found, exceptions will be sent to Sentry with env=test"