Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
84 commits
Select commit Hold shift + click to select a range
f79f6fd
some missing parts
sanderegg Jan 23, 2024
3748cca
default to tls security
sanderegg Jan 23, 2024
cf126b9
remove useless parts
sanderegg Jan 23, 2024
2fe234a
recipes to create certificates
sanderegg Jan 23, 2024
6ed0a52
added gitignore
sanderegg Jan 23, 2024
2e65c13
pass secrets into the dask backend
sanderegg Jan 23, 2024
dd2a5d7
ensure we have certificates
sanderegg Jan 24, 2024
75ffa3d
use yaml anchors
sanderegg Jan 24, 2024
abe9fad
cleanup
sanderegg Jan 24, 2024
2ba173c
clusters-keeper also talks with dask
sanderegg Jan 24, 2024
470e084
default scheduler is now tls
sanderegg Jan 24, 2024
db4c876
use tls certificate for default
sanderegg Jan 24, 2024
9c2ea47
move to .env
sanderegg Jan 24, 2024
005c399
envs
sanderegg Jan 25, 2024
d794178
typo
sanderegg Jan 25, 2024
828deba
adds new setting to pass default authentication in clusters-keeper
sanderegg Jan 25, 2024
b92bd67
use authentication to connect with scheduler
sanderegg Jan 25, 2024
1c135f7
fix startup of stack, ensure we have certificates
sanderegg Jan 25, 2024
5071ead
same
sanderegg Jan 25, 2024
0fe9316
ensure we have the necessary settings set
sanderegg Jan 25, 2024
41d98d5
fix no security
sanderegg Jan 25, 2024
164054a
fix syntax
sanderegg Jan 25, 2024
b07dda0
all tests go through
sanderegg Jan 25, 2024
2599662
added a test to detect missing ENVs
sanderegg Jan 25, 2024
e3b9286
ruff
sanderegg Jan 25, 2024
5605764
added the secrets
sanderegg Jan 25, 2024
9e15c93
improving tests
sanderegg Jan 25, 2024
ca529f6
minor
sanderegg Jan 25, 2024
b540522
mypy
sanderegg Jan 25, 2024
154c687
missing ENVs
sanderegg Jan 25, 2024
772d727
missing variables
sanderegg Jan 25, 2024
7086aa0
revert
sanderegg Jan 25, 2024
ab447b8
missing env
sanderegg Jan 25, 2024
86bd4b2
missing ENV
sanderegg Jan 25, 2024
34c6285
ensure the dashboard is accessible. since this is where we check the …
sanderegg Jan 25, 2024
941e34f
revert that change
sanderegg Jan 25, 2024
0ac337f
add authentication
sanderegg Jan 25, 2024
b1bb1bb
adding new variable to connect with secured scheduler
sanderegg Jan 25, 2024
008f40d
fixed tests to new syntax
sanderegg Jan 25, 2024
5141e92
missing ENV
sanderegg Jan 25, 2024
44946a1
remove validator
sanderegg Jan 25, 2024
1b26b84
fix test
sanderegg Jan 25, 2024
da6a8a1
fixed test
sanderegg Jan 25, 2024
dc2f563
make the tls certificates optional
sanderegg Jan 26, 2024
4558bf5
upgrade v0.9.0
sanderegg Jan 26, 2024
f2bcfc8
pass iam role with ec2 instances
sanderegg Jan 26, 2024
e377456
new ENV
sanderegg Jan 26, 2024
fcff7e0
add call
sanderegg Jan 26, 2024
c5405f4
add missing env PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE
sanderegg Jan 26, 2024
f5309d4
fix code
sanderegg Jan 26, 2024
caa7cbe
fix call syntax
sanderegg Jan 26, 2024
606dbb9
fix indentation
sanderegg Jan 26, 2024
ec21501
fix protocol
sanderegg Jan 26, 2024
680f387
ensure the path used is the correct one
sanderegg Jan 26, 2024
2eb75a3
add settings to download data
sanderegg Jan 26, 2024
2bc0f27
we download certificates
sanderegg Jan 26, 2024
c79d969
ensure we go tls
sanderegg Jan 26, 2024
0874f88
return the correct auth
sanderegg Jan 26, 2024
ab98145
fix auth in autoscaling
sanderegg Jan 26, 2024
a490395
added missing envs
sanderegg Jan 26, 2024
463f676
fixed test
sanderegg Jan 26, 2024
8fd34a3
fixed test
sanderegg Jan 27, 2024
8783033
unnecessary env
sanderegg Jan 27, 2024
c1e4f47
doc
sanderegg Jan 27, 2024
65acc82
fixed test
sanderegg Jan 27, 2024
86df492
it is now tls
sanderegg Jan 27, 2024
d1cc94f
tls
sanderegg Jan 27, 2024
c979265
ensure starts when DASK_TLS_CA_FILE not set
sanderegg Jan 27, 2024
40ad607
mock dask client
sanderegg Jan 29, 2024
c00d770
use a non secure scheduler for this test
sanderegg Jan 29, 2024
4d3b9d8
ruff
sanderegg Jan 29, 2024
96779ab
use local ip instead of fixed 127.0.0.1
sanderegg Jan 29, 2024
4a7f5cd
ensure migration has access
sanderegg Jan 29, 2024
8e93c14
show log for TLS
sanderegg Jan 29, 2024
ef15300
add scheduler auth
sanderegg Jan 29, 2024
7eb1747
added auth
sanderegg Jan 29, 2024
59c0b3a
fixed mypy
sanderegg Jan 29, 2024
ae41532
missing quote
sanderegg Jan 29, 2024
5a21185
missing ENV
sanderegg Jan 29, 2024
917521e
missing envs
sanderegg Jan 29, 2024
43ff295
use defaults
sanderegg Jan 29, 2024
99d534e
simplify
sanderegg Jan 29, 2024
445cdda
fix settings
sanderegg Jan 29, 2024
c864a1f
@pcrespov review: mistake
sanderegg Jan 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .env-devel
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,15 @@ CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX=""

DASK_SCHEDULER_HOST=dask-scheduler
DASK_SCHEDULER_PORT=8786
DASK_TLS_CA_FILE=/home/scu/.dask/dask-crt.pem
DASK_TLS_KEY=/home/scu/.dask/dask-key.pem
DASK_TLS_CERT=/home/scu/.dask/dask-crt.pem

DIRECTOR_REGISTRY_CACHING_TTL=900
DIRECTOR_REGISTRY_CACHING=True
DIRECTOR_GENERIC_RESOURCE_PLACEMENT_CONSTRAINTS_SUBSTITUTIONS='{}'

COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=tcp://dask-scheduler:8786
COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=tls://dask-scheduler:8786
COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE=S3
COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE=PRESIGNED
COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE=PRESIGNED
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,7 @@ show-endpoints:
up-devel: .stack-simcore-development.yml .init-swarm $(CLIENT_WEB_OUTPUT) ## Deploys local development stack, qx-compile+watch and ops stack (pass 'make ops_disabled=1 up-...' to disable)
# Start compile+watch front-end container [front-end]
@$(MAKE_C) services/static-webserver/client down compile-dev flags=--watch
@$(MAKE_C) services/dask-sidecar certificates
# Deploy stack $(SWARM_STACK_NAME) [back-end]
@docker stack deploy --with-registry-auth -c $< $(SWARM_STACK_NAME)
@$(MAKE) .deploy-ops
Expand All @@ -341,6 +342,7 @@ up-devel: .stack-simcore-development.yml .init-swarm $(CLIENT_WEB_OUTPUT) ## Dep
up-devel-frontend: .stack-simcore-development-frontend.yml .init-swarm ## Every service in production except static-webserver. For front-end development
# Start compile+watch front-end container [front-end]
@$(MAKE_C) services/static-webserver/client down compile-dev flags=--watch
@$(MAKE_C) services/dask-sidecar certificates
# Deploy stack $(SWARM_STACK_NAME) [back-end]
@docker stack deploy --with-registry-auth -c $< $(SWARM_STACK_NAME)
@$(MAKE) .deploy-ops
Expand All @@ -350,6 +352,7 @@ up-devel-frontend: .stack-simcore-development-frontend.yml .init-swarm ## Every

up-prod: .stack-simcore-production.yml .init-swarm ## Deploys local production stack and ops stack (pass 'make ops_disabled=1 ops_ci=1 up-...' to disable or target=<service-name> to deploy a single service)
ifeq ($(target),)
@$(MAKE_C) services/dask-sidecar certificates
# Deploy stack $(SWARM_STACK_NAME)
@docker stack deploy --with-registry-auth -c $< $(SWARM_STACK_NAME)
@$(MAKE) .deploy-ops
Expand All @@ -360,6 +363,7 @@ endif
@$(_show_endpoints)

up-version: .stack-simcore-version.yml .init-swarm ## Deploys versioned stack '$(DOCKER_REGISTRY)/{service}:$(DOCKER_IMAGE_TAG)' and ops stack (pass 'make ops_disabled=1 up-...' to disable)
@$(MAKE_C) services/dask-sidecar certificates
# Deploy stack $(SWARM_STACK_NAME)
@docker stack deploy --with-registry-auth -c $< $(SWARM_STACK_NAME)
@$(MAKE) .deploy-ops
Expand Down
3 changes: 3 additions & 0 deletions packages/aws-library/src/aws_library/ec2/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,9 @@ async def start_aws_instance(
ImageId=instance_config.ami_id,
MinCount=number_of_instances,
MaxCount=number_of_instances,
IamInstanceProfile={"Arn": instance_config.iam_instance_profile}
if instance_config.iam_instance_profile
else {},
InstanceType=instance_config.type.name,
InstanceInitiatedShutdownBehavior="terminate",
KeyName=instance_config.key_name,
Expand Down
1 change: 1 addition & 0 deletions packages/aws-library/src/aws_library/ec2/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ class EC2InstanceConfig:
key_name: str
security_group_ids: list[str]
subnet_id: str
iam_instance_profile: str


AMIIdStr: TypeAlias = str
Expand Down
1 change: 1 addition & 0 deletions packages/aws-library/tests/test_ec2_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ def ec2_instance_config(
key_name=faker.pystr(),
security_group_ids=[aws_security_group_id],
subnet_id=aws_subnet_id,
iam_instance_profile="",
)


Expand Down
23 changes: 22 additions & 1 deletion packages/models-library/src/models_library/clusters.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from enum import auto
from pathlib import Path
from typing import Any, ClassVar, Final, Literal, TypeAlias

from pydantic import (
Expand Down Expand Up @@ -45,6 +46,7 @@ class BaseAuthentication(BaseModel):
type: str

class Config:
frozen = True
extra = Extra.forbid


Expand Down Expand Up @@ -95,7 +97,26 @@ class NoAuthentication(BaseAuthentication):
type: Literal["none"] = "none"


InternalClusterAuthentication: TypeAlias = NoAuthentication
# Authentication variant of BaseAuthentication whose ``type`` is fixed to "tls"
# and that carries three file paths (CA file, client certificate, client key).
# NOTE: documented with comments instead of a docstring on purpose, so the
# generated model schema is left untouched.
class TLSAuthentication(BaseAuthentication):
    # discriminating literal, always "tls"
    type: Literal["tls"] = "tls"
    # paths of the TLS files (see the example values in Config below)
    tls_ca_file: Path
    tls_client_cert: Path
    tls_client_key: Path

    # inherits the settings of BaseAuthentication.Config and only adds examples
    class Config(BaseAuthentication.Config):
        schema_extra: ClassVar[dict[str, Any]] = {
            "examples": [
                {
                    "type": "tls",
                    "tls_ca_file": "/path/to/ca_file",
                    "tls_client_cert": "/path/to/cert_file",
                    "tls_client_key": "/path/to/key_file",
                },
            ]
        }


InternalClusterAuthentication: TypeAlias = NoAuthentication | TLSAuthentication
ExternalClusterAuthentication: TypeAlias = (
SimpleAuthentication | KerberosAuthentication | JupyterHubTokenAuthentication
)
Expand Down
3 changes: 2 additions & 1 deletion packages/postgres-database/docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,8 @@ FROM base as production
ENV PYTHONOPTIMIZE=TRUE

WORKDIR /home/scu

# ensure home folder is read/writable for user scu
RUN chown -R scu /home/scu
# bring installed package without build tools
COPY --from=build ${VIRTUAL_ENV} ${VIRTUAL_ENV}
COPY entrypoint.bash /home/entrypoint.bash
Expand Down
23 changes: 14 additions & 9 deletions packages/pytest-simcore/src/pytest_simcore/docker_swarm.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,19 +232,24 @@ def _deploy_stack(compose_file: Path, stack_name: str) -> None:
except subprocess.CalledProcessError as err:
if b"update out of sequence" in err.stderr:
raise TryAgain from err
print(
"docker_stack failed",
f"{' '.join(err.cmd)}",
f"returncode={err.returncode}",
f"stdout={err.stdout}",
f"stderr={err.stderr}",
"\nTIP: frequent failure is due to a corrupt .env file: Delete .env and .env.bak",
pytest.fail(
reason=f"deploying docker_stack failed: {err.cmd=}, {err.returncode=}, {err.stdout=}, {err.stderr=}\nTIP: frequent failure is due to a corrupt .env file: Delete .env and .env.bak"
)
raise


def _make_dask_sidecar_certificates(simcore_service_folder: Path) -> None:
    """Run ``make certificates`` inside the ``dask-sidecar`` service folder.

    Args:
        simcore_service_folder: folder containing the ``dask-sidecar`` folder.

    Raises:
        subprocess.CalledProcessError: when ``make`` exits with a non-zero
            return code (``check=True``). stdout/stderr are captured, not
            printed, and are available on the raised error.
    """
    dask_sidecar_root_folder = simcore_service_folder / "dask-sidecar"
    subprocess.run(
        ["make", "certificates"],  # noqa: S603, S607
        cwd=dask_sidecar_root_folder,
        check=True,
        capture_output=True,
    )


@pytest.fixture(scope="module")
def docker_stack(
osparc_simcore_services_dir: Path,
docker_swarm: None,
docker_client: docker.client.DockerClient,
core_docker_compose_file: Path,
Expand Down Expand Up @@ -276,7 +281,7 @@ def docker_stack(
# NOTE: if the migration service was already running prior to this call it must
# be force updated so that it does its job. else it remains and tests will fail
_force_remove_migration_service(docker_client)

_make_dask_sidecar_certificates(osparc_simcore_services_dir)
# make up-version
stacks_deployed: dict[str, dict] = {}
for key, stack_name, compose_file in stacks:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,35 +2,104 @@
# pylint:disable=unused-argument
# pylint:disable=redefined-outer-name

from typing import Iterator
from collections.abc import Iterator
from dataclasses import dataclass
from pathlib import Path

import distributed
import pytest
from distributed import Client
from models_library.clusters import InternalClusterAuthentication, TLSAuthentication
from pydantic import AnyUrl

from .helpers.utils_docker import get_service_published_port
from .helpers.utils_host import get_localhost_ip


@pytest.fixture(scope="function")
async def dask_scheduler_service(simcore_services_ready, monkeypatch) -> str:
@pytest.fixture
async def dask_scheduler_service(
simcore_services_ready: None, monkeypatch: pytest.MonkeyPatch
) -> str:
# the dask scheduler has a UI for the dashboard and a secondary port for the API
# simcore_services fixture already ensures the dask-scheduler is up and running
dask_scheduler_api_port = get_service_published_port(
"dask-scheduler", target_ports=[8786]
)
# override the port
monkeypatch.setenv("DASK_SCHEDULER_PORT", f"{dask_scheduler_api_port}")
return AnyUrl.build(scheme="tcp", host="127.0.0.1", port=dask_scheduler_api_port)
return AnyUrl.build(
scheme="tls", host=get_localhost_ip(), port=dask_scheduler_api_port
)


@pytest.fixture
def dask_sidecar_dir(osparc_simcore_services_dir: Path) -> Path:
    """Return the ``dask-sidecar`` folder located in the services directory.

    Fails with an ``AssertionError`` when that folder does not exist.
    """
    path = osparc_simcore_services_dir / "dask-sidecar"
    assert path.exists()
    return path


@pytest.fixture
def dask_backend_tls_certificates_dir(dask_sidecar_dir: Path) -> Path:
    """Return the ``.dask-certificates`` folder of the dask-sidecar directory.

    Fails with an ``AssertionError`` when that folder does not exist.
    """
    path = dask_sidecar_dir / ".dask-certificates"
    assert path.exists()
    return path


@dataclass(frozen=True, slots=True, kw_only=True)
class _TLSCertificates:
tls_ca_file: Path
tls_cert_file: Path
tls_key_file: Path


@pytest.fixture
def dask_backend_tls_certificates(
    dask_backend_tls_certificates_dir: Path,
) -> _TLSCertificates:
    """Return the TLS file paths found in the certificates folder.

    ``dask-cert.pem`` is used both as CA file and as certificate file, and
    ``dask-key.pem`` as key file. Fails with an ``AssertionError`` when any
    of these files does not exist.
    """
    certs = _TLSCertificates(
        tls_ca_file=dask_backend_tls_certificates_dir / "dask-cert.pem",
        tls_cert_file=dask_backend_tls_certificates_dir / "dask-cert.pem",
        tls_key_file=dask_backend_tls_certificates_dir / "dask-key.pem",
    )
    assert certs.tls_ca_file.exists()
    assert certs.tls_cert_file.exists()
    assert certs.tls_key_file.exists()
    return certs


@pytest.fixture
def dask_scheduler_auth(
    dask_backend_tls_certificates: _TLSCertificates,
) -> InternalClusterAuthentication:
    """Return a ``TLSAuthentication`` built from the test TLS certificate paths."""
    return TLSAuthentication(
        tls_ca_file=dask_backend_tls_certificates.tls_ca_file,
        tls_client_cert=dask_backend_tls_certificates.tls_cert_file,
        tls_client_key=dask_backend_tls_certificates.tls_key_file,
    )


@pytest.fixture
def dask_client_security(
    dask_backend_tls_certificates: _TLSCertificates,
) -> distributed.Security:
    """Return a ``distributed.Security`` using the test TLS certificate paths.

    The paths are passed as strings and encryption is required.
    """
    return distributed.Security(
        tls_ca_file=f"{dask_backend_tls_certificates.tls_ca_file}",
        tls_client_cert=f"{dask_backend_tls_certificates.tls_cert_file}",
        tls_client_key=f"{dask_backend_tls_certificates.tls_key_file}",
        require_encryption=True,
    )

@pytest.fixture(scope="function")
def dask_client(dask_scheduler_service: str) -> Iterator[Client]:

client = Client(dask_scheduler_service)
@pytest.fixture
def dask_client(
    dask_scheduler_service: str, dask_client_security: distributed.Security
) -> Iterator[Client]:
    """Yield a ``Client`` connected to the scheduler with the TLS security.

    The client is closed once the fixture is torn down.
    """
    client = Client(dask_scheduler_service, security=dask_client_security)
    yield client
    client.close()


@pytest.fixture(scope="function")
@pytest.fixture
def dask_sidecar_service(dask_client: Client) -> None:
    """Wait (``timeout=30``) until at least one worker is seen by the client."""
    dask_client.wait_for_workers(n_workers=1, timeout=30)
2 changes: 1 addition & 1 deletion scripts/shellcheck.bash
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,4 @@
# - VS extension: https://github.com/timonwong/vscode-shellcheck
#

exec docker run --rm --interactive --volume "$PWD:/mnt:ro" koalaman/shellcheck:v0.7.0 "$@"
exec docker run --rm --interactive --volume "$PWD:/mnt:ro" koalaman/shellcheck:v0.9.0 "$@"
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
PortInt,
VersionTag,
)
from models_library.clusters import InternalClusterAuthentication
from models_library.docker import DockerLabelKey
from pydantic import (
AnyUrl,
Expand Down Expand Up @@ -152,6 +153,10 @@ class DaskMonitoringSettings(BaseCustomSettings):
DASK_MONITORING_URL: AnyUrl = Field(
..., description="the url to the osparc-dask-scheduler"
)
DASK_SCHEDULER_AUTH: InternalClusterAuthentication = Field(
...,
description="defines the authentication of the clusters created via clusters-keeper (can be None or TLS)",
)


class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -601,6 +601,7 @@ async def _start_instances(
key_name=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_KEY_NAME,
security_group_ids=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_SECURITY_GROUP_IDS,
subnet_id=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_SUBNET_ID,
iam_instance_profile="",
),
number_of_instances=instance_num,
max_number_of_instances=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from aws_library.ec2.models import EC2InstanceData, EC2Tags, Resources
from fastapi import FastAPI
from models_library.clusters import InternalClusterAuthentication
from models_library.docker import (
DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY,
DockerLabelKey,
Expand Down Expand Up @@ -36,6 +37,12 @@ def _scheduler_url(app: FastAPI) -> AnyUrl:
return app_settings.AUTOSCALING_DASK.DASK_MONITORING_URL


def _scheduler_auth(app: FastAPI) -> InternalClusterAuthentication:
    """Return ``DASK_SCHEDULER_AUTH`` from the app's ``AUTOSCALING_DASK`` settings.

    ``AUTOSCALING_DASK`` is expected to be set; this is only checked with an
    ``assert``.
    """
    app_settings = get_application_settings(app)
    assert app_settings.AUTOSCALING_DASK  # nosec
    return app_settings.AUTOSCALING_DASK.DASK_SCHEDULER_AUTH


class ComputationalAutoscaling(BaseAutoscaling):
@staticmethod
async def get_monitored_nodes(app: FastAPI) -> list[Node]:
Expand All @@ -58,10 +65,12 @@ def get_new_node_docker_tags(
@staticmethod
async def list_unrunnable_tasks(app: FastAPI) -> list[DaskTask]:
try:
unrunnable_tasks = await dask.list_unrunnable_tasks(_scheduler_url(app))
unrunnable_tasks = await dask.list_unrunnable_tasks(
_scheduler_url(app), _scheduler_auth(app)
)
# NOTE: any worker "processing" more than 1 task means that the other tasks are queued!
processing_tasks_by_worker = await dask.list_processing_tasks_per_worker(
_scheduler_url(app)
_scheduler_url(app), _scheduler_auth(app)
)
queued_tasks = []
for tasks in processing_tasks_by_worker.values():
Expand Down Expand Up @@ -107,13 +116,13 @@ async def compute_node_used_resources(
) -> Resources:
try:
num_results_in_memory = await dask.get_worker_still_has_results_in_memory(
_scheduler_url(app), instance.ec2_instance
_scheduler_url(app), _scheduler_auth(app), instance.ec2_instance
)
if num_results_in_memory > 0:
# NOTE: this is a trick to consider the node still useful
return Resources(cpus=0, ram=ByteSize(1024 * 1024 * 1024))
return await dask.get_worker_used_resources(
_scheduler_url(app), instance.ec2_instance
_scheduler_url(app), _scheduler_auth(app), instance.ec2_instance
)
except (DaskWorkerNotFoundError, DaskNoWorkersError):
return Resources.create_as_empty()
Expand All @@ -139,7 +148,7 @@ async def compute_cluster_total_resources(
) -> Resources:
try:
return await dask.compute_cluster_total_resources(
_scheduler_url(app), instances
_scheduler_url(app), _scheduler_auth(app), instances
)
except DaskNoWorkersError:
return Resources.create_as_empty()
Expand All @@ -153,9 +162,9 @@ async def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool

# now check if dask-scheduler is available
return await dask.is_worker_connected(
_scheduler_url(app), instance.ec2_instance
_scheduler_url(app), _scheduler_auth(app), instance.ec2_instance
)

@staticmethod
async def try_retire_nodes(app: FastAPI) -> None:
await dask.try_retire_nodes(_scheduler_url(app))
await dask.try_retire_nodes(_scheduler_url(app), _scheduler_auth(app))
Loading