From f79f6fdd82592a6db393cc5ea6c7c06e22f97934 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Jan 2024 11:56:39 +0100 Subject: [PATCH 01/84] some missing parts --- services/docker-compose.devel.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/docker-compose.devel.yml b/services/docker-compose.devel.yml index 2597d16ae79d..37161e6aa2f9 100644 --- a/services/docker-compose.devel.yml +++ b/services/docker-compose.devel.yml @@ -136,6 +136,7 @@ services: environment: SC_BOOT_MODE: debug-ptvsd SIDECAR_LOGLEVEL: DEBUG + LOG_LEVEL: DEBUG ports: - "3000" deploy: @@ -143,6 +144,10 @@ services: dask-scheduler: volumes: *dev-dask-sidecar-volumes + environment: + SC_BOOT_MODE: debug-ptvsd + SIDECAR_LOGLEVEL: DEBUG + LOG_LEVEL: DEBUG ports: - "3000" deploy: From 3748cca73b3c1e14ca1f78d7d182358d16bea052 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Jan 2024 16:23:12 +0100 Subject: [PATCH 02/84] default to tls security --- .../autoscaling/tests/manual/docker-compose-computational.yml | 4 ++-- .../simcore_service_clusters_keeper/data/docker-compose.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/autoscaling/tests/manual/docker-compose-computational.yml b/services/autoscaling/tests/manual/docker-compose-computational.yml index 462e39904764..5061f73b1488 100644 --- a/services/autoscaling/tests/manual/docker-compose-computational.yml +++ b/services/autoscaling/tests/manual/docker-compose-computational.yml @@ -2,7 +2,7 @@ version: "3.8" services: autoscaling: environment: - - DASK_MONITORING_URL=tcp://dask-scheduler:8786 + - DASK_MONITORING_URL=tls://dask-scheduler:8786 dask-sidecar: dns: 8.8.8.8 # needed to access internet image: itisfoundation/dask-sidecar:master-github-latest @@ -16,7 +16,7 @@ services: environment: DASK_LOG_FORMAT_LOCAL_DEV_ENABLED: 1 DASK_NPROCS: 1 - DASK_SCHEDULER_URL: ${DASK_SCHEDULER_URL:-tcp://dask-scheduler:8786} + DASK_SCHEDULER_URL: ${DASK_SCHEDULER_URL:-tls://dask-scheduler:8786} DASK_SIDECAR_NON_USABLE_RAM: 0 DASK_SIDECAR_NUM_NON_USABLE_CPUS: 0 LOG_LEVEL: ${LOG_LEVEL:-INFO} diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index 69e7aaeca6f8..e103b2db11d1 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -29,7 +29,7 @@ services: DASK_LOG_FORMAT_LOCAL_DEV_ENABLED: 1 DASK_NPROCS: 1 DASK_NTHREADS: ${WORKERS_NTHREADS} - DASK_SCHEDULER_URL: ${DASK_SCHEDULER_URL:-tcp://dask-scheduler:8786} + DASK_SCHEDULER_URL: ${DASK_SCHEDULER_URL:-tls://dask-scheduler:8786} DASK_SIDECAR_NON_USABLE_RAM: 0 DASK_SIDECAR_NUM_NON_USABLE_CPUS: 0 LOG_LEVEL: ${LOG_LEVEL:-WARNING} From cf126b9abe10a3d6cb9b97712e354f4ceb904449 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Jan 2024 16:26:15 +0100 Subject: [PATCH 03/84] remove useless parts --- services/dask-sidecar/Makefile | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/services/dask-sidecar/Makefile b/services/dask-sidecar/Makefile index 46e62bf61c00..376e0a19bac7 100644 --- a/services/dask-sidecar/Makefile +++ b/services/dask-sidecar/Makefile @@ -7,18 +7,6 @@ include ../../scripts/common-service.Makefile TEMP_DIR := $(shell mktemp -d -t dask-docker-XXX) -PHONY: build-official-dask -# -# NOTE: At this moment, this does not seem to work https://docs.docker.com/engine/reference/commandline/build/ -# export DOCKER_BUILDKIT=0; docker build --tag local/dask:master https://github.com/dask/dask-docker.git#:base -# -build-official-dask: # builds official dask container from master branch repo - git clone --depth 1 https://github.com/dask/dask-docker.git ${TEMP_DIR} && \ - docker build --tag local/dask:master ${TEMP_DIR}/base && \ - rm -rf ${TEMP_DIR} - - - .PHONY: settings-schema.json settings-schema.json: ## [container] dumps json-shcema of this service settings # Dumping settings schema of ${DOCKER_REGISTRY}/${APP_NAME}:${DOCKER_IMAGE_TAG} From 2fe234a2e9aa0ebb53fb8abdb81795a207b1dc5b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Jan 2024 18:42:20 +0100 Subject: [PATCH 04/84] recipes to create certificates --- services/dask-sidecar/Makefile | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/services/dask-sidecar/Makefile b/services/dask-sidecar/Makefile index 376e0a19bac7..07c06669cd8d 100644 --- a/services/dask-sidecar/Makefile +++ b/services/dask-sidecar/Makefile @@ -17,3 +17,30 @@ settings-schema.json: ## [container] dumps json-shcema of this service settings | sed --expression='1,/{/ {/{/!d}' \ > $@ # Dumped '$(CURDIR)/$@' + +.dask-certificates: + # create new certificates + mkdir --parents $@ + # Set variables for the key and certificate paths + # Run openssl without prompts using the -subj argument to pass subject information + key_path="$@/dask-key.pem" && \ + cert_path="$@/dask-cert.pem" && \ + subj="/C=CH/ST=ZH/L=ZH/O=ITIS/OU=OSPARC/CN=osparc.io" && \ + openssl req -x509 -newkey rsa:4096 -nodes -keyout "$$key_path" -out "$$cert_path" -days 365 -subj "$$subj" + + +.PHONY: certificates info-certificates clean-certificates + +certificates: .dask-certificates ## creates a self-signed certificate for use with dask communication + # validating certificates + @openssl verify -CAfile $ Date: Tue, 23 Jan 2024 18:42:50 +0100 Subject: [PATCH 05/84] added gitignore --- services/dask-sidecar/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 services/dask-sidecar/.gitignore diff --git a/services/dask-sidecar/.gitignore b/services/dask-sidecar/.gitignore new file mode 100644 index 000000000000..37979bc37288 --- /dev/null +++ b/services/dask-sidecar/.gitignore @@ -0,0 +1 @@ +.dask-certificates From 2e65c13c2981dc3cb68d3e4e334ece3e120aace0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Tue, 23 Jan 2024 19:30:25 +0100 Subject: [PATCH 06/84] pass secrets into the dask backend --- services/dask-sidecar/docker/boot.sh | 51 ++++++++++++++++++++-------- services/docker-compose.yml | 19 +++++++++++ 2 files changed, 55 insertions(+), 15 deletions(-) diff --git a/services/dask-sidecar/docker/boot.sh b/services/dask-sidecar/docker/boot.sh index a6499a5d938d..1197871ee5c1 100755 --- a/services/dask-sidecar/docker/boot.sh +++ b/services/dask-sidecar/docker/boot.sh @@ -34,35 +34,56 @@ fi # RUNNING application ---------------------------------------- # # - If DASK_START_AS_SCHEDULER is set, then it boots as scheduler otherwise as worker -# - SEE https://docs.dask.org/en/latest/setup/cli.html -# - SEE https://stackoverflow.com/questions/3601515/how-to-check-if-a-variable-is-set-in-bash -# - FIXME: create command prefix: https://unix.stackexchange.com/questions/444946/how-can-we-run-a-command-stored-in-a-variable # +mkdir --parents /home/scu/.config/dask +cat >/home/scu/.config/dask/distributed.yaml <> /home/scu/.config/dask/distributed.yaml - print_info "Starting as dask scheduler:${scheduler_version}..." if [ "${SC_BOOT_MODE}" = "debug-ptvsd" ]; then exec watchmedo auto-restart \ - --recursive \ - --pattern="*.py;*/src/*" \ - --ignore-patterns="*test*;pytest_simcore/*;setup.py;*ignore*" \ - --ignore-directories -- \ - dask scheduler \ - --preload simcore_service_dask_sidecar.scheduler + --recursive \ + --pattern="*.py;*/src/*" \ + --ignore-patterns="*test*;pytest_simcore/*;setup.py;*ignore*" \ + --ignore-directories -- \ + dask scheduler \ + --preload simcore_service_dask_sidecar.scheduler else exec dask scheduler \ - --preload simcore_service_dask_sidecar.scheduler + --preload simcore_service_dask_sidecar.scheduler fi else DASK_WORKER_VERSION=$(dask worker --version) - DASK_SCHEDULER_URL=${DASK_SCHEDULER_URL:="tcp://${DASK_SCHEDULER_HOST}:8786"} + DASK_SCHEDULER_URL=${DASK_SCHEDULER_URL:="tls://${DASK_SCHEDULER_HOST}:8786"} # # DASK RESOURCES DEFINITION diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 33981bc1aa58..56c7ddb6f21f 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -882,6 +882,9 @@ services: - computational_shared_data:${SIDECAR_COMP_SERVICES_SHARED_FOLDER:-/home/scu/computational_shared_data} - /var/run/docker.sock:/var/run/docker.sock environment: &sidecar-environment + DASK_TLS_CA_FILE: /home/scu/.dask/dask-crt.pem + DASK_TLS_KEY: /home/scu/.dask/dask-key.pem + DASK_TLS_CERT: /home/scu/.dask/dask-crt.pem DASK_SCHEDULER_HOST: ${DASK_SCHEDULER_HOST:-dask-scheduler} DASK_LOG_FORMAT_LOCAL_DEV_ENABLED: ${LOG_FORMAT_LOCAL_DEV_ENABLED} SIDECAR_LOGLEVEL: ${LOG_LEVEL:-WARNING} @@ -889,6 +892,13 @@ services: SIDECAR_COMP_SERVICES_SHARED_FOLDER: ${SIDECAR_COMP_SERVICES_SHARED_FOLDER:-/home/scu/computational_shared_data} networks: - computational_services_subnet + secrets: &sidecar-secrets + - source: dask_tls_key + target: /home/scu/.dask/dask-key.pem + mode: 0444 + - source: dask_tls_cert + target: /home/scu/.dask/dask-crt.pem + mode: 0444 dask-scheduler: image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG:-latest} @@ -900,6 +910,7 @@ services: networks: - computational_services_subnet + secrets: *sidecar-secrets datcore-adapter: image: ${DOCKER_REGISTRY:-itisfoundation}/datcore-adapter:${DOCKER_IMAGE_TAG:-latest} @@ -1158,3 +1169,11 @@ networks: internal: false labels: com.simcore.description: "computational services network" + +secrets: + dask_tls_key: + file: ./dask-sidecar/.dask-certificates/dask-key.pem + name: ${SWARM_STACK_NAME}_dask_tls_key + dask_tls_cert: + file: ./dask-sidecar/.dask-certificates/dask-cert.pem + name: ${SWARM_STACK_NAME}_dask_tls_cert From dd2a5d738d69f33f3c31cf128c8563f57aafd6e2 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 24 Jan 2024 09:29:43 +0100 Subject: [PATCH 07/84] ensure we have certificates --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index c3f229187af9..2d46e154b411 100644 --- a/Makefile +++ b/Makefile @@ -332,6 +332,7 @@ show-endpoints: up-devel: .stack-simcore-development.yml .init-swarm $(CLIENT_WEB_OUTPUT) ## Deploys local development stack, qx-compile+watch and ops stack (pass 'make ops_disabled=1 up-...' to disable) # Start compile+watch front-end container [front-end] @$(MAKE_C) services/static-webserver/client down compile-dev flags=--watch + @$(MAKE_C) services/dask-sidecar certificates # Deploy stack $(SWARM_STACK_NAME) [back-end] @docker stack deploy --with-registry-auth -c $< $(SWARM_STACK_NAME) @$(MAKE) .deploy-ops @@ -341,6 +342,7 @@ up-devel: .stack-simcore-development.yml .init-swarm $(CLIENT_WEB_OUTPUT) ## Dep up-devel-frontend: .stack-simcore-development-frontend.yml .init-swarm ## Every service in production except static-webserver. For front-end development # Start compile+watch front-end container [front-end] @$(MAKE_C) services/static-webserver/client down compile-dev flags=--watch + @$(MAKE_C) services/dask-sidecar certificates # Deploy stack $(SWARM_STACK_NAME) [back-end] @docker stack deploy --with-registry-auth -c $< $(SWARM_STACK_NAME) @$(MAKE) .deploy-ops @@ -350,6 +352,7 @@ up-devel-frontend: .stack-simcore-development-frontend.yml .init-swarm ## Every up-prod: .stack-simcore-production.yml .init-swarm ## Deploys local production stack and ops stack (pass 'make ops_disabled=1 ops_ci=1 up-...' to disable or target= to deploy a single service) ifeq ($(target),) + @$(MAKE_C) services/dask-sidecar certificates # Deploy stack $(SWARM_STACK_NAME) @docker stack deploy --with-registry-auth -c $< $(SWARM_STACK_NAME) @$(MAKE) .deploy-ops @@ -360,6 +363,7 @@ endif @$(_show_endpoints) up-version: .stack-simcore-version.yml .init-swarm ## Deploys versioned stack '$(DOCKER_REGISTRY)/{service}:$(DOCKER_IMAGE_TAG)' and ops stack (pass 'make ops_disabled=1 up-...' to disable) + @$(MAKE_C) services/dask-sidecar certificates # Deploy stack $(SWARM_STACK_NAME) @docker stack deploy --with-registry-auth -c $< $(SWARM_STACK_NAME) @$(MAKE) .deploy-ops From 75ffa3dceb5e258802d5bac441b5a00def550256 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 24 Jan 2024 10:35:06 +0100 Subject: [PATCH 08/84] use yaml anchors --- services/docker-compose.yml | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 56c7ddb6f21f..5dff0cc80003 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -1,4 +1,12 @@ version: "3.8" +x-dask-tls-secrets: &dask_tls_secrets + - source: dask_tls_key + target: /home/scu/.dask/dask-key.pem + mode: 0444 + - source: dask_tls_cert + target: /home/scu/.dask/dask-crt.pem + mode: 0444 + services: api-server: image: ${DOCKER_REGISTRY:-itisfoundation}/api-server:${DOCKER_IMAGE_TAG:-latest} @@ -316,6 +324,8 @@ services: - default - interactive_services_subnet - computational_services_subnet + secrets: + - *dask_tls_secrets invitations: image: ${DOCKER_REGISTRY:-itisfoundation}/invitations:${DOCKER_IMAGE_TAG:-latest} @@ -892,13 +902,8 @@ services: SIDECAR_COMP_SERVICES_SHARED_FOLDER: ${SIDECAR_COMP_SERVICES_SHARED_FOLDER:-/home/scu/computational_shared_data} networks: - computational_services_subnet - secrets: &sidecar-secrets - - source: dask_tls_key - target: /home/scu/.dask/dask-key.pem - mode: 0444 - - source: dask_tls_cert - target: /home/scu/.dask/dask-crt.pem - mode: 0444 + secrets: + - *dask_tls_secrets dask-scheduler: image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG:-latest} @@ -910,7 +915,8 @@ services: networks: - computational_services_subnet - secrets: *sidecar-secrets + secrets: + - *dask_tls_secrets datcore-adapter: image: ${DOCKER_REGISTRY:-itisfoundation}/datcore-adapter:${DOCKER_IMAGE_TAG:-latest} From abe9fad0a44ef8393cf113f06b861701c1f49161 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 24 Jan 2024 10:35:12 +0100 Subject: [PATCH 09/84] cleanup --- services/docker-compose.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 5dff0cc80003..7d553ad55c36 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -252,10 +252,6 @@ services: - DIRECTOR_HOST=${DIRECTOR_HOST} - DIRECTOR_PORT=${DIRECTOR_PORT} - - DIRECTOR_SELF_SIGNED_SSL_FILENAME=${DIRECTOR_SELF_SIGNED_SSL_FILENAME} - - DIRECTOR_SELF_SIGNED_SSL_SECRET_ID=${DIRECTOR_SELF_SIGNED_SSL_SECRET_ID} - - DIRECTOR_SELF_SIGNED_SSL_SECRET_NAME=${DIRECTOR_SELF_SIGNED_SSL_SECRET_NAME} - - DIRECTOR_SERVICES_CUSTOM_CONSTRAINTS=${DIRECTOR_SERVICES_CUSTOM_CONSTRAINTS} - DIRECTOR_V2_GENERIC_RESOURCE_PLACEMENT_CONSTRAINTS_SUBSTITUTIONS=${DIRECTOR_V2_GENERIC_RESOURCE_PLACEMENT_CONSTRAINTS_SUBSTITUTIONS} - DIRECTOR_V2_DEV_FEATURES_ENABLED=${DIRECTOR_V2_DEV_FEATURES_ENABLED} From 2ba173cce9101f4ae2f07bc1714267ae9c3c02ba Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 24 Jan 2024 11:02:35 +0100 Subject: [PATCH 10/84] clusters-keeper also talks with dask --- services/docker-compose.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 7d553ad55c36..bddf3646f8eb 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -186,6 +186,8 @@ services: - WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS=${WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS} - WORKERS_EC2_INSTANCES_SUBNET_ID=${WORKERS_EC2_INSTANCES_SUBNET_ID} - WORKERS_EC2_INSTANCES_CUSTOM_TAGS=${WORKERS_EC2_INSTANCES_CUSTOM_TAGS} + secrets: + - *dask_tls_secrets director: image: ${DOCKER_REGISTRY:-itisfoundation}/director:${DOCKER_IMAGE_TAG:-latest} From 470e0840d261f90de204abbaa3bba8bfcd1e6142 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 24 Jan 2024 11:03:04 +0100 Subject: [PATCH 11/84] default scheduler is now tls --- .env-devel | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env-devel b/.env-devel index 2bbf74c8eb86..9ac530c8230e 100644 --- a/.env-devel +++ b/.env-devel @@ -49,7 +49,7 @@ DIRECTOR_REGISTRY_CACHING_TTL=900 DIRECTOR_REGISTRY_CACHING=True DIRECTOR_GENERIC_RESOURCE_PLACEMENT_CONSTRAINTS_SUBSTITUTIONS='{}' -COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=tcp://dask-scheduler:8786 +COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=tls://dask-scheduler:8786 COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE=S3 COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE=PRESIGNED COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE=PRESIGNED From db4c87662d79ddeecdcc250a48aa9cf3486773c1 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 24 Jan 2024 11:59:41 +0100 Subject: [PATCH 12/84] use tls certificate for default --- .../src/models_library/clusters.py | 30 ++++++++++++++++++- .../core/settings.py | 6 ++-- .../utils/dask_client_utils.py | 20 ++++++++++--- services/docker-compose.yml | 17 ++++++----- 4 files changed, 57 insertions(+), 16 deletions(-) diff --git a/packages/models-library/src/models_library/clusters.py b/packages/models-library/src/models_library/clusters.py index 87b266b06d73..fd0e86d259a9 100644 --- a/packages/models-library/src/models_library/clusters.py +++ b/packages/models-library/src/models_library/clusters.py @@ -1,4 +1,5 @@ from enum import auto +from pathlib import Path from typing import Any, ClassVar, Final, Literal, TypeAlias from pydantic import ( @@ -95,7 +96,34 @@ class NoAuthentication(BaseAuthentication): type: Literal["none"] = "none" -InternalClusterAuthentication: TypeAlias = NoAuthentication +class TLSAuthentication(BaseAuthentication): + type: Literal["tls"] = "tls" + tls_ca_file: Path + tls_client_cert: Path + tls_client_key: Path + + class Config(BaseAuthentication.Config): + schema_extra: ClassVar[dict[str, Any]] = { + "examples": [ + { + "type": "tls", + "tls_ca_file": "/path/to/ca_file", + "tls_client_cert": "/path/to/cert_file", + "tls_client_key": "/path/to/key_file", + }, + ] + } + + @validator("tls_ca_file", "tls_client_cert", "tls_client_key") + @classmethod + def _file_exists(cls, v: Path) -> Path: + if not v.exists(): + msg = f"{v} is missing!" + raise ValueError(msg) + return v + + +InternalClusterAuthentication: TypeAlias = NoAuthentication | TLSAuthentication ExternalClusterAuthentication: TypeAlias = ( SimpleAuthentication | KerberosAuthentication | JupyterHubTokenAuthentication ) diff --git a/services/director-v2/src/simcore_service_director_v2/core/settings.py b/services/director-v2/src/simcore_service_director_v2/core/settings.py index 18b44284ccd7..95e9a835ca29 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/settings.py +++ b/services/director-v2/src/simcore_service_director_v2/core/settings.py @@ -18,7 +18,7 @@ ClusterAuthentication, NoAuthentication, ) -from pydantic import AnyHttpUrl, AnyUrl, Field, NonNegativeInt, parse_obj_as, validator +from pydantic import AnyHttpUrl, AnyUrl, Field, NonNegativeInt, validator from settings_library.base import BaseCustomSettings from settings_library.catalog import CatalogSettings from settings_library.docker_registry import RegistrySettings @@ -69,14 +69,14 @@ class ComputationalBackendSettings(BaseCustomSettings): default=True, ) COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL: AnyUrl = Field( - parse_obj_as(AnyUrl, "tcp://dask-scheduler:8786"), + ..., description="This is the cluster that will be used by default" " when submitting computational services (typically " "tcp://dask-scheduler:8786 for the internal cluster, or " "http(s)/GATEWAY_IP:8000 for a osparc-dask-gateway)", ) COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH: ClusterAuthentication | None = Field( - NoAuthentication(), + ..., description="Empty for the internal cluster, must be one " "of simple/kerberos/jupyterhub for the osparc-dask-gateway", ) diff --git a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py index 2ea8be624fa7..a7c2a9a64443 100644 --- a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py @@ -16,10 +16,11 @@ ) from models_library.clusters import ( ClusterAuthentication, + InternalClusterAuthentication, JupyterHubTokenAuthentication, KerberosAuthentication, - NoAuthentication, SimpleAuthentication, + TLSAuthentication, ) from pydantic import AnyUrl @@ -73,12 +74,23 @@ async def close(self) -> None: await wrap_client_async_routine(self.gateway.close()) -async def _connect_to_dask_scheduler(endpoint: AnyUrl) -> DaskSubSystem: +async def _connect_to_dask_scheduler( + endpoint: AnyUrl, authentication: InternalClusterAuthentication +) -> DaskSubSystem: try: + security = False + if isinstance(authentication, TLSAuthentication): + security = distributed.Security( + tls_ca_file=f"{authentication.tls_ca_file}", + tls_client_cert=f"{authentication.tls_client_cert}", + tls_client_key=f"{authentication.tls_client_key}", + require_encryption=True, + ) client = await distributed.Client( f"{endpoint}", asynchronous=True, name=f"director-v2_{socket.gethostname()}_{os.getpid()}", + security=security, ) return DaskSubSystem( client=client, @@ -155,7 +167,7 @@ async def _connect_with_gateway_and_create_cluster( def _is_dask_scheduler(authentication: ClusterAuthentication) -> bool: - return isinstance(authentication, NoAuthentication) + return isinstance(authentication, InternalClusterAuthentication) async def create_internal_client_based_on_auth( @@ -163,7 +175,7 @@ async def create_internal_client_based_on_auth( ) -> DaskSubSystem: if _is_dask_scheduler(authentication): # if no auth then we go for a standard scheduler connection - return await _connect_to_dask_scheduler(endpoint) + return await _connect_to_dask_scheduler(endpoint, authentication) # we do have some auth, so it is going through a gateway return await _connect_with_gateway_and_create_cluster(endpoint, authentication) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index bddf3646f8eb..b569ee5eaf50 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -6,6 +6,10 @@ x-dask-tls-secrets: &dask_tls_secrets - source: dask_tls_cert target: /home/scu/.dask/dask-crt.pem mode: 0444 +x-dask-tls-environment: &dask_tls_environment + - DASK_TLS_CA_FILE: /home/scu/.dask/dask-crt.pem + - DASK_TLS_KEY: /home/scu/.dask/dask-key.pem + - DASK_TLS_CERT: /home/scu/.dask/dask-crt.pem services: api-server: @@ -186,8 +190,7 @@ services: - WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS=${WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS} - WORKERS_EC2_INSTANCES_SUBNET_ID=${WORKERS_EC2_INSTANCES_SUBNET_ID} - WORKERS_EC2_INSTANCES_CUSTOM_TAGS=${WORKERS_EC2_INSTANCES_CUSTOM_TAGS} - secrets: - - *dask_tls_secrets + secrets: *dask_tls_secrets director: image: ${DOCKER_REGISTRY:-itisfoundation}/director:${DOCKER_IMAGE_TAG:-latest} @@ -249,6 +252,7 @@ services: - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE=${COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE} - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=${COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL} + - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH={"type":"tls","tls_ca_file":"/home/scu/.dask/dask-crt.pem","tls_client_cert":"/home/scu/.dask/dask-crt.pem","tls_client_key":"/home/scu/.dask/dask-key.pem"} - COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE=${COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE} - COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE=${COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE} @@ -322,8 +326,7 @@ services: - default - interactive_services_subnet - computational_services_subnet - secrets: - - *dask_tls_secrets + secrets: *dask_tls_secrets invitations: image: ${DOCKER_REGISTRY:-itisfoundation}/invitations:${DOCKER_IMAGE_TAG:-latest} @@ -900,8 +903,7 @@ services: SIDECAR_COMP_SERVICES_SHARED_FOLDER: ${SIDECAR_COMP_SERVICES_SHARED_FOLDER:-/home/scu/computational_shared_data} networks: - computational_services_subnet - secrets: - - *dask_tls_secrets + secrets: *dask_tls_secrets dask-scheduler: image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG:-latest} @@ -913,8 +915,7 @@ services: networks: - computational_services_subnet - secrets: - - *dask_tls_secrets + secrets: *dask_tls_secrets datcore-adapter: image: ${DOCKER_REGISTRY:-itisfoundation}/datcore-adapter:${DOCKER_IMAGE_TAG:-latest} From 9c2ea4783deba31ce68fd9e192b711f61159575c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Wed, 24 Jan 2024 12:59:56 +0100 Subject: [PATCH 13/84] move to .env --- .env-devel | 1 + services/docker-compose.yml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.env-devel b/.env-devel index 9ac530c8230e..ec897dc8bd6b 100644 --- a/.env-devel +++ b/.env-devel @@ -50,6 +50,7 @@ DIRECTOR_REGISTRY_CACHING=True DIRECTOR_GENERIC_RESOURCE_PLACEMENT_CONSTRAINTS_SUBSTITUTIONS='{}' COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=tls://dask-scheduler:8786 +COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH='{"type":"tls","tls_ca_file":"/home/scu/.dask/dask-crt.pem","tls_client_cert":"/home/scu/.dask/dask-crt.pem","tls_client_key":"/home/scu/.dask/dask-key.pem"}' COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE=S3 COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE=PRESIGNED COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE=PRESIGNED diff --git a/services/docker-compose.yml b/services/docker-compose.yml index b569ee5eaf50..dd8b1b44712c 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -252,7 +252,7 @@ services: - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE=${COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE} - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=${COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL} - - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH={"type":"tls","tls_ca_file":"/home/scu/.dask/dask-crt.pem","tls_client_cert":"/home/scu/.dask/dask-crt.pem","tls_client_key":"/home/scu/.dask/dask-key.pem"} + - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH=${COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH} - COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE=${COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE} - COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE=${COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE} From 005c3994dbfdf580b338ca5cc1600f4194382e50 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 09:34:43 +0100 Subject: [PATCH 14/84] envs --- .env-devel | 3 +++ services/docker-compose.yml | 15 ++++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.env-devel b/.env-devel index ec897dc8bd6b..ccf39a5b9ed3 100644 --- a/.env-devel +++ b/.env-devel @@ -44,6 +44,9 @@ CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX="" DASK_SCHEDULER_HOST=dask-scheduler DASK_SCHEDULER_PORT=8786 +DASK_TLS_CA_FILE=/home/scu/.dask/dask-crt.pem +DASK_TLS_KEY=/home/scu/.dask/dask-key.pem +DASK_TLS_CERT=/home/scu/.dask/dask-crt.pem DIRECTOR_REGISTRY_CACHING_TTL=900 DIRECTOR_REGISTRY_CACHING=True diff --git a/services/docker-compose.yml b/services/docker-compose.yml index dd8b1b44712c..0476012ddd66 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -1,15 +1,12 @@ version: "3.8" x-dask-tls-secrets: &dask_tls_secrets - source: dask_tls_key - target: /home/scu/.dask/dask-key.pem + target: ${DASK_TLS_KEY} mode: 0444 - source: dask_tls_cert - target: /home/scu/.dask/dask-crt.pem + target: ${DASK_TLS_CERT} mode: 0444 -x-dask-tls-environment: &dask_tls_environment - - DASK_TLS_CA_FILE: /home/scu/.dask/dask-crt.pem - - DASK_TLS_KEY: /home/scu/.dask/dask-key.pem - - DASK_TLS_CERT: /home/scu/.dask/dask-crt.pem + services: api-server: @@ -893,9 +890,9 @@ services: - computational_shared_data:${SIDECAR_COMP_SERVICES_SHARED_FOLDER:-/home/scu/computational_shared_data} - /var/run/docker.sock:/var/run/docker.sock environment: &sidecar-environment - DASK_TLS_CA_FILE: /home/scu/.dask/dask-crt.pem - DASK_TLS_KEY: /home/scu/.dask/dask-key.pem - DASK_TLS_CERT: /home/scu/.dask/dask-crt.pem + DASK_TLS_CA_FILE: ${DASK_TLS_CA_FILE} + DASK_TLS_KEY: ${DASK_TLS_KEY} + DASK_TLS_CERT: ${DASK_TLS_CERT} DASK_SCHEDULER_HOST: ${DASK_SCHEDULER_HOST:-dask-scheduler} DASK_LOG_FORMAT_LOCAL_DEV_ENABLED: ${LOG_FORMAT_LOCAL_DEV_ENABLED} SIDECAR_LOGLEVEL: ${LOG_LEVEL:-WARNING} From d79417876429d1878b8ee85bf7f8b16bc287c5ff Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 11:30:38 +0100 Subject: [PATCH 15/84] typo --- .../src/simcore_service_director_v2/core/settings.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/settings.py b/services/director-v2/src/simcore_service_director_v2/core/settings.py index 95e9a835ca29..6b9e7bec3600 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/settings.py +++ b/services/director-v2/src/simcore_service_director_v2/core/settings.py @@ -16,6 +16,7 @@ DEFAULT_CLUSTER_ID, Cluster, ClusterAuthentication, + ClusterTypeInModel, NoAuthentication, ) from pydantic import AnyHttpUrl, AnyUrl, Field, NonNegativeInt, validator @@ -32,7 +33,6 @@ ) from settings_library.storage import StorageSettings from settings_library.utils_logging import MixinLoggingSettings -from simcore_postgres_database.models.clusters import ClusterType from simcore_sdk.node_ports_common.settings import ( NODE_PORTS_400_REQUEST_TIMEOUT_ATTEMPTS_DEFAULT_VALUE, ) @@ -75,7 +75,7 @@ class ComputationalBackendSettings(BaseCustomSettings): "tcp://dask-scheduler:8786 for the internal cluster, or " "http(s)/GATEWAY_IP:8000 for a osparc-dask-gateway)", ) - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH: ClusterAuthentication | None = Field( + COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH: ClusterAuthentication = Field( ..., description="Empty for the internal cluster, must be one " "of simple/kerberos/jupyterhub for the osparc-dask-gateway", @@ -101,7 +101,7 @@ def default_cluster(self) -> Cluster: endpoint=self.COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL, authentication=self.COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH, owner=1, # NOTE: currently this is a soft hack (the group of everyone is the group 1) - type=ClusterType.ON_PREMISE, + type=ClusterTypeInModel.ON_PREMISE, ) @validator("COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH", pre=True) From 828deba693d73ca68c92168f6cde5b13be5f95bc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 14:04:15 +0100 Subject: [PATCH 16/84] adds new setting to pass default authentication in clusters-keeper --- .../src/simcore_service_clusters_keeper/core/settings.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py index b70463505b9f..4ea23b435d08 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py @@ -10,6 +10,7 @@ LogLevel, VersionTag, ) +from models_library.clusters import InternalClusterAuthentication from pydantic import Field, NonNegativeInt, PositiveInt, parse_obj_as, validator from settings_library.base import BaseCustomSettings from settings_library.docker_registry import RegistrySettings @@ -256,6 +257,11 @@ class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): description="defines the image tag to use for the computational backend sidecar image (NOTE: it currently defaults to use itisfoundation organisation in Dockerhub)", ) + CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH: InternalClusterAuthentication = Field( + ..., + description="defines the authentication of the clusters created via clusters-keeper (can be None or TLS)", + ) + CLUSTERS_KEEPER_DASK_NTHREADS: NonNegativeInt = Field( ..., description="overrides the default number of threads in the dask-sidecars, setting it to 0 will use the default (see description in dask-sidecar)", From b92bd67fcb719279adc8597a68335d1d9efee676 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 14:04:52 +0100 Subject: [PATCH 17/84] use authentication to connect with scheduler --- .../modules/clusters_management_core.py | 9 ++++-- .../modules/dask.py | 31 ++++++++++++++++--- .../rpc/clusters.py | 6 ++-- .../utils/dask.py | 10 ++++++ 4 files changed, 47 insertions(+), 9 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py index ce02d45166c2..da19781fc744 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters_management_core.py @@ -14,7 +14,7 @@ get_cluster_workers, set_instance_heartbeat, ) -from ..utils.dask import get_scheduler_url +from ..utils.dask import get_scheduler_auth, get_scheduler_url from ..utils.ec2 import HEARTBEAT_TAG_KEY from .dask import is_scheduler_busy, ping_scheduler @@ -78,14 +78,17 @@ async def _find_terminateable_instances( async def check_clusters(app: FastAPI) -> None: + instances = await get_all_clusters(app) connected_intances = [ instance for instance in instances - if await ping_scheduler(get_scheduler_url(instance)) + if await ping_scheduler(get_scheduler_url(instance), get_scheduler_auth(app)) ] for instance in connected_intances: - is_busy = await is_scheduler_busy(get_scheduler_url(instance)) + is_busy = await is_scheduler_busy( + get_scheduler_url(instance), get_scheduler_auth(app) + ) _logger.info( "%s currently %s", f"{instance.id=} for {instance.tags=}", diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/dask.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/dask.py index 6356327a0e84..66b6978c236e 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/dask.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/dask.py @@ -3,6 +3,7 @@ from typing import Any import distributed +from models_library.clusters import InternalClusterAuthentication, TLSAuthentication from pydantic import AnyUrl _logger = logging.getLogger(__name__) @@ -17,9 +18,21 @@ async def _wrap_client_async_routine( return await client_coroutine -async def ping_scheduler(url: AnyUrl) -> bool: +async def ping_scheduler( + url: AnyUrl, authentication: InternalClusterAuthentication +) -> bool: try: - async with distributed.Client(url, asynchronous=True, timeout="5"): + security = False + if isinstance(authentication, TLSAuthentication): + security = distributed.Security( + tls_ca_file=f"{authentication.tls_ca_file}", + tls_client_cert=f"{authentication.tls_client_cert}", + tls_client_key=f"{authentication.tls_client_key}", + require_encryption=True, + ) + async with distributed.Client( + url, asynchronous=True, timeout="5", security=security + ): ... return True except OSError: @@ -31,8 +44,18 @@ async def ping_scheduler(url: AnyUrl) -> bool: return False -async def is_scheduler_busy(url: AnyUrl) -> bool: - async with distributed.Client(url, asynchronous=True) as client: +async def is_scheduler_busy( + url: AnyUrl, authentication: InternalClusterAuthentication +) -> bool: + security = False + if isinstance(authentication, TLSAuthentication): + security = distributed.Security( + tls_ca_file=f"{authentication.tls_ca_file}", + tls_client_cert=f"{authentication.tls_client_cert}", + tls_client_key=f"{authentication.tls_client_key}", + require_encryption=True, + ) + async with distributed.Client(url, asynchronous=True, security=security) as client: datasets_on_scheduler = await _wrap_client_async_routine(client.list_datasets()) _logger.info("cluster currently has %s datasets", len(datasets_on_scheduler)) num_processing_tasks = 0 diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/clusters.py index ce6ab0b54c17..dcd08c9e3a6b 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/clusters.py @@ -10,7 +10,7 @@ from ..modules.dask import ping_scheduler from ..modules.redis import get_redis_client from ..utils.clusters import create_cluster_from_ec2_instance -from ..utils.dask import get_scheduler_url +from ..utils.dask import get_scheduler_auth, get_scheduler_url router = RPCRouter() @@ -50,6 +50,8 @@ async def get_or_create_cluster( wallet_id, dask_scheduler_ready=bool( ec2_instance.state == "running" - and await ping_scheduler(url=get_scheduler_url(ec2_instance)) + and await ping_scheduler( + get_scheduler_url(ec2_instance), get_scheduler_auth(app) + ) ), ) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/dask.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/dask.py index ac5bb9400de5..363ffc973795 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/dask.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/dask.py @@ -1,7 +1,17 @@ from aws_library.ec2.models import EC2InstanceData +from fastapi import FastAPI +from models_library.clusters import InternalClusterAuthentication from pydantic import AnyUrl, parse_obj_as +from ..core.settings import get_application_settings + def get_scheduler_url(ec2_instance: EC2InstanceData) -> AnyUrl: url: AnyUrl = parse_obj_as(AnyUrl, f"tcp://{ec2_instance.aws_public_ip}:8786") return url + + +def get_scheduler_auth(app: FastAPI) -> InternalClusterAuthentication: + return get_application_settings( + app + ).CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH From 1c135f735b2494639d68868836960a9e62a433ff Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 14:19:32 +0100 Subject: [PATCH 18/84] fix startup of stack, ensure we have certificates --- .../src/pytest_simcore/docker_swarm.py | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/docker_swarm.py b/packages/pytest-simcore/src/pytest_simcore/docker_swarm.py index e7a7fbe7c543..daba02f15d82 100644 --- a/packages/pytest-simcore/src/pytest_simcore/docker_swarm.py +++ b/packages/pytest-simcore/src/pytest_simcore/docker_swarm.py @@ -232,19 +232,24 @@ def _deploy_stack(compose_file: Path, stack_name: str) -> None: except subprocess.CalledProcessError as err: if b"update out of sequence" in err.stderr: raise TryAgain from err - print( - "docker_stack failed", - f"{' '.join(err.cmd)}", - f"returncode={err.returncode}", - f"stdout={err.stdout}", - f"stderr={err.stderr}", - "\nTIP: frequent failure is due to a corrupt .env file: Delete .env and .env.bak", + pytest.fail( + reason=f"deploying docker_stack failed: {err.cmd=}, {err.returncode=}, {err.stdout=}, {err.stderr=}\nTIP: frequent failure is due to a corrupt .env file: Delete .env and .env.bak" ) - raise + + +def _make_dask_sidecar_certificates(simcore_service_folder: Path) -> None: + dask_sidecar_root_folder = simcore_service_folder / "dask-sidecar" + subprocess.run( + ["make", "certificates"], + cwd=dask_sidecar_root_folder, + check=True, + capture_output=True, + ) # noqa: S603, S607 @pytest.fixture(scope="module") def docker_stack( + osparc_simcore_services_dir: Path, docker_swarm: None, docker_client: docker.client.DockerClient, core_docker_compose_file: Path, @@ -276,7 +281,7 @@ def docker_stack( # NOTE: if the migration service was already running prior to this call it must # be force updated so that it does its job. else it remains and tests will fail _force_remove_migration_service(docker_client) - + _make_dask_sidecar_certificates(osparc_simcore_services_dir) # make up-version stacks_deployed: dict[str, dict] = {} for key, stack_name, compose_file in stacks: From 5071eadc9f1464362e4ea9700abd2b0b68ce41f6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 14:19:52 +0100 Subject: [PATCH 19/84] same --- packages/pytest-simcore/src/pytest_simcore/docker_swarm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/docker_swarm.py b/packages/pytest-simcore/src/pytest_simcore/docker_swarm.py index daba02f15d82..b75538ae1bc0 100644 --- a/packages/pytest-simcore/src/pytest_simcore/docker_swarm.py +++ b/packages/pytest-simcore/src/pytest_simcore/docker_swarm.py @@ -240,11 +240,11 @@ def _deploy_stack(compose_file: Path, stack_name: str) -> None: def _make_dask_sidecar_certificates(simcore_service_folder: Path) -> None: dask_sidecar_root_folder = simcore_service_folder / "dask-sidecar" subprocess.run( - ["make", "certificates"], + ["make", "certificates"], # noqa: S603, S607 cwd=dask_sidecar_root_folder, check=True, capture_output=True, - ) # noqa: S603, S607 + ) @pytest.fixture(scope="module") From 0fe9316d69f6b1d891be398d3f966b942af4f08e Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 14:22:10 +0100 Subject: [PATCH 20/84] ensure we have the necessary settings set --- services/clusters-keeper/tests/unit/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/clusters-keeper/tests/unit/conftest.py b/services/clusters-keeper/tests/unit/conftest.py index 8282bc203c4e..cdb579b41354 100644 --- a/services/clusters-keeper/tests/unit/conftest.py +++ b/services/clusters-keeper/tests/unit/conftest.py @@ -102,6 +102,7 @@ def app_environment( "CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES": "{}", "CLUSTERS_KEEPER_EC2_INSTANCES_PREFIX": faker.pystr(), "CLUSTERS_KEEPER_DASK_NTHREADS": f"{faker.pyint(min_value=0)}", + "CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH": "{}", "PRIMARY_EC2_INSTANCES_KEY_NAME": faker.pystr(), "PRIMARY_EC2_INSTANCES_SECURITY_GROUP_IDS": json.dumps( faker.pylist(allowed_types=(str,)) From 41d98d57862461b6b23ad416247c4eb97ff5fdb8 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 14:27:05 +0100 Subject: [PATCH 21/84] fix no security --- .../src/simcore_service_clusters_keeper/modules/dask.py | 4 ++-- .../simcore_service_director_v2/utils/dask_client_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/dask.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/dask.py index 66b6978c236e..16493abe5da3 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/dask.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/dask.py @@ -22,7 +22,7 @@ async def ping_scheduler( url: AnyUrl, authentication: InternalClusterAuthentication ) -> bool: try: - security = False + security = distributed.Security() if isinstance(authentication, TLSAuthentication): security = distributed.Security( tls_ca_file=f"{authentication.tls_ca_file}", @@ -47,7 +47,7 @@ async def ping_scheduler( async def is_scheduler_busy( url: AnyUrl, authentication: InternalClusterAuthentication ) -> bool: - security = False + security = distributed.Security() if isinstance(authentication, TLSAuthentication): security = distributed.Security( tls_ca_file=f"{authentication.tls_ca_file}", diff --git a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py index a7c2a9a64443..670a312ae886 100644 --- a/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py +++ b/services/director-v2/src/simcore_service_director_v2/utils/dask_client_utils.py @@ -78,7 +78,7 @@ async def _connect_to_dask_scheduler( endpoint: AnyUrl, authentication: InternalClusterAuthentication ) -> DaskSubSystem: try: - security = False + security = distributed.Security() if isinstance(authentication, TLSAuthentication): security = distributed.Security( tls_ca_file=f"{authentication.tls_ca_file}", From 164054acc53c7a9523939d1e8c27716f9b0cdc64 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 14:27:20 +0100 Subject: [PATCH 22/84] fix syntax --- .../clusters-keeper/tests/unit/test_modules_dask.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/services/clusters-keeper/tests/unit/test_modules_dask.py b/services/clusters-keeper/tests/unit/test_modules_dask.py index ca85c31e920f..7b39a5c95446 100644 --- a/services/clusters-keeper/tests/unit/test_modules_dask.py +++ b/services/clusters-keeper/tests/unit/test_modules_dask.py @@ -6,6 +6,7 @@ import distributed from distributed import SpecCluster from faker import Faker +from models_library.clusters import NoAuthentication from pydantic import AnyUrl, parse_obj_as from simcore_service_clusters_keeper.modules.dask import ( is_scheduler_busy, @@ -20,7 +21,8 @@ async def test_ping_scheduler_non_existing_scheduler(faker: Faker): assert ( await ping_scheduler( - url=parse_obj_as(AnyUrl, f"tcp://{faker.ipv4()}:{faker.port_number()}") + parse_obj_as(AnyUrl, f"tcp://{faker.ipv4()}:{faker.port_number()}"), + NoAuthentication(), ) is False ) @@ -29,7 +31,8 @@ async def test_ping_scheduler_non_existing_scheduler(faker: Faker): async def test_ping_scheduler(dask_spec_local_cluster: SpecCluster): assert ( await ping_scheduler( - parse_obj_as(AnyUrl, dask_spec_local_cluster.scheduler_address) + parse_obj_as(AnyUrl, dask_spec_local_cluster.scheduler_address), + NoAuthentication(), ) is True ) @@ -42,7 +45,7 @@ async def test_ping_scheduler(dask_spec_local_cluster: SpecCluster): ) async def _assert_scheduler_is_busy(url: AnyUrl, *, busy: bool) -> None: print(f"--> waiting for osparc-dask-scheduler to become {busy=}") - assert await is_scheduler_busy(url=url) is busy + assert await is_scheduler_busy(url, NoAuthentication()) is busy print(f"scheduler is now {busy=}") @@ -52,7 +55,7 @@ async def test_is_scheduler_busy( ): # nothing runs right now scheduler_address = parse_obj_as(AnyUrl, dask_spec_local_cluster.scheduler_address) - assert await is_scheduler_busy(url=scheduler_address) is False + assert await is_scheduler_busy(scheduler_address, NoAuthentication()) is False _SLEEP_TIME = 5 def _some_long_running_fct(sleep_time: int) -> str: From b07dda05f947252e180d1d20b2550908c91e34bd Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 14:46:35 +0100 Subject: [PATCH 23/84] all tests go through --- .../tests/unit/test_modules_dask.py | 21 ++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/services/clusters-keeper/tests/unit/test_modules_dask.py b/services/clusters-keeper/tests/unit/test_modules_dask.py index 7b39a5c95446..db1833ffd91f 100644 --- a/services/clusters-keeper/tests/unit/test_modules_dask.py +++ b/services/clusters-keeper/tests/unit/test_modules_dask.py @@ -4,9 +4,14 @@ import time import distributed +import pytest from distributed import SpecCluster from faker import Faker -from models_library.clusters import NoAuthentication +from models_library.clusters import ( + InternalClusterAuthentication, + NoAuthentication, + TLSAuthentication, +) from pydantic import AnyUrl, parse_obj_as from simcore_service_clusters_keeper.modules.dask import ( is_scheduler_busy, @@ -17,12 +22,22 @@ from tenacity.stop import stop_after_delay from tenacity.wait import wait_fixed +_authentication_types = [ + NoAuthentication(), + TLSAuthentication.construct(**TLSAuthentication.Config.schema_extra["examples"][0]), +] + -async def test_ping_scheduler_non_existing_scheduler(faker: Faker): +@pytest.mark.parametrize( + "authentication", _authentication_types, ids=lambda p: f"authentication-{p.type}" +) +async def test_ping_scheduler_non_existing_scheduler( + faker: Faker, authentication: InternalClusterAuthentication +): assert ( await ping_scheduler( parse_obj_as(AnyUrl, f"tcp://{faker.ipv4()}:{faker.port_number()}"), - NoAuthentication(), + authentication, ) is False ) From 25996621b87dc8d2ac06613327e3f6fb9982ee62 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 15:15:16 +0100 Subject: [PATCH 24/84] added a test to detect missing ENVs --- .../utils/clusters.py | 26 ++++++++--- .../clusters-keeper/tests/unit/conftest.py | 9 +++- .../tests/unit/test_utils_clusters.py | 44 +++++++++++++++++++ 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index b700033fc60d..585793adba06 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -30,13 +30,12 @@ def _docker_compose_yml_base64_encoded() -> str: return base64.b64encode(f.read()).decode("utf-8") -def create_startup_script( +def _prepare_environment_variables( app_settings: ApplicationSettings, *, cluster_machines_name_prefix: str, - ec2_boot_specific: EC2InstanceBootSpecific, additional_custom_tags: EC2Tags, -) -> str: +) -> list[str]: assert app_settings.CLUSTERS_KEEPER_EC2_ACCESS # nosec assert app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES # nosec @@ -47,7 +46,7 @@ def _convert_to_env_list(entries: list[Any]) -> str: def _convert_to_env_dict(entries: dict[str, Any]) -> str: return f"'{json.dumps(jsonable_encoder(entries))}'" - environment_variables = [ + return [ f"DOCKER_IMAGE_TAG={app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG}", f"DASK_NTHREADS={app_settings.CLUSTERS_KEEPER_DASK_NTHREADS or ''}", f"CLUSTERS_KEEPER_EC2_ACCESS_KEY_ID={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_ACCESS_KEY_ID}", @@ -61,10 +60,27 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str: f"WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS={_convert_to_env_list(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS)}", f"WORKERS_EC2_INSTANCES_SUBNET_ID={app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_SUBNET_ID}", f"WORKERS_EC2_INSTANCES_TIME_BEFORE_TERMINATION={app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_TIME_BEFORE_TERMINATION}", - f"WORKERS_EC2_INSTANCES_CUSTOM_TAGS={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_CUSTOM_TAGS | additional_custom_tags)}", + f"WORKERS_EC2_INSTANCES_CUSTOM_TAGS={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_CUSTOM_TAGS | additional_custom_tags)}", # type: ignore f"LOG_LEVEL={app_settings.LOG_LEVEL}", ] + +def create_startup_script( + app_settings: ApplicationSettings, + *, + cluster_machines_name_prefix: str, + ec2_boot_specific: EC2InstanceBootSpecific, + additional_custom_tags: EC2Tags, +) -> str: + assert app_settings.CLUSTERS_KEEPER_EC2_ACCESS # nosec + assert app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES # nosec + + environment_variables = _prepare_environment_variables( + app_settings, + cluster_machines_name_prefix=cluster_machines_name_prefix, + additional_custom_tags=additional_custom_tags, + ) + startup_commands = ec2_boot_specific.custom_boot_scripts.copy() startup_commands.extend( [ diff --git a/services/clusters-keeper/tests/unit/conftest.py b/services/clusters-keeper/tests/unit/conftest.py index cdb579b41354..aa0d4a9e0e7a 100644 --- a/services/clusters-keeper/tests/unit/conftest.py +++ b/services/clusters-keeper/tests/unit/conftest.py @@ -53,7 +53,7 @@ @pytest.fixture(scope="session") def project_slug_dir(osparc_simcore_root_dir: Path) -> Path: # fixtures in pytest_simcore.environs - service_folder = osparc_simcore_root_dir / "services" / "clusters_keeper" + service_folder = osparc_simcore_root_dir / "services" / "clusters-keeper" assert service_folder.exists() assert any(service_folder.glob("src/simcore_service_clusters_keeper")) return service_folder @@ -237,6 +237,13 @@ async def async_docker_client() -> AsyncIterator[aiodocker.Docker]: yield docker_client +@pytest.fixture +def clusters_keeper_docker_compose_file(installed_package_dir: Path) -> Path: + docker_compose_path = installed_package_dir / "data" / "docker-compose.yml" + assert docker_compose_path.exists() + return docker_compose_path + + @pytest.fixture def clusters_keeper_docker_compose() -> dict[str, Any]: data = importlib.resources.read_text( diff --git a/services/clusters-keeper/tests/unit/test_utils_clusters.py b/services/clusters-keeper/tests/unit/test_utils_clusters.py index 92a84b153813..d3730ade0c91 100644 --- a/services/clusters-keeper/tests/unit/test_utils_clusters.py +++ b/services/clusters-keeper/tests/unit/test_utils_clusters.py @@ -3,7 +3,9 @@ # pylint: disable=unused-variable import re +import subprocess from collections.abc import Callable +from pathlib import Path from typing import Any import pytest @@ -18,6 +20,7 @@ from pytest_simcore.helpers.utils_envs import EnvVarsDict from simcore_service_clusters_keeper.core.settings import ApplicationSettings from simcore_service_clusters_keeper.utils.clusters import ( + _prepare_environment_variables, create_cluster_from_ec2_instance, create_startup_script, ) @@ -132,6 +135,47 @@ def test_create_startup_script( ) +def test_startup_script_defines_all_envs_for_docker_compose( + disabled_rabbitmq: None, + mocked_ec2_server_envs: EnvVarsDict, + mocked_redis_server: None, + app_settings: ApplicationSettings, + cluster_machines_name_prefix: str, + ec2_boot_specs: EC2InstanceBootSpecific, + clusters_keeper_docker_compose_file: Path, +): + additional_custom_tags = { + AWSTagKey("pytest-tag-key"): AWSTagValue("pytest-tag-value") + } + environment_variables = _prepare_environment_variables( + app_settings, + cluster_machines_name_prefix=cluster_machines_name_prefix, + additional_custom_tags=additional_custom_tags, + ) + assert environment_variables + process = subprocess.run( + [ + "docker", + "compose", + "--dry-run", + f"--file={clusters_keeper_docker_compose_file}", + "up", + ], + capture_output=True, + check=True, + env={ + e.split("=", maxsplit=1)[0]: e.split("=", maxsplit=1)[1] + for e in environment_variables + }, + ) + assert process + assert process.stderr + _ENV_VARIABLE_NOT_SET_ERROR = "variable is not set" + assert _ENV_VARIABLE_NOT_SET_ERROR not in process.stderr.decode() + assert process.stdout + assert process.stdout is None + + @pytest.mark.parametrize( "ec2_state, expected_cluster_state", [ From e3b92864bbcc001b61e244807d11169e9e87806c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 15:15:46 +0100 Subject: [PATCH 25/84] ruff --- services/clusters-keeper/tests/unit/test_utils_clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/clusters-keeper/tests/unit/test_utils_clusters.py b/services/clusters-keeper/tests/unit/test_utils_clusters.py index d3730ade0c91..4bfe6b6ae027 100644 --- a/services/clusters-keeper/tests/unit/test_utils_clusters.py +++ b/services/clusters-keeper/tests/unit/test_utils_clusters.py @@ -154,7 +154,7 @@ def test_startup_script_defines_all_envs_for_docker_compose( ) assert environment_variables process = subprocess.run( - [ + [ # noqa: S603, S607 "docker", "compose", "--dry-run", From 5605764c5558c03bf37880d6083aadd2a0f4a2cf Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 15:15:55 +0100 Subject: [PATCH 26/84] added the secrets --- .../data/docker-compose.yml | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index e103b2db11d1..953812aff417 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -1,4 +1,12 @@ version: "3.8" +x-dask-tls-secrets: &dask_tls_secrets + - source: dask_tls_key + target: ${DASK_TLS_KEY} + mode: 0444 + - source: dask_tls_cert + target: ${DASK_TLS_CERT} + mode: 0444 + services: dask-scheduler: image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG} @@ -7,7 +15,10 @@ services: hostname: "{{.Node.Hostname}}-{{.Service.Name}}-{{.Task.Slot}}" environment: DASK_START_AS_SCHEDULER: 1 - LOG_LEVEL: ${LOG_LEVEL:-WARNING} + DASK_TLS_CA_FILE: ${DASK_TLS_CA_FILE} + DASK_TLS_CERT: ${DASK_TLS_CERT} + DASK_TLS_KEY: ${DASK_TLS_KEY} + LOG_LEVEL: ${LOG_LEVEL} ports: - 8786:8786 # dask-scheduler tcp access - 8787:8787 # dashboard @@ -15,6 +26,7 @@ services: placement: constraints: - "node.role==manager" + secrets: *dask_tls_secrets dask-sidecar: image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG} @@ -28,11 +40,14 @@ services: environment: DASK_LOG_FORMAT_LOCAL_DEV_ENABLED: 1 DASK_NPROCS: 1 - DASK_NTHREADS: ${WORKERS_NTHREADS} - DASK_SCHEDULER_URL: ${DASK_SCHEDULER_URL:-tls://dask-scheduler:8786} + DASK_NTHREADS: ${DASK_NTHREADS} + DASK_SCHEDULER_URL: tls://dask-scheduler:8786 DASK_SIDECAR_NON_USABLE_RAM: 0 DASK_SIDECAR_NUM_NON_USABLE_CPUS: 0 - LOG_LEVEL: ${LOG_LEVEL:-WARNING} + DASK_TLS_CA_FILE: ${DASK_TLS_CA_FILE} + DASK_TLS_CERT: ${DASK_TLS_CERT} + DASK_TLS_KEY: ${DASK_TLS_KEY} + LOG_LEVEL: ${LOG_LEVEL} SIDECAR_COMP_SERVICES_SHARED_FOLDER: ${SIDECAR_COMP_SERVICES_SHARED_FOLDER:-/home/scu/computational_shared_data} SIDECAR_COMP_SERVICES_SHARED_VOLUME_NAME: computational_shared_data deploy: @@ -40,6 +55,7 @@ services: placement: constraints: - "node.role==worker" + secrets: *dask_tls_secrets autoscaling: image: ${DOCKER_REGISTRY:-itisfoundation}/autoscaling:${DOCKER_IMAGE_TAG} @@ -47,14 +63,15 @@ services: init: true hostname: "{{.Node.Hostname}}-{{.Service.Name}}-{{.Task.Slot}}" environment: - DASK_MONITORING_URL: tcp://dask-scheduler:8786 AUTOSCALING_EC2_ACCESS_KEY_ID: ${CLUSTERS_KEEPER_EC2_ACCESS_KEY_ID} AUTOSCALING_EC2_ENDPOINT: ${CLUSTERS_KEEPER_EC2_ENDPOINT} AUTOSCALING_EC2_REGION_NAME: ${CLUSTERS_KEEPER_EC2_REGION_NAME} AUTOSCALING_EC2_SECRET_ACCESS_KEY: ${CLUSTERS_KEEPER_EC2_SECRET_ACCESS_KEY} AUTOSCALING_NODES_MONITORING: null AUTOSCALING_POLL_INTERVAL: 10 + DASK_MONITORING_URL: tls://dask-scheduler:8786 EC2_INSTANCES_ALLOWED_TYPES: ${WORKERS_EC2_INSTANCES_ALLOWED_TYPES} + EC2_INSTANCES_CUSTOM_TAGS: ${WORKERS_EC2_INSTANCES_CUSTOM_TAGS} EC2_INSTANCES_KEY_NAME: ${WORKERS_EC2_INSTANCES_KEY_NAME} EC2_INSTANCES_MACHINES_BUFFER: 0 EC2_INSTANCES_MAX_INSTANCES: ${WORKERS_EC2_INSTANCES_MAX_INSTANCES} @@ -62,7 +79,6 @@ services: EC2_INSTANCES_SECURITY_GROUP_IDS: ${WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS} EC2_INSTANCES_SUBNET_ID: ${WORKERS_EC2_INSTANCES_SUBNET_ID} EC2_INSTANCES_TIME_BEFORE_TERMINATION: ${WORKERS_EC2_INSTANCES_TIME_BEFORE_TERMINATION} - EC2_INSTANCES_CUSTOM_TAGS: ${WORKERS_EC2_INSTANCES_CUSTOM_TAGS} LOG_FORMAT_LOCAL_DEV_ENABLED: 1 LOG_LEVEL: ${LOG_LEVEL:-WARNING} REDIS_HOST: redis @@ -73,6 +89,7 @@ services: placement: constraints: - "node.role==manager" + secrets: *dask_tls_secrets redis: # NOTE: currently autoscaling requires redis to run @@ -96,3 +113,9 @@ volumes: computational_shared_data: name: computational_shared_data redis-data: + +secrets: + dask_tls_key: + file: ./dask-sidecar/.dask-certificates/dask-key.pem + dask_tls_cert: + file: ./dask-sidecar/.dask-certificates/dask-cert.pem From 9e15c9342b89028b18032666aec378bef38f8010 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:09:22 +0100 Subject: [PATCH 27/84] improving tests --- .../src/models_library/clusters.py | 2 + .../data/docker-compose.yml | 11 +++- .../utils/clusters.py | 57 +++++++++++++++---- .../tests/unit/test_utils_clusters.py | 19 +++---- 4 files changed, 65 insertions(+), 24 deletions(-) diff --git a/packages/models-library/src/models_library/clusters.py b/packages/models-library/src/models_library/clusters.py index fd0e86d259a9..90ae1626eb19 100644 --- a/packages/models-library/src/models_library/clusters.py +++ b/packages/models-library/src/models_library/clusters.py @@ -2,6 +2,7 @@ from pathlib import Path from typing import Any, ClassVar, Final, Literal, TypeAlias +from attr import frozen from pydantic import ( AnyUrl, BaseModel, @@ -46,6 +47,7 @@ class BaseAuthentication(BaseModel): type: str class Config: + frozen = True extra = Extra.forbid diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index 953812aff417..a8801672c452 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -1,5 +1,8 @@ version: "3.8" x-dask-tls-secrets: &dask_tls_secrets + - source: dask_tls_ca + target: ${DASK_TLS_KEY} + mode: 0444 - source: dask_tls_key target: ${DASK_TLS_KEY} mode: 0444 @@ -48,7 +51,7 @@ services: DASK_TLS_CERT: ${DASK_TLS_CERT} DASK_TLS_KEY: ${DASK_TLS_KEY} LOG_LEVEL: ${LOG_LEVEL} - SIDECAR_COMP_SERVICES_SHARED_FOLDER: ${SIDECAR_COMP_SERVICES_SHARED_FOLDER:-/home/scu/computational_shared_data} + SIDECAR_COMP_SERVICES_SHARED_FOLDER: /home/scu/computational_shared_data SIDECAR_COMP_SERVICES_SHARED_VOLUME_NAME: computational_shared_data deploy: mode: global @@ -115,7 +118,9 @@ volumes: redis-data: secrets: + dask_tls_ca: + file: .dask-certificates/tls_dask_ca.pem dask_tls_key: - file: ./dask-sidecar/.dask-certificates/dask-key.pem + file: .dask-certificates/tls_dask_cert.pem dask_tls_cert: - file: ./dask-sidecar/.dask-certificates/dask-cert.pem + file: .dask-certificates/tls_dask_key.pem diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index 585793adba06..321fee467be4 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -2,6 +2,7 @@ import datetime import functools import json +from pathlib import Path from typing import Any, Final from aws_library.ec2.models import EC2InstanceBootSpecific, EC2InstanceData, EC2Tags @@ -10,7 +11,7 @@ ClusterState, OnDemandCluster, ) -from models_library.clusters import NoAuthentication +from models_library.clusters import NoAuthentication, TLSAuthentication from models_library.users import UserID from models_library.wallets import WalletID from types_aiobotocore_ec2.literals import InstanceStateNameType @@ -20,14 +21,35 @@ from .dask import get_scheduler_url _DOCKER_COMPOSE_FILE_NAME: Final[str] = "docker-compose.yml" +_HOST_DOCKER_COMPOSE_PATH: Final[Path] = Path(f"/{_DOCKER_COMPOSE_FILE_NAME}") +_HOST_CERTIFICATES_BASE_PATH: Final[Path] = Path("/.dask-sidecar-certificates") +_HOST_TLS_CA_FILE_PATH: Final[Path] = _HOST_CERTIFICATES_BASE_PATH / "tls_dask_ca.pem" +_HOST_TLS_CERT_FILE_PATH: Final[Path] = ( + _HOST_CERTIFICATES_BASE_PATH / "tls_dask_cert.pem" +) +_HOST_TLS_KEY_FILE_PATH: Final[Path] = _HOST_CERTIFICATES_BASE_PATH / "tls_dask_key.pem" + + +def _base_64_encode(file: Path) -> str: + assert file.exists() # nosec + with file.open("rb") as f: + return base64.b64encode(f.read()).decode("utf-8") @functools.lru_cache def _docker_compose_yml_base64_encoded() -> str: file_path = PACKAGE_DATA_FOLDER / _DOCKER_COMPOSE_FILE_NAME - assert file_path.exists() # nosec - with file_path.open("rb") as f: - return base64.b64encode(f.read()).decode("utf-8") + return _base_64_encode(file_path) + + +@functools.lru_cache +def _write_tls_certificates_commands(auth: TLSAuthentication) -> list[str]: + return [ + f"mkdir --parents {_HOST_CERTIFICATES_BASE_PATH}", + f"echo '{_base_64_encode(auth.tls_ca_file)}' > {_HOST_TLS_CA_FILE_PATH}", + f"echo '{_base_64_encode(auth.tls_client_cert)}' > {_HOST_TLS_CERT_FILE_PATH}", + f"echo '{_base_64_encode(auth.tls_client_key)}' > {_HOST_TLS_KEY_FILE_PATH}", + ] def _prepare_environment_variables( @@ -47,21 +69,24 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str: return f"'{json.dumps(jsonable_encoder(entries))}'" return [ - f"DOCKER_IMAGE_TAG={app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG}", - f"DASK_NTHREADS={app_settings.CLUSTERS_KEEPER_DASK_NTHREADS or ''}", f"CLUSTERS_KEEPER_EC2_ACCESS_KEY_ID={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_ACCESS_KEY_ID}", f"CLUSTERS_KEEPER_EC2_ENDPOINT={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_ENDPOINT}", f"CLUSTERS_KEEPER_EC2_REGION_NAME={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_REGION_NAME}", f"CLUSTERS_KEEPER_EC2_SECRET_ACCESS_KEY={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_SECRET_ACCESS_KEY}", + f"DASK_NTHREADS={app_settings.CLUSTERS_KEEPER_DASK_NTHREADS or ''}", + f"DASK_TLS_CA_FILE={_HOST_TLS_CA_FILE_PATH}", + f"DASK_TLS_CERT={_HOST_TLS_CERT_FILE_PATH}", + f"DASK_TLS_KEY={_HOST_TLS_KEY_FILE_PATH}", + f"DOCKER_IMAGE_TAG={app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG}", + f"EC2_INSTANCES_NAME_PREFIX={cluster_machines_name_prefix}", + f"LOG_LEVEL={app_settings.LOG_LEVEL}", f"WORKERS_EC2_INSTANCES_ALLOWED_TYPES={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_ALLOWED_TYPES)}", + f"WORKERS_EC2_INSTANCES_CUSTOM_TAGS={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_CUSTOM_TAGS | additional_custom_tags)}", # type: ignore f"WORKERS_EC2_INSTANCES_KEY_NAME={app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_KEY_NAME}", f"WORKERS_EC2_INSTANCES_MAX_INSTANCES={app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_MAX_INSTANCES}", - f"EC2_INSTANCES_NAME_PREFIX={cluster_machines_name_prefix}", f"WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS={_convert_to_env_list(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS)}", f"WORKERS_EC2_INSTANCES_SUBNET_ID={app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_SUBNET_ID}", f"WORKERS_EC2_INSTANCES_TIME_BEFORE_TERMINATION={app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_TIME_BEFORE_TERMINATION}", - f"WORKERS_EC2_INSTANCES_CUSTOM_TAGS={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_CUSTOM_TAGS | additional_custom_tags)}", # type: ignore - f"LOG_LEVEL={app_settings.LOG_LEVEL}", ] @@ -82,13 +107,23 @@ def create_startup_script( ) startup_commands = ec2_boot_specific.custom_boot_scripts.copy() + + if isinstance( + app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH, + TLSAuthentication, + ): + write_certificates_commands = _write_tls_certificates_commands( + app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH + ) + startup_commands.extend(write_certificates_commands) + startup_commands.extend( [ # NOTE: https://stackoverflow.com/questions/41203492/solving-redis-warnings-on-overcommit-memory-and-transparent-huge-pages-for-ubunt "sysctl vm.overcommit_memory=1", - f"echo '{_docker_compose_yml_base64_encoded()}' | base64 -d > docker-compose.yml", + f"echo '{_docker_compose_yml_base64_encoded()}' | base64 -d > {_HOST_DOCKER_COMPOSE_PATH}", "docker swarm init", - f"{' '.join(environment_variables)} docker stack deploy --with-registry-auth --compose-file=docker-compose.yml dask_stack", + f"{' '.join(environment_variables)} docker stack deploy --with-registry-auth --compose-file={_HOST_DOCKER_COMPOSE_PATH} dask_stack", ] ) return "\n".join(startup_commands) diff --git a/services/clusters-keeper/tests/unit/test_utils_clusters.py b/services/clusters-keeper/tests/unit/test_utils_clusters.py index 4bfe6b6ae027..c49945b61692 100644 --- a/services/clusters-keeper/tests/unit/test_utils_clusters.py +++ b/services/clusters-keeper/tests/unit/test_utils_clusters.py @@ -67,12 +67,12 @@ def test_create_startup_script( for boot_script in ec2_boot_specs.custom_boot_scripts: assert boot_script in startup_script # we have commands to pipe into a docker-compose file - assert " | base64 -d > docker-compose.yml" in startup_script + assert " | base64 -d > /docker-compose.yml" in startup_script # we have commands to init a docker-swarm assert "docker swarm init" in startup_script # we have commands to deploy a stack assert ( - "docker stack deploy --with-registry-auth --compose-file=docker-compose.yml dask_stack" + "docker stack deploy --with-registry-auth --compose-file=/docker-compose.yml dask_stack" in startup_script ) # before that we have commands that setup ENV variables, let's check we have all of them as defined in the docker-compose @@ -87,20 +87,20 @@ def test_create_startup_script( ) startup_script_env_keys_names = [key for key, _ in startup_script_key_value_pairs] # docker-compose expected values + docker_compose_expected_environment: dict[str, str] = {} assert "services" in clusters_keeper_docker_compose - assert "autoscaling" in clusters_keeper_docker_compose["services"] - assert "environment" in clusters_keeper_docker_compose["services"]["autoscaling"] - docker_compose_expected_environment: dict[ - str, str - ] = clusters_keeper_docker_compose["services"]["autoscaling"]["environment"] - assert isinstance(docker_compose_expected_environment, dict) + assert isinstance(clusters_keeper_docker_compose["services"], dict) + for service_details in clusters_keeper_docker_compose["services"].values(): + if "environment" in service_details: + assert isinstance(service_details["environment"], dict) + docker_compose_expected_environment |= service_details["environment"] # check the expected environment variables are set so the docker-compose will be complete (we define enough) expected_env_keys = [ v[2:-1].split(":")[0] for v in docker_compose_expected_environment.values() if isinstance(v, str) and v.startswith("${") - ] + ["DASK_NTHREADS", "DOCKER_IMAGE_TAG"] + ] + ["DOCKER_IMAGE_TAG"] for env_key in expected_env_keys: assert ( env_key in startup_script_env_keys_names @@ -173,7 +173,6 @@ def test_startup_script_defines_all_envs_for_docker_compose( _ENV_VARIABLE_NOT_SET_ERROR = "variable is not set" assert _ENV_VARIABLE_NOT_SET_ERROR not in process.stderr.decode() assert process.stdout - assert process.stdout is None @pytest.mark.parametrize( From ca529f6d50b45b09718bb4acd97cfe484ecf6aea Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:18:37 +0100 Subject: [PATCH 28/84] minor --- packages/models-library/src/models_library/clusters.py | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/models-library/src/models_library/clusters.py b/packages/models-library/src/models_library/clusters.py index 90ae1626eb19..37521e877d53 100644 --- a/packages/models-library/src/models_library/clusters.py +++ b/packages/models-library/src/models_library/clusters.py @@ -2,7 +2,6 @@ from pathlib import Path from typing import Any, ClassVar, Final, Literal, TypeAlias -from attr import frozen from pydantic import ( AnyUrl, BaseModel, From b5405223ab72c05d164858c849fe689fc8cdebfe Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:22:40 +0100 Subject: [PATCH 29/84] mypy --- .../src/simcore_service_clusters_keeper/utils/clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index 321fee467be4..438645aca203 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -81,7 +81,7 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str: f"EC2_INSTANCES_NAME_PREFIX={cluster_machines_name_prefix}", f"LOG_LEVEL={app_settings.LOG_LEVEL}", f"WORKERS_EC2_INSTANCES_ALLOWED_TYPES={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_ALLOWED_TYPES)}", - f"WORKERS_EC2_INSTANCES_CUSTOM_TAGS={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_CUSTOM_TAGS | additional_custom_tags)}", # type: ignore + f"WORKERS_EC2_INSTANCES_CUSTOM_TAGS={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_CUSTOM_TAGS | additional_custom_tags)}", f"WORKERS_EC2_INSTANCES_KEY_NAME={app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_KEY_NAME}", f"WORKERS_EC2_INSTANCES_MAX_INSTANCES={app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_MAX_INSTANCES}", f"WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS={_convert_to_env_list(app_settings.CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES.WORKERS_EC2_INSTANCES_SECURITY_GROUP_IDS)}", From 154c687089dc1d18da17d7c5b7e279a063822efb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:36:01 +0100 Subject: [PATCH 30/84] missing ENVs --- services/director-v2/tests/conftest.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/services/director-v2/tests/conftest.py b/services/director-v2/tests/conftest.py index 16498498047c..3479f905ce87 100644 --- a/services/director-v2/tests/conftest.py +++ b/services/director-v2/tests/conftest.py @@ -16,6 +16,7 @@ import pytest import simcore_service_director_v2 from asgi_lifespan import LifespanManager +from faker import Faker from fastapi import FastAPI from models_library.projects import Node, NodesDict from pytest_mock import MockerFixture @@ -142,7 +143,9 @@ def dynamic_sidecar_docker_image_name() -> str: @pytest.fixture def mock_env( - monkeypatch: pytest.MonkeyPatch, dynamic_sidecar_docker_image_name: str + monkeypatch: pytest.MonkeyPatch, + dynamic_sidecar_docker_image_name: str, + faker: Faker, ) -> EnvVarsDict: """This is the base mock envs used to configure the app. @@ -155,6 +158,8 @@ def mock_env( "SWARM_STACK_NAME": "test_swarm_name", "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED": "false", "COMPUTATIONAL_BACKEND_ENABLED": "false", + "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL": f"{faker.url()}", + "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH": "{}", "DIRECTOR_V2_DYNAMIC_SCHEDULER_ENABLED": "false", "RABBIT_HOST": "mocked_host", "RABBIT_SECURE": "false", From 772d727938f8a74890e8d16756240bb0289365d0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:38:59 +0100 Subject: [PATCH 31/84] missing variables --- services/director-v2/.env-devel | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/director-v2/.env-devel b/services/director-v2/.env-devel index c76d8413a28c..86b19e130713 100644 --- a/services/director-v2/.env-devel +++ b/services/director-v2/.env-devel @@ -11,6 +11,11 @@ SC_BOOT_MODE=debug-ptvsd # Variables typically passed upon start via services/docker-compose.yml files + +DASK_TLS_CA_FILE=./dask-sidecar/.dask-certificates/dask-cert.pem +DASK_TLS_KEY=./dask-sidecar/.dask-certificates/dask-key.pem +DASK_TLS_CERT=./dask-sidecar/.dask-certificates/dask-cert.pem + DYNAMIC_SIDECAR_IMAGE=local/dynamic-sidecar:development DYNAMIC_SIDECAR_PROMETHEUS_SERVICE_LABELS={} From 7086aa07b3099d7e536869d32ad1d8f9feb1229d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 16:39:43 +0100 Subject: [PATCH 32/84] revert --- services/director-v2/.env-devel | 5 ----- 1 file changed, 5 deletions(-) diff --git a/services/director-v2/.env-devel b/services/director-v2/.env-devel index 86b19e130713..c76d8413a28c 100644 --- a/services/director-v2/.env-devel +++ b/services/director-v2/.env-devel @@ -11,11 +11,6 @@ SC_BOOT_MODE=debug-ptvsd # Variables typically passed upon start via services/docker-compose.yml files - -DASK_TLS_CA_FILE=./dask-sidecar/.dask-certificates/dask-cert.pem -DASK_TLS_KEY=./dask-sidecar/.dask-certificates/dask-key.pem -DASK_TLS_CERT=./dask-sidecar/.dask-certificates/dask-cert.pem - DYNAMIC_SIDECAR_IMAGE=local/dynamic-sidecar:development DYNAMIC_SIDECAR_PROMETHEUS_SERVICE_LABELS={} From ab447b8d1e16c949132e837126f3048bd2037749 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:14:02 +0100 Subject: [PATCH 33/84] missing env --- services/docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 0476012ddd66..9ab3913436ea 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -153,6 +153,7 @@ services: - default environment: - CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG=${CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG} + - CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH=${CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH} - CLUSTERS_KEEPER_DASK_NTHREADS=${CLUSTERS_KEEPER_DASK_NTHREADS} - CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION=${CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION} - CLUSTERS_KEEPER_TASK_INTERVAL=${CLUSTERS_KEEPER_TASK_INTERVAL} From 86bd4b26650f8de8e7105c3697daaa8848eb0627 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:15:03 +0100 Subject: [PATCH 34/84] missing ENV --- .env-devel | 1 + 1 file changed, 1 insertion(+) diff --git a/.env-devel b/.env-devel index ccf39a5b9ed3..058921b9daab 100644 --- a/.env-devel +++ b/.env-devel @@ -33,6 +33,7 @@ CATALOG_SERVICES_DEFAULT_RESOURCES='{"CPU": {"limit": 0.1, "reservation": 0.1}, CATALOG_SERVICES_DEFAULT_SPECIFICATIONS='{}' CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG=master-github-latest +CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH='{"type":"tls","tls_ca_file":"/home/scu/.dask/dask-crt.pem","tls_client_cert":"/home/scu/.dask/dask-crt.pem","tls_client_key":"/home/scu/.dask/dask-key.pem"}' CLUSTERS_KEEPER_DASK_NTHREADS=0 CLUSTERS_KEEPER_EC2_ACCESS=null CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION=5 From 34c62856e1756e7e568efe176c70e3c339a6ed23 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:15:17 +0100 Subject: [PATCH 35/84] ensure the dashboard is accessible. since this is where we check the health --- services/dask-sidecar/docker/boot.sh | 6 ------ 1 file changed, 6 deletions(-) diff --git a/services/dask-sidecar/docker/boot.sh b/services/dask-sidecar/docker/boot.sh index 1197871ee5c1..97d62b1e31db 100755 --- a/services/dask-sidecar/docker/boot.sh +++ b/services/dask-sidecar/docker/boot.sh @@ -56,12 +56,6 @@ distributed: client: key: ${DASK_TLS_KEY} cert: ${DASK_TLS_CERT} - scheduler: - dashboard: - tls: - ca-file: ${DASK_TLS_CA_FILE} - key: ${DASK_TLS_KEY} - cert: ${DASK_TLS_CERT} EOF if [ ${DASK_START_AS_SCHEDULER+x} ]; then From 941e34f4d0d77c007b8db405c1b58ea3163f49e0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:18:24 +0100 Subject: [PATCH 36/84] revert that change --- .../autoscaling/tests/manual/docker-compose-computational.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/autoscaling/tests/manual/docker-compose-computational.yml b/services/autoscaling/tests/manual/docker-compose-computational.yml index 5061f73b1488..462e39904764 100644 --- a/services/autoscaling/tests/manual/docker-compose-computational.yml +++ b/services/autoscaling/tests/manual/docker-compose-computational.yml @@ -2,7 +2,7 @@ version: "3.8" services: autoscaling: environment: - - DASK_MONITORING_URL=tls://dask-scheduler:8786 + - DASK_MONITORING_URL=tcp://dask-scheduler:8786 dask-sidecar: dns: 8.8.8.8 # needed to access internet image: itisfoundation/dask-sidecar:master-github-latest @@ -16,7 +16,7 @@ services: environment: DASK_LOG_FORMAT_LOCAL_DEV_ENABLED: 1 DASK_NPROCS: 1 - DASK_SCHEDULER_URL: ${DASK_SCHEDULER_URL:-tls://dask-scheduler:8786} + DASK_SCHEDULER_URL: ${DASK_SCHEDULER_URL:-tcp://dask-scheduler:8786} DASK_SIDECAR_NON_USABLE_RAM: 0 DASK_SIDECAR_NUM_NON_USABLE_CPUS: 0 LOG_LEVEL: ${LOG_LEVEL:-INFO} From 0ac337fcfc6fb55f3acc8b39afd455329fa910ff Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:35:29 +0100 Subject: [PATCH 37/84] add authentication --- .../src/simcore_service_autoscaling/core/settings.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/services/autoscaling/src/simcore_service_autoscaling/core/settings.py b/services/autoscaling/src/simcore_service_autoscaling/core/settings.py index 352c3731e429..691ca4727512 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/core/settings.py +++ b/services/autoscaling/src/simcore_service_autoscaling/core/settings.py @@ -11,6 +11,7 @@ PortInt, VersionTag, ) +from models_library.clusters import InternalClusterAuthentication from models_library.docker import DockerLabelKey from pydantic import ( AnyUrl, @@ -152,6 +153,10 @@ class DaskMonitoringSettings(BaseCustomSettings): DASK_MONITORING_URL: AnyUrl = Field( ..., description="the url to the osparc-dask-scheduler" ) + DASK_SCHEDULER_AUTH: InternalClusterAuthentication = Field( + ..., + description="defines the authentication of the clusters created via clusters-keeper (can be None or TLS)", + ) class ApplicationSettings(BaseCustomSettings, MixinLoggingSettings): From b1bb1bb8e944295c74fb79e9c563921b7bd6bdcb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:38:46 +0100 Subject: [PATCH 38/84] adding new variable to connect with secured scheduler --- .../auto_scaling_mode_computational.py | 23 +++++-- .../modules/dask.py | 64 +++++++++++++------ services/autoscaling/tests/manual/README.md | 2 +- .../manual/docker-compose-computational.yml | 1 + services/autoscaling/tests/unit/conftest.py | 1 + .../data/docker-compose.yml | 1 + services/docker-compose.yml | 1 + 7 files changed, 67 insertions(+), 26 deletions(-) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py index 2feeb12bdbe1..1e332b1d8901 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_mode_computational.py @@ -4,6 +4,7 @@ from aws_library.ec2.models import EC2InstanceData, EC2Tags, Resources from fastapi import FastAPI +from models_library.clusters import InternalClusterAuthentication from models_library.docker import ( DOCKER_TASK_EC2_INSTANCE_TYPE_PLACEMENT_CONSTRAINT_KEY, DockerLabelKey, @@ -36,6 +37,12 @@ def _scheduler_url(app: FastAPI) -> AnyUrl: return app_settings.AUTOSCALING_DASK.DASK_MONITORING_URL +def _scheduler_auth(app: FastAPI) -> InternalClusterAuthentication: + app_settings = get_application_settings(app) + assert app_settings.AUTOSCALING_DASK # nosec + return app_settings.AUTOSCALING_DASK.DASK_SCHEDULER_AUTH + + class ComputationalAutoscaling(BaseAutoscaling): @staticmethod async def get_monitored_nodes(app: FastAPI) -> list[Node]: @@ -58,10 +65,12 @@ def get_new_node_docker_tags( @staticmethod async def list_unrunnable_tasks(app: FastAPI) -> list[DaskTask]: try: - unrunnable_tasks = await dask.list_unrunnable_tasks(_scheduler_url(app)) + unrunnable_tasks = await dask.list_unrunnable_tasks( + _scheduler_url(app), _scheduler_auth(app) + ) # NOTE: any worker "processing" more than 1 task means that the other tasks are queued! processing_tasks_by_worker = await dask.list_processing_tasks_per_worker( - _scheduler_url(app) + _scheduler_url(app), _scheduler_auth(app) ) queued_tasks = [] for tasks in processing_tasks_by_worker.values(): @@ -107,13 +116,13 @@ async def compute_node_used_resources( ) -> Resources: try: num_results_in_memory = await dask.get_worker_still_has_results_in_memory( - _scheduler_url(app), instance.ec2_instance + _scheduler_url(app), _scheduler_auth(app), instance.ec2_instance ) if num_results_in_memory > 0: # NOTE: this is a trick to consider the node still useful return Resources(cpus=0, ram=ByteSize(1024 * 1024 * 1024)) return await dask.get_worker_used_resources( - _scheduler_url(app), instance.ec2_instance + _scheduler_url(app), _scheduler_auth(app), instance.ec2_instance ) except (DaskWorkerNotFoundError, DaskNoWorkersError): return Resources.create_as_empty() @@ -139,7 +148,7 @@ async def compute_cluster_total_resources( ) -> Resources: try: return await dask.compute_cluster_total_resources( - _scheduler_url(app), instances + _scheduler_url(app), _scheduler_auth(app), instances ) except DaskNoWorkersError: return Resources.create_as_empty() @@ -153,9 +162,9 @@ async def is_instance_active(app: FastAPI, instance: AssociatedInstance) -> bool # now check if dask-scheduler is available return await dask.is_worker_connected( - _scheduler_url(app), instance.ec2_instance + _scheduler_url(app), _scheduler_auth(app), instance.ec2_instance ) @staticmethod async def try_retire_nodes(app: FastAPI) -> None: - await dask.try_retire_nodes(_scheduler_url(app)) + await dask.try_retire_nodes(_scheduler_url(app), _scheduler_auth(app)) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py index 47a5488b244f..512984a1fdb4 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/dask.py @@ -8,6 +8,7 @@ import distributed from aws_library.ec2.models import EC2InstanceData, Resources from dask_task_models_library.resource_constraints import DaskTaskResources +from models_library.clusters import InternalClusterAuthentication, TLSAuthentication from pydantic import AnyUrl, ByteSize, parse_obj_as from ..core.errors import ( @@ -37,14 +38,27 @@ async def _wrap_client_async_routine( @contextlib.asynccontextmanager -async def _scheduler_client(url: AnyUrl) -> AsyncIterator[distributed.Client]: +async def _scheduler_client( + url: AnyUrl, authentication: InternalClusterAuthentication +) -> AsyncIterator[distributed.Client]: """ Raises: DaskSchedulerNotFoundError: if the scheduler was not found/cannot be reached """ try: + security = distributed.Security() + if isinstance(authentication, TLSAuthentication): + security = distributed.Security( + tls_ca_file=f"{authentication.tls_ca_file}", + tls_client_cert=f"{authentication.tls_client_cert}", + tls_client_key=f"{authentication.tls_client_key}", + require_encryption=True, + ) async with distributed.Client( - url, asynchronous=True, timeout=f"{_DASK_SCHEDULER_CONNECT_TIMEOUT_S}" + url, + asynchronous=True, + timeout=f"{_DASK_SCHEDULER_CONNECT_TIMEOUT_S}", + security=security, ) as client: yield client except OSError as exc: @@ -97,16 +111,21 @@ def _find_by_worker_host( async def is_worker_connected( - scheduler_url: AnyUrl, worker_ec2_instance: EC2InstanceData + scheduler_url: AnyUrl, + authentication: InternalClusterAuthentication, + worker_ec2_instance: EC2InstanceData, ) -> bool: with contextlib.suppress(DaskNoWorkersError, DaskWorkerNotFoundError): - async with _scheduler_client(scheduler_url) as client: + async with _scheduler_client(scheduler_url, authentication) as client: _dask_worker_from_ec2_instance(client, worker_ec2_instance) return True return False -async def list_unrunnable_tasks(url: AnyUrl) -> list[DaskTask]: +async def list_unrunnable_tasks( + scheduler_url: AnyUrl, + authentication: InternalClusterAuthentication, +) -> list[DaskTask]: """ Raises: DaskSchedulerNotFoundError @@ -119,7 +138,7 @@ def _list_tasks( task.key: task.resource_restrictions for task in dask_scheduler.unrunnable } - async with _scheduler_client(url) as client: + async with _scheduler_client(scheduler_url, authentication) as client: list_of_tasks: dict[ DaskTaskId, DaskTaskResources ] = await _wrap_client_async_routine(client.run_on_scheduler(_list_tasks)) @@ -131,7 +150,8 @@ def _list_tasks( async def list_processing_tasks_per_worker( - url: AnyUrl, + scheduler_url: AnyUrl, + authentication: InternalClusterAuthentication, ) -> dict[DaskWorkerUrl, list[DaskTask]]: """ Raises: @@ -149,7 +169,7 @@ def _list_processing_tasks( ) return worker_to_processing_tasks - async with _scheduler_client(url) as client: + async with _scheduler_client(scheduler_url, authentication) as client: worker_to_tasks: dict[ str, list[tuple[DaskTaskId, DaskTaskResources]] ] = await _wrap_client_async_routine( @@ -166,7 +186,9 @@ def _list_processing_tasks( async def get_worker_still_has_results_in_memory( - url: AnyUrl, ec2_instance: EC2InstanceData + scheduler_url: AnyUrl, + authentication: InternalClusterAuthentication, + ec2_instance: EC2InstanceData, ) -> int: """ Raises: @@ -175,7 +197,7 @@ async def get_worker_still_has_results_in_memory( DaskWorkerNotFoundError DaskNoWorkersError """ - async with _scheduler_client(url) as client: + async with _scheduler_client(scheduler_url, authentication) as client: _, worker_details = _dask_worker_from_ec2_instance(client, ec2_instance) worker_metrics: dict[str, Any] = worker_details["metrics"] @@ -183,7 +205,9 @@ async def get_worker_still_has_results_in_memory( async def get_worker_used_resources( - url: AnyUrl, ec2_instance: EC2InstanceData + scheduler_url: AnyUrl, + authentication: InternalClusterAuthentication, + ec2_instance: EC2InstanceData, ) -> Resources: """ Raises: @@ -205,7 +229,7 @@ def _get_worker_used_resources( return dict(worker_state.used_resources) return None - async with _scheduler_client(url) as client: + async with _scheduler_client(scheduler_url, authentication) as client: worker_url, _ = _dask_worker_from_ec2_instance(client, ec2_instance) # now get the used resources @@ -213,7 +237,7 @@ def _get_worker_used_resources( client.run_on_scheduler(_get_worker_used_resources, worker_url=worker_url), ) if worker_used_resources is None: - raise DaskWorkerNotFoundError(worker_host=worker_url, url=url) + raise DaskWorkerNotFoundError(worker_host=worker_url, url=scheduler_url) return Resources( cpus=worker_used_resources.get("CPU", 0), ram=parse_obj_as(ByteSize, worker_used_resources.get("RAM", 0)), @@ -221,17 +245,19 @@ def _get_worker_used_resources( async def compute_cluster_total_resources( - url: AnyUrl, instances: list[AssociatedInstance] + scheduler_url: AnyUrl, + authentication: InternalClusterAuthentication, + instances: list[AssociatedInstance], ) -> Resources: if not instances: return Resources.create_as_empty() - async with _scheduler_client(url) as client: + async with _scheduler_client(scheduler_url, authentication) as client: instance_hosts = ( node_ip_from_ec2_private_dns(i.ec2_instance) for i in instances ) scheduler_info = client.scheduler_info() if "workers" not in scheduler_info or not scheduler_info["workers"]: - raise DaskNoWorkersError(url=url) + raise DaskNoWorkersError(url=scheduler_url) workers: dict[str, Any] = scheduler_info["workers"] for worker_details in workers.values(): if worker_details["host"] not in instance_hosts: @@ -240,8 +266,10 @@ async def compute_cluster_total_resources( return Resources.create_as_empty() -async def try_retire_nodes(url: AnyUrl) -> None: - async with _scheduler_client(url) as client: +async def try_retire_nodes( + scheduler_url: AnyUrl, authentication: InternalClusterAuthentication +) -> None: + async with _scheduler_client(scheduler_url, authentication) as client: await _wrap_client_async_routine( client.retire_workers(close_workers=False, remove=False) ) diff --git a/services/autoscaling/tests/manual/README.md b/services/autoscaling/tests/manual/README.md index b7a2f1750092..ed08a7244600 100644 --- a/services/autoscaling/tests/manual/README.md +++ b/services/autoscaling/tests/manual/README.md @@ -14,7 +14,7 @@ The dynamic mode is used directly with docker swarm facilities. ## computational mode -When ```DASK_MONITORING_URL``` is set the computational mode is enabled. +When ```DASK_MONITORING_URL``` and ```DASK_SCHEDULER_AUTH``` is set the computational mode is enabled. ### instructions diff --git a/services/autoscaling/tests/manual/docker-compose-computational.yml b/services/autoscaling/tests/manual/docker-compose-computational.yml index 462e39904764..4a61c3439f3f 100644 --- a/services/autoscaling/tests/manual/docker-compose-computational.yml +++ b/services/autoscaling/tests/manual/docker-compose-computational.yml @@ -3,6 +3,7 @@ services: autoscaling: environment: - DASK_MONITORING_URL=tcp://dask-scheduler:8786 + - DASK_SCHEDULER_AUTH='{}' dask-sidecar: dns: 8.8.8.8 # needed to access internet image: itisfoundation/dask-sidecar:master-github-latest diff --git a/services/autoscaling/tests/unit/conftest.py b/services/autoscaling/tests/unit/conftest.py index ca0f36c640ad..8c6df4c75e25 100644 --- a/services/autoscaling/tests/unit/conftest.py +++ b/services/autoscaling/tests/unit/conftest.py @@ -215,6 +215,7 @@ def enabled_computational_mode( { "AUTOSCALING_DASK": "{}", "DASK_MONITORING_URL": faker.url(), + "DASK_SCHEDULER_AUTH": "{}", "DASK_MONITORING_USER_NAME": faker.user_name(), "DASK_MONITORING_PASSWORD": faker.password(), }, diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index a8801672c452..192ca54698d6 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -73,6 +73,7 @@ services: AUTOSCALING_NODES_MONITORING: null AUTOSCALING_POLL_INTERVAL: 10 DASK_MONITORING_URL: tls://dask-scheduler:8786 + DASK_SCHEDULER_AUTH: ${CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH} EC2_INSTANCES_ALLOWED_TYPES: ${WORKERS_EC2_INSTANCES_ALLOWED_TYPES} EC2_INSTANCES_CUSTOM_TAGS: ${WORKERS_EC2_INSTANCES_CUSTOM_TAGS} EC2_INSTANCES_KEY_NAME: ${WORKERS_EC2_INSTANCES_KEY_NAME} diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 9ab3913436ea..3383fbe17dad 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -75,6 +75,7 @@ services: - AUTOSCALING_DASK=${AUTOSCALING_DASK} # comp autoscaling - DASK_MONITORING_URL=${DASK_MONITORING_URL} + - DASK_SCHEDULER_AUTH=${DASK_SCHEDULER_AUTH} - AUTOSCALING_EC2_ACCESS=${AUTOSCALING_EC2_ACCESS} # used to enable/disable - AUTOSCALING_EC2_ACCESS_KEY_ID=${AUTOSCALING_EC2_ACCESS_KEY_ID} From 008f40dbd5e2c5748c82eed3b77c4d4bb61ae074 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:52:29 +0100 Subject: [PATCH 39/84] fixed tests to new syntax --- .../tests/unit/test_modules_dask.py | 101 +++++++++++++----- 1 file changed, 77 insertions(+), 24 deletions(-) diff --git a/services/autoscaling/tests/unit/test_modules_dask.py b/services/autoscaling/tests/unit/test_modules_dask.py index 4b30887cf0e7..fffa9d69fae6 100644 --- a/services/autoscaling/tests/unit/test_modules_dask.py +++ b/services/autoscaling/tests/unit/test_modules_dask.py @@ -12,6 +12,11 @@ from arrow import utcnow from aws_library.ec2.models import Resources from faker import Faker +from models_library.clusters import ( + InternalClusterAuthentication, + NoAuthentication, + TLSAuthentication, +) from pydantic import AnyUrl, ByteSize, parse_obj_as from pytest_simcore.helpers.utils_host import get_localhost_ip from simcore_service_autoscaling.core.errors import ( @@ -35,11 +40,22 @@ ) from tenacity import retry, stop_after_delay, wait_fixed +_authentication_types = [ + NoAuthentication(), + TLSAuthentication.construct(**TLSAuthentication.Config.schema_extra["examples"][0]), +] + -async def test__scheduler_client_with_wrong_url(faker: Faker): +@pytest.mark.parametrize( + "authentication", _authentication_types, ids=lambda p: f"authentication-{p.type}" +) +async def test__scheduler_client_with_wrong_url( + faker: Faker, authentication: InternalClusterAuthentication +): with pytest.raises(DaskSchedulerNotFoundError): async with _scheduler_client( - parse_obj_as(AnyUrl, f"tcp://{faker.ipv4()}:{faker.port_number()}") + parse_obj_as(AnyUrl, f"tcp://{faker.ipv4()}:{faker.port_number()}"), + authentication, ): ... @@ -49,6 +65,11 @@ def scheduler_url(dask_spec_local_cluster: distributed.SpecCluster) -> AnyUrl: return parse_obj_as(AnyUrl, dask_spec_local_cluster.scheduler_address) +@pytest.fixture +def scheduler_authentication() -> InternalClusterAuthentication: + return NoAuthentication() + + @pytest.fixture def dask_workers_config() -> dict[str, Any]: # NOTE: override of pytest-simcore dask_workers_config to have only 1 worker @@ -64,8 +85,10 @@ def dask_workers_config() -> dict[str, Any]: } -async def test__scheduler_client(scheduler_url: AnyUrl): - async with _scheduler_client(scheduler_url): +async def test__scheduler_client( + scheduler_url: AnyUrl, scheduler_authentication: InternalClusterAuthentication +): + async with _scheduler_client(scheduler_url, scheduler_authentication): ... @@ -75,25 +98,26 @@ async def test_list_unrunnable_tasks_with_no_workers( scheduler_url = parse_obj_as( AnyUrl, dask_local_cluster_without_workers.scheduler_address ) - assert await list_unrunnable_tasks(scheduler_url) == [] + assert await list_unrunnable_tasks(scheduler_url, NoAuthentication()) == [] async def test_list_unrunnable_tasks( scheduler_url: AnyUrl, + scheduler_authentication: InternalClusterAuthentication, create_dask_task: Callable[[DaskTaskResources], distributed.Future], ): # we have nothing running now - assert await list_unrunnable_tasks(scheduler_url) == [] + assert await list_unrunnable_tasks(scheduler_url, scheduler_authentication) == [] # start a task that cannot run dask_task_impossible_resources = {"XRAM": 213} future = create_dask_task(dask_task_impossible_resources) assert future - assert await list_unrunnable_tasks(scheduler_url) == [ + assert await list_unrunnable_tasks(scheduler_url, scheduler_authentication) == [ DaskTask(task_id=future.key, required_resources=dask_task_impossible_resources) ] # remove that future, will remove the task del future - assert await list_unrunnable_tasks(scheduler_url) == [] + assert await list_unrunnable_tasks(scheduler_url, scheduler_authentication) == [] _REMOTE_FCT_SLEEP_TIME_S: Final[int] = 3 @@ -101,6 +125,7 @@ async def test_list_unrunnable_tasks( async def test_list_processing_tasks( scheduler_url: AnyUrl, + scheduler_authentication: InternalClusterAuthentication, dask_spec_cluster_client: distributed.Client, ): def _add_fct(x: int, y: int) -> int: @@ -110,13 +135,18 @@ def _add_fct(x: int, y: int) -> int: return x + y # there is nothing now - assert await list_processing_tasks_per_worker(url=scheduler_url) == {} + assert ( + await list_processing_tasks_per_worker(scheduler_url, scheduler_authentication) + == {} + ) # this function will be queued and executed as there are no specific resources needed future_queued_task = dask_spec_cluster_client.submit(_add_fct, 2, 5) assert future_queued_task - assert await list_processing_tasks_per_worker(scheduler_url) == { + assert await list_processing_tasks_per_worker( + scheduler_url, scheduler_authentication + ) == { next(iter(dask_spec_cluster_client.scheduler_info()["workers"])): [ DaskTask(task_id=DaskTaskId(future_queued_task.key), required_resources={}) ] @@ -126,7 +156,10 @@ def _add_fct(x: int, y: int) -> int: assert result == 7 # nothing processing anymore - assert await list_processing_tasks_per_worker(url=scheduler_url) == {} + assert ( + await list_processing_tasks_per_worker(scheduler_url, scheduler_authentication) + == {} + ) _DASK_SCHEDULER_REACTION_TIME_S: Final[int] = 4 @@ -151,11 +184,14 @@ def fake_ec2_instance_data_with_invalid_ec2_name( async def test_get_worker_still_has_results_in_memory_with_invalid_ec2_name_raises( scheduler_url: AnyUrl, + scheduler_authentication: InternalClusterAuthentication, fake_ec2_instance_data_with_invalid_ec2_name: EC2InstanceData, ): with pytest.raises(Ec2InvalidDnsNameError): await get_worker_still_has_results_in_memory( - scheduler_url, fake_ec2_instance_data_with_invalid_ec2_name + scheduler_url, + scheduler_authentication, + fake_ec2_instance_data_with_invalid_ec2_name, ) @@ -168,22 +204,26 @@ async def test_get_worker_still_has_results_in_memory_with_no_workers_raises( ) with pytest.raises(DaskNoWorkersError): await get_worker_still_has_results_in_memory( - scheduler_url, fake_localhost_ec2_instance_data + scheduler_url, NoAuthentication(), fake_localhost_ec2_instance_data ) async def test_get_worker_still_has_results_in_memory_with_invalid_worker_host_raises( scheduler_url: AnyUrl, + scheduler_authentication: InternalClusterAuthentication, fake_ec2_instance_data: Callable[..., EC2InstanceData], ): ec2_instance_data = fake_ec2_instance_data() with pytest.raises(DaskWorkerNotFoundError): - await get_worker_still_has_results_in_memory(scheduler_url, ec2_instance_data) + await get_worker_still_has_results_in_memory( + scheduler_url, scheduler_authentication, ec2_instance_data + ) @pytest.mark.parametrize("fct_shall_err", [True, False], ids=str) async def test_get_worker_still_has_results_in_memory( scheduler_url: AnyUrl, + scheduler_authentication: InternalClusterAuthentication, dask_spec_cluster_client: distributed.Client, fake_localhost_ec2_instance_data: EC2InstanceData, fct_shall_err: bool, @@ -191,7 +231,7 @@ async def test_get_worker_still_has_results_in_memory( # nothing ran, so it's 0 assert ( await get_worker_still_has_results_in_memory( - scheduler_url, fake_localhost_ec2_instance_data + scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data ) == 0 ) @@ -209,7 +249,7 @@ def _add_fct(x: int, y: int) -> int: await _wait_for_task_done(future_queued_task) assert ( await get_worker_still_has_results_in_memory( - scheduler_url, fake_localhost_ec2_instance_data + scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data ) == 1 ) @@ -227,7 +267,7 @@ def _add_fct(x: int, y: int) -> int: await _wait_for_dask_scheduler_to_change_state() assert ( await get_worker_still_has_results_in_memory( - scheduler_url, fake_localhost_ec2_instance_data + scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data ) == 1 ) @@ -237,7 +277,7 @@ def _add_fct(x: int, y: int) -> int: await _wait_for_dask_scheduler_to_change_state() assert ( await get_worker_still_has_results_in_memory( - scheduler_url, fake_localhost_ec2_instance_data + scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data ) == 0 ) @@ -245,11 +285,14 @@ def _add_fct(x: int, y: int) -> int: async def test_worker_used_resources_with_invalid_ec2_name_raises( scheduler_url: AnyUrl, + scheduler_authentication: InternalClusterAuthentication, fake_ec2_instance_data_with_invalid_ec2_name: EC2InstanceData, ): with pytest.raises(Ec2InvalidDnsNameError): await get_worker_used_resources( - scheduler_url, fake_ec2_instance_data_with_invalid_ec2_name + scheduler_url, + scheduler_authentication, + fake_ec2_instance_data_with_invalid_ec2_name, ) @@ -261,26 +304,34 @@ async def test_worker_used_resources_with_no_workers_raises( AnyUrl, dask_local_cluster_without_workers.scheduler_address ) with pytest.raises(DaskNoWorkersError): - await get_worker_used_resources(scheduler_url, fake_localhost_ec2_instance_data) + await get_worker_used_resources( + scheduler_url, NoAuthentication(), fake_localhost_ec2_instance_data + ) async def test_worker_used_resources_with_invalid_worker_host_raises( scheduler_url: AnyUrl, + scheduler_authentication: InternalClusterAuthentication, fake_ec2_instance_data: Callable[..., EC2InstanceData], ): ec2_instance_data = fake_ec2_instance_data() with pytest.raises(DaskWorkerNotFoundError): - await get_worker_used_resources(scheduler_url, ec2_instance_data) + await get_worker_used_resources( + scheduler_url, scheduler_authentication, ec2_instance_data + ) async def test_worker_used_resources( scheduler_url: AnyUrl, + scheduler_authentication: InternalClusterAuthentication, dask_spec_cluster_client: distributed.Client, fake_localhost_ec2_instance_data: EC2InstanceData, ): # initial state assert ( - await get_worker_used_resources(scheduler_url, fake_localhost_ec2_instance_data) + await get_worker_used_resources( + scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data + ) == Resources.create_as_empty() ) @@ -298,7 +349,7 @@ def _add_fct(x: int, y: int) -> int: assert future_queued_task await _wait_for_dask_scheduler_to_change_state() assert await get_worker_used_resources( - scheduler_url, fake_localhost_ec2_instance_data + scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data ) == Resources(cpus=num_cpus, ram=ByteSize(0)) result = await future_queued_task.result(timeout=_DASK_SCHEDULER_REACTION_TIME_S) # type: ignore @@ -306,6 +357,8 @@ def _add_fct(x: int, y: int) -> int: # back to no use assert ( - await get_worker_used_resources(scheduler_url, fake_localhost_ec2_instance_data) + await get_worker_used_resources( + scheduler_url, scheduler_authentication, fake_localhost_ec2_instance_data + ) == Resources.create_as_empty() ) From 5141e92cc043e5571b8703b8707ccea86781427f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 18:05:57 +0100 Subject: [PATCH 40/84] missing ENV --- .../src/simcore_service_clusters_keeper/utils/clusters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index 438645aca203..110176acbbdc 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -69,6 +69,7 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str: return f"'{json.dumps(jsonable_encoder(entries))}'" return [ + f"CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH={app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH.json()}", f"CLUSTERS_KEEPER_EC2_ACCESS_KEY_ID={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_ACCESS_KEY_ID}", f"CLUSTERS_KEEPER_EC2_ENDPOINT={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_ENDPOINT}", f"CLUSTERS_KEEPER_EC2_REGION_NAME={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_REGION_NAME}", From 44946a1b2a979d1e99db221a35a650e614610f37 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 18:10:29 +0100 Subject: [PATCH 41/84] remove validator --- packages/models-library/src/models_library/clusters.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/packages/models-library/src/models_library/clusters.py b/packages/models-library/src/models_library/clusters.py index 37521e877d53..c51598b06ee1 100644 --- a/packages/models-library/src/models_library/clusters.py +++ b/packages/models-library/src/models_library/clusters.py @@ -115,14 +115,6 @@ class Config(BaseAuthentication.Config): ] } - @validator("tls_ca_file", "tls_client_cert", "tls_client_key") - @classmethod - def _file_exists(cls, v: Path) -> Path: - if not v.exists(): - msg = f"{v} is missing!" - raise ValueError(msg) - return v - InternalClusterAuthentication: TypeAlias = NoAuthentication | TLSAuthentication ExternalClusterAuthentication: TypeAlias = ( From 1b26b841991157dc193f600dc1406dc9c1d22d76 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 18:13:10 +0100 Subject: [PATCH 42/84] fix test --- services/director-v2/.env-devel | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/director-v2/.env-devel b/services/director-v2/.env-devel index c76d8413a28c..3a6ba8b92264 100644 --- a/services/director-v2/.env-devel +++ b/services/director-v2/.env-devel @@ -11,6 +11,9 @@ SC_BOOT_MODE=debug-ptvsd # Variables typically passed upon start via services/docker-compose.yml files +COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=tcp://dask-scheduler:8786 +COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH='{}' + DYNAMIC_SIDECAR_IMAGE=local/dynamic-sidecar:development DYNAMIC_SIDECAR_PROMETHEUS_SERVICE_LABELS={} From da6a8a1fc4eeb4118afa397aa1fa98fbc0be1941 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Thu, 25 Jan 2024 18:39:15 +0100 Subject: [PATCH 43/84] fixed test --- .../tests/unit/with_dbs/test_api_route_dynamic_services.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py b/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py index 1ba72da7fb85..5028d733ca6a 100644 --- a/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py +++ b/services/director-v2/tests/unit/with_dbs/test_api_route_dynamic_services.py @@ -90,7 +90,10 @@ def dynamic_sidecar_headers() -> dict[str, str]: @pytest.fixture() def mock_env( - disable_postgres: None, disable_rabbitmq: None, monkeypatch: pytest.MonkeyPatch + disable_postgres: None, + disable_rabbitmq: None, + monkeypatch: pytest.MonkeyPatch, + faker: Faker, ) -> None: # Works as below line in docker.compose.yml # ${DOCKER_REGISTRY:-itisfoundation}/dynamic-sidecar:${DOCKER_IMAGE_TAG:-latest} @@ -109,6 +112,8 @@ def mock_env( monkeypatch.setenv("SWARM_STACK_NAME", "test_swarm_name") monkeypatch.setenv("COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED", "false") monkeypatch.setenv("COMPUTATIONAL_BACKEND_ENABLED", "false") + monkeypatch.setenv("COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL", f"{faker.url()}") + monkeypatch.setenv("COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH", "{}") monkeypatch.setenv("DIRECTOR_V2_DYNAMIC_SCHEDULER_ENABLED", "true") monkeypatch.setenv("RABBIT_HOST", "mocked_host") From dc2f5631fe8257b39821df1d84de0efcd33a57b0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 08:09:52 +0100 Subject: [PATCH 44/84] make the tls certificates optional --- services/dask-sidecar/docker/boot.sh | 36 ++++++++++++++++------------ test.sh | 4 ++++ 2 files changed, 25 insertions(+), 15 deletions(-) create mode 100755 test.sh diff --git a/services/dask-sidecar/docker/boot.sh b/services/dask-sidecar/docker/boot.sh index 97d62b1e31db..e171240a2cce 100755 --- a/services/dask-sidecar/docker/boot.sh +++ b/services/dask-sidecar/docker/boot.sh @@ -41,23 +41,29 @@ cat >/home/scu/.config/dask/distributed.yaml <> /home/scu/.config/dask/distributed.yaml < Date: Fri, 26 Jan 2024 11:39:57 +0100 Subject: [PATCH 45/84] upgrade v0.9.0 --- scripts/shellcheck.bash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/shellcheck.bash b/scripts/shellcheck.bash index 2fea124a9281..e90777d7fd48 100755 --- a/scripts/shellcheck.bash +++ b/scripts/shellcheck.bash @@ -5,4 +5,4 @@ # - VS extension: https://github.com/timonwong/vscode-shellcheck # -exec docker run --rm --interactive --volume "$PWD:/mnt:ro" koalaman/shellcheck:v0.7.0 "$@" +exec docker run --rm --interactive --volume "$PWD:/mnt:ro" koalaman/shellcheck:v0.9.0 "$@" From f2bcfc8f2442d56884674f1076aff60eb48c815d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:55:14 +0100 Subject: [PATCH 46/84] pass iam role with ec2 instances --- packages/aws-library/src/aws_library/ec2/client.py | 3 +++ packages/aws-library/src/aws_library/ec2/models.py | 1 + 2 files changed, 4 insertions(+) diff --git a/packages/aws-library/src/aws_library/ec2/client.py b/packages/aws-library/src/aws_library/ec2/client.py index efdd22c3642c..99661c7cac0f 100644 --- a/packages/aws-library/src/aws_library/ec2/client.py +++ b/packages/aws-library/src/aws_library/ec2/client.py @@ -128,6 +128,9 @@ async def start_aws_instance( ImageId=instance_config.ami_id, MinCount=number_of_instances, MaxCount=number_of_instances, + IamInstanceProfile={"Arn": instance_config.iam_instance_profile} + if instance_config.iam_instance_profile + else {}, InstanceType=instance_config.type.name, InstanceInitiatedShutdownBehavior="terminate", KeyName=instance_config.key_name, diff --git a/packages/aws-library/src/aws_library/ec2/models.py b/packages/aws-library/src/aws_library/ec2/models.py index 79c73e89fae8..6c0593d66ef2 100644 --- a/packages/aws-library/src/aws_library/ec2/models.py +++ b/packages/aws-library/src/aws_library/ec2/models.py @@ -103,6 +103,7 @@ class EC2InstanceConfig: key_name: str security_group_ids: list[str] subnet_id: str + iam_instance_profile: str AMIIdStr: TypeAlias = str From e377456426e95fbccc7574e2f5336c2ddebbe199 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:55:39 +0100 Subject: [PATCH 47/84] new ENV --- .../src/simcore_service_clusters_keeper/core/settings.py | 4 ++++ services/docker-compose.yml | 1 + 2 files changed, 5 insertions(+) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py index 4ea23b435d08..02ffa27770c7 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py @@ -141,6 +141,10 @@ class PrimaryEC2InstancesSettings(BaseCustomSettings): description="Allows to define tags that should be added to the created EC2 instance default tags. " "a tag must have a key and an optional value. see [https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html]", ) + PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE: str = Field( + ..., + description="ARN the EC2 instance should be attached to (example: arn:aws:iam::XXXXX:role/NAME)", + ) @validator("PRIMARY_EC2_INSTANCES_ALLOWED_TYPES") @classmethod diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 3383fbe17dad..72d823d6b24d 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -173,6 +173,7 @@ services: - PRIMARY_EC2_INSTANCES_SECURITY_GROUP_IDS=${PRIMARY_EC2_INSTANCES_SECURITY_GROUP_IDS} - PRIMARY_EC2_INSTANCES_SUBNET_ID=${PRIMARY_EC2_INSTANCES_SUBNET_ID} - PRIMARY_EC2_INSTANCES_CUSTOM_TAGS=${PRIMARY_EC2_INSTANCES_CUSTOM_TAGS} + - PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE=${PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE} - RABBIT_HOST=${RABBIT_HOST} - RABBIT_PASSWORD=${RABBIT_PASSWORD} - RABBIT_PORT=${RABBIT_PORT} From fcff7e0da3468ba9512511a9545ea9acc6e07c40 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:55:47 +0100 Subject: [PATCH 48/84] add call --- .../src/simcore_service_clusters_keeper/modules/clusters.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py index c4171cdc037d..dd4e2800fd73 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py @@ -84,6 +84,7 @@ async def create_cluster( key_name=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_KEY_NAME, security_group_ids=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SECURITY_GROUP_IDS, subnet_id=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SUBNET_ID, + iam_instance_profile=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_IAM_PROFILES, ) new_ec2_instance_data: list[EC2InstanceData] = await ec2_client.start_aws_instance( instance_config, From c5405f4ebf73a9d372773e7433455145ed099a8d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 11:59:16 +0100 Subject: [PATCH 49/84] add missing env PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE --- services/clusters-keeper/tests/unit/conftest.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/clusters-keeper/tests/unit/conftest.py b/services/clusters-keeper/tests/unit/conftest.py index aa0d4a9e0e7a..486adc7dbc9d 100644 --- a/services/clusters-keeper/tests/unit/conftest.py +++ b/services/clusters-keeper/tests/unit/conftest.py @@ -120,6 +120,7 @@ def app_environment( "PRIMARY_EC2_INSTANCES_CUSTOM_TAGS": json.dumps( {"osparc-tag": "the pytest tag is here"} ), + "PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE": faker.pystr(), "CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES": "{}", "WORKERS_EC2_INSTANCES_ALLOWED_TYPES": json.dumps( { From f5309d4dc3e29edb6b9980460c3f000a80d9682b Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 12:01:00 +0100 Subject: [PATCH 50/84] fix code --- .../src/simcore_service_clusters_keeper/modules/clusters.py | 2 +- services/clusters-keeper/tests/unit/conftest.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py index dd4e2800fd73..092443723859 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/modules/clusters.py @@ -84,7 +84,7 @@ async def create_cluster( key_name=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_KEY_NAME, security_group_ids=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SECURITY_GROUP_IDS, subnet_id=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SUBNET_ID, - iam_instance_profile=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_IAM_PROFILES, + iam_instance_profile=app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE, ) new_ec2_instance_data: list[EC2InstanceData] = await ec2_client.start_aws_instance( instance_config, diff --git a/services/clusters-keeper/tests/unit/conftest.py b/services/clusters-keeper/tests/unit/conftest.py index 486adc7dbc9d..f74513c9a844 100644 --- a/services/clusters-keeper/tests/unit/conftest.py +++ b/services/clusters-keeper/tests/unit/conftest.py @@ -120,7 +120,7 @@ def app_environment( "PRIMARY_EC2_INSTANCES_CUSTOM_TAGS": json.dumps( {"osparc-tag": "the pytest tag is here"} ), - "PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE": faker.pystr(), + "PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE": "", # must be empty since we would need to add it to moto as well "CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES": "{}", "WORKERS_EC2_INSTANCES_ALLOWED_TYPES": json.dumps( { From caa7cbeb374c5f819d18f11ade8488ce3dbb3fb6 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 14:25:12 +0100 Subject: [PATCH 51/84] fix call syntax --- .../src/simcore_service_autoscaling/modules/auto_scaling_core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py index 63a31cfa195b..34bd5e916db4 100644 --- a/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py +++ b/services/autoscaling/src/simcore_service_autoscaling/modules/auto_scaling_core.py @@ -601,6 +601,7 @@ async def _start_instances( key_name=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_KEY_NAME, security_group_ids=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_SECURITY_GROUP_IDS, subnet_id=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_SUBNET_ID, + iam_instance_profile="", ), number_of_instances=instance_num, max_number_of_instances=app_settings.AUTOSCALING_EC2_INSTANCES.EC2_INSTANCES_MAX_INSTANCES, From 606dbb9dbabd01c7c6e48007c4a4224a1726e569 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 14:29:39 +0100 Subject: [PATCH 52/84] fix indentation --- services/dask-sidecar/docker/boot.sh | 30 ++++++++++++++-------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/services/dask-sidecar/docker/boot.sh b/services/dask-sidecar/docker/boot.sh index e171240a2cce..8b87372f612d 100755 --- a/services/dask-sidecar/docker/boot.sh +++ b/services/dask-sidecar/docker/boot.sh @@ -46,21 +46,21 @@ EOF # Check if DASK_TLS_CA_FILE is present if [ -n "$DASK_TLS_CA_FILE" ]; then cat >> /home/scu/.config/dask/distributed.yaml < Date: Fri, 26 Jan 2024 14:50:40 +0000 Subject: [PATCH 53/84] fix protocol --- .../src/simcore_service_clusters_keeper/utils/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/dask.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/dask.py index 363ffc973795..1189a0d77489 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/dask.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/dask.py @@ -7,7 +7,7 @@ def get_scheduler_url(ec2_instance: EC2InstanceData) -> AnyUrl: - url: AnyUrl = parse_obj_as(AnyUrl, f"tcp://{ec2_instance.aws_public_ip}:8786") + url: AnyUrl = parse_obj_as(AnyUrl, f"tls://{ec2_instance.aws_public_ip}:8786") return url From 680f387aab132edfcbadfa17b536bc164863a27c Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 14:52:54 +0000 Subject: [PATCH 54/84] ensure the path used is the correct one --- .../data/docker-compose.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index 192ca54698d6..8d338e21530d 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -1,7 +1,7 @@ version: "3.8" x-dask-tls-secrets: &dask_tls_secrets - source: dask_tls_ca - target: ${DASK_TLS_KEY} + target: ${DASK_TLS_CA_FILE} mode: 0444 - source: dask_tls_key target: ${DASK_TLS_KEY} @@ -23,7 +23,7 @@ services: DASK_TLS_KEY: ${DASK_TLS_KEY} LOG_LEVEL: ${LOG_LEVEL} ports: - - 8786:8786 # dask-scheduler tcp access + - 8786:8786 # dask-scheduler access - 8787:8787 # dashboard deploy: placement: @@ -120,8 +120,8 @@ volumes: secrets: dask_tls_ca: - file: .dask-certificates/tls_dask_ca.pem + file: ${DASK_TLS_CA_FILE} dask_tls_key: - file: .dask-certificates/tls_dask_cert.pem + file: ${DASK_TLS_KEY} dask_tls_cert: - file: .dask-certificates/tls_dask_key.pem + file: ${DASK_TLS_CERT} From 2eb75a3e1a2b93e36dbb5f1f7c17cc8017909621 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:03:44 +0000 Subject: [PATCH 55/84] add settings to download data --- .../simcore_service_clusters_keeper/core/settings.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py index 02ffa27770c7..4275e8efa1e9 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/core/settings.py @@ -143,7 +143,16 @@ class PrimaryEC2InstancesSettings(BaseCustomSettings): ) PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE: str = Field( ..., - description="ARN the EC2 instance should be attached to (example: arn:aws:iam::XXXXX:role/NAME)", + description="ARN the EC2 instance should be attached to (example: arn:aws:iam::XXXXX:role/NAME), to disable pass an empty string", + ) + PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CA: str = Field( + ..., description="Name of the dask TLC CA in AWS Parameter Store" + ) + PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CERT: str = Field( + ..., description="Name of the dask TLC certificate in AWS Parameter Store" + ) + PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_KEY: str = Field( + ..., description="Name of the dask TLC key in AWS Parameter Store" ) @validator("PRIMARY_EC2_INSTANCES_ALLOWED_TYPES") From 2bc0f2771f56e6cdb7ad6bdf28e3f4f62faca68d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:03:59 +0000 Subject: [PATCH 56/84] we download certificates --- .../utils/clusters.py | 25 ++++++++----------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index 110176acbbdc..931ca8012364 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -42,16 +42,6 @@ def _docker_compose_yml_base64_encoded() -> str: return _base_64_encode(file_path) -@functools.lru_cache -def _write_tls_certificates_commands(auth: TLSAuthentication) -> list[str]: - return [ - f"mkdir --parents {_HOST_CERTIFICATES_BASE_PATH}", - f"echo '{_base_64_encode(auth.tls_ca_file)}' > {_HOST_TLS_CA_FILE_PATH}", - f"echo '{_base_64_encode(auth.tls_client_cert)}' > {_HOST_TLS_CERT_FILE_PATH}", - f"echo '{_base_64_encode(auth.tls_client_key)}' > {_HOST_TLS_KEY_FILE_PATH}", - ] - - def _prepare_environment_variables( app_settings: ApplicationSettings, *, @@ -69,7 +59,7 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str: return f"'{json.dumps(jsonable_encoder(entries))}'" return [ - f"CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH={app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH.json()}", + f"CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH)}", f"CLUSTERS_KEEPER_EC2_ACCESS_KEY_ID={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_ACCESS_KEY_ID}", f"CLUSTERS_KEEPER_EC2_ENDPOINT={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_ENDPOINT}", f"CLUSTERS_KEEPER_EC2_REGION_NAME={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_REGION_NAME}", @@ -113,10 +103,15 @@ def create_startup_script( app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH, TLSAuthentication, ): - write_certificates_commands = _write_tls_certificates_commands( - app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH - ) - startup_commands.extend(write_certificates_commands) + assert app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES # nosec + download_certificates_commands = [ + "apt install -y awscli", + f"mkdir --parents {_HOST_CERTIFICATES_BASE_PATH}", + f'aws ssm get-parameter --name "{app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CA}" --region us-east-1 --with-decryption --query "Parameter.Value" --output text > {_HOST_TLS_CA_FILE_PATH}', + f'aws ssm get-parameter --name "{app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CERT}" --region us-east-1 --with-decryption --query "Parameter.Value" --output text > {_HOST_TLS_CERT_FILE_PATH}', + f'aws ssm get-parameter --name "{app_settings.CLUSTERS_KEEPER_PRIMARY_EC2_INSTANCES.PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_KEY}" --region us-east-1 --with-decryption --query "Parameter.Value" --output text > {_HOST_TLS_KEY_FILE_PATH}', + ] + startup_commands.extend(download_certificates_commands) startup_commands.extend( [ From c79d9690880cc3a4d66ce11f59063a95e86a9913 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:04:13 +0000 Subject: [PATCH 57/84] ensure we go tls --- .../simcore_service_clusters_keeper/data/docker-compose.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index 8d338e21530d..c814baa8f911 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -17,10 +17,11 @@ services: init: true hostname: "{{.Node.Hostname}}-{{.Service.Name}}-{{.Task.Slot}}" environment: - DASK_START_AS_SCHEDULER: 1 DASK_TLS_CA_FILE: ${DASK_TLS_CA_FILE} DASK_TLS_CERT: ${DASK_TLS_CERT} DASK_TLS_KEY: ${DASK_TLS_KEY} + DASK_SCHEDULER_URL: tls://dask-scheduler:8786 + DASK_START_AS_SCHEDULER: 1 LOG_LEVEL: ${LOG_LEVEL} ports: - 8786:8786 # dask-scheduler access From 0874f884118ef5de6ca3fb8ed500c0dfe891e843 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:51:52 +0000 Subject: [PATCH 58/84] return the correct auth --- .../src/simcore_service_clusters_keeper/rpc/clusters.py | 6 +++--- .../src/simcore_service_clusters_keeper/utils/clusters.py | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/clusters.py index dcd08c9e3a6b..285b9172a1eb 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/rpc/clusters.py @@ -44,14 +44,14 @@ async def get_or_create_cluster( assert len(new_ec2_instances) == 1 # nosec ec2_instance = new_ec2_instances[0] assert ec2_instance is not None # nosec + cluster_auth = get_scheduler_auth(app) return create_cluster_from_ec2_instance( ec2_instance, user_id, wallet_id, dask_scheduler_ready=bool( ec2_instance.state == "running" - and await ping_scheduler( - get_scheduler_url(ec2_instance), get_scheduler_auth(app) - ) + and await ping_scheduler(get_scheduler_url(ec2_instance), cluster_auth) ), + cluster_auth=cluster_auth, ) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index 931ca8012364..2034ee1b6c3f 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -11,7 +11,7 @@ ClusterState, OnDemandCluster, ) -from models_library.clusters import NoAuthentication, TLSAuthentication +from models_library.clusters import InternalClusterAuthentication, TLSAuthentication from models_library.users import UserID from models_library.wallets import WalletID from types_aiobotocore_ec2.literals import InstanceStateNameType @@ -166,10 +166,11 @@ def create_cluster_from_ec2_instance( wallet_id: WalletID | None, *, dask_scheduler_ready: bool, + cluster_auth: InternalClusterAuthentication, ) -> OnDemandCluster: return OnDemandCluster( endpoint=get_scheduler_url(instance), - authentication=NoAuthentication(), + authentication=cluster_auth, state=_convert_ec2_state_to_cluster_state(instance.state), user_id=user_id, wallet_id=wallet_id, From ab98145773aa37b3a70dbb36769c52562fc36b30 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:52:14 +0000 Subject: [PATCH 59/84] fix auth in autoscaling --- .../data/docker-compose.yml | 46 +++++++++++++------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml index c814baa8f911..07c171a1e5d0 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/data/docker-compose.yml @@ -1,15 +1,4 @@ version: "3.8" -x-dask-tls-secrets: &dask_tls_secrets - - source: dask_tls_ca - target: ${DASK_TLS_CA_FILE} - mode: 0444 - - source: dask_tls_key - target: ${DASK_TLS_KEY} - mode: 0444 - - source: dask_tls_cert - target: ${DASK_TLS_CERT} - mode: 0444 - services: dask-scheduler: image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG} @@ -30,7 +19,16 @@ services: placement: constraints: - "node.role==manager" - secrets: *dask_tls_secrets + secrets: + - source: dask_tls_ca + target: ${DASK_TLS_CA_FILE} + mode: 0444 + - source: dask_tls_key + target: ${DASK_TLS_KEY} + mode: 0444 + - source: dask_tls_cert + target: ${DASK_TLS_CERT} + mode: 0444 dask-sidecar: image: ${DOCKER_REGISTRY:-itisfoundation}/dask-sidecar:${DOCKER_IMAGE_TAG} @@ -59,7 +57,16 @@ services: placement: constraints: - "node.role==worker" - secrets: *dask_tls_secrets + secrets: + - source: dask_tls_ca + target: ${DASK_TLS_CA_FILE} + mode: 0444 + - source: dask_tls_key + target: ${DASK_TLS_KEY} + mode: 0444 + - source: dask_tls_cert + target: ${DASK_TLS_CERT} + mode: 0444 autoscaling: image: ${DOCKER_REGISTRY:-itisfoundation}/autoscaling:${DOCKER_IMAGE_TAG} @@ -74,7 +81,7 @@ services: AUTOSCALING_NODES_MONITORING: null AUTOSCALING_POLL_INTERVAL: 10 DASK_MONITORING_URL: tls://dask-scheduler:8786 - DASK_SCHEDULER_AUTH: ${CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH} + DASK_SCHEDULER_AUTH: '{"type":"tls","tls_ca_file":"${DASK_TLS_CA_FILE}","tls_client_cert":"${DASK_TLS_CERT}","tls_client_key":"${DASK_TLS_KEY}"}' EC2_INSTANCES_ALLOWED_TYPES: ${WORKERS_EC2_INSTANCES_ALLOWED_TYPES} EC2_INSTANCES_CUSTOM_TAGS: ${WORKERS_EC2_INSTANCES_CUSTOM_TAGS} EC2_INSTANCES_KEY_NAME: ${WORKERS_EC2_INSTANCES_KEY_NAME} @@ -94,7 +101,16 @@ services: placement: constraints: - "node.role==manager" - secrets: *dask_tls_secrets + secrets: + - source: dask_tls_ca + target: ${DASK_TLS_CA_FILE} + mode: 0444 + - source: dask_tls_key + target: ${DASK_TLS_KEY} + mode: 0444 + - source: dask_tls_cert + target: ${DASK_TLS_CERT} + mode: 0444 redis: # NOTE: currently autoscaling requires redis to run From a490395a5eeee3c8a20ae82c3114208c5d383fbf Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 10:08:02 +0100 Subject: [PATCH 60/84] added missing envs --- services/clusters-keeper/tests/unit/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/clusters-keeper/tests/unit/conftest.py b/services/clusters-keeper/tests/unit/conftest.py index f74513c9a844..2cd3249f2b1b 100644 --- a/services/clusters-keeper/tests/unit/conftest.py +++ b/services/clusters-keeper/tests/unit/conftest.py @@ -121,6 +121,9 @@ def app_environment( {"osparc-tag": "the pytest tag is here"} ), "PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE": "", # must be empty since we would need to add it to moto as well + "PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CA": faker.pystr(), + "PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CERT": faker.pystr(), + "PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_KEY": faker.pystr(), "CLUSTERS_KEEPER_WORKERS_EC2_INSTANCES": "{}", "WORKERS_EC2_INSTANCES_ALLOWED_TYPES": json.dumps( { From 463f676dd3e3d2d058218c5212933ad192c5e1fc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Fri, 26 Jan 2024 10:09:36 +0100 Subject: [PATCH 61/84] fixed test --- packages/aws-library/tests/test_ec2_client.py | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/aws-library/tests/test_ec2_client.py b/packages/aws-library/tests/test_ec2_client.py index 1e85be088d1a..c8890b447dd1 100644 --- a/packages/aws-library/tests/test_ec2_client.py +++ b/packages/aws-library/tests/test_ec2_client.py @@ -96,6 +96,7 @@ def ec2_instance_config( key_name=faker.pystr(), security_group_ids=[aws_security_group_id], subnet_id=aws_subnet_id, + iam_instance_profile="", ) From 8fd34a3c7fb5a5a8ba49bc189f8fb087bc3190a5 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sat, 27 Jan 2024 15:18:08 +0100 Subject: [PATCH 62/84] fixed test --- .../tests/unit/test_utils_clusters.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/services/clusters-keeper/tests/unit/test_utils_clusters.py b/services/clusters-keeper/tests/unit/test_utils_clusters.py index c49945b61692..44e1b5a91f82 100644 --- a/services/clusters-keeper/tests/unit/test_utils_clusters.py +++ b/services/clusters-keeper/tests/unit/test_utils_clusters.py @@ -17,6 +17,11 @@ ) from faker import Faker from models_library.api_schemas_clusters_keeper.clusters import ClusterState +from models_library.clusters import ( + InternalClusterAuthentication, + NoAuthentication, + TLSAuthentication, +) from pytest_simcore.helpers.utils_envs import EnvVarsDict from simcore_service_clusters_keeper.core.settings import ApplicationSettings from simcore_service_clusters_keeper.utils.clusters import ( @@ -187,11 +192,19 @@ def test_startup_script_defines_all_envs_for_docker_compose( ("whatever", ClusterState.STOPPED), ], ) +@pytest.mark.parametrize( + "authentication", + [ + NoAuthentication(), + TLSAuthentication(**TLSAuthentication.Config.schema_extra["examples"][0]), + ], +) def test_create_cluster_from_ec2_instance( fake_ec2_instance_data: Callable[..., EC2InstanceData], faker: Faker, ec2_state: InstanceStateNameType, expected_cluster_state: ClusterState, + authentication: InternalClusterAuthentication, ): instance_data = fake_ec2_instance_data(state=ec2_state) cluster_instance = create_cluster_from_ec2_instance( @@ -199,6 +212,8 @@ def test_create_cluster_from_ec2_instance( faker.pyint(), faker.pyint(), dask_scheduler_ready=faker.pybool(), + cluster_auth=authentication, ) assert cluster_instance assert cluster_instance.state is expected_cluster_state + assert cluster_instance.authentication == authentication From 87830332eacbc548fe4be0e26272ceb74c053d82 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sat, 27 Jan 2024 15:20:33 +0100 Subject: [PATCH 63/84] unnecessary env --- .../src/simcore_service_clusters_keeper/utils/clusters.py | 1 - 1 file changed, 1 deletion(-) diff --git a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py index 2034ee1b6c3f..7873ab67db60 100644 --- a/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py +++ b/services/clusters-keeper/src/simcore_service_clusters_keeper/utils/clusters.py @@ -59,7 +59,6 @@ def _convert_to_env_dict(entries: dict[str, Any]) -> str: return f"'{json.dumps(jsonable_encoder(entries))}'" return [ - f"CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH={_convert_to_env_dict(app_settings.CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH)}", f"CLUSTERS_KEEPER_EC2_ACCESS_KEY_ID={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_ACCESS_KEY_ID}", f"CLUSTERS_KEEPER_EC2_ENDPOINT={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_ENDPOINT}", f"CLUSTERS_KEEPER_EC2_REGION_NAME={app_settings.CLUSTERS_KEEPER_EC2_ACCESS.EC2_REGION_NAME}", From c1e4f4744e52138b86ba53ea779580ce1a9e232d Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sat, 27 Jan 2024 15:34:24 +0100 Subject: [PATCH 64/84] doc --- .../src/simcore_service_director_v2/core/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/src/simcore_service_director_v2/core/settings.py b/services/director-v2/src/simcore_service_director_v2/core/settings.py index 6b9e7bec3600..366dc591fb98 100644 --- a/services/director-v2/src/simcore_service_director_v2/core/settings.py +++ b/services/director-v2/src/simcore_service_director_v2/core/settings.py @@ -72,7 +72,7 @@ class ComputationalBackendSettings(BaseCustomSettings): ..., description="This is the cluster that will be used by default" " when submitting computational services (typically " - "tcp://dask-scheduler:8786 for the internal cluster, or " + "tcp://dask-scheduler:8786, tls://dask-scheduler:8786 for the internal cluster, or " "http(s)/GATEWAY_IP:8000 for a osparc-dask-gateway)", ) COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH: ClusterAuthentication = Field( From 65acc824fca9e32d7ddb1715bc791e0ef6e914ca Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sat, 27 Jan 2024 15:37:43 +0100 Subject: [PATCH 65/84] fixed test --- .../unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py index af199cc59f8f..ba81e0aad1cf 100644 --- a/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py +++ b/services/director-v2/tests/unit/with_dbs/test_modules_comp_scheduler_dask_scheduler.py @@ -1037,7 +1037,7 @@ async def test_task_progress_triggers( ) async def test_handling_of_disconnected_dask_scheduler( with_disabled_scheduler_task: None, - dask_spec_local_cluster: SpecCluster, + mocked_dask_client: mock.MagicMock, scheduler: BaseCompScheduler, aiopg_engine: aiopg.sa.engine.Engine, mocker: MockerFixture, From 86df492bcab7ac6827d48e41c6069beaef9b3323 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sat, 27 Jan 2024 15:51:23 +0100 Subject: [PATCH 66/84] it is now tls --- .../src/pytest_simcore/simcore_dask_service.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py b/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py index cc7f6370a56a..444b2ea297d6 100644 --- a/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py +++ b/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py @@ -11,7 +11,7 @@ from .helpers.utils_docker import get_service_published_port -@pytest.fixture(scope="function") +@pytest.fixture async def dask_scheduler_service(simcore_services_ready, monkeypatch) -> str: # the dask scheduler has a UI for the dashboard and a secondary port for the API # simcore_services fixture already ensure the dask-scheduler is up and running @@ -20,17 +20,16 @@ async def dask_scheduler_service(simcore_services_ready, monkeypatch) -> str: ) # override the port monkeypatch.setenv("DASK_SCHEDULER_PORT", f"{dask_scheduler_api_port}") - return AnyUrl.build(scheme="tcp", host="127.0.0.1", port=dask_scheduler_api_port) + return AnyUrl.build(scheme="tls", host="127.0.0.1", port=dask_scheduler_api_port) -@pytest.fixture(scope="function") +@pytest.fixture def dask_client(dask_scheduler_service: str) -> Iterator[Client]: - client = Client(dask_scheduler_service) yield client client.close() -@pytest.fixture(scope="function") +@pytest.fixture def dask_sidecar_service(dask_client: Client) -> None: dask_client.wait_for_workers(n_workers=1, timeout=30) From d1cc94ffa043687d75831d73ab3788248258158f Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sat, 27 Jan 2024 15:54:09 +0100 Subject: [PATCH 67/84] tls --- .../tests/integration/test_dask_sidecar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/osparc-gateway-server/tests/integration/test_dask_sidecar.py b/services/osparc-gateway-server/tests/integration/test_dask_sidecar.py index fa7e35548706..0966a099e1fa 100644 --- a/services/osparc-gateway-server/tests/integration/test_dask_sidecar.py +++ b/services/osparc-gateway-server/tests/integration/test_dask_sidecar.py @@ -132,7 +132,7 @@ async def test_computational_sidecar_properly_start_stop( "ContainerSpec": { "Image": image_name, "Env": sidecar_envs - | {"DASK_SCHEDULER_URL": f"tcp://{get_localhost_ip()}:8786"}, + | {"DASK_SCHEDULER_URL": f"tls://{get_localhost_ip()}:8786"}, "Init": True, "Mounts": sidecar_mounts, } From c979265be24146ffbfe36609be3d85353efd1f44 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Sat, 27 Jan 2024 16:06:00 +0100 Subject: [PATCH 68/84] ensure starts when DASK_TLS_CA_FILE not set --- services/dask-sidecar/docker/boot.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/dask-sidecar/docker/boot.sh b/services/dask-sidecar/docker/boot.sh index 8b87372f612d..beb92daac58e 100755 --- a/services/dask-sidecar/docker/boot.sh +++ b/services/dask-sidecar/docker/boot.sh @@ -44,8 +44,8 @@ logging: EOF # Check if DASK_TLS_CA_FILE is present -if [ -n "$DASK_TLS_CA_FILE" ]; then - cat >> /home/scu/.config/dask/distributed.yaml <>/home/scu/.config/dask/distributed.yaml < Date: Mon, 29 Jan 2024 09:02:23 +0100 Subject: [PATCH 69/84] mock dask client --- services/clusters-keeper/tests/unit/test_rpc_clusters.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/services/clusters-keeper/tests/unit/test_rpc_clusters.py b/services/clusters-keeper/tests/unit/test_rpc_clusters.py index 9c8b171b6c25..6eb2dd2eb1b9 100644 --- a/services/clusters-keeper/tests/unit/test_rpc_clusters.py +++ b/services/clusters-keeper/tests/unit/test_rpc_clusters.py @@ -151,6 +151,7 @@ async def test_get_or_create_cluster_massive_calls( ec2_client: EC2Client, user_id: UserID, wallet_id: WalletID, + mocked_dask_ping_scheduler: MockedDaskModule, ): # NOTE: when a user starts many computational jobs in parallel # the get_or_create_cluster is flooded with a lot of calls for the @@ -172,3 +173,4 @@ async def test_get_or_create_cluster_massive_calls( assert results assert all(isinstance(response, OnDemandCluster) for response in results) + mocked_dask_ping_scheduler.ping_scheduler.assert_called_once() From c00d77012b306c8768e47dd082ed0cc6a324a178 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 10:02:41 +0100 Subject: [PATCH 70/84] use a non secure scheduler for this test --- .../tests/integration/test_dask_sidecar.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/services/osparc-gateway-server/tests/integration/test_dask_sidecar.py b/services/osparc-gateway-server/tests/integration/test_dask_sidecar.py index 0966a099e1fa..cbf54edb1c0d 100644 --- a/services/osparc-gateway-server/tests/integration/test_dask_sidecar.py +++ b/services/osparc-gateway-server/tests/integration/test_dask_sidecar.py @@ -2,7 +2,8 @@ # pylint: disable=redefined-outer-name import asyncio -from typing import Any, AsyncIterator, Awaitable, Callable +from collections.abc import AsyncIterator, Awaitable, Callable, Mapping +from typing import Any import aiodocker import pytest @@ -18,8 +19,7 @@ async def sidecar_computational_shared_volume( faker: Faker, docker_volume: Callable[[str], Awaitable[dict[str, Any]]], ) -> dict[str, Any]: - volume = await docker_volume(faker.pystr()) - return volume + return await docker_volume(faker.pystr()) @pytest.fixture @@ -32,11 +32,10 @@ def sidecar_envs( computational_sidecar_mounted_folder: str, sidecar_computational_shared_volume: dict[str, Any], ) -> dict[str, str]: - envs = { + return { "SIDECAR_COMP_SERVICES_SHARED_FOLDER": f"{computational_sidecar_mounted_folder}", "SIDECAR_COMP_SERVICES_SHARED_VOLUME_NAME": f"{sidecar_computational_shared_volume['Name']}", } - return envs @pytest.fixture @@ -44,7 +43,7 @@ def sidecar_mounts( sidecar_computational_shared_volume: dict[str, Any], computational_sidecar_mounted_folder: str, ) -> list[dict[str, Any]]: - mounts = [ # docker socket needed to use the docker api + return [ # docker socket needed to use the docker api { "Source": "/var/run/docker.sock", "Target": "/var/run/docker.sock", @@ -59,16 +58,15 @@ def sidecar_mounts( "ReadOnly": False, }, ] - return mounts @pytest.fixture async def create_docker_service( async_docker_client: aiodocker.Docker, -) -> AsyncIterator[Callable[..., Awaitable[dict[str, Any]]]]: +) -> AsyncIterator[Callable[..., Awaitable[Mapping[str, Any]]]]: services = [] - async def service_creator(**service_kwargs) -> dict[str, Any]: + async def service_creator(**service_kwargs) -> Mapping[str, Any]: service = await async_docker_client.services.create(**service_kwargs) assert service assert "ID" in service @@ -116,7 +114,11 @@ async def test_computational_sidecar_properly_start_stop( task_template={ "ContainerSpec": { "Image": image_name, - "Env": sidecar_envs | {"DASK_START_AS_SCHEDULER": "1"}, + "Env": sidecar_envs + | { + "DASK_START_AS_SCHEDULER": "1", + "DASK_SCHEDULER_URL": f"tcp://{get_localhost_ip()}:8786", + }, "Init": True, "Mounts": sidecar_mounts, } @@ -132,7 +134,7 @@ async def test_computational_sidecar_properly_start_stop( "ContainerSpec": { "Image": image_name, "Env": sidecar_envs - | {"DASK_SCHEDULER_URL": f"tls://{get_localhost_ip()}:8786"}, + | {"DASK_SCHEDULER_URL": f"tcp://{get_localhost_ip()}:8786"}, "Init": True, "Mounts": sidecar_mounts, } From 4d3b9d860c1b3c2e83737a2167c8141db7357df4 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 10:03:16 +0100 Subject: [PATCH 71/84] ruff --- .../src/osparc_gateway_server/backend/osparc.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/services/osparc-gateway-server/src/osparc_gateway_server/backend/osparc.py b/services/osparc-gateway-server/src/osparc_gateway_server/backend/osparc.py index 087180289ef7..3cf97bbfdaee 100644 --- a/services/osparc-gateway-server/src/osparc_gateway_server/backend/osparc.py +++ b/services/osparc-gateway-server/src/osparc_gateway_server/backend/osparc.py @@ -1,7 +1,8 @@ import asyncio import logging +from collections.abc import AsyncGenerator from importlib.metadata import version -from typing import Any, AsyncGenerator +from typing import Any import osparc_gateway_server from aiodocker import Docker @@ -77,7 +78,7 @@ async def do_setup(self) -> None: self.worker_start_timeout = self.settings.GATEWAY_WORKER_START_TIMEOUT self.docker_client = Docker() - print(WELCOME_MSG, flush=True) + print(WELCOME_MSG, flush=True) # noqa: T201 async def do_cleanup(self) -> None: assert isinstance(self.log, logging.Logger) # nosec @@ -106,7 +107,6 @@ async def do_start_cluster( for key, value in modifications.items(): scheduler_cmd = modify_cmd_argument(scheduler_cmd, key, value) # start the scheduler - # asyncio.create_task(_background_task(self, cluster)) async for dask_scheduler_start_result in start_service( docker_client=self.docker_client, settings=self.settings, @@ -135,7 +135,7 @@ async def do_stop_cluster(self, cluster: Cluster) -> None: async def do_check_clusters(self, clusters: list[Cluster]) -> list[bool]: assert isinstance(self.log, logging.Logger) # nosec self.log.debug("--> checking statuses of : %s", f"{clusters=}") - ok: list[bool] = await asyncio.gather( + ok: list[bool | BaseException] = await asyncio.gather( *[self._check_service_status(c) for c in clusters], return_exceptions=True ) self.log.debug("<-- clusters status returned: %s", f"{ok=}") @@ -160,9 +160,8 @@ async def do_start_worker( # this should not happen since calling do_start_worker is done # from the on_cluster_heartbeat that checks if we already reached max worker # What may happen is that a docker node was removed in between and that is an error we can report. - raise PublicException( - "Unexpected error while creating a new worker, there is no available host! Was a docker node removed?" - ) from exc + msg = "Unexpected error while creating a new worker, there is no available host! Was a docker node removed?" + raise PublicException(msg) from exc assert node_hostname is not None # nosec worker_env = self.get_worker_env(worker.cluster) dask_scheduler_url = f"tls://cluster_{worker.cluster.id}_scheduler:{OSPARC_SCHEDULER_API_PORT}" # worker.cluster.scheduler_address @@ -307,7 +306,7 @@ async def on_cluster_heartbeat(self, cluster_name, msg) -> None: for worker in cluster.workers.values(): if worker.status >= JobStatus.STOPPED: continue - elif worker.name in closing_workers: + if worker.name in closing_workers: if worker.status < JobStatus.RUNNING: newly_running.append(worker) close_expected.append(worker) @@ -337,7 +336,7 @@ async def on_cluster_heartbeat(self, cluster_name, msg) -> None: self.queue.put(cluster) self.db.update_workers(target_updates) - for w, u in target_updates: + for w, _u in target_updates: self.queue.put(w) if newly_running: From 96779ab8602602d8c8bbfd906f975f0ba4e43809 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 10:28:50 +0100 Subject: [PATCH 72/84] use local ip instead of fixed 127.0.0.1 --- .../src/pytest_simcore/simcore_dask_service.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py b/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py index 444b2ea297d6..6f8fe7935abe 100644 --- a/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py +++ b/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py @@ -2,11 +2,12 @@ # pylint:disable=unused-argument # pylint:disable=redefined-outer-name -from typing import Iterator +from collections.abc import Iterator import pytest from distributed import Client from pydantic import AnyUrl +from pytest_simcore.helpers.utils_host import get_localhost_ip from .helpers.utils_docker import get_service_published_port @@ -20,7 +21,9 @@ async def dask_scheduler_service(simcore_services_ready, monkeypatch) -> str: ) # override the port monkeypatch.setenv("DASK_SCHEDULER_PORT", f"{dask_scheduler_api_port}") - return AnyUrl.build(scheme="tls", host="127.0.0.1", port=dask_scheduler_api_port) + return AnyUrl.build( + scheme="tls", host=get_localhost_ip(), port=dask_scheduler_api_port + ) @pytest.fixture From 4a7f5cd8580690de09994bff2b08bbe4a2f3dfcb Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 11:27:13 +0100 Subject: [PATCH 73/84] ensure migration has access --- packages/postgres-database/docker/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/packages/postgres-database/docker/Dockerfile b/packages/postgres-database/docker/Dockerfile index db1634bfdf52..bd5e0ae4153e 100644 --- a/packages/postgres-database/docker/Dockerfile +++ b/packages/postgres-database/docker/Dockerfile @@ -43,7 +43,8 @@ FROM base as production ENV PYTHONOPTIMIZE=TRUE WORKDIR /home/scu - +# ensure home folder is read/writable for user scu +RUN chown -R scu /home/scu # bring installed package without build tools COPY --from=build ${VIRTUAL_ENV} ${VIRTUAL_ENV} COPY entrypoint.bash /home/entrypoint.bash From 8e93c14d6375d9f8b7e557f96312e9f0205f1446 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 11:30:04 +0100 Subject: [PATCH 74/84] show log for TLS --- services/dask-sidecar/docker/boot.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/services/dask-sidecar/docker/boot.sh b/services/dask-sidecar/docker/boot.sh index beb92daac58e..16317a241253 100755 --- a/services/dask-sidecar/docker/boot.sh +++ b/services/dask-sidecar/docker/boot.sh @@ -45,6 +45,7 @@ EOF # Check if DASK_TLS_CA_FILE is present if [ -n "${DASK_TLS_CA_FILE:-}" ]; then + print_info "TLS authentication enabled cat >>/home/scu/.config/dask/distributed.yaml < Date: Mon, 29 Jan 2024 12:21:59 +0100 Subject: [PATCH 75/84] add scheduler auth --- .../pytest_simcore/simcore_dask_service.py | 75 ++++++++++++++++++- 1 file changed, 71 insertions(+), 4 deletions(-) diff --git a/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py b/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py index 6f8fe7935abe..2fc5220104ae 100644 --- a/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py +++ b/packages/pytest-simcore/src/pytest_simcore/simcore_dask_service.py @@ -3,17 +3,23 @@ # pylint:disable=redefined-outer-name from collections.abc import Iterator +from dataclasses import dataclass +from pathlib import Path +import distributed import pytest from distributed import Client +from models_library.clusters import InternalClusterAuthentication, TLSAuthentication from pydantic import AnyUrl -from pytest_simcore.helpers.utils_host import get_localhost_ip from .helpers.utils_docker import get_service_published_port +from .helpers.utils_host import get_localhost_ip @pytest.fixture -async def dask_scheduler_service(simcore_services_ready, monkeypatch) -> str: +async def dask_scheduler_service( + simcore_services_ready: None, monkeypatch: pytest.MonkeyPatch +) -> str: # the dask scheduler has a UI for the dashboard and a secondary port for the API # simcore_services fixture already ensure the dask-scheduler is up and running dask_scheduler_api_port = get_service_published_port( @@ -27,8 +33,69 @@ async def dask_scheduler_service(simcore_services_ready, monkeypatch) -> str: @pytest.fixture -def dask_client(dask_scheduler_service: str) -> Iterator[Client]: - client = Client(dask_scheduler_service) +def dask_sidecar_dir(osparc_simcore_services_dir: Path) -> Path: + path = osparc_simcore_services_dir / "dask-sidecar" + assert path.exists() + return path + + +@pytest.fixture +def dask_backend_tls_certificates_dir(dask_sidecar_dir: Path) -> Path: + path = dask_sidecar_dir / ".dask-certificates" + assert path.exists() + return path + + +@dataclass(frozen=True, slots=True, kw_only=True) +class _TLSCertificates: + tls_ca_file: Path + tls_cert_file: Path + tls_key_file: Path + + +@pytest.fixture +def dask_backend_tls_certificates( + dask_backend_tls_certificates_dir, +) -> _TLSCertificates: + certs = _TLSCertificates( + tls_ca_file=dask_backend_tls_certificates_dir / "dask-cert.pem", + tls_cert_file=dask_backend_tls_certificates_dir / "dask-cert.pem", + tls_key_file=dask_backend_tls_certificates_dir / "dask-key.pem", + ) + assert certs.tls_ca_file.exists() + assert certs.tls_cert_file.exists() + assert certs.tls_key_file.exists() + return certs + + +@pytest.fixture +def dask_scheduler_auth( + dask_backend_tls_certificates: _TLSCertificates, +) -> InternalClusterAuthentication: + return TLSAuthentication( + tls_ca_file=dask_backend_tls_certificates.tls_ca_file, + tls_client_cert=dask_backend_tls_certificates.tls_cert_file, + tls_client_key=dask_backend_tls_certificates.tls_key_file, + ) + + +@pytest.fixture +def dask_client_security( + dask_backend_tls_certificates: _TLSCertificates, +) -> distributed.Security: + return distributed.Security( + tls_ca_file=f"{dask_backend_tls_certificates.tls_ca_file}", + tls_client_cert=f"{dask_backend_tls_certificates.tls_cert_file}", + tls_client_key=f"{dask_backend_tls_certificates.tls_key_file}", + require_encryption=True, + ) + + +@pytest.fixture +def dask_client( + dask_scheduler_service: str, dask_client_security: distributed.Security +) -> Iterator[Client]: + client = Client(dask_scheduler_service, security=dask_client_security) yield client client.close() From 7eb1747ea391c9b8dbf32f1ada6fe202724fc0ec Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 12:22:11 +0100 Subject: [PATCH 76/84] added auth --- .../director-v2/tests/integration/01/test_computation_api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/director-v2/tests/integration/01/test_computation_api.py b/services/director-v2/tests/integration/01/test_computation_api.py index 8889e441906e..a41db9c16996 100644 --- a/services/director-v2/tests/integration/01/test_computation_api.py +++ b/services/director-v2/tests/integration/01/test_computation_api.py @@ -21,7 +21,7 @@ assert_computation_task_out_obj, ) from models_library.api_schemas_directorv2.comp_tasks import ComputationGet -from models_library.clusters import DEFAULT_CLUSTER_ID +from models_library.clusters import DEFAULT_CLUSTER_ID, InternalClusterAuthentication from models_library.projects import ProjectAtDB from models_library.projects_nodes import NodeState from models_library.projects_nodes_io import NodeID @@ -56,6 +56,7 @@ def mock_env( monkeypatch: pytest.MonkeyPatch, dynamic_sidecar_docker_image_name: str, dask_scheduler_service: str, + dask_scheduler_auth: InternalClusterAuthentication, ) -> None: # used by the client fixture setenvs_from_dict( @@ -64,6 +65,7 @@ def mock_env( "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED": "1", "COMPUTATIONAL_BACKEND_ENABLED": "1", "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL": dask_scheduler_service, + "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH": dask_scheduler_auth.json(), "DYNAMIC_SIDECAR_IMAGE": dynamic_sidecar_docker_image_name, "SIMCORE_SERVICES_NETWORK_NAME": "test_swarm_network_name", "SWARM_STACK_NAME": "test_mocked_stack_name", From 59c0b3a7ea504c938932b12dc15d9400f3f5e5a0 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 13:14:42 +0100 Subject: [PATCH 77/84] fixed mypy --- .../src/osparc_gateway_server/backend/osparc.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/services/osparc-gateway-server/src/osparc_gateway_server/backend/osparc.py b/services/osparc-gateway-server/src/osparc_gateway_server/backend/osparc.py index 3cf97bbfdaee..d87b703fcf64 100644 --- a/services/osparc-gateway-server/src/osparc_gateway_server/backend/osparc.py +++ b/services/osparc-gateway-server/src/osparc_gateway_server/backend/osparc.py @@ -135,11 +135,11 @@ async def do_stop_cluster(self, cluster: Cluster) -> None: async def do_check_clusters(self, clusters: list[Cluster]) -> list[bool]: assert isinstance(self.log, logging.Logger) # nosec self.log.debug("--> checking statuses of : %s", f"{clusters=}") - ok: list[bool | BaseException] = await asyncio.gather( + oks: list[bool | BaseException] = await asyncio.gather( *[self._check_service_status(c) for c in clusters], return_exceptions=True ) - self.log.debug("<-- clusters status returned: %s", f"{ok=}") - return ok + self.log.debug("<-- clusters status returned: %s", f"{oks=}") + return [ok if isinstance(ok, bool) else False for ok in oks] async def do_start_worker( self, worker: Worker From ae41532611777705c88c9e887b76d86eb9adc063 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 13:28:54 +0100 Subject: [PATCH 78/84] missing quote --- services/dask-sidecar/docker/boot.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/dask-sidecar/docker/boot.sh b/services/dask-sidecar/docker/boot.sh index 16317a241253..5972cb510d3e 100755 --- a/services/dask-sidecar/docker/boot.sh +++ b/services/dask-sidecar/docker/boot.sh @@ -45,7 +45,7 @@ EOF # Check if DASK_TLS_CA_FILE is present if [ -n "${DASK_TLS_CA_FILE:-}" ]; then - print_info "TLS authentication enabled + print_info "TLS authentication enabled" cat >>/home/scu/.config/dask/distributed.yaml < Date: Mon, 29 Jan 2024 13:38:16 +0100 Subject: [PATCH 79/84] missing ENV --- .../02/test_dynamic_sidecar_nodeports_integration.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py b/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py index 169152172b45..8687ebc35312 100644 --- a/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py +++ b/services/director-v2/tests/integration/02/test_dynamic_sidecar_nodeports_integration.py @@ -27,7 +27,7 @@ assert_computation_task_out_obj, ) from models_library.api_schemas_directorv2.comp_tasks import ComputationGet -from models_library.clusters import DEFAULT_CLUSTER_ID +from models_library.clusters import DEFAULT_CLUSTER_ID, InternalClusterAuthentication from models_library.projects import ( Node, NodesDict, @@ -331,6 +331,7 @@ def mock_env( network_name: str, dev_feature_r_clone_enabled: str, dask_scheduler_service: str, + dask_scheduler_auth: InternalClusterAuthentication, minimal_configuration: None, ) -> None: # Works as below line in docker.compose.yml @@ -368,6 +369,7 @@ def mock_env( "COMPUTATIONAL_BACKEND_ENABLED": "true", "COMPUTATIONAL_BACKEND_DASK_CLIENT_ENABLED": "true", "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL": dask_scheduler_service, + "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH": dask_scheduler_auth.json(), }, ) monkeypatch.delenv("DYNAMIC_SIDECAR_MOUNT_PATH_DEV", raising=False) From 917521e03ab3f3fee01aadeb04a06f3b7458386a Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:15:39 +0100 Subject: [PATCH 80/84] missing envs --- services/docker-compose.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 72d823d6b24d..85105db131dd 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -174,6 +174,9 @@ services: - PRIMARY_EC2_INSTANCES_SUBNET_ID=${PRIMARY_EC2_INSTANCES_SUBNET_ID} - PRIMARY_EC2_INSTANCES_CUSTOM_TAGS=${PRIMARY_EC2_INSTANCES_CUSTOM_TAGS} - PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE=${PRIMARY_EC2_INSTANCES_ATTACHED_IAM_PROFILE} + - PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CA=${PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CA} + - PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CERT=${PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_CERT} + - PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_KEY=${PRIMARY_EC2_INSTANCES_SSM_TLS_DASK_KEY} - RABBIT_HOST=${RABBIT_HOST} - RABBIT_PASSWORD=${RABBIT_PASSWORD} - RABBIT_PORT=${RABBIT_PORT} From 43ff295e22e039afe92ca323352281710b50d9bc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:52:31 +0100 Subject: [PATCH 81/84] use defaults --- services/docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/docker-compose.yml b/services/docker-compose.yml index 85105db131dd..f09268eca866 100644 --- a/services/docker-compose.yml +++ b/services/docker-compose.yml @@ -154,7 +154,7 @@ services: - default environment: - CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG=${CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG} - - CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH=${CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH} + - CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH=${CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH:-{"type":"tls","tls_ca_file":"${DASK_TLS_CERT}","tls_client_cert":"${DASK_TLS_CERT}","tls_client_key":"${DASK_TLS_KEY}"}} - CLUSTERS_KEEPER_DASK_NTHREADS=${CLUSTERS_KEEPER_DASK_NTHREADS} - CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION=${CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION} - CLUSTERS_KEEPER_TASK_INTERVAL=${CLUSTERS_KEEPER_TASK_INTERVAL} @@ -255,7 +255,7 @@ services: - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE=${COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE} - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=${COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL} - - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH=${COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH} + - COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH=${COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH:-{"type":"tls","tls_ca_file":"${DASK_TLS_CERT}","tls_client_cert":"${DASK_TLS_CERT}","tls_client_key":"${DASK_TLS_KEY}"}} - COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE=${COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE} - COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE=${COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE} From 99d534e83ad3fc49ff80d0629b206664d2c26f59 Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 15:52:39 +0100 Subject: [PATCH 82/84] simplify --- .env-devel | 2 -- 1 file changed, 2 deletions(-) diff --git a/.env-devel b/.env-devel index 058921b9daab..5a2b72675bc1 100644 --- a/.env-devel +++ b/.env-devel @@ -33,7 +33,6 @@ CATALOG_SERVICES_DEFAULT_RESOURCES='{"CPU": {"limit": 0.1, "reservation": 0.1}, CATALOG_SERVICES_DEFAULT_SPECIFICATIONS='{}' CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DOCKER_IMAGE_TAG=master-github-latest -CLUSTERS_KEEPER_COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH='{"type":"tls","tls_ca_file":"/home/scu/.dask/dask-crt.pem","tls_client_cert":"/home/scu/.dask/dask-crt.pem","tls_client_key":"/home/scu/.dask/dask-key.pem"}' CLUSTERS_KEEPER_DASK_NTHREADS=0 CLUSTERS_KEEPER_EC2_ACCESS=null CLUSTERS_KEEPER_MAX_MISSED_HEARTBEATS_BEFORE_CLUSTER_TERMINATION=5 @@ -54,7 +53,6 @@ DIRECTOR_REGISTRY_CACHING=True DIRECTOR_GENERIC_RESOURCE_PLACEMENT_CONSTRAINTS_SUBSTITUTIONS='{}' COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_URL=tls://dask-scheduler:8786 -COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH='{"type":"tls","tls_ca_file":"/home/scu/.dask/dask-crt.pem","tls_client_cert":"/home/scu/.dask/dask-crt.pem","tls_client_key":"/home/scu/.dask/dask-key.pem"}' COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_FILE_LINK_TYPE=S3 COMPUTATIONAL_BACKEND_DEFAULT_FILE_LINK_TYPE=PRESIGNED COMPUTATIONAL_BACKEND_ON_DEMAND_CLUSTERS_FILE_LINK_TYPE=PRESIGNED From 445cdda6d6d377e2459e430087e3143df30087dc Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 16:44:30 +0100 Subject: [PATCH 83/84] fix settings --- services/director-v2/tests/unit/test_core_settings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/director-v2/tests/unit/test_core_settings.py b/services/director-v2/tests/unit/test_core_settings.py index c36d37e57923..46a7561a9d09 100644 --- a/services/director-v2/tests/unit/test_core_settings.py +++ b/services/director-v2/tests/unit/test_core_settings.py @@ -81,6 +81,9 @@ def test_settings_with_repository_env_devel( mock_env_devel_environment: dict[str, str], monkeypatch: pytest.MonkeyPatch ): monkeypatch.setenv("SC_BOOT_MODE", "production") # defined in Dockerfile + monkeypatch.setenv( + "COMPUTATIONAL_BACKEND_DEFAULT_CLUSTER_AUTH", "{}" + ) # defined in docker-compose settings = AppSettings.create_from_envs() print("captured settings: \n", settings.json(indent=2)) From c864a1f9ac1b9c591c2111ac6dc005c5b4544aaa Mon Sep 17 00:00:00 2001 From: sanderegg <35365065+sanderegg@users.noreply.github.com> Date: Mon, 29 Jan 2024 17:42:57 +0100 Subject: [PATCH 84/84] @pcrespov review: mistake --- test.sh | 4 ---- 1 file changed, 4 deletions(-) delete mode 100755 test.sh diff --git a/test.sh b/test.sh deleted file mode 100755 index 58f66d893225..000000000000 --- a/test.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/sh -num_cpus=$(nproc) -DASK_NTHREADS=${DASK_NTHREADS:="$num_cpus"} -echo ${DASK_NTHREADS}