diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml
index 7f8431677d92d..100c8d64fe423 100644
--- a/.azure/app-cloud-e2e.yml
+++ b/.azure/app-cloud-e2e.yml
@@ -67,57 +67,42 @@ jobs:
         'App: v0_app':
           name: "v0_app"
           dir: "public"
-          queue_type: "redis"
         'App: boring_app':
           name: "boring_app"
           dir: "public"
-          queue_type: "redis"
-        'App: boring_app / HTTP':
-          name: "boring_app"
-          dir: "public"
-          queue_type: "http"
-        'App: template_streamlit_ui':
-          name: "template_streamlit_ui"
-          dir: "public"
-          queue_type: "redis"
+        # TODO: RESOLVE ME ASAP
+        # 'App: template_streamlit_ui':
+        #   name: "template_streamlit_ui"
+        #   dir: "public"
         'App: template_react_ui':
           name: "template_react_ui"
           dir: "public"
-          queue_type: "redis"
         # 'App: template_jupyterlab':  # TODO: clarify where these files lives
         #   name: "template_jupyterlab"
         'App: installation_commands_app':
           name: "installation_commands_app"
           dir: "public"
-          queue_type: "redis"
         'App: drive':
           name: "drive"
           dir: "public"
-          queue_type: "redis"
         'App: payload':
           name: "payload"
           dir: "public"
-          queue_type: "redis"
         'App: commands_and_api':
           name: "commands_and_api"
           dir: "public"
-          queue_type: "redis"
         #'App: quick_start':  # todo: consider adding back when fixed
         #  name: "quick_start"
         #  dir: "public"
-        #  queue_type: "redis"
         'App: idle_timeout':
           name: "idle_timeout"
           dir: "local"
-          queue_type: "redis"
         'App: collect_failures':
           name: "collect_failures"
           dir: "local"
-          queue_type: "redis"
         'App: custom_work_dependencies':
           name: "custom_work_dependencies"
           dir: "local"
-          queue_type: "redis"
     timeoutInMinutes: "15"
     cancelTimeoutInMinutes: "1"
     # values: https://docs.microsoft.com/en-us/azure/devops/pipelines/process/phases?view=azure-devops&tabs=yaml#workspace
@@ -135,7 +120,6 @@ jobs:
       HAR_LOCATION: './artifacts/hars'
       SLOW_MO: '50'
       LIGHTNING_DEBUG: '1'
-      LIGHTNING_CLOUD_QUEUE_TYPE: $(queue_type)
 
     steps:
     - script: echo '##vso[task.setvariable variable=local_id]$(System.PullRequest.PullRequestNumber)'
diff --git a/examples/app/commands_and_api/.lightningignore b/examples/app/commands_and_api/.lightningignore
new file mode 100644
index 0000000000000..f7275bbbd035b
--- /dev/null
+++ b/examples/app/commands_and_api/.lightningignore
@@ -0,0 +1 @@
+venv/
diff --git a/src/lightning/app/core/constants.py b/src/lightning/app/core/constants.py
index fbd36ea38e771..19bc25ef0d63d 100644
--- a/src/lightning/app/core/constants.py
+++ b/src/lightning/app/core/constants.py
@@ -20,6 +20,9 @@
 
 
 def get_lightning_cloud_url() -> str:
+    # detect local development
+    if os.getenv("VSCODE_PROXY_URI", "").startswith("http://localhost:9800"):
+        return "http://localhost:9800"
     # DO NOT CHANGE!
     return os.getenv("LIGHTNING_CLOUD_URL", "https://lightning.ai")
@@ -115,17 +118,5 @@ def enable_interruptible_works() -> bool:
     return bool(int(os.getenv("LIGHTNING_INTERRUPTIBLE_WORKS", "0")))
 
 
-# Get Cluster Driver
-_CLUSTER_DRIVERS = [None, "k8s", "direct"]
-
-
 def get_cluster_driver() -> Optional[str]:
-    value = os.getenv("LIGHTNING_CLUSTER_DRIVER", None)
-    if value is None:
-        if enable_interruptible_works():
-            value = "direct"
-        else:
-            value = None
-    if value not in _CLUSTER_DRIVERS:
-        raise ValueError(f"Found {value} cluster driver. The value needs to be in {_CLUSTER_DRIVERS}.")
-    return value
+    return "direct"
diff --git a/src/lightning/app/core/queues.py b/src/lightning/app/core/queues.py
index 607fd302fc22c..239a7ca1969af 100644
--- a/src/lightning/app/core/queues.py
+++ b/src/lightning/app/core/queues.py
@@ -388,19 +388,30 @@ def get(self, timeout: Optional[float] = None) -> Any:
         if timeout is None:
             while True:
                 try:
-                    return self._get()
+                    try:
+                        return self._get()
+                    except requests.exceptions.HTTPError:
+                        pass
                 except queue.Empty:
                     time.sleep(HTTP_QUEUE_REFRESH_INTERVAL)
 
         # make one request and return the result
         if timeout == 0:
-            return self._get()
+            try:
+                return self._get()
+            except requests.exceptions.HTTPError:
+                return None
 
         # timeout is some value - loop until the timeout is reached
         start_time = time.time()
         while (time.time() - start_time) < timeout:
             try:
-                return self._get()
+                try:
+                    return self._get()
+                except requests.exceptions.HTTPError:
+                    if timeout > self.default_timeout:
+                        return None
+                    raise queue.Empty
             except queue.Empty:
                 # Note: In theory, there isn't a need for a sleep as the queue shouldn't
                 # block the flow if the queue is empty.
@@ -441,8 +452,11 @@ def length(self) -> int:
         if not self.app_id:
             raise ValueError(f"App ID couldn't be extracted from the queue name: {self.name}")
 
-        val = self.client.get(f"/v1/{self.app_id}/{self._name_suffix}/length")
-        return int(val.text)
+        try:
+            val = self.client.get(f"/v1/{self.app_id}/{self._name_suffix}/length")
+            return int(val.text)
+        except requests.exceptions.HTTPError:
+            return 0
 
     @staticmethod
     def _split_app_id_and_queue_name(queue_name: str) -> Tuple[str, str]:
diff --git a/src/lightning/app/runners/cloud.py b/src/lightning/app/runners/cloud.py
index 2e5918001f727..12ba2f9f0a7c0 100644
--- a/src/lightning/app/runners/cloud.py
+++ b/src/lightning/app/runners/cloud.py
@@ -78,7 +78,6 @@
     ENABLE_PULLING_STATE_ENDPOINT,
     ENABLE_PUSHING_STATE_ENDPOINT,
     get_cloud_queue_type,
-    get_cluster_driver,
     get_lightning_cloud_url,
     LIGHTNING_CLOUD_PRINT_SPECS,
     SYS_CUSTOMIZATIONS_SYNC_ROOT,
@@ -874,12 +873,6 @@ def _get_env_vars(
     if not ENABLE_PUSHING_STATE_ENDPOINT:
         v1_env_vars.append(V1EnvVar(name="ENABLE_PUSHING_STATE_ENDPOINT", value="0"))
 
-    if get_cloud_queue_type():
-        v1_env_vars.append(V1EnvVar(name="LIGHTNING_CLOUD_QUEUE_TYPE", value=get_cloud_queue_type()))
-
-    if get_cluster_driver():
-        v1_env_vars.append(V1EnvVar(name="LIGHTNING_CLUSTER_DRIVER", value=get_cluster_driver()))
-
     if enable_interruptible_works():
         v1_env_vars.append(
             V1EnvVar(
diff --git a/src/lightning/app/storage/path.py b/src/lightning/app/storage/path.py
index 6f37abee6231f..0c12c648cd67a 100644
--- a/src/lightning/app/storage/path.py
+++ b/src/lightning/app/storage/path.py
@@ -427,16 +427,12 @@ def _filesystem() -> AbstractFileSystem:
     endpoint_url = os.getenv("LIGHTNING_BUCKET_ENDPOINT_URL", "")
     bucket_name = os.getenv("LIGHTNING_BUCKET_NAME", "")
     if endpoint_url != "" and bucket_name != "":
-        key = os.getenv("LIGHTNING_AWS_ACCESS_KEY_ID", "")
-        secret = os.getenv("LIGHTNING_AWS_SECRET_ACCESS_KEY", "")
-        # TODO: Remove when updated on the platform side.
-        if key == "" or secret == "":
-            key = os.getenv("AWS_ACCESS_KEY_ID", "")
-            secret = os.getenv("AWS_SECRET_ACCESS_KEY", "")
-        if key == "" or secret == "":
-            raise RuntimeError("missing S3 bucket credentials")
-
-        fs = S3FileSystem(key=key, secret=secret, use_ssl=False, client_kwargs={"endpoint_url": endpoint_url})
+        # FIXME: Temporary fix until we remove the injection from the platform
+        if "AWS_ACCESS_KEY_ID" in os.environ:
+            del os.environ["AWS_ACCESS_KEY_ID"]
+            del os.environ["AWS_SECRET_ACCESS_KEY"]
+
+        fs = S3FileSystem()
 
         app_id = os.getenv("LIGHTNING_CLOUD_APP_ID", "")
         if app_id == "":
diff --git a/src/lightning/app/testing/testing.py b/src/lightning/app/testing/testing.py
index 9a6a7908336fc..85c8109490ac2 100644
--- a/src/lightning/app/testing/testing.py
+++ b/src/lightning/app/testing/testing.py
@@ -377,6 +377,8 @@ def run_app_in_cloud(
             [constants.LIGHTNING_CLOUD_PROJECT_ID],
         )
 
+        admin_page.reload()
+
         view_page = context.new_page()
         i = 1
         while True:
@@ -385,10 +387,10 @@ def run_app_in_cloud(
 
             # wait until the app is running and openapi.json is ready
             if app.status.phase == V1LightningappInstanceState.RUNNING:
-                view_page.goto(f"{app.status.url}/view")
                 status_code = requests.get(f"{app.status.url}/openapi.json").status_code
                 if status_code == 200:
                     print("App is running, continuing with testing...")
+                    view_page.goto(f"{app.status.url}/view")
                     break
                 msg = f"Received status code {status_code} at {app.status.url!r}"
             elif app.status.phase not in (V1LightningappInstanceState.PENDING, V1LightningappInstanceState.NOT_STARTED):
@@ -478,6 +480,19 @@ def _delete_lightning_app(client, project_id, app_id, app_name):
         print(f"Failed to delete {app_name}. Exception {ex}")
 
 
+def _delete_cloud_space(client, project_id, cloud_space_id, app_name):
+    """Used to delete the parent cloudspace."""
+    print(f"Deleting {app_name} id: {cloud_space_id}")
+    try:
+        res = client.cloud_space_service_delete_cloud_space(
+            project_id=project_id,
+            id=cloud_space_id,
+        )
+        assert res == {}
+    except ApiException as ex:
+        print(f"Failed to delete {app_name}. Exception {ex}")
+
+
 def delete_cloud_lightning_apps():
     """Cleanup cloud apps that start with the name test-{PR_NUMBER}-{TEST_APP_NAME}.
 
@@ -502,6 +517,9 @@ def delete_cloud_lightning_apps():
         if pr_number and app_name and not lit_app.name.startswith(f"test-{pr_number}-{app_name}-"):
             continue
         _delete_lightning_app(client, project_id=project_id, app_id=lit_app.id, app_name=lit_app.name)
+        _delete_cloud_space(
+            client, project_id=project_id, cloud_space_id=lit_app.spec.cloud_space_id, app_name=lit_app.name
+        )
 
     print("deleting apps that were created more than 1 hour ago.")
 
@@ -509,3 +527,6 @@
         if lit_app.created_at < datetime.datetime.now(lit_app.created_at.tzinfo) - datetime.timedelta(hours=1):
             _delete_lightning_app(client, project_id=project_id, app_id=lit_app.id, app_name=lit_app.name)
+            _delete_cloud_space(
+                client, project_id=project_id, cloud_space_id=lit_app.spec.cloud_space_id, app_name=lit_app.name
+            )
diff --git a/src/lightning/app/utilities/app_logs.py b/src/lightning/app/utilities/app_logs.py
index c03ca162fb8e1..bc21c3d456b47 100644
--- a/src/lightning/app/utilities/app_logs.py
+++ b/src/lightning/app/utilities/app_logs.py
@@ -27,13 +27,16 @@
 
 @dataclass
 class _LogEventLabels:
-    app: str
-    container: str
-    filename: str
-    job: str
-    namespace: str
-    node_name: str
-    pod: str
+    app: Optional[str] = None
+    container: Optional[str] = None
+    filename: Optional[str] = None
+    job: Optional[str] = None
+    namespace: Optional[str] = None
+    node_name: Optional[str] = None
+    pod: Optional[str] = None
+    clusterID: Optional[str] = None
+    component: Optional[str] = None
+    projectID: Optional[str] = None
     stream: Optional[str] = None
 
 
diff --git a/src/lightning/app/utilities/packaging/cloud_compute.py b/src/lightning/app/utilities/packaging/cloud_compute.py
index 2474e0b3a7f0c..75c7cb93d6e00 100644
--- a/src/lightning/app/utilities/packaging/cloud_compute.py
+++ b/src/lightning/app/utilities/packaging/cloud_compute.py
@@ -111,6 +111,11 @@ def __post_init__(self) -> None:
             if "gpu" not in self.name:
                 raise ValueError("CloudCompute `interruptible=True` is supported only with GPU.")
 
+        # FIXME: Clean the mess on the platform side
+        if self.name == "default" or self.name == "cpu":
+            self.name = "cpu-small"
+            self._internal_id = "default"
+
         # TODO: Remove from the platform first.
         self.preemptible = self.interruptible
@@ -147,7 +152,7 @@ def id(self) -> Optional[str]:
         return self._internal_id
 
     def is_default(self) -> bool:
-        return self.name == "default"
+        return self.name in ("default", "cpu-small")
 
     def _generate_id(self):
         return "default" if self.name == "default" else uuid4().hex[:7]
diff --git a/tests/integrations_app/public/test_v0_app.py b/tests/integrations_app/public/test_v0_app.py
index 3e3fe0825109b..634fd10fd93b6 100644
--- a/tests/integrations_app/public/test_v0_app.py
+++ b/tests/integrations_app/public/test_v0_app.py
@@ -53,6 +53,7 @@ def check_content(button_name, text_content):
         has_logs = False
         while not has_logs:
             for log in fetch_logs(["flow"]):
+                print(log)
                 if "'a': 'a', 'b': 'b'" in log:
                     has_logs = True
             sleep(1)
diff --git a/tests/tests_app/core/test_constants.py b/tests/tests_app/core/test_constants.py
index 4f4965cc388a2..489334a06e87e 100644
--- a/tests/tests_app/core/test_constants.py
+++ b/tests/tests_app/core/test_constants.py
@@ -1,25 +1,9 @@
 import os
 from unittest import mock
 
-import pytest
-
-from lightning.app.core.constants import get_cluster_driver, get_lightning_cloud_url
+from lightning.app.core.constants import get_lightning_cloud_url
 
 
 @mock.patch.dict(os.environ, {"LIGHTNING_CLOUD_URL": "https://beta.lightning.ai"})
 def test_defaults():
     assert get_lightning_cloud_url() == "https://beta.lightning.ai"
-
-
-def test_cluster_drive(monkeypatch):
-    assert get_cluster_driver() is None
-
-    monkeypatch.setenv("LIGHTNING_INTERRUPTIBLE_WORKS", "1")
-    assert get_cluster_driver() == "direct"
-
-    monkeypatch.setenv("LIGHTNING_CLUSTER_DRIVER", "k8s")
-    assert get_cluster_driver() == "k8s"
-
-    with pytest.raises(ValueError, match="The value needs to be in"):
-        monkeypatch.setenv("LIGHTNING_CLUSTER_DRIVER", "something_else")
-        assert get_cluster_driver() == "k8s"
diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py
index e759d3affc7c5..205a532119df1 100644
--- a/tests/tests_app/core/test_lightning_app.py
+++ b/tests/tests_app/core/test_lightning_app.py
@@ -987,8 +987,8 @@ def run(self):
 def test_state_size_constant_growth():
     app = LightningApp(SizeFlow())
     MultiProcessRuntime(app, start_server=False).dispatch()
-    assert app.root._state_sizes[0] <= 7952
-    assert app.root._state_sizes[20] <= 26500
+    assert app.root._state_sizes[0] <= 7965
+    assert app.root._state_sizes[20] <= 26550
 
 
 class FlowUpdated(LightningFlow):
@@ -1108,7 +1108,6 @@ def __init__(self, flow):
 
 
 def test_cloud_compute_binding():
-    cloud_compute.ENABLE_MULTIPLE_WORKS_IN_NON_DEFAULT_CONTAINER = True
     assert cloud_compute._CLOUD_COMPUTE_STORE == {}
diff --git a/tests/tests_app/core/test_lightning_flow.py b/tests/tests_app/core/test_lightning_flow.py
index 24e803a6b32d5..8b05bc03f3724 100644
--- a/tests/tests_app/core/test_lightning_flow.py
+++ b/tests/tests_app/core/test_lightning_flow.py
@@ -333,7 +333,7 @@ def run(self):
                 "_display_name": "",
                 "_cloud_compute": {
                     "type": "__cloud_compute__",
-                    "name": "default",
+                    "name": "cpu-small",
                     "disk_size": 0,
                     "idle_timeout": None,
                     "mounts": None,
@@ -358,7 +358,7 @@ def run(self):
                 "_display_name": "",
                 "_cloud_compute": {
                     "type": "__cloud_compute__",
-                    "name": "default",
+                    "name": "cpu-small",
                     "disk_size": 0,
                     "idle_timeout": None,
                     "mounts": None,
@@ -399,7 +399,7 @@ def run(self):
                 "_display_name": "",
                 "_cloud_compute": {
                     "type": "__cloud_compute__",
-                    "name": "default",
+                    "name": "cpu-small",
                     "disk_size": 0,
                     "idle_timeout": None,
                     "mounts": None,
@@ -424,7 +424,7 @@ def run(self):
                 "_display_name": "",
                 "_cloud_compute": {
                     "type": "__cloud_compute__",
-                    "name": "default",
+                    "name": "cpu-small",
                     "disk_size": 0,
                     "idle_timeout": None,
                     "mounts": None,
diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py
index f7de88baac91d..221440e34244a 100644
--- a/tests/tests_app/runners/test_cloud.py
+++ b/tests/tests_app/runners/test_cloud.py
@@ -1822,7 +1822,7 @@ def test_load_app_from_file():
     app = CloudRuntime.load_app_from_file(
         os.path.join(test_script_dir, "app_with_env.py"),
     )
-    assert app.works[0].cloud_compute.name == "default"
+    assert app.works[0].cloud_compute.name == "cpu-small"
 
     app = CloudRuntime.load_app_from_file(
         os.path.join(test_script_dir, "app_with_env.py"),
@@ -1850,7 +1850,7 @@
                 "userRequestedComputeConfig": {
                     "count": 1,
                     "diskSize": 0,
-                    "name": "default",
+                    "name": "cpu-small",
                     "preemptible": "*",
                     "shmSize": 0,
                 },
@@ -1874,7 +1874,7 @@
                 "user_requested_compute_config": {
                     "count": 1,
                     "disk_size": 0,
-                    "name": "default",
+                    "name": "cpu-small",
                     "preemptible": "*",
                     "shm_size": 0,
                 },
@@ -1933,6 +1933,8 @@ class Work(LightningWork):
     def __init__(self):
         super().__init__()
         self.cloud_compute = CloudCompute(name="default")
+        # TODO: Remove me
+        self.cloud_compute.name = "default"
        self.cloud_build_config = BuildConfig(image="custom")
 
     def run(self):
diff --git a/tests/tests_app/storage/test_path.py b/tests/tests_app/storage/test_path.py
index 07f3f282e2207..78e3c495d0b90 100644
--- a/tests/tests_app/storage/test_path.py
+++ b/tests/tests_app/storage/test_path.py
@@ -693,8 +693,6 @@ def test_artifacts_path():
 @pytest.mark.skipif(not _is_s3fs_available(), reason="This test requires s3fs.")
 @mock.patch.dict(os.environ, {"LIGHTNING_BUCKET_ENDPOINT_URL": "a"})
 @mock.patch.dict(os.environ, {"LIGHTNING_BUCKET_NAME": "b"})
-@mock.patch.dict(os.environ, {"LIGHTNING_AWS_ACCESS_KEY_ID": "c"})
-@mock.patch.dict(os.environ, {"LIGHTNING_AWS_SECRET_ACCESS_KEY": "d"})
 @mock.patch.dict(os.environ, {"LIGHTNING_CLOUD_APP_ID": "e"})
 def test_filesystem(monkeypatch):
     from lightning.app.storage import path
@@ -702,10 +700,7 @@ def test_filesystem(monkeypatch):
     mock = MagicMock()
     monkeypatch.setattr(path, "S3FileSystem", mock)
     fs = _filesystem()
-    assert fs._mock_new_parent._mock_mock_calls[0].kwargs["key"] == "c"
-    assert fs._mock_new_parent._mock_mock_calls[0].kwargs["secret"] == "d"
-    assert not fs._mock_new_parent._mock_mock_calls[0].kwargs["use_ssl"]
-    assert fs._mock_new_parent._mock_mock_calls[0].kwargs["client_kwargs"] == {"endpoint_url": "a"}
+    assert fs == mock()
 
 
 class TestSharedStoragePath(TestCase):
diff --git a/tests/tests_app/structures/test_structures.py b/tests/tests_app/structures/test_structures.py
index 3ee72ea4c7872..f124b7dd5f611 100644
--- a/tests/tests_app/structures/test_structures.py
+++ b/tests/tests_app/structures/test_structures.py
@@ -48,7 +48,7 @@ def run(self):
             "_internal_ip": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
-                "name": "default",
+                "name": "cpu-small",
                 "disk_size": 0,
                 "idle_timeout": None,
                 "mounts": None,
@@ -82,7 +82,7 @@ def run(self):
             "_internal_ip": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
-                "name": "default",
+                "name": "cpu-small",
                 "disk_size": 0,
                 "idle_timeout": None,
                 "mounts": None,
@@ -116,7 +116,7 @@ def run(self):
             "_internal_ip": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
-                "name": "default",
+                "name": "cpu-small",
                 "disk_size": 0,
                 "idle_timeout": None,
                 "mounts": None,
@@ -202,7 +202,7 @@ def run(self):
             "_display_name": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
-                "name": "default",
+                "name": "cpu-small",
                 "disk_size": 0,
                 "idle_timeout": None,
                 "mounts": None,
@@ -236,7 +236,7 @@ def run(self):
             "_display_name": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
-                "name": "default",
+                "name": "cpu-small",
                 "disk_size": 0,
                 "idle_timeout": None,
                 "mounts": None,
@@ -265,7 +265,7 @@ def run(self):
             "_display_name": "",
             "_cloud_compute": {
                 "type": "__cloud_compute__",
-                "name": "default",
+                "name": "cpu-small",
                 "disk_size": 0,
                 "idle_timeout": None,
                 "mounts": None,
diff --git a/tests/tests_app/utilities/packaging/test_cloud_compute.py b/tests/tests_app/utilities/packaging/test_cloud_compute.py
index 7d49a967f8013..d954b2507339f 100644
--- a/tests/tests_app/utilities/packaging/test_cloud_compute.py
+++ b/tests/tests_app/utilities/packaging/test_cloud_compute.py
@@ -5,7 +5,7 @@
 
 
 def test_cloud_compute_names():
-    assert CloudCompute().name == "default"
+    assert CloudCompute().name == "cpu-small"
     assert CloudCompute("cpu-small").name == "cpu-small"
     assert CloudCompute("coconut").name == "coconut"  # the backend is responsible for validation of names
diff --git a/tests/tests_app/utilities/test_load_app.py b/tests/tests_app/utilities/test_load_app.py
index 14da8a8acc1b9..24149a9a5ff88 100644
--- a/tests/tests_app/utilities/test_load_app.py
+++ b/tests/tests_app/utilities/test_load_app.py
@@ -52,7 +52,7 @@ def test_extract_metadata_from_component():
             "cloud_build_config": {"__build_config__": {"requirements": [], "dockerfile": None, "image": None}},
             "cloud_compute": {
                 "type": "__cloud_compute__",
-                "name": "default",
+                "name": "cpu-small",
                 "disk_size": 0,
                 "idle_timeout": None,
                 "shm_size": 0,
@@ -76,7 +76,7 @@
             "cloud_build_config": {"__build_config__": {"requirements": [], "dockerfile": None, "image": None}},
             "cloud_compute": {
                 "type": "__cloud_compute__",
-                "name": "default",
+                "name": "cpu-small",
                 "disk_size": 0,
                 "idle_timeout": None,
                 "shm_size": 0,