Skip to content

Commit

Permalink
Add 2 more message based deployment checks.
Browse files Browse the repository at this point in the history
Change-Id: Iddb877b3d237851f11335b250c10aaf17740bea0
GitOrigin-RevId: 98b275a30158f17ac9cde21264f2d1a8cff7d306
  • Loading branch information
kramarz authored and Copybara-Service committed Jul 23, 2024
1 parent e43d614 commit 442b2dc
Show file tree
Hide file tree
Showing 10 changed files with 440 additions and 34 deletions.
6 changes: 4 additions & 2 deletions gcpdiag/runbook/cloudrun/flags.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""TODO: String doc"""
"""Cloud Run runbook flags"""
# pylint: disable=unused-wildcard-import, wildcard-import
from gcpdiag.runbook.iam.flags import *
from gcpdiag.runbook.gcp.flags import *

SERVICE_NAME = 'service_name'
83 changes: 69 additions & 14 deletions gcpdiag/runbook/cloudrun/service_deployment.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,7 @@
from gcpdiag import runbook
from gcpdiag.queries import cloudrun, crm
from gcpdiag.runbook import op
from gcpdiag.runbook.gce import flags

SERVICE_NAME = 'service_name'
from gcpdiag.runbook.cloudrun import flags


class ServiceDeployment(runbook.DiagnosticTree):
Expand All @@ -48,7 +46,7 @@ class ServiceDeployment(runbook.DiagnosticTree):
'help': 'Region of the service.',
'required': True
},
SERVICE_NAME: {
flags.SERVICE_NAME: {
'type': str,
'help': 'Name of the Cloud Run service',
'required': True,
Expand Down Expand Up @@ -81,13 +79,12 @@ def execute(self):
project = crm.get_project(op.get(flags.PROJECT_ID))
try:
cloudrun.get_service(op.get(flags.PROJECT_ID), op.get(flags.REGION),
op.get(SERVICE_NAME))
op.get(flags.SERVICE_NAME))
except googleapiclient.errors.HttpError:
op.add_skipped(
project,
reason=
f'Service {op.get(SERVICE_NAME)} does not exist in region {op.get(flags.REGION)} or '
f'project {op.get(flags.PROJECT_ID)}')
reason=f'Service {op.get(flags.SERVICE_NAME)} does not exist in region '
f'{op.get(flags.REGION)} or project {op.get(flags.PROJECT_ID)}')


class ServiceDeploymentCodeStep(runbook.CompositeStep):
Expand All @@ -96,6 +93,8 @@ class ServiceDeploymentCodeStep(runbook.CompositeStep):
def execute(self):
"""Checking for common container and code issues."""
self.add_child(ContainerFailedToStartStep())
self.add_child(ImageWasNotFoundStep())
self.add_child(NoPermissionForImageStep())


class ContainerFailedToStartStep(runbook.Step):
Expand All @@ -104,18 +103,74 @@ class ContainerFailedToStartStep(runbook.Step):
This step will check if the error is present and link to additional troubleshooting steps.
"""

template = 'service_deployment::deployment_working'

STATUS_RE = re.compile(
template = 'service_deployment::starts_correctly'
message_re = re.compile(
r"Revision '[\w-]+' is not ready and cannot serve traffic. The user-provided container "
r'failed to start and listen on the port defined provided by the PORT=\d+ environment '
r'failed to start and listen on the port defined provided by the PORT=(\d+) environment '
r'variable.')

def execute(self):
"""Verifying if there is an error that container failed to start."""
service = cloudrun.get_service(op.get(flags.PROJECT_ID),
op.get(flags.REGION), op.get(SERVICE_NAME))
if self.STATUS_RE.match(service.conditions['RoutesReady'].message):
op.get(flags.REGION),
op.get(flags.SERVICE_NAME))
match = self.message_re.match(service.conditions['RoutesReady'].message)
if match:
op.add_failed(service,
reason=op.prep_msg(op.FAILURE_REASON, name=service.name),
remediation=op.prep_msg(op.FAILURE_REMEDIATION))


class ImageWasNotFoundStep(runbook.Step):
  """Checks if the specified image exists.

  Inspects the message of the service's 'RoutesReady' condition for the
  "image not found" deployment error and, when it is present, reports a
  failure that names the missing image.
  """

  template = 'service_deployment::image_exists'
  # Group 1 captures the image reference quoted in the error message.
  message_re = re.compile(
      r"Revision '[\w-]+' is not ready and cannot serve traffic. Image '([^']+)' not found."
  )

  def execute(self):
    """Verifying if specified image exists."""
    # NOTE(review): the docstring above appears verbatim as the step heading
    # in the runbook snapshot output, so keep its wording stable.
    project_id = op.get(flags.PROJECT_ID)
    region = op.get(flags.REGION)
    service_name = op.get(flags.SERVICE_NAME)
    service = cloudrun.get_service(project_id, region, service_name)

    routes_ready_message = service.conditions['RoutesReady'].message
    found = self.message_re.match(routes_ready_message)
    if found is None:
      # The message is not the "image not found" error: nothing to report.
      return

    missing_image = found.group(1)
    failure_reason = op.prep_msg(op.FAILURE_REASON,
                                 name=service.name,
                                 image=missing_image)
    failure_remediation = op.prep_msg(op.FAILURE_REMEDIATION,
                                      image=missing_image)
    op.add_failed(service,
                  reason=failure_reason,
                  remediation=failure_remediation)


class NoPermissionForImageStep(runbook.Step):
  """Checks if Cloud Run service agent can fetch the image.

  Inspects the message of the service's 'RoutesReady' condition for the
  "must have permission to read the image" deployment error and, when it is
  present, reports a failure naming the service agent and the image.
  """

  template = 'service_deployment::has_permission_for_image'
  # Group 1 captures the service agent account, group 2 the image reference.
  #
  # The sentence-ending period is matched *outside* group 2. The earlier
  # pattern ended in `([^ ]+).`, where the greedy group swallowed the period
  # whenever more text followed (the unescaped `.` then matched the space),
  # so reports rendered the image as `<image>.` followed by the template's
  # own period ("image..").
  # The lazy group stops at the first whitespace (or end of message), with an
  # optional period just before it, so dots inside the reference
  # (e.g. "gcr.io") are kept while a trailing period is dropped.
  message_re = re.compile(
      r"Revision '[\w-]+' is not ready and cannot serve traffic\. Google Cloud "
      r'Run Service Agent ([^ ]+) must have permission to read the image, '
      r'([^ ]+?)\.?(?:\s|$)')

  def execute(self):
    """Verifying if Cloud Run service agent can fetch the image."""
    # NOTE(review): the docstring above appears verbatim as the step heading
    # in the runbook snapshot output, so keep its wording stable.
    service = cloudrun.get_service(op.get(flags.PROJECT_ID),
                                   op.get(flags.REGION),
                                   op.get(flags.SERVICE_NAME))
    match = self.message_re.match(service.conditions['RoutesReady'].message)
    if match:
      service_agent, image = match.groups()
      op.add_failed(service,
                    reason=op.prep_msg(op.FAILURE_REASON,
                                       name=service.name,
                                       sa=service_agent,
                                       image=image),
                    remediation=op.prep_msg(op.FAILURE_REMEDIATION,
                                            sa=service_agent,
                                            image=image))
18 changes: 14 additions & 4 deletions gcpdiag/runbook/cloudrun/service_deployment_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,17 @@ class TestInvalidContainer(snapshot_test_base.RulesSnapshotTestBase):
project_id = 'gcpdiag-cloudrun2-aaaa'
config.init({'auto': True, 'interface': 'cli'}, project_id)

rule_parameters = [{
'service_name': 'invalid-container',
'region': 'us-central1',
}]
rule_parameters = [
{
'service_name': 'invalid-container',
'region': 'us-central1',
},
{
'service_name': 'image-does-not-exist',
'region': 'us-central1',
},
{
'service_name': 'no-image-permission',
'region': 'us-central1',
},
]
65 changes: 65 additions & 0 deletions gcpdiag/runbook/cloudrun/snapshots/service_deployment.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,71 @@ cloudrun/service-deployment: Investigates the necessary GCP components searching
[START]: Verifying context and parameters required for deployment runbook checks.
[COMPOSITE STEP]: Checking for common container and code issues.
[AUTOMATED STEP]: Verifying if there is an error that container failed to start.
[AUTOMATED STEP]: Verifying if specified image exists.
[AUTOMATED STEP]: Verifying if Cloud Run service agent can fetch the image.
[END]: Finalizing runbook investigations...


project: gcpdiag-cloudrun2-aaaa, parameters: {project_id=gcpdiag-cloudrun2-aaaa,region=us-
central1,service_name=image-does-not-exist}

cloudrun/service-deployment: Investigates the necessary GCP components searching for reasons for deployment errors.

This runbook will examine the following key areas:

1. Container and code Checks.
- Ensures the Container is in correct state to run in Cloud Run

Scope of Investigation:
- Note that this runbook does not provide troubleshooting steps for errors
caused by the code running in the container.

[START]: Verifying context and parameters required for deployment runbook checks.
[COMPOSITE STEP]: Checking for common container and code issues.
[AUTOMATED STEP]: Verifying if there is an error that container failed to start.
[AUTOMATED STEP]: Verifying if specified image exists.

- gcpdiag-cloudrun2-aaaa/37324495-a964-42ce-a080-2995f68c8f35 [FAIL]
[REASON]
Provided image us-central1-docker.pkg.dev/gcpdiag-cloudrun2-aaaa/cloudrun-repository/missing-image does not exist.

[REMEDIATION]
The provided image does not exist. Make sure that it is correct. Try following
quickstart guide and see how to correctly specify image names. https://cloud.google.com/run/docs/quickstarts

[AUTOMATED STEP]: Verifying if Cloud Run service agent can fetch the image.
[END]: Finalizing runbook investigations...


project: gcpdiag-cloudrun2-aaaa, parameters: {project_id=gcpdiag-cloudrun2-aaaa,region=us-
central1,service_name=no-image-permission}

cloudrun/service-deployment: Investigates the necessary GCP components searching for reasons for deployment errors.

This runbook will examine the following key areas:

1. Container and code Checks.
- Ensures the Container is in correct state to run in Cloud Run

Scope of Investigation:
- Note that this runbook does not provide troubleshooting steps for errors
caused by the code running in the container.

[START]: Verifying context and parameters required for deployment runbook checks.
[COMPOSITE STEP]: Checking for common container and code issues.
[AUTOMATED STEP]: Verifying if there is an error that container failed to start.
[AUTOMATED STEP]: Verifying if specified image exists.
[AUTOMATED STEP]: Verifying if Cloud Run service agent can fetch the image.

- gcpdiag-cloudrun2-aaaa/7af8a1c8-8758-48d4-9476-024c0f62c4d7 [FAIL]
[REASON]
Cloud Run Service agent service-123400010@serverless-robot-prod.iam.gserviceaccount.com does not have permissions to read image gcr.io/private-project/image..

[REMEDIATION]
Please make sure that service-123400010@serverless-robot-prod.iam.gserviceaccount.com has roles/storage.objectViewer role if the image is stored in Container
Registry or roles/artifactregistry.reader if in Artifact Registry. Please note that the role needs to be granted in the
project where the image is stored.

[END]: Finalizing runbook investigations...


23 changes: 21 additions & 2 deletions gcpdiag/runbook/cloudrun/templates/service_deployment.jinja
Original file line number Diff line number Diff line change
@@ -1,9 +1,28 @@
{% block deployment_working_failure_remediation %}
{% block starts_correctly_failure_remediation %}
The container failed to start. Please check
https://cloud.google.com/run/docs/troubleshooting#container-failed-to-start
for the next steps.
{% endblock %}

{% block deployment_working_failure_reason %}
{% block starts_correctly_failure_reason %}
Container failed to start in service { name }.
{% endblock %}

{% block image_exists_failure_remediation %}
The provided image does not exist. Make sure that it is correct. Try following
quickstart guide and see how to correctly specify image names. https://cloud.google.com/run/docs/quickstarts
{% endblock %}

{% block image_exists_failure_reason %}
Provided image {image} does not exist.
{% endblock %}

{% block has_permission_for_image_failure_remediation %}
Please make sure that {sa} has roles/storage.objectViewer role if the image is stored in Container
Registry or roles/artifactregistry.reader if in Artifact Registry. Please note that the role needs to be granted in the
project where the image is stored.
{% endblock %}

{% block has_permission_for_image_failure_reason %}
Cloud Run Service agent {sa} does not have permissions to read image {image}.
{% endblock %}
28 changes: 28 additions & 0 deletions test-data/cloudrun2/cloud_run.tf
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,31 @@ resource "null_resource" "failed_deployment1" {
}
depends_on = [google_artifact_registry_repository.cloudrun_repo]
}

# Runs a local `gcloud run deploy` that creates the Cloud Run service
# "no-image-permission" in us-central1 from the image
# gcr.io/private-project/image, without allowing unauthenticated access.
# The trailing `|| true` makes the shell command exit successfully even when
# gcloud reports an error, so `terraform apply` is not aborted by it.
# NOTE(review): presumably the deployment is expected to fail (the image lives
# in another project), which provides test data for the "no permission to read
# the image" check; confirm.
resource "null_resource" "failed_deployment2" {
provisioner "local-exec" {
command = <<-EOT
gcloud run deploy no-image-permission \
--project ${google_project.project.project_id} \
--region us-central1 \
--image gcr.io/private-project/image \
--no-allow-unauthenticated \
|| true
EOT
}
depends_on = [google_artifact_registry_repository.cloudrun_repo]
}

# Runs a local `gcloud run deploy` that creates the Cloud Run service
# "image-does-not-exist" in us-central1 from the image
# ${local.repository_url}/missing-image, without allowing unauthenticated
# access.
# The trailing `|| true` makes the shell command exit successfully even when
# gcloud reports an error, so `terraform apply` is not aborted by it.
# NOTE(review): presumably no "missing-image" is ever pushed to the repository,
# so the deployment fails and provides test data for the "image not found"
# check; confirm.
resource "null_resource" "failed_deployment3" {
provisioner "local-exec" {
command = <<-EOT
gcloud run deploy image-does-not-exist \
--project ${google_project.project.project_id} \
--region us-central1 \
--image ${local.repository_url}/missing-image \
--no-allow-unauthenticated \
|| true
EOT
}
depends_on = [google_artifact_registry_repository.cloudrun_repo]
}
Loading

0 comments on commit 442b2dc

Please sign in to comment.