Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cloudtest: Fortify the test_oom_clusterd test #18861

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions src/adapter/src/catalog/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@ use std::net::Ipv4Addr;
use std::sync::Arc;
use std::time::Duration;

use bytesize::ByteSize;
use serde::Deserialize;

use mz_build_info::BuildInfo;
use mz_cloud_resources::AwsExternalIdPrefix;
use mz_controller::clusters::ReplicaAllocation;
use mz_orchestrator::MemoryLimit;
use mz_ore::cast::CastFrom;
use mz_ore::metrics::MetricsRegistry;
use mz_repr::GlobalId;
use mz_secrets::SecretsReader;
Expand Down Expand Up @@ -95,6 +98,10 @@ impl Default for ClusterReplicaSizeMap {
// "2-1": {"scale": 2, "workers": 1},
// ...
// "16-1": {"scale": 16, "workers": 1},
// /// Used in the cloudtest tests that force OOMs
// "mem-2": { "memory_limit": 2Gb },
// ...
// "mem-16": { "memory_limit": 16Gb },
// }
let mut inner = (0..=5)
.map(|i| {
Expand Down Expand Up @@ -132,6 +139,16 @@ impl Default for ClusterReplicaSizeMap {
workers: scale.into(),
},
);

inner.insert(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new SIZE type, along with all the other types in this file, is a default that is only used for testing. In the cloud, a completely separate set of SIZEs is installed and used.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd feel more confident if we had this behind an `if testing` check.

Copy link
Contributor Author

@philip-stoev philip-stoev Apr 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This entire thing is the default when executing bin/environmentd unless the entire SIZE list is overridden with a command-line option, which is what happens in the cloud.
There is no easy way to distinguish "test" from "non-test" invocations of bin/environmentd at this time.

philip-stoev marked this conversation as resolved.
Show resolved Hide resolved
format!("mem-{scale}"),
ReplicaAllocation {
memory_limit: Some(MemoryLimit(ByteSize(u64::cast_from(scale) * (1 << 30)))),
cpu_limit: None,
scale: 1,
workers: 8,
},
);
}

inner.insert(
Expand Down
41 changes: 26 additions & 15 deletions test/cloudtest/test_replica_restart.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import threading
import time
from io import StringIO
from textwrap import dedent

from pg8000 import Connection

Expand Down Expand Up @@ -50,12 +51,19 @@ def assert_notice(conn: Connection, contains: bytes) -> None:
def test_oom_clusterd(mz: MaterializeApplication) -> None:
def verify_cluster_oomed() -> None:
with mz.environmentd.sql_cursor(autocommit=False) as cur:
cur.execute("SET CLUSTER=mz_introspection")
cur.execute(
"""DECLARE c CURSOR FOR SUBSCRIBE TO (SELECT status, reason FROM mz_internal.mz_cluster_replica_statuses mcrs
JOIN mz_cluster_replicas mcr ON mcrs.replica_id = mcr.id
JOIN mz_clusters mc ON mcr.cluster_id = mc.id
WHERE mc.name = 'default')"""
dedent(
"""
SET CLUSTER=mz_introspection;
DECLARE c CURSOR FOR SUBSCRIBE TO (
SELECT status, reason
FROM mz_internal.mz_cluster_replica_statuses mcrs
JOIN mz_cluster_replicas mcr ON mcrs.replica_id = mcr.id
JOIN mz_clusters mc ON mcr.cluster_id = mc.id
WHERE mc.name = 'oom'
)
"""
)
)
while True:
cur.execute("FETCH ALL c")
Expand All @@ -65,22 +73,25 @@ def verify_cluster_oomed() -> None:
if status == "not-ready" and reason == "oom-killed":
return

mz.environmentd.sql("DROP VIEW IF EXISTS v CASCADE")
# Once we create an index on this view, it is practically guaranteed to OOM
# Once we create an index on this view in a cluster limited to 2Gb, it is practically guaranteed to OOM
mz.environmentd.sql(
"""
CREATE VIEW v AS
SELECT repeat('abc' || x || y, 1000000) FROM
(SELECT * FROM generate_series(1, 1000000)) a(x),
(SELECT * FROM generate_series(1, 1000000)) b(y)
"""
dedent(
"""
CREATE CLUSTER oom REPLICAS (oom (size 'mem-2'));
SET cluster=oom;
CREATE VIEW oom AS
SELECT repeat('abc' || x || y, 1000000) FROM
(SELECT * FROM generate_series(1, 1000000)) a(x),
(SELECT * FROM generate_series(1, 1000000)) b(y);
CREATE DEFAULT INDEX oom_idx ON oom
"""
)
)
mz.environmentd.sql("CREATE DEFAULT INDEX i ON v")

# Wait for the cluster pod to OOM
verify_cluster_oomed()

mz.environmentd.sql("DROP VIEW v CASCADE")
mz.environmentd.sql("DROP CLUSTER oom CASCADE; DROP VIEW oom CASCADE")


# Test that a crashed (and restarted) cluster replica generates expected notice
Expand Down