diff --git a/service_capacity_modeling/hardware/profiles/shapes/aws.json b/service_capacity_modeling/hardware/profiles/shapes/aws.json
index ee0c373..b44f23f 100644
--- a/service_capacity_modeling/hardware/profiles/shapes/aws.json
+++ b/service_capacity_modeling/hardware/profiles/shapes/aws.json
@@ -797,13 +797,15 @@
             "name": "gp2",
             "read_io_latency_ms": {"low": 0.8, "mid": 1.05, "high": 1.8, "maximum_value": 10, "confidence": 0.90},
             "write_io_latency_ms": {"low": 1.2, "mid": 2, "high": 4, "maximum_value": 20, "confidence": 0.90},
-            "max_scale_size_gib": 16384, "block_size_kib": 16
+            "max_scale_size_gib": 16384, "block_size_kib": 16,
+            "max_scale_io_per_s": 16000
         },
         "io2": {
             "name": "io2",
             "read_io_latency_ms": {"low": 0.5, "mid": 0.8, "high": 1.2, "maximum_value": 2, "confidence": 0.90},
             "write_io_latency_ms": {"low": 0.9, "mid": 1.2, "high": 2, "maximum_value": 4, "confidence": 0.90},
             "max_scale_size_gib": 16384, "block_size_kib": 16,
+            "max_scale_io_per_s": 64000,
             "lifecycle": "alpha"
         },
         "gp3": {
@@ -811,7 +813,8 @@
             "read_io_latency_ms": {"low": 0.8, "mid": 1.05, "high": 1.8, "maximum_value": 10, "confidence": 0.90},
             "write_io_latency_ms": {"low": 1.2, "mid": 2, "high": 4, "maximum_value": 20, "confidence": 0.90},
             "max_scale_size_gib": 16384, "block_size_kib": 16,
-            "lifecycle": "alpha"
+            "max_scale_io_per_s": 16000,
+            "lifecycle": "stable"
         },
         "aurora": {
             "name": "aurora",
diff --git a/service_capacity_modeling/interface.py b/service_capacity_modeling/interface.py
index 23bde34..de6568b 100644
--- a/service_capacity_modeling/interface.py
+++ b/service_capacity_modeling/interface.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import json
+import sys
 from decimal import Decimal
 from enum import Enum
 from functools import lru_cache
@@ -199,6 +200,8 @@ class Drive(ExcludeUnsetModel):
     single_tenant: bool = True
     # If this drive can scale, how large can it scale to
     max_scale_size_gib: int = 0
+    # If this drive can scale IO, how large can it scale to
+    max_scale_io_per_s: int = 0
     lifecycle: Lifecycle = Lifecycle.stable
     compatible_families: List[str] = []
 
@@ -225,6 +228,13 @@ def max_size_gib(self):
         else:
             return self.size_gib
 
+    @property
+    def max_io_per_s(self):
+        if self.max_scale_io_per_s != 0:
+            return self.max_scale_io_per_s
+        else:
+            return sys.maxsize
+
     @property
     def annual_cost(self):
         size = self.size_gib or 0
diff --git a/service_capacity_modeling/models/common.py b/service_capacity_modeling/models/common.py
index 8d63c1b..551fa47 100644
--- a/service_capacity_modeling/models/common.py
+++ b/service_capacity_modeling/models/common.py
@@ -291,6 +291,13 @@ def compute_stateful_zone(
             utils.next_n(read_io, n=200),
             utils.next_n(write_io, n=200),
         )
+        if (read_io + write_io) > drive.max_io_per_s:
+            ratio = (read_io + write_io) / drive.max_io_per_s
+            count = max(cluster_size(math.ceil(count * ratio)), min_count)
+            cost = count * instance.annual_cost
+            read_io = utils.next_n(read_io * ratio, n=200)
+            write_io = utils.next_n(write_io * ratio, n=200)
+
         attached_drive = drive.copy()
         attached_drive.size_gib = ebs_gib
         attached_drive.read_io_per_s = int(round(read_io, 2))
@@ -326,9 +333,9 @@ def gp2_gib_for_io(read_ios) -> int:
     return int(max(1, read_ios // 3))
 
 
-def cloud_gib_for_io(drive, read_ios, space_gib) -> int:
+def cloud_gib_for_io(drive, total_ios, space_gib) -> int:
     if drive.name == "gp2":
-        return gp2_gib_for_io(read_ios)
+        return gp2_gib_for_io(total_ios)
     else:
         return space_gib
 
diff --git a/service_capacity_modeling/models/org/netflix/cassandra.py b/service_capacity_modeling/models/org/netflix/cassandra.py
index ba2eee5..10cd4d6 100644
--- a/service_capacity_modeling/models/org/netflix/cassandra.py
+++ b/service_capacity_modeling/models/org/netflix/cassandra.py
@@ -165,6 +165,7 @@ def _upsert_params(cluster, params):
 
 
 # pylint: disable=too-many-locals
+# pylint: disable=too-many-return-statements
 # flake8: noqa: C901
 def _estimate_cassandra_cluster_zonal(
     instance: Instance,
@@ -174,6 +175,7 @@
     zones_per_region: int = 3,
     copies_per_region: int = 3,
     require_local_disks: bool = False,
+    require_attached_disks: bool = False,
     required_cluster_size: Optional[int] = None,
     max_rps_to_disk: int = 500,
     max_local_disk_gib: int = 2048,
@@ -190,8 +192,12 @@
     if instance.drive is None and require_local_disks:
         return None
 
-    # Cassandra only deploys on gp2 drives right now
-    if drive.name != "gp2":
+    # if we're not allowed to use local disks, skip ephems
+    if instance.drive is not None and require_attached_disks:
+        return None
+
+    # Cassandra only deploys on gp2 and gp3 drives right now
+    if drive.name not in ("gp2", "gp3"):
         return None
 
     rps = desires.query_pattern.estimated_read_per_second.mid // zones_per_region
@@ -201,11 +207,13 @@
     write_bytes_per_sec = (
         write_per_sec * desires.query_pattern.estimated_mean_write_size_bytes.mid
     )
+    read_bytes_per_sec = rps * desires.query_pattern.estimated_mean_read_size_bytes.mid
     # Write IO will be 1 to commitlog + 2 writes (plus 2 reads) in the first
     # hour during compaction.
     # https://aws.amazon.com/ebs/volume-types/ says IOPS are 16k for io2/gp2
     # so for now we're just hardcoding.
     write_io_per_sec = (1 + 4) * max(1, write_bytes_per_sec // 16384)
+    read_io_per_sec = max(rps, read_bytes_per_sec // 16384)
 
     # Based on the disk latency and the read latency SLOs we adjust our
     # working set to keep more or less data in RAM. Faster drives need
@@ -262,7 +270,7 @@
         # Take into account the reads per read
         # from the per node dataset using leveled compaction
         required_disk_ios=lambda size, count: (
-            _cass_io_per_read(size) * math.ceil(rps / count),
+            _cass_io_per_read(size) * math.ceil(read_io_per_sec / count),
             write_io_per_sec / count,
         ),
         # C* requires ephemeral disks to be 25% full because compaction
@@ -418,6 +426,10 @@ class NflxCassandraArguments(BaseModel):
         default=False,
         description="If local (ephemeral) drives are required",
     )
+    require_attached_disks: bool = Field(
+        default=False,
+        description="If attached (ebs) drives are required",
+    )
     required_cluster_size: Optional[int] = Field(
         default=None,
         description="Require zonal clusters to be this size (force vertical scaling)",
     )
@@ -464,6 +476,9 @@ def capacity_plan(
     require_local_disks: bool = extra_model_arguments.get(
         "require_local_disks", False
     )
+    require_attached_disks: bool = extra_model_arguments.get(
+        "require_attached_disks", False
+    )
     required_cluster_size: Optional[int] = extra_model_arguments.get(
         "required_cluster_size", None
     )
@@ -493,6 +508,7 @@
             zones_per_region=context.zones_in_region,
             copies_per_region=copies_per_region,
             require_local_disks=require_local_disks,
+            require_attached_disks=require_attached_disks,
             required_cluster_size=required_cluster_size,
             max_rps_to_disk=max_rps_to_disk,
             max_regional_size=max_regional_size,
diff --git a/tests/netflix/test_cassandra.py b/tests/netflix/test_cassandra.py
index 3c7678c..e38a3df 100644
--- a/tests/netflix/test_cassandra.py
+++ b/tests/netflix/test_cassandra.py
@@ -75,6 +75,72 @@ def test_capacity_small_fast():
     assert small_result.cluster_params["cassandra.heap.table.percent"] == 0.11
 
 
+def test_ebs_high_reads():
+    cap_plan = planner.plan_certain(
+        model_name="org.netflix.cassandra",
+        region="us-east-1",
+        desires=CapacityDesires(
+            service_tier=1,
+            query_pattern=QueryPattern(
+                estimated_read_per_second=certain_int(100_000),
+                estimated_write_per_second=certain_int(1_000),
+            ),
+            data_shape=DataShape(
+                estimated_state_size_gib=certain_int(1_000),
+            ),
+        ),
+        extra_model_arguments={"require_attached_disks": True},
+    )[0]
+    result = cap_plan.candidate_clusters.zonal[0]
+
+    cores = result.count * result.instance.cpu
+    assert 64 <= cores <= 128
+    # Should get gp3
+    assert result.attached_drives[0].name == "gp3"
+    # 1TiB / ~32 nodes
+    assert result.attached_drives[0].read_io_per_s is not None
+    ios = result.attached_drives[0].read_io_per_s * result.count
+    # Each zone is handling ~33k reads per second, so total disk ios should be < 3x that
+    # 3 from each level
+    assert 100_000 < ios < 400_000
+
+
+def test_ebs_high_writes():
+    cap_plan = planner.plan_certain(
+        model_name="org.netflix.cassandra",
+        region="us-east-1",
+        desires=CapacityDesires(
+            service_tier=1,
+            query_pattern=QueryPattern(
+                estimated_read_per_second=certain_int(10_000),
+                estimated_write_per_second=certain_int(100_000),
+                estimated_mean_write_size_bytes=certain_int(1024 * 8),
+            ),
+            data_shape=DataShape(
+                estimated_state_size_gib=certain_int(10_000),
+            ),
+        ),
+        extra_model_arguments={"require_attached_disks": True},
+    )[0]
+    result = cap_plan.candidate_clusters.zonal[0]
+
+    cores = result.count * result.instance.cpu
+    assert 128 <= cores <= 512
+    # Should get gp3
+    assert result.attached_drives[0].name == "gp3"
+    # 1TiB / ~32 nodes
+    assert result.attached_drives[0].read_io_per_s is not None
+    assert result.attached_drives[0].write_io_per_s is not None
+
+    read_ios = result.attached_drives[0].read_io_per_s * result.count
+    write_ios = result.attached_drives[0].write_io_per_s * result.count
+
+    # 10TiB ~= 4 IO/read -> 3.3k r/zone/s -> 12k /s
+    assert 20_000 < read_ios < 60_000
+    # 33k wps * 8KiB / 16KiB write IO size = 16.5k / s * 4 for compaction = 64k
+    assert 60_000 < write_ios < 100_000
+
+
 def test_capacity_high_writes():
     cap_plan = planner.plan_certain(
         model_name="org.netflix.cassandra",
diff --git a/tests/netflix/test_cassandra_uncertain.py b/tests/netflix/test_cassandra_uncertain.py
index 4cabdfb..eedf91c 100644
--- a/tests/netflix/test_cassandra_uncertain.py
+++ b/tests/netflix/test_cassandra_uncertain.py
@@ -162,7 +162,7 @@ def test_worn_dataset():
     assert lr_cluster.instance.name.startswith(
         "m5."
     ) or lr_cluster.instance.name.startswith("r5.")
-    assert lr_cluster.attached_drives[0].name == "gp2"
+    assert lr_cluster.attached_drives[0].name == "gp3"
     # gp2 should not provision massive drives, prefer to upcolor
     assert lr_cluster.attached_drives[0].size_gib < 9000
     assert lr_cluster.attached_drives[0].size_gib * lr_cluster.count * 3 > 204800
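
Reviewer note: the snippet below is a minimal usage sketch, not part of the patch. It mirrors test_ebs_high_reads and shows how the new require_attached_disks model argument flows through the planner; the import paths are assumptions based on the repository layout and may need adjusting.

# Usage sketch only (mirrors test_ebs_high_reads); import paths are assumptions.
from service_capacity_modeling.capacity_planner import planner
from service_capacity_modeling.interface import (
    CapacityDesires,
    DataShape,
    QueryPattern,
    certain_int,
)

cap_plan = planner.plan_certain(
    model_name="org.netflix.cassandra",
    region="us-east-1",
    desires=CapacityDesires(
        service_tier=1,
        query_pattern=QueryPattern(
            estimated_read_per_second=certain_int(100_000),
            estimated_write_per_second=certain_int(1_000),
        ),
        data_shape=DataShape(estimated_state_size_gib=certain_int(1_000)),
    ),
    # New in this change: require attached (EBS) drives, which should now
    # select gp3 and respect the per-volume IOPS ceiling from aws.json.
    extra_model_arguments={"require_attached_disks": True},
)[0]

zone = cap_plan.candidate_clusters.zonal[0]
print(zone.attached_drives[0].name)  # expected: "gp3"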