
Promotes gp3 to be GA and properly does IO calcs
Now that Cassandra can provision with gp3, let's start recommending it
from models.
jolynch committed Jun 23, 2023
1 parent 5337617 commit ebc7697
Showing 6 changed files with 110 additions and 8 deletions.
7 changes: 5 additions & 2 deletions service_capacity_modeling/hardware/profiles/shapes/aws.json
@@ -797,21 +797,24 @@
"name": "gp2",
"read_io_latency_ms": {"low": 0.8, "mid": 1.05, "high": 1.8, "maximum_value": 10, "confidence": 0.90},
"write_io_latency_ms": {"low": 1.2, "mid": 2, "high": 4, "maximum_value": 20, "confidence": 0.90},
"max_scale_size_gib": 16384, "block_size_kib": 16
"max_scale_size_gib": 16384, "block_size_kib": 16,
"max_scale_io_per_s": 16000
},
"io2": {
"name": "io2",
"read_io_latency_ms": {"low": 0.5, "mid": 0.8, "high": 1.2, "maximum_value": 2, "confidence": 0.90},
"write_io_latency_ms": {"low": 0.9, "mid": 1.2, "high": 2, "maximum_value": 4, "confidence": 0.90},
"max_scale_size_gib": 16384, "block_size_kib": 16,
"max_scale_io_per_s": 64000,
"lifecycle": "alpha"
},
"gp3": {
"name": "gp3",
"read_io_latency_ms": {"low": 0.8, "mid": 1.05, "high": 1.8, "maximum_value": 10, "confidence": 0.90},
"write_io_latency_ms": {"low": 1.2, "mid": 2, "high": 4, "maximum_value": 20, "confidence": 0.90},
"max_scale_size_gib": 16384, "block_size_kib": 16,
"lifecycle": "alpha"
"max_scale_io_per_s": 16000,
"lifecycle": "stable"
},
"aurora": {
"name": "aurora",
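The shape change above is what lets models recommend gp3: its lifecycle moves from "alpha" to "stable", and gp2/gp3 now carry max_scale_io_per_s of 16,000 (AWS's documented per-volume IOPS ceiling for those volume types) while io2 carries 64,000. As a rough, hypothetical illustration of how a planner could consume these fields (the helper below is not part of the library):

```python
# Minimal sketch (not library code): pick an EBS drive shape that is GA
# ("stable") and whose per-volume IOPS ceiling covers the required IO.
from typing import Optional

DRIVE_SHAPES = {
    "gp2": {"lifecycle": "stable", "max_scale_io_per_s": 16000},
    "gp3": {"lifecycle": "stable", "max_scale_io_per_s": 16000},
    "io2": {"lifecycle": "alpha", "max_scale_io_per_s": 64000},
}

def pick_ebs_drive(required_io_per_s: int) -> Optional[str]:
    """Return the first GA drive whose IOPS ceiling covers the need."""
    for name, shape in DRIVE_SHAPES.items():
        if shape["lifecycle"] != "stable":
            continue  # non-GA drives are not recommended
        if shape["max_scale_io_per_s"] >= required_io_per_s:
            return name
    return None  # no single volume suffices; the caller must scale out

print(pick_ebs_drive(12_000))  # -> "gp2" (gp3 would also qualify at 16k)
print(pick_ebs_drive(40_000))  # -> None: above the 16k gp2/gp3 per-volume ceiling
```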
10 changes: 10 additions & 0 deletions service_capacity_modeling/interface.py
@@ -1,6 +1,7 @@
from __future__ import annotations

import json
import sys
from decimal import Decimal
from enum import Enum
from functools import lru_cache
@@ -199,6 +200,8 @@ class Drive(ExcludeUnsetModel):
single_tenant: bool = True
# If this drive can scale, how large can it scale to
max_scale_size_gib: int = 0
# If this drive can scale IO, how large can it scale to
max_scale_io_per_s: int = 0

lifecycle: Lifecycle = Lifecycle.stable
compatible_families: List[str] = []
@@ -225,6 +228,13 @@ def max_size_gib(self):
else:
return self.size_gib

@property
def max_io_per_s(self):
if self.max_scale_io_per_s != 0:
return self.max_scale_io_per_s
else:
return sys.maxsize

@property
def annual_cost(self):
size = self.size_gib or 0
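The new max_scale_io_per_s field defaults to 0, and the max_io_per_s property treats 0 as "no published ceiling" by returning sys.maxsize, so an IO-cap check never fires for drives that do not declare a limit. A minimal sketch of that behavior, using a plain dataclass as a stand-in for the real pydantic Drive model:

```python
import sys
from dataclasses import dataclass

@dataclass
class DriveSketch:
    """Simplified stand-in for the Drive model's new IO-ceiling behavior."""
    max_scale_io_per_s: int = 0  # 0 means the shape declares no ceiling

    @property
    def max_io_per_s(self) -> int:
        # A declared ceiling (e.g. 16000 for gp3) is returned as-is; otherwise
        # sys.maxsize makes any "needed IO > ceiling" comparison a no-op.
        if self.max_scale_io_per_s != 0:
            return self.max_scale_io_per_s
        return sys.maxsize

assert DriveSketch(max_scale_io_per_s=16000).max_io_per_s == 16000
assert DriveSketch().max_io_per_s == sys.maxsize  # uncapped drives never trip the check
```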
11 changes: 9 additions & 2 deletions service_capacity_modeling/models/common.py
@@ -291,6 +291,13 @@ def compute_stateful_zone(
utils.next_n(read_io, n=200),
utils.next_n(write_io, n=200),
)
if (read_io + write_io) > drive.max_io_per_s:
ratio = (read_io + write_io) / drive.max_io_per_s
count = max(cluster_size(math.ceil(count * ratio)), min_count)
cost = count * instance.annual_cost
read_io = utils.next_n(read_io / ratio, n=200)
write_io = utils.next_n(write_io / ratio, n=200)

attached_drive = drive.copy()
attached_drive.size_gib = ebs_gib
attached_drive.read_io_per_s = int(round(read_io, 2))
@@ -326,9 +333,9 @@ def gp2_gib_for_io(read_ios) -> int:
return int(max(1, read_ios // 3))


- def cloud_gib_for_io(drive, read_ios, space_gib) -> int:
+ def cloud_gib_for_io(drive, total_ios, space_gib) -> int:
if drive.name == "gp2":
- return gp2_gib_for_io(read_ios)
+ return gp2_gib_for_io(total_ios)
else:
return space_gib

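The new branch in compute_stateful_zone handles drives (such as gp3) whose per-volume IO is capped: when a node's read plus write IO would exceed drive.max_io_per_s, the zone is scaled out by the overshoot ratio and the per-node IO shrinks by the same ratio, keeping each volume at roughly its ceiling while preserving the aggregate IO. A standalone sketch of that arithmetic, with next_n simplified to "round up to a multiple of 200":

```python
import math

def next_n(x: float, n: int = 200) -> int:
    """Round up to the next multiple of n (stand-in for utils.next_n)."""
    return int(math.ceil(x / n) * n)

def cap_io_by_scaling_out(count: int, read_io: float, write_io: float,
                          max_io_per_s: int):
    """If per-node IO exceeds the volume ceiling, add nodes and split the IO."""
    if (read_io + write_io) > max_io_per_s:
        ratio = (read_io + write_io) / max_io_per_s
        count = math.ceil(count * ratio)   # more nodes ...
        read_io = next_n(read_io / ratio)  # ... each doing proportionally less IO
        write_io = next_n(write_io / ratio)
    return count, read_io, write_io

# 8 nodes needing 14k read + 10k write IO each overshoot a 16k gp3 volume,
# so the zone grows to 12 nodes at roughly the 16k ceiling per volume.
print(cap_io_by_scaling_out(8, 14_000, 10_000, 16_000))  # -> (12, 9400, 6800)
```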
22 changes: 19 additions & 3 deletions service_capacity_modeling/models/org/netflix/cassandra.py
@@ -165,6 +165,7 @@ def _upsert_params(cluster, params):


# pylint: disable=too-many-locals
# pylint: disable=too-many-return-statements
# flake8: noqa: C901
def _estimate_cassandra_cluster_zonal(
instance: Instance,
@@ -174,6 +175,7 @@ def _estimate_cassandra_cluster_zonal(
zones_per_region: int = 3,
copies_per_region: int = 3,
require_local_disks: bool = False,
require_attached_disks: bool = False,
required_cluster_size: Optional[int] = None,
max_rps_to_disk: int = 500,
max_local_disk_gib: int = 2048,
@@ -190,8 +192,12 @@
if instance.drive is None and require_local_disks:
return None

- # Cassandra only deploys on gp2 drives right now
- if drive.name != "gp2":
+ # if we're not allowed to use local disks, skip ephems
+ if instance.drive is not None and require_attached_disks:
+ return None
+
+ # Cassandra only deploys on gp2 and gp3 drives right now
+ if drive.name not in ("gp2", "gp3"):
return None

rps = desires.query_pattern.estimated_read_per_second.mid // zones_per_region
@@ -201,11 +207,13 @@
write_bytes_per_sec = (
write_per_sec * desires.query_pattern.estimated_mean_write_size_bytes.mid
)
read_bytes_per_sec = rps * desires.query_pattern.estimated_mean_read_size_bytes.mid
# Write IO will be 1 to commitlog + 2 writes (plus 2 reads) in the first
# hour during compaction.
# https://aws.amazon.com/ebs/volume-types/ says IOPS are 16k for io2/gp2
# so for now we're just hardcoding.
write_io_per_sec = (1 + 4) * max(1, write_bytes_per_sec // 16384)
read_io_per_sec = max(rps, read_bytes_per_sec // 16384)

# Based on the disk latency and the read latency SLOs we adjust our
# working set to keep more or less data in RAM. Faster drives need
@@ -262,7 +270,7 @@ def _estimate_cassandra_cluster_zonal(
# Take into account the reads per read
# from the per node dataset using leveled compaction
required_disk_ios=lambda size, count: (
- _cass_io_per_read(size) * math.ceil(rps / count),
+ _cass_io_per_read(size) * math.ceil(read_io_per_sec / count),
write_io_per_sec / count,
),
# C* requires ephemeral disks to be 25% full because compaction
@@ -418,6 +426,10 @@ class NflxCassandraArguments(BaseModel):
default=False,
description="If local (ephemeral) drives are required",
)
require_attached_disks: bool = Field(
default=False,
description="If attached (ebs) drives are required",
)
required_cluster_size: Optional[int] = Field(
default=None,
description="Require zonal clusters to be this size (force vertical scaling)",
@@ -464,6 +476,9 @@ def capacity_plan(
require_local_disks: bool = extra_model_arguments.get(
"require_local_disks", False
)
require_attached_disks: bool = extra_model_arguments.get(
"require_attached_disks", False
)
required_cluster_size: Optional[int] = extra_model_arguments.get(
"required_cluster_size", None
)
@@ -493,6 +508,7 @@
zones_per_region=context.zones_in_region,
copies_per_region=copies_per_region,
require_local_disks=require_local_disks,
require_attached_disks=require_attached_disks,
required_cluster_size=required_cluster_size,
max_rps_to_disk=max_rps_to_disk,
max_regional_size=max_regional_size,
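The Cassandra model now derives disk IO from byte rates instead of raw request rates: write bytes are converted into 16 KiB IOs and multiplied by 5 (one commitlog write plus roughly four compaction IOs), and reads use the larger of the request rate and the 16 KiB byte rate, which later gets scaled by _cass_io_per_read per LSM level. A hedged back-of-envelope helper restating that math (illustrative only, not the model's API):

```python
def cassandra_disk_io_per_zone(write_per_sec: int, write_bytes: int,
                               read_per_sec: int, read_bytes: int):
    """Back-of-envelope IO estimate mirroring the model's 16 KiB block math."""
    block = 16 * 1024  # 16 KiB block size the model hardcodes for EBS
    write_bytes_per_sec = write_per_sec * write_bytes
    read_bytes_per_sec = read_per_sec * read_bytes
    # 1 commitlog write + ~4 compaction IOs per 16 KiB of flushed writes
    write_io_per_sec = (1 + 4) * max(1, write_bytes_per_sec // block)
    # Small reads are bounded below by the request rate itself
    read_io_per_sec = max(read_per_sec, read_bytes_per_sec // block)
    return read_io_per_sec, write_io_per_sec

# Roughly the high-write test below: ~33k writes/s per zone at 8 KiB each
print(cassandra_disk_io_per_zone(33_000, 8 * 1024, 3_300, 1024))
# -> (3300, 82500): ~82.5k write IO/s/zone, far above one gp3 volume's 16k
```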
66 changes: 66 additions & 0 deletions tests/netflix/test_cassandra.py
@@ -75,6 +75,72 @@ def test_capacity_small_fast():
assert small_result.cluster_params["cassandra.heap.table.percent"] == 0.11


def test_ebs_high_reads():
cap_plan = planner.plan_certain(
model_name="org.netflix.cassandra",
region="us-east-1",
desires=CapacityDesires(
service_tier=1,
query_pattern=QueryPattern(
estimated_read_per_second=certain_int(100_000),
estimated_write_per_second=certain_int(1_000),
),
data_shape=DataShape(
estimated_state_size_gib=certain_int(1_000),
),
),
extra_model_arguments={"require_attached_disks": True},
)[0]
result = cap_plan.candidate_clusters.zonal[0]

cores = result.count * result.instance.cpu
assert 64 <= cores <= 128
# Should get gp3
assert result.attached_drives[0].name == "gp3"
# 1TiB / ~32 nodes
assert result.attached_drives[0].read_io_per_s is not None
ios = result.attached_drives[0].read_io_per_s * result.count
# Each zone is handling ~33k reads per second, so total disk ios should be < 3x that
# 3 from each level
assert 100_000 < ios < 400_000


def test_ebs_high_writes():
cap_plan = planner.plan_certain(
model_name="org.netflix.cassandra",
region="us-east-1",
desires=CapacityDesires(
service_tier=1,
query_pattern=QueryPattern(
estimated_read_per_second=certain_int(10_000),
estimated_write_per_second=certain_int(100_000),
estimated_mean_write_size_bytes=certain_int(1024 * 8),
),
data_shape=DataShape(
estimated_state_size_gib=certain_int(10_000),
),
),
extra_model_arguments={"require_attached_disks": True},
)[0]
result = cap_plan.candidate_clusters.zonal[0]

cores = result.count * result.instance.cpu
assert 128 <= cores <= 512
# Should get gp3
assert result.attached_drives[0].name == "gp3"
# 1TiB / ~32 nodes
assert result.attached_drives[0].read_io_per_s is not None
assert result.attached_drives[0].write_io_per_s is not None

read_ios = result.attached_drives[0].read_io_per_s * result.count
write_ios = result.attached_drives[0].write_io_per_s * result.count

# 10TiB ~= 4 IO/read -> 3.3k r/zone/s -> 12k /s
assert 20_000 < read_ios < 60_000
# 33k wps * 8KiB / 16KiB write IO size = 16.5k / s * 4 for compaction = 64k
assert 60_000 < write_ios < 100_000


def test_capacity_high_writes():
cap_plan = planner.plan_certain(
model_name="org.netflix.cassandra",
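The new tests pin attached-disk plans to gp3 and then bound the aggregate provisioned IO per zone; the expected windows follow from the same 16 KiB math. For the high-write case, the model's factor of 5 gives roughly 83k write IO/s per zone and the test comment's rougher factor of 4 gives about 66k, and both land inside the asserted 60k–100k range. A quick arithmetic check under those assumptions:

```python
# ~100k writes/s per region spread over 3 zones, 8 KiB per write:
writes_per_zone = 100_000 // 3                 # ~33k writes/s/zone
write_bytes = writes_per_zone * 8 * 1024       # ~273 MB/s/zone
write_ios = 5 * (write_bytes // (16 * 1024))   # commitlog + compaction factor
print(write_ios)  # ~83k write IO/s/zone, inside the asserted 60k..100k window
```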
2 changes: 1 addition & 1 deletion tests/netflix/test_cassandra_uncertain.py
@@ -162,7 +162,7 @@ def test_worn_dataset():
assert lr_cluster.instance.name.startswith(
"m5."
) or lr_cluster.instance.name.startswith("r5.")
- assert lr_cluster.attached_drives[0].name == "gp2"
+ assert lr_cluster.attached_drives[0].name == "gp3"
# gp2 should not provision massive drives, prefer to upcolor
assert lr_cluster.attached_drives[0].size_gib < 9000
assert lr_cluster.attached_drives[0].size_gib * lr_cluster.count * 3 > 204800
