Add cpu utilization as a parameter in extra model arguments; if you already have cpu usage, use it directly
ramsrivatsak committed Oct 13, 2023
1 parent f0dde0d commit 352a776
Showing 4 changed files with 81 additions and 11 deletions.
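In practice the commit adds two new keys that callers can pass through extra_model_arguments. A minimal sketch of the arguments (values taken from the new test at the end of this diff; the dict below is illustrative, not part of the diff):

extra_model_arguments = {
    "required_cluster_size": 24,             # already-supported argument; the new shortcut also requires it
    "current_instance_name": "i4i.8xlarge",  # resolved by the planner to an Instance object
    "max_cpu_utilization": 14.19,            # observed peak CPU percent of the running cluster
}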
7 changes: 7 additions & 0 deletions service_capacity_modeling/capacity_planner.py
@@ -503,6 +503,13 @@ def _plan_certain(
if len(allowed_drives) == 0:
allowed_drives.update(hardware.drives.keys())

# Get current instance type if exists
current_instance_name: str = extra_model_arguments.get("current_instance_name", None)
if current_instance_name is not None:
for instance in hardware.instances.values():
if instance.name == current_instance_name:
extra_model_arguments["current_instance_name"] = instance

plans = []
if model.run_hardware_simulation():
for instance in hardware.instances.values():
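For context, the loop above resolves the caller-supplied instance name into the planner's Instance object and stores it back under the same key, so the model receives the full hardware description rather than a string. A minimal equivalent sketch (illustrative only, assuming instance names are unique within hardware.instances):

name = extra_model_arguments.get("current_instance_name")
if name is not None:
    match = next((i for i in hardware.instances.values() if i.name == name), None)
    if match is not None:
        extra_model_arguments["current_instance_name"] = match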
17 changes: 16 additions & 1 deletion service_capacity_modeling/models/org/netflix/cassandra.py
@@ -64,20 +64,26 @@ def _write_buffer_gib_zone(

def _estimate_cassandra_requirement(
instance: Instance,
current_instance: Instance,
required_cluster_size: Optional[int],
desires: CapacityDesires,
working_set: float,
reads_per_second: float,
max_rps_to_disk: int,
zones_per_region: int = 3,
copies_per_region: int = 3,
max_cpu_utilization: float = None,
) -> CapacityRequirement:
"""Estimate the capacity required for one zone given a regional desire
The input desires should be the **regional** desire, and this function will
return the zonal capacity requirement
"""
# Keep half of the cores free for background work (compaction, backup, repair)
if all([max_cpu_utilization, current_instance, required_cluster_size]):
needed_cores = (current_instance.cpu * required_cluster_size) * (max_cpu_utilization / 20)
else:
needed_cores = sqrt_staffed_cores(desires) * 2
# Keep half of the bandwidth available for backup
needed_network_mbps = simple_network_mbps(desires) * 2
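The new shortcut only fires when max_cpu_utilization, current_instance, and required_cluster_size are all provided; otherwise the existing sqrt_staffed_cores estimate is kept. A rough worked example using the values from the new test, assuming an i4i.8xlarge exposes 32 vCPUs (the vCPU count is an assumption, not stated in this diff):

current_cpu = 32                 # assumed vCPUs per i4i.8xlarge node
required_cluster_size = 24       # nodes in the current cluster
max_cpu_utilization = 14.19      # observed peak CPU percent

# total currently provisioned cores, scaled by how hot they actually run
needed_cores = (current_cpu * required_cluster_size) * (max_cpu_utilization / 20)
# ≈ 545 cores, versus the workload-derived sqrt_staffed_cores(desires) * 2 path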

@@ -169,6 +175,7 @@ def _upsert_params(cluster, params):
# flake8: noqa: C901
def _estimate_cassandra_cluster_zonal(
instance: Instance,
current_instance: Instance,
drive: Drive,
context: RegionContext,
desires: CapacityDesires,
@@ -182,6 +189,7 @@ def _estimate_cassandra_cluster_zonal(
max_regional_size: int = 96,
max_write_buffer_percent: float = 0.25,
max_table_buffer_percent: float = 0.11,
max_cpu_utilization: float = None,
) -> Optional[CapacityPlan]:

# Netflix Cassandra doesn't like to deploy on really small instances
@@ -234,12 +242,15 @@ def _estimate_cassandra_cluster_zonal(

requirement = _estimate_cassandra_requirement(
instance=instance,
current_instance=current_instance,
required_cluster_size=required_cluster_size,
desires=desires,
working_set=working_set,
reads_per_second=rps,
max_rps_to_disk=max_rps_to_disk,
zones_per_region=zones_per_region,
copies_per_region=copies_per_region,
max_cpu_utilization=max_cpu_utilization,
)

# Cassandra clusters should aim to be at least 2 nodes per zone to start
@@ -493,6 +504,8 @@ def capacity_plan(
max_table_buffer_percent: float = min(
0.5, extra_model_arguments.get("max_table_buffer_percent", 0.11)
)
max_cpu_utilization: float = extra_model_arguments.get("max_cpu_utilization", None)
current_instance: Instance = extra_model_arguments.get("current_instance_name", None)

# Adjust heap defaults for high write clusters
if (
@@ -504,6 +517,7 @@

return _estimate_cassandra_cluster_zonal(
instance=instance,
current_instance=current_instance,
drive=drive,
context=context,
desires=desires,
@@ -517,6 +531,7 @@
max_local_disk_gib=max_local_disk_gib,
max_write_buffer_percent=max_write_buffer_percent,
max_table_buffer_percent=max_table_buffer_percent,
max_cpu_utilization=max_cpu_utilization,
)

@staticmethod
1 change: 1 addition & 0 deletions setup.py
@@ -14,6 +14,7 @@
"numpy",
'importlib_resources; python_version < "3.7"',
"isodate",
"pytest",
],
extras_require={
"aws": ["boto3"],
67 changes: 57 additions & 10 deletions tests/netflix/test_cassandra_uncertain.py
@@ -1,9 +1,11 @@
import json

from service_capacity_modeling.capacity_planner import planner
from service_capacity_modeling.interface import CapacityDesires
from service_capacity_modeling.interface import DataShape
from service_capacity_modeling.interface import Interval
from service_capacity_modeling.interface import QueryPattern

from service_capacity_modeling.interface import AccessPattern

uncertain_mid = CapacityDesires(
service_tier=1,
@@ -42,14 +44,14 @@ def test_uncertain_planning():
lr_cluster = lr.candidate_clusters.zonal[0]
assert 8 <= lr_cluster.count * lr_cluster.instance.cpu <= 64
assert (
        5_000 <= lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"] < 45_000
)

sr = mid_plan.least_regret[1]
sr_cluster = sr.candidate_clusters.zonal[0]
assert 8 <= sr_cluster.count * sr_cluster.instance.cpu <= 64
assert (
        5_000 <= sr.candidate_clusters.annual_costs["cassandra.zonal-clusters"] < 45_000
)

tiny_plan = planner.plan(
Expand All @@ -61,7 +63,7 @@ def test_uncertain_planning():
lr_cluster = lr.candidate_clusters.zonal[0]
assert 2 <= lr_cluster.count * lr_cluster.instance.cpu < 16
assert (
        1_000 < lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"] < 6_000
)


@@ -155,9 +157,9 @@ def test_worn_dataset():
lr_cluster = lr.candidate_clusters.zonal[0]
assert 128 <= lr_cluster.count * lr_cluster.instance.cpu <= 512
assert (
        250_000
        <= lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
        < 1_000_000
)
assert lr_cluster.instance.name.startswith(
"m5."
@@ -193,11 +195,56 @@ def test_very_small_has_disk():
lr_cluster = lr.candidate_clusters.zonal[0]
assert 2 <= lr_cluster.count * lr_cluster.instance.cpu < 16
assert (
        1_000
        < lr.candidate_clusters.annual_costs["cassandra.zonal-clusters"]
        < 6_000
)
if lr_cluster.instance.drive is None:
assert sum(dr.size_gib for dr in lr_cluster.attached_drives) > 10
else:
assert lr_cluster.instance.drive.size_gib > 10


def test_plan_certain():
"""
Use cpu utilization to determine instance types directly, as opposed to extrapolating it from the data shape
"""
worn_desire = CapacityDesires(
service_tier=1,
query_pattern=QueryPattern(
        access_pattern=AccessPattern.latency,
estimated_read_per_second=Interval(
low=234248, mid=351854, high=485906, confidence=0.98
),
estimated_write_per_second=Interval(
low=19841, mid=31198, high=37307, confidence=0.98
),
),
# We think we're going to have around 2 TiB of data
data_shape=DataShape(
estimated_state_size_gib=Interval(
low=2006.083, mid=2252.5, high=2480.41, confidence=0.98
),
estimated_compression_ratio=Interval(
low=1, mid=1, high=1, confidence=1
),
),
)
cap_plan = planner.plan_certain(
model_name="org.netflix.cassandra",
region="us-east-1",
num_results=3,
num_regions=4,
desires=worn_desire,
extra_model_arguments={
"required_cluster_size": 24,
"current_instance_name": "i4i.8xlarge",
"max_cpu_utilization": 14.194801291058118,
},
)

lr_clusters = cap_plan[0].candidate_clusters.zonal[0]
assert lr_clusters.count == 24
assert lr_clusters.instance.cpu < 32
