diff --git a/src/backend/server/rpc_connection_handler.py b/src/backend/server/rpc_connection_handler.py index 67e4d2cc..83e4c8cb 100644 --- a/src/backend/server/rpc_connection_handler.py +++ b/src/backend/server/rpc_connection_handler.py @@ -40,8 +40,8 @@ def node_join(self, message): # "memory_gb": 100, # "memory_bandwidth_gbps": 100, # }, - # "kv_cache_ratio": 0.3, - # "param_hosting_ratio": 0.5, + # "kvcache_mem_ratio": 0.3, + # "param_mem_ratio": 0.5, # "max_concurrent_requests": 16, # "max_sequence_length": 1024, # } @@ -162,8 +162,8 @@ def build_node(self, node_json: dict): node_id=node_json.get("node_id"), hardware=self.build_hardware(node_json.get("hardware")), model_info=self.scheduler.model_info, - kv_cache_ratio=node_json.get("kv_cache_ratio"), - param_hosting_ratio=node_json.get("param_hosting_ratio"), + kvcache_mem_ratio=node_json.get("kvcache_mem_ratio"), + param_mem_ratio=node_json.get("param_mem_ratio"), max_concurrent_requests=node_json.get("max_concurrent_requests"), max_sequence_length=node_json.get("max_sequence_length"), is_active=node_json.get("is_active", True), diff --git a/src/backend/server/static_config.py b/src/backend/server/static_config.py index 609b9bb8..49045e50 100644 --- a/src/backend/server/static_config.py +++ b/src/backend/server/static_config.py @@ -169,7 +169,7 @@ def estimate_vram_gb_required(model_info): if model_info is None: return 0 - param_hosting_ratio = 0.65 + param_mem_ratio = 0.65 return ( ( model_info.embedding_io_bytes @@ -179,7 +179,7 @@ def estimate_vram_gb_required(model_info): / 1024 / 1024 / 1024 - / param_hosting_ratio + / param_mem_ratio ) diff --git a/src/parallax/launch.py b/src/parallax/launch.py index 168d4d83..f26f2e60 100644 --- a/src/parallax/launch.py +++ b/src/parallax/launch.py @@ -91,8 +91,8 @@ model_name=args.model_path, max_batch_size=args.max_batch_size, max_sequence_length=args.max_sequence_length, - param_hosting_ratio=args.param_hosting_ratio, - kv_cache_ratio=args.kv_cache_ratio, + param_mem_ratio=args.param_mem_ratio, + kvcache_mem_ratio=args.kvcache_mem_ratio, ) if gradient_server is not None: gradient_server.status = ServerState.READY @@ -131,8 +131,8 @@ model_name=args.model_path, max_batch_size=args.max_batch_size, max_sequence_length=args.max_sequence_length, - param_hosting_ratio=args.param_hosting_ratio, - kv_cache_ratio=args.kv_cache_ratio, + param_mem_ratio=args.param_mem_ratio, + kvcache_mem_ratio=args.kvcache_mem_ratio, ) args.start_layer = gradient_server.block_start_index args.end_layer = gradient_server.block_end_index diff --git a/src/parallax/p2p/server.py b/src/parallax/p2p/server.py index 9f59fff9..2c0c6f18 100644 --- a/src/parallax/p2p/server.py +++ b/src/parallax/p2p/server.py @@ -210,8 +210,8 @@ def __init__( model_name: Optional[str] = None, max_batch_size: Optional[int] = None, max_sequence_length: Optional[int] = None, - param_hosting_ratio: float = 0.65, - kv_cache_ratio: float = 0.25, + param_mem_ratio: float = 0.65, + kvcache_mem_ratio: float = 0.25, ): self.recv_from_peer_addr = recv_from_peer_addr self.send_to_peer_addr = send_to_peer_addr @@ -230,8 +230,8 @@ def __init__( self.model_name = model_name self.max_batch_size = max_batch_size self.max_sequence_length = max_sequence_length - self.param_hosting_ratio = param_hosting_ratio - self.kv_cache_ratio = kv_cache_ratio + self.param_mem_ratio = param_mem_ratio + self.kvcache_mem_ratio = kvcache_mem_ratio self.prefix_id = f"{dht_prefix}_announce" self.lattica = None self.routing_table = None @@ -686,8 +686,8 @@ def get_node_info(self, is_update: bool = False): info = { "node_id": self.lattica.peer_id(), "hardware": detect_node_hardware(self.lattica.peer_id()), - "kv_cache_ratio": self.kv_cache_ratio, - "param_hosting_ratio": self.param_hosting_ratio, + "kvcache_mem_ratio": self.kvcache_mem_ratio, + "param_mem_ratio": self.param_mem_ratio, "max_concurrent_requests": self.max_batch_size, "max_sequence_length": ( 1024 if self.max_sequence_length is None else self.max_sequence_length @@ -753,8 +753,8 @@ def launch_p2p_server( model_name: Optional[str], max_batch_size: Optional[int] = None, max_sequence_length: Optional[int] = None, - param_hosting_ratio: float = 0.65, - kv_cache_ratio: float = 0.25, + param_mem_ratio: float = 0.65, + kvcache_mem_ratio: float = 0.25, ): server = GradientServer( recv_from_peer_addr=recv_from_peer_addr, @@ -774,8 +774,8 @@ def launch_p2p_server( model_name=model_name, max_batch_size=max_batch_size, max_sequence_length=max_sequence_length, - param_hosting_ratio=param_hosting_ratio, - kv_cache_ratio=kv_cache_ratio, + param_mem_ratio=param_mem_ratio, + kvcache_mem_ratio=kvcache_mem_ratio, ) # Start the server thread = threading.Thread(target=server.run, daemon=True) diff --git a/src/parallax/server/server_args.py b/src/parallax/server/server_args.py index 11b2ed27..0d58fce8 100644 --- a/src/parallax/server/server_args.py +++ b/src/parallax/server/server_args.py @@ -61,14 +61,14 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( - "--param-hosting-ratio", + "--param-mem-ratio", type=float, default=0.65, help="Ratio of GPU memory to use for parameter hosting", ) parser.add_argument( - "--kv-cache-ratio", + "--kvcache-mem-ratio", type=float, default=0.25, help="Ratio of GPU memory to use for KV cache", diff --git a/src/scheduling/node.py b/src/scheduling/node.py index cb597dcd..7f397f53 100644 --- a/src/scheduling/node.py +++ b/src/scheduling/node.py @@ -174,8 +174,8 @@ class Node: hardware: NodeHardwareInfo model_info: ModelInfo - kv_cache_ratio: float = 0.3 - param_hosting_ratio: float = 0.5 + kvcache_mem_ratio: float = 0.3 + param_mem_ratio: float = 0.5 max_concurrent_requests: int = 16 max_sequence_length: int = 4096 @@ -221,7 +221,7 @@ def max_requests(self) -> int: requested_max_batch_size=self.max_concurrent_requests, max_sequence_len=self.max_sequence_length, device=None, - kv_cache_memory_fraction=self.kv_cache_ratio, + kv_cache_memory_fraction=self.kvcache_mem_ratio, num_shard_layers=self.num_current_layers, num_key_value_heads=self.model_info.num_kv_heads, head_dim=self.model_info.head_size, @@ -278,7 +278,7 @@ def get_decoder_layer_capacity( * 1024 * 1024 * 1024 - * self.param_hosting_ratio + * self.param_mem_ratio ) if include_input_embed: available_memory_bytes -= self.model_info.embedding_io_bytes @@ -312,7 +312,7 @@ def per_decoder_layer_kv_cache_memory(self) -> Optional[int]: * 1024 * 1024 * 1024 - * self.kv_cache_ratio + * self.kvcache_mem_ratio ) / self.num_current_layers ) diff --git a/src/scheduling/scheduler.py b/src/scheduling/scheduler.py index c6d0b081..9a8487ca 100644 --- a/src/scheduling/scheduler.py +++ b/src/scheduling/scheduler.py @@ -252,8 +252,8 @@ def join(self, node: Node, bootstrap: bool = False) -> None: logger.debug( "Joining node %s (kv_ratio=%.2f, param_ratio=%.2f, manual_assignment=%s)", node.node_id, - node.kv_cache_ratio, - node.param_hosting_ratio, + node.kvcache_mem_ratio, + node.param_mem_ratio, node.manual_layer_assignment, ) self.layer_allocator.declare(node)