Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions src/backend/server/rpc_connection_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ def node_join(self, message):
# "memory_gb": 100,
# "memory_bandwidth_gbps": 100,
# },
-# "kv_cache_ratio": 0.3,
-# "param_hosting_ratio": 0.5,
+# "kvcache_mem_ratio": 0.3,
+# "param_mem_ratio": 0.5,
# "max_concurrent_requests": 16,
# "max_sequence_length": 1024,
# }
Expand Down Expand Up @@ -162,8 +162,8 @@ def build_node(self, node_json: dict):
node_id=node_json.get("node_id"),
hardware=self.build_hardware(node_json.get("hardware")),
model_info=self.scheduler.model_info,
-kv_cache_ratio=node_json.get("kv_cache_ratio"),
-param_hosting_ratio=node_json.get("param_hosting_ratio"),
+kvcache_mem_ratio=node_json.get("kvcache_mem_ratio"),
+param_mem_ratio=node_json.get("param_mem_ratio"),
max_concurrent_requests=node_json.get("max_concurrent_requests"),
max_sequence_length=node_json.get("max_sequence_length"),
is_active=node_json.get("is_active", True),
Expand Down
4 changes: 2 additions & 2 deletions src/backend/server/static_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def estimate_vram_gb_required(model_info):
if model_info is None:
return 0

-param_hosting_ratio = 0.65
+param_mem_ratio = 0.65
return (
(
model_info.embedding_io_bytes
Expand All @@ -179,7 +179,7 @@ def estimate_vram_gb_required(model_info):
/ 1024
/ 1024
/ 1024
-/ param_hosting_ratio
+/ param_mem_ratio
)


Expand Down
8 changes: 4 additions & 4 deletions src/parallax/launch.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,8 @@
model_name=args.model_path,
max_batch_size=args.max_batch_size,
max_sequence_length=args.max_sequence_length,
-param_hosting_ratio=args.param_hosting_ratio,
-kv_cache_ratio=args.kv_cache_ratio,
+param_mem_ratio=args.param_mem_ratio,
+kvcache_mem_ratio=args.kvcache_mem_ratio,
)
if gradient_server is not None:
gradient_server.status = ServerState.READY
Expand Down Expand Up @@ -131,8 +131,8 @@
model_name=args.model_path,
max_batch_size=args.max_batch_size,
max_sequence_length=args.max_sequence_length,
-param_hosting_ratio=args.param_hosting_ratio,
-kv_cache_ratio=args.kv_cache_ratio,
+param_mem_ratio=args.param_mem_ratio,
+kvcache_mem_ratio=args.kvcache_mem_ratio,
)
args.start_layer = gradient_server.block_start_index
args.end_layer = gradient_server.block_end_index
Expand Down
20 changes: 10 additions & 10 deletions src/parallax/p2p/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,8 @@ def __init__(
model_name: Optional[str] = None,
max_batch_size: Optional[int] = None,
max_sequence_length: Optional[int] = None,
-param_hosting_ratio: float = 0.65,
-kv_cache_ratio: float = 0.25,
+param_mem_ratio: float = 0.65,
+kvcache_mem_ratio: float = 0.25,
):
self.recv_from_peer_addr = recv_from_peer_addr
self.send_to_peer_addr = send_to_peer_addr
Expand All @@ -230,8 +230,8 @@ def __init__(
self.model_name = model_name
self.max_batch_size = max_batch_size
self.max_sequence_length = max_sequence_length
-self.param_hosting_ratio = param_hosting_ratio
-self.kv_cache_ratio = kv_cache_ratio
+self.param_mem_ratio = param_mem_ratio
+self.kvcache_mem_ratio = kvcache_mem_ratio
self.prefix_id = f"{dht_prefix}_announce"
self.lattica = None
self.routing_table = None
Expand Down Expand Up @@ -686,8 +686,8 @@ def get_node_info(self, is_update: bool = False):
info = {
"node_id": self.lattica.peer_id(),
"hardware": detect_node_hardware(self.lattica.peer_id()),
-"kv_cache_ratio": self.kv_cache_ratio,
-"param_hosting_ratio": self.param_hosting_ratio,
+"kvcache_mem_ratio": self.kvcache_mem_ratio,
+"param_mem_ratio": self.param_mem_ratio,
"max_concurrent_requests": self.max_batch_size,
"max_sequence_length": (
1024 if self.max_sequence_length is None else self.max_sequence_length
Expand Down Expand Up @@ -753,8 +753,8 @@ def launch_p2p_server(
model_name: Optional[str],
max_batch_size: Optional[int] = None,
max_sequence_length: Optional[int] = None,
-param_hosting_ratio: float = 0.65,
-kv_cache_ratio: float = 0.25,
+param_mem_ratio: float = 0.65,
+kvcache_mem_ratio: float = 0.25,
):
server = GradientServer(
recv_from_peer_addr=recv_from_peer_addr,
Expand All @@ -774,8 +774,8 @@ def launch_p2p_server(
model_name=model_name,
max_batch_size=max_batch_size,
max_sequence_length=max_sequence_length,
-param_hosting_ratio=param_hosting_ratio,
-kv_cache_ratio=kv_cache_ratio,
+param_mem_ratio=param_mem_ratio,
+kvcache_mem_ratio=kvcache_mem_ratio,
)
# Start the server
thread = threading.Thread(target=server.run, daemon=True)
Expand Down
4 changes: 2 additions & 2 deletions src/parallax/server/server_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,14 @@ def parse_args() -> argparse.Namespace:
)

parser.add_argument(
-"--param-hosting-ratio",
+"--param-mem-ratio",
type=float,
default=0.65,
help="Ratio of GPU memory to use for parameter hosting",
)

parser.add_argument(
-"--kv-cache-ratio",
+"--kvcache-mem-ratio",
type=float,
default=0.25,
help="Ratio of GPU memory to use for KV cache",
Expand Down
10 changes: 5 additions & 5 deletions src/scheduling/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@ class Node:
hardware: NodeHardwareInfo
model_info: ModelInfo

-kv_cache_ratio: float = 0.3
-param_hosting_ratio: float = 0.5
+kvcache_mem_ratio: float = 0.3
+param_mem_ratio: float = 0.5

max_concurrent_requests: int = 16
max_sequence_length: int = 4096
Expand Down Expand Up @@ -221,7 +221,7 @@ def max_requests(self) -> int:
requested_max_batch_size=self.max_concurrent_requests,
max_sequence_len=self.max_sequence_length,
device=None,
-kv_cache_memory_fraction=self.kv_cache_ratio,
+kv_cache_memory_fraction=self.kvcache_mem_ratio,
num_shard_layers=self.num_current_layers,
num_key_value_heads=self.model_info.num_kv_heads,
head_dim=self.model_info.head_size,
Expand Down Expand Up @@ -278,7 +278,7 @@ def get_decoder_layer_capacity(
* 1024
* 1024
* 1024
-* self.param_hosting_ratio
+* self.param_mem_ratio
)
if include_input_embed:
available_memory_bytes -= self.model_info.embedding_io_bytes
Expand Down Expand Up @@ -312,7 +312,7 @@ def per_decoder_layer_kv_cache_memory(self) -> Optional[int]:
* 1024
* 1024
* 1024
-* self.kv_cache_ratio
+* self.kvcache_mem_ratio
)
/ self.num_current_layers
)
Expand Down
4 changes: 2 additions & 2 deletions src/scheduling/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,8 @@ def join(self, node: Node, bootstrap: bool = False) -> None:
logger.debug(
"Joining node %s (kv_ratio=%.2f, param_ratio=%.2f, manual_assignment=%s)",
node.node_id,
-node.kv_cache_ratio,
-node.param_hosting_ratio,
+node.kvcache_mem_ratio,
+node.param_mem_ratio,
node.manual_layer_assignment,
)
self.layer_allocator.declare(node)
Expand Down