16 changes: 8 additions & 8 deletions torchx/schedulers/kubernetes_mcad_scheduler.py
@@ -369,7 +369,7 @@ def create_pod_group(
pod_group_name = app_id + "-pg" + str(role_idx)

labels = object_labels(app, app_id)
labels.update({"appwrapper.mcad.ibm.com": app_id})
labels.update({"appwrapper.codeflare.dev": app_id})

pod_group: Dict[str, Any] = {
"apiVersion": "scheduling.sigs.k8s.io/v1alpha1",
@@ -434,7 +434,7 @@ def mcad_svc(
target_port=int(service_port),
)
],
selector={"appwrapper.mcad.ibm.com": svc_name},
selector={"appwrapper.codeflare.dev": svc_name},
session_affinity="None",
type="ClusterIP",
),
@@ -596,7 +596,7 @@ def app_to_resource(

"""
Create Service:
- The selector will have the key 'appwrapper.mcad.ibm.com', and the value will be
+ The selector will have the key 'appwrapper.codeflare.dev', and the value will be
the appwrapper name
"""

@@ -627,7 +627,7 @@ def app_to_resource(
enable_retry(job_spec, appwrapper_retries, total_pods)

resource: Dict[str, object] = {
"apiVersion": "mcad.ibm.com/v1beta1",
"apiVersion": "codeflare.dev/v1beta1",
"kind": "AppWrapper",
"metadata": {"name": unique_app_id, "namespace": namespace},
"spec": job_spec,
@@ -947,7 +947,7 @@ def schedule(self, dryrun_info: AppDryRunInfo[KubernetesMCADJob]) -> str:

try:
resp = self._custom_objects_api().create_namespaced_custom_object(
group="mcad.ibm.com",
group="codeflare.dev",
version="v1beta1",
namespace=namespace,
plural="appwrappers",
@@ -1035,7 +1035,7 @@ def _validate(self, app: AppDef, scheduler: str) -> None:
def _cancel_existing(self, app_id: str) -> None:
namespace, name = app_id.split(":")
self._custom_objects_api().delete_namespaced_custom_object(
group="mcad.ibm.com",
group="codeflare.dev",
version="v1beta1",
namespace=namespace,
plural="appwrappers",
@@ -1096,7 +1096,7 @@ def describe(self, app_id: str) -> Optional[DescribeAppResponse]:

# Production section
api_instance = self._custom_objects_api
group = "mcad.ibm.com"
group = "codeflare.dev"
version = "v1beta1"
plural = "appwrappers"
try:
@@ -1214,7 +1214,7 @@ def list(self) -> List[ListAppResponse]:
namespace = active_context["context"]["namespace"]

resp = self._custom_objects_api().list_namespaced_custom_object(
group="mcad.ibm.com",
group="codeflare.dev",
version="v1beta1",
namespace=namespace,
plural="appwrappers",
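The hunks above rename the AppWrapper CRD group from mcad.ibm.com to codeflare.dev in every call the scheduler makes against the cluster, together with the pod-group and service selector label. A minimal sketch of querying the renamed group directly, assuming the codeflare.dev AppWrapper CRD is installed and a kubeconfig is reachable (the "default" namespace is a placeholder):

from kubernetes import client, config

config.load_kube_config()

# List AppWrappers under the renamed CRD group; before this change the
# group argument would have been "mcad.ibm.com".
resp = client.CustomObjectsApi().list_namespaced_custom_object(
    group="codeflare.dev",
    version="v1beta1",
    namespace="default",  # placeholder namespace
    plural="appwrappers",
)
for item in resp.get("items", []):
    print(item["metadata"]["name"])

Note that a service selector still carrying the old appwrapper.mcad.ibm.com label key would silently stop matching pods after the rename, which is why the label key and the API group move together in this diff.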
44 changes: 24 additions & 20 deletions torchx/schedulers/ray/ray_driver.py
@@ -45,7 +45,7 @@
from torchx.schedulers.ray.ray_common import RayActor, TORCHX_RANK0_HOST

_logger: logging.Logger = logging.getLogger(__name__)
- _logger.setLevel(logging.getLevelName(os.environ.get("LOGLEVEL", "INFO")))
+ _logger.setLevel(logging.getLevelName(os.environ.get("LOGLEVEL", "DEBUG")))
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))


@@ -81,6 +81,9 @@ def exec_module(
worker_evn.update(os.environ)
worker_evn.update(self.env)
worker_evn[TORCHX_RANK0_HOST] = master_addr
+ _logger.info(self.cmd)
+ _logger.info("worker env: %s", worker_evn)
+
popen = subprocess.Popen(self.cmd, env=worker_evn)

returncode = popen.wait()
@@ -96,12 +99,13 @@ def schedule(self, actor_id: str) -> CommandActorScheduled:
return CommandActorScheduled(actor_id)

def get_actor_address_and_port(self) -> Tuple[str, int]:
- addr = ray.util.get_node_ip_address()
+ # addr = ray.util.get_node_ip_address()
+ addr = os.getenv("MY_POD_IP")
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
s.bind(("", 0))
s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
port = s.getsockname()[1]
- return addr, port
+ return addr, 49782


def load_actor_json(filename: str) -> List[RayActor]:
@@ -122,7 +126,10 @@ def create_placement_group_async(replicas: List[RayActor]) -> PlacementGroup:
for replica in replicas:
bundles.append({"CPU": replica.num_cpus, "GPU": replica.num_gpus})

- pg = ray.util.placement_group(bundles, strategy="SPREAD")
+ pg = ray.util.placement_group(
+     bundles,
+     strategy="SPREAD",
+ )
return pg


@@ -216,6 +223,14 @@ def create_and_schedule_actor(self, pg: PlacementGroup, replica: RayActor) -> None:
replica=replica,
)

+ if self.master_node_id is None:
+     self.master_node_id = actor_id
+     self.rank_0_address, self.rank_0_port = ray.get(
+         actor.get_actor_address_and_port.remote()
+     )
+
+ _logger.info(f"rdzv_endpoint set to {self.rank_0_address} for actor {actor_id}")

def place_command_actors(self) -> None:
"""Creating all command actors in all placement groups"""
# find the placement group index for a replica(actor's specification)
@@ -242,23 +257,11 @@ def _step(self) -> bool:
if isinstance(result, CommandActorScheduled):
if not self.terminating:
actor = self.actor_info_of_id[result.id].actor
- if self.master_node_id is None:
-     # make this actor be the master node
-     self.master_node_id = result.id
-     self.rank_0_address, self.rank_0_port = ray.get(
-         actor.get_actor_address_and_port.remote()  # pyre-ignore
-     )
-     self.active_tasks.append(
-         actor.exec_module.remote(  # pyre-ignore
-             "localhost", 0, result.id
-         )
-     )
- else:
-     self.active_tasks.append(
-         actor.exec_module.remote(
-             self.rank_0_address, self.rank_0_port, result.id
-         )
-     )
+ self.active_tasks.append(
+     actor.exec_module.remote(
+         self.rank_0_address, self.rank_0_port, result.id
+     )
+ )
self.command_actors_count += 1
elif isinstance(result, TaskCompleted):
self.terminating = (
@@ -293,6 +296,7 @@ def run(self) -> None:

def main() -> None: # pragma: no cover
actors: List[RayActor] = load_actor_json("actors.json")
+ _logger.info(actors)
driver = RayDriver(actors)
ray.init(address="auto", namespace="torchx-ray")
driver.init_placement_groups()
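For context on the get_actor_address_and_port changes above: binding a socket to port 0 asks the OS for any currently free port, whereas the new code pins the rendezvous port to 49782 and reads the pod IP from the MY_POD_IP environment variable. A self-contained sketch of the free-port pattern being replaced, stdlib only:

import socket
from contextlib import closing

def find_free_port() -> int:
    """Ask the OS for an ephemeral port by binding to port 0."""
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
        s.bind(("", 0))  # port 0: the OS picks a free port
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]

if __name__ == "__main__":
    print(find_free_port())  # a different port on each run

A fixed port gives a predictable rendezvous endpoint, but it can collide if two command actors ever land on the same host, so the hardcoded 49782 only holds up when exactly one command actor runs per pod.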
6 changes: 3 additions & 3 deletions torchx/schedulers/ray_scheduler.py
@@ -355,13 +355,13 @@ def wait_until_finish(self, app_id: str, timeout: int = 30) -> None:

def _parse_app_id(self, app_id: str) -> Tuple[str, str]:
# find index of '-' in the first :\d+-
m = re.search(r":\d+-", app_id)
m = re.search(r":\d+-|.com-|.org-|.net-|.gov-|.io-" , app_id)
if m:
sep = m.span()[1]
- addr = app_id[: sep - 1]
+ addr = app_id[: sep-1]
app_id = app_id[sep:]
return addr, app_id

addr, _, app_id = app_id.partition("-")
return addr, app_id

Expand Down