In [1]:
# home directory: c-val
# function availability : ./kubectl/functions.py

# Now the job-runner.ipynb will perform the following tasks:
# 1. get_free_node_list()
#     - save it to list - get_free_node_list[]
# 2. get_db_latest_status() 
#     - Get latest test results timestamp from validation.db for all the nodes in the db ( by accessing gcr-admin-pvc-access pod)
#     - per node per test - latest timestamp
#     - if a node has no test results - mark it with very old timestamp - highest priority
# 3. build_priority_queue()
#     - Combine free nodes list with get_db_latest_status list, and create a priority queue function that takes 
#         1. free nodes list
#         2. db latest status
#         3. Z days threshold
#     - Returns priority queue
#         1. Filtered free nodes only
#         2. skip nodes with test results not older than Z days 
#         3. order by latest test results timestamps (oldest first - highest priority) 
#     - Format of returned "job_priority_queue_list": [ nodename, priority_order, job_submission_status ]
#         [
#             [node1, 1, True],
#             [node2, 2, False],
#             ...
#         ]
# 4. batch job submission
#    - takes 
#         1. batch size: N single node jobs per batch
#         2. job queue list from build_priority_queue()
#         3. job template yaml file path  ( /home/hari/b200/validation/c-val/ymls/specific-node-job.yml )
#     - for each batch of N nodes
#         1. read job template yaml file
#         2. edit/ fill in 
#             a. node name <node-name>
#             b. job name hari-gcr-ceval-<node-name>-<timestamp>
#         3. submit job to k8s cluster and repeat N times ( for batch size )
# 5. monitor job status
#     - if a job pending for more than X minutes - cancel the job and update job_submission_status to canceled in job_priority_queue_list
# For each node in job queue list
#     - Create a job to run cluster-doctor validation tests on that node

# 6. Job run[Inside Job pod] 
#     - git clone c-val repo to /opt/c-val
#     - Run cluster-doctor tests on the pod/node and collect logs ( STDOUT/ STDERR) using tee
#     - Upon completion of tests
#         -Collect test results log ( STDOUT/ STDERR) and save it to /data/continuous_validation/<test-name>/<node-name>/<node-name>-<testname>-<timestamp>.log
#     - Update validation.db with new test results and timestamp at /data/continuous_validation/metadata/validation.db using /opt/c-val/kubectl/functions.py/add_result_local()

# 7. Generate a daily report
#     - Summary of nodes tested
#     - Summary of test results
#     - List of nodes that were never tested
#     - Save report to ./gitignored/reports/daily_report_<date>.txt

In [None]:
# Run the git_push_all.sh helper script before anything else. check=True makes
# the cell raise CalledProcessError (and stop) if the script exits non-zero.
import subprocess; subprocess.run(["/home/hari/b200/validation/others/git_push_all.sh"], check=True)

import sys
import os
import time
import datetime
import importlib

# Add the current directory to path to ensure we can import utils
# NOTE: '__file__' is a quoted string literal here, not the module attribute, so
# abspath() joins it onto the kernel's current working directory and dirname()
# strips it again -- current_dir is effectively the working directory.
current_dir = os.path.dirname(os.path.abspath('__file__'))
if current_dir not in sys.path:
    sys.path.append(current_dir)

# Import the utility functions
# Both branches bind the project helper module to the name `functions` and reload
# it, so edits made to the module on disk are picked up when this cell is re-run.
try:
    import utils.functions as functions
    importlib.reload(functions) # Force reload to get new functions
except ImportError:
    # Fallback if running from a different context
    # (the repository root is appended to sys.path and the import is retried)
    sys.path.append("/home/hari/b200/validation/c-val")
    import utils.functions as functions
    importlib.reload(functions)

# --- Run configuration (copied onto each Cluster instance in Cluster.__init__) ---
home_dir = "/home/hari/b200/validation/c-val/"  # repository root; job YAMLs are written under <home_dir>/gitignored
batch_size = 2  # number of single-node jobs submitted per batch
monitor_timeout_mins = 2  # minutes a job may stay Pending before it is cancelled
template_path = os.path.join(home_dir, "ymls/specific-node-job.yml")  # job template with the *-placeholder tokens
days_threshold = 7  # age threshold (days) passed to the priority-queue builder

class Cluster:
    """Orchestrates one continuous-validation pass over the free nodes of a cluster.

    Typical flow: ``refresh_state()`` -> ``build_priority_queue()`` ->
    ``process_full_queue()``. All cluster access goes through the project helper
    module imported as ``functions``; this class keeps the cached state and runs
    the batching / monitoring loops.

    Queue items are mutable lists ``[node_name, priority_order, submitted_flag]``.
    ``run_batch()`` flips the flag to True once a job has been submitted (or
    mock-submitted in a dry run).

    Attributes:
        ns: Kubernetes namespace passed to the helper functions.
        timestamp: Integer epoch seconds captured when the instance was created.
        freenode_list: Cached result of ``functions.get_free_node_list()``.
        db_status: Cached, parsed DB status (empty dict if fetching failed).
        job_queue: Queue produced by ``build_priority_queue()``.
        template_path, home_dir, batch_size, monitor_timeout_mins, days_threshold:
            Copies of the notebook-level configuration taken at construction time;
            they are the defaults used when a method argument is left as None.
    """

    # Job states after which a job is no longer monitored.
    _TERMINAL_STATUSES = ("Completed", "Succeeded", "Failed", "Aborted", "Terminated")

    def __init__(self, ns="gcr-admin"):
        self.ns = ns
        # numerical timestamp
        self.timestamp = int(time.time())
        self.freenode_list = []
        self.db_status = {}
        self.job_queue = []
        self.template_path = template_path
        self.home_dir = home_dir
        self.batch_size = batch_size
        self.monitor_timeout_mins = monitor_timeout_mins
        self.days_threshold = days_threshold

    @staticmethod
    def _now():
        """Return the current local wall-clock time used as the log prefix."""
        # Imported locally on purpose: other notebook cells rebind the global
        # name `datetime` (module vs. class), which would otherwise break every
        # method of this class depending on which cell ran last.
        import datetime as _dt
        return _dt.datetime.now().time()

    def _pending_jobs(self):
        """Return the queue items whose submitted flag is still falsy."""
        return [item for item in self.job_queue if not item[2]]

    @staticmethod
    def _render_job_yaml(template_content, node_name, ts):
        """Fill the job template for one node; returns ``(job_name, job_yaml)``."""
        job_name = f"hari-gcr-ceval-{node_name}-{ts}"
        job_yaml = template_content.replace("nodename-placeholder", node_name)
        job_yaml = job_yaml.replace("time-placeholder", str(ts))
        # The template uses generateName; swap it for the explicit name so the
        # job can be looked up by name while it is being monitored.
        job_yaml = job_yaml.replace("generateName: jobname-placeholder", f"name: {job_name}")
        return job_name, job_yaml

    def refresh_state(self):
        """
        Step 1 & 2: Get free nodes and latest DB status.

        A failure while fetching or parsing the DB status is reported and leaves
        ``db_status`` empty; it does not abort the refresh.
        """
        print(f"[{self._now()}] Refreshing cluster state...")

        # 1. Get Free Node List
        self.freenode_list = functions.get_free_node_list()
        print(f"  Found {len(self.freenode_list)} free nodes (fully avaialble).")

        # 2. Get DB Latest Status
        print("  Fetching DB status from cluster...")
        try:
            db_output = functions.get_db_latest_status(namespace=self.ns)
            self.db_status = functions.parse_db_status_output(db_output)
            print(f"  Retrieved status for {len(self.db_status)} nodes from DB.")
        except Exception as e:
            print(f"  Error fetching DB status: {e}")
            self.db_status = {}

    def build_priority_queue(self, days_threshold=None, shuffle=False):
        """
        Step 3: Build a priority queue filtering free nodes by age of last test.

        Args:
            days_threshold: Age threshold in days forwarded to
                ``functions.build_priority_queue``. None means "use the
                instance's ``days_threshold``".
            shuffle: Forwarded unchanged to ``functions.build_priority_queue``.

        Returns:
            The new job queue (also stored in ``self.job_queue``); an empty list
            when there are no free nodes.
        """
        if not self.freenode_list:
            print("No free nodes to queue.")
            self.job_queue = []
            return []

        if days_threshold is None:
            # Previously None was forwarded as-is and the configured instance
            # threshold was never used.
            days_threshold = self.days_threshold

        print(f"[{self._now()}] Building priority queue (Threshold: {days_threshold} days, Shuffle: {shuffle})...")
        self.job_queue = functions.build_priority_queue(
            self.freenode_list,
            self.db_status,
            days_threshold=days_threshold,
            shuffle=shuffle
        )

        print(f"  Queue built: {len(self.job_queue)} jobs candidates.")
        return self.job_queue

    def run_batch(self, batch_size=None, monitor_timeout_mins=None, dry_run=False):
        """
        Step 4 & 5: Submit a batch of jobs AND monitor them.

        Args:
            batch_size: Maximum number of jobs to submit in this batch. None
                means "use the instance's ``batch_size``".
            monitor_timeout_mins: Minutes a job may stay Pending before it is
                deleted. None means "use the instance's ``monitor_timeout_mins``".
            dry_run: When True nothing is written or submitted; the selected
                queue items are only marked as submitted.

        A submission that raises is reported and its queue item stays pending,
        so it does not count towards ``batch_size``.
        """
        if batch_size is None:
            batch_size = self.batch_size
        if monitor_timeout_mins is None:
            monitor_timeout_mins = self.monitor_timeout_mins

        if not self.job_queue:
            print("Job queue is empty.")
            return

        print(f"[{self._now()}] Processing batch (Size: {batch_size})...")

        pending_jobs = self._pending_jobs()
        if not pending_jobs:
            print("  No pending jobs in queue.")
            return

        if not os.path.exists(self.template_path):
            print(f"  Error: Template not found at {self.template_path}")
            return

        with open(self.template_path, 'r') as f:
            template_content = f.read()

        active_batch_jobs = []  # format: {'job_name': str, 'node': str, 'start_time': float, 'item_ref': list}
        jobs_submitted_count = 0

        # --- SUBMISSION LOOP ---
        for job_info in pending_jobs:
            if jobs_submitted_count >= batch_size:
                break

            node_name = job_info[0]
            job_name, job_yaml = self._render_job_yaml(template_content, node_name, int(time.time()))

            print(f"  > Target: {node_name} | Job: {job_name}")

            if dry_run:
                print("    [Dry Run] Job would be submitted. (Marking as done in queue)")
                job_info[2] = True  # Mark submitted mock
                jobs_submitted_count += 1
                continue

            # The rendered YAML is written under <home_dir>/gitignored and kept
            # afterwards on purpose, for debugging/inspection.
            temp_dir = os.path.join(self.home_dir, "gitignored")
            os.makedirs(temp_dir, exist_ok=True)
            temp_path = os.path.join(temp_dir, f"{job_name}.yaml")

            try:
                with open(temp_path, 'w') as temp_f:
                    temp_f.write(job_yaml)
                out = functions.create_job(temp_path)
                print(f"    Submitted: {out.strip()}")
            except Exception as e:
                print(f"    Failed to submit: {e}")
                continue

            # Update queue info status (submitted=True)
            job_info[2] = True
            active_batch_jobs.append({
                'job_name': job_name,
                'node': node_name,
                'start_time': time.time(),
                'item_ref': job_info  # Reference to the queue item
            })
            jobs_submitted_count += 1

        if dry_run:
            print("Batch dry-run complete.")
            return

        self._monitor_jobs(active_batch_jobs, monitor_timeout_mins)

    def _monitor_jobs(self, active_batch_jobs, monitor_timeout_mins):
        """Poll the submitted jobs once a minute until none of them is tracked any more.

        A job stops being tracked when it reaches a terminal status, or when it
        has been Pending for longer than the timeout (it is then deleted).
        Jobs in any other state (running / unknown) are polled indefinitely.
        NOTE(review): the notebook plan says a cancelled job should be flagged
        as canceled in the queue; the queue item is left as submitted here.
        """
        print(f"  Scanning {len(active_batch_jobs)} jobs for status (Timeout: {monitor_timeout_mins}m)...")
        timeout_seconds = monitor_timeout_mins * 60

        while active_batch_jobs:
            print(f"  [{self._now()}] Checking specific job statuses...")

            # Iterate backwards so finished jobs can be popped safely
            for i in range(len(active_batch_jobs) - 1, -1, -1):
                job = active_batch_jobs[i]
                jname = job['job_name']
                elapsed = time.time() - job['start_time']

                status = functions.get_job_status(jname, namespace=self.ns)
                print(f"    [{jname}] Status: {status} (Elapsed: {elapsed:.0f}s)")

                if status in self._TERMINAL_STATUSES:
                    print(f"    Job {jname}: {status}. Finished.")
                    active_batch_jobs.pop(i)
                elif status == "Pending" and elapsed > timeout_seconds:
                    print(f"    Job {jname}: Timed out ({elapsed:.0f}s > {timeout_seconds}s). Cancelling...")
                    functions.delete_job(jname, namespace=self.ns)
                    active_batch_jobs.pop(i)

            if not active_batch_jobs:
                break

            time.sleep(60)  # Poll every minute

        print("Batch monitoring complete.")

    def process_full_queue(self, batch_size=None, monitor_timeout_mins=None, dry_run=False):
        """
        Runs multiple batches until the queue is empty.

        Stops early when a batch does not reduce the number of pending jobs
        (missing template, every submission failing, non-positive batch size);
        without that guard the same jobs would be retried forever.

        Args:
            batch_size: Jobs per batch; None means the instance's ``batch_size``.
            monitor_timeout_mins: Pending timeout per batch; None means the
                instance's ``monitor_timeout_mins``.
            dry_run: Forwarded to ``run_batch``.
        """
        if batch_size is None:
            batch_size = self.batch_size
        if monitor_timeout_mins is None:
            monitor_timeout_mins = self.monitor_timeout_mins

        print(f"[{self._now()}] Starting Full Queue Processing (Dry Run: {dry_run})...")

        while True:
            pending_before = len(self._pending_jobs())
            if not pending_before:
                print("No more pending jobs in the queue. All done.")
                break

            print(f"\n--- Batch Start (Remaining: {pending_before}) ---")
            self.run_batch(batch_size=batch_size, monitor_timeout_mins=monitor_timeout_mins, dry_run=dry_run)

            if len(self._pending_jobs()) >= pending_before:
                print("No jobs could be submitted in this batch; stopping to avoid an endless retry loop.")
                break

            # Optional: Short pause between batches if not dry_run to allow cluster stabilization
            if not dry_run and pending_before > batch_size:
                time.sleep(10)

    def latest_test_results(self):
        """Return the cached DB status mapping loaded by refresh_state()."""
        return self.db_status

    def freenodes(self):
        """Return the cached free-node list loaded by refresh_state()."""
        return self.freenode_list

[1;33mChecking Deep Learning Unit Test at /home/hari/b200/validation/deeplearning_unit_test ...[0m
 → Branch: [0;32mmain[0m
[1;33mNo changes to push for Deep Learning Unit Test.[0m
----------------------------------------
[1;33mChecking Distributed Training Tools at /home/hari/b200/validation/distrbuted_training_tools ...[0m
 → Branch: [0;32mmain[0m
[1;33mNo changes to push for Distributed Training Tools.[0m
----------------------------------------
[1;33mChecking LLM fine-tuning with QLoRA at /home/hari/b200/LLM-finetune-for-function-calling-with-QLoRa ...[0m
 → Branch: [0;32mmain[0m
[1;33mNo changes to push for LLM fine-tuning with QLoRA.[0m
----------------------------------------
[1;33mChecking Cluster Validation Runbook at /home/hari/b200/validation/Cluster-Validation-Runbook ...[0m
 → Branch: [0;32mmain[0m
[1;33mNo changes to push for Cluster Validation Runbook.[0m
----------------------------------------
[1;33mChecking c-val at /home/hari/b200/validation/

To https://github.com/IamNirmata/c-val.git
   96d2d6b..5c59469  main -> main


In [None]:
# Create the controller for the gcr-admin namespace and load the free-node list
# and latest DB status once; later cells read these cached values.
cluster = Cluster(ns="gcr-admin")
cluster.refresh_state()
free_nodes = cluster.freenodes()
db_records = cluster.latest_test_results()
print(f"Free Nodes: {len(free_nodes)}")
print(f"DB Records: {len(db_records)}")

[12:16:44.890244] Refreshing cluster state...
  Found 53 free nodes (fully avaialble).
  Fetching DB status from cluster...
  Retrieved status for 70 nodes from DB.
Free Nodes: 53
DB Records: 70


In [4]:
#summary table
from datetime import datetime, timezone
from zoneinfo import ZoneInfo

LA = ZoneInfo("America/Los_Angeles")

def add_age_and_validity(results: dict[str, int], validity_days: float = 2.0, *, tz=None):
    """Turn a ``{node: epoch_seconds}`` mapping into report rows with age and validity.

    Args:
        results: Mapping of node name to the epoch timestamp (seconds) of its
            latest test result. Values are passed through ``int()``.
        validity_days: A result is "valid" while its age is at most this many days.
        tz: Time zone used for the human-readable timestamp. None means the
            notebook-level ``LA`` zone.

    Returns:
        A list of dicts (one per node, in the mapping's iteration order) with the
        keys ``node``, ``timestamp_num``, ``timestamp_ca``, ``age_days`` (float)
        and ``validity`` ("valid" or "expired").
    """
    # Imported locally: another cell runs `import datetime`, which rebinds the
    # global name to the module and would make `datetime.now(...)` fail here.
    from datetime import datetime, timezone

    if tz is None:
        tz = LA

    now_utc = datetime.now(timezone.utc)
    out = []
    for node, ts in results.items():
        ts_num = int(ts)
        ts_dt_utc = datetime.fromtimestamp(ts_num, tz=timezone.utc)
        age_days = (now_utc - ts_dt_utc).total_seconds() / 86400.0  # 86400 seconds per day

        out.append({
            "node": node,
            "timestamp_num": ts_num,
            "timestamp_ca": ts_dt_utc.astimezone(tz).strftime("%Y-%m-%d %H:%M:%S %Z"),
            "age_days": age_days,
            "validity": "valid" if age_days <= validity_days else "expired",
        })
    return out

def print_results_table(rows, validity_days: float, *, tz=None):
    """Print the rows from ``add_age_and_validity`` as an aligned text table.

    Args:
        rows: Iterable of dicts with the keys ``node``, ``timestamp_ca``,
            ``age_days`` (number) and ``validity`` ("valid" or anything else,
            which is shown as expired). May be empty, in which case only the
            heading is printed.
        validity_days: Shown in the first line of the output.
        tz: Time zone for the "Now" line. None means the notebook-level ``LA`` zone.
    """
    # Imported locally: another cell runs `import datetime`, which rebinds the
    # global name to the module and would make `datetime.now(...)` fail here.
    from datetime import datetime, timezone

    if tz is None:
        tz = LA

    headers = ["node", "timestamp_ca", "age_days", "validity"]

    # sort for stable output, then build the display strings
    formatted = [
        {
            "node": r["node"],
            "timestamp_ca": r["timestamp_ca"],
            "age_days": f'{r["age_days"]:.2f}',
            "validity": "✅ valid" if r["validity"] == "valid" else "❌ expired",
        }
        for r in sorted(rows, key=lambda x: x["node"])
    ]

    # column widths; max() gets a list so it also works with zero rows
    # (max(len(h)) with a single int argument raises TypeError)
    widths = {h: max([len(h), *(len(fr[h]) for fr in formatted)]) for h in headers}
    separator = "-" * (sum(widths[h] for h in headers) + 3 * (len(headers) - 1))

    now_ca = datetime.now(timezone.utc).astimezone(tz).strftime("%Y-%m-%d %H:%M:%S %Z")
    print(f"Now (CA): {now_ca} | Validity window: {validity_days} days")
    print(separator)

    # header
    print(" | ".join(h.ljust(widths[h]) for h in headers))
    print(separator)

    # rows
    for fr in formatted:
        print(" | ".join(fr[h].ljust(widths[h]) for h in headers))

# usage
validity_days = 4
rows = add_age_and_validity(cluster.latest_test_results(), validity_days=validity_days)
print_results_table(rows, validity_days)


Now (CA): 2026-01-23 12:18:02 PST | Validity window: 4 days
--------------------------------------------------------------------
node                | timestamp_ca            | age_days | validity 
--------------------------------------------------------------------
slc01-cl02-hgx-0003 | 2026-01-19 19:42:26 PST | 3.69     | ✅ valid  
slc01-cl02-hgx-0008 | 2026-01-19 19:48:41 PST | 3.69     | ✅ valid  
slc01-cl02-hgx-0012 | 2026-01-19 19:42:26 PST | 3.69     | ✅ valid  
slc01-cl02-hgx-0015 | 2026-01-21 10:23:02 PST | 2.08     | ✅ valid  
slc01-cl02-hgx-0024 | 2026-01-19 19:48:42 PST | 3.69     | ✅ valid  
slc01-cl02-hgx-0031 | 2026-01-19 19:48:43 PST | 3.69     | ✅ valid  
slc01-cl02-hgx-0034 | 2026-01-19 19:54:56 PST | 3.68     | ✅ valid  
slc01-cl02-hgx-0047 | 2026-01-19 19:54:56 PST | 3.68     | ✅ valid  
slc01-cl02-hgx-0049 | 2026-01-19 20:33:31 PST | 3.66     | ✅ valid  
slc01-cl02-hgx-0062 | 2026-01-19 19:54:57 PST | 3.68     | ✅ valid  
slc01-cl02-hgx-0066 | 2026-01-16 13:08:23 P

In [5]:
# Age threshold (days) for this run, passed explicitly to build_priority_queue below.
# NOTE: this rebinds the notebook-level `days_threshold` only; the value an
# existing Cluster instance copied in __init__ is not changed by it.
days_threshold = 4
# print(f"[{datetime.now().time()}] Starting validation run...")
print(f"Free nodes available: {len(cluster.freenodes())}")
print(f"Days threshold: {days_threshold}")


# Build Queue
# The summary-table cell runs `from datetime import datetime`, which leaves the
# global name `datetime` bound to the class. This re-import restores the module
# binding for code that uses the `datetime.datetime...` form. It must stay ahead
# of the calls below.
import datetime
queue = cluster.build_priority_queue(days_threshold=days_threshold, shuffle=False)

# Process the entire queue with batches. 
# dry_run=True for tests 
# With dry_run=False this submits real jobs: 5 per batch, and a job that stays
# Pending for more than 3 minutes is cancelled.
cluster.process_full_queue(batch_size=5, monitor_timeout_mins=3, dry_run=False) # Set dry_run=True for testing without submission

Free nodes available: 53
Days threshold: 4
[12:18:26.196714] Building priority queue (Threshold: 4 days, Shuffle: False)...
Building priority queue at 2026-01-23T20:18:26.196749+00:00 with threshold 4 days
  Skipping node slc01-cl02-hgx-0012: Age 3.69 days
  Skipping node slc01-cl02-hgx-0015: Age 2.08 days
  Skipping node slc01-cl02-hgx-0114: Age 3.70 days
  Skipping node slc01-cl02-hgx-0130: Age 3.68 days
  Skipping node slc01-cl02-hgx-0190: Age 3.67 days
  Skipping node slc01-cl02-hgx-0212: Age 3.69 days
  Skipping node slc01-cl02-hgx-0237: Age 3.67 days
  Skipping node slc01-cl02-hgx-0247: Age 3.67 days
  Skipping node slc01-cl02-hgx-0258: Age 3.70 days
  Skipping node slc01-cl02-hgx-0327: Age 3.67 days
  Skipping node slc01-cl02-hgx-0378: Age 3.66 days
  Skipping node slc01-cl02-hgx-0402: Age 3.66 days
  Skipping node slc01-cl02-hgx-0403: Age 3.66 days
  Queue built: 40 jobs candidates.
[12:18:26.197126] Starting Full Queue Processing (Dry Run: False)...

--- Batch Start (Remaining

In [None]:
# Debug: summarise the queue -- total entries, how many have been marked as
# submitted (third field truthy) and how many are still waiting.
print(f"Total jobs in queue: {len(cluster.job_queue)}")
submitted = list(filter(lambda item: item[2], cluster.job_queue))
print(f"Submitted jobs count: {len(submitted)}")
pending = list(filter(lambda item: not item[2], cluster.job_queue))
print(f"Pending jobs count: {len(pending)}")