## Extract Generalists' (Your Algorithms') Tables

In [13]:
import pandas as pd
import wandb
import os
import json
from datetime import datetime
from rich.progress import Progress

# Client for the Weights & Biases public API, created with a timeout of 60.
api = wandb.Api(timeout=60)

# Entity and project fall back to this notebook's original values. Setting the
# WANDB_ENTITY / WANDB_PROJECT environment variables overrides them, so the
# messages printed below describe where each value really came from.
entity = os.environ.get('WANDB_ENTITY') or 'jayden-teoh'
project = os.environ.get('WANDB_PROJECT') or 'MORL-Baselines'

# Summary key of the logged table to export, and the tag / group used to
# select runs in the following cells.
TABLE_TO_EXTRACT = 'eval/discounted_front'
ENV_NAME = "MOLunarLanderDR-v0"
WANDB_GROUP = "domain_randomization"

if not entity:
    raise ValueError("Entity not provided and environment variable 'WANDB_ENTITY' is not set.")
entity_source = "environment variable 'WANDB_ENTITY'" if os.environ.get('WANDB_ENTITY') else "notebook default"
print(f"Using entity from {entity_source}: {entity}.")

if not project:
    raise ValueError("Project not provided and environment variable 'WANDB_PROJECT' is not set.")
project_source = "environment variable 'WANDB_PROJECT'" if os.environ.get('WANDB_PROJECT') else "notebook default"
print(f"Using project from {project_source}: {project}.")

Fetched entity from environment variable 'WANDB_ENTITY': jayden-teoh.
Fetched project from environment variable 'WANDB_PROJECT': MORL-Baselines.


In [14]:
# Select runs that belong to the configured W&B group and carry the
# environment name as a tag.
filters = {"group": WANDB_GROUP, "tags": {"$in": [ENV_NAME]}}
try:
    # per_page=1 keeps this probe small; only its length is used below, as the
    # total for the progress bar in the next cell.
    runs_sample = api.runs(path=f"{entity}/{project}", per_page=1, filters=filters)
    total_runs = len(runs_sample)
except Exception as e:
    # Adjacent string literals are used instead of backslash continuations so
    # the message does not pick up the source indentation; `from e` keeps the
    # original traceback attached.
    raise ValueError(
        f"Invalid entity '{entity}' or project '{project}': {e}\n\n"
        "Also, make sure you are properly authenticated. "
        "You can authenticate by using 'wandb.login()' or setting the environment variable 'WANDB_API_KEY'."
    ) from e

In [15]:
# Default CSV file name format (not used further in this cell)
date_str = datetime.now().strftime("%m%d%y")
output_file = f"{entity}-{project}-{date_str}.csv"

all_runs_data = []
counter = 0

# Work on a copy of the shared filter so the 'created_at' cursor added below
# does not leak into `filters`; otherwise re-running this cell would start
# from a stale cursor and fetch nothing.
page_filters = dict(filters)

with Progress() as progress:
    task = progress.add_task("[cyan]Fetching runs...", total=total_runs)

    last_created_at = None
    while not progress.finished:
        # On later passes, only ask for runs created after the last one seen.
        if last_created_at:
            page_filters["created_at"] = {"$gt": last_created_at}

        runs = api.runs(path=f"{entity}/{project}", per_page=100, order="created_at", filters=page_filters)
        fetched = 0
        for run in runs:
            # One record per run: identifying fields plus every summary value.
            run_data = {
                "name": run.name,
                "state": run.state,
                "path": run.path,
                **run.summary._json_dict,
            }
            all_runs_data.append(run_data)
            progress.update(task, advance=1)
            last_created_at = run.created_at
            fetched += 1

        # Nothing new came back: stop instead of spinning forever when fewer
        # runs are available than `total_runs` announced (including zero).
        if fetched == 0:
            break

df = pd.DataFrame(all_runs_data)
print(f"Found {len(df)} runs.")

Output()

Found 35 runs.


In [16]:
# Display each fetched run's path ([entity, project, run_id]) as a quick check.
df.loc[:, 'path']

0     [jayden-teoh, MORL-Baselines, kh4d7hzg]
1     [jayden-teoh, MORL-Baselines, l2r7x4vq]
2     [jayden-teoh, MORL-Baselines, jn0zc2rh]
3     [jayden-teoh, MORL-Baselines, dkb216oj]
4     [jayden-teoh, MORL-Baselines, 7g0hq8t0]
5     [jayden-teoh, MORL-Baselines, myh07pev]
6     [jayden-teoh, MORL-Baselines, kp5et3zr]
7     [jayden-teoh, MORL-Baselines, jdn7f8hi]
8     [jayden-teoh, MORL-Baselines, delwhiv5]
9     [jayden-teoh, MORL-Baselines, 7tuphzzf]
10    [jayden-teoh, MORL-Baselines, m1its6cm]
11    [jayden-teoh, MORL-Baselines, a3162z2c]
12    [jayden-teoh, MORL-Baselines, bc46w4nj]
13    [jayden-teoh, MORL-Baselines, u2npfs3k]
14    [jayden-teoh, MORL-Baselines, 8xmg03ji]
15    [jayden-teoh, MORL-Baselines, mno63zbo]
16    [jayden-teoh, MORL-Baselines, 1z2sbuu8]
17    [jayden-teoh, MORL-Baselines, nialg6hl]
18    [jayden-teoh, MORL-Baselines, lp6rpmei]
19    [jayden-teoh, MORL-Baselines, wfxc44t5]
20    [jayden-teoh, MORL-Baselines, bwx24z1u]
21    [jayden-teoh, MORL-Baselines

In [17]:
# Split the run name on '__' into env_id, algorithm, seed and time. The
# algorithm part is written back into 'name', so this cell expects the freshly
# fetched `df`; re-running it on the already-split frame will not work.
df[['env_id', 'name', 'seed', 'time']] = df['name'].str.split('__', expand=True)

# Drop rows with missing global_step. The explicit copy makes the column
# assignment below act on an independent frame.
df = df.dropna(subset=['global_step']).copy()
df['global_step'] = df['global_step'].astype(int)

# Keep only the table column(s) plus the identifying columns
# ("name", "path", "global_step", "env_id" and "seed").
columns_to_keep = df.filter(regex=f'^({TABLE_TO_EXTRACT}|name|path|global_step|env_id|seed)').columns
df = df[columns_to_keep].copy()

# Each run path is a sequence of parts; join them into a single
# "entity/project/run_id" string usable as a W&B run path.
df['path'] = df['path'].apply(lambda x: "/".join(x))
columns_to_process = [col for col in df.columns if TABLE_TO_EXTRACT in col]

output_directory = f'data/{TABLE_TO_EXTRACT}'
os.makedirs(output_directory, exist_ok=True)

for (env_id, algorithm), group in df.groupby(['env_id', 'name']):
    algo_dir = os.path.join(output_directory, env_id, algorithm)
    os.makedirs(algo_dir, exist_ok=True)

    # Loop through each seed and save the corresponding data as a CSV file
    for seed, seed_data in group.groupby('seed'):
        seed_data = seed_data.sort_values(by='global_step').set_index('global_step')

        # Collapse rows that share a global_step, keeping the first non-null
        # value of every column.
        seed_data = seed_data.groupby('global_step').first()

        seed_dir = os.path.join(algo_dir, f'seed_{seed}')
        os.makedirs(seed_dir, exist_ok=True)

        # Only the first row (lowest global_step) is exported for this seed.
        run_path = str(seed_data['path'].iloc[0])

        # Restore files using wandb API
        for col in columns_to_process:
            table_ref = seed_data[col].iloc[0]
            try:
                artifact_path = table_ref['path']
            except (TypeError, KeyError, IndexError):
                # The run has no usable table reference in this column
                # (e.g. NaN because the table was never logged).
                artifact_path = None
            if artifact_path is None or pd.isna(artifact_path):
                print(f"No '{col}' table recorded for run {run_path}; skipping.")
                continue

            try:
                restored_file = wandb.restore(artifact_path, run_path=run_path)
            except Exception as e:
                print(f"Failed to restore file for {artifact_path}: {e}")
                # Skip this table: falling through would reuse the handle from
                # a previous iteration (or hit an undefined name).
                continue
            if restored_file is None:
                print(f"Failed to restore file for {artifact_path}: not found in run {run_path}")
                continue

            # Read through the returned handle so that it is closed afterwards.
            with restored_file as json_file:
                json_data = json.load(json_file)

            json_df = pd.DataFrame(json_data['data'], columns=json_data['columns'])

            # Remove TABLE_TO_EXTRACT as a prefix. str.strip() would treat it
            # as a set of characters and eat unrelated leading/trailing
            # letters. A column equal to TABLE_TO_EXTRACT yields an empty
            # suffix, i.e. the file is named ".csv" as before.
            suffix = col[len(TABLE_TO_EXTRACT):] if col.startswith(TABLE_TO_EXTRACT) else col
            suffix = suffix.lstrip('/')  # never let the suffix turn the path absolute
            json_csv_path = os.path.join(seed_dir, f"{suffix}.csv")
            json_df.to_csv(json_csv_path, index=False)


print("Files have been successfully created.")


Files have been successfully created.


## Extract Specialists' Tables

In [36]:
import pandas as pd
import wandb
import os
import json
from datetime import datetime
from rich.progress import Progress
from helpers.utils import ENVIRONMENTS_MAP

# Client for the Weights & Biases public API, created with a timeout of 60.
api = wandb.Api(timeout=60)

# Entity and project fall back to this notebook's original values. Setting the
# WANDB_ENTITY / WANDB_PROJECT environment variables overrides them, so the
# messages printed below describe where each value really came from.
entity = os.environ.get('WANDB_ENTITY') or 'jayden-teoh'
project = os.environ.get('WANDB_PROJECT') or 'MORL-Baselines'
TABLE_TO_EXTRACT = 'eval/front' # same as 'eval/discounted_front' but just named wrongly

ENV_NAME = "MOHumanoidDR-v5" # Change this to the environment you want to extract
WANDB_GROUP = "static_environment"

if not entity:
    raise ValueError("Entity not provided and environment variable 'WANDB_ENTITY' is not set.")
entity_source = "environment variable 'WANDB_ENTITY'" if os.environ.get('WANDB_ENTITY') else "notebook default"
print(f"Using entity from {entity_source}: {entity}.")

if not project:
    raise ValueError("Project not provided and environment variable 'WANDB_PROJECT' is not set.")
project_source = "environment variable 'WANDB_PROJECT'" if os.environ.get('WANDB_PROJECT') else "notebook default"
print(f"Using project from {project_source}: {project}.")

Fetched entity from environment variable 'WANDB_ENTITY': jayden-teoh.
Fetched project from environment variable 'WANDB_PROJECT': MORL-Baselines.


In [37]:
# Select runs in the configured W&B group whose tags match any entry that
# ENVIRONMENTS_MAP lists for ENV_NAME.
# NOTE(review): presumably ENVIRONMENTS_MAP maps a DR environment to the tags
# of its static variants - confirm in helpers.utils.
filters = {"group": WANDB_GROUP, "tags": {"$in": ENVIRONMENTS_MAP[ENV_NAME]}}
try:
    # per_page=1 keeps this probe small; only its length is used below, as the
    # total for the progress bar in the next cell.
    runs_sample = api.runs(path=f"{entity}/{project}", per_page=1, filters=filters)
    total_runs = len(runs_sample)
except Exception as e:
    # Adjacent string literals are used instead of backslash continuations so
    # the message does not pick up the source indentation; `from e` keeps the
    # original traceback attached.
    raise ValueError(
        f"Invalid entity '{entity}' or project '{project}': {e}\n\n"
        "Also, make sure you are properly authenticated. "
        "You can authenticate by using 'wandb.login()' or setting the environment variable 'WANDB_API_KEY'."
    ) from e

In [38]:
# Default CSV file name format (not used further in this cell)
date_str = datetime.now().strftime("%m%d%y")
output_file = f"{entity}-{project}-{date_str}.csv"

all_runs_data = []
counter = 0

# Work on a copy of the shared filter so the 'created_at' cursor added below
# does not leak into `filters`; otherwise re-running this cell would start
# from a stale cursor and fetch nothing.
page_filters = dict(filters)

with Progress() as progress:
    task = progress.add_task("[cyan]Fetching runs...", total=total_runs)

    last_created_at = None
    while not progress.finished:
        # On later passes, only ask for runs created after the last one seen.
        if last_created_at:
            page_filters["created_at"] = {"$gt": last_created_at}

        runs = api.runs(path=f"{entity}/{project}", per_page=100, order="created_at", filters=page_filters)
        fetched = 0
        for run in runs:
            # One record per run: identifying fields plus every summary value.
            run_data = {
                "name": run.name,
                "state": run.state,
                "path": run.path,
                **run.summary._json_dict,
            }
            all_runs_data.append(run_data)
            progress.update(task, advance=1)
            last_created_at = run.created_at
            fetched += 1

        # Nothing new came back: stop instead of spinning forever when fewer
        # runs are available than `total_runs` announced (including zero).
        if fetched == 0:
            break

df = pd.DataFrame(all_runs_data)
print(f"Found {len(df)} runs.")

Output()

Found 11 runs.


In [39]:
# Split the run name on '__' into env_id, algorithm, seed and time. The
# algorithm part is written back into 'name', so this cell expects the freshly
# fetched `df`; re-running it on the already-split frame will not work.
df[['env_id', 'name', 'seed', 'time']] = df['name'].str.split('__', expand=True)

# Drop rows with missing global_step. The explicit copy makes the column
# assignment below act on an independent frame.
df = df.dropna(subset=['global_step']).copy()
df['global_step'] = df['global_step'].astype(int)

# Keep only the table column(s) plus the identifying columns
# ("name", "path", "global_step" and "env_id"; the seed is not needed here).
columns_to_keep = df.filter(regex=f'^({TABLE_TO_EXTRACT}|name|path|global_step|env_id)').columns
df = df[columns_to_keep].copy()

# Each run path is a sequence of parts; join them into a single
# "entity/project/run_id" string usable as a W&B run path.
df['path'] = df['path'].apply(lambda x: "/".join(x))
columns_to_process = [col for col in df.columns if TABLE_TO_EXTRACT in col]

output_directory = f'data/single_env/{TABLE_TO_EXTRACT}/{ENV_NAME}' # add to single_env folder
os.makedirs(output_directory, exist_ok=True)

for (env_id, algorithm), group in df.groupby(['env_id', 'name']):
    env_dir = os.path.join(output_directory, env_id)
    os.makedirs(env_dir, exist_ok=True)

    data = group.sort_values(by='global_step').set_index('global_step')
    # Collapse rows that share a global_step, keeping the first non-null value
    # of every column.
    data = data.groupby('global_step').first()

    # Only the first row (lowest global_step) is exported for this group.
    run_path = str(data['path'].iloc[0])

    # Restore files using wandb API
    for col in columns_to_process:
        table_ref = data[col].iloc[0]
        try:
            artifact_path = table_ref['path']
        except (TypeError, KeyError, IndexError):
            # The run has no usable table reference in this column
            # (e.g. NaN because the table was never logged).
            artifact_path = None
        if artifact_path is None or pd.isna(artifact_path):
            print(f"No '{col}' table recorded for run {run_path}; skipping.")
            continue

        try:
            restored_file = wandb.restore(artifact_path, run_path=run_path)
        except Exception as e:
            print(f"Failed to restore file for {artifact_path}: {e}")
            # Skip this table: falling through would reuse the handle from a
            # previous iteration (or hit an undefined name).
            continue
        if restored_file is None:
            print(f"Failed to restore file for {artifact_path}: not found in run {run_path}")
            continue

        # Read through the returned handle so that it is closed afterwards.
        with restored_file as json_file:
            json_data = json.load(json_file)

        json_df = pd.DataFrame(json_data['data'], columns=json_data['columns'])

        # Remove TABLE_TO_EXTRACT as a prefix. str.strip() would treat it as a
        # set of characters and eat unrelated leading/trailing letters. A
        # column equal to TABLE_TO_EXTRACT yields an empty suffix, so the CSV
        # lands directly in env_dir as before.
        suffix = col[len(TABLE_TO_EXTRACT):] if col.startswith(TABLE_TO_EXTRACT) else col
        suffix = suffix.lstrip('/')  # never let the suffix turn the path absolute
        table_dir = os.path.join(env_dir, suffix)
        # A non-empty suffix introduces a subdirectory that must exist before
        # to_csv can write into it.
        os.makedirs(table_dir, exist_ok=True)
        json_csv_path = os.path.join(table_dir, algorithm + '.csv')
        json_df.to_csv(json_csv_path, index=False)


print("Files have been successfully created.")


Files have been successfully created.
