In [20]:
%load_ext autoreload
%autoreload 2
import io
import uuid

import numpy
import pandas
import psycopg.sql
import pyarrow
import pyarrow.parquet

import jobqueue
from jobqueue.connection_manager import ConnectionManager


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# from enum import Enum, auto

# class ColumnSource(Enum):
#     run_column=auto()
#     run
#     run_column=auto()
#     run_column=auto()
#     run_column=auto()
#     run_column=auto()

from dataclasses import dataclass

    

@dataclass
class Column:
    """Pairs a column's name with a string naming its pandas type."""

    name: str
    pandas_type: str
    

In [26]:

# (column name, type label) pairs for the run-level columns.
run_columns = [
    ('run_timestamp', 'datetime'),
    ('run_id', 'uuid'),
    ('job_id', 'uuid'),
    # A missing comma here previously fused the two strings into the single
    # string 'seedint64' instead of a (name, type) tuple.
    ('seed', 'int64'),
    ('slurm_job_id', 'str'),
    ('task_version', 'int16'),
    ('num_nodes', 'int16'),
    ('num_cpus', 'int16'),
    ('num_gpus', 'int16'),
    ('gpu_memory', 'int32'),
    ('host_name', 'str'),
    ('batch', 'str'),
    ('run_data', 'object'),
    ('run_history', 'object'),
    ('run_extended_history', 'object'),
]

# (column name, type label) pairs for the experiment-level columns.
experiment_columns = [
    ('experiment_id', 'uuid'),
    ('experiment_attrs', 'integer[]'),
    ('experiment_tags', 'integer[]'),
    ('old_experiment_id', 'integer'),
]

# (query alias, column name) pairs.  The order must match the SELECT list of
# the query that consumes this list, because result rows are read by position.
columns = [
    ('e', 'experiment_id'),
    ('e', 'old_experiment_id'),
    # The aggregated attributes come from the lateral subquery aliased `ea`.
    ('ea', 'experiment_attrs'),
    ('r', 'run_id'),
    ('r', 'job_id'),
    ('r', 'slurm_job_id'),
    ('r', 'run_timestamp'),
    ('r', 'seed'),
    ('r', 'task_version'),
    ('r', 'num_nodes'),
    ('r', 'num_cpus'),
    ('r', 'num_gpus'),
    ('r', 'gpu_memory'),
    ('r', 'host_name'),
    ('r', 'batch'),
    ('r', 'run_data'),
    ('r', 'run_history'),
    ('r', 'run_extended_history'),
    ('s', 'queue'),
    ('s', 'status'),
    ('s', 'priority'),
    ('s', 'start_time'),
    ('s', 'update_time'),
    ('s', 'worker'),
    ('s', 'error_count'),
    ('d', 'command'),
]


def _decode_parquet(raw):
    """Deserialize a parquet-encoded bytes-like value into a pandas DataFrame."""
    with io.BytesIO(raw) as buffer:
        return pyarrow.parquet.read_table(
            pyarrow.PythonFile(buffer, mode='r')).to_pandas()


def _set_cell(data, name, row_number, value):
    """Store `value` at row `row_number` of column `name` in `data`.

    The column is created if absent and padded with None up to `row_number`.
    Writing by index (rather than appending) keeps every column aligned even
    when the same name is written more than once for a row; the last write
    wins in that case.
    """
    cells = data.setdefault(name, [])
    missing = row_number + 1 - len(cells)
    if missing > 0:
        cells.extend([None] * missing)
    cells[row_number] = value


# JSON columns whose top-level keys are expanded into their own DataFrame
# columns.  The column itself stays in the frame as an all-None placeholder.
expanded_columns = {'experiment_attrs', 'run_data'}

# NOTE(review): the parquet branch originally tested for 'run_data', which was
# unreachable because 'run_data' is already handled as an expanded JSON column
# (and the query filters it with jsonb_path_exists, so it is JSON).  Presumably
# the parquet decoding was meant for the history columns -- confirm that they
# hold parquet bytes.  Values that are not bytes-like are passed through as-is.
parquet_columns = {'run_history', 'run_extended_history'}

credentials = jobqueue.load_credentials('dmp')
with ConnectionManager(credentials) as connection:
    with connection.cursor(binary=True) as cursor:
        query = psycopg.sql.SQL("""
SELECT
    e.experiment_id,
    e.old_experiment_id,
    ea.experiment_attrs,
    r.run_id,
    r.job_id,
    r.slurm_job_id,
    r.run_timestamp,
    r.seed,
    r.task_version,
    r.num_nodes,
    r.num_cpus,
    r.num_gpus,
    r.gpu_memory,
    r.host_name,
    r.batch,
    r.run_data,
    r.run_history,
    r.run_extended_history,
    s.queue,
    s.status,
    s.priority,
    s.start_time,
    s.update_time,
    s.worker,
    s.error_count,
    d.command
FROM
    run r,
    experiment e,
    job_status s,
    job_data d,
    lateral (
        select
            jsonb_object_agg(
                a.kind,
                coalesce(
                    to_jsonb(a.value_bool),
                    to_jsonb(a.value_int),
                    to_jsonb(a.value_float),
                    to_jsonb(a.value_str),
                    to_jsonb(a.value_json)
                )) experiment_attrs
        FROM
            (
                SELECT attr_id FROM unnest(e.experiment_attrs) attr_id
                UNION ALL
                SELECT attr_id FROM unnest(e.experiment_tags) attr_id
            )  attr_id
            INNER JOIN attr a ON (a.attr_id = attr_id.attr_id)
    ) ea
WHERE TRUE
    AND r.experiment_id = e.experiment_id
    AND r.job_id = d.id
    AND r.job_id = s.id
    AND s.status = 2
    AND jsonb_path_exists(r.run_data, '$.record_times[*] == true')
    AND batch = 'make_batch_optimizer_butter_growth_eagle_gpu_1'
    AND jsonb_path_exists(d.command, '$.tags.butter_growth[*] == true')
    AND e.experiment_attrs && (
        SELECT array_agg(attr_id)
        FROM attr
        WHERE kind = 'model_depth' and value_int < 5)
    AND e.experiment_attrs @> (
        SELECT array_agg(attr_id)
        FROM attr
        WHERE (kind = 'type' and value_str = 'GrowthExperiment')
        OR (kind = 'classifier' and value_str = 'SGD')
       )
LIMIT 10;
""")

        cursor.execute(query, binary=True)

        # Column-oriented accumulator: one list per output column, seeded with
        # the selected columns so they come first in the resulting frame.
        data = {column: [] for _, column in columns}
        num_rows = 0
        for row_number, row in enumerate(cursor):
            num_rows = row_number + 1
            # `columns` is ordered like the SELECT list, so pair by position.
            for (_, column), value in zip(columns, row):
                if column in expanded_columns:
                    # A NULL JSON value (e.g. an aggregate over no attributes)
                    # simply contributes no keys for this row.
                    if value is not None:
                        for kind, val in value.items():
                            _set_cell(data, kind, row_number, val)
                    continue

                if (column in parquet_columns
                        and isinstance(value, (bytes, bytearray, memoryview))):
                    value = _decode_parquet(value)

                _set_cell(data, column, row_number, value)

        # Pad every column (placeholders and sparsely-populated keys) so all
        # lists share the same length before building the frame.
        for cells in data.values():
            cells.extend([None] * (num_rows - len(cells)))

        df = pandas.DataFrame(data=data)

# df.describe()
# 'run_data' is expanded into per-key columns above, so this placeholder
# column contains only None.
df['run_data'].head()


0    None
1    None
2    None
3    None
4    None
Name: run_data, dtype: object