In [29]:
%load_ext autoreload
%autoreload 2
import numpy
import pandas
import io
import uuid
import psycopg.sql
import pyarrow
import pyarrow.parquet

import jobqueue
from jobqueue.connection_manager import ConnectionManager


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# from enum import Enum, auto

# class ColumnSource(Enum):
#     run_column=auto()
#     run
#     run_column=auto()
#     run_column=auto()
#     run_column=auto()
#     run_column=auto()

from dataclasses import dataclass

    

@dataclass
class Column:
    """Describes a single column by its name and a type label."""

    # Name of the column.
    name: str
    # Type label kept as a plain string (presumably a pandas dtype name — confirm).
    pandas_type: str
    

In [32]:

# (column name, type label) pairs for the per-run columns.
# Every entry must be a 2-tuple: without the comma between the two literals,
# Python concatenates adjacent strings and the entry silently becomes a single
# string (e.g. 'seedint64') instead of a pair.
run_columns = [
    ('run_timestamp', 'datetime'),
    ('run_id', 'uuid'),
    ('job_id', 'uuid'),
    ('seed', 'int64'),
    ('slurm_job_id', 'str'),
    ('task_version', 'int16'),
    ('num_nodes', 'int16'),
    ('num_cpus', 'int16'),
    ('num_gpus', 'int16'),
    ('gpu_memory', 'int32'),
    ('host_name', 'str'),
    ('batch', 'str'),
    ('run_data', 'object'),
    ('run_history', 'object'),
    ('run_extended_history', 'object'),
]

# Experiment-level columns as (column name, type label) pairs, in declaration order.
# NOTE(review): the labels here look like PostgreSQL type names rather than
# pandas dtypes — confirm which vocabulary is intended.
experiment_columns = list({
    'experiment_id': 'uuid',
    'experiment_attrs': 'integer[]',
    'experiment_tags': 'integer[]',
    'old_experiment_id': 'integer',
}.items())

# (source alias, column name) pairs, listed in the same order as the SELECT
# list of the query in this cell; result rows are read positionally against
# this list, so the two must stay in sync.
# The source alias matches the alias used in that query:
#   e  -> experiment, ea -> lateral experiment-attribute aggregate,
#   r  -> run, s -> job_status, d -> job_data.
columns = [
    ('e', 'experiment_id'),
    ('e', 'old_experiment_id'),
    # Selected as ea.experiment_attrs in the query, so the source is 'ea'
    # like every other entry uses its query alias.
    ('ea', 'experiment_attrs'),
    ('r', 'run_id'),
    ('r', 'job_id'),
    ('r', 'slurm_job_id'),
    ('r', 'run_timestamp'),
    ('r', 'seed'),
    ('r', 'task_version'),
    ('r', 'num_nodes'),
    ('r', 'num_cpus'),
    ('r', 'num_gpus'),
    ('r', 'gpu_memory'),
    ('r', 'host_name'),
    ('r', 'batch'),
    ('r', 'run_data'),
    ('r', 'run_history'),
    ('r', 'run_extended_history'),
    ('s', 'queue'),
    ('s', 'status'),
    ('s', 'priority'),
    ('s', 'start_time'),
    ('s', 'update_time'),
    ('s', 'worker'),
    ('s', 'error_count'),
    ('d', 'command'),
]


# Query the runs matching the WHERE clause below (joined with their experiment,
# job status and job data) and pivot the result rows into a column-oriented
# dict that becomes the DataFrame `df`.
credentials = jobqueue.load_credentials('dmp')
with ConnectionManager(credentials) as connection:
    with connection.cursor(binary=True) as cursor:
        # The SELECT list must stay in the same order as `columns`: each row is
        # read positionally against that list further down.
        query = psycopg.sql.SQL("""
SELECT
    e.experiment_id,
    e.old_experiment_id,
    ea.experiment_attrs,
    r.run_id,
    r.job_id,
    r.slurm_job_id,
    r.run_timestamp,
    r.seed,
    r.task_version,
    r.num_nodes,
    r.num_cpus,
    r.num_gpus,
    r.gpu_memory,
    r.host_name,
    r.batch,
    r.run_data,
    r.run_history,
    r.run_extended_history,
    s.queue,
    s.status,
    s.priority,
    s.start_time,
    s.update_time,
    s.worker,
    s.error_count,
    d.command
FROM
    run r,
    experiment e,
    job_status s,
    job_data d,
    lateral (
        select
            jsonb_object_agg(
                a.kind,
                coalesce(
                    to_jsonb(a.value_bool),
                    to_jsonb(a.value_int),
                    to_jsonb(a.value_float),
                    to_jsonb(a.value_str),
                    to_jsonb(a.value_json)
                )) experiment_attrs
        FROM
            (
                SELECT attr_id FROM unnest(e.experiment_attrs) attr_id
                UNION ALL
                SELECT attr_id FROM unnest(e.experiment_tags) attr_id
            )  attr_id
            INNER JOIN attr a ON (a.attr_id = attr_id.attr_id)
    ) ea
WHERE TRUE
    AND r.experiment_id = e.experiment_id
    AND r.job_id = d.id
    AND r.job_id = s.id
    AND s.status = 2
    AND jsonb_path_exists(r.run_data, '$.record_times[*] == true')
    AND batch = 'make_batch_optimizer_butter_growth_eagle_gpu_1'
    AND jsonb_path_exists(d.command, '$.tags.butter_growth[*] == true')
    AND e.experiment_attrs && (
        SELECT array_agg(attr_id)
        FROM attr
        WHERE kind = 'model_depth' and value_int < 5)
    AND e.experiment_attrs @> (
        SELECT array_agg(attr_id)
        FROM attr
        WHERE (kind = 'type' and value_str = 'GrowthExperiment')
        OR (kind = 'classifier' and value_str = 'SGD')
       )
LIMIT 10;
""")

        cursor.execute(query, binary=True)

        # One list per output column; all lists are kept at equal length
        # (one entry per processed row) by the padding step below.
        data = {column: [] for _source, column in columns}
        for row_number, row in enumerate(cursor):
            for i, (_source, column) in enumerate(columns):
                value = row[i]
                if column in {'experiment_attrs', 'run_data'}:
                    # Spread the dict into one DataFrame column per key rather
                    # than storing the dict itself. A SQL NULL arrives as None
                    # and is treated as "no keys" instead of crashing on .items().
                    # NOTE(review): a key equal to a fixed column name, or present
                    # in both dicts, is appended twice for the same row and would
                    # leave the lists with unequal lengths — confirm keys are disjoint.
                    for kind, val in (value or {}).items():
                        if kind not in data:
                            # First sighting of this key: back-fill earlier rows.
                            data[kind] = [None] * row_number
                        data[kind].append(val)
                    continue
                elif column == 'run_history' and value is not None:
                    # The stored bytes are read as a parquet file and decoded
                    # into a per-run DataFrame. A NULL history stays None
                    # (an empty buffer would make the parquet reader raise).
                    with io.BytesIO(value) as buffer:
                        value = pyarrow.parquet.read_table(
                            pyarrow.PythonFile(buffer, mode='r')).to_pandas()

                data[column].append(value)

            # Pad every column that received no value for this row, so all
            # lists have exactly row_number + 1 entries.
            for values in data.values():
                values.extend([None] * (row_number + 1 - len(values)))

        df = pandas.DataFrame(data=data)

df.describe()

Unnamed: 0,slurm_job_id,seed,task_version,num_nodes,num_cpus,num_gpus,gpu_memory,queue,status,priority,...,model_output_units,optimizer_momentum,dataset_label_noise,num_free_parameters,validation_set_size,optimizer_learning_rate,max_equivalent_epoch_budget,model_cell_width_scale_factor,model_inner_kernel_regularizer_l1,queue_id
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,11880790.0,1679596000.0,0.0,1.0,2.0,2.0,14636.0,12.0,2.0,2000349.0,...,5.2,0.0,0.0,15099330.0,1133.7,0.01,6000.0,1.0,0.05,12.0
std,37.65988,18.4198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.4198,...,3.614784,0.0,0.0,5305279.0,1772.994332,1.828559e-18,0.0,0.0,0.0,0.0
min,11880750.0,1679596000.0,0.0,1.0,2.0,2.0,14636.0,12.0,2.0,2000325.0,...,1.0,0.0,0.0,245.0,0.0,0.01,6000.0,1.0,0.05,12.0
25%,11880760.0,1679596000.0,0.0,1.0,2.0,2.0,14636.0,12.0,2.0,2000339.0,...,3.0,0.0,0.0,16775900.0,48.25,0.01,6000.0,1.0,0.05,12.0
50%,11880760.0,1679596000.0,0.0,1.0,2.0,2.0,14636.0,12.0,2.0,2000345.0,...,4.5,0.0,0.0,16777110.0,485.0,0.01,6000.0,1.0,0.05,12.0
75%,11880830.0,1679596000.0,0.0,1.0,2.0,2.0,14636.0,12.0,2.0,2000356.0,...,6.75,0.0,0.0,16777210.0,750.0,0.01,6000.0,1.0,0.05,12.0
max,11880830.0,1679596000.0,0.0,1.0,2.0,2.0,14636.0,12.0,2.0,2000391.0,...,11.0,0.0,0.0,16780940.0,5296.0,0.01,6000.0,1.0,0.05,12.0


In [33]:
# Preview the first rows of the assembled frame.
df.head()

Unnamed: 0,experiment_id,old_experiment_id,experiment_attrs,run_id,job_id,slurm_job_id,run_timestamp,seed,task_version,num_nodes,...,precision,worker_id,record_model,record_times,python_version,record_metrics,tensorflow_version,tensorflow_strategy,record_post_training_metrics,make_batch_optimizer_butter_growth_eagle_gpu_1
0,156fdb06-c3bb-d2c9-4072-0f81d8239cbf,,,083faa7a-c332-4d6e-8d94-219634b5ba14,083faa7a-c332-4d6e-8d94-219634b5ba14,11880834,2023-04-09 17:15:11.390310+00:00,1679595711,0,1,...,float32,d633b008-f37e-404e-81bb-935d927df3dc,,True,3.10.8,,2.8.1,<class 'tensorflow.python.distribute.mirrored_...,True,True
1,1057fec7-bffc-5278-2b6f-53989006f69f,,,12152034-ca1d-4246-9450-7c3b4eeb5f58,12152034-ca1d-4246-9450-7c3b4eeb5f58,11880830,2023-04-09 18:19:40.437656+00:00,1679595725,0,1,...,float32,200f2e46-8fc2-4c7d-bbd8-747a9240f5e2,,True,3.10.8,,2.8.1,<class 'tensorflow.python.distribute.mirrored_...,True,True
2,47270ea5-b09c-ce28-6af1-d526170d648d,,,da49a972-b955-4393-862c-1b43ed228ce4,da49a972-b955-4393-862c-1b43ed228ce4,11880752,2023-04-09 17:36:48.185189+00:00,1679595713,0,1,...,float32,9df93874-80e9-4b86-83bc-53a1d65fc315,,True,3.10.8,,2.8.1,<class 'tensorflow.python.distribute.mirrored_...,True,True
3,9558d0f4-8371-43a4-cdec-0dfce073a65d,,,55aaf39f-6bbe-4d5e-a371-c0005ecb11c8,55aaf39f-6bbe-4d5e-a371-c0005ecb11c8,11880759,2023-04-09 17:42:41.898486+00:00,1679595717,0,1,...,float32,eaedef35-586c-443e-9028-e566ac345041,,True,3.10.8,,2.8.1,<class 'tensorflow.python.distribute.mirrored_...,True,True
4,f097163f-0e08-f9b3-5c25-4af565d985d2,,,fcd3284d-5336-4e9f-9282-cfab6d2ba859,fcd3284d-5336-4e9f-9282-cfab6d2ba859,11880761,2023-04-09 18:25:46.797050+00:00,1679595729,0,1,...,float32,a476a74e-9c01-45ce-832f-81a1ac330eae,,True,3.10.8,,2.8.1,<class 'tensorflow.python.distribute.mirrored_...,True,True


In [38]:
# List every column label of the frame: the fixed query columns plus the keys
# that were expanded out of the experiment_attrs / run_data dicts.
df.columns

Index(['experiment_id', 'old_experiment_id', 'experiment_attrs', 'run_id',
       'job_id', 'slurm_job_id', 'run_timestamp', 'seed', 'task_version',
       'num_nodes',
       ...
       'precision', 'worker_id', 'record_model', 'record_times',
       'python_version', 'record_metrics', 'tensorflow_version',
       'tensorflow_strategy', 'record_post_training_metrics',
       'make_batch_optimizer_butter_growth_eagle_gpu_1'],
      dtype='object', length=105)

In [40]:
# 'train_loss' column of the first row's decoded run_history table.
df['run_history'].iloc[0]['train_loss']


0      1.247540
1      0.898057
2      0.847315
3      0.850349
4      0.847144
         ...   
145    0.225552
146    0.213775
147    0.213580
148    0.213284
149    0.210847
Name: train_loss, Length: 150, dtype: float32

In [42]:
# Element-wise difference between the 'trained_loss' and 'train_loss' columns of
# the first row's run history.
# NOTE(review): how 'trained_loss' differs in meaning from 'train_loss' is not
# visible in this notebook — confirm before interpreting the sign.
run_df = df['run_history'].iloc[0]
run_df['trained_loss'] - run_df['train_loss']

0      0.000000
1     -0.047707
2      0.003034
3      0.000000
4     -0.001494
         ...   
145    0.026551
146   -0.015600
147   -0.015426
148   -0.008199
149    0.011152
Length: 150, dtype: float32