In [30]:
%load_ext autoreload
%autoreload 2
import numpy
import pandas
import io
import uuid
import psycopg.sql
import pyarrow
import pyarrow.parquet

import jobqueue
from jobqueue.connection_manager import ConnectionManager



import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import os
import datetime

from typing import Callable, List

from psycopg import sql

import dmp.keras_interface.model_serialization as model_serialization
from dmp.task.experiment.training_experiment.training_epoch import TrainingEpoch
from dmp.postgres_interface.element.column import Column
from dmp.postgres_interface.element.table import Table
from dmp.postgres_interface.element.column_group import ColumnGroup

from dmp.util.butter_e_export import *

# Do not truncate long sequences when pandas renders them (None = no limit).
pd.options.display.max_seq_items = None
# Load the credentials registered under the name "dmp"; they are passed to
# ConnectionManager when the database connection is opened later on.
credentials = jobqueue.load_credentials("dmp")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# epoch that minimizes: Column = Column("minimizes", "loss") and the corresponding loss
#

{
  "fit": {
    "epochs": 3000,
    "batch_size": 256
  },
  "loss": null,
  "seed": 1663277341,
  "type": "TrainingExperiment",
  "batch": "energy_1",
  "model": {
    "size": 2048,
    "type": "DenseBySize",
    "depth": 4,
    "inner": {
      "type": "Dense",
      "units": -1,
      "use_bias": true,
      "activation": "relu",
      "bias_constraint": null,
      "bias_initializer": "Zeros",
      "bias_regularizer": null,
      "kernel_constraint": null,
      "kernel_initializer": "GlorotUniform",
      "kernel_regularizer": null,
      "activity_regularizer": null
    },
    "input": null,
    "shape": "exponential",
    "output": null,
    "search_method": "integer"
  },
  "record": {
    "type": "ExperimentRecordSettings",
    "model": null,
    "times": false,
    "metrics": null,
    "post_training_metrics": false
  },
  "dataset": {
    "name": "201_pol",
    "type": "DatasetSpec",
    "method": "shuffled_train_test_split",
    "source": "pmlb",
    "test_split": 0.2,
    "label_noise": 0,
    "validation_split": 0
  },
  "run_tags": null,
  "optimizer": {
    "class": "Adam",
    "learning_rate": 0.0001
  },
  "precision": "float32",
  "early_stopping": null
}

In [52]:
from psycopg import ClientCursor


# Debug aid: dump the attributes of the `run` table object.
print(f"run vars {vars(run)}")

# Columns to select, in order: `run`, then every `job_status` column except
# "id", then the job's `command` column.
columns = (
    run
    + ColumnGroup(*(col for col in job_status.columns if col.name != "id"))
    + job_data.command
)
print(columns.names)


def passthrough(row, index, value, column, data):
    """Store ``value`` unchanged in ``data`` under the column's name.

    ``row`` and ``index`` are not used; they are accepted so that this
    function has the same call signature as the other column converters.
    """
    key = column.name
    data[key] = value


# One converter per selected column, in the same order as `columns`. Each is
# called as converter(row, index, value, column, data) and writes its result
# into `data`. Every column starts with the pass-through converter.
column_converters: List[Callable] = [passthrough for _ in columns]


def flatten_json(json_obj, destination=None, parent_key="", separator="_"):
    """Flatten a nested mapping into a single-level dict.

    Keys of nested dicts are joined to their parents' keys with
    ``separator``, e.g. ``{"fit": {"epochs": 3}}`` becomes
    ``{"fit_epochs": 3}``. Non-dict values (including lists) are stored
    as-is.

    Args:
        json_obj: The (possibly nested) dict to flatten.
        destination: Optional dict to write the flattened entries into. If it
            is not a dict, a new dict is created.
        parent_key: Prefix applied to every key of ``json_obj``; empty for the
            top level.
        separator: String placed between a parent key and a child key.

    Returns:
        The dict holding the flattened entries (``destination`` itself when
        one was supplied).
    """
    flattened = destination if isinstance(destination, dict) else {}

    for key, value in json_obj.items():
        new_key = f"{parent_key}{separator}{key}" if parent_key else key
        if isinstance(value, dict):
            # Recurse into the same output dict and pass the prefix by
            # keyword: passing it positionally would land in `destination`
            # and the nested keys would lose their parent prefix.
            flatten_json(
                value,
                destination=flattened,
                parent_key=new_key,
                separator=separator,
            )
        else:
            flattened[new_key] = value
    return flattened


# The `command` column holds a nested dict: instead of storing it under a
# single key, flatten it and write the resulting entries directly into the
# row's `data` dict.
column_converters[
    columns.get_index_of(job_data.command)
] = lambda row, index, value, column, data: flatten_json(value, destination=data)
# Same treatment for `run_data`. Both columns are flattened into the same
# dict, so entries with equal flattened keys overwrite one another.
column_converters[
    columns.get_index_of(run.run_data)
] = lambda row, index, value, column, data: flatten_json(value, destination=data)


def parquet_to_dataframe(row, index, value, column, data):
    """Decode parquet bytes into a DataFrame and store it in ``data``.

    ``value`` is read as an in-memory parquet file, converted to a pandas
    DataFrame, sorted by its "epoch" column and stored under the column's
    name. ``row`` and ``index`` are unused.
    """
    with io.BytesIO(value) as raw:
        parquet_file = pyarrow.PythonFile(raw, mode="r")
        table = pyarrow.parquet.read_table(parquet_file)
        frame = table.to_pandas()
        data[column.name] = frame.sort_values(by="epoch")


# Both history columns are decoded from parquet bytes into epoch-sorted
# DataFrames rather than being passed through.
column_converters[columns.get_index_of(run.run_history)] = parquet_to_dataframe
column_converters[columns.get_index_of(run.run_extended_history)] = parquet_to_dataframe


# One DataFrame per result row; concatenated into `data` after the loop.
dfs = []

with ConnectionManager(credentials) as connection:
    # Select the chosen columns for rows whose batch matches '%energy%',
    # matching run, job_status and job_data on the job id. Only rows with
    # status = 2 are kept (the meaning of 2 is not visible here -- presumably
    # "completed"; confirm). LIMIT 10 keeps this to a small sample.
    query = psycopg.sql.SQL(
        """
SELECT
	{columns}
FROM
	{run},
	{job_status},
	{job_data}
WHERE TRUE
	AND {run}.batch like {pattern}
	AND {job_status}.id = {run}.run_id
	AND {job_status}.id = {job_data}.id
    AND {job_status}.status = 2
ORDER BY experiment_id, run_id
LIMIT 10;
"""
    ).format(
        columns=columns.columns_sql,
        run=run.identifier,
        job_status=job_status.identifier,
        job_data=job_data.identifier,
        pattern=sql.Literal("%energy%"),
    )

    # Debug aid: print the query as rendered client-side.
    with ClientCursor(connection) as c:
        print(c.mogrify(query))

    # Fetch results in binary format.
    with connection.cursor(binary=True) as cursor:
        cursor.execute(query, binary=True)

        for row in cursor:
            # Run each column's converter; converters write into row_data.
            row_data = {}
            for i, (column, column_converter) in enumerate(
                zip(columns, column_converters)
            ):
                column_converter(row, i, row[i], column, row_data)

            # Start from the per-epoch history and left-join the extended
            # history onto it; overlapping column names from the right-hand
            # frame get a "_" suffix.
            # NOTE(review): DataFrame.join(on="epoch") matches row_df's
            # "epoch" column against the *index* of the other frame, not its
            # "epoch" column -- confirm the extended history is indexed by
            # epoch.
            row_df = row_data["run_history"]
            row_df = row_df.join(row_data["run_extended_history"], on="epoch", how="left", rsuffix="_")
            for k in ("run_history", "run_extended_history"):
                del row_data[k]

            # Broadcast the remaining per-run values onto every epoch row.
            for k, v in row_data.items():
                print(f'{k} {v}')
                # NOTE(review): this check is a no-op; a key that already
                # exists as a column in row_df is still overwritten below.
                if k in row_df:
                    pass
                if isinstance(v, list):
                    # Repeat the list so each row holds the whole list rather
                    # than pandas treating it as one value per row.
                    row_df[k] = [v] * len(row_df)
                else:
                    row_df[k] = v
                # data.setdefault(k, []).append(v)
            dfs.append(row_df)

# NOTE(review): pandas.concat raises ValueError when `dfs` is empty, i.e.
# when the query returned no rows.
data = pandas.concat(dfs)
del dfs

# NOTE(review): the result of describe() is neither assigned nor printed.
data.describe()
print(data.head())

run vars {'_name': 'run', '_columns': (), '_index': None}
('experiment_id', 'run_timestamp', 'run_id', 'job_id', 'seed', 'slurm_job_id', 'task_version', 'num_nodes', 'num_cpus', 'num_gpus', 'gpu_memory', 'host_name', 'batch', 'run_data', 'run_history', 'run_extended_history', 'queue', 'status', 'priority', 'start_time', 'update_time', 'worker', 'error_count', 'error', 'parent', 'command')
{Column(_name='experiment_id', type_name='uuid'): 0, Column(_name='run_timestamp', type_name='timestamp'): 1, Column(_name='run_id', type_name='uuid'): 2, Column(_name='job_id', type_name='uuid'): 3, Column(_name='seed', type_name='bigint'): 4, Column(_name='slurm_job_id', type_name='bigint'): 5, Column(_name='task_version', type_name='smallint'): 6, Column(_name='num_nodes', type_name='smallint'): 7, Column(_name='num_cpus', type_name='smallint'): 8, Column(_name='num_gpus', type_name='smallint'): 9, Column(_name='gpu_memory', type_name='integer'): 10, Column(_name='host_name', type_name='text'): 11,

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
