In [None]:
# Copyright (c) 2022 Massachusetts Institute of Technology
# SPDX-License-Identifier: MIT

In [1]:
data_dir         = "/home/gridsan/groups/datasets/cifar-10"
executor_dir     = "log_test"
slurm_partition  = "gaia"
slurm_gres       = "gpu:volta:1"
slurm_constraint = "xeon-g6"

timeout_minutes = 10
max_epochs = 10

gpu_powers = [100,110,120,130,140,150]
multirun=True

In [2]:
from powersweep import powersweep as ps
from tse.tse import TrainingSpeedEstimator as TSE

In [3]:
import pytorch_lightning as pl
import pandas as pd
from pytorch_lightning import loggers as pl_loggers
from hydra_zen import builds, make_config, MISSING, instantiate
from omegaconf import DictConfig
from torch.nn import Module
import hydra_zen_example.image_classifier.configs as hzconfigs
from hydra_zen_example.image_classifier.utils import set_seed

In [4]:
slurm_cfg = builds(ps.SlurmConfig,
                executor_dir=executor_dir,
                slurm_partition=slurm_partition,
                slurm_gres=slurm_gres,
                slurm_constraint=slurm_constraint,
                timeout_min=timeout_minutes,
                zen_partial=True,
                populate_full_signature=True,
            )

In [5]:
csv_logger = builds(pl_loggers.CSVLogger, save_dir="csv_logs")

TrainerConf = builds(
    pl.Trainer,
    accelerator='gpu',
    max_epochs=50,
    zen_partial=False,
    enable_progress_bar=False,
    enable_model_summary=False,
    log_every_n_steps=1,
    logger=csv_logger,
)

In [6]:
task_cfg = make_config(
    #
    # Experiment Defaults: See https://hydra.cc/docs/next/advanced/defaults_list
    defaults=[
        "_self_",  # See https://hydra.cc/docs/upgrades/1.0_to_1.1/default_composition_order
        {"data": "cifar10"},
        {"model": "resnet18"},
        {"model/optim": "sgd"},
    ],
    #
    # Experiment Modules
    data=MISSING,
    model=MISSING,
    trainer=TrainerConf,
    #
    # Experiment Constants
    data_dir=data_dir,
    random_seed=0,
    testing=False,
    ckpt_path=None,
)

In [7]:
def task_fn(cfg: DictConfig) -> Module:
    # Set seed BEFORE instantiating anything
    set_seed(cfg.random_seed)

    # Data and Lightning Modules
    data = instantiate(cfg.data)
    pl_module = instantiate(cfg.model)
    
    # The PL Trainer
    trainer = instantiate(cfg.trainer)
    
    # Train on data
    trainer.fit(pl_module, datamodule=data)
    
    return pl_module

In [8]:
PS = ps.PowerSweep(task_cfg, task_fn, slurm_cfg, gpower=gpu_powers, multirun=multirun)

In [9]:
overrides=[
        "model=resnet18",
        f"trainer.max_epochs={max_epochs}",
    ]
job = PS.launch(overrides=overrides)

The version_base parameter is not specified.
Please specify a compatability version level, or None.
Will assume defaults for version 1.1
  with initialize(


[2022-12-13 09:42:08,017][HYDRA] Launching 6 jobs locally
[2022-12-13 09:42:08,018][HYDRA] 	#0 : model=resnet18 trainer.max_epochs=10 +gpower=100
[2022-12-13 09:42:08,258][HYDRA] 	#1 : model=resnet18 trainer.max_epochs=10 +gpower=110


See https://hydra.cc/docs/next/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(


[2022-12-13 09:42:08,508][HYDRA] 	#2 : model=resnet18 trainer.max_epochs=10 +gpower=120


See https://hydra.cc/docs/next/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(


[2022-12-13 09:42:08,701][HYDRA] 	#3 : model=resnet18 trainer.max_epochs=10 +gpower=130


See https://hydra.cc/docs/next/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(
See https://hydra.cc/docs/next/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(


[2022-12-13 09:42:08,898][HYDRA] 	#4 : model=resnet18 trainer.max_epochs=10 +gpower=140
[2022-12-13 09:42:09,086][HYDRA] 	#5 : model=resnet18 trainer.max_epochs=10 +gpower=150


See https://hydra.cc/docs/next/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(
See https://hydra.cc/docs/next/upgrades/1.1_to_1.2/changes_to_job_working_dir/ for more information.
  ret = run_job(


Job states: ['UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN', 'UNKNOWN']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING', 'RUNNING']
Job states: ['RUNNING', 'RUNNING', 'RUNNING', 'RUNNI

In [10]:
tse = TSE()

summary = []

for j in job[0]:
    # Record swept override parameters
    model = j.overrides[0].split('=')[1]
    optim = j.overrides[1].split('=')[1]
    power = j.overrides[2].split('=')[1]
    
    working_dir = j.working_dir
    csvfile = f"{working_dir}/csv_logs/lightning_logs/version_0/metrics.csv"

    df = pd.read_csv(csvfile)
    
    df.rename(inplace=True, columns={'Train/Loss':'train_loss', 'Train/Accuracy':'train_acc_stp'})
    
    grad_measures, params, fhat, dL1, dL2 = tse.estimate_losscurve(df.loc[~df['train_loss'].isna()])
    
    tse_dict = tse.estimate(df, None, 10)
        
    sumdict = {'model':model, 'optimizer':optim, 'GPU power (W)':power, 
               'loss curve approx.':grad_measures['d1/sqrt(d2)_sum'],
                'tse':tse_dict['tse'], 'tsee':tse_dict['tsee'], 'tsema':tse_dict['tseema']}
    summary.append(sumdict)
    
summary = pd.DataFrame(summary)

In [11]:
summary.sort_values(by='tse', ascending=False)

Unnamed: 0,model,optimizer,GPU power (W),loss curve approx.,tse,tsee,tsema
4,resnet18,10,140,-0.573048,9.889643,0.0,9.82231
5,resnet18,10,150,-0.527951,9.810785,0.0,9.744312
2,resnet18,10,120,-0.540024,9.76068,0.0,9.694187
1,resnet18,10,110,-0.523411,9.732961,0.0,9.666933
3,resnet18,10,130,-0.545779,9.694935,0.0,9.629042
0,resnet18,10,100,-0.528312,9.526445,0.0,9.461528
