In [1]:
%%capture _ 
# ^ this just silences warnings
import torch
import mlflow
import hiddenlayer as HL

from model.collectdata_mdsA import collect_data
from model.alt_loss_A import Loss
from model.training import trainNet, select_gpu
from model.utilities import load_full_state, count_parameters, Params, save_to_mlflow

from model.autoencoder_models import UNet
from model.autoencoder_models import UNetPlusPlus

In [2]:
# Run configuration shared by the cells below (read back with args['<key>']).
# NOTE(review): the recorded output of this cell reports one available GPU and
# "initially using device 0" -- confirm what the argument 1 means to select_gpu.
args = Params(
    batch_size=64,              # handed to collect_data for both loaders
    device=select_gpu(1),       # target of model.to(...)
    epochs=30,                  # passed to trainNet and logged as 'Param: Epochs'
    lr=4e-4,                    # learning rate given to torch.optim.Adam
    experiment_name='UNet++',   # name given to mlflow.set_experiment
    asymmetry_parameter=0.0,    # Loss coefficient, logged as 'Param: Asymmetry'
)

1 available GPUs (initially using device 0):
  0 GeForce RTX 2080 Ti


In [3]:
# Keyword arguments that the training and validation loaders have in common.
# NOTE(review): shuffle=False also applies to the training loader -- confirm
# that iterating the training file in a fixed order is intended.
loader_options = dict(
    batch_size=args['batch_size'],
    masking=True,
    shuffle=False,
    load_XandXsq=False,
    load_xy=False,
)

# Training sample.
# Additional inputs that were listed here but are currently disabled:
#     '/share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5',
#     '/share/lazy/sokoloff/ML-data_AA/Oct03_40K_train.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_1.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_3.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_4.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_5.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_6.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_7.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_8.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_9.h5',
# A `device = args['device']` argument was likewise disabled.
train_loader = collect_data(
    '/share/lazy/sokoloff/ML-data_A/Aug14_80K_train.h5',
    **loader_options
)

# Validation sample. slice(256 * 39) is slice(None, 9984); the recorded output
# of this cell reports a 9984-event dataset for this file.
val_loader = collect_data(
    '/share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5',
    slice=slice(256 * 39),
    **loader_options
)


Loading data...
Loaded /share/lazy/sokoloff/ML-data_A/Aug14_80K_train.h5 in 11.72 s
Constructing 80000 event dataset took 0.2182 s
Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5 in 2.548 s
Constructing 9984 event dataset took 0.04099 s


In [4]:
# Point MLflow at the file-based store and select the experiment named in args.
# NOTE(review): the recorded output of this cell shows MLflow reporting that
# '/share/lazy/pv-finder_model_repo/ML/meta.yaml' does not exist -- presumably a
# directory inside the store that is not a valid experiment; confirm and clean up.
tracking_uri = 'file:/share/lazy/pv-finder_model_repo'
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(args['experiment_name'])

Traceback (most recent call last):
  File "/apps/miniconda3/envs/june2020-gpu/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 197, in list_experiments
    experiment = self._get_experiment(exp_id, view_type)
  File "/apps/miniconda3/envs/june2020-gpu/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 260, in _get_experiment
    meta = read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/apps/miniconda3/envs/june2020-gpu/lib/python3.7/site-packages/mlflow/utils/file_utils.py", line 167, in read_yaml
    raise MissingConfigException("Yaml file '%s' does not exist." % file_path)
mlflow.exceptions.MissingConfigException: Yaml file '/share/lazy/pv-finder_model_repo/ML/meta.yaml' does not exist.


In [5]:
# Instantiate the network on the configured device, then build the optimizer
# over its parameters and the loss object; all three are consumed by trainNet.
model = UNetPlusPlus().to(args['device'])
optimizer = torch.optim.Adam(params=model.parameters(), lr=args['lr'])
loss = Loss(epsilon=1e-5, coefficient=args['asymmetry_parameter'])

# Debug aid (disabled):
# print(model.i1.size())

In [None]:
# Optional warm start (disabled): restore model and optimizer state saved by an
# earlier MLflow run before training.
# load_full_state(model, optimizer, '/share/lazy/pv-finder_model_repo/0/a868d4b8ec0642b39a7156f3dd894dfb/artifacts/run_stats.pyt', freeze_weights=False)

run_name = 'u-net++'

# Optional (disabled): tune kernels based on the GPU.
# torch.backends.cudnn.benchmark=True

# trainNet is iterated one result at a time; enumerate supplies the index that
# is used as the MLflow step.
epoch_results = trainNet(
    model,
    optimizer,
    loss,
    train_loader,
    val_loader,
    args['epochs'],
    notebook=True,
)
train_iter = enumerate(epoch_results)

with mlflow.start_run(run_name=run_name) as run:
    for i, result in train_iter:
        # Keys carry a 'Metric: ' or 'Param: ' prefix -- presumably this tells
        # save_to_mlflow how to log each entry; confirm in model.utilities.
        epoch_record = {
            'Metric: Training loss': result.cost,
            'Metric: Validation loss': result.val,
            'Metric: Efficiency': result.eff_val.eff_rate,
            'Metric: False positive rate': result.eff_val.fp_rate,
            'Param: Asymmetry': args['asymmetry_parameter'],
            'Param: Epochs': args['epochs'],
        }
        save_to_mlflow(epoch_record, step=i)

Number of batches: train = 1250, val = 156


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  file=sys.stderr,


HBox(children=(FloatProgress(value=0.0, description='Epochs', layout=Layout(flex='2'), max=80.0, style=Progres…

HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 0: train=16.321, val=2.47723, took 87.956 s
  Validation Found 47280 of 54504, added 384 (eff 86.75%) (0.0385 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 1: train=1.50876, val=2.48513, took 95.419 s
  Validation Found 47366 of 54504, added 354 (eff 86.90%) (0.0355 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 2: train=1.48328, val=2.46639, took 97.735 s
  Validation Found 47892 of 54504, added 373 (eff 87.87%) (0.0374 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 3: train=1.46934, val=2.4493, took 96.94 s
  Validation Found 47988 of 54504, added 396 (eff 88.04%) (0.0397 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 4: train=1.46031, val=2.46096, took 96.354 s
  Validation Found 48013 of 54504, added 385 (eff 88.09%) (0.0386 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 5: train=1.45244, val=2.44036, took 96.283 s
  Validation Found 48155 of 54504, added 389 (eff 88.35%) (0.039 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 6: train=1.44768, val=2.44463, took 96.359 s
  Validation Found 48129 of 54504, added 373 (eff 88.30%) (0.0374 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 7: train=1.44291, val=2.4395, took 96.229 s
  Validation Found 48264 of 54504, added 396 (eff 88.55%) (0.0397 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 8: train=1.43871, val=2.42542, took 96.078 s
  Validation Found 48364 of 54504, added 406 (eff 88.73%) (0.0407 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 9: train=1.43487, val=2.44443, took 95.904 s
  Validation Found 48265 of 54504, added 380 (eff 88.55%) (0.0381 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 10: train=1.43184, val=2.44146, took 95.908 s
  Validation Found 48420 of 54504, added 403 (eff 88.84%) (0.0404 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 11: train=1.42875, val=2.4211, took 96.099 s
  Validation Found 48537 of 54504, added 450 (eff 89.05%) (0.0451 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 12: train=1.4254, val=2.43301, took 96.342 s
  Validation Found 48539 of 54504, added 426 (eff 89.06%) (0.0427 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 13: train=1.42231, val=2.44293, took 96.503 s
  Validation Found 48171 of 54504, added 382 (eff 88.38%) (0.0383 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 14: train=1.4197, val=2.4427, took 96.515 s
  Validation Found 48292 of 54504, added 388 (eff 88.60%) (0.0389 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 15: train=1.41645, val=2.44236, took 96.262 s
  Validation Found 48388 of 54504, added 397 (eff 88.78%) (0.0398 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 16: train=1.41374, val=2.45668, took 96.223 s
  Validation Found 48215 of 54504, added 392 (eff 88.46%) (0.0393 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 17: train=1.41127, val=2.44524, took 95.93 s
  Validation Found 48405 of 54504, added 411 (eff 88.81%) (0.0412 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 18: train=1.40801, val=2.45789, took 95.831 s
  Validation Found 48289 of 54504, added 398 (eff 88.60%) (0.0399 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 19: train=1.40594, val=2.45443, took 96.046 s
  Validation Found 48434 of 54504, added 425 (eff 88.86%) (0.0426 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 20: train=1.40352, val=2.45639, took 96.319 s
  Validation Found 48592 of 54504, added 475 (eff 89.15%) (0.0476 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 21: train=1.40073, val=2.46123, took 96.533 s
  Validation Found 48481 of 54504, added 458 (eff 88.95%) (0.0459 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 22: train=1.39845, val=2.44036, took 96.339 s
  Validation Found 48584 of 54504, added 447 (eff 89.14%) (0.0448 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 23: train=1.39681, val=2.45079, took 96.051 s
  Validation Found 48549 of 54504, added 431 (eff 89.07%) (0.0432 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 24: train=1.39436, val=2.46828, took 95.918 s
  Validation Found 48340 of 54504, added 419 (eff 88.69%) (0.042 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 25: train=1.39199, val=2.44491, took 96.033 s
  Validation Found 48758 of 54504, added 480 (eff 89.46%) (0.0481 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 26: train=1.39023, val=2.45659, took 96.109 s
  Validation Found 48467 of 54504, added 453 (eff 88.92%) (0.0454 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 27: train=1.38764, val=2.45282, took 96.295 s
  Validation Found 48447 of 54504, added 420 (eff 88.89%) (0.0421 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 28: train=1.38561, val=2.46803, took 96.403 s
  Validation Found 48413 of 54504, added 413 (eff 88.82%) (0.0414 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 29: train=1.38468, val=2.44619, took 96.466 s
  Validation Found 48611 of 54504, added 468 (eff 89.19%) (0.0469 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 30: train=1.38181, val=2.46446, took 96.259 s
  Validation Found 48591 of 54504, added 449 (eff 89.15%) (0.045 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 31: train=1.37967, val=2.44996, took 96.034 s
  Validation Found 48467 of 54504, added 447 (eff 88.92%) (0.0448 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 32: train=1.37812, val=2.4455, took 95.936 s
  Validation Found 48690 of 54504, added 497 (eff 89.33%) (0.0498 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 33: train=1.37715, val=2.46907, took 96.111 s
  Validation Found 48623 of 54504, added 445 (eff 89.21%) (0.0446 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 34: train=1.37458, val=2.45792, took 96.216 s
  Validation Found 48714 of 54504, added 479 (eff 89.38%) (0.048 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 35: train=1.37289, val=2.45332, took 96.391 s
  Validation Found 48995 of 54504, added 522 (eff 89.89%) (0.0523 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 36: train=1.37191, val=2.46797, took 96.362 s
  Validation Found 48737 of 54504, added 486 (eff 89.42%) (0.0487 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…