In [1]:
%%capture _ 
# ^ %%capture stores all of this cell's output in `_`, which hides the import-time warnings
import torch
import mlflow
import hiddenlayer as HL

from model.collectdata_mdsA import collect_data
from model.alt_loss_A import Loss
from model.training import trainNet, select_gpu
from model.utilities import load_full_state, count_parameters, Params, save_to_mlflow

from model.autoencoder_models import UNet
from model.autoencoder_models import UNetPlusPlus

In [2]:
# Run configuration shared by the cells below.
# select_gpu(0) asks for GPU index 0 and prints the list of available GPUs.
gpu_device = select_gpu(0)

args = Params(
    batch_size=64,
    device=gpu_device,
    epochs=30,
    lr=4e-4,                  # learning rate handed to the Adam optimizer
    experiment_name='UNet',   # name of the mlflow experiment
    asymmetry_parameter=2.5,  # passed to Loss as `coefficient`
)

1 available GPUs (initially using device 0):
  0 GeForce RTX 2080 Ti


In [3]:
# Files used for training. The commented-out entries are alternative
# datasets kept for reference; uncomment to add them to the training set.
train_files = [
    '/share/lazy/sokoloff/ML-data_A/Aug14_80K_train.h5',
#     '/share/lazy/sokoloff/ML-data_AA/Oct03_80K_train.h5',
#     '/share/lazy/sokoloff/ML-data_AA/Oct03_40K_train.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_1.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_3.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_4.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_5.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_6.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_7.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_8.h5',
#     '/share/lazy/will/ML_mdsA/June30_2020_80k_9.h5',
]

# Options common to the training and validation loaders.
# NOTE(review): shuffle=False also applies to the training loader -- confirm
# that unshuffled training batches are intended.
loader_options = dict(
    batch_size=args['batch_size'],
    masking=True,
    shuffle=False,
    load_XandXsq=False,
    load_xy=False,
)

train_loader = collect_data(
    *train_files,
#     device = args['device'],
    **loader_options
)

# Validation uses only the first 256 * 39 = 9984 events of the file.
val_loader = collect_data(
    '/share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5',
    slice=slice(256 * 39),
    **loader_options
)


Loading data...
Loaded /share/lazy/sokoloff/ML-data_A/Aug14_80K_train.h5 in 11.53 s
Constructing 80000 event dataset took 0.2034 s
Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/Oct03_20K_val.h5 in 2.533 s
Constructing 9984 event dataset took 0.04051 s


In [4]:
# Point mlflow at the shared file-based tracking store, then select (or
# create) the experiment named in the run configuration.
# NOTE(review): the saved output of this cell shows mlflow logging a
# MissingConfigException for '/share/lazy/pv-finder_model_repo/ML/meta.yaml';
# that directory in the store looks incomplete -- confirm and clean it up.
mlflow.set_tracking_uri('file:/share/lazy/pv-finder_model_repo')
mlflow.set_experiment(args['experiment_name'])

Traceback (most recent call last):
  File "/apps/miniconda3/envs/june2020-gpu/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 197, in list_experiments
    experiment = self._get_experiment(exp_id, view_type)
  File "/apps/miniconda3/envs/june2020-gpu/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 260, in _get_experiment
    meta = read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/apps/miniconda3/envs/june2020-gpu/lib/python3.7/site-packages/mlflow/utils/file_utils.py", line 167, in read_yaml
    raise MissingConfigException("Yaml file '%s' does not exist." % file_path)
mlflow.exceptions.MissingConfigException: Yaml file '/share/lazy/pv-finder_model_repo/ML/meta.yaml' does not exist.


In [5]:
# Build the network and move it to the selected device.
model = UNet()
model = model.to(args['device'])

# Adam over all model parameters, using the configured learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=args['lr'])

# Loss configured with the asymmetry coefficient from the run configuration.
loss = Loss(
    epsilon=1e-5,
    coefficient=args['asymmetry_parameter'],
)

#print(model.i1.size())

In [None]:
# Optional warm start from a previously saved run (disabled):
# load_full_state(model, optimizer, '/share/lazy/pv-finder_model_repo/0/a868d4b8ec0642b39a7156f3dd894dfb/artifacts/run_stats.pyt', freeze_weights=False)

run_name = 'u-net'

# tune kernel based on gpu
#torch.backends.cudnn.benchmark=True

# Entries that do not change during the run; they are sent together with
# every step's metrics, exactly as before.
fixed_params = {
    'Param: Asymmetry': args['asymmetry_parameter'],
    'Param: Epochs': args['epochs'],
}

# Each item produced by trainNet is paired with its index, which is used as
# the mlflow step (the printed log suggests one item per epoch).
train_iter = enumerate(
    trainNet(
        model,
        optimizer,
        loss,
        train_loader,
        val_loader,
        args['epochs'],
        notebook=True,
    )
)

with mlflow.start_run(run_name=run_name) as run:
    for i, result in train_iter:
        epoch_metrics = {
            'Metric: Training loss': result.cost,
            'Metric: Validation loss': result.val,
            'Metric: Efficiency': result.eff_val.eff_rate,
            'Metric: False positive rate': result.eff_val.fp_rate,
        }
        # Metrics first, then the fixed parameters: same keys and order as a
        # single literal dict.
        save_to_mlflow({**epoch_metrics, **fixed_params}, step=i)

Number of batches: train = 1250, val = 156


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  file=sys.stderr,


HBox(children=(FloatProgress(value=0.0, description='Epochs', layout=Layout(flex='2'), max=80.0, style=Progres…

HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 0: train=18.9447, val=5.55068, took 108.38 s
  Validation Found 50352 of 54504, added 1567 (eff 92.38%) (0.157 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 1: train=3.20394, val=5.56606, took 110.24 s
  Validation Found 50597 of 54504, added 1601 (eff 92.83%) (0.16 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 2: train=3.13078, val=5.56825, took 109.36 s
  Validation Found 50578 of 54504, added 1403 (eff 92.80%) (0.141 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 3: train=3.09238, val=5.57131, took 111.21 s
  Validation Found 50619 of 54504, added 1404 (eff 92.87%) (0.141 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 4: train=3.0707, val=5.48549, took 108.71 s
  Validation Found 50733 of 54504, added 1357 (eff 93.08%) (0.136 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 5: train=3.05197, val=5.5517, took 111.67 s
  Validation Found 50904 of 54504, added 1436 (eff 93.39%) (0.144 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 6: train=3.03757, val=5.50453, took 109.34 s
  Validation Found 50795 of 54504, added 1311 (eff 93.19%) (0.131 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 7: train=3.02595, val=5.60773, took 110.49 s
  Validation Found 50694 of 54504, added 1201 (eff 93.01%) (0.12 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 8: train=3.01096, val=5.47479, took 108.82 s
  Validation Found 51089 of 54504, added 1541 (eff 93.73%) (0.154 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 9: train=3.00342, val=5.51418, took 108.74 s
  Validation Found 50969 of 54504, added 1440 (eff 93.51%) (0.144 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 10: train=2.99213, val=5.75863, took 110.45 s
  Validation Found 50468 of 54504, added 1082 (eff 92.60%) (0.108 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 11: train=2.98645, val=5.6048, took 109.82 s
  Validation Found 50639 of 54504, added 1183 (eff 92.91%) (0.118 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 12: train=2.97596, val=5.44945, took 110.97 s
  Validation Found 51133 of 54504, added 1553 (eff 93.82%) (0.156 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 13: train=2.96825, val=5.48121, took 109.63 s
  Validation Found 50849 of 54504, added 1303 (eff 93.29%) (0.13 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 14: train=2.96256, val=5.45916, took 110.5 s
  Validation Found 51137 of 54504, added 1514 (eff 93.82%) (0.152 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 15: train=2.95345, val=5.42678, took 108.97 s
  Validation Found 51144 of 54504, added 1547 (eff 93.84%) (0.155 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 16: train=2.94451, val=5.44689, took 110.55 s
  Validation Found 51142 of 54504, added 1552 (eff 93.83%) (0.155 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 17: train=2.93688, val=5.39417, took 109.99 s
  Validation Found 51239 of 54504, added 1566 (eff 94.01%) (0.157 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 18: train=2.93134, val=5.43228, took 110.76 s
  Validation Found 51155 of 54504, added 1505 (eff 93.86%) (0.151 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 19: train=2.92635, val=5.57952, took 109.53 s
  Validation Found 50902 of 54504, added 1383 (eff 93.39%) (0.139 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 20: train=2.91833, val=5.54172, took 109.06 s
  Validation Found 50945 of 54504, added 1332 (eff 93.47%) (0.133 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 21: train=2.91272, val=5.3989, took 110.68 s
  Validation Found 51114 of 54504, added 1464 (eff 93.78%) (0.147 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 22: train=2.90572, val=5.37542, took 108.95 s
  Validation Found 51334 of 54504, added 1673 (eff 94.18%) (0.168 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 23: train=2.89891, val=5.4063, took 111.37 s
  Validation Found 51279 of 54504, added 1642 (eff 94.08%) (0.164 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 24: train=2.89514, val=5.40505, took 109.42 s
  Validation Found 51349 of 54504, added 1696 (eff 94.21%) (0.17 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 25: train=2.88569, val=5.39361, took 110.95 s
  Validation Found 51372 of 54504, added 1693 (eff 94.25%) (0.17 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 26: train=2.88318, val=5.46397, took 107.82 s
  Validation Found 50958 of 54504, added 1408 (eff 93.49%) (0.141 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 27: train=2.87508, val=5.35836, took 110.7 s
  Validation Found 51213 of 54504, added 1624 (eff 93.96%) (0.163 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 28: train=2.87237, val=5.39042, took 108.02 s
  Validation Found 51118 of 54504, added 1531 (eff 93.79%) (0.153 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 29: train=2.86551, val=5.38016, took 110.23 s
  Validation Found 51389 of 54504, added 1753 (eff 94.28%) (0.176 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 30: train=2.86034, val=5.40226, took 109.81 s
  Validation Found 51253 of 54504, added 1540 (eff 94.04%) (0.154 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…

Epoch 31: train=2.85826, val=5.39768, took 110.95 s
  Validation Found 51302 of 54504, added 1662 (eff 94.13%) (0.166 FP/event)


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=1250.0, style=Pro…