In [1]:
%%capture _ 
# ^ this just silences warnings
import torch
import mlflow
# import hiddenlayer as HL

from model.collectdata_mdsA import collect_data
from model.collectdata_poca_KDE import collect_data_poca
from model.alt_loss_A import Loss
from model.training import trainNet, select_gpu
from model.utilities import load_full_state, count_parameters, Params, save_to_mlflow

from model.autoencoder_models import UNet
from model.autoencoder_models import UNetPlusPlus

In [2]:
# Run configuration shared by every later cell (read as args['<key>']).
args = Params(
    batch_size=64,                    # passed to both data loaders
    device=select_gpu(0),             # device the model is moved to
    epochs=100,                       # number of epochs handed to trainNet
    lr=1e-4,                          # Adam learning rate
    experiment_name='Top Models',     # MLflow experiment the run is filed under
    asymmetry_parameter=0,            # forwarded to Loss as `coefficient`
)

1 available GPUs (initially using device 0):
  0 GeForce RTX 2080 Ti


  and should_run_async(code)


In [3]:
# Number of training events, logged to MLflow as 'Param: Events'.
# Keep in sync with the training files below: the loader reports a
# 320000 event dataset for these four files.
events = 320000

# Dataset-format options that must be identical for training and validation,
# so they are defined once and shared by both loaders.
loader_options = dict(
    batch_size=args['batch_size'],
    masking=True,
    load_A_and_B=True,
    load_xy=True,
)

## This is used when training with the new KDE
train_loader = collect_data_poca(
    '/share/lazy/will/data/June30_2020_80k_1.h5',
    '/share/lazy/will/data/June30_2020_80k_3.h5',
    '/share/lazy/will/data/June30_2020_80k_4.h5',
    '/share/lazy/will/data/June30_2020_80k_5.h5',
    shuffle=True,
    **loader_options
)

# NOTE(review): the validation set is shuffled as well; that is kept as-is
# here, but shuffling is normally unnecessary for validation — confirm.
val_loader = collect_data_poca(
    '/share/lazy/sokoloff/ML-data_AA/20K_POCA_kernel_evts_200926.h5',
    shuffle=True,
    **loader_options
)

Loading data...
Loaded /share/lazy/will/data/June30_2020_80k_1.h5 in 29.61 s
Loaded /share/lazy/will/data/June30_2020_80k_3.h5 in 34.99 s
Loaded /share/lazy/will/data/June30_2020_80k_4.h5 in 29.13 s
Loaded /share/lazy/will/data/June30_2020_80k_5.h5 in 33.61 s
Constructing 320000 event dataset took 4.694 s
Loading data...
Loaded /share/lazy/sokoloff/ML-data_AA/20K_POCA_kernel_evts_200926.h5 in 6.69 s
Constructing 20000 event dataset took 0.1658 s


In [4]:
# Point MLflow at the shared file-based tracking store, then select the
# experiment that runs from this notebook are recorded under.
# `mlflow.set_tracking_uri` is the documented top-level entry point.
mlflow.set_tracking_uri('file:/share/lazy/pv-finder_model_repo')
mlflow.set_experiment(args['experiment_name'])

Traceback (most recent call last):
  File "/apps/miniconda3/envs/june2020-gpu/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 197, in list_experiments
    experiment = self._get_experiment(exp_id, view_type)
  File "/apps/miniconda3/envs/june2020-gpu/lib/python3.7/site-packages/mlflow/store/tracking/file_store.py", line 260, in _get_experiment
    meta = read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/apps/miniconda3/envs/june2020-gpu/lib/python3.7/site-packages/mlflow/utils/file_utils.py", line 167, in read_yaml
    raise MissingConfigException("Yaml file '%s' does not exist." % file_path)
mlflow.exceptions.MissingConfigException: Yaml file '/share/lazy/pv-finder_model_repo/ML/meta.yaml' does not exist.


In [5]:
# Build the network on the device selected in `args`. The device is taken
# from the configuration only, so changing the index given to select_gpu is
# enough to move the whole run; no device string is hard-coded here.
model = UNetPlusPlus().to(args['device'])

# Adam over all model parameters, with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])

# Loss from model.alt_loss_A; the asymmetry setting is passed as `coefficient`.
loss = Loss(epsilon=1e-5, coefficient=args['asymmetry_parameter'])

# Count of trainable parameters (those with requires_grad set); logged to
# MLflow as 'Param: Parameters' by the training cell.
parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

In [6]:
# Optional warm start from an earlier run's artifact (presumably restores the
# model and optimizer state — confirm in model.utilities before enabling):
#load_full_state(model, optimizer, '/share/lazy/pv-finder_model_repo/24/9a2b98a397eb404497b26ab5eaa091a5/artifacts/train.ipynb')

run_name = 'u-net++'

# tune kernel based on gpu
#torch.backends.cudnn.benchmark=True

# Pair every per-epoch result produced by trainNet with its epoch index.
train_iter = enumerate(
    trainNet(
        model,
        optimizer,
        loss,
        train_loader,
        val_loader,
        args['epochs'],
        notebook=True,
    )
)

with mlflow.start_run(run_name=run_name) as run:
    # Store the notebook itself with the run so the exact code is kept.
    mlflow.log_artifact('train.ipynb')

    for i, result in train_iter:
        print(result.cost)

        # Serialise the whole model object after each epoch; the local file is
        # overwritten every time and then attached to the run as an artifact.
        torch.save(model, 'run_stats.pyt')
        mlflow.log_artifact('run_stats.pyt')

        # Per-epoch metrics followed by the run-level settings, recorded
        # against the epoch index.
        epoch_record = {
            'Metric: Training loss': result.cost,
            'Metric: Validation loss': result.val,
            'Metric: Efficiency': result.eff_val.eff_rate,
            'Metric: False positive rate': result.eff_val.fp_rate,
            'Param: Parameters': parameters,
            'Param: Events': events,
            'Param: Asymmetry': args['asymmetry_parameter'],
            'Param: Epochs': args['epochs'],
        }
        save_to_mlflow(epoch_record, step=i)

Number of batches: train = 5000, val = 313


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  file=sys.stderr,


HBox(children=(FloatProgress(value=0.0, description='Epochs', layout=Layout(flex='2'), style=ProgressStyle(des…

HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 0: train=7.88466, val=2.53085, took 1211.4 s
  Validation Found 84640 of 108007, added 358 (eff 78.37%) (0.0179 FP/event)
7.884662766218185


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 1: train=2.00213, val=2.30458, took 1204.6 s
  Validation Found 89746 of 108007, added 466 (eff 83.09%) (0.0233 FP/event)
2.002134434938431


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 2: train=1.90656, val=2.25787, took 1204.7 s
  Validation Found 91565 of 108007, added 516 (eff 84.78%) (0.0258 FP/event)
1.9065633627653122


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 3: train=1.85631, val=2.082, took 1207.0 s
  Validation Found 94612 of 108007, added 671 (eff 87.60%) (0.0335 FP/event)
1.856307295513153


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 4: train=1.82332, val=1.98138, took 1206.3 s
  Validation Found 96212 of 108007, added 736 (eff 89.08%) (0.0368 FP/event)
1.8233194808006286


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 5: train=1.80033, val=1.99751, took 1205.1 s
  Validation Found 95633 of 108007, added 677 (eff 88.54%) (0.0338 FP/event)
1.800329497027397


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 6: train=1.78072, val=1.9706, took 1200.2 s
  Validation Found 96596 of 108007, added 758 (eff 89.43%) (0.0379 FP/event)
1.7807168056249618


HBox(children=(FloatProgress(value=0.0, description='Training', layout=Layout(flex='2'), max=5000.0, style=Pro…

Epoch 7: train=1.76482, val=1.91978, took 1198.3 s
  Validation Found 96884 of 108007, added 729 (eff 89.70%) (0.0364 FP/event)
1.76482245657444


PermissionError: [Errno 13] Permission denied: '/share/lazy/pv-finder_model_repo/.trash'