### Train the model in the notebook (inefficient; for testing and debugging only)

In [1]:
import os

In [None]:
# Optional NCCL / torch.distributed debug logging; uncomment to enable.
#os.environ['NCCL_DEBUG'] = 'INFO'
#os.environ['NCCL_DEBUG_SUBSYS'] = 'ALL'
#os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'

# NCCL transport settings, exported before the process group is created.
os.environ.update({
    # Turn off the InfiniBand transport.
    'NCCL_IB_DISABLE': '1',
    # Network interface used for socket traffic (machine-specific name).
    'NCCL_SOCKET_IFNAME': 'enp2s0f1np1',
    # Turn off direct GPU peer-to-peer transport.
    'NCCL_P2P_DISABLE': '1',
    # NOTE(review): newer PyTorch releases appear to prefer
    # TORCH_NCCL_BLOCKING_WAIT -- confirm against the installed version.
    'NCCL_BLOCKING_WAIT': '1',
})

In [2]:
import sys
from os.path import join
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import optim, nn
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from astropy.io import fits
import pyxis.torch as pxt
import normflows as nf

from networks import *
from train import *
import config

# Dataset locations. Both names are bound to the same directory, so the
# "validation" dataset built from test_dir is read from the training data.
# NOTE(review): confirm this is intentional for the debug run.
train_dir = test_dir = '/ocean/projects/phy250048p/shared/datasets/small'

# Directories for figures and models (not referenced by the visible cells).
fig_dir = '/ocean/projects/phy250048p/shared/figures/'
model_dir = '/ocean/projects/phy250048p/shared/models/'

In [3]:
# Show how many CUDA devices are available to this process; the single-GPU
# setup in the following cells relies on device 0 existing.
torch.cuda.device_count()

1

In [4]:
# Single-process debug run: both values are passed to CNNTrainer below.
world_size, save_every = 1, 1

# Training hyper-parameters are read from the project's config module.
nepochs, batch_size, nfeatures = (
    config.train[key]
    for key in ('epoch_number', 'batch_size', 'feature_number')
)

In [5]:
# Rendezvous address/port for the single-process group running on this machine.
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"

# Make GPU 0 the current CUDA device for this process.
torch.cuda.set_device(0)

# The default process group can only be initialised once per process, so
# re-running this cell in a live kernel would raise. Skip initialisation when
# a group already exists to keep the cell idempotent.
if not torch.distributed.is_initialized():
    init_process_group(backend='nccl', rank=0, world_size=1)

In [6]:
# Open the training and validation datasets (in that order) from their
# respective directories.
train_ds, valid_ds = (
    pxt.TorchDataset(dataset_path) for dataset_path in (train_dir, test_dir)
)

In [7]:
# Both loaders are configured identically: same batch size, pinned host
# memory, and a DistributedSampler over the dataset they wrap.
train_dl, valid_dl = (
    DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        sampler=DistributedSampler(dataset),
    )
    for dataset in (train_ds, valid_ds)
)

In [None]:
# Define flows
K = 4  # number of (autoregressive, permutation) layer pairs in the flow

latent_size = 2
hidden_units = 64
num_blocks = 2
context_size = 1024

# Each of the K stages is a conditional masked affine autoregressive layer
# followed by an LU-decomposed linear permutation.
flows = []
for _layer in range(K):
    flows.extend([
        nf.flows.MaskedAffineAutoregressive(
            latent_size,
            hidden_units,
            context_features=context_size,
            num_blocks=num_blocks,
            use_batch_norm=True,
        ),
        nf.flows.LULinearPermute(latent_size),
    ])

# Set base distribution
# Its size is taken from latent_size (previously a hard-coded 2) so the base
# distribution and the flow layers cannot drift apart if latent_size changes.
q0 = nf.distributions.DiagGaussian(latent_size, trainable=False)

In [9]:
# Build the network from the base distribution and flow layers defined above,
# move it to GPU 0, then wrap it for distributed data-parallel training.
net = ForkCNN(mode=1, base=q0, flows=flows)
net.to(0)
model = DDP(net, device_ids=[0])

In [10]:
# Disabled alternative, kept for reference: SGD with momentum.
# optimizer = optim.SGD(model.parameters(), 
#                       lr=config.train['initial_learning_rate'],
#                       momentum=config.train['momentum'])

# Active optimizer: AdamW, with learning rate and weight decay taken from
# the project's training configuration.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=config.train['initial_learning_rate'],
    weight_decay=config.train['weight_decay'],
)

In [11]:
# Build the trainer from the DDP-wrapped model, the two datasets and the
# optimizer. The datasets themselves are passed; train_dl / valid_dl are not.
# NOTE(review): the arguments are positional and CNNTrainer's signature is not
# visible here; the literal 0 is presumably the rank / GPU id -- confirm.
trainer = CNNTrainer(world_size, model, nfeatures, train_ds, valid_ds, optimizer, 0, save_every, batch_size)

In [12]:
# Short debug run.
# NOTE(review): the argument is presumably the number of epochs; `nepochs`
# read from config in an earlier cell is not used here -- confirm which one
# is intended.
trainer.train(2)



torch.Size([100, 1024]) -3.735682725906372 5.698746681213379
torch.Size([100, 1024]) -2.4996755123138428 5.058440208435059
torch.Size([100, 1024]) -2.5209600925445557 7.4257988929748535
torch.Size([100, 1024]) -2.5083298683166504 4.513862609863281
torch.Size([100, 1024]) -2.4859349727630615 7.404705047607422
torch.Size([100, 1024]) -2.483673572540283 4.819789886474609
torch.Size([100, 1024]) -2.5560293197631836 6.319204330444336
torch.Size([100, 1024]) -2.4841442108154297 5.069055557250977
torch.Size([100, 1024]) -2.4822652339935303 5.7034759521484375
torch.Size([100, 1024]) -2.4698126316070557 4.039548397064209
torch.Size([100, 1024]) -2.5047030448913574 4.420663833618164
torch.Size([100, 1024]) -2.4737021923065186 4.379639148712158
torch.Size([100, 1024]) -2.5185751914978027 7.999060153961182
torch.Size([100, 1024]) -2.504322052001953 4.431675434112549
torch.Size([100, 1024]) -2.4504120349884033 4.746517658233643
torch.Size([100, 1024]) -2.488983154296875 7.229802131652832
torch.Size

KeyboardInterrupt: 

In [12]:
# Tear down the default process group. The guard makes the cell safe to run
# when no group exists (for example when it is re-run, or when initialisation
# never happened), where an unguarded call would raise.
if torch.distributed.is_initialized():
    destroy_process_group()

In [4]:
# Alternative launch path: start `world_size` worker processes running
# `train_nn`. mp.spawn passes each worker its process index as the first
# argument, followed by the entries of `args`.
# NOTE(review): the saved execution count is out of sequence with the cells
# above, so this was presumably run in a separate kernel session -- confirm.
mp.spawn(train_nn, args=(world_size, ForkCNN, CNNTrainer), nprocs=world_size)

INFO:Setup:Initializing
INFO:Setup:[rank: 0] Successfully set up device
INFO:Setup:Setting up for density estimation
INFO:Setup:[rank: 0] Successfully loaded training objects
INFO:Setup:[rank: 0] Successfully initialized Trainer
INFO:Trainer:Setting up tensors on GPU
INFO:Trainer:Uploading training set to GPU...
INFO:Trainer:10% complete
INFO:Trainer:20% complete
INFO:Trainer:30% complete
INFO:Trainer:40% complete
INFO:Trainer:50% complete
INFO:Trainer:60% complete
INFO:Trainer:70% complete
INFO:Trainer:80% complete
INFO:Trainer:90% complete
INFO:Trainer:Uploading validation set to GPU...
INFO:Trainer:10% complete
INFO:Trainer:20% complete
INFO:Trainer:30% complete
INFO:Trainer:40% complete
INFO:Trainer:50% complete
INFO:Trainer:60% complete
INFO:Trainer:70% complete
INFO:Trainer:80% complete
INFO:Trainer:90% complete
INFO:Trainer:Training start
INFO:Trainer:Starting epoch 0
INFO:Trainer:Randomized SNR and noise for epoch 0
INFO:Trainer:Batch 0 complete
INFO:Trainer:[TRAIN] Epoch: 1 Lo

Loaded new model
torch.Size([10000, 1, 48, 48]) torch.Size([10000, 1, 48, 48]) torch.Size([10000, 2]) torch.Size([10000, 2])
