### Train model in notebook (inefficient, for testing and debugging only)

In [1]:
import os

In [None]:
# NCCL / torch.distributed environment for this node. Set here so the values
# are already in the environment when the process group is created later on.
#
# Optional verbose diagnostics (left disabled):
#   os.environ['NCCL_DEBUG'] = 'INFO'
#   os.environ['NCCL_DEBUG_SUBSYS'] = 'ALL'
#   os.environ['TORCH_DISTRIBUTED_DEBUG'] = 'INFO'
os.environ.update({
    'NCCL_IB_DISABLE': '1',               # per NCCL docs: do not use the InfiniBand transport
    'NCCL_SOCKET_IFNAME': 'enp2s0f1np1',  # network interface NCCL should use (machine-specific)
    'NCCL_P2P_DISABLE': '1',              # per NCCL docs: disable direct GPU peer-to-peer transport
    'NCCL_BLOCKING_WAIT': '1',            # PyTorch-side NCCL blocking-wait switch
})

In [2]:
import sys
from os.path import join
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
from torch import optim, nn
import torch.multiprocessing as mp
from torch.distributed import init_process_group, destroy_process_group
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from astropy.io import fits
import pyxis.torch as pxt
import normflows as nf

from networks import *
from train import *
import config

# Shared project locations on the cluster filesystem.
# Training and validation both point at the same 'small' dataset directory.
train_dir = test_dir = '/ocean/projects/phy250048p/shared/datasets/small'
# Output locations for figures and saved models (note the trailing slashes).
fig_dir, model_dir = (
    '/ocean/projects/phy250048p/shared/figures/',
    '/ocean/projects/phy250048p/shared/models/',
)

In [3]:
# Show how many CUDA devices PyTorch can see; the value is the cell's output.
torch.cuda.device_count()

1

In [4]:
# Single-process run. world_size and save_every are handed to CNNTrainer
# further down; the remaining values come from the `train` section of config.
world_size, save_every = 1, 1
nepochs, batch_size, nfeatures = (
    config.train[key]
    for key in ('epoch_number', 'batch_size', 'feature_number')
)

In [5]:
# Rendezvous settings for a process group that lives entirely on this machine.
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "12355"
torch.cuda.set_device(0)
# Guard the initialisation so the cell can be re-run in a live kernel:
# creating the default process group a second time raises an error.
# world_size comes from the configuration cell instead of a duplicated literal.
if not torch.distributed.is_initialized():
    init_process_group(backend='nccl', rank=0, world_size=world_size)

In [6]:
# Wrap the training and validation directories in pyxis TorchDataset objects.
train_ds, valid_ds = (
    pxt.TorchDataset(dataset_path) for dataset_path in (train_dir, test_dir)
)

In [7]:
# Build the training and validation loaders with identical settings: pinned
# host memory and a DistributedSampler created over the same dataset.
train_dl, valid_dl = (
    DataLoader(
        dataset,
        batch_size=batch_size,
        pin_memory=True,
        sampler=DistributedSampler(dataset),
    )
    for dataset in (train_ds, valid_ds)
)

In [8]:
# Define flows
K = 4  # number of (autoregressive transform, permutation) pairs in the stack

latent_size = 2
hidden_units = 64
num_blocks = 2
context_size = 1024

# Each pair is a masked affine autoregressive layer, conditioned on
# context_size context features, followed by an LU-linear permutation.
flows = []
for _ in range(K):
    flows.append(
        nf.flows.MaskedAffineAutoregressive(
            latent_size,
            hidden_units,
            context_features=context_size,
            num_blocks=num_blocks,
        )
    )
    flows.append(nf.flows.LULinearPermute(latent_size))

# Set base distribution
# Use latent_size rather than a repeated literal so the base distribution
# always has the same dimensionality as the flow layers above.
q0 = nf.distributions.DiagGaussian(latent_size, trainable=False)

In [9]:
# Build the network around the flow stack, move it to GPU 0 and wrap it in
# DistributedDataParallel in a single expression.
model = DDP(
    ForkCNN(mode=1, base=q0, flows=flows).to(0),
    device_ids=[0],
)

In [10]:
# Alternative optimizer (disabled):
# optimizer = optim.SGD(model.parameters(),
#                       lr=config.train['initial_learning_rate'],
#                       momentum=config.train['momentum'])

# AdamW over all model parameters; learning rate and weight decay are read
# from the `train` section of config.
optimizer = optim.AdamW(
    params=model.parameters(),
    lr=config.train['initial_learning_rate'],
    weight_decay=config.train['weight_decay'],
)

In [11]:
# Assemble the trainer from the objects built above. The arguments are
# positional and must follow CNNTrainer's signature in train.py (not visible
# here); the literal 0 is presumably the rank / GPU id — TODO confirm.
trainer = CNNTrainer(world_size, model, nfeatures, train_ds, valid_ds, optimizer, 0, save_every, batch_size)

In [12]:
# Run training in-process. The argument 2 is presumably the number of epochs
# for this quick check (nepochs from config is not used here) — TODO confirm
# against CNNTrainer.train.
trainer.train(2)



torch.Size([10000, 1, 48, 48]) torch.Size([10000, 1, 48, 48]) torch.Size([10000, 2]) torch.Size([10000, 2])


In [12]:
# Tear down the default process group created for the in-notebook run.
# Guarded so that re-running this cell (or running it when no group was ever
# created) is a no-op instead of an error.
if torch.distributed.is_initialized():
    destroy_process_group()

In [12]:
# Launch train_nn in `world_size` worker processes; mp.spawn passes the process
# index as the first argument, followed by the values in `args`.
# NOTE(review): the output recorded below shows this call failing inside
# train.py (train_nn -> load_train_objs is called with 9 positional arguments
# but accepts 6 to 7). The mismatch is in train.py, not in this cell — FIXME.
mp.spawn(train_nn, args=(world_size, ForkCNN, CNNTrainer), nprocs=world_size)

INFO:Setup:Initializing
INFO:Setup:[rank: 0] Successfully set up device
INFO:Setup:Setting up for density estimation


ProcessRaisedException: 

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/jet/home/xwang30/.conda/envs/kl-nn/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 90, in _wrap
    fn(i, *args)
  File "/jet/home/xwang30/kl-nn/arch/train.py", line 290, in train_nn
    train_ds, valid_ds, model, optimizer = load_train_objs(mode, nfeatures, batch_size, world_size, Model, rank, epoch, base, flows)
TypeError: load_train_objs() takes from 6 to 7 positional arguments but 9 were given
