## More automated approach to training architectures

### Setting up the imports/project path

In [6]:
# Ensure notebook sees necessary paths
import os, sys
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [7]:
# Imports
from DataObjects.DataLoader import DataLoader
from Architectures.DifussionModel import DiffusionModel
from Trainers.DiffusionTrainer import TrainingConfig

# Noise schedulers
from diffusers import DDPMScheduler
from diffusers.optimization import get_cosine_schedule_with_warmup
from diffusers import DDPMPipeline

# Torch files
import torch
import torch.nn.functional as F

from Utils.utils import make_grid, evaluate

from Architectures.DifussionModel import train_loop

In [8]:
#Set up the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


### Setup training objects for diffusion

In [4]:
# loading the config
config = TrainingConfig().from_json("../configs/config1.json")

In [9]:
# Load the data
Data = DataLoader(config.data_dir, batch_size=config.train_batch_size, shuffle=True, fraction = config.fraction, raw = True, dim_shape=config.image_size)
print(f"Number of data batches: {len(Data)}")

train_dataloader = torch.utils.data.DataLoader(Data, batch_size=config.train_batch_size, shuffle=True)

Number of data batches: 78


In [None]:
noise_scheduler = DDPMScheduler(num_train_timesteps=config.num_train_timesteps, beta_start=config.beta_start, beta_end=config.beta_end, beta_schedule=config.beta_schedule, variance_type=config.variance_type)
model = DiffusionModel.return_custom_arch(config)
optimizer = torch.optim.AdamW(model.parameters(), lr=config.learning_rate)

In [None]:
lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=config.lr_warmup_steps,
    num_training_steps=(len(Data) * config.num_epochs * config.train_batch_size),
)

In [None]:
## Launch the training
from accelerate import notebook_launcher
args = (config, model, noise_scheduler, optimizer, Data, lr_scheduler, device)

notebook_launcher(train_loop, args, num_processes=1)

Launching training on one GPU.


  deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False)
Epoch 0: 100%|██████████| 95/95 [00:28<00:00,  3.28it/s, loss=0.625, lr=1.9e-5, step=94] 
Epoch 1: 100%|██████████| 95/95 [00:26<00:00,  3.64it/s, loss=0.0408, lr=3.8e-5, step=189]
Epoch 2: 100%|██████████| 95/95 [00:51<00:00,  1.84it/s, loss=0.0165, lr=5.7e-5, step=284] 
Epoch 3: 100%|██████████| 95/95 [00:27<00:00,  3.48it/s, loss=0.0119, lr=7.6e-5, step=379]
Epoch 4: 100%|██████████| 95/95 [00:27<00:00,  3.47it/s, loss=0.0284, lr=9.5e-5, step=474] 
Epoch 5: 100%|██████████| 95/95 [00:37<00:00,  2.54it/s, loss=0.00365, lr=0.0001, step=569]
Epoch 6: 100%|██████████| 95/95 [00:31<00:00,  3.01it/s, loss=0.0709, lr=0.0001, step=664]
Epoch 7: 100%|██████████| 95/95 [00:30<00:00,  3.09it/s, loss=0.0034, lr=0.0001, step=759]
Epoch 8: 100%|██████████| 95/95 [00:31<00:00,  3.03it/s, loss=0.00891, lr=0.0001, step=854]
Epoch 9: 100%|██████████| 95/95 [00:31<00:00,  3.04it/s, loss=0.00839, lr=0.0001, step