In [1]:
import wandb
wandb.login()

import torch
from torch import nn
from torch.utils.data import DataLoader
# from model import DnCNN
# from Dataset import Img_Dataset
import numpy as np 
import pathlib
import matplotlib.pyplot as plt

# Importing utitility functions for training
from PT_files.model import DnCNN, DnCNN_B
from PT_files.Dataset import Img_Dataset, Large_Img_Dataset
import PT_files.preprocess_data as ppd
import PT_files.save_load as sl

device = "cuda" if torch.cuda.is_available() else "cpu"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmdowicz[0m (use `wandb login --relogin` to force relogin)


Using cuda device


In [None]:
training_data = sl.NERSC_load('training_data_60%_6000.npy')
test_data = sl.NERSC_load('test_data_40%_6000.npy')

In [None]:
train_dataset = Img_Dataset(data_set=training_data,
                                  patch_size=150,
                                  width=6000,
                                  height=6000)

test_dataset = Img_Dataset(data_set=test_data,
                                patch_size=150,
                                width=6000,
                                height=6000)

In [None]:
#@title
import wandb
import math
import random
import torch, torchvision
import torch.nn as nn
import torchvision.transforms as T
from tqdm.notebook import tqdm


def get_dataloader(is_train, batch_size, slice=5):
    "Get a training dataloader"
    full_dataset = torchvision.datasets.MNIST(root=".", train=is_train, transform=T.ToTensor(), download=True)
    sub_dataset = torch.utils.data.Subset(full_dataset, indices=range(0, len(full_dataset), slice))
    loader = torch.utils.data.DataLoader(dataset=sub_dataset, 
                                         batch_size=batch_size, 
                                         shuffle=True if is_train else False, 
                                         pin_memory=True, num_workers=2)
    return loader


def get_dataloader(is_train, batch_size):
    "Get a training dataloader"
    train_dataset = Img_Dataset(data_set=training_data,
                                  patch_size=150,
                                  width=6000,
                                  height=6000)

    test_dataset = Img_Dataset(data_set=test_data,
                                patch_size=150,
                                width=6000,
                                height=6000)
    if is_train == True:
        loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                         batch_size=batch_size, 
                                         shuffle=True if is_train else False)
        
    else:
        loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                         batch_size=batch_size, 
                                         shuffle=True if is_train else False)
    return loader

In [11]:
def train_one_step(model, imgs, labels, optimizer):
    optimizer.zero_grad()
  
    
    output = model(imgs)
    loss_fn = nn.MSELoss(reduction='sum')
    loss = loss_fn(output, labels) /(2*len(imgs))

    loss.backward()
    optimizer.step()
    return loss

def train_one_epoch(model, dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch_index, (imgs, labels) in enumerate(dataloader):
        
        loss = train_one_step(model, imgs, labels, optimizer)
        total_loss += loss
        
    return np.log(total_loss)

In [12]:
def validate_one_step(model, imgs, labels):
    output = model(imgs)
    loss_fn = nn.MSELoss(reduction='sum')
    loss = loss_fn(output, labels) /(2*len(imgs))
    return loss

def validate_one_epoch(model, dataloader):
    model.eval()
    total_loss = 0
    for batch_index, (imgs, labels) in enumerate(dataloader):

        with torch.no_grad():
            loss = validate_one_step(model, imgs, labels)
        total_loss += loss
        
    return np.log(total_loss)

In [13]:
def build_optimizer(model, optimizer, learning_rate):
    
    if optimizer == "sgd":
        optimizer = torch.optim.SGD(mode.parameters(),
                                    lr=learning_rate,
                                    momentum=0.9)
    elif optimizer == "adam":
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=learning_rate)
    return optimizer

In [14]:
sweep_config

{'method': 'random',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'optimizer': {'values': ['adam', 'sgd']},
  'fc_num_layers': {'values': [17, 20, 25, 30]},
  'fc_num_features': {'values': [64, 128, 256]},
  'epochs': {'value': 2},
  'learning_rate': {'distribution': 'uniform', 'min': 0, 'max': 0.1},
  'batch_size': {'values': [16, 32, 64, 128, 256]}}}

In [1]:
for _ in range(5):
    # initialize a wandb run
    wandb.init(
        project="pytorch-DnCNN",
        config={
            "epochs": 2,
            "batch_size": [8, 16, 32, 64, 128],
            "lr": 

SyntaxError: EOL while scanning string literal (4007491228.py, line 4)

In [18]:
def train(config=None):
    # Initialize a new wandb run
    with wandb.init(config=config):
        # If calld by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        
        # Training Data
        train_loader = DataLoader(train_dataset, 
                                  config.batch_size)
        # Validation Data
        val_loader = DataLoader(test_dataset,
                                 batch_size=config.batch_size)
        
        # Model
        model = DnCNN(num_layers=config.fc_num_layers,
                        num_features=config.fc_num_features)
        
        # Optimizer
        optimizer = build_optimizer(model, config.optimizer, config.learning_rate)
        
        # Track loss curves
        losses, val_losses = [], []
        
        for epoch in range(config.epochs):
            losses.append(train_one_epoch(model, train_loader, optimizer)).cpu().detach().numpy()
            train_metrics = {"train_loss": train_one_epoch(model, train_loader, optimizer)}
                             
            val_loses.append(validate_one_epoch(model, test_loader)).cpu().detach().numpy()
            val_metrics = {"val_loss": validate_one_epoch(model, test_loader)}
                             
            wandb.log({**train_metrics, **val_metrics})
            
    wandb.finish()

In [19]:
wandb.agent(sweep_id, train, count=5)

[34m[1mwandb[0m: Agent Starting Run: pfbzxnxx with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	fc_num_features: 256
[34m[1mwandb[0m: 	fc_num_layers: 25
[34m[1mwandb[0m: 	learning_rate: 0.046667252374289016
[34m[1mwandb[0m: 	optimizer: sgd
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

Run pfbzxnxx errored: NameError("name 'mode' is not defined")
[34m[1mwandb[0m: [32m[41mERROR[0m Run pfbzxnxx errored: NameError("name 'mode' is not defined")
[34m[1mwandb[0m: Agent Starting Run: eigjod06 with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	fc_num_features: 128
[34m[1mwandb[0m: 	fc_num_layers: 25
[34m[1mwandb[0m: 	learning_rate: 0.09004904601154037
[34m[1mwandb[0m: 	optimizer: sgd
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

Run eigjod06 errored: NameError("name 'mode' is not defined")
[34m[1mwandb[0m: [32m[41mERROR[0m Run eigjod06 errored: NameError("name 'mode' is not defined")
[34m[1mwandb[0m: Agent Starting Run: n7rqdiwm with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 2
[34m[1mwandb[0m: 	fc_num_features: 256
[34m[1mwandb[0m: 	fc_num_layers: 17
[34m[1mwandb[0m: 	learning_rate: 0.019228073331369434
[34m[1mwandb[0m: 	optimizer: sgd
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

Run n7rqdiwm errored: NameError("name 'mode' is not defined")
[34m[1mwandb[0m: [32m[41mERROR[0m Run n7rqdiwm errored: NameError("name 'mode' is not defined")
Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: [32m[41mERROR[0m Detected 3 failed runs in the first 60 seconds, killing sweep.
[34m[1mwandb[0m: To disable this check set WANDB_AGENT_DISABLE_FLAPPING=true


In [17]:
# wandb.agent(sweep_id, train(train_ds=train_dataset, val_ds=test_dataset, config=sweep_config), count=5)