In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
from tqdm.notebook import tqdm

import numpy as np 
import pathlib
import matplotlib.pyplot as plt

# Importing utitility functions for training
from PT_files.model import DnCNN, DnCNN_B
from PT_files.Dataset import Img_Dataset, Large_Img_Dataset
import PT_files.preprocess_data as ppd
import PT_files.save_load as sl

import pprint

device = "cuda" if torch.cuda.is_available() else "cpu"

Using cuda device


In [2]:
%%capture
# !pip install wandb --upgrade

In [3]:
import wandb

wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmdowicz[0m (use `wandb login --relogin` to force relogin)


True

In [4]:
sweep_config = {
    'method': 'grid'
    }

metric = {
    'name': 'train_loss',
    'goal': 'minimize',
    'name': 'val_loss',
    'goal': 'minimize'
    }

sweep_config['metric'] = metric

In [5]:
# for 2k modele

# parameters_dict = {
#     'optimizer': {
#         'values': ['adam', 'sgd']
#         },
#     'num_layers': {
#         'values': [15, 20, 25, 30]
#         },
#     'num_features': {
#         'values': [96, 128, 256]
#         },
#     'epochs': {
#         'values': [200]
#         },
#     'learning_rate': {
#         # a flat distribution between 0 and 0.1
#         'distribution': 'uniform',
#         'min': 0,
#         'max': 1e-4
#         },
#     'batch_size': {
#         'values': [4, 8, 16]
#         },
#     'patch_size': {
#         'values': [10, 15, 20, 25, 30, 35]
#     }
# }

# sweep_config['parameters'] = parameters_dict

In [6]:
# # for 6k model

# parameters_dict = {
#     'optimizer': {
#         #'values': ['adam', 'sgd']
#         'values': ['adam']

#         },
#     'num_layers': {
#         'values': [10, 12, 14, 16, 17, 18, 19, 20]
#         },
#     'num_features': {
#         'values': [32, 40, 48, 50, 52, 54, 56, 58]
#         },
#     'epochs': {
#         'values': [200]
#         },
#     'learning_rate': {
#         # a flat distribution between 0 and 0.1
#         'distribution': 'uniform',
#         'min': 0.5e-3,
#         'max': 1.5e-3
#         },
#     'batch_size': {
#         'values': [12, 14, 16, 18, 20, 22, 24]
#         },
#     'patch_size': {
#         'values': [100, 125, 150, 175, 200, 250, 300, 400]
#         },
#     'eps': {
#         'values': [1e-6, 1e-7, 1e-8]
#     }
# }

# sweep_config['parameters'] = parameters_dict

In [7]:
parameters_dict = {
    'optimizer': {
        'values': ['adam']
        },
    'num_layers': {
        'values': [20]
        },
    'num_features': {
        'values': [64, 68, 72]
        },
    'epochs': {
        'values': [770, 780, 790, 800, 810]
        },
    'learning_rate': {
        'values': [0.001375]
        },
    'batch_size': {
        'values': [16, 24]
        },
    'patch_size': {
        'values': [150, 200, 250]
        },
    'eps': {
        'values': [1e-8]
    }
}

sweep_config['parameters'] = parameters_dict

In [8]:
pprint.pprint(sweep_config)

{'method': 'grid',
 'metric': {'goal': 'minimize', 'name': 'val_loss'},
 'parameters': {'batch_size': {'values': [16, 24]},
                'epochs': {'values': [770, 780, 790, 800, 810]},
                'eps': {'values': [1e-08]},
                'learning_rate': {'values': [0.001375]},
                'num_features': {'values': [64, 68, 72]},
                'num_layers': {'values': [20]},
                'optimizer': {'values': ['adam']},
                'patch_size': {'values': [150, 200, 250]}}}


In [9]:
sweep_id = wandb.sweep(sweep_config, project="6k_DnCNN_sweeps_adam")

Create sweep with ID: phzw21ra
Sweep URL: https://wandb.ai/mdowicz/6k_DnCNN_sweeps_adam/sweeps/phzw21ra


In [10]:
def model_pipeline(config=None):
    
    # tell wandb to get started
    with wandb.init(config=config):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config
        
        # make the model, data, and optimization problem
        model, train_loader, test_loader, criterion, optimizer = make(config)
        print(model)
        
        # and use them to train the model
        train(model, train_loader, test_loader, criterion, optimizer, config)

In [11]:
def make(config):
    # Make the data
    train = get_data(model_name="6k", patch_size=config.patch_size, train=True) 
    test = get_data(model_name="6k", patch_size=config.patch_size, train=False)
    train_loader = make_loader(train, batch_size=config.batch_size)
    test_loader = make_loader(test, batch_size=config.batch_size)
    
    # Make the model
    model = DnCNN(num_layers=config.num_layers,
                  num_features=config.num_features).to(device)
    
    # Make the loss and optimizer
    criterion = nn.MSELoss(reduction='sum') # add more to this. see 02A notebook
    optimizer = build_optimizer(model=model,
                                optimizer=config.optimizer,
                                learning_rate=config.learning_rate,
                                eps=config.eps)
    
    
    return model, train_loader, test_loader, criterion, optimizer 

In [12]:
def get_data(model_name, patch_size, train=True):
    
    if model_name == "6k":
        if train == True:
            # Raw training data
            training_data = sl.NERSC_load('training_data_60%_6000.npy')        
            # Processed training data for DnCNN
            dataset = Img_Dataset(data_set=training_data,
                                      patch_size=patch_size,
                                      width=6000,
                                      height=6000)
        else:
            
            # Raw test data
            test_data = sl.NERSC_load('test_data_40%_6000.npy')
            # Processed test data for DnCN
            dataset = Img_Dataset(data_set=test_data,
                                    patch_size=patch_size,
                                    width=6000,
                                    height=6000)
        
    elif model_name == "2k":
        if train == True:
            # Raw training & test data
            training_data = sl.NERSC_load('training_data_60%_2000.npy')        
            # Processed training & test data for DnCNN
            dataset = Img_Dataset(data_set=training_data,
                                      patch_size=patch_size,
                                      width=2000,
                                      height=2000)
        else:
            
            # Raw test data
            test_data = sl.NERSC_load('test_data_40%_2000.npy')
            # Processed test data for DnCN
            dataset = Img_Dataset(data_set=test_data,
                                    patch_size=patch_size,
                                    width=2000,
                                    height=2000)
        

    return dataset

def make_loader(dataset, batch_size):
    
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=True)
    
    return loader

def build_optimizer(model, optimizer, learning_rate, eps=None):
    if optimizer == "sgd":
        optimizer = torch.optim.SGD(model.parameters(),
                              lr=learning_rate, 
                              momentum=0.5)
    
    elif optimizer == "adam":
        optimizer = torch.optim.Adam(model.parameters(),
                               lr=learning_rate,
                                eps=eps)
        
    return optimizer

In [13]:
def train(model, train_loader, test_loader, criterion, optimizer, config):
    # Tell wandb to watch what the model gets up to: gradients, weights, etc.
    wandb.watch(model, criterion, log="all", log_freq=10)
    
    # Run training and track with wandb
    model.train()
    total_batches = len(train_loader) * config.epochs
    example_ct = 0 # number of examples seen
    batch_ct = 0
    
  
    for epoch in tqdm(range(config.epochs)):
        train_tot_loss = 0
        val_tot_loss = 0
        # Evaluate the training loss
        for batch_index, (images, labels) in enumerate(train_loader):
            
            loss = train_batch(images=images,
                               labels=labels,
                               model=model,
                               optimizer=optimizer,
                               criterion=criterion)
            train_tot_loss += loss
            example_ct += len(images)
            batch_ct += 1
            

        # Evaluate the validation loss       
        model.eval()
        for batch_index, (images, labels) in enumerate(test_loader):
            images, labels = images.to(device), labels.to(device)
            with torch.no_grad():
                val_loss = validate_batch(images=images,
                                          labels=labels,
                                          model=model,
                                          criterion=criterion)
            val_tot_loss += val_loss
        # Log validation and training loss
        wandb.log({"val_loss": val_tot_loss, "train_loss": train_tot_loss})
    
                
                
def train_batch(images, labels, model, optimizer, criterion):
    images, labels = images.to(device), labels.to(device)
    
    # Forward Pass ->
    output = model(images)
    loss = criterion(output, labels) /(2*len(images))
    
    # Backward pass <-
    optimizer.zero_grad()
    loss.backward()
    
    # Step with optimizer
    optimizer.step()
    
    return loss
    # return np.log(loss.detach().cpu().numpy())

def validate_batch(images, labels, model, criterion):
    output = model(images)
    loss = criterion(output, labels) /(2*len(images))
    # loss = np.log(loss.cpu())
    
    return loss

In [None]:
wandb.agent(sweep_id, model_pipeline)

[34m[1mwandb[0m: Agent Starting Run: uf601jfz with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 770
[34m[1mwandb[0m: 	eps: 1e-08
[34m[1mwandb[0m: 	learning_rate: 0.001375
[34m[1mwandb[0m: 	num_features: 64
[34m[1mwandb[0m: 	num_layers: 20
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	patch_size: 150
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


DnCNN(
  (layers): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (3): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (4): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True

  0%|          | 0/770 [00:00<?, ?it/s]




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_loss,▇▇▇▂▂▂▁▁▁▁▁▁▁▁▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇
val_loss,██▇▂▂▁▁▂▂▁▁▁▁▁██████████████████████████

0,1
train_loss,44.97177
val_loss,27.28059


[34m[1mwandb[0m: Agent Starting Run: 6skkulfo with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 770
[34m[1mwandb[0m: 	eps: 1e-08
[34m[1mwandb[0m: 	learning_rate: 0.001375
[34m[1mwandb[0m: 	num_features: 64
[34m[1mwandb[0m: 	num_layers: 20
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	patch_size: 200
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


DnCNN(
  (layers): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (3): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (4): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True

  0%|          | 0/770 [00:00<?, ?it/s]




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_loss,█▇▇█▇▇▆▅▃▂▂▂▁▁▁▂▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▂▁▇▇▇▇▇
val_loss,█████▇▇▅▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█████

0,1
train_loss,84.99322
val_loss,50.70247


[34m[1mwandb[0m: Agent Starting Run: rsk33jcn with config:
[34m[1mwandb[0m: 	batch_size: 16
[34m[1mwandb[0m: 	epochs: 770
[34m[1mwandb[0m: 	eps: 1e-08
[34m[1mwandb[0m: 	learning_rate: 0.001375
[34m[1mwandb[0m: 	num_features: 64
[34m[1mwandb[0m: 	num_layers: 20
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	patch_size: 250
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


DnCNN(
  (layers): Sequential(
    (0): Sequential(
      (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): ReLU(inplace=True)
    )
    (1): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (2): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (3): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (4): Sequential(
      (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True

  0%|          | 0/770 [00:00<?, ?it/s]

In [None]:
# model = model_pipeline(config)