In [25]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.hub import load_state_dict_from_url
from models.awpooling import AWPool2d
from models.vggaw import VGG11AW

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.air import session
from ray.air.config import RunConfig, CheckpointConfig
from ray.air.checkpoint import Checkpoint
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.search.hyperopt import HyperOptSearch


import os
import time
from utils import get_network

In [7]:
# Download locations of the torchvision VGG checkpoints used to initialise the
# backbones. The keys are plain architecture names while every URL points at the
# corresponding ``*_bn`` (batch-norm) weight file.
model_urls = dict(
    vgg11='https://download.pytorch.org/models/vgg11_bn-6002323d.pth',
    vgg13='https://download.pytorch.org/models/vgg13_bn-abd245e5.pth',
    vgg16='https://download.pytorch.org/models/vgg16_bn-6c64b313.pth',
    vgg19='https://download.pytorch.org/models/vgg19_bn-c79401a0.pth',
)

In [12]:
def get_loader(root='/root/notebooks/nfs/work/dataset/tiny-imagenet-200', batch_size=256, num_workers=4):
    """Build the training and validation DataLoaders for an ImageFolder dataset.

    Args:
        root: Dataset directory. Its ``train`` and ``val`` sub-directories are
            each read with ``torchvision.datasets.ImageFolder``.
        batch_size: Samples per batch, used for both loaders.
        num_workers: Number of DataLoader worker processes, used for both loaders.

    Returns:
        tuple: ``(train_loader, val_loader)``. The training loader shuffles and
        applies a random horizontal flip; the validation loader does neither.
    """
    # Per-channel statistics that torchvision documents for its ImageNet models.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    traindir = os.path.join(root, 'train')
    valdir = os.path.join(root, 'val')

    train_ds = datasets.ImageFolder(
        traindir,
        transform=transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    )

    # NOTE(review): ImageFolder takes labels from sub-directory names; confirm
    # that ``val`` is organised into one folder per class like ``train``.
    val_ds = datasets.ImageFolder(
        valdir,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )

    train_loader = torch.utils.data.DataLoader(
        train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    val_loader = torch.utils.data.DataLoader(
        val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )

    return train_loader, val_loader

In [13]:
def load_pretrain(model, pretrain):
    """Copy pretrained backbone tensors into ``model`` by position.

    Entries of ``model.state_dict()`` are visited in order. Keys containing
    ``'batch'`` are dropped, and keys containing ``'aw'`` or ``'classifier'``
    are skipped so they keep the model's own values. Every remaining key
    receives the next tensor from ``pretrain``. Matching is purely positional;
    key names of the two dicts are never compared.

    Args:
        model: ``torch.nn.Module`` to initialise (modified in place).
        pretrain: Ordered state dict providing the source tensors.

    Raises:
        IndexError: If ``pretrain`` holds fewer usable tensors than the model
            has keys to fill.
        RuntimeError: Raised by ``load_state_dict`` when a positionally matched
            tensor has the wrong shape.
    """
    model_dict = model.state_dict()
    # delete batch_tracked
    for k in [k for k in model_dict.keys() if 'batch' in k]:
        del model_dict[k]

    # Drop the BatchNorm ``num_batches_tracked`` counters on the checkpoint side
    # as well; otherwise a checkpoint that contains them shifts every later
    # tensor by one position relative to the filtered model keys.
    param_value = [v for k, v in pretrain.items() if not k.endswith('num_batches_tracked')]
    index = 0

    for k in model_dict.keys():
        if 'aw' in k or 'classifier' in k:
            continue
        model_dict[k] = param_value[index]
        index += 1

    # strict=False: the keys deleted above are intentionally missing.
    model.load_state_dict(model_dict, strict=False)

In [8]:
def train_model(config):
    """Ray Tune trainable: train VGG11AW from scratch and report validation metrics.

    Trains for 90 epochs with SGD (lr 0.1, StepLR every 30 epochs). After each
    epoch the validation accuracy and mean validation loss are reported to the
    Ray AIR session.

    Args:
        config: Trial configuration; passed unchanged to
            ``model.set_temperature``.

    Raises:
        AssertionError: If CUDA is not available.
    """
    assert torch.cuda.is_available()

    save_root = '/root/notebooks/nfs/work/larry.lai/AWPooling/baysopt'
    os.makedirs(save_root, exist_ok=True)

    # ``is_available`` must be called; the bare function object is always truthy.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_loader, val_loader = get_loader()

    model = VGG11AW(num_class=200)
    model.set_temperature(config)
    model.to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)

    num_epochs = 90
    best_acc = 0.0

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        for image, label in train_loader:
            image = image.to(device)
            label = label.to(device)

            logits = model(image)
            loss = criterion(logits, label)
            train_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        scheduler.step()
        print(f"Epoch[{epoch + 1}/{num_epochs}]: loss {train_loss / len(train_loader):.4f}")

        # ---- validation pass ----
        val_loss = 0.0
        corrects = 0
        model.eval()
        with torch.no_grad():
            for image, label in val_loader:
                image = image.to(device)
                label = label.to(device)

                logits = model(image)
                loss = criterion(logits, label)
                val_loss += loss.item()

                _, pred = logits.max(dim=1)
                # .item() keeps the counter a Python int instead of a CUDA tensor.
                corrects += pred.eq(label).sum().item()

        acc = corrects / len(val_loader.dataset)
        best_acc = max(best_acc, acc)

        # NOTE(review): nothing in this function writes files into save_root, so
        # this checkpoint wraps whatever the directory already contains — confirm
        # whether model weights were meant to be saved here first.
        checkpoint = Checkpoint.from_directory(save_root)
        session.report({"accuracy": acc, "loss": val_loss / len(val_loader)}, checkpoint=checkpoint)

    print(f"trial {session.get_trial_name()}: best acc: {best_acc:.4f}")
    print("Finish training")

In [31]:
def train_from_pretrain(config, data=None):
    """Ray Tune trainable: fine-tune a pretrained network and report validation metrics.

    Args:
        config: Trial configuration. Reads ``config['arch']``,
            ``config['num_class']`` and ``config['epochs']``; the whole dict is
            also passed to ``model.set_temperature``.
        data: Pretrained state dict forwarded to ``load_pretrain``.

    Raises:
        AssertionError: If CUDA is not available.
    """
    assert torch.cuda.is_available()

    # ``is_available`` must be called; the bare function object is always truthy.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_loader, val_loader = get_loader()

    # load model and pretrain weight
    model = get_network(config['arch'], config['num_class'])
    load_pretrain(model, data)
    model.set_temperature(config)
    model.to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9, weight_decay=1e-4)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)

    for epoch in range(config['epochs']):
        # ---- training pass ----
        model.train()
        # Loop variables are named image/label so the ``data`` argument is not shadowed.
        for image, label in train_loader:
            image = image.to(device)
            label = label.to(device)

            logits = model(image)
            loss = criterion(logits, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # ---- validation pass ----
        val_loss = 0.0
        corrects = 0
        model.eval()
        with torch.no_grad():
            for image, label in val_loader:
                image = image.to(device)
                label = label.to(device)

                logits = model(image)
                loss = criterion(logits, label)
                val_loss += loss.item()

                _, pred = logits.max(dim=1)
                corrects += pred.eq(label).sum().item()

        acc = corrects / len(val_loader.dataset)

        # Build the checkpoint that is handed to session.report (the name was
        # previously undefined here). Tensors are moved to CPU so the checkpoint
        # can be loaded on a machine without a GPU.
        checkpoint = Checkpoint.from_dict({
            "epoch": epoch + 1,
            "model_state_dict": {k: v.cpu() for k, v in model.state_dict().items()},
        })
        # "epoch" (completed epochs, 1-based) is reported so that schedulers
        # configured with time_attr='epoch' can find it in the result.
        session.report(
            {"epoch": epoch + 1, "accuracy": acc, "loss": val_loss / len(val_loader)},
            checkpoint=checkpoint,
        )

In [41]:
import random
def util(config, data=None):
    """Dummy trainable that reports 50 random accuracies to the Ray session.

    Args:
        config: Trial configuration; ``config['arch']`` and
            ``config['num_class']`` are read to build a network.
        data: Accepted for interface compatibility; not used.
    """
    ckpt_dir = os.path.join('/root/notebooks/nfs/work/larry.lai/AWPooling/baysopt')
    # The network is instantiated but not used any further in this function.
    _network = get_network(config['arch'], num_class=config['num_class'])

    step = 0
    while step < 50:
        ckpt = Checkpoint.from_directory(ckpt_dir)
        metrics = {'epoch': step, 'accuracy': random.randint(1, 100)}
        session.report(metrics, checkpoint=ckpt)
        time.sleep(0.1)
        step += 1
    
#     print("trial terminate!")

In [45]:
# Smoke test of the Tune plumbing: five runs of the dummy `util` trainable with a
# fixed (non-searched) configuration.
search_space = dict(arch='vgg11awt', num_class=200)
pretrain = load_state_dict_from_url(model_urls['vgg11'])

# One GPU and two CPUs per trial.
trainable = tune.with_resources(util, {'gpu': 1, 'cpu': 2})

# ASHA compares trials on 'accuracy' and uses the reported 'epoch' as its time axis.
asha_scheduler = ASHAScheduler(time_attr='epoch', metric='accuracy', mode='max', grace_period=30)
tune_config = tune.TuneConfig(
    num_samples=5,
    scheduler=asha_scheduler,
)

# Keep only the three checkpoints with the highest accuracy.
checkpoint_config = CheckpointConfig(
    num_to_keep=3,
    checkpoint_score_attribute='accuracy',
    checkpoint_score_order='max',
)
run_config = RunConfig(
    name='bays',
    local_dir='/root/notebooks/nfs/work/larry.lai/AWPooling/HPO/tiny-imagenet-test',
    checkpoint_config=checkpoint_config,
)

tuner = tune.Tuner(
    tune.with_parameters(trainable, data=pretrain),
    tune_config=tune_config,
    run_config=run_config,
    param_space=search_space,
)
tuner.fit()

0,1
Current time:,2023-02-20 20:26:07
Running for:,00:04:07.75
Memory:,96.6/503.6 GiB

Trial name,status,loc,iter,total time (s),epoch,accuracy
util_2c667_00000,TERMINATED,10.233.108.232:2402100,50,111.253,49,52
util_2c667_00001,TERMINATED,10.233.108.232:2402263,50,111.394,49,49
util_2c667_00002,TERMINATED,10.233.108.232:2402100,31,71.4496,30,45
util_2c667_00003,TERMINATED,10.233.108.232:2402263,50,116.218,49,24
util_2c667_00004,TERMINATED,10.233.108.232:2402100,31,60.6794,30,51


Trial name,accuracy,date,done,episodes_total,epoch,experiment_id,hostname,iterations_since_restore,node_ip,pid,should_checkpoint,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
util_2c667_00000,52,2023-02-20_20-23-53,True,,49,d9da323af6614a64bcea3cf0bd83c420,forge-5rqsrx8c-310832011-797d64d666-cx4nd,50,10.233.108.232,2402100,True,111.253,2.41024,111.253,1676895833,0,,50,2c667_00000,0.00576735
util_2c667_00001,49,2023-02-20_20-23-56,True,,49,587376b0c7194ad5839e8b3ef204f5fc,forge-5rqsrx8c-310832011-797d64d666-cx4nd,50,10.233.108.232,2402263,True,111.394,1.52769,111.394,1676895836,0,,50,2c667_00001,0.00940251
util_2c667_00002,45,2023-02-20_20-25-06,True,,30,d9da323af6614a64bcea3cf0bd83c420,forge-5rqsrx8c-310832011-797d64d666-cx4nd,31,10.233.108.232,2402100,True,71.4496,1.57841,71.4496,1676895906,0,,31,2c667_00002,0.00576735
util_2c667_00003,24,2023-02-20_20-25-53,True,,49,587376b0c7194ad5839e8b3ef204f5fc,forge-5rqsrx8c-310832011-797d64d666-cx4nd,50,10.233.108.232,2402263,True,116.218,2.30051,116.218,1676895953,0,,50,2c667_00003,0.00940251
util_2c667_00004,51,2023-02-20_20-26-07,True,,30,d9da323af6614a64bcea3cf0bd83c420,forge-5rqsrx8c-310832011-797d64d666-cx4nd,31,10.233.108.232,2402100,True,60.6794,1.2617,60.6794,1676895967,0,,31,2c667_00004,0.00576735


2023-02-20 20:26:07,430	INFO tune.py:762 -- Total run time: 247.88 seconds (247.69 seconds for the tuning loop).


<ray.tune.result_grid.ResultGrid at 0x7fbeb566bdc0>

In [None]:
def main(args):
    """Run a Bayesian-optimisation search over the five temperature values.

    Args:
        args: Object providing ``arch`` (architecture name), ``epoch`` (epochs
            per trial) and ``num_sample`` (number of trials).

    Returns:
        The result grid returned by ``Tuner.fit()``.

    Raises:
        KeyError: If the prefix of ``args.arch`` before its first ``'a'`` is not
            a key of ``model_urls``.
    """
    search_space = {
        "t0": tune.uniform(1e-5, 10),
        "t1": tune.uniform(1e-5, 10),
        "t2": tune.uniform(1e-5, 10),
        "t3": tune.uniform(1e-5, 10),
        "t4": tune.uniform(1e-5, 10),
        "arch": args.arch,
        "num_class": 200,
        "epochs": args.epoch,
    }

    # Everything before the first 'a' selects the checkpoint, e.g. 'vgg11awt' -> 'vgg11'.
    name = args.arch.split('a')[0]
    pretrain = load_state_dict_from_url(model_urls[name])

    # define search algorithm
    algo = BayesOptSearch(metric='accuracy', mode='max')

    # allocate trial resources
    trainable = tune.with_resources(train_from_pretrain, {'gpu': 1, 'cpu': 2})

    # define trial scheduler; it reads the 'epoch' key of each reported result
    asha_scheduler = ASHAScheduler(
        time_attr='epoch',
        metric='accuracy',
        mode='max',
        grace_period=30,
    )

    # param_space is an argument of Tuner, not of TuneConfig.
    tune_config = tune.TuneConfig(
        num_samples=args.num_sample,
        search_alg=algo,
        scheduler=asha_scheduler,
    )

    checkpoint_config = CheckpointConfig(
        num_to_keep=3,
        checkpoint_score_attribute='accuracy',
        checkpoint_score_order='max'
    )

    run_config = RunConfig(
        name='bays_'+args.arch,
        local_dir='./HPO/tiny-imagenet/',
        checkpoint_config=checkpoint_config,
    )

    tuner = tune.Tuner(
        tune.with_parameters(trainable, data=pretrain),
        tune_config=tune_config,
        run_config=run_config,
        param_space=search_space,
    )
    results = tuner.fit()
    return results

In [None]:
# Bayesian-optimisation search over the five temperatures t0..t4, each drawn
# uniformly from [1e-5, 10], using the from-scratch `train_model` trainable.
search_space = {f"t{idx}": tune.uniform(1e-5, 10) for idx in range(5)}

algo = BayesOptSearch(metric='accuracy', mode='max')
tune_config = tune.TuneConfig(num_samples=30, search_alg=algo)

# Keep only the three checkpoints with the highest accuracy.
checkpoint_config = CheckpointConfig(
    num_to_keep=3,
    checkpoint_score_attribute='accuracy',
    checkpoint_score_order='max',
)

tuner = tune.Tuner(
    tune.with_resources(train_model, {'gpu': 2, 'cpu': 4}),
    tune_config=tune_config,
    run_config=RunConfig(
        local_dir='./test_run',
        name='bays_epoch90',
        checkpoint_config=checkpoint_config,
    ),
    param_space=search_space,
)
results = tuner.fit()

In [20]:
from ray.tune import ExperimentAnalysis

# Open a finished experiment from disk; 'accuracy' (maximised) is the default
# ranking used by the analysis object.
experiment = ExperimentAnalysis(
    'test_run/bays_vgg11aw_epoch90',
    default_metric='accuracy',
    default_mode='max',
)

In [23]:
# Show the configuration of the best trial under the defaults given when
# `experiment` was created (default_metric='accuracy', default_mode='max').
experiment.best_config

{'t0': 0.8872739874944279,
 't1': 0.19094476202714777,
 't2': 9.0334249741,
 't3': 5.974913130728477,
 't4': 7.412619693559958}

In [15]:
import numpy as np
# Worked example: softmax-weighted average of four values at temperature t.
a = np.array([9, 7, 8, 6])
t = 10
i = np.exp(np.divide(a, t))   # unnormalised weights exp(a / t)
d = sum(i)                    # normalising constant
w = np.divide(i, d)           # weights sum to 1
np.dot(w, a)

7.624647182103896

In [30]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook; this
# display appears to rely on leftover kernel state — confirm where it came from.
df

Unnamed: 0,mean_accuracy,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,trial_id,experiment_id,date,timestamp,...,time_since_restore,timesteps_since_restore,iterations_since_restore,warmup_time,config/t1,config/t2,config/t3,config/t4,config/t5,logdir
0,"tensor(0.7571, device='cuda:0')",9.670802,False,,,10,9412d_00000,b67d4feb27714c01bad462aaa7b9a6a0,2022-10-17_17-48-54,1666000134,...,140.423691,0,10,0.004107,5.032654,2.353893,0.511015,-0.905503,5.045072,/root/notebooks/ray_results/train_model_2022-1...


In [26]:
# NOTE(review): `df` is not assigned in any cell visible in this notebook, and the
# output below is a Ray `Result` rather than a table — confirm what it was bound to.
df

Result(metrics={'mean_accuracy': tensor(0.7571, device='cuda:0'), 'done': True, 'trial_id': '9412d_00000', 'experiment_tag': '0_t1=5.0327,t2=2.3539,t3=0.5110,t4=-0.9055,t5=5.0451'}, error=None, log_dir=PosixPath('/root/notebooks/ray_results/train_model_2022-10-17_17-46-30/train_model_9412d_00000_0_t1=5.0327,t2=2.3539,t3=0.5110,t4=-0.9055,t5=5.0451_2022-10-17_17-46-31'))