In [None]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from models.awpooling import AWPool2d
from models.vggaw import VGG11AW

import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.air import session
from ray.air.config import RunConfig, CheckpointConfig
from ray.air.checkpoint import Checkpoint
from ray.tune.search.bayesopt import BayesOptSearch
from ray.tune.search.hyperopt import HyperOptSearch

import os
import time

In [2]:
def get_loader(root='/root/notebooks/nfs/work/dataset/tiny-imagenet-200', batch_size=256, num_workers=4):
    """Build the training and validation DataLoaders for an ImageFolder dataset.

    Args:
        root: Dataset directory containing ``train`` and ``val`` sub-directories,
            each readable by ``torchvision.datasets.ImageFolder``.
        batch_size: Number of samples per batch for both loaders.
        num_workers: Number of worker processes used by each loader.

    Returns:
        A ``(train_loader, val_loader)`` tuple. The training loader shuffles and
        applies a random horizontal flip; the validation loader does neither.
    """
    # Per-channel mean/std applied after ToTensor() on both splits.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    traindir = os.path.join(root, 'train')
    valdir = os.path.join(root, 'val')

    train_ds = datasets.ImageFolder(
        traindir,
        transform=transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    )

    val_ds = datasets.ImageFolder(
        valdir,
        transform=transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])
    )

    train_loader = torch.utils.data.DataLoader(
        train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True
    )
    val_loader = torch.utils.data.DataLoader(
        val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True
    )

    return train_loader, val_loader

In [8]:
def train_model(config):
    """Train VGG11AW for one Ray Tune trial, reporting validation metrics every epoch.

    Args:
        config: Trial hyperparameters supplied by Ray Tune; passed unchanged to
            ``model.set_temperature``.

    Reports (through ``session.report``) after every epoch:
        accuracy: top-1 accuracy on the validation loader, as a Python float.
        loss: mean validation loss per batch.
        A checkpoint directory holding the current model weights accompanies
        each report.

    Raises:
        AssertionError: if no CUDA device is available.
    """
    assert torch.cuda.is_available()

    num_epochs = 90

    save_root = '/root/notebooks/nfs/work/larry.lai/AWPooling/baysopt'
    # Each trial writes to its own sub-directory so that trials running at the
    # same time cannot overwrite one another's weights.
    trial_dir = os.path.join(save_root, session.get_trial_id())
    os.makedirs(trial_dir, exist_ok=True)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    train_loader, val_loader = get_loader()

    model = VGG11AW(num_class=200)
    model.set_temperature(config)
    model.to(device)

    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
    criterion = nn.CrossEntropyLoss()
    criterion.to(device)
    best_acc = 0.0

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss = 0.0
        for image, label in train_loader:
            image = image.to(device)
            label = label.to(device)

            logits = model(image)
            loss = criterion(logits, label)
            train_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        scheduler.step()
        print(f"Epoch[{epoch + 1}/{num_epochs}]: loss {train_loss / len(train_loader):.4f}")

        # ---- validation pass ----
        val_loss = 0.0
        corrects = 0
        model.eval()
        with torch.no_grad():
            for image, label in val_loader:
                image = image.to(device)
                label = label.to(device)

                logits = model(image)
                val_loss += criterion(logits, label).item()

                _, pred = logits.max(dim=1)
                # .item() keeps the running count a plain int instead of a CUDA tensor.
                corrects += pred.eq(label).sum().item()

        acc = corrects / len(val_loader.dataset)
        best_acc = max(best_acc, acc)

        # Persist the weights first so the reported checkpoint actually contains them.
        torch.save(model.state_dict(), os.path.join(trial_dir, 'model.pth'))
        checkpoint = Checkpoint.from_directory(trial_dir)
        session.report({"accuracy": acc, "loss": val_loss / len(val_loader)}, checkpoint=checkpoint)

    print(f"trial {session.get_trial_name()}: best acc: {best_acc:.4f}")
    print("Finish training")

In [31]:
def search_t(config):
    """Score one temperature configuration with a pretrained VGG11AW.

    The saved best weights are loaded, ``config`` is applied through
    ``model.set_temperature``, and a single pass over the validation loader
    is run without gradients. The mean per-batch loss and the top-1 accuracy
    are reported to Ray Tune under the keys ``loss`` and ``mean_accuracy``.

    Args:
        config: Trial hyperparameters supplied by Ray Tune.

    Raises:
        AssertionError: if no CUDA device is available.
    """
    assert torch.cuda.is_available()

    gpu = torch.device('cuda')

    # Only the validation split is consumed below.
    _, val_loader = get_loader()

    weights_path = '/root/notebooks/nfs/work/larry.lai/AWPooling/checkpoints/tiny-imagenet/vgg11aw_best.pth.tar'
    net = VGG11AW(num_class=200).to(gpu)
    saved = torch.load(weights_path, map_location=gpu)
    net.load_state_dict(saved['state_dict'])
    net.set_temperature(config)

    loss_fn = nn.CrossEntropyLoss()
    loss_fn.to(gpu)

    loss_sum = 0
    n_correct = 0
    net.eval()
    with torch.no_grad():
        for images, targets in val_loader:
            images = images.to(gpu)
            targets = targets.to(gpu)

            outputs = net(images)
            batch_loss = loss_fn(outputs, targets)

            predicted = outputs.max(dim=1)[1]
            n_correct += predicted.eq(targets).sum()
            loss_sum += batch_loss.item()

    # Accuracy is per sample; loss is averaged per batch.
    accuracy = n_correct / len(val_loader.dataset)
    mean_loss = loss_sum / len(val_loader)

    accuracy = accuracy.data.cpu().numpy()
    session.report({"loss": float(mean_loss), "mean_accuracy": float(accuracy)})

    print("Finished")

In [None]:
# Search space: five temperatures t0..t4, each sampled uniformly from [1e-5, 10].
search_space = {f"t{idx}": tune.uniform(1e-5, 10) for idx in range(5)}

# Bayesian optimisation over the 'accuracy' value reported by train_model.
algo = BayesOptSearch(metric='accuracy', mode='max')
tune_config = tune.TuneConfig(
    num_samples=30,
    search_alg=algo
)

# Keep only the three checkpoints with the highest reported accuracy per trial.
checkpoint_config = CheckpointConfig(
    num_to_keep=3,
    checkpoint_score_attribute='accuracy',
    checkpoint_score_order='max'
)

# NOTE(review): train_model moves the model to a single CUDA device, so the
# second GPU requested per trial may sit idle -- confirm the intended resources.
tuner = tune.Tuner(
    tune.with_resources(train_model, {'gpu': 2, 'cpu': 4}),
    tune_config=tune_config,
    # The experiment name must match the directory loaded by the analysis cell
    # ('test_run/bays_vgg11aw_epoch90'); the previous name 'bays_epoch90' did not.
    run_config=RunConfig(local_dir='./test_run', name='bays_vgg11aw_epoch90', checkpoint_config=checkpoint_config),
    param_space=search_space,
)
results = tuner.fit()

In [20]:
from ray.tune import ExperimentAnalysis

# Open the stored Tune experiment; queries default to the trial with the
# maximum 'accuracy' unless a different metric/mode is passed explicitly.
experiment = ExperimentAnalysis(
    'test_run/bays_vgg11aw_epoch90',
    default_metric='accuracy',
    default_mode='max',
)

In [23]:
# Display the configuration of the best trial, using the default metric and mode given to ExperimentAnalysis.
experiment.best_config

{'t0': 0.8872739874944279,
 't1': 0.19094476202714777,
 't2': 9.0334249741,
 't3': 5.974913130728477,
 't4': 7.412619693559958}

In [15]:
import numpy as np

# Worked example: exponential (softmax-style) weighting of a small vector at
# temperature t, followed by the resulting weighted average of that vector.
a = np.array([9, 7, 8, 6])
t = 10                      # temperature
i = np.exp(a / t)           # unnormalised weights exp(a / t)
d = sum(i)                  # normalising constant
w = np.divide(i, d)         # weights summing to 1
np.dot(w, a)

7.624647182103896

In [30]:
# NOTE(review): `df` is not assigned in any cell visible here -- confirm which cell defines it so this survives Restart & Run All.
df

Unnamed: 0,mean_accuracy,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,trial_id,experiment_id,date,timestamp,...,time_since_restore,timesteps_since_restore,iterations_since_restore,warmup_time,config/t1,config/t2,config/t3,config/t4,config/t5,logdir
0,"tensor(0.7571, device='cuda:0')",9.670802,False,,,10,9412d_00000,b67d4feb27714c01bad462aaa7b9a6a0,2022-10-17_17-48-54,1666000134,...,140.423691,0,10,0.004107,5.032654,2.353893,0.511015,-0.905503,5.045072,/root/notebooks/ray_results/train_model_2022-1...


In [26]:
# NOTE(review): `df` is not assigned in any cell visible here -- confirm which cell defines it so this survives Restart & Run All.
df

Result(metrics={'mean_accuracy': tensor(0.7571, device='cuda:0'), 'done': True, 'trial_id': '9412d_00000', 'experiment_tag': '0_t1=5.0327,t2=2.3539,t3=0.5110,t4=-0.9055,t5=5.0451'}, error=None, log_dir=PosixPath('/root/notebooks/ray_results/train_model_2022-10-17_17-46-30/train_model_9412d_00000_0_t1=5.0327,t2=2.3539,t3=0.5110,t4=-0.9055,t5=5.0451_2022-10-17_17-46-31'))