In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
#export
from nb_004 import *

AttributeError: Final

In [None]:
# Root directory for datasets, given as a relative path.
DATA_PATH = Path('data')
# CIFAR-10 folder; its 'train' and 'test' subfolders are read when the datasets are built.
PATH = DATA_PATH/'cifar10'

In [None]:
# Three-value mean / std statistics (presumably one per colour channel) used to normalise the images.
data_mean = tensor([0.491, 0.482, 0.447])
data_std = tensor([0.247, 0.243, 0.261])
cifar_norm = normalize_tfm(mean=data_mean, std=data_std)

In [None]:
# Training-time pipeline. Going by the helper names and arguments (confirm against their definitions):
# flip with probability 0.5, pad by 4, crop back to size 32, then normalise with the CIFAR statistics.
train_tfms = [flip_lr_tfm(p=0.5),
              pad_tfm(padding=4),
              crop_tfm(size=32, row_pct=(0,1.), col_pct=(0,1.)),
              cifar_norm]
# Validation data only goes through the normalisation transform.
valid_tfms = [cifar_norm]

In [None]:
# Batch size handed to DataBunch.create when the data loaders are built.
bs = 64

In [None]:
#export
def to_half(b):
    "Return `b` as a two-item list with its first element cast to half precision and its second left as is."
    halved = b[0].half()
    return [halved, b[1]]

In [None]:
# Bare expression: echoes the DataBunch class as the cell output (inspection only, no side effects).
DataBunch

In [None]:
#export
@dataclass
class DeviceDataLoader():
    """Wrap a `DataLoader` so that every batch is passed through `to_device` as it is yielded.

    Optionally each batch is also sent through `to_half` and the batch stream is
    wrapped in a progress bar.
    """
    dl: DataLoader            # underlying loader that produces the batches
    device: torch.device      # forwarded to `to_device` for every batch
    progress_func: Callable   # called as progress_func(gen, total=..., leave=False); None disables it
    half: bool = False        # when True, each batch is passed through `to_half`

    def __len__(self):
        "Number of batches, as reported by the wrapped loader."
        return len(self.dl)

    def __iter__(self):
        "Build the batch pipeline (device move, optional half cast, optional progress bar) and iterate over it."
        # The pipeline is stored on `self.gen` at every stage, so the current stream is reachable from the instance.
        self.gen = (to_device(self.device,o) for o in self.dl)
        if self.half: self.gen = (to_half(o) for o in self.gen)
        if self.progress_func is not None:
            self.gen = self.progress_func(self.gen, total=len(self.dl), leave=False)
        return iter(self.gen)

    @classmethod
    def create(cls, *args, device=default_device, progress_func=tqdm, half=False, **kwargs):
        """Build the inner `DataLoader` from `*args` / `**kwargs` and wrap it.

        `device`, `progress_func` and `half` are consumed here and are not forwarded
        to `DataLoader`. `half` defaults to False, which matches the previous
        hard-coded behaviour.
        """
        return cls(DataLoader(*args, **kwargs), device=device, progress_func=progress_func, half=half)

# Rebind the name on the nb_002b module so that code looking up nb_002b.DeviceDataLoader
# gets the class defined in this notebook (presumably so DataBunch uses it; confirm in nb_002b).
nb_002b.DeviceDataLoader = DeviceDataLoader

In [None]:
# Build the train / validation datasets from the image folders, passing only two classes (airplane, dog).
train_ds = FilesDataset.from_folder(PATH/'train', classes=['airplane','dog'])
valid_ds = FilesDataset.from_folder(PATH/'test', classes=['airplane','dog'])
# Wrap both datasets in a DataBunch with the transforms and batch size defined in earlier cells.
data = DataBunch.create(train_ds, valid_ds, bs=bs, train_tfm=train_tfms, valid_tfm=valid_tfms, num_workers=4)
# Cell output: number of batches in the training and validation loaders.
len(data.train_dl), len(data.valid_dl)

In [None]:
# Learner on a model left in its default precision (no model2half, no MixedPrecision callback).
model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)
learn = Learner(data, model)
learn.metrics = [accuracy]
# NOTE(review): `sched` is created here but is not passed to any fit call in the cells shown.
sched = OneCycleScheduler(learn, 0.1, 5)

# FP16

In [None]:
#export
def bn2float(module):
    "Cast every batchnorm layer found in `module` (itself included) back to float and return `module`."
    batchnorm_base = torch.nn.modules.batchnorm._BatchNorm
    # `modules()` walks the module itself and all of its descendants.
    for layer in module.modules():
        if isinstance(layer, batchnorm_base):
            layer.float()
    return module

def model2half(model):
    "Cast `model` to half precision, then restore its batchnorm layers to float via `bn2float`."
    halved = model.half()
    return bn2float(halved)

Helper functions to keep a master copy of the model parameters in FP32, optionally flattened into a single tensor (which apparently helps with performance).

In [None]:
#export
from torch._utils import _unflatten_dense_tensors
from torch.nn.utils import parameters_to_vector

In [None]:
from torch.nn.utils import vector_to_parameters
from torch.nn.utils.convert_parameters import _check_param_device

In [None]:
def vector_to_parameters1(vec, parameters):
    """Copy consecutive slices of the flat tensor `vec` into `parameters`, in order.

    Each parameter receives as many elements of `vec` as it holds, reshaped to the
    parameter's own shape. The copy is in place on `param.data`.

    Args:
        vec: 1-D `torch.Tensor` holding the concatenated values.
        parameters: iterable of tensors/parameters to overwrite.

    Raises:
        TypeError: if `vec` is not a `torch.Tensor`.
    """
    if not isinstance(vec, torch.Tensor):
        raise TypeError('expected torch.Tensor, but got: {}'
                        .format(torch.typename(vec)))
    param_device = None
    pointer = 0
    for param in parameters:
        # _check_param_device validates the parameter's device against the one seen so far.
        param_device = _check_param_device(param, param_device)
        # numel() is a plain int, so the slice bounds and `pointer` stay ints rather than tensors.
        num_param = param.numel()
        param.data.copy_(vec[pointer:pointer + num_param].view_as(param).data)
        pointer += num_param

In [None]:
#export
def get_master(model, flat_master=False):
    """Returns two lists, one for the model parameters and one for their FP32 master copies.

    Only parameters with `requires_grad` set are considered.

    Args:
        model: module whose trainable parameters are collected.
        flat_master: when True, the master copy is a single flat FP32 `Parameter`
            (returned as a one-element list) with a zero-filled `.grad` already attached;
            otherwise one detached FP32 copy is returned per model parameter.

    Returns:
        `(model_params, master_params)`; every master tensor has `requires_grad=True`.
    """
    model_params = [param for param in model.parameters() if param.requires_grad]
    if flat_master:
        flat = parameters_to_vector([param.data.float() for param in model_params])
        # nn.Parameter sets requires_grad=True by default.
        master = torch.nn.Parameter(flat)
        # Start from a zeroed gradient buffer instead of uninitialised memory, so an
        # optimizer step taken before any gradient copy cannot consume garbage values.
        master.grad = torch.zeros_like(master.data)
        return model_params, [master]
    master_params = [param.clone().float().detach() for param in model_params]
    for param in master_params: param.requires_grad = True
    return model_params, master_params

#export
def model_g2master_g(model_params, master_params, flat_master=False):
    """Copies the model gradients to the master parameters for the optimizer step.

    Args:
        model_params: model parameters whose `.grad` is read.
        master_params: FP32 master parameters whose `.grad` is written. With
            `flat_master=True` this is a one-element list holding the flat master,
            which must already have a `.grad` buffer.
        flat_master: selects the flat or the per-parameter layout.

    A model parameter without a gradient contributes zeros in the flat layout, which
    keeps the remaining gradients aligned. In the per-parameter layout the matching
    master gradient is set to None.
    """
    if flat_master:
        grads = [torch.zeros_like(p.data).float() if p.grad is None else p.grad.data.float()
                 for p in model_params]
        master_params[0].grad.data.copy_(parameters_to_vector(grads))
    else:
        for model_p, master_p in zip(model_params, master_params):
            if model_p.grad is None:
                master_p.grad = None
                continue
            # empty_like keeps the master's exact shape, including 0-dim parameters.
            if master_p.grad is None: master_p.grad = torch.empty_like(master_p.data)
            master_p.grad.data.copy_(model_p.grad.data)

#export
def master2model(model_params, master_params, flat_master=False):
    "Overwrite each model parameter's data with the value held by its master copy"
    if flat_master:
        # Split the single flat master back into views shaped like the model parameters.
        sources = _unflatten_dense_tensors(master_params[0].data, model_params)
    else:
        sources = (master.data for master in master_params)
    for target, source in zip(model_params, sources):
        target.data.copy_(source)

In [None]:
#export
@dataclass
class MixedPrecision(Callback):
    """Callback that keeps FP32 master copies of the model parameters for the optimizer step.

    The loss is multiplied by `loss_scale` before the backward pass. The gradients are then
    copied to the master parameters and divided by the same factor, and after each step
    the master values are copied back into the model parameters.
    """
    learn:Learner
    loss_scale:int = 512      # factor applied to the loss and later divided out of the master gradients
    flat_master:bool = False  # forwarded to get_master / model_g2master_g / master2model
    def __post_init__(self): assert torch.backends.cudnn.enabled, "Mixed precision training requires cudnn."

    def on_train_begin(self, **kwargs):
        "Switch the data loaders to half precision and rebuild the optimizer on the FP32 master parameters."
        #Ensures the dataloaders are in half precision.
        self.learn.data.train_dl.half = True
        if hasattr(self.learn.data, 'valid_dl') and self.learn.data.valid_dl is not None:
            self.learn.data.valid_dl.half = True
        #Get a copy of the model params in FP32
        self.model_params, self.master_params = get_master(self.learn.model, self.flat_master)
        #Changes the optimizer so that the optimization step is done in FP32.
        opt = self.learn.opt
        #Hyper-parameters are read before the inner optimizer is replaced and written back afterwards.
        mom,wd,beta = opt.mom,opt.wd,opt.beta
        self.learn.opt.opt = self.learn.opt_fn(self.master_params, self.learn.opt.lr)
        opt.mom,opt.wd,opt.beta = mom,wd,beta

    def on_loss_begin(self, last_output, **kwargs):
        "Return the model output cast to float."
        #It's better to compute the loss in FP32, to avoid reduction overflow.
        return last_output.float()

    def on_backward_begin(self, last_loss, **kwargs):
        "Return the loss multiplied by `loss_scale`."
        #To avoid gradient underflow, we scale the gradients
        return last_loss * self.loss_scale

    def on_backward_end(self, **kwargs):
        "Copy the model gradients to the master parameters and undo the loss scaling."
        #Convert the gradients back to FP32 and divide them by the scale.
        model_g2master_g(self.model_params, self.master_params, self.flat_master)
        for param in self.master_params:
            #model_g2master_g leaves a master grad as None when the model parameter had no gradient.
            if param.grad is not None: param.grad.div_(self.loss_scale)

    def on_step_end(self, **kwargs):
        "Clear the model gradients and refresh the model parameters from the master copies."
        #Zeros the gradients of the model since the optimizer is disconnected.
        self.learn.model.zero_grad()
        #Update the params from master to model.
        master2model(self.model_params, self.master_params, self.flat_master)

In [None]:
# Same architecture as before, this time converted with model2half
# (half precision, with batchnorm layers kept in float).
model = Darknet([1, 2, 2, 2, 2], num_classes=2, nf=16)
model = model2half(model)
learn = Learner(data, model)
learn.metrics = [accuracy]
# Callbacks for the run: mixed precision with a flat master tensor, plus the one-cycle scheduler.
scheds = [MixedPrecision(learn, flat_master=True), OneCycleScheduler(learn, 0.1, 5)]

In [None]:
# Run training with both callbacks attached (presumably 1 epoch at lr 1e-2; confirm against Learner.fit).
learn.fit(1, 1e-2, callbacks=scheds)

In [None]:
# With flat_master=True, get_master returns a one-element list; show the size of that flat master tensor.
scheds[0].master_params[0].size()