# Custom Keys Optimizer

Here we build the `custom_keys` dictionary for the runner config.
The optimizer needs it to apply layer-wise learning-rate decay: each parameter name is mapped to its own `lr_mult` and `decay_mult`.

In [7]:
# Base of the geometric per-layer scale list built further down in this notebook.
LAYER_DECAY = 0.75
# Sizes the scale list (MODEL_DEPTH + 2 entries); presumably the number of
# transformer blocks in the backbone -- TODO confirm against the model config.
MODEL_DEPTH = 12
# Weight decay handed to get_parameter_groups for the groups that are decayed.
BASE_WEIGHT_DECAY = 0.05

In [8]:
from mmengine.runner import Runner
from mmengine.config import Config

# Load the model config and build a runner from it; only the runner's
# instantiated model is kept for the cells below.
CONFIG_PATH = "configs/models/vit-s-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_base.py"

runner_cfg = Config.fromfile(CONFIG_PATH)
runner = Runner.from_cfg(runner_cfg)
model = runner.model

12/04 22:32:34 - mmengine - INFO - 
------------------------------------------------------------
System environment:
    sys.platform: darwin
    Python: 3.10.13 | packaged by conda-forge | (main, Oct 26 2023, 18:09:17) [Clang 16.0.6 ]
    CUDA available: False
    numpy_random_seed: 104644062
    GCC: Apple clang version 15.0.0 (clang-1500.0.40.1)
    PyTorch: 2.1.1
    PyTorch compiling details: PyTorch built with:
  - GCC 4.2
  - C++ Version: 201703
  - clang 13.1.6
  - LAPACK is enabled (usually provided by MKL)
  - NNPACK is enabled
  - CPU capability usage: NO AVX
  - Build settings: BLAS_INFO=accelerate, BUILD_TYPE=Release, CXX_COMPILER=/Applications/Xcode_13.3.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang++, CXX_FLAGS= -fvisibility-inlines-hidden -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DUSE_PYTORCH_QNNPACK -DUSE_XNNPACK -DUSE_PYTORCH_METAL_EXPORT -DSYMBOLICATE_MOBILE_DEBUG_HANDLE -DUSE_COREML

In [9]:
# Functions from the VideoMAE repo


def get_num_layer_for_vit(var_name, num_max_layer):
    """Return the layer-decay bucket index for a parameter name.

    Args:
        var_name: full parameter name, e.g. ``"backbone.blocks.3.norm1.weight"``.
        num_max_layer: total number of buckets; the last one has index
            ``num_max_layer - 1``.

    Returns:
        int: ``0`` for the token / positional / patch-embedding parameters,
        ``block index + 1`` for parameters under ``backbone.blocks``, and
        ``num_max_layer - 1`` for everything else.
    """
    embedding_names = (
        "backbone.cls_token",
        "backbone.mask_token",
        "backbone.pos_embed",
    )
    if var_name in embedding_names or var_name.startswith("backbone.patch_embed"):
        return 0

    if var_name.startswith("backbone.blocks"):
        # Names look like "backbone.blocks.<idx>....": the third dotted field
        # is the block index.
        block_index = int(var_name.split(".")[2])
        return block_index + 1

    # Everything else, "backbone.rel_pos_bias*" included, lands in the last bucket.
    return num_max_layer - 1


class LayerDecayValueAssigner:
    """Look up per-layer scale values from a precomputed sequence.

    ``values`` is indexed by layer id; its length is also used as the number
    of layer buckets when resolving a parameter name to a layer id.
    """

    def __init__(self, values):
        self.values = values

    def get_layer_id(self, var_name):
        """Return the layer id that ``get_num_layer_for_vit`` assigns to ``var_name``."""
        bucket_count = len(self.values)
        return get_num_layer_for_vit(var_name, bucket_count)

    def get_scale(self, layer_id):
        """Return the scale stored at position ``layer_id``."""
        return self.values[layer_id]


def get_parameter_groups(
    model, weight_decay=1e-5, skip_list=(), get_num_layer=None, get_layer_scale=None
):
    """Group the trainable parameter names of ``model`` by decay rule and layer.

    Args:
        model: object whose ``named_parameters()`` yields ``(name, param)``
            pairs; each ``param`` must expose ``requires_grad`` and ``shape``.
        weight_decay: value recorded as ``"weight_decay"`` for decayed groups.
        skip_list: parameter names that are forced into a no-decay group.
        get_num_layer: optional callable ``name -> layer id``. When given, the
            groups are split per layer and named
            ``layer_<id>_decay`` / ``layer_<id>_no_decay``.
        get_layer_scale: optional callable ``layer id -> scale``. It is called
            once per group, when the group is first created (with ``None`` if
            ``get_num_layer`` is not given). When omitted the scale is ``1.0``.

    Returns:
        dict: maps each group name to a dict with the keys ``"weight_decay"``,
        ``"params"`` (list of parameter *names* in encounter order) and
        ``"lr_scale"``. Parameters with ``requires_grad`` false are left out.
    """
    parameter_group_names = {}

    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights

        # 1-D parameters, names ending in ".bias" / ".scale" and anything
        # listed in skip_list are exempt from weight decay.
        if (
            len(param.shape) == 1
            or name.endswith((".bias", ".scale"))
            or name in skip_list
        ):
            group_name = "no_decay"
            this_weight_decay = 0.0
        else:
            group_name = "decay"
            this_weight_decay = weight_decay

        if get_num_layer is not None:
            layer_id = get_num_layer(name)
            group_name = "layer_%d_%s" % (layer_id, group_name)
        else:
            layer_id = None

        if group_name not in parameter_group_names:
            # The scale is fixed by the first parameter that opens the group.
            scale = get_layer_scale(layer_id) if get_layer_scale is not None else 1.0
            parameter_group_names[group_name] = {
                "weight_decay": this_weight_decay,
                "params": [],
                "lr_scale": scale,
            }

        parameter_group_names[group_name]["params"].append(name)

    return parameter_group_names

In [10]:
# Get the parameter groups from VideoMAE

# Entry i of the scale list is LAYER_DECAY ** (MODEL_DEPTH + 1 - i), so the last
# of the MODEL_DEPTH + 2 entries is 1.0 and earlier entries shrink geometrically.
layer_scales = [
    LAYER_DECAY ** (MODEL_DEPTH + 1 - i) for i in range(MODEL_DEPTH + 2)
]
assigner = LayerDecayValueAssigner(layer_scales)

groups = get_parameter_groups(
    model,
    weight_decay=BASE_WEIGHT_DECAY,
    get_num_layer=assigner.get_layer_id,
    get_layer_scale=assigner.get_scale,
)

In [11]:
# Convert the parameter groups to the format used by mmaction

custom_keys = {}
# Iterate over .values(): the group names are not needed here, and binding the
# throwaway name `_` at notebook top level would shadow IPython's last-output
# variable for the rest of the session.
for group in groups.values():
    # decay_mult is a 0/1 flag: 0 for groups recorded with zero weight decay.
    decay_mult = 0 if group["weight_decay"] == 0 else 1
    params = group["params"]
    lr_mult = group["lr_scale"]
    # Every parameter name in the group gets its own entry with the group's multipliers.
    for param in params:
        custom_keys[param] = {"lr_mult": lr_mult, "decay_mult": decay_mult}

custom_keys

{'backbone.patch_embed.projection.weight': {'lr_mult': 0.023757264018058777,
  'decay_mult': 1},
 'backbone.patch_embed.projection.bias': {'lr_mult': 0.023757264018058777,
  'decay_mult': 0},
 'backbone.blocks.0.norm1.weight': {'lr_mult': 0.03167635202407837,
  'decay_mult': 0},
 'backbone.blocks.0.norm1.bias': {'lr_mult': 0.03167635202407837,
  'decay_mult': 0},
 'backbone.blocks.0.attn.q_bias': {'lr_mult': 0.03167635202407837,
  'decay_mult': 0},
 'backbone.blocks.0.attn.v_bias': {'lr_mult': 0.03167635202407837,
  'decay_mult': 0},
 'backbone.blocks.0.attn.proj.bias': {'lr_mult': 0.03167635202407837,
  'decay_mult': 0},
 'backbone.blocks.0.norm2.weight': {'lr_mult': 0.03167635202407837,
  'decay_mult': 0},
 'backbone.blocks.0.norm2.bias': {'lr_mult': 0.03167635202407837,
  'decay_mult': 0},
 'backbone.blocks.0.mlp.layers.0.0.bias': {'lr_mult': 0.03167635202407837,
  'decay_mult': 0},
 'backbone.blocks.0.mlp.layers.1.bias': {'lr_mult': 0.03167635202407837,
  'decay_mult': 0},
 'backbo