In [1]:
import torch
import torchvision.models as models
import torch.nn as nn
import copy
import numpy as np

# Build an untrained MobileNetV2 with a 2-class classification head.
# `weights=None` is the current spelling of the deprecated `pretrained=False` flag.
mobilenet_v2 = models.mobilenet_v2(weights=None, num_classes=2)



We will use the Torch-Pruning library [1] to perform the pruning.

**[1]** Fang, Gongfan, et al. "DepGraph: Towards Any Structural Pruning." Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (2023): 16091–16101.

[Torch-Pruning on GitHub](https://github.com/VainF/Torch-Pruning)


In [2]:
# Install torch-pruning: we will use it to apply structured pruning on our model
!pip install -q torch-pruning

In [3]:
import torch_pruning as tp

In [4]:
def get_blocks(model):
    """
    Split a MobileNetV2-style model into prunable blocks and blocks to leave untouched.

    Args:
        model (torch.nn.Module): Model whose top-level children are inspected.

    Returns:
        tuple:
            - model_blocks (list): Every submodule of 'features' except the first one.
            - ignored_blocks (list): The 'classifier' child (when present) followed by
              the first submodule of 'features'.

    Notes:
        - Only the top-level children named 'features' and 'classifier' are used;
          any other child is skipped.
        - 'features' must be iterable and non-empty, otherwise taking its first
          element raises IndexError.
    """
    feature_blocks = []
    ignored_blocks = []

    for child_name, child in model.named_children():
        if child_name == 'features':
            # Iterating the container yields its submodules in order
            feature_blocks.extend(child)
        elif child_name == 'classifier':
            ignored_blocks.append(child)

    # The first feature block is never pruned; everything after it is a candidate
    stem_block = feature_blocks[0]
    ignored_blocks.append(stem_block)
    model_blocks = list(feature_blocks[1:])

    return model_blocks, ignored_blocks


In [5]:
# Dummy batch used to drive the forward and backward passes:
# one random 3-channel 80x80 input and a single target label of class 0.
# NOTE: `input` shadows the Python builtin; the name is kept because later cells refer to it.
input = torch.randn((1, 3, 80, 80))
output = torch.zeros(1, dtype=torch.long)

In [6]:
def selective_block_pruning(original_model, prune_method, pruning_ratios, input_sample, output_sample, device):
    """
    Prune selected blocks of a copy of the model, re-applying the pruning step to each
    selected block until the parameter count stops decreasing.

    Args:
        original_model (torch.nn.Module): The model to be pruned. It is deep-copied, so it is not modified.
        prune_method (str): The pruning method to use. Only 'channel_pruning_Taylor_importance' is supported.
        pruning_ratios (sequence of float): Pruning ratio for each block returned by `get_blocks`,
            in the same order. Blocks with a ratio of 0 are skipped.
        input_sample (torch.Tensor): A sample input tensor for the forward pass and MAC counting.
        output_sample (torch.Tensor): The target tensor passed to the cross-entropy loss.
        device (torch.device): The device (CPU/GPU) on which to perform pruning.

    Returns:
        tuple:
            - pruned_model (torch.nn.Module): The pruned copy of the model.
            - macs: MAC count reported by `tp.utils.count_ops_and_params` after pruning.
            - nparams: Parameter count reported by `tp.utils.count_ops_and_params` after pruning.
          When every ratio is 0 the counts are those of the unpruned copy.

    Raises:
        ValueError: If `prune_method` is not supported.
        IndexError: If `pruning_ratios` has more entries than there are model blocks.

    Description:
        - Creates a deep copy of the model to avoid modifying the original.
        - Runs one forward and backward pass so that gradients are available to the Taylor importance criterion.
        - For each block with a non-zero ratio, every other block is ignored and the pruner is stepped repeatedly.
        - The first time a step removes no parameters, the ratio is switched to 0.5 and pruning is retried;
          the second time, pruning of that block stops.
    """
    # Fail early with a clear message instead of an unbound-variable error at the end
    if prune_method != 'channel_pruning_Taylor_importance':
        raise ValueError(
            f"Unsupported prune_method {prune_method!r}; "
            "only 'channel_pruning_Taylor_importance' is implemented."
        )

    # Create a copy of the model and move it to the specified device
    model = copy.deepcopy(original_model).to(device)

    # Extract model blocks and ignored blocks
    model_blocks, ignored_blocks = get_blocks(model)

    # Pair each ratio with its block (raises IndexError if there are too many ratios)
    pruning_info = {
        i: {"block": model_blocks[i], "pruning_ratio": ratio}
        for i, ratio in enumerate(pruning_ratios)
    }

    # Initialize Taylor importance for pruning
    imp = tp.importance.TaylorImportance()

    # Move input and output samples to the device
    input_sample, output_sample = input_sample.to(device), output_sample.to(device)
    loss_function = nn.CrossEntropyLoss()

    # One forward and backward pass to populate the gradients
    preds = model(input_sample)
    loss = loss_function(preds, output_sample)
    loss.backward()

    # Baseline counts; also the returned values when no block is actually pruned
    macs, nparams = tp.utils.count_ops_and_params(model, input_sample)
    previous_nparams = nparams

    # Iterate through each block and apply pruning
    for i, info in pruning_info.items():
        pruning_ratio = info["pruning_ratio"]

        if pruning_ratio == 0:
            continue  # Skip pruning for blocks with a ratio of 0

        # Ignore every block except the current one, including blocks that have no
        # ratio entry, so they cannot be pruned unintentionally
        other_blocks = [block for j, block in enumerate(model_blocks) if j != i]
        combined_ignored_layers = ignored_blocks + other_blocks

        stalled_once = False  # True after the first step that removed no parameters

        while True:
            # Apply pruning to the current block
            pruner_group = tp.pruner.MagnitudePruner(
                model,
                example_inputs=input_sample,
                importance=imp,
                pruning_ratio=pruning_ratio,
                ignored_layers=combined_ignored_layers
            )
            pruner_group.step()

            # Recalculate MACs and parameters after pruning
            macs, nparams = tp.utils.count_ops_and_params(model, input_sample)

            # No reduction: retry once with a ratio of 0.5, then give up on this block
            if nparams == previous_nparams:
                if stalled_once:
                    break
                stalled_once = True
                pruning_ratio = 0.5

            # Remember the count so the next step is compared against it
            previous_nparams = nparams

    # Free up memory
    del input_sample, output_sample, preds
    torch.cuda.empty_cache()

    return model, macs, nparams

In [7]:
def perplexity_analysis_with_contributions(original_model, device, input_sample, output_sample, criterion=None,
                                           probe_pruning_ratio=0.8, weight_params=0.5, weight_macs=0.5):
    """
    Scores each model block by how much MACs and parameters are saved when only that block is pruned.

    Args:
        original_model (torch.nn.Module): The neural network model to analyze.
            Side effect: it is moved to `device` in place.
        device (torch.device): The device (CPU/GPU) for computations.
        input_sample (torch.Tensor): A sample input tensor used for MAC counting and pruning.
        output_sample (torch.Tensor): The target tensor forwarded to `selective_block_pruning`.
        criterion (callable, optional): Currently unused; kept for interface compatibility.
        probe_pruning_ratio (float): Pruning ratio applied to the single block under test (default: 0.8).
        weight_params (float): Weight of the parameter term in the score (default: 0.5).
        weight_macs (float): Weight of the MACs term in the score (default: 0.5).

    Returns:
        list: One weighted importance score per block, in block order. Each score is
        `weight_params * (100 - params_reduction%) + weight_macs * (100 - macs_reduction%)`,
        so a block whose pruning saves more receives a lower score.

    Raises:
        ValueError: If `probe_pruning_ratio` is not strictly between 0 and 1.

    Description:
        - Extracts the model blocks and computes the baseline MACs and parameter count.
        - For each block, prunes a copy of the model with a non-zero ratio on that block only.
        - Records the percentage reduction in MACs and parameters relative to the baseline.
        - Combines both reductions into a weighted importance score per block.
    """
    if not 0 < probe_pruning_ratio < 1:
        raise ValueError(f"probe_pruning_ratio must be in (0, 1), got {probe_pruning_ratio!r}")

    # Only the prunable blocks are needed here
    model_blocks, _ = get_blocks(original_model)
    blocks_number = len(model_blocks)

    params_reduction = []
    macs_reduction = []

    # Move the model and the sample to the same device before counting operations
    original_model.to(device)
    input_sample = input_sample.to(device)

    # Compute initial MACs and parameter count
    original_macs, original_nparams = tp.utils.count_ops_and_params(original_model, input_sample)

    # Iterate through each block and analyze pruning impact
    for block_idx in range(blocks_number):
        print(f"Replacing block {block_idx}")

        # Non-zero ratio for the current block only; all other blocks are skipped
        pruning_ratios = np.zeros(blocks_number)
        pruning_ratios[block_idx] = probe_pruning_ratio

        # Apply block-wise pruning on a copy; only the resulting counts are needed
        _, macs, nparams = selective_block_pruning(
            original_model, 'channel_pruning_Taylor_importance',
            pruning_ratios, input_sample, output_sample, device
        )

        # Compute percentage reductions
        params_reduction.append((original_nparams - nparams) / original_nparams * 100)
        macs_reduction.append((original_macs - macs) / original_macs * 100)

        print(f"MACs reduction: {macs_reduction[-1]:.2f}% | Parameters reduction: {params_reduction[-1]:.2f}%")

    weighted_importance_scores = []
    print("\nRelative contribution of each block to total MACs and parameter reductions:")

    # Compute weighted importance scores for each block
    for block_idx in range(blocks_number):
        relative_contribution_params = 100 - params_reduction[block_idx]
        relative_contribution_macs = 100 - macs_reduction[block_idx]

        weighted_importance = (weight_params * relative_contribution_params) + (weight_macs * relative_contribution_macs)

        print(f'Block {block_idx} reduces {macs_reduction[block_idx]:.2f}% of MACs and {params_reduction[block_idx]:.2f}% of parameters.')
        print(f'Weighted importance score for Block {block_idx}: {weighted_importance:.2f}')

        weighted_importance_scores.append(weighted_importance)

    return weighted_importance_scores

In [8]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# One weighted importance score per block of the model
importance_scores = perplexity_analysis_with_contributions(
    mobilenet_v2, device, input, output
)

Replacing block 0
MACs reduction: 6.91% | Parameters reduction: 0.09%
Replacing block 1
MACs reduction: 9.42% | Parameters reduction: 0.23%
Replacing block 2
MACs reduction: 7.98% | Parameters reduction: 0.39%
Replacing block 3
MACs reduction: 4.81% | Parameters reduction: 0.44%
Replacing block 4
MACs reduction: 3.34% | Parameters reduction: 0.66%
Replacing block 5
MACs reduction: 3.34% | Parameters reduction: 0.66%
Replacing block 6
MACs reduction: 2.28% | Parameters reduction: 0.94%
Replacing block 7
MACs reduction: 3.03% | Parameters reduction: 2.43%
Replacing block 8
MACs reduction: 3.03% | Parameters reduction: 2.43%
Replacing block 9
MACs reduction: 3.03% | Parameters reduction: 2.43%
Replacing block 10
MACs reduction: 3.71% | Parameters reduction: 2.98%
Replacing block 11
MACs reduction: 6.59% | Parameters reduction: 5.29%
Replacing block 12
MACs reduction: 6.59% | Parameters reduction: 5.29%
Replacing block 13
MACs reduction: 5.12% | Parameters reduction: 6.95%
Replacing block 

In [9]:
# Weighted importance score of each block, in block order
importance_scores

[96.50298358388602,
 95.17637452857088,
 95.81562958266878,
 97.3726661323213,
 97.99966389283736,
 97.99966389283736,
 98.39090232932065,
 97.27176913797129,
 97.27176913797129,
 97.27176913797129,
 96.65740645079322,
 94.05939990621073,
 94.05939990621073,
 93.96611235251615,
 89.62809233485658,
 89.62809233485658,
 71.38556084451452,
 86.57574588715747]

In [10]:
def calculate_pruning_ratios(importance_scores, max_pruning_ratio=0.9, k=5):
    """
    Turn per-block importance scores into per-block pruning ratios using exponential decay.

    Each score is divided by the sum of all scores, passed through exp(-k * share), and the
    result is rescaled so that the block with the smallest score receives `max_pruning_ratio`.

    Parameters:
    - importance_scores (list): Importance score of each block.
    - max_pruning_ratio (float): Ratio assigned to the least important block. Default is 0.9.
    - k (int): Decay strength; a larger k widens the gap between blocks.

    Returns:
    - pruning_ratios (list): Pruning ratio of each block, rounded to 2 decimals.
    """
    score_sum = sum(importance_scores)

    # A larger share of the total importance gives a smaller decay factor
    decay_factors = []
    for score in importance_scores:
        share = score / score_sum
        decay_factors.append(np.exp(-k * share))

    # The largest factor belongs to the least important block
    largest_factor = max(decay_factors)

    pruning_ratios = []
    for factor in decay_factors:
        scaled = max_pruning_ratio * (factor / largest_factor)
        pruning_ratios.append(round(scaled, 2))

    return pruning_ratios

In [11]:
max_pruning_ratio = 0.99  # Ratio given to the least important block (99%)
k = 2  # Strength of the exponential scaling

pruning_ratios = calculate_pruning_ratios(
    importance_scores,
    max_pruning_ratio=max_pruning_ratio,
    k=k,
)

# Report the ratio assigned to each block
for i in range(len(pruning_ratios)):
    ratio = pruning_ratios[i]
    print(f"Block {i} Pruning Ratio: {ratio:.4f}")

Block 0 Pruning Ratio: 0.9600
Block 1 Pruning Ratio: 0.9600
Block 2 Pruning Ratio: 0.9600
Block 3 Pruning Ratio: 0.9600
Block 4 Pruning Ratio: 0.9600
Block 5 Pruning Ratio: 0.9600
Block 6 Pruning Ratio: 0.9600
Block 7 Pruning Ratio: 0.9600
Block 8 Pruning Ratio: 0.9600
Block 9 Pruning Ratio: 0.9600
Block 10 Pruning Ratio: 0.9600
Block 11 Pruning Ratio: 0.9600
Block 12 Pruning Ratio: 0.9600
Block 13 Pruning Ratio: 0.9600
Block 14 Pruning Ratio: 0.9700
Block 15 Pruning Ratio: 0.9700
Block 16 Pruning Ratio: 0.9900
Block 17 Pruning Ratio: 0.9700


In [12]:
def prune_model(original_model, prune_method, pruning_ratios, input_sample, output_sample, device):
    """
    Prunes a copy of the model block by block, applying one pruning step per block.

    Args:
        original_model (torch.nn.Module): The model to be pruned. It is deep-copied, so it is not modified.
        prune_method (str): The pruning method to use. Only 'channel_pruning_Taylor_importance' is supported.
        pruning_ratios (sequence of float): Pruning ratio for each block returned by `get_blocks`,
            in the same order.
        input_sample (torch.Tensor): A sample input tensor for the forward pass and MAC counting.
        output_sample (torch.Tensor): The target tensor passed to the cross-entropy loss.
        device (torch.device): The device (CPU/GPU) on which pruning is performed.

    Returns:
        tuple:
            - pruned_model (torch.nn.Module): The pruned copy of the model.
            - macs: MAC count reported by `tp.utils.count_ops_and_params` after pruning.
            - nparams: Parameter count reported by `tp.utils.count_ops_and_params` after pruning.

    Raises:
        ValueError: If `prune_method` is not supported.
        IndexError: If `pruning_ratios` has more entries than there are model blocks.

    Description:
        - Creates a deep copy of the model to preserve the original.
        - Runs one forward and backward pass so that gradients are available to the Taylor importance criterion.
        - For each block, ignores every other block and applies a single pruning step with that block's ratio.
        - Prints the final MACs, parameter count and absolute parameter reduction.
    """
    # Fail early with a clear message instead of an unbound-variable error at the end
    if prune_method != 'channel_pruning_Taylor_importance':
        raise ValueError(
            f"Unsupported prune_method {prune_method!r}; "
            "only 'channel_pruning_Taylor_importance' is implemented."
        )

    # Create a copy of the model and move it to the specified device
    model = copy.deepcopy(original_model).to(device)

    # Extract model blocks and ignored blocks
    model_blocks, ignored_blocks = get_blocks(model)

    # Pair each ratio with its block (raises IndexError if there are too many ratios)
    pruning_info = {
        i: {"block": model_blocks[i], "pruning_ratio": ratio}
        for i, ratio in enumerate(pruning_ratios)
    }

    # Initialize Taylor importance for pruning
    imp = tp.importance.TaylorImportance()

    # Move input and output samples to the device
    input_sample, output_sample = input_sample.to(device), output_sample.to(device)
    loss_function = nn.CrossEntropyLoss()

    # One forward and backward pass to populate the gradients
    preds = model(input_sample)
    loss = loss_function(preds, output_sample)
    loss.backward()

    # Baseline parameter count, used for the reduction report below
    _, original_nparams = tp.utils.count_ops_and_params(model, input_sample)

    # Iterate through each block and apply pruning
    for i, info in pruning_info.items():
        pruning_ratio = info["pruning_ratio"]

        # Ignore every block except the current one, including blocks that have no
        # ratio entry, so they cannot be pruned unintentionally
        other_blocks = [block for j, block in enumerate(model_blocks) if j != i]
        combined_ignored_layers = ignored_blocks + other_blocks

        print(f"Pruning block {i} with initial ratio: {pruning_ratio}")

        # Apply pruning to the current block
        pruner_group = tp.pruner.MagnitudePruner(
            model,
            example_inputs=input_sample,
            importance=imp,
            pruning_ratio=pruning_ratio,
            ignored_layers=combined_ignored_layers
        )
        pruner_group.step()

    # Recalculate MACs and parameters after pruning
    macs, nparams = tp.utils.count_ops_and_params(model, input_sample)

    print(f"MACs: {macs}, #Params: {nparams}")
    print(f"Parameter reduction: {original_nparams - nparams}")

    # Free up memory
    del input_sample, output_sample, preds
    torch.cuda.empty_cache()

    return model, macs, nparams

In [13]:
# Prune a copy of the model using the per-block ratios computed earlier
pruned_model, macs, nparams = prune_model(
    original_model=mobilenet_v2,
    prune_method='channel_pruning_Taylor_importance',
    pruning_ratios=pruning_ratios,
    input_sample=input,
    output_sample=output,
    device=device,
)

Pruning block 0 with initial ratio: 0.96
Pruning block 1 with initial ratio: 0.96
Pruning block 2 with initial ratio: 0.96
Pruning block 3 with initial ratio: 0.96
Pruning block 4 with initial ratio: 0.96
Pruning block 5 with initial ratio: 0.96
Pruning block 6 with initial ratio: 0.96
Pruning block 7 with initial ratio: 0.96
Pruning block 8 with initial ratio: 0.96
Pruning block 9 with initial ratio: 0.96
Pruning block 10 with initial ratio: 0.96
Pruning block 11 with initial ratio: 0.96
Pruning block 12 with initial ratio: 0.96
Pruning block 13 with initial ratio: 0.96
Pruning block 14 with initial ratio: 0.97
Pruning block 15 with initial ratio: 0.97
Pruning block 16 with initial ratio: 0.99
Pruning block 17 with initial ratio: 0.97
MACs: 4341561.0, #Params: 51873
Parameter reduction: 2174561


In [14]:
# Display the architecture of the pruned model
pruned_model

MobileNetV2(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU6(inplace=True)
    )
    (1): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
          (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU6(inplace=True)
        )
        (1): Conv2d(32, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (2): InvertedResidual(
      (conv): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 3, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(3, eps=1e