#  Layer Pruning



There are broadly two ways to prune:
1. Remove layers. This is the more robust approach.
2. Reduce the width of the hidden dimension. This can be more effective, but only if you go beyond naive pruning, where weights are removed without regard to their importance. This script only implements naive pruning, so use the width-pruned models with caution.

In [None]:
# Notebook shell commands: upgrade pip, then install/upgrade transformers (quiet output).
!python -m pip install --upgrade pip -q
!pip install transformers -qU

In [None]:
from transformers import AutoModelForCausalLM
import torch

# Checkpoint to inspect. Instruct-tuned alternatives:
#   "HuggingFaceTB/SmolLM-360M-Instruct"
#   "HuggingFaceTB/SmolLM-135M-Instruct"
model_name = "HuggingFaceTB/SmolLM-135M"  # base model

model = AutoModelForCausalLM.from_pretrained(model_name)

# Show the unmodified architecture for reference.
print("Original model:", model, sep="\n")

## Pruning

### Layer Pruning Only

In [None]:
# Layer Pruning

from transformers import AutoModelForCausalLM
import torch
import math

def count_parameters(model):
    """Return the number of elements in the model's trainable parameters.

    Only parameters with ``requires_grad`` set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def create_new_lm(model_name, target_params):
    """Load ``model_name`` and drop decoder layers to approach ``target_params``.

    The number of layers kept is the layer count scaled by
    ``target_params / original_params`` and rounded. The ratio is taken over
    every trainable parameter of the model, so the resulting size is only an
    approximation of ``target_params``.

    The leading layers and the final layer are kept; the layers removed are
    the ones directly before the final layer.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        target_params: Desired parameter count of the pruned model.

    Returns:
        The loaded model with ``model.model.layers`` and
        ``config.num_hidden_layers`` updated in place.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)

    print("Original model:")
    print(model)

    original_params = count_parameters(model)
    print(f"Original model parameters: {original_params:,}")

    total_layers = len(model.model.layers)

    # Calculate number of layers to keep.
    layers_to_keep = round((target_params / original_params) * total_layers)
    # Clamp to [1, total_layers]: an unclamped 0 would turn the slice below
    # into ``[:-1]`` (nothing pruned), and a value above ``total_layers``
    # would append the last layer a second time.
    layers_to_keep = max(1, min(total_layers, layers_to_keep))

    # Keep the first ``layers_to_keep - 1`` layers plus the final layer.
    selected_layers = (
        list(model.model.layers[:layers_to_keep - 1]) +
        [model.model.layers[-1]]
    )

    model.model.layers = torch.nn.ModuleList(selected_layers)

    # Keep the config in sync so the saved checkpoint reloads with the same depth.
    model.config.num_hidden_layers = len(selected_layers)

    print("\nModified model (AshokLM):")
    print(model)

    new_params = count_parameters(model)
    print(f"Modified model parameters: {new_params:,}")

    reduction_percentage = (1 - new_params / original_params) * 100
    print(f"Size reduction: {reduction_percentage:.2f}%")

    return model

# Alternative configuration:
# model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
# target_params = 200_000_000

model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
target_params = 90_000_000

new_lm = create_new_lm(model_name, target_params)

# Name the output directory after the source checkpoint and the target size
# (in millions). Using the last path component also works for names without
# an organisation prefix.
modified_model_path = f"{model_name.split('/')[-1]}-layer-pruned-{target_params // 1_000_000}M-raw"
new_lm.save_pretrained(modified_model_path)
# Report what was actually saved rather than a fixed model name.
print(f"\nPruned model saved to: {modified_model_path}")

### Prune every second layer but not first or last

In [None]:
from transformers import AutoModelForCausalLM
import torch

def count_parameters(model):
    """Count the elements of every parameter that has ``requires_grad`` set."""
    trainable = [param for param in model.parameters() if param.requires_grad]
    return sum(param.numel() for param in trainable)

def create_pruned_lm(model_name, target_params):
    """Load ``model_name`` and keep only every second decoder layer.

    Layers at even indices are kept, and the final layer is always kept.
    ``target_params`` is accepted but not used by this function; the amount
    of pruning is fixed by the every-second-layer rule.

    Returns:
        The loaded model with ``model.model.layers`` and
        ``config.num_hidden_layers`` updated in place.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)

    print("Original model:")
    print(model)

    original_params = count_parameters(model)
    print(f"Original model parameters: {original_params:,}")

    decoder_layers = model.model.layers
    last_index = len(decoder_layers) - 1

    # Even indices (which include the first layer) plus the last layer survive.
    kept_layers = []
    for index, layer in enumerate(decoder_layers):
        if index % 2 == 0 or index == last_index:
            kept_layers.append(layer)

    # Install the reduced stack and keep the config depth in sync.
    model.model.layers = torch.nn.ModuleList(kept_layers)
    model.config.num_hidden_layers = len(kept_layers)

    print("\nModified model (Pruned):")
    print(model)

    new_params = count_parameters(model)
    print(f"Modified model parameters: {new_params:,}")

    removed_fraction = 1 - new_params / original_params
    print(f"Size reduction: {removed_fraction * 100:.2f}%")

    return model

# Checkpoint to prune. target_params is passed through for a uniform call
# signature, but create_pruned_lm does not read it.
model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
target_params = 200_000_000

pruned_lm = create_pruned_lm(model_name, target_params)

# Fixed output directory; update it by hand if model_name changes.
modified_model_path = "SmolLM-360M-Instruct-layer-pruned-every2nd-200M"
pruned_lm.save_pretrained(modified_model_path)
print(f"\nPruned model saved to: {modified_model_path}")

### Layer + Width Pruning (naive)

In [None]:
from transformers import AutoModelForCausalLM
import torch
import math

def count_parameters(model):
    """Return how many scalar values the model's trainable parameters hold."""
    count = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue  # frozen parameters are not counted
        count += param.numel()
    return count

def prune_layers(model, target_params, original_params):
    """Drop decoder layers so the model approaches ``target_params``.

    The number of layers kept is the layer count scaled by
    ``target_params / original_params`` and rounded, clamped to the range
    ``[1, total_layers]``. The leading layers and the final layer are kept;
    the layers removed are the ones directly before the final layer.

    Args:
        model: Object exposing ``model.model.layers`` (a sequence of modules)
            and ``model.config.num_hidden_layers``.
        target_params: Desired parameter count after pruning.
        original_params: Parameter count of ``model`` before pruning.

    Returns:
        The same ``model`` object, modified in place.
    """
    total_layers = len(model.model.layers)

    # Calculate number of layers to keep.
    layers_to_keep = round((target_params / original_params) * total_layers)
    # Clamp: an unclamped 0 would turn the slice below into ``[:-1]`` (nothing
    # pruned), and a value above ``total_layers`` would append the last layer
    # a second time.
    layers_to_keep = max(1, min(total_layers, layers_to_keep))

    # Keep the first ``layers_to_keep - 1`` layers plus the final layer.
    selected_layers = (
        list(model.model.layers[:layers_to_keep - 1]) +
        [model.model.layers[-1]]
    )

    # Assign pruned layers back to the model.
    model.model.layers = torch.nn.ModuleList(selected_layers)

    # Update the config so it matches the new depth.
    model.config.num_hidden_layers = len(selected_layers)

    return model

def _head_channel_index(num_heads, old_head_dim, new_head_dim):
    """Return row indices that keep ``new_head_dim`` channels of every head.

    Half of the kept channels come from the start of each half of a head, so
    channel ``i`` stays paired with channel ``i + head_dim // 2``. That is the
    pairing used by rotate-half style rotary embeddings (as in the Hugging
    Face Llama implementation).
    """
    half = new_head_dim // 2
    within_head = torch.cat([torch.arange(half), torch.arange(half) + old_head_dim // 2])
    head_offsets = torch.arange(num_heads).unsqueeze(1) * old_head_dim
    return (head_offsets + within_head.unsqueeze(0)).reshape(-1)


def _shrink_weight(module, rows=None, cols=None):
    """Keep only the given rows/columns of ``module.weight`` (and bias rows)."""
    old_weight = module.weight
    weight = old_weight.detach()
    if rows is not None:
        weight = weight.index_select(0, rows.to(weight.device))
    if cols is not None:
        weight = weight.index_select(1, cols.to(weight.device))
    module.weight = torch.nn.Parameter(
        weight.contiguous(), requires_grad=old_weight.requires_grad
    )

    bias = getattr(module, "bias", None)
    if bias is not None and rows is not None:
        module.bias = torch.nn.Parameter(
            bias.detach().index_select(0, rows.to(bias.device)).contiguous(),
            requires_grad=bias.requires_grad,
        )

    # Keep the module's own size bookkeeping consistent with the new weight.
    if hasattr(module, "out_features"):
        module.out_features, module.in_features = module.weight.shape
    if hasattr(module, "embedding_dim"):
        module.embedding_dim = module.weight.shape[1]


def _shrink_norm(norm, size):
    """Truncate a normalisation layer's per-channel parameters to ``size``."""
    if norm is None:
        return
    for name in ("weight", "bias"):
        param = getattr(norm, name, None)
        if param is not None:
            setattr(
                norm,
                name,
                torch.nn.Parameter(
                    param.detach()[:size].clone(), requires_grad=param.requires_grad
                ),
            )
    if hasattr(norm, "normalized_shape"):
        norm.normalized_shape = (size,)


def prune_hidden_dimensions(model, target_params, current_params):
    """Naively shrink the hidden width of a Llama-style model in place.

    The new hidden size is ``hidden_size * sqrt(target_params / current_params)``
    rounded down to a multiple of ``2 * num_attention_heads`` (so every head
    keeps an even number of channels). The intermediate size is scaled by the
    same factor. No importance criterion is used: the leading channels of the
    residual stream and of the MLP are kept, and within each attention head
    the leading channels of each half are kept.

    Compared with slicing only the per-layer projections, this also shrinks
    the token embedding, the output head (preserving weight tying), and the
    normalisation layers, so the tensors stay dimensionally consistent. The
    square-root estimate is approximate because the embedding size scales
    linearly, not quadratically, with the hidden size.

    Expects ``model.model.layers[i].self_attn.{q,k,v,o}_proj`` and
    ``.mlp.{gate,up,down}_proj`` with ``(out, in)`` weights, plus
    ``model.config`` carrying ``hidden_size``, ``intermediate_size``,
    ``num_attention_heads`` and ``num_key_value_heads``.

    Args:
        model: The model to prune.
        target_params: Desired parameter count after pruning.
        current_params: Parameter count of ``model`` before this step.

    Returns:
        The same ``model`` object, modified in place. It is returned
        unchanged when ``target_params >= current_params``.

    Raises:
        ValueError: If the computed hidden size would be zero.
    """
    config = model.config
    old_hidden = config.hidden_size
    old_intermediate = config.intermediate_size
    num_heads = config.num_attention_heads
    num_kv_heads = config.num_key_value_heads
    old_head_dim = getattr(config, "head_dim", None) or old_hidden // num_heads

    # Estimate new hidden size to target parameters.
    reduction_ratio = math.sqrt(target_params / current_params)
    if reduction_ratio >= 1:
        # Nothing to remove; avoid writing a larger size into the config.
        return model

    step = 2 * num_heads  # Ensure divisibility by heads with an even head_dim.
    new_hidden = (int(old_hidden * reduction_ratio) // step) * step
    if new_hidden <= 0:
        raise ValueError(
            f"target_params={target_params} is too small: hidden size would be 0"
        )

    # Integer arithmetic keeps the original ratio without float truncation.
    new_intermediate = new_hidden * old_intermediate // old_hidden
    new_head_dim = min(new_hidden // num_heads, old_head_dim)

    hidden_idx = torch.arange(new_hidden)
    inter_idx = torch.arange(new_intermediate)
    q_rows = _head_channel_index(num_heads, old_head_dim, new_head_dim)
    kv_rows = _head_channel_index(num_kv_heads, old_head_dim, new_head_dim)

    for layer in model.model.layers:
        attn = layer.self_attn
        # Projection weights are (out_features, in_features): rows select
        # per-head output channels, columns select residual-stream inputs.
        _shrink_weight(attn.q_proj, rows=q_rows, cols=hidden_idx)
        _shrink_weight(attn.k_proj, rows=kv_rows, cols=hidden_idx)
        _shrink_weight(attn.v_proj, rows=kv_rows, cols=hidden_idx)
        # o_proj consumes the concatenated heads, so its columns follow q_rows.
        _shrink_weight(attn.o_proj, rows=hidden_idx, cols=q_rows)
        if hasattr(attn, "head_dim"):
            attn.head_dim = new_head_dim
        if hasattr(attn, "hidden_size"):
            attn.hidden_size = new_hidden

        mlp = layer.mlp
        _shrink_weight(mlp.gate_proj, rows=inter_idx, cols=hidden_idx)
        _shrink_weight(mlp.up_proj, rows=inter_idx, cols=hidden_idx)
        _shrink_weight(mlp.down_proj, rows=hidden_idx, cols=inter_idx)
        if hasattr(mlp, "hidden_size"):
            mlp.hidden_size = new_hidden
        if hasattr(mlp, "intermediate_size"):
            mlp.intermediate_size = new_intermediate

        for norm_name in ("input_layernorm", "post_attention_layernorm"):
            _shrink_norm(getattr(layer, norm_name, None), new_hidden)

    # Shrink everything else that lives on the residual stream.
    embed = getattr(model.model, "embed_tokens", None)
    lm_head = getattr(model, "lm_head", None)
    tied = (
        embed is not None
        and lm_head is not None
        and lm_head.weight is embed.weight
    )
    if embed is not None:
        _shrink_weight(embed, cols=hidden_idx)
    if lm_head is not None:
        if tied:
            # Re-share the parameter so tying (and the parameter count) is kept.
            lm_head.weight = embed.weight
            if hasattr(lm_head, "in_features"):
                lm_head.in_features = new_hidden
        else:
            _shrink_weight(lm_head, cols=hidden_idx)
    _shrink_norm(getattr(model.model, "norm", None), new_hidden)

    # Update the config so a saved checkpoint reloads with matching shapes.
    config.hidden_size = new_hidden
    config.intermediate_size = new_intermediate
    if getattr(config, "head_dim", None) is not None:
        config.head_dim = new_head_dim

    # Adjust rotary positional embeddings: inv_freq holds one frequency per
    # channel pair, i.e. head_dim // 2 entries.
    # NOTE(review): inv_freq is presumably rebuilt from the config when the
    # checkpoint is reloaded, which may give different frequencies -- confirm.
    rotary = getattr(model.model, "rotary_emb", None)
    if rotary is not None:
        rotary.inv_freq = rotary.inv_freq[:new_head_dim // 2].contiguous()

    return model

def create_new_lm(model_name, target_params_1, target_params_2):
    """Prune a checkpoint in two stages: decoder layers first, then width.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        target_params_1: Parameter target handed to ``prune_layers``.
        target_params_2: Parameter target handed to ``prune_hidden_dimensions``.

    Returns:
        The pruned model. Parameter counts are printed after every stage.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)

    print("Original model:")
    print(model)

    # Baseline size, used for the layer ratio and the final reduction figure.
    original_params = count_parameters(model)
    print(f"Original model parameters: {original_params:,}")

    # Stage 1: remove whole decoder layers.
    model = prune_layers(model, target_params_1, original_params)
    params_after_layers = count_parameters(model)
    print(f"\nModel parameters after layer pruning: {params_after_layers:,}")

    # Stage 2: narrow the hidden dimensions, starting from the stage-1 size.
    model = prune_hidden_dimensions(model, target_params_2, params_after_layers)
    final_params = count_parameters(model)
    print(f"\nModel parameters after hidden dimension pruning: {final_params:,}")

    removed_fraction = 1 - final_params / original_params
    print(f"\nTotal size reduction: {removed_fraction * 100:.2f}%")

    print(model)

    return model

# Alternative configuration:
# model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
# target_params_1 = 250_000_000  # target after layer pruning
# target_params_2 = 200_000_000  # target after hidden dimension pruning

model_name = "HuggingFaceTB/SmolLM-135M-Instruct"
target_params_1 = 110_000_000  # target after layer pruning
target_params_2 = 90_000_000  # target after hidden dimension pruning

# Create the pruned model
new_lm = create_new_lm(model_name, target_params_1, target_params_2)

# Name the output directory after the source checkpoint and the final target
# size (in millions). Using the last path component also works for names
# without an organisation prefix.
modified_model_path = f"{model_name.split('/')[-1]}-layer-width-pruned-{target_params_2 // 1_000_000}M-raw"
new_lm.save_pretrained(modified_model_path)
# Report what was actually saved rather than a fixed model name.
print(f"\nPruned model saved to: {modified_model_path}")

## Push to Hub

In [None]:
from huggingface_hub import login
login()
# Enter a token with write access; it is needed for the push to the Hub below.

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

def push_model_to_hub(local_model_path, original_model_name, repo_name,
                      ignore_mismatched_sizes=True):
    """Reload a locally saved pruned model in bfloat16 and push it to the Hub.

    The tokenizer is taken from the original checkpoint and pushed to the
    same repository.

    Args:
        local_model_path: Directory written by ``save_pretrained``.
        original_model_name: Checkpoint whose tokenizer is reused.
        repo_name: Target Hub repository, e.g. ``"user/model-name"``.
        ignore_mismatched_sizes: Forwarded to ``from_pretrained``. With
            ``True`` (the default, kept for backward compatibility), tensors
            whose saved shape does not match the config are not loaded from
            the checkpoint, so the pushed model can contain freshly
            initialised weights. Pass ``False`` to fail loudly instead.
    """
    # Imported here because this cell does not import torch itself; relying on
    # an earlier cell raises NameError after a kernel restart.
    import torch

    # Load the tokenizer from the original model
    tokenizer = AutoTokenizer.from_pretrained(original_model_name)

    # Load the modified model
    model = AutoModelForCausalLM.from_pretrained(
        local_model_path,
        torch_dtype=torch.bfloat16,
        ignore_mismatched_sizes=ignore_mismatched_sizes,
    )

    # Push the model to the hub
    model.push_to_hub(repo_name)

    # Push the tokenizer to the hub
    tokenizer.push_to_hub(repo_name)

    print(f"Model and tokenizer pushed successfully to {repo_name}")

# Local directory to upload. Defaults to modified_model_path, i.e. the output
# of whichever pruning cell was run last.
local_model_path = modified_model_path  # push the last model pruned above
# local_model_path = "SmolLM-360M-Instruct-layer-pruned-200M-raw"

# Target repository: replace "Ashok" with your own Hub user or organisation.
repository_name = f"Ashok/{local_model_path}"

# Checkpoint whose tokenizer is pushed alongside the model; keep it in sync
# with the model that was pruned.
# original_model_name = "HuggingFaceTB/SmolLM-360M-Instruct"
original_model_name = "HuggingFaceTB/SmolLM-135M-Instruct"

# Push the model and tokenizer
push_model_to_hub(local_model_path, original_model_name, repository_name)