In [15]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import OrderedDict
from tqdm.auto import tqdm
import gc


# 1) Load model & freeze weights
model = AutoModelForCausalLM.from_pretrained(
    "huggyllama/llama-7b",
    torch_dtype=torch.float16,
    device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  7.67it/s]


In [3]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import OrderedDict
from tqdm.auto import tqdm
import gc

# Load the gradients to verify
with open("grads/llama7b_grads_out.pt", "rb") as f:
    importance_dict = torch.load(f)

# Calculate the average importance of each layer
importance_avg = OrderedDict()
for layer_name, importance in importance_dict.items():
    importance_avg[layer_name] = torch.mean(importance).item()

In [None]:

import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import OrderedDict
import numpy as np
import gc
import warnings

# Suppress a specific warning from the transformers library for cleaner output.
warnings.filterwarnings("ignore", message=".*pad_token_id.*")

def calculate_truncation_ranks(model, importance_dict, compression_ratio, smoothing_alpha):
    """
    Calculates the number of singular values (k) to keep for each layer
    based on a compression budget and layer importance scores.

    Args:
        model (nn.Module): The transformer model to be compressed.
        importance_dict (OrderedDict): A dictionary with layer names as keys and
                                       their importance scores as values.
        compression_ratio (float): The target compression ratio (e.g., 0.6 for 60%).
                                   This means the final size should be 40% of the original.
        smoothing_alpha (float): A factor to smooth the importance distribution.
                                 - 1.0: Ranks are directly proportional to importance.
                                 - (0, 1): Differences are smoothed, leading to more
                                           uniform ranks. A value closer to 0 means
                                           more smoothing.
                                 - > 1.0: Differences are exaggerated.

    Returns:
        dict: A dictionary containing the calculated ranks for each layer.
        dict: A dictionary containing detailed stats for each layer.
    """
    print("Starting rank calculation...")
    
    # --- 1. Collect Layer Information ---
    layer_info = {}
    total_original_params = 0
    
    importance_dict = {k + '.weight': v for k, v in importance_dict.items()}

    # Filter to only include linear layers present in the importance_dict
    compressible_layers = {name: param for name, param in model.named_parameters() if name in importance_dict and isinstance(model.get_submodule(name.rsplit('.', 1)[0]), nn.Linear)}

    for name, param in compressible_layers.items():
        if param.dim() == 2: # Ensure it's a 2D weight matrix (Linear layer)
            rows, cols = param.shape
            original_params = rows * cols
            
            # The number of parameters after SVD truncation to rank k is k * (rows + cols)
            param_cost_per_rank = rows + cols
            
            # Constraint: To ensure compression, k must be less than half the smallest dimension.
            # If k >= (rows * cols) / (rows + cols), we are not compressing.
            # The user-specified constraint is k < min(rows, cols) / 2.
            max_rank = (min(rows, cols) // 2) - 1
            
            if max_rank <= 0:
                print(f"Skipping layer {name} as it cannot be compressed with the given constraints.")
                continue

            layer_info[name] = {
                'shape': (rows, cols),
                'original_params': original_params,
                'cost_per_rank': param_cost_per_rank,
                'max_rank': max_rank,
                'importance': importance_dict.get(name, 0.0)
            }
            total_original_params += original_params

    if not layer_info:
        print("No compressible layers found or matched with importance_dict. Aborting.")
        return {}, {}

    print(f"Found {len(layer_info)} compressible layers.")
    print(f"Total original parameters in these layers: {total_original_params:,}")

    # --- 2. Normalize and Smooth Importance Scores ---
    total_importance = sum(info['importance'] for info in layer_info.values())
    for name in layer_info:
        # Normalize importance to sum to 1
        normalized_importance = layer_info[name]['importance'] / total_importance
        # Apply smoothing
        layer_info[name]['smoothed_importance'] = normalized_importance ** smoothing_alpha

    # Renormalize smoothed scores to sum to 1
    total_smoothed_importance = sum(info['smoothed_importance'] for info in layer_info.values())
    for name in layer_info:
        layer_info[name]['final_weight'] = layer_info[name]['smoothed_importance'] / total_smoothed_importance


    # --- 3. Iterative Rank Allocation ---
    target_total_params = total_original_params * compression_ratio
    print(f"Target parameters after compression: {int(target_total_params):,}")

    # Initialize loop variables
    final_ranks = {}
    remaining_budget = target_total_params
    layers_to_process = list(layer_info.keys())
    
    is_stable = False
    while not is_stable and layers_to_process:
        is_stable = True
        
        # Calculate the allocation constant 'C' based on the current set of layers
        # The total budget is sum(k_i * cost_i). We model k_i = C * weight_i.
        # So, budget = sum(C * weight_i * cost_i) = C * sum(weight_i * cost_i).
        # Therefore, C = budget / sum(weight_i * cost_i).
        
        current_total_weighted_cost = sum(layer_info[name]['final_weight'] * layer_info[name]['cost_per_rank'] for name in layers_to_process)
        
        if current_total_weighted_cost == 0:
            break # Avoid division by zero if no layers are left

        allocation_constant = remaining_budget / current_total_weighted_cost

        # Determine tentative ranks and identify layers that exceed their max_rank
        newly_capped_layers = []
        next_layers_to_process = []

        for name in layers_to_process:
            info = layer_info[name]
            tentative_rank = allocation_constant * info['final_weight']
            
            if tentative_rank >= info['max_rank']:
                # This layer's rank is capped. Fix it and remove from next iteration.
                is_stable = False
                final_ranks[name] = info['max_rank']
                capped_params = info['max_rank'] * info['cost_per_rank']
                remaining_budget -= capped_params
                newly_capped_layers.append(name)
            else:
                # This layer is still in contention
                next_layers_to_process.append(name)
        
        layers_to_process = next_layers_to_process

    # After the loop, allocate ranks for the remaining (uncapped) layers
    if layers_to_process:
        current_total_weighted_cost = sum(layer_info[name]['final_weight'] * layer_info[name]['cost_per_rank'] for name in layers_to_process)
        if current_total_weighted_cost > 0:
            allocation_constant = remaining_budget / current_total_weighted_cost
            for name in layers_to_process:
                final_ranks[name] = int(max(1, np.floor(allocation_constant * layer_info[name]['final_weight'])))

    # --- 4. Final Report Generation ---
    detailed_stats = OrderedDict()
    total_final_params = 0
    for name, info in sorted(layer_info.items(), key=lambda x: x[1]['importance'], reverse=True):
        rank = final_ranks.get(name, 0)
        new_params = rank * info['cost_per_rank']
        total_final_params += new_params
        
        individual_compression = 1.0 - (new_params / info['original_params']) if info['original_params'] > 0 else 0
        
        detailed_stats[name] = {
            "shape": info['shape'],
            "importance": info['importance'],
            "original_params": info['original_params'],
            "final_rank_k": rank,
            "new_params": new_params,
            "compression": f"{individual_compression:.2%}"
        }
        
    actual_compression_ratio = 1.0 - (total_final_params / total_original_params)

    print("\n--- Compression Results ---")
    print(f"Target Compression Ratio: {compression_ratio:.2%}")
    print(f"Achieved Compression Ratio: {actual_compression_ratio:.2%}")
    print(f"Original Parameters: {total_original_params:,}")
    print(f"Final Parameters: {int(total_final_params):,}")
    print("---------------------------\n")
    
    final_ranks = {k.replace('.weight', ''): v for k, v in final_ranks.items()}

    return final_ranks, detailed_stats


if __name__ == '__main__':
    # --- Configuration ---
    MODEL_ID = "huggyllama/llama-7b"
    COMPRESSION_RATIO = 0.60  # Target: 60% smaller, 40% of original size
    SMOOTHING_ALPHA = 0.0     # Value between 0 and 1. Closer to 0 = more uniform ranks.

    # --- 1. Load Model and Importance Scores ---
    print(f"Loading model: {MODEL_ID}. This may take a while...")
    # Using low_cpu_mem_usage to handle large models more efficiently.
    # If you have a GPU, you can add device_map='auto'.
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    
    # Load the gradients to verify
    with open("grads/llama7b_grads_out.pt", "rb") as f:
        importance_dict = torch.load(f)
        
    # Calculate the average importance of each layer
    importance_avg = OrderedDict()
    for layer_name, importance in importance_dict.items():
        importance_avg[layer_name] = torch.mean(importance).item()

    # --- 2. Run the Algorithm ---
    final_ranks, detailed_stats = calculate_truncation_ranks(
        model=model,
        importance_dict=importance_avg,
        compression_ratio=COMPRESSION_RATIO,
        smoothing_alpha=SMOOTHING_ALPHA
    )

    # --- 3. Print Detailed Layer-by-Layer Results ---
    if detailed_stats:
        print(f"{'Layer Name':<40} {'Importance':<12} {'Shape':<15} {'Orig. Params':<15} {'New Rank (k)':<15} {'New Params':<15} {'Compression'}")
        print("-" * 140)
        for name, stats in detailed_stats.items():
            print(f"{name:<40} {stats['importance']:<12.4f} {str(stats['shape']):<15} {stats['original_params']:,<15} {stats['final_rank_k']:,<15} {stats['new_params']:,<15} {stats['compression']}")
    
    # --- 4. Clean up ---
    del model
    gc.collect()
    print("\nDone.")

Loading model: huggyllama/llama-7b. This may take a while...


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  7.81it/s]


Starting rank calculation...
Found 224 compressible layers.
Total original parameters in these layers: 6,476,005,376
Target parameters after compression: 3,885,603,225

--- Compression Results ---
Target Compression Ratio: 60.00%
Achieved Compression Ratio: 40.01%
Original Parameters: 6,476,005,376
Final Parameters: 3,885,260,800
---------------------------

Layer Name                               Importance   Shape           Orig. Params    New Rank (k)    New Params      Compression
--------------------------------------------------------------------------------------------------------------------------------------------
model.layers.31.self_attn.o_proj.weight  0.6777       (4096, 4096)    16777216,,,,,,, 1555,,,,,,,,,,, 12738560,,,,,,, 24.07%
model.layers.30.mlp.down_proj.weight     0.4827       (4096, 11008)   45088768,,,,,,, 1555,,,,,,,,,,, 23486720,,,,,,, 47.91%
model.layers.12.self_attn.v_proj.weight  0.4106       (4096, 4096)    16777216,,,,,,, 1555,,,,,,,,,,, 12738560,,,,,,, 

In [2]:
def get_truncate(in_features, out_features, ratio):
    return int(in_features * out_features * ratio / (in_features + out_features))

print(get_truncate(4096, 4096, 0.6))  # Example usage

1228


In [3]:
print(get_truncate(4096, 11008, 0.6))

1791


In [19]:
print(final_ranks)

{'model.layers.0.self_attn.q_proj': 1555, 'model.layers.0.self_attn.k_proj': 1555, 'model.layers.0.self_attn.v_proj': 1555, 'model.layers.0.self_attn.o_proj': 1555, 'model.layers.0.mlp.gate_proj': 1555, 'model.layers.0.mlp.up_proj': 1555, 'model.layers.0.mlp.down_proj': 1555, 'model.layers.1.self_attn.q_proj': 1555, 'model.layers.1.self_attn.k_proj': 1555, 'model.layers.1.self_attn.v_proj': 1555, 'model.layers.1.self_attn.o_proj': 1555, 'model.layers.1.mlp.gate_proj': 1555, 'model.layers.1.mlp.up_proj': 1555, 'model.layers.1.mlp.down_proj': 1555, 'model.layers.2.self_attn.q_proj': 1555, 'model.layers.2.self_attn.k_proj': 1555, 'model.layers.2.self_attn.v_proj': 1555, 'model.layers.2.self_attn.o_proj': 1555, 'model.layers.2.mlp.gate_proj': 1555, 'model.layers.2.mlp.up_proj': 1555, 'model.layers.2.mlp.down_proj': 1555, 'model.layers.3.self_attn.q_proj': 1555, 'model.layers.3.self_attn.k_proj': 1555, 'model.layers.3.self_attn.v_proj': 1555, 'model.layers.3.self_attn.o_proj': 1555, 'model.

In [7]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
from collections import OrderedDict, defaultdict
import numpy as np
import re
import gc
import warnings

# Suppress a specific warning from the transformers library for cleaner output.
warnings.filterwarnings("ignore", message=".*pad_token_id.*")

def calculate_truncation_ranks(model, importance_dict, compression_ratio, smoothing_alpha):
    """
    Calculates the number of singular values (k) to keep for each layer,
    applying the compression budget independently to each group of layers (e.g.,
    all 'q_proj' layers, all 'down_proj' layers, etc.).

    Args:
        model (nn.Module): The transformer model to be compressed.
        importance_dict (OrderedDict): A dictionary with layer names as keys and
                                       their importance scores as values.
        compression_ratio (float): The target compression ratio (e.g., 0.6 for 60%).
                                   This means the final size should be 40% of the original.
        smoothing_alpha (float): A factor to smooth the importance distribution.

    Returns:
        dict: A dictionary containing the calculated ranks for each layer (with clean keys).
        dict: A dictionary containing detailed stats for each layer.
    """
    print("Starting rank calculation with grouped compression...")

    # --- 1. Group Layers by Type ---
    grouped_layer_info = defaultdict(dict)
    # Regex to extract the layer type (e.g., 'self_attn.q_proj') from the full name
    layer_type_re = re.compile(r'.*\.layers\.\d+\.([a-zA-Z_]+\.[a-zA-Z_]+proj)')

    for name, param in model.named_parameters():
        module_name = name.rsplit('.', 1)[0]
        is_in_importance_dict = name in importance_dict or module_name in importance_dict
        
        try:
            is_linear_layer = isinstance(model.get_submodule(module_name), nn.Linear)
        except AttributeError:
            is_linear_layer = False

        if is_linear_layer and is_in_importance_dict and param.dim() == 2:
            match = layer_type_re.match(name)
            if not match:
                print(f"Warning: Could not determine group for layer {name}. Skipping.")
                continue
            group_key = match.group(1)

            importance_key = name if name in importance_dict else module_name
            importance_score = importance_dict.get(importance_key, 0.0)
            if isinstance(importance_score, torch.Tensor):
                importance_score = torch.mean(torch.abs(importance_score)).item()

            rows, cols = param.shape
            max_rank = (min(rows, cols) // 2) - 1
            if max_rank <= 0:
                continue

            grouped_layer_info[group_key][name] = {
                'shape': (rows, cols),
                'original_params': rows * cols,
                'cost_per_rank': rows + cols,
                'max_rank': max_rank,
                'importance': importance_score
            }

    if not grouped_layer_info:
        print("No compressible layers found or matched with importance_dict. Aborting.")
        return {}, {}

    # --- 2. Process Each Group Independently ---
    final_ranks = {}
    detailed_stats = OrderedDict()
    overall_original_params = 0
    overall_final_params = 0

    for group_name, layer_info in grouped_layer_info.items():
        print(f"\n--- Processing Group: {group_name} ({len(layer_info)} layers) ---")
        
        # --- 2a. Normalize and Smooth Importance (within group) ---
        total_importance_group = sum(info['importance'] for info in layer_info.values())
        if total_importance_group == 0:
            for name in layer_info:
                layer_info[name]['final_weight'] = 1.0 / len(layer_info)
        else:
            for name in layer_info:
                normalized_importance = layer_info[name]['importance'] / total_importance_group
                layer_info[name]['smoothed_importance'] = normalized_importance ** smoothing_alpha
            total_smoothed_importance = sum(info['smoothed_importance'] for info in layer_info.values())
            for name in layer_info:
                layer_info[name]['final_weight'] = layer_info[name]['smoothed_importance'] / total_smoothed_importance

        # --- 2b. Calculate Group Budget ---
        total_original_params_group = sum(info['original_params'] for info in layer_info.values())
        target_total_params_group = total_original_params_group * compression_ratio
        overall_original_params += total_original_params_group
        
        print(f"Group Original Params: {total_original_params_group:,}")
        print(f"Group Target Params:   {int(target_total_params_group):,}")

        # --- 2c. Iterative Rank Allocation (for this group) ---
        group_final_ranks = {}
        remaining_budget = target_total_params_group
        layers_to_process = list(layer_info.keys())
        
        is_stable = False
        while not is_stable and layers_to_process:
            is_stable = True
            weighted_cost_sum = sum(layer_info[name]['final_weight'] * layer_info[name]['cost_per_rank'] for name in layers_to_process)
            if weighted_cost_sum == 0: break
            
            alloc_const = remaining_budget / weighted_cost_sum
            next_layers_to_process = []
            
            for name in layers_to_process:
                info = layer_info[name]
                tentative_rank = alloc_const * info['final_weight']
                if tentative_rank >= info['max_rank']:
                    is_stable = False
                    group_final_ranks[name] = info['max_rank']
                    remaining_budget -= info['max_rank'] * info['cost_per_rank']
                else:
                    next_layers_to_process.append(name)
            layers_to_process = next_layers_to_process

        if layers_to_process:
            weighted_cost_sum = sum(layer_info[name]['final_weight'] * layer_info[name]['cost_per_rank'] for name in layers_to_process)
            if weighted_cost_sum > 0:
                alloc_const = remaining_budget / weighted_cost_sum
                for name in layers_to_process:
                    group_final_ranks[name] = int(max(1, np.floor(alloc_const * layer_info[name]['final_weight'])))
        
        final_ranks.update(group_final_ranks)

        # --- 2d. Update Stats for Reporting ---
        for name, info in layer_info.items():
            rank = group_final_ranks.get(name, 0)
            new_params = rank * info['cost_per_rank']
            overall_final_params += new_params
            individual_compression = 1.0 - (new_params / info['original_params']) if info['original_params'] > 0 else 0
            detailed_stats[name] = {
                "group": group_name,
                "shape": info['shape'],
                "importance": info['importance'],
                "original_params": info['original_params'],
                "final_rank_k": rank,
                "new_params": new_params,
                "compression": f"{individual_compression:.2%}"
            }

    # --- 3. Final Report Generation ---
    actual_compression_ratio = 1.0 - (overall_final_params / overall_original_params) if overall_original_params > 0 else 0
    print("\n--- Overall Compression Results ---")
    print(f"Target Compression Ratio:   {compression_ratio:.2%}")
    print(f"Achieved Compression Ratio: {actual_compression_ratio:.2%}")
    print(f"Original Parameters: {overall_original_params:,}")
    print(f"Final Parameters:    {int(overall_final_params):,}")
    print("-----------------------------------\n")

    # --- 4. Clean up keys for return ---
    final_ranks_clean = {k.replace('.weight', ''): v for k, v in final_ranks.items()}

    return final_ranks_clean, detailed_stats


if __name__ == '__main__':
    # --- Configuration ---
    MODEL_ID = "huggyllama/llama-7b"
    COMPRESSION_RATIO = 0.60  # Target: 60% smaller, 40% of original size
    SMOOTHING_ALPHA = 0.1# Value between 0 and 1. Closer to 0 = more uniform ranks.

    # --- 1. Load Model and Importance Scores ---
    print(f"Loading model: {MODEL_ID}. This may take a while...")
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    
    # Load the gradients to verify
    with open("grads/llama7b_grads_out.pt", "rb") as f:
        importance_dict = torch.load(f)
        
    # Calculate the average importance of each layer
    importance_avg = OrderedDict()
    for layer_name, importance in importance_dict.items():
        importance_avg[layer_name] = torch.mean(importance).item()
    # --- 2. Run the Algorithm ---
    final_ranks, detailed_stats = calculate_truncation_ranks(
        model=model,
        importance_dict=importance_avg,
        compression_ratio=COMPRESSION_RATIO,
        smoothing_alpha=SMOOTHING_ALPHA
    )

    # --- 3. Print Detailed Layer-by-Layer Results ---
    if detailed_stats:
        # Sort stats by group and then by importance for a structured view
        sorted_stats = sorted(detailed_stats.items(), key=lambda item: (item[1]['group'], -item[1]['importance']))
        
        print(f"{'Group':<20} {'Layer Name':<45} {'Importance':<12} {'Shape':<15} {'Orig. Params':<15} {'New Rank (k)':<15} {'New Params':<15} {'Compression'}")
        print("-" * 160)
        for name, stats in sorted_stats:
            print(f"{stats['group']:<20} {name:<45} {stats['importance']:<12.4f} {str(stats['shape']):<15} {stats['original_params']:,<15} {stats['final_rank_k']:,<15} {stats['new_params']:,<15} {stats['compression']}")

    # --- 4. Clean up ---
    del model
    gc.collect()
    print("\nDone.")


Loading model: huggyllama/llama-7b. This may take a while...


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  8.47it/s]


Starting rank calculation with grouped compression...

--- Processing Group: self_attn.q_proj (32 layers) ---
Group Original Params: 536,870,912
Group Target Params:   322,122,547

--- Processing Group: self_attn.k_proj (32 layers) ---
Group Original Params: 536,870,912
Group Target Params:   322,122,547

--- Processing Group: self_attn.v_proj (32 layers) ---
Group Original Params: 536,870,912
Group Target Params:   322,122,547

--- Processing Group: self_attn.o_proj (32 layers) ---
Group Original Params: 536,870,912
Group Target Params:   322,122,547

--- Processing Group: mlp.gate_proj (32 layers) ---
Group Original Params: 1,442,840,576
Group Target Params:   865,704,345

--- Processing Group: mlp.up_proj (32 layers) ---
Group Original Params: 1,442,840,576
Group Target Params:   865,704,345

--- Processing Group: mlp.down_proj (32 layers) ---
Group Original Params: 1,442,840,576
Group Target Params:   865,704,345

--- Overall Compression Results ---
Target Compression Ratio:   60.0