In [1]:
import os
# Restrict this process to GPU index 6. This runs in the first cell, before torch is
# imported in the next one, so the setting is already in place when CUDA gets initialized.
os.environ["CUDA_VISIBLE_DEVICES"] = "6"

In [2]:
# notebooks/5_sparse.ipynb

import sys
import gc
import torch
import json
import numpy as np
import random
from datetime import datetime
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Make the parent of the current working directory importable so that the `src.*`
# imports below resolve. The membership check keeps re-runs of this cell from
# appending duplicate entries to sys.path.
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

from src.data_utils import load_datasets, extract_inputs_and_labels
from src.model_utils import load_model_and_tokenizer

from src.activations import (
    load_activations, 
    load_weight_l2_info
)

from src.pruning_utils.compute_scores import compute_all_layers_scores
from src.pruning_utils.generate_masks import (
    generate_masks_for_all_layers,
    save_masks_to_file,
    compute_layerwise_sparsity
)
from src.pruning_utils.apply_pruning import apply_pruning_to_model


In [3]:
# Seed every RNG in use (PyTorch, Python's `random`, NumPy) with the same fixed value
for _seed_rng in (torch.manual_seed, random.seed, np.random.seed):
    _seed_rng(42)

In [4]:
# Name of the model directory under MODELS_ROOT_PATH; also used as the sub-directory
# name under ACTIVATIONS_ROOT_PATH.
MODEL_NAME = "Llama-2-7b-hf"
# Root directory holding the model checkpoints. The default is a machine-specific
# absolute path; set the MODELS_ROOT_PATH environment variable to point elsewhere
# without editing the notebook.
MODELS_ROOT_PATH = os.environ.get("MODELS_ROOT_PATH", "/mnt/data102_d2/huggingface/models")
# Root directory of the saved activations, relative to the notebook's working directory.
ACTIVATIONS_ROOT_PATH = '../activations'

In [5]:
# Pruning hyper-parameters used by the scoring and mask-generation cells below.
METHOD = "WIFV"        # method used by compute_scores (IFV, WIFV, WIFN, VAR, MEAN)
STRUCTURE = "AL-AM"    # structure strategy used when generating masks
PRUNING_RATIO = 0.1    # pruning ratio
# NOTE(review): in the cells visible here REMOVE_HEADS is only echoed in a log message
# and is never passed to generate_masks_for_all_layers — confirm that is intended.
REMOVE_HEADS = 1       # only used when STRUCTURE requires removing a fixed number of heads
# NOTE(review): GLOBAL_PRUNING is not referenced by any cell visible here — confirm it
# is still needed.
GLOBAL_PRUNING = False # whether to rank globally across layers

In [6]:
# Parameters for applying the pruning masks
USE_BIAS_COMPENSATION = False  # whether to apply bias compensation
UNSTRUCTURED_MASK = True       # True: only mask, nothing is actually removed (soft mask)


In [7]:
# Random seed used for testing
SEED = 42

# Seed PyTorch, Python's `random` and NumPy from the single SEED constant
for _reseed in (torch.manual_seed, random.seed, np.random.seed):
    _reseed(SEED)

In [None]:
# Load the model and tokenizer from MODELS_ROOT_PATH/MODEL_NAME via the project helper.
model_path = os.path.join(MODELS_ROOT_PATH, MODEL_NAME)
model, tokenizer = load_model_and_tokenizer(model_path)
# Switch to eval mode and move to "cuda" (the device exposed through
# CUDA_VISIBLE_DEVICES in the first cell). As the last expression of the cell,
# the returned object is echoed as the cell output.
model.eval().to("cuda")

In [None]:
# Inspect the attention output projection (o_proj) of layer index 1; shown as the cell output.
model.model.layers[1].self_attn.o_proj

In [None]:
# Load the saved activations for this model from ACTIVATIONS_ROOT_PATH/MODEL_NAME.
# The result is indexed by task key in the next cell.
activations_path = os.path.join(ACTIVATIONS_ROOT_PATH, MODEL_NAME)
activation_data_dict = load_activations(activations_path)

In [11]:
# Use the activation data of a single task (here 'gsm8k'); if several tasks are
# available, they could be merged or a different one picked instead.
TASK_KEY = 'gsm8k'
activation_data = activation_data_dict[TASK_KEY]  # shaped like: {layer_idx: {...}}

In [None]:
# Sanity check: length of the 'l2' entry stored under layer 0's 'mlp_input_states'.
len(activation_data[0]['mlp_input_states']['l2'])

In [None]:
# Load the weight L2 information stored in the same directory as the activations
weight_l2_path = os.path.join(activations_path, 'weight_l2_info.pt')
weight_l2_data = load_weight_l2_info(weight_l2_path)

In [None]:
# -------------- Inspect the structure of the activation data --------------
sample_layer_idx = 0
mlp_var = activation_data[sample_layer_idx]["mlp_intermediate_states"].get("var", None)
# "var" may be absent; in that case None is reported instead of a shape.
mlp_var_shape = None if mlp_var is None else mlp_var.shape
print(f"Layer {sample_layer_idx} mlp_intermediate_states.var shape = {mlp_var_shape}")


In [None]:
# -------------- Collect the key model dimensions --------------
# Every value is read from the loaded model's config rather than hard-coded.
model_config = model.config
num_layers = model_config.num_hidden_layers
hidden_size = model_config.hidden_size
num_heads = model_config.num_attention_heads
# MLP intermediate width as reported by the config (do not assume 4 * hidden_size).
intermediate_size = model_config.intermediate_size
print(
    f"num_layers={num_layers}, hidden_size={hidden_size}, "
    f"num_heads={num_heads}, intermediate_size={intermediate_size}"
)


In [None]:
# -------------- Compute pruning scores --------------
# Inputs: the selected task's activation data, the weight L2 info and the model
# dimensions read from the config. The result is indexed by layer in the cells below.
scores_dict = compute_all_layers_scores(
    activation_data=activation_data,
    weight_data=weight_l2_data,
    num_layers=num_layers,
    hidden_size=hidden_size,
    num_heads=num_heads,
    intermediate_size=intermediate_size,
    method=METHOD  # WIFV is used here (value of METHOD)
)

print(f"Computed scores_dict with method={METHOD}.")

In [None]:
# Peek at the scores of the first two layers (fewer if the model has fewer layers)
for layer_idx in range(min(2, num_layers)):
    layer_scores = scores_dict[layer_idx]
    attn_scores = layer_scores["attn_scores"]
    mlp_scores = layer_scores["mlp_scores"]
    print(
        f"\nLayer {layer_idx}: "
        f"attn_scores shape={attn_scores.shape}, "
        f"mlp_scores shape={mlp_scores.shape}"
    )
    print(f"  attn_scores[:5] = {attn_scores[:5]}")
    print(f"  mlp_scores[:5]  = {mlp_scores[:5]}")

In [None]:
# -------------- Generate masks --------------
# NOTE(review): REMOVE_HEADS and GLOBAL_PRUNING are not passed to this call, yet the
# log line below reports remove_heads — confirm whether they should be forwarded or
# the message trimmed.
attn_masks, mlp_masks = generate_masks_for_all_layers(
    scores_dict=scores_dict,
    structure=STRUCTURE,
    pruning_ratio=PRUNING_RATIO,
    hidden_size=hidden_size,   # required by AL-AM
    num_heads=num_heads        # required by AL-AM
)

print(f"Generated masks with structure={STRUCTURE}, pruning_ratio={PRUNING_RATIO}, remove_heads={REMOVE_HEADS}.")

In [None]:
# Report each mask's sum against its length for the first two layers
for layer_idx in range(min(2, num_layers)):
    a_mask = attn_masks[layer_idx]
    m_mask = mlp_masks[layer_idx]
    attn_summary = f"attn_mask.sum() = {a_mask.sum().item()} / {len(a_mask)}"
    mlp_summary = f"mlp_mask.sum() = {m_mask.sum().item()} / {len(m_mask)}"
    print(f"Layer {layer_idx}: {attn_summary}, {mlp_summary}")

In [None]:
# Per-layer sparsity of the attention and MLP masks, printed with three decimals
sparsities = compute_layerwise_sparsity(attn_masks, mlp_masks)
for layer_idx, data in sparsities.items():
    attn_sparsity = data['attn_sparsity']
    mlp_sparsity = data['mlp_sparsity']
    print(f"Layer {layer_idx}: attn_sparsity={attn_sparsity:.3f}, mlp_sparsity={mlp_sparsity:.3f}")

In [None]:
# -------------- Apply pruning --------------
# The call's return value is not captured; presumably `model` is modified in place,
# so re-running this cell would act on an already-pruned model — TODO confirm.
apply_pruning_to_model(
    model=model,
    attn_masks=attn_masks,
    mlp_masks=mlp_masks,
    attn_mean_inps=None,   # pass a dict here if baseline inputs (baseline_inp) are available
    mlp_mean_inps=None,    # same as above
    device="cuda",
    bias=USE_BIAS_COMPENSATION,
    unstr=UNSTRUCTURED_MASK,
    head_dim=hidden_size // num_heads  # per-head width derived from the config values
)

print(f"Pruning applied. [bias={USE_BIAS_COMPENSATION}, unstr={UNSTRUCTURED_MASK}]")