# Load HDF5 file

In [None]:
import os
import json

import h5py
import numpy as np

# Path to the HDF5 file holding the attribution scores
hdf5_path = "kva_result/hdf5/Qwen2.5-0.5B-Instruct/kva_mmlu.h5"

# Copy the "dataset" entry into an in-memory array so it remains usable
# after the file handle is closed.
with h5py.File(hdf5_path, mode="r") as h5_file:
    attribution_scores = np.array(h5_file["dataset"])
print(attribution_scores.shape)

In [None]:
QUOTA = 30  # Percentage of parameters to train (value between 0-100)

# The model name is the folder that directly contains the HDF5 file
model_name = os.path.basename(os.path.dirname(hdf5_path))
save_dir = f"./kva_result/pos_json/temp/{model_name}"
# exist_ok=True replaces the separate existence check: it avoids the
# check-then-create race and fails early if the path exists but is not a directory.
os.makedirs(save_dir, exist_ok=True)

# Select trainable nodes under the guidance of the Layer-Balanced Strategy


In [None]:
def split_n(num_layers):
    """Return an even split of one unit across ``num_layers`` layers.

    Args:
        num_layers: Number of layers to split across

    Returns:
        List with ``num_layers`` entries, each equal to ``1 / num_layers``
        (an empty list when ``num_layers`` is 0)
    """
    shares = []
    # The division stays inside the loop so that zero layers yields an
    # empty list instead of a division error.
    for _ in range(num_layers):
        shares.append(1 / num_layers)
    return shares

def select_trainable_nodes(attribution_scores, quota):
    """Select trainable nodes per layer with an equal budget for every layer.

    For each inference and each layer, the nodes with the lowest attribution
    scores receive one vote. After all inferences are processed, the nodes
    with the most votes in each layer are selected.

    Args:
        attribution_scores: 3D array of shape (num_inferences, num_layers, num_nodes)
                          containing attribution scores for each node
        quota: Percentage of parameters to train (value between 0-100)

    Returns:
        List of lists containing indices of selected nodes for each layer,
        ordered from most to least voted
    """
    num_inferences, num_layers, num_nodes = attribution_scores.shape

    # Equal distribution: every layer gets quota% of its own nodes.
    # This is computed directly rather than as
    # (1 / num_layers) * (num_layers * num_nodes * quota / 100), because that
    # float round-trip can land just below an integer (e.g. (1 / 49) * 49.0 is
    # 0.9999999999999999) and int() would then drop one node per layer.
    nodes_per_layer = int(num_nodes * quota / 100)
    k_per_layer = [nodes_per_layer] * num_layers
    print(f"Number of nodes to select per layer: {k_per_layer}")

    # node_counts[layer, node] = number of inferences that voted for the node
    node_counts = np.zeros((num_layers, num_nodes), dtype=int)

    # Process each inference and layer
    for infer_idx in range(num_inferences):
        for layer_idx in range(num_layers):
            # Get attribution scores for current layer
            layer_grad = attribution_scores[infer_idx, layer_idx, :]

            # Apply min-max normalization (a constant layer maps to all zeros)
            min_val = np.min(layer_grad)
            max_val = np.max(layer_grad)
            if max_val == min_val:
                normalized = np.zeros_like(layer_grad)
            else:
                normalized = (layer_grad - min_val) / (max_val - min_val)

            # Vote for the nodes with the lowest normalized scores
            smallest_indices = np.argsort(normalized)[: k_per_layer[layer_idx]]
            node_counts[layer_idx, smallest_indices] += 1

    # Select final trainable nodes for each layer
    result = []
    for layer_idx in range(num_layers):
        counts = node_counts[layer_idx]
        # A stable sort keeps tie-breaking deterministic across platforms
        # (same choice as the global selection variants); reversing gives
        # descending vote order.
        sorted_indices = np.argsort(counts, kind='stable')[::-1]

        # Keep the top k_per_layer nodes
        selected_indices = sorted_indices[: k_per_layer[layer_idx]]
        result.append(selected_indices.tolist())

    return result


# Run the layer-balanced selection for the configured quota
result = select_trainable_nodes(attribution_scores, QUOTA)
result_path = f"{save_dir}/{QUOTA}.json"

# Persist the per-layer index lists as JSON
with open(result_path, mode="w", encoding="utf-8") as json_file:
    json.dump(result, json_file, ensure_ascii=False, indent=4)

print(f"JSON file generated at: {result_path}")

# Select the lowest contribution nodes globally


In [None]:
def select_trainable_nodes(attribution_scores, quota):
    """
    Select trainable nodes globally, favouring the lowest attribution scores.

    For each inference, the ``total_trainable`` nodes with the lowest scores
    across all layers receive one vote. The nodes with the most votes over all
    inferences are selected and grouped by layer.

    Args:
        attribution_scores: 3D array of shape (num_inferences, num_layers, num_nodes)
                          containing attribution scores for each node
        quota: Percentage of parameters to train (value between 0-100)

    Returns:
        List of lists containing indices of selected nodes for each layer;
        within a layer, nodes keep their global ranking order
    """
    num_inferences, num_layers, num_nodes = attribution_scores.shape
    total_nodes = num_layers * num_nodes
    total_trainable = int(total_nodes * quota / 100)

    # Votes are kept flat: flat index = layer_idx * num_nodes + node_idx
    vote_counts = np.zeros(total_nodes, dtype=int)

    # Process each inference
    for infer_idx in range(num_inferences):
        # Flatten all layers of the current inference (row-major order)
        flattened_grad = attribution_scores[infer_idx, :, :].flatten()

        # Apply global min-max normalization (constant input maps to zeros)
        min_val = np.min(flattened_grad)
        max_val = np.max(flattened_grad)
        if max_val == min_val:
            normalized = np.zeros_like(flattened_grad)
        else:
            normalized = (flattened_grad - min_val) / (max_val - min_val)

        # Indices of the nodes with the lowest global normalized scores
        smallest_global_indices = np.argsort(normalized)[:total_trainable]

        # argsort indices are unique, so a fancy-indexed increment adds
        # exactly one vote per selected node without a Python-level loop.
        vote_counts[smallest_global_indices] += 1

    # Rank nodes by vote count, highest first; stable sort keeps ties deterministic
    sorted_global_indices = np.argsort(vote_counts, kind='stable')[::-1][:total_trainable]

    # Split flat indices back into (layer, node) and group by layer.
    # Boolean masking preserves the ranking order within each layer.
    layer_ids, node_ids = np.divmod(sorted_global_indices, num_nodes)
    return [
        node_ids[layer_ids == layer_idx].tolist()
        for layer_idx in range(num_layers)
    ]

# Run the global lowest-score selection for the configured quota
result = select_trainable_nodes(attribution_scores, QUOTA)
result_path = f"{save_dir}/GLOBAL_{QUOTA}_L.json"

# Persist the per-layer index lists as JSON
with open(result_path, mode="w", encoding="utf-8") as json_file:
    json.dump(result, json_file, ensure_ascii=False, indent=4)
print(f"JSON file generated at: {result_path}")

# Select the highest contribution nodes globally

In [None]:
def select_trainable_nodes(attribution_scores, quota):
    """
    Select trainable nodes globally, favouring the highest attribution scores.

    For each inference, the ``total_trainable`` nodes with the highest scores
    across all layers receive one vote. The nodes with the most votes over all
    inferences are selected and grouped by layer.

    Args:
        attribution_scores: 3D array of shape (num_inferences, num_layers, num_nodes)
                          containing attribution scores for each node
        quota: Percentage of parameters to train (value between 0-100)

    Returns:
        List of lists containing indices of selected nodes for each layer;
        within a layer, nodes keep their global ranking order
    """
    num_inferences, num_layers, num_nodes = attribution_scores.shape
    total_nodes = num_layers * num_nodes
    total_trainable = int(total_nodes * quota / 100)

    # Votes are kept flat: flat index = layer_idx * num_nodes + node_idx
    vote_counts = np.zeros(total_nodes, dtype=int)

    # Process each inference
    for infer_idx in range(num_inferences):
        # Flatten all layers of the current inference (row-major order)
        flattened_grad = attribution_scores[infer_idx, :, :].flatten()

        # Apply global min-max normalization (constant input maps to zeros)
        min_val = np.min(flattened_grad)
        max_val = np.max(flattened_grad)
        if max_val == min_val:
            normalized = np.zeros_like(flattened_grad)
        else:
            normalized = (flattened_grad - min_val) / (max_val - min_val)

        # Indices of the nodes with the HIGHEST normalized scores
        # (negating the scores turns the ascending argsort into descending)
        largest_global_indices = np.argsort(-normalized)[:total_trainable]

        # argsort indices are unique, so a fancy-indexed increment adds
        # exactly one vote per selected node without a Python-level loop.
        vote_counts[largest_global_indices] += 1

    # Rank nodes by vote count, highest first; stable sort keeps ties deterministic
    sorted_global_indices = np.argsort(vote_counts, kind='stable')[::-1][:total_trainable]

    # Split flat indices back into (layer, node) and group by layer.
    # Boolean masking preserves the ranking order within each layer.
    layer_ids, node_ids = np.divmod(sorted_global_indices, num_nodes)
    return [
        node_ids[layer_ids == layer_idx].tolist()
        for layer_idx in range(num_layers)
    ]

# Run the global highest-score selection for the configured quota
result = select_trainable_nodes(attribution_scores, QUOTA)
result_path = f"{save_dir}/GLOBAL_{QUOTA}_H.json"

# Persist the per-layer index lists as JSON
with open(result_path, mode="w", encoding="utf-8") as json_file:
    json.dump(result, json_file, ensure_ascii=False, indent=4)
print(f"JSON file generated at: {result_path}")

# Merge HDF5 files
The current code does not support multi-GPU execution of KVA, so we provide a multi-GPU workaround: run subtasks of MMLU on multiple GPUs in parallel, then merge their computation results. You can use the code below to perform the merge.


In [None]:
import os

import h5py

def merge_h5_files(input_dir, output_file):
    """
    Concatenate the 'dataset' of every HDF5 file in a folder into one output file.

    Input files are processed in sorted path order and stacked along the
    first axis. If the output file already contains a 'dataset', the new rows
    are appended after the existing ones.

    Args:
        input_dir (str): Path to the input folder.
        output_file (str): Path to the output file.

    Raises:
        ValueError: If the folder has no HDF5 files, a file lacks 'dataset',
            or shapes/dtypes (inputs or existing output) do not match.
    """
    destination = os.path.abspath(output_file)

    # Gather *.h5 / *.hdf5 files, skipping the output file itself
    source_paths = []
    for entry in os.listdir(input_dir):
        if not entry.endswith(('.h5', '.hdf5')):
            continue
        candidate = os.path.abspath(os.path.join(input_dir, entry))
        if candidate != destination:
            source_paths.append(candidate)

    if not source_paths:
        raise ValueError("No HDF5 files found in the input directory.")
    source_paths.sort()  # Deterministic merge order by path

    # First pass: validate every input before the output file is touched.
    # The first file defines the reference trailing shape and dtype.
    trailing_shape = None
    element_type = None
    rows_to_add = 0
    for path in source_paths:
        with h5py.File(path, 'r') as handle:
            if 'dataset' not in handle:
                raise ValueError(f"File {path} is missing 'dataset'")
            data = handle['dataset']
            if trailing_shape is None:
                trailing_shape = data.shape[1:]
                element_type = data.dtype
            elif data.shape[1:] != trailing_shape or data.dtype != element_type:
                raise ValueError(f"Dataset shape or dtype mismatch in file {path}")
            rows_to_add += data.shape[0]

    # Second pass: create or grow the output dataset, then copy the rows
    with h5py.File(output_file, 'a') as out:
        if 'dataset' not in out:
            # Fresh dataset, left unlimited along axis 0 so later merges can append
            merged = out.create_dataset(
                'dataset',
                shape=(rows_to_add,) + trailing_shape,
                maxshape=(None,) + trailing_shape,
                dtype=element_type
            )
            write_at = 0
        else:
            merged = out['dataset']
            if merged.shape[1:] != trailing_shape or merged.dtype != element_type:
                raise ValueError("Dataset in output file is not compatible.")
            # Append after the rows that are already present
            write_at = merged.shape[0]
            merged.resize((write_at + rows_to_add,) + trailing_shape)

        # Copy one input file at a time into its slice of the output
        for path in source_paths:
            with h5py.File(path, 'r') as handle:
                chunk = handle['dataset']
                stop = write_at + chunk.shape[0]
                merged[write_at:stop] = chunk[:]
                write_at = stop

    print(f"Successfully merged {len(source_paths)} files into {output_file}")

# Placeholder call: replace the empty strings with the folder that holds the
# HDF5 files to merge and the path of the merged output file before running.
merge_h5_files(input_dir="", output_file="")
