# 导入库

In [1]:
import statistics
import subprocess
import ctypes

import sten

import math
import numpy as np
import torch
from torch.profiler import profile, record_function, ProfilerActivity

from pathlib import Path

import timeit
import argparse
import sys
import time

from grouped_nmv_tensor import SrNMTensor, nm_vector_mask_sparsify

import spatha

  return self.fget.__get__(instance, owner)()
  value = getter(object, key)


In [3]:
class NMVectorSparsifier:
    # 类的构造函数，初始化实例变量。
    def __init__(self, n, m, tileM):
        self.n = n          # n 是用于某些稀疏操作的参数，具体含义根据上下文而定
        self.m = m          # m 是分块大小或其他相关操作的参数
        self.tileM = tileM  # tileM 是用于分块的尺寸，影响最终稀疏矩阵的结构

    # 特殊方法__call__使得实例可以像函数那样被调用
    def __call__(self, tensor, grad_fmt=None):
        # uncomment to use magnitude-pruning -> mask, columns
        # mask, columns = nm_vector_mask_sparsify(tensor, sparsifier.n, sparsifier.m, sparsifier.tileM)

        # uncomment to use random pruning (cuSparseLt-like approach) -> mask, columns
        
        # 以下是处理张量以生成稀疏结构的过程
        # 计算张量的行和列数
        nrows, ncols = tensor.shape

        # 创建存储列索引的张量，初始为全0，其大小由原张量的行数和列数决定，并按tileM和m进行调整
        columns = torch.zeros(nrows//self.tileM, ncols//self.m*4, dtype=torch.int32)
        # 重新形状并添加特定的列索引，这里加上了[0,1,2,3]，目的是为每个分块创建重复的模式
        columns = columns.reshape((-1,4)) + torch.tensor([0,1,2,3], dtype=torch.int32)
        # 再次调整形状，确保columns的形状与处理后的张量结构相匹配
        columns = columns.reshape((nrows//self.tileM, ncols//self.m*4))

        # 创建掩码张量，初始为全0，其形状与输入张量相同
        mask = torch.zeros(tensor.shape, dtype=tensor.dtype)
        # 创建一个小的模式掩码，并将其重复以填充整个掩码张量
        m = torch.cat( (torch.tensor([1,0,1,0]), torch.zeros(self.m-4)), 0 )
        # 通过广播添加模式掩码m到每个块
        mask = mask.reshape(-1, self.tileM, self.m) + m
        # 恢复掩码的原始形状以匹配输入张量
        mask = mask.reshape(tensor.shape)

        # 使用sten库创建一个稀疏张量包装器，这个稀疏张量基于之前创建的mask和columns
        sparse_mtx = sten.SparseTensorWrapper.wrapped_from_dense(
            SrNMTensor(self.n, self.m, self.tileM, tensor, mask, columns, tensor.device),
            tensor,
            grad_fmt,
        )

        # 返回处理后的稀疏矩阵对象
        return sparse_mtx

In [6]:
## 生成一个4096x2048的随机张量
torch.manual_seed(0)
origin_weight = torch.rand((4096, 2048), dtype=torch.float32)
#将w移动到cuda上使用半精度浮点数
original_weight = origin_weight.cuda().half()

In [7]:
w = NMVectorSparsifier(n, m, v)(original_weight).wrapped_tensor

In [9]:
values = torch.nn.Parameter(w.values)
values

Parameter containing:
tensor([0.4963, 0.0885, 0.2386,  ..., 0.6001, 0.4131, 0.2450], device='cuda:0',
       dtype=torch.float16, requires_grad=True)

In [10]:
columns = w.columns
columns

tensor([[0, 1, 2,  ..., 1, 2, 3],
        [0, 1, 2,  ..., 1, 2, 3],
        [0, 1, 2,  ..., 1, 2, 3],
        ...,
        [0, 1, 2,  ..., 1, 2, 3],
        [0, 1, 2,  ..., 1, 2, 3],
        [0, 1, 2,  ..., 1, 2, 3]], device='cuda:0', dtype=torch.int32)

In [3]:
class NMVectorSparsifier:
    # 类的构造函数，初始化实例变量。
    def __init__(self, n, m, tileM):
        self.n = n          # n 是用于某些稀疏操作的参数，具体含义根据上下文而定
        self.m = m          # m 是分块大小或其他相关操作的参数
        self.tileM = tileM  # tileM 是用于分块的尺寸，影响最终稀疏矩阵的结构

    # 特殊方法__call__使得实例可以像函数那样被调用
    def __call__(self, tensor, grad_fmt=None):
        # uncomment to use magnitude-pruning -> mask, columns
        # mask, columns = nm_vector_mask_sparsify(tensor, sparsifier.n, sparsifier.m, sparsifier.tileM)

        # uncomment to use random pruning (cuSparseLt-like approach) -> mask, columns
        
        # 以下是处理张量以生成稀疏结构的过程
        # 计算张量的行和列数
        nrows, ncols = tensor.shape

        # 创建存储列索引的张量，初始为全0，其大小由原张量的行数和列数决定，并按tileM和m进行调整
        columns = torch.zeros(nrows//self.tileM, ncols//self.m*4, dtype=torch.int32)
        # 重新形状并添加特定的列索引，这里加上了[0,1,2,3]，目的是为每个分块创建重复的模式
        columns = columns.reshape((-1,4)) + torch.tensor([0,1,2,3], dtype=torch.int32)
        # 再次调整形状，确保columns的形状与处理后的张量结构相匹配
        columns = columns.reshape((nrows//self.tileM, ncols//self.m*4))

        # 创建掩码张量，初始为全0，其形状与输入张量相同
        mask = torch.zeros(tensor.shape, dtype=tensor.dtype)
        # 创建一个小的模式掩码，并将其重复以填充整个掩码张量
        m = torch.cat( (torch.tensor([1,0,1,0]), torch.zeros(self.m-4)), 0 )
        # 通过广播添加模式掩码m到每个块
        mask = mask.reshape(-1, self.tileM, self.m) + m
        # 恢复掩码的原始形状以匹配输入张量
        mask = mask.reshape(tensor.shape)

        # 使用sten库创建一个稀疏张量包装器，这个稀疏张量基于之前创建的mask和columns
        sparse_mtx = sten.SparseTensorWrapper.wrapped_from_dense(
            SrNMTensor(self.n, self.m, self.tileM, tensor, mask, columns, tensor.device),
            tensor,
            grad_fmt,
        )

        # 返回处理后的稀疏矩阵对象
        return sparse_mtx

In [4]:
def sparse_dense_mul_dispatch(sparse_values, sparse_indices, sparse_metadata, dense, nrows_sp, ncols_sp, ncols_d, m, n, v, nnz, bias):

    dense_ = dense.contiguous()

    output = spatha.spmm(sparse_metadata,  # metadata
                          sparse_indices,   # indices
                          sparse_values,    # values
                          dense_,           # rhs_matrix
                          bias,
                          nrows_sp,         # A_num_rows
                          ncols_sp,         # A_num_cols
                          ncols_d,          # B_num_cols
                          v,                # vec_length
                          n,                # n
                          m,                # m
                          nnz,              # nnz
                          0,                # seed
                          32,               # mbrow
                          4                 # brow
                          )

    return output

In [5]:
class SrnmSpmm(torch.nn.Module):
    def __init__(self, original: torch.nn.Linear):
        super().__init__()
        self.bias = original.bias

        # Convert weights from original module to SrNM
        w = NMVectorSparsifier(n, m, v)(original.weight).wrapped_tensor

        self.values = torch.nn.Parameter(w.values)
        #self.columns = self.register_buffer('columns', w.columns)
        self.columns = w.columns
        self.metadata = w.metadata

        self.nrows_sp = w.nrows
        self.ncols_sp = w.ncols
        self.nnz      = w.nnz

    def forward(self, input):

        flattened_input = torch.flatten(input, start_dim=0, end_dim=-2)

        ncols_d  = flattened_input.T.shape[1]
        DM, _    = flattened_input.shape

        output = sparse_dense_mul_dispatch(self.values, self.columns, self.metadata, flattened_input.T, self.nrows_sp, self.ncols_sp,
                                           ncols_d, m, n, v, self.nnz, self.bias)
        output = output.reshape((*input.shape[0:-1], -1))[..., :DM]
        return output

In [6]:
def report_time(name, data, number):
    for d in data:
        time_ms = d / number * 1000
        #print(f'n {n} m {m} format {name} time_ms {time_ms:.3f}')
    ds = [(d / number * 1000) for d in data]
    mean = statistics.mean(ds)
    median = statistics.median(ds)
    std = statistics.stdev(ds)

    if name == "n:m":
        cfg = str(n)+","+str(m)+","
    else:
        cfg = "0,0,"
    print(
        "0,"+cfg+str(v)+","+str(mean)+","+str(median)+","+str(std)+","+str(len(ds))
    )

In [7]:
def linear_to_spmm(mod, weights_to_sparsify):
    if isinstance(mod, torch.nn.Linear):
        return SrnmSpmm(mod)

    for name, m in mod.named_children():
        if isinstance(m, SrnmSpmm):
            continue
        if isinstance(m, torch.nn.Linear):
            setattr(mod, name, SrnmSpmm(m))
        elif m is not mod:
            linear_to_spmm(m, weights_to_sparsify)

    return mod

In [8]:
def transformer_encoder_layer_prototype(num_repeats, number):
    # 加载原始的BERT大模型
    model = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-large-uncased')

    # 加载第二个BERT大模型，并转移到CUDA设备上，使用半精度浮点数(half precision)进行计算以提高性能
    model2 = torch.hub.load('huggingface/pytorch-transformers', 'model', 'bert-large-uncased').to(device='cuda:0').half()

    # 生成一个随机整数输入，大小为32x512，用于模型输入
    input = torch.randint(low=0, high=100, size=(32, 512))

    # 生成一个列表，包含模型中所有线性层（Linear layers）的权重，这些权重将被转换为稀疏格式
    weights_to_sparsify = [
        module
        for module_name, module in model.named_modules()
        if (
            isinstance(module, torch.nn.modules.linear.Linear)
            and "encoder.layer" in module_name
        )
    ]

    # 把模型和输入都转移到CUDA设备上，并使用半精度浮点数进行处理
    model = model.to(device='cuda:0').half()
    input = input.to(device='cuda:0')

    # 将选定的权重转换为稀疏格式，并创建一个新的稀疏模型
    sparse_model = linear_to_spmm(model, weights_to_sparsify)

    # 执行模型一次，通常用于预热缓存
    output = sparse_model(input)

    # 如果命令行参数指定进行性能分析
    if args.profile:
        # 开启一个性能分析会话，记录CPU和CUDA的活动
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof:
            with record_function("model_inference"):
                # 执行模型推理，记录相关数据
                output = sparse_model(input)
        # 导出分析数据
        prof.export_stacks("/tmp/profiler_stacks.txt", "self_cuda_time_total")
        prof.export_chrome_trace("trace_sparse.json")
        print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=15))

        # 重复上述过程，但是这次使用密集模型
        with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA], record_shapes=True, with_stack=True) as prof:
            with record_function("model_inference"):
                output = model2(input)
        prof.export_stacks("/tmp/profiler_stacks_dense.txt", "self_cuda_time_total")
        prof.export_chrome_trace("trace_dense.json")
        print(prof.key_averages().table(sort_by="self_cuda_time_total", row_limit=15))
        exit()

    # 使用timeit库，重复执行密集模型10次作为预热，然后进行正式的性能测试
    timeit.repeat('output = model2(input)', repeat=10, number=number, globals=locals())
    dense_times = timeit.repeat('output = model2(input)', repeat=num_repeats, number=number, globals=locals())
    report_time('dense', dense_times, number)

    # 对稀疏模型进行同样的预热和性能测试
    timeit.repeat('output = sparse_model(input)', repeat=10, number=number, globals=locals())
    sparse_times = timeit.repeat('output = sparse_model(input)', repeat=num_repeats, number=number, globals=locals())
    report_time('n:m', sparse_times, number)

In [9]:
torch. set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x7fb51f328500>

In [10]:
transformer_encoder_layer_prototype(num_repeats=30, number=1)

Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main
Using cache found in /root/.cache/torch/hub/huggingface_pytorch-transformers_main


0,0,0,64,28.492342207270365,28.310824069194496,0.7472337186168171,30
0,2,8,64,56.45544687286019,56.43744091503322,0.9001977038644426,30
