In [1]:
import gc
import os
import time
import torch
import numpy as np
from tqdm import tqdm
from copy import deepcopy
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
from instruct_pipeline import InstructionTextGenerationPipeline
# Ask the CUDA runtime to load kernels/modules lazily.
# NOTE(review): this is set after `import torch`; presumably it still takes
# effect because no CUDA call has been made yet at this point -- confirm.
os.environ["CUDA_MODULE_LOADING"] = "LAZY"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run on the first CUDA device when one is present; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
pretrained_dir = 'checkpoints/sparse_dolly_12/'  # 1:2 sparsity
model_name = 'databricks/dolly-v2-12b'

# Weights are read from the local checkpoint directory in half precision,
# then moved to the selected device and switched to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(pretrained_dir, torch_dtype=torch.half)
model = model.to(device)
model.eval()

# The tokenizer is loaded from `model_name`, not from `pretrained_dir`.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
generate_text = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

# Smoke-test generation on a single prompt with autograd disabled.
text = "Explain to me the difference between nuclear fission and fusion."
with torch.no_grad():
    res = generate_text(text)
print('The model output is: ', res[0]["generated_text"])

The model output is:  Nuclear fission and fusion are different types of nuclear fission. Nuclear fission occurs when atomic nuclei split into pieces and nuclear fission is what happens when nuclei split into nuclear fragments. Nuclear fusion occurs when atomic nuclei merge into pieces. Nuclear fusion happens when atomic nuclei unite into fragments. Nuclear fission is also referred to as explosion or explosion. Nuclear fusion is also known as fusion. Nuclear fission is also referred to as fission. Nuclear fusion is also known as fusion. Nuclear fission is also known as explosion. Nuclear fusion is also known as fusion. Nuclear fission is also referred to as explosion. Nuclear fusion is also known as fusion. Nuclear fission is also known as explosion. Nuclear fusion is also known as fusion. Nuclear fission is also referred to as explosion. Nuclear fusion is also known as fusion. Nuclear fission is also referred to as explosion. Nuclear fusion is also known as fusion. Nuclear fission is a

In [3]:
def count_ratio_zero_param(cur_model):
    """Return the percentage of exactly-zero entries in the selected layers.

    Only parameters whose name contains ``'dense'`` or ``'query_key_value'``
    are counted (both weights and biases match this filter).

    Args:
        cur_model: object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        float: ``100 * zeros / total`` over the selected parameters, i.e. a
        value in ``[0, 100]``.  ``0.0`` is returned when no parameter name
        matches, instead of dividing by zero.
    """
    # Drop unreachable Python objects first so that any tensors they own are
    # handed back to the allocator before the cache is emptied.
    gc.collect()
    # The CUDA housekeeping is only meaningful (and only legal) when CUDA is
    # usable; on the CPU fallback `device` is 'cpu' and these calls would raise.
    if torch.cuda.is_available():
        torch.cuda.synchronize(device)  # Wait for all operations to finish
        torch.cuda.empty_cache()  # Release unused cached memory
        torch.cuda.reset_peak_memory_stats(device)  # Reset peak memory stats

    param, zero_param = 0, 0
    for name, p in cur_model.named_parameters():
        if 'dense' not in name and 'query_key_value' not in name:
            continue
        param += p.numel()
        zero_param += (p == 0).sum().item()

    if param == 0:
        # Nothing matched the name filter: report 0% rather than crash.
        return 0.0
    return (zero_param / param) * 100.0
# `model.cpu()` is evaluated before the call, so the counting runs on a CPU model.
# The printed number is a percentage (the function multiplies by 100) even
# though the label says "Ratio".
print('Ratio of zero parameters in dense layers of model: {:.3f}'.format(count_ratio_zero_param(model.cpu())))

Ratio of zero parameters in dense layers of model: 49.993


In [4]:
def compress_weights(W):
    """Pack a 1:2-sparse weight matrix into its half-width dense form.

    ``W`` is expected to hold exactly one non-zero entry in every aligned pair
    of columns ``(2j, 2j + 1)``.  The non-zero entries are gathered in
    row-major order into ``V`` of shape ``(rows, cols // 2)``.  The packing is
    then verified numerically: for a random vector ``x`` (drawn from the
    global NumPy RNG), ``W @ x`` must be reproducible from ``V`` and the
    sparsity mask alone.

    Args:
        W: 2-D ``torch.Tensor`` with an even number of columns.  It may live
            on any device and may require grad; it is detached and copied to
            host memory before use.

    Returns:
        numpy.ndarray: ``V`` with shape ``(rows, cols // 2)`` and the same
        dtype as ``W``.

    Raises:
        ValueError: if ``W`` is not 2-D, has an odd number of columns, or does
            not contain exactly ``rows * (cols // 2)`` non-zero entries.
        AssertionError: if the product rebuilt from ``V`` does not match
            ``W @ x`` (e.g. the non-zeros are not laid out one per pair).
    """
    # .cpu() is a no-op for CPU tensors and makes .numpy() legal for CUDA ones.
    W = W.detach().cpu().numpy()
    if W.ndim != 2:
        raise ValueError(
            'compress_weights expects a 2-D matrix, got shape {}'.format(W.shape))
    rows, cols = W.shape
    if cols % 2 != 0:
        raise ValueError(
            '1:2 sparsity needs an even number of columns, got {}'.format(cols))
    half = cols // 2

    sparsity_mask = W != 0
    nonzeros = int(np.count_nonzero(sparsity_mask))
    if nonzeros != rows * half:
        # Fail with a readable message instead of a bare reshape error.
        raise ValueError(
            'expected {} non-zero entries for 1:2 sparsity, found {}'.format(
                rows * half, nonzeros))

    x = np.random.randn(cols, 1)
    a = W @ x  # reference result computed from the full sparse matrix

    # Boolean indexing walks W in row-major order, so each row of V holds that
    # row's surviving weights in column order.
    V = W[sparsity_mask].reshape(rows, half)

    # x_pairs[j, k] == x[2j + k] and mask_pairs[i, j, k] == mask[i, 2j + k];
    # summing their product over k selects, for every (row, pair), the x entry
    # that lines up with the surviving weight.
    x_pairs = x.reshape(half, 2)
    mask_pairs = sparsity_mask.reshape(rows, half, 2)
    x_selected = (mask_pairs * x_pairs).sum(axis=2)
    b = (V * x_selected).sum(axis=1)

    assert np.allclose(a.flatten(), b), \
        'compressed weights do not reproduce W @ x; is W really 1:2 sparse?'
    return V


def verify_dolly_sparsity(cur_model):
    """Count selected parameters before and after 1:2 compression.

    Only parameters whose name contains ``'dense'`` or ``'query_key_value'``
    are considered.  Tensors with more than one dimension are passed through
    ``compress_weights`` (which also verifies the packing) and contribute the
    size of the packed array; 1-D tensors are counted unchanged.

    Args:
        cur_model: object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs, e.g. a ``torch.nn.Module``.

    Returns:
        tuple[int, int]: ``(new_param, param)`` -- the element count after
        compression and the original element count of the selected tensors.
    """
    # Drop unreachable Python objects first so that any tensors they own are
    # handed back to the allocator before the cache is emptied.
    gc.collect()
    # The CUDA housekeeping is only meaningful (and only legal) when CUDA is
    # usable; on the CPU fallback `device` is 'cpu' and these calls would raise.
    if torch.cuda.is_available():
        torch.cuda.synchronize(device)  # Wait for all operations to finish
        torch.cuda.empty_cache()  # Release unused cached memory
        torch.cuda.reset_peak_memory_stats(device)  # Reset peak memory stats

    param, new_param = 0, 0
    for name, p in cur_model.named_parameters():
        if 'dense' not in name and 'query_key_value' not in name:
            continue
        param += p.numel()
        if p.dim() > 1:
            # Weight matrix: count what remains after packing.
            new_param += compress_weights(p).size
        else:
            # 1-D tensors (e.g. biases) are kept as they are.
            new_param += p.numel()
    return new_param, param
# `model.cpu()` is evaluated before the call, so compression runs on CPU tensors
# (compress_weights converts each tensor to NumPy).
new_param, param = verify_dolly_sparsity(model.cpu())
print(new_param, param)
# Printed as a percentage: compressed element count relative to the original.
print('Ratio of New model to old model: {:.3f}'.format((new_param/param)*100.0))

5663969280 11326279680
Ratio of New model to old model: 50.007


In [5]:
# cur_attn = model.gpt_neox.layers[2].attention.query_key_value.weight.data
# sparsity_mask = (cur_attn != 0).cpu().numpy()
# rows, cols = cur_attn.shape
# W = cur_attn.cpu().numpy()
# x = np.random.randn(cols, 1)
# a = W @ x

# V = W[sparsity_mask.astype(bool)].reshape(rows, cols // 2)
# P = V[np.newaxis, ...]*x.reshape(cols// 2, 2).T[:, np.newaxis, :]
# Q = P * sparsity_mask.reshape(-1, 2).T.reshape(2, rows, cols // 2)
# b = np.sum(Q, axis=(0, 2))
# print(np.allclose(a.flatten(), b))
# print(cur_attn.shape, V.shape)

In [6]:
# # Calculate the ratio of zeros in cur_attn
# cur_attn = model.gpt_neox.layers[2].attention.query_key_value.weight.data
# ratio_zeros = (torch.sum(cur_attn == 0).item() / cur_attn.numel()) * 100.0
# print('Ratio of zero parameters in current attention layer: {:.3f}%'.format(ratio_zeros))
# print(cur_attn.shape)
# print(cur_attn[:10, :5])
# # Plot the values of the attention matrix
# plt.figure(figsize=(10, 10))
# plt.imshow(cur_attn[:10, :10].cpu().numpy(), cmap='hot', interpolation='nearest')
# plt.colorbar()
# plt.show()