# 计算完整数组的皮尔逊相关性

In [1]:
import json
import pandas as pd
from scipy.stats import pearsonr
import re

def cal_all_data_pearson(filename1,filename2,rootpath="/root/autodl-tmp/methods/mix_quantize/model_info/llama2-7b/"):
    with open(rootpath+filename1, 'r') as f:
        data1 = json.load(f)
    
    with open(rootpath+filename2, 'r') as f:
        data2 = json.load(f)
    
    # 将数据转换为 DataFrame 格式
    df1 = pd.DataFrame(data1).T  # 转置以确保行代表样本
    df2 = pd.DataFrame(data2).T
    
    # 将两组数据展平为一维数组
    flat_data1 = df1.values.flatten()
    flat_data2 = df2.values.flatten()
    # print(flat_data1)
    # print(flat_data2)
    csv_data = pd.DataFrame({
                            'flat_data1': flat_data1,
                            'flat_data2': flat_data2
                            })
    csv_path = rootpath + 'flattened_data.csv'
    csv_data.to_csv(csv_path, index=False)

    # 计算总体皮尔逊相关系数
    corr, p_value = pearsonr(flat_data1, flat_data2)
     
    # 输出结果
    print(f"Pearson Correlation Coefficient: {corr:.4f}")
    print(f"Two-tailed p-value: {p_value:.4e}")
    
    # 检验显著性
    alpha = 0.05  # 显著性水平
    if p_value < alpha:
        print("The correlation is statistically significant.")
    else:
        print("The correlation is not statistically significant.")

def cal_all_data_pearson_with_difference(filename1,filename2,rootpath="/root/autodl-tmp/methods/mix_quantize/model_info/llama2-7b/"):
    with open(rootpath+filename1, 'r') as f:
        data1 = json.load(f)
    
    with open(rootpath+filename2, 'r') as f:
        data2 = json.load(f)

    original_perplexity = float(re.search(r"(\d+\.\d+)", filename2).group(1))
    
    # 将数据转换为 DataFrame 格式
    df1 = pd.DataFrame(data1).T  # 转置以确保行代表样本
    df2 = pd.DataFrame(data2).T 
    
    # 将两组数据展平为一维数组
    flat_data1 = df1.values.flatten()
    flat_data2 = df2.values.flatten() - original_perplexity
    
    # 计算总体皮尔逊相关系数
    corr, p_value = pearsonr(flat_data1, flat_data2)
    
    # 输出结果
    print(f"Pearson Correlation Coefficient: {corr:.4f}")
    print(f"Two-tailed p-value: {p_value:.4e}")
    
    # 检验显著性
    alpha = 0.05  # 显著性水平
    if p_value < alpha:
        print("The correlation is statistically significant.")
    else:
        print("The correlation is not statistically significant.")

In [2]:
filename1 = "fisher_data_1024_2_2024-12-21-15-55-27.json"
filename2 = "modified_perplexitys_1024_2_2024-12-21-15-55-27_2.7421875.json"
cal_all_data_pearson(filename1,filename2)

Pearson Correlation Coefficient: 0.1961
Two-tailed p-value: 3.2023e-03
The correlation is statistically significant.


In [18]:
cal_all_data_pearson_with_difference(filename1,filename2)

Overall Pearson Correlation: 0.1961


In [19]:
filename1 = "fisher_data_1024_10_2024-12-21-16-44-29.json"
filename2 = "modified_perplexitys_1024_10_2024-12-21-16-44-29_2.60546875.json"
cal_all_data_pearson(filename1,filename2)

Overall Pearson Correlation: -0.0364


In [20]:
filename1 = "fisher_data_4096_20_2024-12-21-15-40-48.json"
filename2 = "modified_perplexitys_4096_20_2024-12-21-15-40-48_2.44140625.json"
cal_all_data_pearson(filename1,filename2)

Overall Pearson Correlation: 0.2216


# 计算每个key的皮尔逊值

In [3]:
import json
import pandas as pd
from scipy.stats import pearsonr

def cal_key_pearson(filename1,filename2,rootpath="/root/autodl-tmp/methods/mix_quantize/model_info/llama2-7b/"):
    with open(rootpath+filename1, 'r') as f:
        data1 = json.load(f)["0"]
    
    with open(rootpath+filename2, 'r') as f:
        data2 = json.load(f)["0"]
        
    # 将数据转换为 DataFrame 格式
    df1 = pd.DataFrame(data1).T  # 转置以确保行代表样本
    df2 = pd.DataFrame(data2).T
    
    # 对应列计算皮尔逊相关性
    correlation_results = {}
    for column in df1.columns:
        if column in df2.columns:
            corr, _ = pearsonr(df1[column], df2[column])
            correlation_results[column] = corr
    
    # 输出结果
    for key, value in correlation_results.items():
        print(f"Column: {key}, Pearson Correlation: {value:.4f}")

In [22]:
filename1 = "fisher_data_1024_2_2024-12-21-15-55-27.json"
filename2 = "modified_perplexitys_1024_2_2024-12-21-15-55-27_2.7421875.json"
cal_key_pearson(filename1,filename2)

Column: self_attn.q_proj.weight, Pearson Correlation: 0.4893
Column: self_attn.k_proj.weight, Pearson Correlation: 0.0602
Column: self_attn.v_proj.weight, Pearson Correlation: -0.0854
Column: self_attn.o_proj.weight, Pearson Correlation: 0.1791
Column: mlp.gate_proj.weight, Pearson Correlation: 0.2548
Column: mlp.up_proj.weight, Pearson Correlation: 0.1098
Column: mlp.down_proj.weight, Pearson Correlation: 0.2691


In [23]:
filename1 = "fisher_data_4096_20_2024-12-21-15-40-48.json"
filename2 = "modified_perplexitys_4096_20_2024-12-21-15-40-48_2.44140625.json"
cal_key_pearson(filename1,filename2)

Column: self_attn.q_proj.weight, Pearson Correlation: 0.1740
Column: self_attn.k_proj.weight, Pearson Correlation: 0.4538
Column: self_attn.v_proj.weight, Pearson Correlation: -0.1413
Column: self_attn.o_proj.weight, Pearson Correlation: 0.0323
Column: mlp.gate_proj.weight, Pearson Correlation: 0.3305
Column: mlp.up_proj.weight, Pearson Correlation: 0.3184
Column: mlp.down_proj.weight, Pearson Correlation: 0.4135


In [4]:
filename1 = "fisher_data_1024_2_2024-12-23-14-07-30.json"
filename2 = "modified_perplexitys_1024_2_2024-12-23-14-07-30_2.7421875.json"
cal_key_pearson(filename1,filename2)

Column: self_attn.q_proj, Pearson Correlation: 0.2881
Column: self_attn.k_proj, Pearson Correlation: 0.0377
Column: self_attn.v_proj, Pearson Correlation: 0.1300
Column: self_attn.o_proj, Pearson Correlation: 0.1302
Column: mlp.gate_proj, Pearson Correlation: 0.1258
Column: mlp.up_proj, Pearson Correlation: 0.1269
Column: mlp.down_proj, Pearson Correlation: 0.1360


# 绘制不同位宽设定下的结果表格

In [14]:
import json
import pandas as pd

rootpath="/root/autodl-tmp/methods/mix_quantize/model_info/llama2-7b/"
# Load the data from the provided files
with open(rootpath+'fisher_data_1024_2_2024-12-22-19-12-03.json', 'r') as f:
    fisher_data = json.load(f)
with open(rootpath+'modified_perplexitys_1024_2_2024-12-22-19-12-03_2.7421875.json', 'r') as f:
    perplexity_data = json.load(f)

# 假设数据已经按照量化位宽 -> block_id -> layer_name组织
# reorganized_fisher_data = {width: {block_id: fisher_data[block_id] for block_id in fisher_data.keys()} for width, fisher_data in zip(bit_widths, fisher_data.values())}
# reorganized_perplexity_data = {width: {block_id: perplexity_data[block_id] for block_id in perplexity_data.keys()} for width, perplexity_data in zip(bit_widths, perplexity_data.values())}


In [34]:
# 创建一个用于存储结果的列表
bit_widths = [8,4,3,2]
reorganized_results = []

for block_id in fisher_data["8"].keys():  # 假设 "8" 是一个示例位宽
    for layer in fisher_data["8"][block_id].keys():
        row = {"Block ID": block_id, "Layer Name": layer}
        for width in bit_widths:
            fisher_value = fisher_data.get(f"{width}", {}).get(block_id, {}).get(layer)
            perplexity_value = perplexity_data.get(f"{width}", {}).get(block_id, {}).get(layer)
                     
            if fisher_value is not None and perplexity_value is not None:
                row[f"Width {width} (Fisher, Perplexity)"] = ("{:.2f}".format(fisher_value), "{:.5f}".format(perplexity_value))
            else:
                row[f"Width {width} (Fisher, Perplexity)"] = (None, None)  # 处理缺失值
        
        reorganized_results.append(row)


# 创建DataFrame
df_reorganized = pd.DataFrame(reorganized_results)

# 查看表格的前几行
df_reorganized.head(21)

Unnamed: 0,Block ID,Layer Name,"Width 8 (Fisher, Perplexity)","Width 4 (Fisher, Perplexity)","Width 3 (Fisher, Perplexity)","Width 2 (Fisher, Perplexity)"
0,0,self_attn.q_proj,"(0.46, 2.74219)","(0.41, 2.74414)","(0.34, 2.74219)","(0.69, 2.74219)"
1,0,self_attn.k_proj,"(4.75, 2.74219)","(4.80, 2.74219)","(4.59, 2.74219)","(4.91, 2.74414)"
2,0,self_attn.v_proj,"(1.29, 2.74219)","(1.22, 2.74219)","(1.27, 2.74609)","(894.50, 2.85547)"
3,0,self_attn.o_proj,"(1.13, 2.74219)","(1.12, 2.74219)","(1.04, 2.74219)","(1.20, 2.74609)"
4,0,mlp.gate_proj,"(0.98, 2.74219)","(0.99, 2.74414)","(0.96, 2.74219)","(1.13, 2.74609)"
5,0,mlp.up_proj,"(1.24, 2.74219)","(1.25, 2.74219)","(1.22, 2.74414)","(1.24, 2.74414)"
6,0,mlp.down_proj,"(1.16, 2.74219)","(1.20, 2.74414)","(1.17, 2.74414)","(1.27, 2.74609)"
7,1,self_attn.q_proj,"(0.67, 2.74219)","(0.67, 2.74219)","(0.68, 2.74414)","(0.54, 2.73828)"
8,1,self_attn.k_proj,"(0.97, 2.74219)","(0.97, 2.74414)","(1.04, 2.74219)","(1.31, 2.74609)"
9,1,self_attn.v_proj,"(1.26, 2.74414)","(1.41, 2.74609)","(1.70, 2.74609)","(3.34, 2.74609)"


# 量化位宽与fisher info 以及 量化位宽和困惑度变化的相关系数计算

In [2]:
import json
import pandas as pd
from scipy.stats import pearsonr
import re

def cal_bits2fisher_pearson(filename,rootpath="/root/autodl-tmp/methods/mix_quantize/model_info/llama2-7b/"):
    with open(rootpath+filename, 'r') as f:
        data = json.load(f)

    bitwidths = [2, 3, 4, 8]
    results = {}

    for key in data["8"][str(0)].keys():  # 获取所有key（假设每个block都有相同的key）
        # 对于每个量化位宽，计算对应key层在32个block_id上的平均值
        avg_fisher_values = []
        
        for bw in bitwidths:
            fisher_values_for_bw = []
            
            # 收集所有block_id中该量化位宽下的Fisher信息
            for block_id in range(32):
                fisher_values_for_bw.append(data[str(bw)][str(block_id)][key])
            
            # 计算该量化位宽下的平均Fisher信息
            avg_fisher = sum(fisher_values_for_bw) / len(fisher_values_for_bw)
            avg_fisher_values.append(avg_fisher)
        
        # 计算量化位宽和平均Fisher信息之间的皮尔逊相关系数及p-value
        corr, p_value = pearsonr(bitwidths*32, avg_fisher_values*32)

        print(avg_fisher_values)
        
        # 保存该key的结果
        results[key] = {
            "corr": corr,
            "p_value": p_value
        }
        
    df = pd.DataFrame(results).T
    # 输出结果
    print(df.head(7))

def cal_bits2perplexity_pearson(filename,rootpath="/root/autodl-tmp/methods/mix_quantize/model_info/llama2-7b/"):
    with open(rootpath+filename, 'r') as f:
        data = json.load(f)

    bitwidths = [2, 3, 4, 8]
    results = {}

    for key in data["8"][str(0)].keys():  # 获取所有key（假设每个block都有相同的key）
        # 对于每个量化位宽，计算对应key层在32个block_id上的平均值
        avg_ple_values = []
        
        for bw in bitwidths:
            ple_values_for_bw = []
            
            # 收集所有block_id中该量化位宽下的Fisher信息
            for block_id in range(32):
                ple_values_for_bw.append(data[str(bw)][str(block_id)][key])
            
            # 计算该量化位宽下的平均Fisher信息
            avg_ple = sum(ple_values_for_bw) / len(ple_values_for_bw)
            avg_ple_values.append(avg_ple)

        print(avg_ple_values)
        
        # 计算量化位宽和平均Fisher信息之间的皮尔逊相关系数及p-value
        corr, p_value = pearsonr(bitwidths*32, avg_ple_values*32)
        
        # 保存该key的结果
        results[key] = {
            "corr": corr,
            "p_value": p_value
        }
        
    df = pd.DataFrame(results).T
    # 输出结果
    print(df.head(7))


In [29]:
filename1 = "fisher_data_1024_2_2024-12-22-19-12-03.json"
filename2 = "modified_perplexitys_1024_2_2024-12-22-19-12-03_2.7421875.json"
cal_bits2fisher_pearson(filename1)
print("----"*10)
cal_bits2perplexity_pearson(filename2)

[1.33563232421875, 1.2452621459960938, 1.2432327270507812, 1.23663330078125]
[1.45599365234375, 1.3392333984375, 1.3439178466796875, 1.3390655517578125]
[29.12786865234375, 1.1214599609375, 1.084869384765625, 1.0731964111328125]
[1.1143646240234375, 1.1068878173828125, 1.1125335693359375, 1.115203857421875]
[1.164154052734375, 1.1173248291015625, 1.1090240478515625, 1.107513427734375]
[1.1347503662109375, 1.1279144287109375, 1.15338134765625, 1.151123046875]
[2.568939208984375, 1.141204833984375, 1.0986328125, 1.1203460693359375]
                      corr       p_value
self_attn.q_proj -0.632796  1.118280e-15
self_attn.k_proj -0.581320  6.264712e-13
self_attn.v_proj -0.571330  1.887489e-12
self_attn.o_proj  0.438540  2.247970e-07
mlp.gate_proj    -0.662916  1.540007e-17
mlp.up_proj       0.665661  1.016812e-17
mlp.down_proj    -0.573687  1.459922e-12
----------------------------------------
                      corr       p_value
self_attn.q_proj -0.884251  1.732190e-43
self_attn.k_p

In [3]:
filename1 = "fisher_data_1024_8_2024-12-28-11-58-49.json"
filename2 = "modified_perplexitys_1024_8_2024-12-28-11-58-49_2.623046875.json"
cal_bits2fisher_pearson(filename1)
print("----"*10)
cal_bits2perplexity_pearson(filename2)

[0.10924863815307617, 0.1071937084197998, 0.10734868049621582, 0.10463905334472656]
[0.09789252281188965, 0.1138380765914917, 0.10732710361480713, 0.10660231113433838]
[12.87924575805664, 1.8666038513183594, 1.8310279846191406, 1.7975082397460938]
[0.43309688568115234, 0.38916015625, 0.3827180862426758, 0.3842477798461914]
[0.12584829330444336, 0.12731456756591797, 0.11959362030029297, 0.1186528205871582]
[0.2528400421142578, 0.25220680236816406, 0.24974441528320312, 0.24515342712402344]
[6.668909072875977, 3.586162567138672, 3.1209278106689453, 3.126615524291992]
                      corr        p_value
self_attn.q_proj -0.953898   1.028537e-67
self_attn.k_proj  0.200767   2.306658e-02
self_attn.v_proj -0.574284   1.367413e-12
self_attn.o_proj -0.610379   2.026282e-14
mlp.gate_proj    -0.791431   1.034676e-28
mlp.up_proj      -0.990653  6.973975e-111
mlp.down_proj    -0.633419   1.028385e-15
----------------------------------------
                      corr       p_value
self_attn.q