In [22]:
GIGACHAT_DIR = "..."
DEEPSEEK_DIR = "..."

DEVICE = "cuda:0"
NUM_LAYERS = 61

RESULT_CSV_PATH = '...'

In [23]:
import os, re, glob, time
import torch
import pandas as pd
import numpy as np
from contextlib import ExitStack
from typing import Optional, Sequence
from safetensors import safe_open

# ---------- regex setup ----------
DEFAULT_PREFIXES = (
    r"model\.layers\.(\d+)\.",       # llama/qwen/deepseek-style
)

# --- skip patterns (ignore at indexing/load stage) ---
DEFAULT_SKIP_REGEXES = (
    r"\.experts\.",   # ignore MoE experts tensors
)

def iter_safetensors_files(model_dir):
    files = sorted(glob.glob(os.path.join(model_dir, "*.safetensors")))
    if not files:
        raise FileNotFoundError(f"No .safetensors files in {model_dir}")
    return files

def compile_regexes(patterns):
    if not patterns:
        return []
    return [re.compile(p) for p in patterns]

def compile_prefixes(prefixes):
    return compile_regexes(prefixes)

def _should_skip_key(key, regs):
    for rgx in regs:
        if rgx.search(key):
            return True
    return False

def extract_layer_id(key, regs):
    for rgx in regs:
        m = rgx.search(key)
        if m:
            return int(m.group(1))
    return None

# ---------- 1) lightweight index: layer -> {key: filepath} ----------
def build_layer_index(
    model_dir,
    n_layers=3,
    prefixes=DEFAULT_PREFIXES,
    skip_regexes=DEFAULT_SKIP_REGEXES,
):
    files = iter_safetensors_files(model_dir)
    regs = compile_prefixes(prefixes)
    skip_regs = compile_regexes(skip_regexes)

    layer_to_keys = {i: {} for i in range(n_layers)}
    for path in files:
        with safe_open(path, framework="pt", device="cpu") as f:
            for k in f.keys():
                if skip_regs and _should_skip_key(k, skip_regs):
                    continue
                lid = extract_layer_id(k, regs)
                if lid is None or lid >= n_layers:
                    continue
                layer_to_keys[lid][k] = path
    return layer_to_keys

# ---------- 2) metrics ----------
def _cos_sim(fa, fb, eps=1e-12):
    denom = (fa.norm() * fb.norm()).clamp_min(eps)
    return (fa @ fb / denom).item()

def _pearson(fa, fb, eps=1e-12):
    fa_c = fa - fa.mean()
    fb_c = fb - fb.mean()
    denom = (fa_c.norm() * fb_c.norm()).clamp_min(eps)
    return (fa_c @ fb_c / denom).item()

def _spearman(fa, fb, eps=1e-12):
    # ранги через двойной argsort
    ra = torch.argsort(torch.argsort(fa))
    rb = torch.argsort(torch.argsort(fb))
    ra = ra.float() - ra.float().mean()
    rb = rb.float() - rb.float().mean()
    denom = (ra.norm() * rb.norm()).clamp_min(eps)
    return (ra @ rb / denom).item()

def _quantiles(abs_diff, qs=(0.01, 0.05, 0.5, 0.95, 0.99), max_samples=None):
    if abs_diff.numel() == 0:
        return {}

    vals = abs_diff.flatten()

    if max_samples is not None and vals.numel() > max_samples:
        idx = torch.randperm(vals.numel(), device=vals.device)[:max_samples]
        vals = vals[idx]

    qv = torch.tensor(qs, device=vals.device, dtype=vals.dtype)
    try:
        out = torch.quantile(vals, qv)
    except RuntimeError:
        vals_cpu = vals.float().cpu().numpy()
        qv_np = qv.float().cpu().numpy()
        out_np = np.quantile(vals_cpu, qv_np)
        out = torch.from_numpy(out_np).to(vals)

    return {f"q{int(q*100):02d}_abs_diff": out[i].item() for i, q in enumerate(qs)}

def _sample_pair(fa, fb, max_samples=None):
    if max_samples is None or fa.numel() <= max_samples:
        return fa, fb
    idx = torch.randint(0, fa.numel(), (max_samples,), device=fa.device)
    return fa[idx], fb[idx]

def _rel_l2_err(fa, fb, eps=1e-12):
    return (fa - fb).norm().div(fa.norm().clamp_min(eps)).item()

def _rel_max_abs_diff(fa, fb, eps=1e-12):
    denom = fa.abs().clamp_min(eps)
    return ((fa - fb).abs() / denom).max().item()

def _rel_mean_abs_diff(fa, fb, eps=1e-12):
    denom = fa.abs().clamp_min(eps)
    return ((fa - fb).abs() / denom).mean().item()


# ---------- 2.5) spectral / SVD helpers ----------
def _to_2d_matrix(t: torch.Tensor) -> torch.Tensor:
    """
    Приводим произвольный тензор к 2D-матрице для SVD.
    Первая ось = строки, остальное = столбцы.
      - 1D -> (1, N)
      - 2D -> (M, N)
      - kD -> (d0, prod(d1..dk))
    """
    if t.ndim == 1:
        return t.unsqueeze(0)
    if t.ndim == 2:
        return t
    return t.reshape(t.shape[0], -1)

def _safe_svd_full(x: torch.Tensor, *, eps=1e-12):
    """
    Полное SVD (без усечения ранга): x = U diag(S) Vh.
    full_matrices=False -> экономичное U/Vh, но спектр S полный по рангу.
    """
    if x.numel() == 0 or x.abs().max().item() < eps:
        m, n = x.shape
        r = min(m, n)
        U = torch.zeros((m, r), device=x.device, dtype=x.dtype)
        S = torch.zeros((r,), device=x.device, dtype=x.dtype)
        Vh = torch.zeros((r, n), device=x.device, dtype=x.dtype)
        return U, S, Vh

    return torch.linalg.svd(x, full_matrices=False)

def _spectral_metrics(
    a: torch.Tensor,
    b: torch.Tensor,
    *,
    topk: int = 128,
    max_numel: Optional[int] = None,  # <-- по умолчанию без гарда
    force_cpu: bool = False,
    dtype: torch.dtype = torch.float32,
):
    """
    Считает метрики по полному SVD:
      - spec_cos_sim / spec_rel_l2_err / spec_pearson / spec_spearman по спектрам S
      - spec_topk_* по сингулярным векторам (U/V), |cos| с учётом знака
      - spec_subspace_overlap_* по подпространствам топ-k (1=совпало)
    """
    A = _to_2d_matrix(a)
    B = _to_2d_matrix(b)

    # гард на размер (если задан)
    if (max_numel is not None) and (A.numel() > max_numel or B.numel() > max_numel):
        return {"spec_note": f"skipped_large(numel_a={A.numel()}, numel_b={B.numel()})"}

    if force_cpu:
        A_ = A.detach().to("cpu", dtype=dtype)
        B_ = B.detach().to("cpu", dtype=dtype)
    else:
        A_ = A.detach().to(dtype=dtype)
        B_ = B.detach().to(dtype=dtype)

    try:
        Ua, Sa, Vha = _safe_svd_full(A_)
        Ub, Sb, Vhb = _safe_svd_full(B_)
    except RuntimeError as e:
        return {"spec_note": f"svd_failed({type(e).__name__})"}

    r = min(Sa.numel(), Sb.numel())
    Sa = Sa[:r]
    Sb = Sb[:r]

    out = {}
    out["spec_rel_l2_err"] = _rel_l2_err(Sa, Sb)
    out["spec_a_max"] = Sa.max()
    out["spec_a_min"] = Sa.min()

    out["spec_b_max"] = Sb.max()
    out["spec_b_min"] = Sb.min()

    k = int(min(topk, r))
    if k > 0:
        Ua_k = Ua[:, :k]
        Ub_k = Ub[:, :k]
        Va_k = Vha[:k, :].T   # V = Vh^T
        Vb_k = Vhb[:k, :].T

        # cos соответствующих сингулярных векторов (берём abs из-за знака)
        u_pair_cos = (Ua_k * Ub_k).sum(dim=0) / (
            Ua_k.norm(dim=0) * Ub_k.norm(dim=0)
        ).clamp_min(1e-12)
        v_pair_cos = (Va_k * Vb_k).sum(dim=0) / (
            Va_k.norm(dim=0) * Vb_k.norm(dim=0)
        ).clamp_min(1e-12)

        out["spec_topk_u_cos_mean"] = u_pair_cos.abs().mean().item()
        out["spec_topk_u_cos_min"]  = u_pair_cos.abs().min().item()
        out["spec_topk_v_cos_mean"] = v_pair_cos.abs().mean().item()
        out["spec_topk_v_cos_min"]  = v_pair_cos.abs().min().item()

        # overlap подпространств U/V: ||U_a^T U_b||_F / k
        overlap_u = torch.linalg.norm(Ua_k.T @ Ub_k, ord="fro") / max(k, 1)
        overlap_v = torch.linalg.norm(Va_k.T @ Vb_k, ord="fro") / max(k, 1)
        out["spec_subspace_overlap_u"] = overlap_u.item()
        out["spec_subspace_overlap_v"] = overlap_v.item()

    return out


# ---------- 3) streaming compare per layer ----------
def compare_models_streaming(
    gigachat_dir,
    deepseek_dir,
    n_layers=3,
    prefixes=DEFAULT_PREFIXES,
    cast_to=torch.float32,
    device="cpu",
    do_spearman=True,
    quantile_max_samples: Optional[int] = None,
    spearman_max_samples: Optional[int] = None,
    log_every=5,
    max_tensors=None,
    verbose=True,
    skip_regexes=DEFAULT_SKIP_REGEXES,

    # --- spectral params ---
    do_spectral: bool = True,
    spectral_topk: int = 128,
    spectral_max_numel: Optional[int] = None,  # <-- без гарда по умолчанию
    spectral_force_cpu: bool = False,
    spectral_dtype: torch.dtype = torch.float32,
):
    # lightweight indexes
    gc_index = build_layer_index(
        gigachat_dir,
        n_layers=n_layers,
        prefixes=prefixes,
        skip_regexes=skip_regexes,
    )
    ds_index = build_layer_index(
        deepseek_dir,
        n_layers=n_layers,
        prefixes=prefixes,
        skip_regexes=skip_regexes,
    )

    rows = []
    processed = 0
    interrupted = False
    start_time = time.time()
    stop_requested = False

    try:
        for lid in range(n_layers):
            gc_keys = gc_index[lid]
            ds_keys = ds_index[lid]
            common_keys = sorted(set(gc_keys) & set(ds_keys))

            if verbose:
                print(
                    f"[{time.strftime('%H:%M:%S')}] layer {lid} "
                    f"({len(common_keys)} common tensors)"
                )

            if not common_keys:
                continue

            layer_start = time.time()

            with ExitStack() as stack:
                gc_handles = {}
                ds_handles = {}

                def get_tensor(handles, path, key):
                    if path not in handles:
                        handles[path] = stack.enter_context(
                            safe_open(path, framework="pt", device=str(device))
                        )
                    return handles[path].get_tensor(key)

                for idx, k in enumerate(common_keys, 1):
                    a = get_tensor(gc_handles, gc_keys[k], k).to(dtype=cast_to)
                    b = get_tensor(ds_handles, ds_keys[k], k).to(dtype=cast_to)

                    if a.shape != b.shape:
                        rows.append(dict(
                            layer=lid, key=k,
                            shape_a=tuple(a.shape), shape_b=tuple(b.shape),
                            note="shape_mismatch",
                        ))
                        processed += 1
                        continue

                    fa = a.reshape(-1)
                    fb = b.reshape(-1)
                    abs_diff = (fa - fb).abs()

                    row = dict(
                        layer=lid,
                        key=k,
                        shape=tuple(a.shape),
                        numel=int(fa.numel()),

                        norm_a=fa.norm().item(),
                        norm_b=fb.norm().item(),
                        norm_ratio=(fa.norm() / fb.norm().clamp_min(1e-12)).item(),

                        mean_a=fa.mean().item(),
                        mean_b=fb.mean().item(),
                        mean_ratio=(fa.mean() / fb.mean().clamp_min(1e-12)).item(),

                        std_a=fa.std(unbiased=False).item(),
                        std_b=fb.std(unbiased=False).item(),
                        std_ratio=(
                            fa.std(unbiased=False)
                            / fb.std(unbiased=False).clamp_min(1e-12)
                        ).item(),

                        zero_frac_a=(fa == 0).float().mean().item(),
                        zero_frac_b=(fb == 0).float().mean().item(),

                        cos_sim=_cos_sim(fa, fb),
                        rel_l2_err=_rel_l2_err(fa, fb),
                        mean_rel_diff=_rel_mean_abs_diff(fa, fb),
                        max_rel_diff=_rel_max_abs_diff(fa, fb),

                        pearson=_pearson(fa, fb),

                        max_abs_diff=abs_diff.max().item(),
                        mean_abs_diff=abs_diff.mean().item(),
                    )

                    if do_spearman:
                        fa_sp, fb_sp = _sample_pair(fa, fb, spearman_max_samples)
                        row["spearman"] = _spearman(fa_sp, fb_sp)

                    row.update(_quantiles(abs_diff, max_samples=quantile_max_samples))

                    # --- spectral add ---
                    if do_spectral:
                        row.update(_spectral_metrics(
                            a, b,
                            topk=spectral_topk,
                            max_numel=spectral_max_numel,
                            force_cpu=spectral_force_cpu,
                            dtype=spectral_dtype,
                        ))

                    rows.append(row)
                    processed += 1

                    if verbose and (processed == 1 or processed % max(1, log_every) == 0):
                        elapsed = time.time() - start_time
                        rate = processed / max(elapsed, 1e-6)
                        layer_elapsed = time.time() - layer_start
                        print(
                            f"[{time.strftime('%H:%M:%S')}] processed "
                            f"{processed} tensors ({rate:.1f}/s, "
                            f"layer {lid} {idx}/{len(common_keys)} in {layer_elapsed:.1f}s)"
                        )

                    if max_tensors is not None and processed >= max_tensors:
                        stop_requested = True
                        break

                if stop_requested:
                    break

    except KeyboardInterrupt:
        interrupted = True
        if verbose:
            print(
                f"[{time.strftime('%H:%M:%S')}] KeyboardInterrupt received, "
                "returning partial results."
            )

    total_elapsed = time.time() - start_time
    if verbose:
        status = "interrupted" if interrupted else "finished"
        print(
            f"[{time.strftime('%H:%M:%S')}] {status}: {processed} tensors "
            f"in {total_elapsed:.1f}s ({processed / max(total_elapsed, 1e-6):.1f}/s)"
        )

    df = pd.DataFrame(rows)
    df.attrs["interrupted"] = interrupted
    df.attrs["processed_tensors"] = processed
    df.attrs["elapsed_seconds"] = total_elapsed
    df.attrs["stop_requested"] = stop_requested
    return df


# ---------- 4) usage ----------
# Ожидается, что Вы заранее задали:
# GIGACHAT_DIR, DEEPSEEK_DIR, DEVICE, NUM_LAYERS
df = compare_models_streaming(
    gigachat_dir=GIGACHAT_DIR,
    deepseek_dir=DEEPSEEK_DIR,
    n_layers=NUM_LAYERS,
    device=DEVICE,
    # do_spectral=True по умолчанию и spectral_max_numel=None по умолчанию
)



[08:49:11] layer 0 (12 common tensors)
[08:49:11] processed 1 tensors (138.9/s, layer 0 1/12 in 0.0s)
[08:49:53] processed 5 tensors (0.1/s, layer 0 5/12 in 41.8s)
[08:49:53] processed 10 tensors (0.2/s, layer 0 10/12 in 41.9s)
[08:49:54] layer 1 (12 common tensors)
[08:50:21] processed 15 tensors (0.2/s, layer 1 3/12 in 27.5s)
[08:50:36] layer 2 (12 common tensors)
[08:50:36] processed 25 tensors (0.3/s, layer 2 1/12 in 0.0s)
[08:51:17] processed 30 tensors (0.2/s, layer 2 6/12 in 41.3s)
[08:51:17] processed 35 tensors (0.3/s, layer 2 11/12 in 41.6s)
[08:51:17] layer 3 (14 common tensors)
[08:51:18] processed 40 tensors (0.3/s, layer 3 4/14 in 0.6s)
[08:51:19] processed 45 tensors (0.4/s, layer 3 9/14 in 1.7s)
[08:51:19] layer 4 (14 common tensors)
[08:51:21] processed 55 tensors (0.4/s, layer 4 5/14 in 1.1s)
[08:51:21] layer 5 (14 common tensors)
[08:51:21] processed 65 tensors (0.5/s, layer 5 1/14 in 0.0s)
[08:51:23] processed 70 tensors (0.5/s, layer 5 6/14 in 1.6s)
[08:51:23] laye

In [24]:
SUMMARY_METRICS = [
    # base
    "cos_sim",
    "pearson",
    "spearman",
    "rel_l2_err",
    "mean_rel_diff",
    "max_rel_diff",
    "mean_abs_diff",
    "max_abs_diff",
    "norm_ratio",
    "std_ratio",
    "mean_ratio",

    # spectral (SVD)
    "spec_a_max",
    "spec_a_min",
    "spec_b_max",
    "spec_b_min",
    "spec_rel_l2_err",
    "spec_topk_u_cos_mean",
    "spec_topk_u_cos_min",
    "spec_topk_v_cos_mean",
    "spec_topk_v_cos_min",
    "spec_subspace_overlap_u",
    "spec_subspace_overlap_v",
]

def _safe_filter_columns(df, cols: Sequence[str]):
    return [c for c in cols if c in df.columns]

# --- LayerNorm detection ---
LAYERNORM_SUFFIXES = (
    ".layernorm.weight",
    ".layer_norm.weight",
    ".ln.weight",
    ".ln_f.weight",
    "layernorm.weight",
    "layer_norm.weight",
    # bias here too
    "e_score_correction_bias",
)
def is_layernorm_key(key: str, suffixes=LAYERNORM_SUFFIXES) -> bool:
    return any(key.endswith(suf) for suf in suffixes)

def _describe(df_part: pd.DataFrame, metrics: Sequence[str]):
    if df_part is None or len(df_part) == 0:
        return pd.DataFrame()
    return df_part[list(metrics)].describe(
        percentiles=[0.01, 0.05, 0.5, 0.95, 0.99]
    )

def _closeness_stats(df_part: pd.DataFrame, prefix=""):
    out = {}
    if df_part is None or len(df_part) == 0:
        return out
    if "cos_sim" in df_part:
        out[f"{prefix}cos_gt_0.99_frac"] = (df_part["cos_sim"] > 0.99).mean()
        out[f"{prefix}cos_gt_0.999_frac"] = (df_part["cos_sim"] > 0.999).mean()
    return out
    
def summarize_results(df: pd.DataFrame):
    # drop shape mismatches and other non-comparable rows
    comparable = df[df.get("note").isna()]
    shape_mismatches = df["note"].eq("shape_mismatch").sum() if "note" in df else 0

    metrics = _safe_filter_columns(comparable, SUMMARY_METRICS)

    # split layernorm vs others
    if "key" in comparable:
        ln_mask = comparable["key"].apply(is_layernorm_key)
    else:
        ln_mask = pd.Series(False, index=comparable.index)

    ln_part = comparable[ln_mask]
    other_part = comparable[~ln_mask]

    ln_summary = _describe(ln_part, metrics)
    other_summary = _describe(other_part, metrics)

    layer_summary = {}
    layer_summary_ln = {}
    layer_summary_other = {}
    if "layer" in comparable:
        if len(comparable) > 0:
            layer_summary = comparable.groupby("layer")[metrics].mean()
        if len(ln_part) > 0:
            layer_summary_ln = ln_part.groupby("layer")[metrics].mean()
        if len(other_part) > 0:
            layer_summary_other = other_part.groupby("layer")[metrics].mean()

    closeness = {}
    closeness.update(_closeness_stats(comparable, prefix="all_"))
    closeness.update(_closeness_stats(ln_part, prefix="ln_"))
    closeness.update(_closeness_stats(other_part, prefix="other_"))

    coverage = {
        "total_rows": len(df),
        "comparable_rows": len(comparable),
        "shape_mismatches": int(shape_mismatches),
        "layernorm_rows": int(len(ln_part)),
        "other_rows": int(len(other_part)),
    }

    return {
        "coverage": coverage,
        "closeness": closeness,
        "layer_mean": layer_summary,
        "layernorm_summary": ln_summary,
        "other_summary": other_summary,
        "layernorm_layer_mean": layer_summary_ln,
        "other_layer_mean": layer_summary_other,
    }

def print_summary(summary):
    if isinstance(summary.get("other_summary"), pd.DataFrame) and not summary["other_summary"].empty:
        print("\n=== non-layernorm stats (describe) ===")
        print(summary["other_summary"])

    if isinstance(summary.get("other_layer_mean"), pd.DataFrame) and not summary["other_layer_mean"].empty:
        print("\n=== per-layer mean metrics (non-layernorm) ===")
        print(summary["other_layer_mean"])

    if isinstance(summary.get("layernorm_summary"), pd.DataFrame) and not summary["layernorm_summary"].empty:
        print("\n=== layernorm stats (describe) ===")
        print(summary["layernorm_summary"])

summary = summarize_results(df)
print_summary(summary)


=== non-layernorm stats (describe) ===
          cos_sim     pearson    spearman    rel_l2_err  mean_rel_diff  \
count  363.000000  363.000000  363.000000    363.000000     363.000000   
mean    -0.000037   -0.000040   -0.000002   4294.096856   57084.277631   
std      0.001984    0.001982    0.001965   2463.475244   43614.327872   
min     -0.009123   -0.009121   -0.008588      1.144950       4.780324   
1%      -0.006066   -0.006074   -0.006189      1.159569       5.800548   
5%      -0.003547   -0.003558   -0.003408      1.181075       6.962997   
50%      0.000011    0.000011    0.000010   4663.833496   53996.414062   
95%      0.003594    0.003576    0.003543   7572.696387  123171.325000   
99%      0.006864    0.006845    0.006718  11331.787813  209752.977500   
max      0.007981    0.007958    0.008460  16530.933594  326736.812500   

       max_rel_diff  mean_abs_diff  max_abs_diff  norm_ratio   std_ratio  \
count  3.630000e+02     363.000000    363.000000  363.000000  363.000

In [25]:
pd.set_option('display.max_rows', None)        # все строки
pd.set_option('display.max_columns', None)     # все колонки
pd.set_option('display.width', None)           # не ограничивать ширину вывода
pd.set_option('display.max_colwidth', None)    # не обрезать содержимое ячеек
# df
ln_mask = df["key"].str.endswith(LAYERNORM_SUFFIXES, na=False)
df_ln = df[ln_mask]
df_nonln = df[~ln_mask]
df_nonln

Unnamed: 0,layer,key,shape,numel,norm_a,norm_b,norm_ratio,mean_a,mean_b,mean_ratio,std_a,std_b,std_ratio,zero_frac_a,zero_frac_b,cos_sim,rel_l2_err,mean_rel_diff,max_rel_diff,pearson,max_abs_diff,mean_abs_diff,spearman,q01_abs_diff,q05_abs_diff,q50_abs_diff,q95_abs_diff,q99_abs_diff,spec_rel_l2_err,spec_a_max,spec_a_min,spec_b_max,spec_b_min,spec_topk_u_cos_mean,spec_topk_u_cos_min,spec_topk_v_cos_mean,spec_topk_v_cos_min,spec_subspace_overlap_u,spec_subspace_overlap_v,shape_a,shape_b,note
1,0,model.layers.0.mlp.down_proj.weight,"(7168, 18432)",132120576.0,59.667873,428978.46875,0.000139,1.9384e-07,-0.005042707,193840.0,0.005191,37.320744,0.000139,0.0,4.7e-05,3.7e-05,7189.438477,83933.0625,624574700000.0,3.7e-05,448.040527,23.491154,-6.912752e-06,0.207336,1.003906,13.997543,80.000687,143.996338,7188.874512,"tensor(10.6355, device='cuda:0')","tensor(0.0307, device='cuda:0')","tensor(88471.5312, device='cuda:0')","tensor(123.8643, device='cuda:0')",0.009858,2.135959e-05,0.006279,8.940118e-05,0.011917,0.007174,,,
2,0,model.layers.0.mlp.gate_proj.weight,"(18432, 7168)",132120576.0,56.593945,233500.296875,0.000242,-1.141391e-05,0.2023005,-5.642058e-05,0.004924,20.313309,0.000242,0.0,0.000274,-0.000151,4125.888672,94711.351562,1379779000000.0,-0.000128,448.040039,6.796145,-0.002233854,0.035805,0.186165,2.499613,23.998825,79.999825,4124.767578,"tensor(42.7907, device='cuda:0')","tensor(0.0222, device='cuda:0')","tensor(172167.4219, device='cuda:0')","tensor(14.0500, device='cuda:0')",0.01198,1.085157e-05,0.009933,6.676973e-05,0.009958,0.011808,,,
3,0,model.layers.0.mlp.up_proj.weight,"(18432, 7168)",132120576.0,51.375027,589028.5,8.7e-05,4.222969e-07,-0.003271475,422296.9,0.00447,51.244949,8.7e-05,0.0,2.6e-05,-1.7e-05,11465.268555,144081.3125,694428400000.0,-1.7e-05,448.027222,35.005745,1.957554e-05,0.376633,1.876915,23.983521,111.997208,176.000168,11464.057617,"tensor(9.5120, device='cuda:0')","tensor(0.0260, device='cuda:0')","tensor(171213.8281, device='cuda:0')","tensor(193.5397, device='cuda:0')",0.005732,7.1109e-05,0.008338,0.0002253988,0.00719,0.011946,,,
6,0,model.layers.0.self_attn.kv_a_proj_with_mqa.weight,"(576, 7168)",4128768.0,42.883209,143063.109375,0.0003,8.415574e-06,-0.05285055,8415574.0,0.021105,70.407242,0.0003,0.0,2.3e-05,-0.000718,3336.111328,27890.207031,5200393000.0,-0.000717,448.052734,47.079998,-0.0008733455,0.450989,2.255432,27.993683,160.000259,239.989075,3335.079834,"tensor(10.8030, device='cuda:0')","tensor(0.3528, device='cuda:0')","tensor(24165.8203, device='cuda:0')","tensor(349.4814, device='cuda:0')",0.034686,0.000327646,0.010963,1.366049e-05,0.040606,0.011874,,,
7,0,model.layers.0.self_attn.kv_b_proj.weight,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(20480, 512)","(32768, 512)",shape_mismatch
8,0,model.layers.0.self_attn.o_proj.weight,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(7168, 12288)","(7168, 16384)",shape_mismatch
10,0,model.layers.0.self_attn.q_a_proj.weight,"(1536, 7168)",11010048.0,32.806965,295782.71875,0.000111,1.08129e-06,-0.01848673,1081290.0,0.009887,89.141144,0.000111,0.0,1e-05,0.000173,9015.852539,108942.070312,84577820000.0,0.000173,448.038086,68.106529,-2.073852e-05,0.940406,4.51532,52.008484,176.012634,255.995758,9015.641602,"tensor(13.7596, device='cuda:0')","tensor(0.0613, device='cuda:0')","tensor(49268.8477, device='cuda:0')","tensor(448.7613, device='cuda:0')",0.020428,0.0003701173,0.009279,0.0001076259,0.025462,0.011806,,,
11,0,model.layers.0.self_attn.q_b_proj.weight,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"(12288, 1536)","(24576, 1536)",shape_mismatch
13,1,model.layers.1.mlp.down_proj.weight,"(7168, 18432)",132120576.0,31.056589,311506.96875,0.0001,1.97459e-08,-0.006014036,19745.9,0.002702,27.100826,0.0001,0.0,6.7e-05,-8.5e-05,10030.302734,92496.351562,221007400000.0,-8.5e-05,448.031982,15.785647,-8.367174e-05,0.142975,0.748177,9.001137,52.000927,104.000656,10030.518555,"tensor(14.0962, device='cuda:0')","tensor(0.0196, device='cuda:0')","tensor(152488.9688, device='cuda:0')","tensor(54.7479, device='cuda:0')",0.009657,2.635098e-05,0.005309,2.208141e-05,0.011796,0.006596,,,
14,1,model.layers.1.mlp.gate_proj.weight,"(18432, 7168)",132120576.0,46.198711,275344.03125,0.000168,7.893201e-06,0.21295,3.706598e-05,0.004019,23.953739,0.000168,0.0,0.000473,-0.002543,5959.996582,80385.34375,190185800000.0,-0.002561,448.058594,5.853664,-0.004157898,0.020668,0.102169,1.375315,17.999088,111.999542,5958.773926,"tensor(39.3205, device='cuda:0')","tensor(0.0202, device='cuda:0')","tensor(243742.2188, device='cuda:0')","tensor(14.6983, device='cuda:0')",0.0114,2.853623e-05,0.00934,9.212323e-05,0.009154,0.011891,,,


In [27]:
df_ln

Unnamed: 0,layer,key,shape,numel,norm_a,norm_b,norm_ratio,mean_a,mean_b,mean_ratio,std_a,std_b,std_ratio,zero_frac_a,zero_frac_b,cos_sim,rel_l2_err,mean_rel_diff,max_rel_diff,pearson,max_abs_diff,mean_abs_diff,spearman,q01_abs_diff,q05_abs_diff,q50_abs_diff,q95_abs_diff,q99_abs_diff,spec_rel_l2_err,spec_a_max,spec_a_min,spec_b_max,spec_b_min,spec_topk_u_cos_mean,spec_topk_u_cos_min,spec_topk_v_cos_mean,spec_topk_v_cos_min,spec_subspace_overlap_u,spec_subspace_overlap_v,shape_a,shape_b,note
0,0,model.layers.0.input_layernorm.weight,"(7168,)",7168.0,5.333566,3.708255,1.438295,0.05873,0.041479,1.415882,0.022791,0.014067,1.62016,0.0,0.0,0.883294,0.50512,0.325193,5.045249,0.003609,0.272217,0.021565,-0.007118,0.000244,0.001709,0.016113,0.060791,0.127278,0.304732,"tensor(5.3336, device='cuda:0')","tensor(5.3336, device='cuda:0')","tensor(3.7083, device='cuda:0')","tensor(3.7083, device='cuda:0')",1.0,1.0,0.883294,0.883294,1.0,0.883294,,,
4,0,model.layers.0.post_attention_layernorm.weight,"(7168,)",7168.0,1.600154,1.976582,0.809556,0.016732,0.016968,0.986042,0.00879,0.016035,0.548183,0.0,0.0,0.64594,0.964386,1.760695,4720.670898,0.007855,0.676208,0.009883,0.009918,0.000122,0.000732,0.007568,0.023434,0.040568,0.235245,"tensor(1.6002, device='cuda:0')","tensor(1.6002, device='cuda:0')","tensor(1.9766, device='cuda:0')","tensor(1.9766, device='cuda:0')",1.0,1.0,0.64594,0.64594,1.0,0.645941,,,
5,0,model.layers.0.self_attn.kv_a_layernorm.weight,"(512,)",512.0,0.15246,0.231502,0.658569,0.005553,0.006916,0.802899,0.003816,0.007539,0.506165,0.0,0.0,0.550863,1.277797,2.887821,84.512604,-0.015007,0.068909,0.006171,-0.009046,0.000122,0.000535,0.004852,0.015239,0.021301,0.518445,"tensor(0.1525, device='cuda:0')","tensor(0.1525, device='cuda:0')","tensor(0.2315, device='cuda:0')","tensor(0.2315, device='cuda:0')",1.0,1.0,0.550863,0.550863,1.0,0.550863,,,
9,0,model.layers.0.self_attn.q_a_layernorm.weight,"(1536,)",1536.0,8.237963,17.68976,0.465691,0.193372,0.443633,0.435884,0.082398,0.08318,0.990599,0.0,0.0,0.905126,1.312957,1.929219,23.032129,0.01273,1.400146,0.252936,0.021972,0.013672,0.058105,0.260742,0.419006,0.477258,1.147347,"tensor(8.2380, device='cuda:0')","tensor(8.2380, device='cuda:0')","tensor(17.6898, device='cuda:0')","tensor(17.6898, device='cuda:0')",1.0,1.0,0.905127,0.905127,1.0,0.905127,,,
12,1,model.layers.1.input_layernorm.weight,"(7168,)",7168.0,4.966424,4.025096,1.233865,0.053581,0.044444,1.20558,0.023877,0.016881,1.414466,0.0,0.0,0.853763,0.522459,0.437025,12.734513,-0.000896,0.185303,0.022439,-0.002995,0.000244,0.001465,0.016632,0.064941,0.096353,0.189538,"tensor(4.9664, device='cuda:0')","tensor(4.9664, device='cuda:0')","tensor(4.0251, device='cuda:0')","tensor(4.0251, device='cuda:0')",1.0,1.0,0.853763,0.853763,1.0,0.853763,,,
16,1,model.layers.1.post_attention_layernorm.weight,"(7168,)",7168.0,4.805261,5.418404,0.886841,0.045868,0.04839,0.947874,0.033429,0.041884,0.798134,0.0,0.0,0.607914,0.948953,1.46162,755.537292,-0.008112,0.530029,0.031187,-0.004826,0.000285,0.001953,0.020996,0.080315,0.259021,0.127598,"tensor(4.8053, device='cuda:0')","tensor(4.8053, device='cuda:0')","tensor(5.4184, device='cuda:0')","tensor(5.4184, device='cuda:0')",1.0,1.0,0.607914,0.607914,1.0,0.607914,,,
17,1,model.layers.1.self_attn.kv_a_layernorm.weight,"(512,)",512.0,0.091417,0.262622,0.348093,0.002844,0.008173,0.348004,0.002869,0.008241,0.348181,0.0,0.0,0.498078,2.528081,19.07065,1066.807495,0.004631,0.041565,0.007834,-0.01043,0.000132,0.0005,0.005886,0.020419,0.025437,1.872793,"tensor(0.0914, device='cuda:0')","tensor(0.0914, device='cuda:0')","tensor(0.2626, device='cuda:0')","tensor(0.2626, device='cuda:0')",1.0,1.0,0.498078,0.498078,1.0,0.498078,,,
21,1,model.layers.1.self_attn.q_a_layernorm.weight,"(1536,)",1536.0,5.265309,17.526243,0.300424,0.116436,0.419999,0.277228,0.067022,0.15356,0.436454,0.0,0.0,0.81317,2.581912,5.037098,172.381821,-0.004717,0.845947,0.306409,0.003983,0.009131,0.05542,0.300781,0.59375,0.704395,2.328625,"tensor(5.2653, device='cuda:0')","tensor(5.2653, device='cuda:0')","tensor(17.5262, device='cuda:0')","tensor(17.5262, device='cuda:0')",1.0,1.0,0.81317,0.81317,1.0,0.81317,,,
24,2,model.layers.2.input_layernorm.weight,"(7168,)",7168.0,6.698238,5.042212,1.328432,0.074627,0.056358,1.324173,0.026269,0.019253,1.364385,0.0,0.0,0.894958,0.468261,0.384318,10.901235,0.02179,0.197266,0.028536,0.022656,0.000488,0.002197,0.022949,0.07316,0.113362,0.247233,"tensor(6.6982, device='cuda:0')","tensor(6.6982, device='cuda:0')","tensor(5.0422, device='cuda:0')","tensor(5.0422, device='cuda:0')",1.0,1.0,0.894958,0.894958,1.0,0.894959,,,
28,2,model.layers.2.post_attention_layernorm.weight,"(7168,)",7168.0,5.107561,3.691749,1.383507,0.053593,0.039059,1.372092,0.027699,0.019384,1.428917,0.0,0.0,0.798047,0.607275,0.457887,44.881279,0.011214,0.599915,0.024096,0.028088,0.000244,0.001709,0.018066,0.063089,0.121575,0.277199,"tensor(5.1076, device='cuda:0')","tensor(5.1076, device='cuda:0')","tensor(3.6917, device='cuda:0')","tensor(3.6917, device='cuda:0')",1.0,1.0,0.798047,0.798047,1.0,0.798047,,,


In [26]:
df.to_csv(RESULT_CSV_PATH, index=False)