GDV

In [2]:
import os

# ROUTINES FOR COMPUTING THE GENERALIZED DISCRIMINATION VALUE (GDV)
# (computeGDV and its helper routines are defined further down in this cell)

'''
The routine cmpGDV(dta,lab) expects
  a matrix of data (rows = data vectors) and
  a vector of corresponding labels.

It returns
  the mean intra-cluster distance,
  the mean inter-cluster distance, and
  the gdv-value
'''

# ***** IMPORTS ****************************************************************

from numpy import unique, concatenate, zeros
from numpy import isnan, isinf, sum, sqrt, array, triu
from numpy.random import seed, multivariate_normal
from scipy.spatial import distance

# ------------------------------------------------------------------------------

def makeGDVData(dta,lab):
  '''
  Splits a data matrix into one sub-matrix per distinct label.

  dta : array whose rows are the data vectors
  lab : array holding the label that belongs to each row of dta

  Returns a list with one array per label,
  ordered like unique(lab).
  '''
  return [ dta[ lab==value ] for value in unique(lab) ]

# ------------------------------------------------------------------------------

def zScoreSpecial(data):

  '''
  Dimension-wise z-scoring of a list of clusters, scaled by 2*sigma.

  data is a list of 2-D arrays (rows = data vectors).
  Mean and standard deviation are computed per dimension
  over the rows of ALL clusters pooled together, and every
  value x is replaced by (x - mu) / (2 * sigma).

  Returns a new list of float arrays; the input arrays are
  left untouched. Results that are nan or inf (e.g. for a
  dimension with sigma = 0) are set to 0.
  '''

  from numpy import errstate # only needed here, not in the file-level imports

  # copy data --> zData, always as float:
  # a copy that kept an integer dtype would silently truncate
  # the z-scores when they are written back into it
  zData = [ array(cluster, dtype=float) for cluster in data ]

  # compute means and STDs for each dimension, over ALL data
  pooled = concatenate(zData)
  mu  = pooled.mean(axis=0)
  sig = pooled.std(axis=0)

  for C in range(len(zData)):

    # z-score the cluster; a constant dimension gives 0/0 here,
    # which is replaced right below, so the warning is only noise
    with errstate(divide='ignore', invalid='ignore'):
      zData[C] = ( zData[C] - mu ) / ( 2 * sig )

    # replace nan and inf by 0
    nanORinf = isnan(zData[C]) | isinf(zData[C])
    zData[C][ nanORinf ] = 0.0

  return zData

# ------------------------------------------------------------------------------

def computeGDV(data):

  '''
  Computes the Generalized Discrimination Value (GDV).

  data is expected to be a list of label-sorted point 'clusters':
  data = [cluster1, cluster2, ...]

  Each cluster is a NumPy matrix,
  and the rows of this matrix
  are n-dimensional data vectors,
  each belonging to the same label.

  Returns the tuple (intraMean, interMean, gdv), where
    intraMean = mean over clusters of the mean pairwise
                distance between distinct points of a cluster,
    interMean = mean over cluster pairs of the mean distance
                between the points of the two clusters,
    gdv       = intraMean - interMean,
  all scaled by 1/sqrt(nr. of dimensions). Distances are
  Euclidean and measured on the z-scored data.

  NOTE: a cluster with a single point, or a data list with a
  single cluster, leads to 0/0 and therefore to nan results.
  '''

  # get parameters
  NC = len(data) # nr. of clusters
  ND = data[0].shape[1] # nr. of dimensions

  # dimension-wise z-scoring
  # (zScoreSpecial works on its own copy, so data stays untouched)
  zData = zScoreSpecial(data)

  # intra-cluster distances
  dIntra = zeros(shape=NC, dtype=float)
  for C in range(NC):
    NP = zData[C].shape[0]
    dis = distance.cdist(zData[C], zData[C], 'euclidean')
    # dis is symmetric with zero diagonal
    dIntra[C] = sum(dis) / (NP*(NP-1)) # divide by nr. of non-zero el.

  # inter-cluster distances:
  # only the strict upper triangle enters the GDV,
  # so the diagonal and the lower triangle are not computed
  dInter = zeros(shape=(NC,NC), dtype=float)
  for C1 in range(NC):
    NP1 = zData[C1].shape[0]
    for C2 in range(C1+1, NC):
      NP2 = zData[C2].shape[0]
      dis = distance.cdist(zData[C1], zData[C2], 'euclidean')
      dInter[C1][C2] = sum(dis) / (NP1*NP2) # mean over all point pairs

  # compute GDV
  pre = 1.0 / sqrt(float(ND))
  intraMean = dIntra.mean()
  interMean = sum(dInter) / (NC*(NC-1)/2) # divide by nr. of cluster pairs
  gdv = pre * (intraMean - interMean)

  return pre*intraMean, pre*interMean,gdv

# ------------------------------------------------------------------------------

def cmpGDV(dta,lab):
  '''
  Convenience wrapper around computeGDV.

  dta : matrix of data (rows = data vectors)
  lab : vector of corresponding labels

  The rows are grouped by label with makeGDVData and the
  result of computeGDV is returned unchanged:
  (intraMean, interMean, gdv)
  '''
  return computeGDV( makeGDVData(dta,lab) )

# ------------------------------------------------------------------------------

def TestGDV():

  '''
  Prints the GDV of two synthetic 2-D test cases.

  Each case consists of two Gaussian clusters with 1000 points,
  centred at (0,0) and (1,1), with an isotropic covariance:
    TEST 1: variance 0.04 per dimension
    TEST 2: variance 1.0  per dimension
  '''

  def gdvOfTwoClusters(variance):
    # both clusters share the same diagonal covariance
    cov = array([[variance, 0.0     ],
                 [0.0     , variance]])
    data = []
    for center in (0.0, 1.0):
      # NOTE: the generator is re-seeded before every cluster, so the
      # second cluster is the first one shifted by (1,1). This is kept
      # as it was, so that the printed values stay the same.
      seed(978820)
      data.append( multivariate_normal(array([center, center]),cov,1000) )
    intraMean,interMean,gdv = computeGDV(data)
    return gdv

  # TEST 1
  print('GDV = ',gdvOfTwoClusters(0.04))

  # TEST 2
  print('GDV = ',gdvOfTwoClusters(1.0))

Data

In [None]:
import numpy as np
import os

# Paths of the .npz files for layers 1-12.
# For CxG-BERT, only the paths in "file_paths" have to be changed.
file_paths = [f'BERT-base_layer_{layer:02d}.npz' for layer in range(1, 13)]

# Loop through each file and report which arrays it contains
for file in file_paths:
    print(f"\n--- {file} ---")
    # np.load keeps an .npz archive open until it is closed explicitly;
    # the with-block releases the file handle after each file.
    with np.load(file) as data:
        print("Keys:", data.files)   # list of array names stored in the file
        for key in data.files:
            arr = data[key]
            print(f"  {key}: shape={arr.shape}, dtype={arr.dtype}")

Robustness

In [4]:
import numpy as np
import matplotlib.pyplot as plt
import os
from scipy.spatial.distance import pdist, squareform



def zscore_special_matrix(X):
    """Column-wise z-score scaled by two standard deviations.

    Every column of X is centred on its mean and divided by
    2 * (std + 1e-12); the small offset keeps the divisor non-zero
    for constant columns. Entries that still come out non-finite
    are set to 0. Returns a new array; X is not modified.
    """
    centered = X - X.mean(axis=0)
    divisor = 2.0 * (X.std(axis=0) + 1e-12)
    Z = centered / divisor
    not_finite = np.logical_not(np.isfinite(Z))
    Z[not_finite] = 0.0
    return Z

def gdv_from_distance_matrix(D, y, ndims):
    """Compute the GDV from a precomputed pairwise distance matrix.

    Parameters
    ----------
    D : square array, D[i, j] is the distance between points i and j.
    y : 1-D array-like with the class label of every point.
    ndims : number of dimensions, used for the 1/sqrt(ndims) scaling.

    Returns
    -------
    (intra, inter, gdv) : scaled mean intra-class distance, scaled mean
        inter-class distance and their difference intra - inter.

    Classes with fewer than two points are left out of the intra-class
    mean. If no class (or no class pair) contributes, the corresponding
    mean is 0.0.
    """
    pre = 1.0 / np.sqrt(float(ndims))
    y = np.asarray(y)

    # Row indices of each class, computed once instead of once per class pair.
    members = [np.where(y == c)[0] for c in np.unique(y)]

    # Intra-cluster means: average over the distinct pairs inside each class.
    intra_vals = []
    for idx in members:
        if len(idx) < 2:
            continue
        d_sub = D[np.ix_(idx, idx)]
        intra_vals.append(d_sub[np.triu_indices(len(idx), k=1)].mean())
    intra_mean = np.mean(intra_vals) if len(intra_vals) > 0 else 0.0

    # Inter-cluster mean: average block distance for every unordered class pair.
    inter_vals = []
    for i, idx_i in enumerate(members):
        for idx_j in members[i + 1:]:
            # An index set can be empty when a label never equals itself (nan).
            if len(idx_i) == 0 or len(idx_j) == 0:
                continue
            inter_vals.append(D[np.ix_(idx_i, idx_j)].mean())
    inter_mean = np.mean(inter_vals) if len(inter_vals) > 0 else 0.0

    gdv = pre * (intra_mean - inter_mean)
    return pre*intra_mean, pre*inter_mean, gdv

def observed_gdv(X, y):
    """GDV of the data as given (no resampling).

    X is z-scored with zscore_special_matrix, the full Euclidean distance
    matrix is built from the result, and the GDV is evaluated on it.

    Returns ((intra, inter, gdv), D), where D is that distance matrix.
    """
    scaled = zscore_special_matrix(X)
    condensed = pdist(scaled, metric='euclidean')
    D = squareform(condensed)
    stats = gdv_from_distance_matrix(D, y, X.shape[1])
    return stats, D

def load_family_from_npz(npz_path, keys_in_family):
    """Load and stack the embeddings of one verb family from an .npz file.

    Parameters
    ----------
    npz_path : path of the .npz archive.
    keys_in_family : array names to look up; names missing from the
        archive are skipped.

    Returns
    -------
    (X, y, names) : X stacks the found arrays row-wise, y holds an integer
        label per row (0, 1, ... in the order the keys were found) and
        names lists the keys that were found. If fewer than two keys are
        present, (None, None, names) is returned.
    """
    X_list, y_list, names = [], [], []
    # The archive is closed as soon as the requested arrays have been read,
    # so repeated calls do not accumulate open file handles.
    with np.load(npz_path) as archive:
        for key in keys_in_family:
            if key not in archive.files:
                continue
            Xi = archive[key]
            X_list.append(Xi)
            # the label is the number of keys found so far
            y_list.append(np.full(len(Xi), len(names), dtype=int))
            names.append(key)
    if len(names) < 2:
        return None, None, names
    X = np.vstack(X_list)
    y = np.concatenate(y_list)
    return X, y, names

# ============================================================
# Bootstrap CI for GDV
# ============================================================

def bootstrap_gdv_ci(X, y, n_boot=1000, ci=95, seed=42):
    """Percentile bootstrap for the GDV.

    The rows of X (with their labels y) are resampled with replacement
    n_boot times; every resample is z-scored again and its GDV is
    computed from the Euclidean distance matrix.

    Returns (mean, lower, upper, gdv_boots): the mean of the bootstrap
    GDVs, the lower and upper percentile bounds of the ci% interval and
    the array with all n_boot bootstrap values.

    NOTE(review): resampling with replacement can repeat a point, which
    adds zero distances inside its class; in the recorded run the
    bootstrap mean is lower than the observed GDV in every printed row.
    Confirm that a plain percentile interval is what is wanted here.
    """
    rng = np.random.default_rng(seed)
    n_points, ndims = X.shape[0], X.shape[1]

    def replicate_gdv():
        # one bootstrap replicate: draw n_points row indices with replacement
        picks = rng.choice(n_points, size=n_points, replace=True)
        Z = zscore_special_matrix(X[picks])
        D = squareform(pdist(Z, metric='euclidean'))
        return gdv_from_distance_matrix(D, y[picks], ndims)[2]

    gdv_boots = np.array([replicate_gdv() for _ in range(n_boot)], dtype=float)

    tail = (100 - ci) / 2.0
    lower = np.percentile(gdv_boots, tail)
    upper = np.percentile(gdv_boots, 100 - tail)

    return np.mean(gdv_boots), lower, upper, gdv_boots

# ============================================================
# Main analysis
# ============================================================

# One .npz file per layer (layers 1-12)
file_paths = ['BERT-base_layer_%02d.npz' % layer for layer in range(1, 13)]

# Array names per verb family; "all" is the three families taken together
groups = {
    "agree": ['agree_on', 'agree_with', 'agree_that', 'agree_to'],
    "come":  ['come_back', 'come_in', 'come_out'],
    "give":  ['give_up', 'give_in', 'give_out', 'give_away'],
}
groups["all"] = groups["agree"] + groups["come"] + groups["give"]

# Bootstrap settings: number of resamples and confidence level in percent
N_BOOT, CI_LEVEL = 1000, 95

# One record per (layer, group), filled by the analysis loop
results_boot = []

# For every layer file and every verb family: compute the observed GDV and
# its bootstrap statistics, store them as one record and print a summary line.
for npz_path in file_paths:
    layer_name = os.path.basename(npz_path).replace('.npz', '')
    # The layer number is the token after the LAST underscore, so the
    # parsing does not depend on how many underscores the model name has.
    layer_idx = int(layer_name.rsplit('_', 1)[-1])

    for group_name, family_keys in groups.items():
        X, y, class_names = load_family_from_npz(npz_path, family_keys)
        # a GDV needs at least two classes
        if X is None or len(np.unique(y)) < 2:
            continue

        # Observed GDV
        (_, _, gdv_obs), _ = observed_gdv(X, y)

        # Bootstrap CI
        gdv_mean, gdv_lo, gdv_hi, gdv_boots = bootstrap_gdv_ci(
            X, y, n_boot=N_BOOT, ci=CI_LEVEL, seed=42
        )
        gdv_sd = np.std(gdv_boots)

        results_boot.append({
            'layer_idx': layer_idx,
            'layer': layer_name,
            'group': group_name,
            'gdv_obs': gdv_obs,
            'gdv_boot_mean': gdv_mean,
            'gdv_boot_sd': gdv_sd,
            'gdv_boot_lower': gdv_lo,
            'gdv_boot_upper': gdv_hi,
        })

        print(f"{layer_name} | {group_name}: "
              f"GDV_obs={gdv_obs:.3f}, "
              f"Boot mean={gdv_mean:.3f}, SD={gdv_sd:.4f}, "
              f"{CI_LEVEL}% CI=[{gdv_lo:.3f}, {gdv_hi:.3f}]")

# Save results
import pandas as pd

# The CSV is named after the model that was actually analysed (the part of
# the first file name before "_layer_"), so results of one model are not
# stored under the name of another one.
model_name = os.path.basename(file_paths[0]).replace('.npz', '').rsplit('_layer_', 1)[0]
csv_path = f"{model_name}_gdv_bootstrap_ci.csv"

df_boot = pd.DataFrame(results_boot)
df_boot.to_csv(csv_path, index=False)

# ============================================================
# Plot with CI and SD
# ============================================================

os.makedirs("plots_with_ci", exist_ok=True)


def _finish_gdv_figure(fig, ax, title, out_path):
    """Apply the shared axis styling, write the figure to out_path and close it."""
    ax.set_xlabel('Layer', fontsize=12)
    ax.set_ylabel('GDV', fontsize=12)
    ax.set_title(title, fontsize=13)
    ax.legend()
    ax.grid(alpha=0.3)
    fig.tight_layout()
    fig.savefig(out_path, dpi=200)
    # close this figure explicitly so figures do not pile up in memory
    plt.close(fig)


group_names = df_boot['group'].unique()

for group_name in group_names:
    gdf = df_boot[df_boot['group'] == group_name].sort_values('layer_idx')

    # Shaded CI plot: observed GDV with the bootstrap interval as a band
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(gdf['layer_idx'], gdf['gdv_obs'],
            marker='o', label='Observed GDV', color='C0')
    ax.fill_between(gdf['layer_idx'],
                    gdf['gdv_boot_lower'],
                    gdf['gdv_boot_upper'],
                    alpha=0.3, color='C0',
                    label=f'{CI_LEVEL}% Bootstrap CI')
    _finish_gdv_figure(
        fig, ax,
        f'GDV across layers — {group_name} (with {CI_LEVEL}% CI)',
        f"plots_with_ci/gdv_with_ci_{group_name}.png",
    )

    # Error bar plot: observed GDV with the bootstrap SD as error bars
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.errorbar(gdf['layer_idx'], gdf['gdv_obs'],
                yerr=gdf['gdv_boot_sd'],
                marker='o', capsize=5, label='Observed GDV ± SD')
    _finish_gdv_figure(
        fig, ax,
        f'GDV across layers — {group_name} (with bootstrap SD)',
        f"plots_with_ci/gdv_with_sd_{group_name}.png",
    )

print("\n✓ Bootstrap CI/SD analysis complete.")
print(f"  - CSV: {csv_path}")
print(f"  - Plots: plots_with_ci/ ({len(group_names)*2} files)")

BERT-base_layer_01 | agree: GDV_obs=-0.062, Boot mean=-0.068, SD=0.0023, 95% CI=[-0.073, -0.064]
BERT-base_layer_01 | come: GDV_obs=-0.090, Boot mean=-0.096, SD=0.0027, 95% CI=[-0.102, -0.091]
BERT-base_layer_01 | give: GDV_obs=-0.098, Boot mean=-0.105, SD=0.0021, 95% CI=[-0.109, -0.100]
BERT-base_layer_01 | all: GDV_obs=-0.356, Boot mean=-0.360, SD=0.0013, 95% CI=[-0.363, -0.358]
BERT-base_layer_02 | agree: GDV_obs=-0.105, Boot mean=-0.111, SD=0.0028, 95% CI=[-0.116, -0.106]
BERT-base_layer_02 | come: GDV_obs=-0.138, Boot mean=-0.144, SD=0.0034, 95% CI=[-0.151, -0.138]
BERT-base_layer_02 | give: GDV_obs=-0.163, Boot mean=-0.169, SD=0.0026, 95% CI=[-0.173, -0.163]
BERT-base_layer_02 | all: GDV_obs=-0.347, Boot mean=-0.351, SD=0.0015, 95% CI=[-0.354, -0.348]
BERT-base_layer_03 | agree: GDV_obs=-0.241, Boot mean=-0.247, SD=0.0043, 95% CI=[-0.255, -0.238]
BERT-base_layer_03 | come: GDV_obs=-0.236, Boot mean=-0.241, SD=0.0047, 95% CI=[-0.251, -0.231]
BERT-base_layer_03 | give: GDV_obs=-0.3

In [5]:
import shutil
import os

# Compress the folder the plots were written to. The path is resolved from
# the same relative name used when saving the plots, so the archive always
# contains those files, whatever the current working directory is.
plots_dir = os.path.abspath('plots_with_ci')
archive_path = shutil.make_archive(plots_dir, 'zip', plots_dir)

# Download the zip file (only available when running in Google Colab)
from google.colab import files
files.download(archive_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>