# Setup

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AutoTokenizer
import matplotlib.pyplot as plt
import numpy as np
from dataclasses import dataclass
from transformers import AutoConfig, AutoModelForCausalLM, PreTrainedModel
from safetensors.torch import load_model, save_model, safe_open
from weak_to_strong.model import TransformerWithHead
import seaborn as sns
from sklearn.decomposition import PCA
from datasets import load_dataset
from sklearn.manifold import TSNE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize
from sklearn.utils import gen_batches
from tqdm import tqdm
import torch.nn.functional as F

In [None]:
# Device that models and inputs are moved to, and on which captured
# activations are stored, by the helper functions below.
# NOTE(review): hard-coded to CPU although a later cell queries a CUDA device --
# confirm this is intentional.
device = 'cpu'
print("Using: " + str(device))

In [12]:
# Display the name of the current CUDA device (informational only; the
# notebook-level `device` variable is set separately).
torch.cuda.get_device_name()

'NVIDIA H100 80GB HBM3'

# Representation Extraction

In [15]:
def get_activation(name, activations):
    """Build a forward hook that records a module's output under ``name``.

    The returned callable takes ``(module, inputs, output)`` positionally, as
    forward hooks do. Every time it fires it stores ``output[0]``, detached and
    moved to the global ``device``, in ``activations[name]`` (overwriting any
    previous entry for that name).
    """
    def _record(_module, _inputs, output):
        # For tuple outputs this selects the first element, which is expected
        # to be the hidden state.
        # NOTE(review): for a module that returns a bare tensor, index 0 would
        # select along the tensor's first axis instead -- confirm intended.
        first = output[0]
        activations[name] = first.detach().to(device)
    return _record

def extract_hidden_states(model, datapoint):
    """Run ``datapoint`` through ``model`` and capture every sub-module output.

    A forward hook built by ``get_activation`` is attached to each module
    yielded by ``model.transformer.named_modules()``. The hooks write into a
    dict keyed by module name, which is returned once the forward pass is done.

    Args:
        model: Object exposing a ``transformer`` sub-module. It is switched to
            eval mode as a side effect.
        datapoint: Model input; moved to the global ``device`` before the call.

    Returns:
        dict: module name -> activation captured by the hook.
    """
    activations = {}
    hooks = []

    try:
        # named_modules() only yields nn.Module instances, so no type filter
        # is needed here.
        for name, module in model.transformer.named_modules():
            hooks.append(module.register_forward_hook(get_activation(name, activations)))

        datapoint = datapoint.to(device)

        model.eval()
        with torch.no_grad():
            model(datapoint)
    finally:
        # Always detach the hooks, even if the forward pass raises; otherwise
        # they would stay registered on the model and fire on later calls.
        for hook in hooks:
            hook.remove()

    return activations

def compare_models(model_name, finetuned_model_path, datapoint):
    """Capture hidden states of the base and the fine-tuned model for one input.

    Args:
        model_name: Identifier passed to ``TransformerWithHead.from_pretrained``
            for both models.
        finetuned_model_path: safetensors file whose weights are loaded into
            the second model.
        datapoint: Input forwarded to ``extract_hidden_states``.

    Returns:
        tuple: ``(activations_pre, activations_post)`` -- the activation dicts
        of the base model and of the fine-tuned model, in that order.
    """
    pre_model = TransformerWithHead.from_pretrained(model_name).to(device)

    post_model = TransformerWithHead.from_pretrained(model_name)
    # The previous call passed 'cpu' as the third positional argument, which is
    # load_model's `strict` flag rather than a device. It only worked because a
    # non-empty string is truthy. Spell the flag out explicitly instead.
    load_model(post_model, finetuned_model_path, strict=True)
    post_model = post_model.to(device)

    activations_model1 = extract_hidden_states(pre_model, datapoint)
    activations_model2 = extract_hidden_states(post_model, datapoint)

    return activations_model1, activations_model2

def convert_input(text, model_name):
    """Tokenize ``text`` with the tokenizer registered for ``model_name``.

    The tokenizer is loaded on every call. The text is split into tokens, the
    tokens are mapped to ids, and the ids are wrapped in one extra list so the
    returned tensor has a leading dimension of size one.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    ids = tok.convert_tokens_to_ids(tok.tokenize(text))
    return torch.tensor([ids])

def print_layer_names(activations):
    """Print every key of ``activations`` and return the keys as a list.

    Previously the function collected the names but returned only the last
    key, and failed on an empty mapping because that variable was never bound.

    Args:
        activations: Mapping whose keys are layer names.

    Returns:
        list: the layer names in iteration order (empty for an empty mapping).
    """
    names = list(activations)
    for name in names:
        print(name)
    return names

def extract_act_pipeline(model_name, train_data, finetuned_model_path, layer_name = "h.11"):
    """Collect one layer's activations before and after fine-tuning.

    Both models are loaded once and reused for every datapoint; the previous
    version went through ``compare_models`` and therefore reloaded both models
    from disk on each iteration.

    Args:
        model_name: Identifier passed to ``TransformerWithHead.from_pretrained``.
        train_data: Iterable of model inputs.
        finetuned_model_path: safetensors file with the fine-tuned weights.
        layer_name: Key of the activation to keep from each captured dict.

    Returns:
        tuple: ``(pre_ft_activations, post_ft_activations)`` -- two lists with
        one entry per datapoint, each entry squeezed along dim 0.
    """
    pre_ft_activations = []
    post_ft_activations = []
    print("Converting input to activations.")

    # Loop-invariant work: build the base and the fine-tuned model up front.
    pre_model = TransformerWithHead.from_pretrained(model_name).to(device)
    post_model = TransformerWithHead.from_pretrained(model_name)
    # `strict` is load_model's third parameter; pass it by keyword.
    load_model(post_model, finetuned_model_path, strict=True)
    post_model = post_model.to(device)

    for datapoint in tqdm(train_data):
        activations_model1 = extract_hidden_states(pre_model, datapoint)
        activations_model2 = extract_hidden_states(post_model, datapoint)
        pre_ft_activations.append(activations_model1[layer_name].squeeze(0))
        post_ft_activations.append(activations_model2[layer_name].squeeze(0))
    print("Activation Loaded.")
    return pre_ft_activations, post_ft_activations



In [5]:
def plot_activation_changes(activations_pre, activations_post, layer_name, method='PCA', components=2):
    """
    Scatter-plot one layer's activations before and after finetuning in a
    shared low-dimensional projection.

    Parameters:
    activations_pre (dict): Activations from the model before finetuning.
    activations_post (dict): Activations from the model after finetuning.
    layer_name (str): Key of the layer to visualize in both dicts.
    method (str): 'PCA' or 't-SNE'; selects the dimensionality reduction.
    components (int): Number of components computed by the reducer. Only the
        first two are plotted.

    Raises:
    ValueError: If method is neither 'PCA' nor 't-SNE'.
    """
    pre = activations_pre[layer_name].cpu().numpy()
    post = activations_post[layer_name].cpu().numpy()

    # Rank-3 arrays are averaged over their leading axis so that both inputs
    # are 2-D matrices before they are stacked.
    if pre.ndim == 3:
        pre = pre.mean(axis=0)
    if post.ndim == 3:
        post = post.mean(axis=0)

    # One reducer is fitted on both sets so they share the same projection.
    stacked = np.concatenate([pre, post], axis=0)

    if method not in ('PCA', 't-SNE'):
        raise ValueError("Unsupported dimensionality reduction method")
    if method == 'PCA':
        reducer = PCA(n_components=components)
    else:
        reducer = TSNE(n_components=components, learning_rate='auto', init='random')

    projected = reducer.fit_transform(stacked)

    # Rows come back in stacking order: first the "pre" rows, then the "post" rows.
    n_pre = pre.shape[0]
    groups = (
        (projected[:n_pre], 'blue', 'Pre-Finetuning'),
        (projected[n_pre:], 'red', 'Post-Finetuning'),
    )

    plt.figure(figsize=(10, 6))
    for points, colour, legend_label in groups:
        plt.scatter(points[:, 0], points[:, 1], c=colour, alpha=0.5, label=legend_label)
    plt.title(f'Layer: {layer_name} - {method} Visualization')
    plt.xlabel(f'{method} Component 1')
    plt.ylabel(f'{method} Component 2')
    plt.legend()
    plt.show()


In [6]:
#### SCIPY MINIMIZE

def loss_per_sample(params, lambda_x, lambda_x_tilde, dim):
    """Squared Frobenius error of the affine map for a single sample.

    ``params`` is a flat vector: the first ``dim * dim`` entries are the matrix
    A (row-major), the remaining ``dim`` entries are the offset delta. The
    value returned is ``|| lambda_x @ A + delta^T - lambda_x_tilde ||_F ** 2``.
    """
    split = dim ** 2
    weight = params[:split].reshape(dim, dim)
    # Column vector transposed to a row so it is added to every row of the product.
    offset_row = params[split:].reshape(dim, 1).T
    residual = (lambda_x @ weight + offset_row) - lambda_x_tilde
    return np.linalg.norm(residual, 'fro') ** 2

def loss_function(params, batch_pre_ft, batch_post_ft, dim):
    """Average ``loss_per_sample`` over the paired samples of one batch.

    The sum of the per-sample losses is divided by ``len(batch_pre_ft)``.
    """
    per_sample = []
    for pre_ft, post_ft in zip(batch_pre_ft, batch_post_ft):
        per_sample.append(loss_per_sample(params, pre_ft, post_ft, dim))
    return np.sum(per_sample) / len(batch_pre_ft)


def optimize_Adelta(pre_ft_activations, post_ft_activations, batch_size, tol=1e-5, max_iter=10):
    """Fit A and delta with L-BFGS-B, one mini-batch at a time.

    NOTE: a second function with this same name is defined further down in
    this file; whichever definition is executed last is the one in effect.

    Args:
        pre_ft_activations: Sequence of per-sample arrays; the feature size is
            read from ``pre_ft_activations[0].shape[1]``.
        post_ft_activations: Sequence of per-sample target arrays, paired
            index-by-index with ``pre_ft_activations``.
        batch_size: Number of samples handed to each ``minimize`` call.
        tol: Stop once the loss changes by less than this between two
            consecutive successful batch optimizations.
        max_iter: Maximum number of passes over all batches.

    Returns:
        tuple: ``(A, delta)`` with shapes ``(dim, dim)`` and ``(dim, 1)``.
    """
    # activation list shape: n_sample * (token_num * feature)
    # A shape: feature * feature
    # delta shape: 1 * feature
    dim = pre_ft_activations[0].shape[1]
    print(dim)
    A_len = dim**2
    print(A_len)

    # Start from the identity map with a zero offset, packed as one flat vector.
    params = np.concatenate([np.eye(dim).flatten(), np.zeros((dim, 1)).flatten()])

    n_samples = len(pre_ft_activations)
    last_loss = np.inf
    converged = False

    for iteration in range(max_iter):
        for batch in gen_batches(n_samples, batch_size):
            lo, hi = batch.start, batch.stop
            # Each batch is optimized starting from the latest accepted parameters.
            result = minimize(
                fun=loss_function,
                x0=params,
                args=(pre_ft_activations[lo:hi], post_ft_activations[lo:hi], dim),
                method='L-BFGS-B',
            )

            if not result.success:
                # A failed batch ends the current pass only; the next pass still runs.
                print(f"Optimization failed at Iteration {iteration}, Batch {batch.start}-{batch.stop}.")
                break

            params = result.x
            current_loss = result.fun
            loss_change = last_loss - current_loss
            last_loss = current_loss

            print(f"Iteration {iteration}, Batch {batch.start}-{batch.stop}, "
                  f"Current Loss: {current_loss:.6f}, Loss Change: {loss_change:.6f}")

            if np.abs(loss_change) < tol:
                converged = True
                break

        if converged:
            print("Convergence criterion met.")
            break
    else:
        # Runs only when the outer loop finished without hitting `break`.
        print("Reached maximum iterations without convergence.")

    # Unpack the flat parameter vector back into A and delta.
    A_optimized = params[:A_len].reshape(dim, dim)
    delta_optimized = params[A_len:].reshape(dim, 1)

    print("Optimization finished.")
    return A_optimized, delta_optimized

def avg_samples(act_list):
    """Average each activation tensor over its first axis and stack the results.

    Args:
        act_list: Sequence of tensors. Each one is reduced with a mean over
            dim 0 (the token axis, per the activation-list layout used in this
            notebook) and then given a leading axis of size one.

    Returns:
        np.ndarray: the pooled samples stacked along a new first axis, i.e.
        shape ``(n_samples, 1, ...)``. An empty input yields an empty array.
    """
    pooled = [torch.mean(act, dim=0).unsqueeze(0) for act in act_list]
    if not pooled:
        return np.array(pooled)
    # Stack in torch and convert once. Handing a list of tensors to np.array
    # relies on implicit per-element conversion, which fails for tensors that
    # are not on the CPU.
    return torch.stack(pooled).detach().cpu().numpy()

# Representation Predictions [fixed]

In [None]:
def torch_loss_function(A, delta, lambda_x, lambda_x_tilde):
    """
    Reconstruction error of the affine map ``x -> x A + delta`` on a batch.

    Args:
    A (torch.Tensor): Matrix applied to the activations from the right.
    delta (torch.Tensor): Offset added to every row of the product.
    lambda_x (torch.Tensor): Pre-finetuning activations (batch), 2-D.
    lambda_x_tilde (torch.Tensor): Post-finetuning activations (batch).

    Returns:
    torch.Tensor: Squared Frobenius norm of the residual divided by the
    number of rows in lambda_x.
    """
    residual = lambda_x.mm(A) + delta - lambda_x_tilde
    squared_error = residual.norm(p='fro') ** 2
    return squared_error / lambda_x.size(0)

def optimize_Adelta(pre_ft_activations, post_ft_activations, batch_size, lr=1e-3, tol=1e-5, max_iter=20000):
    """
    Fits the affine map ``x -> x A + delta`` from pre- to post-finetuning
    activations with Adam, using ``torch_loss_function`` as the objective.

    Args:
    pre_ft_activations (np.array): Pre-finetuning activations. The feature size
        is read from ``pre_ft_activations[0].shape[1]``.
    post_ft_activations (np.array): Post-finetuning activations.
    batch_size (int): The size of each batch for optimization.
    lr (float): Learning rate for the optimizer.
    tol (float): Tolerance for convergence.
    max_iter (int): Maximum number of passes over the data.

    Returns:
    Tuple[np.array, np.array]: Optimized A (dim x dim) and delta (1 x dim).
    """
    dim = pre_ft_activations[0].shape[1]
    # Convert numpy arrays to torch tensors
    pre_ft_activations_tensor = torch.tensor(pre_ft_activations, dtype=torch.float32)
    post_ft_activations_tensor = torch.tensor(post_ft_activations, dtype=torch.float32)

    # A starts as the identity and delta as zero. Both are reached through the
    # `torch` namespace so this function does not depend on the `nn` / `optim`
    # aliases, which are only imported in a later cell of this notebook.
    A = torch.nn.Parameter(torch.eye(dim))
    delta = torch.nn.Parameter(torch.zeros(1, dim))

    # Use the Adam optimizer
    optimizer = torch.optim.Adam([A, delta], lr=lr)

    previous_loss = float('inf')
    for iteration in range(max_iter):
        for i in range(0, len(pre_ft_activations), batch_size):
            optimizer.zero_grad()
            # squeeze(1) drops the singleton middle axis so torch.mm sees 2-D input.
            batch_pre_ft = pre_ft_activations_tensor[i:i+batch_size].squeeze(1)
            batch_post_ft = post_ft_activations_tensor[i:i+batch_size].squeeze(1)
            loss = torch_loss_function(A, delta, batch_pre_ft, batch_post_ft)
            loss.backward()
            optimizer.step()

            current_loss = loss.item()
            if iteration % 50 == 0 and i == 0:  # Print the loss for the first batch every 50 iterations
                print(f"Iteration {iteration}, Loss: {current_loss:.6f}")

            # The comparison is against the previous optimizer step, which may
            # belong to a different mini-batch.
            if abs(previous_loss - current_loss) < tol:
                print("Convergence criterion met.")
                return A.detach().numpy(), delta.detach().numpy()
            previous_loss = current_loss

    print("Optimization finished.")
    return A.detach().numpy(), delta.detach().numpy()

In [None]:
# Fit the affine map (A, d) on final_preact / final_postact, the arrays
# produced by avg_samples in the data-prep section.
A, d = optimize_Adelta(final_preact, final_postact, batch_size = 256)

In [None]:
def new_torch_loss_function(A, delta, lambda_x, lambda_x_tilde):
    """
    Reconstruction error of the map ``x -> x A + x * delta`` on a batch.

    Unlike ``torch_loss_function``, delta is not added as an offset here: it
    multiplies the input element-wise and that product is added to ``x A``.

    Args:
    A (torch.Tensor): Matrix applied to the activations from the right.
    delta (torch.Tensor): Element-wise scaling applied to lambda_x.
    lambda_x (torch.Tensor): Pre-finetuning activations (batch), 2-D.
    lambda_x_tilde (torch.Tensor): Post-finetuning activations (batch).

    Returns:
    torch.Tensor: Squared Frobenius norm of the residual divided by the
    number of rows in lambda_x.
    """
    residual = lambda_x.mm(A) + lambda_x * delta - lambda_x_tilde
    squared_error = residual.norm(p='fro') ** 2
    return squared_error / lambda_x.size(0)

def optimize_Adelta_newloss(pre_ft_activations, post_ft_activations, batch_size, lr=1e-3, tol=1e-5, max_iter=20000):
    """
    Fits A and delta with Adam, using ``new_torch_loss_function`` (delta acts
    as an element-wise scaling of the input) as the objective.

    Args:
    pre_ft_activations (np.array): Pre-finetuning activations. The feature size
        is read from ``pre_ft_activations[0].shape[1]``.
    post_ft_activations (np.array): Post-finetuning activations.
    batch_size (int): The size of each batch for optimization.
    lr (float): Learning rate for the optimizer.
    tol (float): Tolerance for convergence.
    max_iter (int): Maximum number of passes over the data.

    Returns:
    Tuple[np.array, np.array]: Optimized A (dim x dim) and delta (1 x dim).
    """
    dim = pre_ft_activations[0].shape[1]
    # Convert numpy arrays to torch tensors
    pre_ft_activations_tensor = torch.tensor(pre_ft_activations, dtype=torch.float32)
    post_ft_activations_tensor = torch.tensor(post_ft_activations, dtype=torch.float32)

    # A starts as the identity and delta as zero. Both are reached through the
    # `torch` namespace so this function does not depend on the `nn` / `optim`
    # aliases, which are only imported in a later cell of this notebook.
    A = torch.nn.Parameter(torch.eye(dim))
    delta = torch.nn.Parameter(torch.zeros(1, dim))

    # Use the Adam optimizer
    optimizer = torch.optim.Adam([A, delta], lr=lr)

    previous_loss = float('inf')
    for iteration in range(max_iter):
        for i in range(0, len(pre_ft_activations), batch_size):
            optimizer.zero_grad()
            # squeeze(1) drops the singleton middle axis so torch.mm sees 2-D input.
            batch_pre_ft = pre_ft_activations_tensor[i:i+batch_size].squeeze(1)
            batch_post_ft = post_ft_activations_tensor[i:i+batch_size].squeeze(1)
            loss = new_torch_loss_function(A, delta, batch_pre_ft, batch_post_ft)
            loss.backward()
            optimizer.step()

            current_loss = loss.item()
            if iteration % 50 == 0 and i == 0:  # Print the loss for the first batch every 50 iterations
                print(f"Iteration {iteration}, Loss: {current_loss:.6f}")

            # The comparison is against the previous optimizer step, which may
            # belong to a different mini-batch.
            if abs(previous_loss - current_loss) < tol:
                print("Convergence criterion met.")
                return A.detach().numpy(), delta.detach().numpy()
            previous_loss = current_loss

    print("Optimization finished.")
    return A.detach().numpy(), delta.detach().numpy()

In [None]:
# Same inputs as the plain affine fit, but optimized against
# new_torch_loss_function (delta scales the input element-wise).
newA, newd = optimize_Adelta_newloss(final_preact, final_postact, batch_size = 256)

# LORA

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

def torch_loss_function_lora(B, C, delta, lambda_x, lambda_x_tilde):
    """
    Reconstruction error of ``x -> x (B C) + delta`` on a batch.

    Args:
    B, C (torch.Tensor): Factors whose product ``B @ C`` plays the role of A.
    delta (torch.Tensor): Offset added to every row of the product.
    lambda_x (torch.Tensor): Pre-finetuning activations (batch), 2-D.
    lambda_x_tilde (torch.Tensor): Post-finetuning activations (batch).

    Returns:
    torch.Tensor: Squared Frobenius norm of the residual divided by the
    number of rows in lambda_x.
    """
    low_rank_map = B.mm(C)  # full matrix rebuilt from its two factors
    residual = lambda_x.mm(low_rank_map) + delta - lambda_x_tilde
    squared_error = residual.norm(p='fro') ** 2
    return squared_error / lambda_x.size(0)

def optimize_lora(pre_ft_activations, post_ft_activations, rank= 3, batch_size=256, lr=1e-3, tol=1e-5, max_iter=10000):
    """
    Fits the factors B, C and the offset delta with Adam, using
    ``torch_loss_function_lora`` as the objective.

    Args:
    pre_ft_activations, post_ft_activations (np.array): Activation arrays; the
        feature size is read from ``pre_ft_activations[0].shape[1]``.
    rank (int): Inner dimension shared by B (dim x rank) and C (rank x dim).
    batch_size (int): Batch size for optimization.
    lr (float): Learning rate.
    tol (float): Tolerance for convergence.
    max_iter (int): Max number of passes over the data.

    Returns:
    Tuple[np.array, np.array, np.array]: Optimized B, C, and delta.
    """
    dim = pre_ft_activations[0].shape[1]
    pre_all = torch.tensor(pre_ft_activations, dtype=torch.float32)
    post_all = torch.tensor(post_ft_activations, dtype=torch.float32)

    # Random start for both factors, zero start for the offset.
    B = nn.Parameter(torch.randn(dim, rank, requires_grad=True))
    C = nn.Parameter(torch.randn(rank, dim, requires_grad=True))
    delta = nn.Parameter(torch.zeros(1, dim, requires_grad=True))

    optimizer = optim.Adam([B, C, delta], lr=lr)

    def _as_numpy():
        # Detached numpy views of the three parameters, in return order.
        return tuple(param.detach().numpy() for param in (B, C, delta))

    n_total = len(pre_ft_activations)
    previous_loss = float('inf')
    for iteration in range(max_iter):
        for start in range(0, n_total, batch_size):
            stop = start + batch_size
            optimizer.zero_grad()
            loss = torch_loss_function_lora(
                B,
                C,
                delta,
                pre_all[start:stop].squeeze(1),
                post_all[start:stop].squeeze(1),
            )
            loss.backward()
            optimizer.step()

            current_loss = loss.item()
            if start == 0 and iteration % 50 == 0:
                print(f"Iteration {iteration}, Loss: {current_loss:.6f}")

            # Stop as soon as two consecutive steps differ by less than tol.
            if abs(previous_loss - current_loss) < tol:
                print("Convergence criterion met.")
                return _as_numpy()
            previous_loss = current_loss

    print("Optimization finished.")
    return _as_numpy()

In [None]:
# Fit the low-rank variant on the same averaged activations; rank and batch
# size fall back to the defaults of optimize_lora.
B, C, delta = optimize_lora(final_preact, final_postact, max_iter=10000)

# Ground Truth Linear Probe

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def linear_probe(activations, labels):
    """Train a logistic-regression probe on pooled activations.

    Each activation is converted to numpy, has 1 added to every entry, and is
    averaged over axis 0 to form one feature row. The rows are split 80/20
    (fixed seed), a classifier is fitted on the training part, the test
    accuracy is printed, and the fitted classifier is returned.

    NOTE(review): the constant +1 shift applied to every feature looks
    unintentional -- confirm before relying on the coefficients.
    """
    feature_rows = []
    for act in activations:
        shifted = np.array([1]) + act.detach().cpu().numpy()
        feature_rows.append(shifted.mean(axis=0))
    X = np.array(feature_rows)
    y = np.array(labels)

    print(X.shape)
    # Hold out 20% of the samples for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, clf.predict(X_test))
    print(f'Accuracy: {accuracy:.2f}')
    return clf

def cosine_similarity_torch(vec1, vec2):
    """Return ``F.cosine_similarity`` of the two inputs after tensor conversion.

    Both arguments are passed through ``torch.tensor`` first, so lists and
    numpy arrays are accepted.
    """
    first, second = torch.tensor(vec1), torch.tensor(vec2)
    return F.cosine_similarity(first, second)

def plot_vecs(weights, d):
    """Project two vectors to 2-D with PCA and show them in one scatter plot.

    Torch tensors are converted to numpy first. If that conversion raises for
    either argument, both are used exactly as passed in.
    """
    def _to_numpy(tensor):
        return tensor.detach().numpy() if tensor.requires_grad else tensor.numpy()

    try:
        weights_np, d_np = _to_numpy(weights), _to_numpy(d)
    except Exception:
        # Non-tensor inputs (e.g. numpy arrays) end up here and are kept as-is.
        weights_np, d_np = weights, d

    # Rows of the stacked matrix: `weights` first, then `d`.
    stacked = np.vstack([weights_np, d_np])
    reduced = PCA(n_components=2).fit_transform(stacked)

    plt.figure(figsize=(8, 6))
    plt.scatter(reduced[:, 0], reduced[:, 1], c=['red', 'blue'], label=['Weights', 'd'])
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('2D Visualization of High-Dimensional Vectors')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Collect the dataset label of every sample that has a post-finetuning
# activation, in the same order as post_ft_activations.
# NOTE(review): `ds` and `post_ft_activations` are created in the "data prep"
# cells further down, so those cells must be run first.
labels = []
for i in tqdm(range(len(post_ft_activations))): 
    labels.append(ds["train"][i]["label"])

In [None]:
clf = linear_probe(post_ft_activations, labels)

In [None]:
weights = clf.coef_

In [None]:
weights.shape

In [None]:
d.shape

In [None]:
similarity = cosine_similarity_torch(weights, d)
print("Cosine similarity:", similarity)

In [None]:
similarity = cosine_similarity_torch(weights, delta)
print("Cosine similarity:", similarity)

In [None]:
similarity = cosine_similarity_torch(d, delta)
print("Cosine similarity:", similarity)

In [None]:
similarity = cosine_similarity_torch(weights, newd)
print("Cosine similarity:", similarity)

In [None]:
similarity = cosine_similarity_torch(delta, newd)
print("Cosine similarity:", similarity)

In [None]:
similarity = cosine_similarity_torch(d, newd)
print("Cosine similarity:", similarity)

In [None]:
similarity = cosine_similarity_torch(newd, weights)
print("Cosine similarity:", similarity)

# data prep

In [None]:
# Load the dataset whose "train" split supplies the titles, contents and
# labels used in this notebook.
ds = load_dataset("amazon_polarity")

In [8]:
# Tokenize the first 100 training reviews (title and content joined by a
# space) into model inputs.
train_dps=[]
for i in range(100): 
    entry = ds["train"][i]
    # convert_input loads the "gpt2" tokenizer on every call.
    train_dps.append(convert_input(entry['title'] + " " + entry['content'], "gpt2"))
    print(f"Completed {i}")

Completed 0
Completed 1
Completed 2
Completed 3
Completed 4
Completed 5
Completed 6
Completed 7
Completed 8
Completed 9
Completed 10
Completed 11
Completed 12
Completed 13
Completed 14
Completed 15
Completed 16
Completed 17
Completed 18
Completed 19
Completed 20
Completed 21
Completed 22
Completed 23
Completed 24
Completed 25
Completed 26
Completed 27
Completed 28
Completed 29
Completed 30
Completed 31
Completed 32
Completed 33
Completed 34
Completed 35
Completed 36
Completed 37
Completed 38
Completed 39
Completed 40
Completed 41
Completed 42
Completed 43
Completed 44
Completed 45
Completed 46
Completed 47
Completed 48
Completed 49
Completed 50
Completed 51
Completed 52
Completed 53
Completed 54
Completed 55
Completed 56
Completed 57
Completed 58
Completed 59
Completed 60
Completed 61
Completed 62
Completed 63
Completed 64
Completed 65
Completed 66
Completed 67
Completed 68
Completed 69
Completed 70
Completed 71
Completed 72
Completed 73
Completed 74
Completed 75
Completed 76
Completed

In [None]:
mp = "/net/scratch/weak_to_strong/weak-to-strong/results/default/bs=32-dn=amaz_pola-e=2-ee=1000000-lp=0-l=xent-l=5e-05-ls=cosi_anne-mc=1024-ms=gpt2-nd=20000-ntd=10000-o=adam-s=0-twd=0"
#fine-tuned model path

In [None]:
# Capture the default layer ("h.11") for every tokenized review, before and
# after fine-tuning.
# NOTE(review): `mp` has no "/model.safetensors" suffix, unlike the other
# fine-tuned paths in this notebook -- confirm it points at a loadable file.
pre_ft_activations, post_ft_activations = extract_act_pipeline("gpt2", train_dps, mp) 

In [11]:
pre_ft_activations

NameError: name 'pre_ft_activations' is not defined

In [12]:
# Pool every sample over its first axis so all samples share one shape; these
# arrays are the inputs of the fitting cells above.
final_preact = avg_samples(pre_ft_activations)
final_postact = avg_samples(post_ft_activations)

NameError: name 'pre_ft_activations' is not defined

In [None]:
# NOTE(review): this passes the raw per-sample activation lists, whereas the
# other fitting cells use final_preact / final_postact -- confirm which
# optimize_Adelta definition and which inputs are intended here.
A_optimized, delta_optimized = optimize_Adelta(pre_ft_activations, post_ft_activations, 32)

### Usage Example

In [None]:
model_name = 'gpt2-medium'

strong_model_fintuned_path = "/net/scratch/weak_to_strong/weak-to-strong/results/default/bs=32-dn=sciq-e=2-ee=1000000-lp=0-l=xent-l=5e-05-ls=cosi_anne-mc=1024-ms=gpt2-medium-nd=20000-ntd=10000-o=adam-s=0-twd=0-wms=gpt2/model.safetensors"

# activations1, activations2 = compare_models(model_name, strong_model_fintuned_path, test_datapoint)

In [None]:
test_datapoint = convert_input("Summary Changes of state are examples of phase changes, or phase transitions. All phase changes are accompanied by changes in the energy of a system. Changes from a more-ordered state to a less-ordered state (such as a liquid to a gas) areendothermic. Changes from a less-ordered state to a more-ordered state (such as a liquid to a solid) are always exothermic. The conversion of a solid to a liquid is called fusion (or melting). The energy required to melt 1 mol of a substance is its enthalpy of fusion (ΔHfus). The energy change required to vaporize 1 mol of a substance is the enthalpy of vaporization (ΔHvap). The direct conversion of a solid to a gas is sublimation. The amount of energy needed to sublime 1 mol of a substance is its enthalpy of sublimation (ΔHsub) and is the sum of the enthalpies of fusion and vaporization. Plots of the temperature of a substance versus heat added or versus heating time at a constant rate of heating are calledheating curves. Heating curves relate temperature changes to phase transitions. A superheated liquid, a liquid at a temperature and pressure at which it should be a gas, is not stable. A cooling curve is not exactly the reverse of the heating curve because many liquids do not freeze at the expected temperature. Instead, they form a supercooled liquid, a metastable liquid phase that exists below the normal melting point. Supercooled liquids usually crystallize on standing, or adding a seed crystal of the same or another substance can induce crystallization.", model_name)


# Use the gpt2-medium checkpoint defined for this example. The previous call
# referenced `finetuned_model_path`, which this section never defines.
activations1, activations2 = compare_models(model_name, strong_model_fintuned_path, test_datapoint)

# Testing with GPT2 finetuned on Polarity

In [None]:
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

finetuned_model_path = "/home/slhleosun/weak-to-strong-main/results/default/bs=32-dn=amaz_pola-e=2-ee=1000000-lp=0-l=xent-l=5e-05-ls=cosi_anne-mc=1024-ms=gpt2-nd=20000-ntd=10000-o=adam-s=0-twd=0/model.safetensors"

In [None]:
# datapoint = torch.randint(50257, (1, 10))  # Sample input ids, assuming GPT-2's vocabulary size

test_datapoint = convert_input("This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^", model_name)


activations1, activations2 = compare_models(model_name, finetuned_model_path, test_datapoint)

In [None]:
activations1, activations2

# Linear Probe

In [None]:
from datasets import load_dataset

polarity_ds = load_dataset("amazon_polarity")
data_len = polarity_ds["train"].num_rows

In [None]:
data_len

In [None]:
i = 0

activations_pre = []
activations_post = []
labels = []

In [None]:
# Collect pooled "h.11" activations and labels for the first 300 reviews.
# `i` lives at notebook scope, so re-running this cell resumes where the
# previous run stopped instead of starting over.
while i < 300: 
    print("Working on " + str(i))
    row = polarity_ds['train'][i]
    datapoint = convert_input(row["title"] + " " + row["content"], model_name)
    label = row["label"]
    act1, act2 = compare_models(model_name, finetuned_model_path, datapoint)
    act1, act2 = act1["h.11"], act2["h.11"]
    # Rank-3 activations are averaged over axis 1 before being stored.
    if act1.ndim == 3:
        act1 = act1.mean(axis=1)
    if act2.ndim == 3:
        act2 = act2.mean(axis=1)
    activations_pre.append(act1)
    activations_post.append(act2)
    labels.append(label)

    i += 1

In [None]:
i

In [None]:
clf

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

def linear_probe(activations, labels):
    """Fit and evaluate a logistic-regression probe on pooled activations.

    Every activation becomes one feature row: it is converted to numpy, has 1
    added to each entry, and is averaged over axis 0. The rows are split 80/20
    with a fixed seed; the classifier trained on the larger part is scored on
    the held-out part, the accuracy is printed, and the classifier is returned.

    NOTE(review): the constant +1 shift applied to every feature looks
    unintentional -- confirm before relying on the coefficients.
    """
    pooled = []
    for act in activations:
        as_numpy = act.detach().cpu().numpy()
        pooled.append((np.array([1]) + as_numpy).mean(axis=0))
    X = np.array(pooled)
    y = np.array(labels)

    print(X.shape)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)

    predictions = clf.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy: {accuracy:.2f}')
    return clf

In [None]:
clf = linear_probe(activations_post, labels)

In [None]:
# Bar chart of the probe's coefficients (first row of clf.coef_), one bar per
# feature index.
coefficients = clf.coef_[0]
plt.figure(figsize=(10, 6))
plt.bar(range(len(coefficients)), coefficients)
plt.xlabel('Features')
plt.ylabel('Coefficient Value')
plt.title('Feature Importance')
plt.show()

In [None]:
from sklearn.decomposition import PCA

# Build the same feature matrix that linear_probe uses: each activation is
# shifted by +1 and averaged over axis 0.
X = np.array([(np.array([1]) + act.detach().cpu().numpy()).mean(axis=0) for act in activations_post])
y = np.array(labels)

# Project data to 2D using PCA
pca = PCA(n_components=2)
X_r = pca.fit_transform(X)

# Fit logistic regression on the 2D projected data
# (a separate classifier, trained on all samples, used only for this plot)
clf_2d = LogisticRegression()
clf_2d.fit(X_r, y)

# Create a mesh to plot the decision boundary
# The grid extends 0.5 beyond the projected data on every side.
x_min, x_max = X_r[:, 0].min() - .5, X_r[:, 0].max() + .5
y_min, y_max = X_r[:, 1].min() - .5, X_r[:, 1].max() + .5
h = .02  # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Classify every grid point; the points are flattened into (n_points, 2) rows.
Z = clf_2d.predict(np.c_[xx.ravel(), yy.ravel()])

# Plot decision boundary and data points
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.8)
plt.scatter(X_r[:, 0], X_r[:, 1], c=y, edgecolors='k', cmap=plt.cm.Paired)
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.title('2D PCA Projection and Decision Boundary')
plt.show()


In [None]:
print_layer_names(activations1)

    wte: Word Token Embeddings - The embedding matrix for input tokens.
    wpe: Word Position Embeddings - The embedding matrix for the position of tokens in the sequence.
    drop: Dropout - A regularization layer that randomly sets input units to 0 at each update during training time to prevent overfitting.
    h: The stack of transformer blocks. The number following h is the index of the block within the model (e.g., h.0 is the first block, h.1 is the second block, and so on).
    ln_1, ln_2: Layer Normalization - Normalizes its input by re-centering and re-scaling. In each GPT-2 block, ln_1 is applied before the self-attention sub-layer and ln_2 before the MLP sub-layer.
    attn: Attention - The self-attention mechanism of the transformer. This component helps the model to focus on different parts of the input sequence when making predictions.
        c_attn: Creates the query, key, and value projections for the attention mechanism.
        attn_dropout: Dropout applied to the attention scores.
        c_proj: Projects the output of the attention mechanism back to the hidden size dimensions.
        resid_dropout: Dropout applied to the residual connection (often added after the attention output and before layer normalization).
    mlp: Multi-Layer Perceptron - A small feed-forward neural network that follows the attention mechanism in each transformer block. It usually consists of two linear layers with an activation in between.
        c_fc: The first linear layer (fully connected) of the MLP.
        act: The activation function of the MLP (e.g., GELU or ReLU).
        c_proj: The second linear layer of the MLP.
        dropout: Dropout applied within the MLP to prevent overfitting.
    ln_f: Final Layer Normalization - The layer normalization applied after the last transformer block, before the final output layer.

### Guide for Interpreting the Graphs 
- The relative position of blue and red dots indicates how similar or different the activations are before and after finetuning. If they overlap significantly, the representations are relatively stable through finetuning. If they are separate, it suggests that finetuning has significantly shifted the activations.
- The shift from blue to red dots indicates the direction of change in the hidden states due to finetuning. For instance, if the red dots are predominantly to the right of the blue dots, this indicates that the activations have shifted along the direction represented by PCA Component 1.
- Any consistent pattern of shift (e.g., most red dots are in a specific quadrant relative to the blue dots) may suggest how finetuning has affected the internal representations of the neural network. Perhaps the finetuning has made the network more sensitive to certain features, or has made it focus on different aspects of the input data.

In [None]:
plot_activation_changes(activations1, activations2, 'h.11', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.11.mlp', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.10', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.10.mlp', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.9', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.8', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.7', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.6.mlp', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.6', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.5.mlp', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.4.mlp', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.3.mlp', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.2.mlp', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.1.mlp', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.0.mlp', method='PCA')

In [None]:
plot_activation_changes(activations1, activations2, 'h.0.mlp', method='PCA')

# Legacy: Weight Changes

In [None]:
def extract_model_weights(model):
    """Collect the attention and MLP weight tensors of every transformer block.

    Walks ``model.transformer.h`` and, for each block, appends six tensors in
    this order: query, key, value, attention output projection,
    ``mlp.c_fc`` and ``mlp.c_proj``.

    Two attention layouts are supported:

    * fused: ``attn.c_attn`` stores the query/key/value projections side by
      side in one matrix and ``attn.c_proj`` is the output projection (the
      attribute names the rest of this notebook uses for GPT-2). The fused
      matrix is cut into three equal parts along its last dimension.
    * separate: ``attn.query`` / ``attn.key`` / ``attn.value`` / ``attn.out``.

    Args:
        model: Object exposing ``transformer.h`` as an iterable of blocks,
            each with ``attn`` and ``mlp`` attributes.

    Returns:
        list: Flat list holding six weight tensors per block.
    """
    weights = []
    for block in model.transformer.h:
        attn = block.attn
        if hasattr(attn, "c_attn"):
            # Fused QKV: the three projections are concatenated along the
            # last dimension, so three equal chunks give Q, K and V.
            weights.extend(attn.c_attn.weight.data.chunk(3, dim=-1))
            weights.append(attn.c_proj.weight.data)
        else:
            weights.extend(
                proj.weight.data
                for proj in (attn.query, attn.key, attn.value, attn.out)
            )

        mlp = block.mlp
        weights.append(mlp.c_fc.weight.data)
        weights.append(mlp.c_proj.weight.data)
    return weights

# Pull the `score` head weight matrix out of each model as a NumPy array
# (pre-finetuning first, post-finetuning second).
weights_pre, weights_post = (
    net.score.weight.detach().numpy() for net in (pre_model, post_model)
)


In [None]:
def visualize_weights(initial_weights, finetuned_weights):
    """Draw two weight matrices as side-by-side viridis heatmaps.

    The left panel shows ``initial_weights`` and the right panel shows
    ``finetuned_weights``; the figure is displayed immediately.
    """
    _, axes = plt.subplots(1, 2, figsize=(12, 12))
    panels = (
        (initial_weights, 'Initial Weights'),
        (finetuned_weights, 'Fine-tuned Weights'),
    )
    for axis, (matrix, title) in zip(axes, panels):
        sns.heatmap(matrix, ax=axis, cmap='viridis')
        axis.set_title(title)
    plt.show()
    
# Show the `score` head weights before and after finetuning side by side.
visualize_weights(weights_pre, weights_post)
# NOTE(review): visualize_weight_changes is defined in a later cell of this
# notebook, so that cell has to be executed before this line can run.
visualize_weight_changes(weights_pre, weights_post, "linear head")

In [None]:
# Example: word embedding layer (`transformer.wte`) of each model as NumPy arrays.
original_weights, finetuned_weights = (
    net.transformer.wte.weight.detach().numpy()
    for net in (pre_model, post_model)
)

# Raw heatmaps first, then the normalized difference.
visualize_weights(original_weights, finetuned_weights)
visualize_weight_changes(original_weights, finetuned_weights, "word embedding")

In [None]:
# Example: word embedding layer. This cell recomputes the same `transformer.wte`
# arrays and only draws the side-by-side heatmaps.
original_weights, finetuned_weights = map(
    lambda net: net.transformer.wte.weight.detach().numpy(),
    (pre_model, post_model),
)

visualize_weights(original_weights, finetuned_weights)

In [None]:
# `c_attn` holds the query, key and value projections concatenated along the
# last dimension, so binding the whole matrix to each name (as before) gave
# three identical arrays. Split it into thirds instead.
# NOTE(review): `original_model` is not defined in this part of the notebook
# (other cells use `pre_model`) — confirm which model is intended.
original_query_weights, original_key_weights, original_value_weights = (
    part.numpy()
    for part in original_model.transformer.h[0].attn.c_attn.weight.detach().chunk(3, dim=-1)
)
# Keep the original (misspelled) name bound for any cell that still refers to it.
original_key_weightsb = original_key_weights


In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

def visualize_weight_changes(initial_weights, finetuned_weights, layer_name):
    """Plot a heatmap of how a weight matrix changed during finetuning.

    The signed difference ``finetuned_weights - initial_weights`` is divided
    by its own standard deviation, so colours are expressed in units of
    "standard deviations of change", and drawn with a diverging colormap
    centred on zero.

    Args:
        initial_weights: 2-D array of weights before finetuning.
        finetuned_weights: Array of the same shape holding the weights after
            finetuning.
        layer_name: Label inserted into the plot title.
    """
    # Signed (not absolute) differences: the sign shows the direction of change.
    weight_differences = finetuned_weights - initial_weights

    # Normalize for easier visual comparison. Skip this when the spread is
    # zero (e.g. the weights did not change at all) so the plot shows zeros
    # rather than NaNs from a 0/0 division.
    spread = np.std(weight_differences)
    if spread > 0:
        weight_differences = weight_differences / spread

    plt.figure(figsize=(10, 8))
    # seaborn draws the colorbar itself; label it via cbar_kws instead of
    # calling plt.colorbar(), which would add a second bar or fail for lack
    # of a current mappable.
    sns.heatmap(
        weight_differences,
        cmap='coolwarm',
        center=0,
        cbar_kws={'label': 'Magnitude of Weight Change'},
    )
    plt.title(f'Weight Changes in {layer_name}')
    plt.xlabel('Neurons in Previous Layer')
    plt.ylabel('Neurons in Current Layer')
    plt.show()


In [None]:
# First block's MLP input projection (`h[0].mlp.c_fc`) from each model as NumPy arrays.
original_ffn_weights, finetuned_ffn_weights = (
    net.transformer.h[0].mlp.c_fc.weight.detach().numpy()
    for net in (pre_model, post_model)
)

# Raw heatmaps first, then the normalized difference.
visualize_weights(original_ffn_weights, finetuned_ffn_weights)
visualize_weight_changes(original_ffn_weights, finetuned_ffn_weights, "ffn")

In [None]:
# First block's attention output projection (`h[0].attn.c_proj`) from each model.
original_weights, finetuned_weights = (
    net.transformer.h[0].attn.c_proj.weight.detach().numpy()
    for net in (pre_model, post_model)
)

visualize_weights(original_weights, finetuned_weights)

In [None]:
# Echo `tensors` as the cell output.
# NOTE(review): `tensors` is not defined in this part of the notebook — confirm it still exists.
tensors

In [None]:
# Rebind post_model to a TransformerWithHead loaded from `model_name`.
# NOTE(review): this loads from `model_name` rather than a finetuned checkpoint
# path — confirm that the finetuned weights are applied afterwards.
post_model = TransformerWithHead.from_pretrained(model_name)