In [1]:
import transformers
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch
import torch.nn as nn
from transformer_xray.record_utils import ActivationRecorder
from transformer_xray.perturb_utils import register_pertubation_hooks
from modular_transformers.models import components

import numpy as np

from tqdm import tqdm

from sklearn.decomposition import PCA

import os
import pandas as pd
import json
import subprocess
import gc

# Compute device used by every helper in this notebook: CUDA when present, CPU otherwise.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Pretrained GPT-2 tokenizer shared by the data-loading and completion helpers.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_data(num_bigrams, path="bigrams.txt"):
    """Load the highest-count bigrams from a text file and tokenize them.

    Every non-empty line is parsed as ``<bigram text><TAB><count>``. Lines are
    ranked by count (descending), truncated to the first ``num_bigrams``,
    encoded with the module-level ``tokenizer``, and only encodings of exactly
    two tokens are kept. The length filter runs after the truncation, so the
    result may contain fewer than ``num_bigrams`` entries.

    Args:
        num_bigrams: number of top-ranked lines to tokenize.
        path: location of the bigram/count file (defaults to ``bigrams.txt``
            in the working directory).

    Returns:
        A list of 1-D tensors, each holding the two token ids of one bigram.

    Raises:
        ValueError: if a line's count field is not an integer.
    """
    # Context manager so the file handle is closed instead of leaked.
    with open(path, "r") as handle:
        lines = handle.read().split("\n")

    ranked = []
    for line in lines:
        if not line:
            continue
        # Split on the first tab only: text on the left, count on the right.
        text, _, count = line.partition("\t")
        ranked.append((int(count), text))
    # Stable sort: equal counts keep their order from the file.
    ranked.sort(key=lambda pair: pair[0], reverse=True)

    encoded = (tokenizer.encode(text) for _, text in ranked[:num_bigrams])
    return [torch.tensor(ids) for ids in encoded if len(ids) == 2]

In [3]:
def make_activations(model, hook_locations, num_bigrams=100):
    """Run the model over the bigram set and collect activations and logits.

    Recording hooks are registered on ``model`` through ``ActivationRecorder``;
    each bigram returned by ``load_data`` is then fed through the model one at
    a time, and the logits row at the last position is kept.

    Args:
        model: language model whose forward output exposes ``.logits``.
        hook_locations: hook specification handed to ``ActivationRecorder``.
        num_bigrams: forwarded to ``load_data``.

    Returns:
        ``(activations, logits)`` where ``activations`` is whatever
        ``ActivationRecorder.get_activations()`` returns and ``logits`` is the
        stack (dim 0 = bigram) of last-position logits, detached from autograd.
    """
    activation_recorder = ActivationRecorder(model, hook_locations)
    activation_recorder.register_recording_hooks()

    bigram_tokens = load_data(num_bigrams)

    logits = []
    # Inference only: skip autograd bookkeeping so no graph is kept alive
    # for every forward pass.
    with torch.no_grad():
        for bigram in tqdm(bigram_tokens):
            token_ids = bigram.to(device)
            output = model(token_ids)
            # Unbatched input, so index -1 on dim 0 selects the last position.
            logits.append(output.logits[-1, :].detach())
    logits = torch.stack(logits, dim=0)
    activations = activation_recorder.get_activations()

    return activations, logits

In [4]:
def load_model(model_path = "gpt2"):
    """Load a language model and place it on the notebook's ``device``.

    ``"gpt2"`` selects the pretrained Hugging Face ``GPT2LMHeadModel``; any
    other value is passed to ``components.LM.from_pretrained``.
    """
    loader = GPT2LMHeadModel if model_path == "gpt2" else components.LM
    return loader.from_pretrained(model_path).to(device)

In [5]:
def get_orthogonal_vector(v):
    """Return a random unit vector orthogonal to ``v``.

    A random Gaussian vector is drawn from NumPy's global RNG and its component
    along ``v`` is projected out (Gram-Schmidt), then the remainder is
    normalized. The previous SVD approach stacked ``v`` on top of ``n - 1``
    random rows; that square matrix is almost surely full rank, so it has no
    null space and its last right-singular vector is not orthogonal to ``v``.

    Args:
        v: 1-D array-like with at least two entries, not all zero.

    Returns:
        A float64 ``numpy.ndarray`` of the same length as ``v`` with unit norm
        and (up to floating-point error) zero dot product with ``v``.

    Raises:
        ValueError: if ``v`` is the zero vector, has fewer than two entries,
            or no valid orthogonal vector could be built (e.g. ``v`` holds NaN).
    """
    v = np.asarray(v, dtype=float).ravel()
    if np.all(v == 0):
        raise ValueError("The input vector cannot be the zero vector.")
    if v.size < 2:
        raise ValueError("A vector of length 1 has no non-zero orthogonal vector.")

    unit_v = v / np.linalg.norm(v)
    # A random draw is (anti)parallel to v with probability ~0; retry a few
    # times anyway so a degenerate draw or bad input cannot loop forever.
    for _ in range(10):
        candidate = np.random.standard_normal(v.size)
        # Two projection passes keep the residual along v at round-off level.
        for _ in range(2):
            candidate = candidate - np.dot(candidate, unit_v) * unit_v
        norm = np.linalg.norm(candidate)
        if norm > 1e-8:
            return candidate / norm
    raise ValueError("Could not construct a vector orthogonal to the input.")

In [6]:
def get_pcas(orig_activations, model, location):
    """Fit one PCA per (layer, token position) on the recorded activations.

    For every layer index of ``model.transformer.h`` the tensors stored under
    ``orig_activations[layer][location]`` are concatenated along dim 0. Each
    slice ``[:, token, :]`` of that stack is then squeezed and used to fit a
    default ``PCA``.

    Returns:
        dict mapping layer index -> list of fitted ``PCA`` objects, one per
        index along dim 1 of the concatenated tensor.
    """
    n_layers = len(model.transformer.h)
    stacked = {
        idx: torch.cat(orig_activations[idx][location], dim=0)
        for idx in range(n_layers)
    }

    # dim 1 of the stacked tensor is iterated as the token position
    n_tokens = stacked[0].shape[1]

    pcas = {}
    for idx, layer_acts in stacked.items():
        fitted = []
        for pos in range(n_tokens):
            samples = layer_acts[:, pos, :].squeeze()
            fitted.append(PCA().fit(samples))  # fit() returns the estimator itself
        pcas[idx] = fitted

    return pcas


In [15]:
def get_activations_matrix(num_layers, activations, location):
    """Flatten each recorded tensor to a row and stack the rows per layer.

    Args:
        num_layers: number of layer indices (``0 .. num_layers - 1``) to read.
        activations: mapping ``layer -> location -> list of tensors``.
        location: key selecting which recorded location to use.

    Returns:
        dict mapping layer index -> 2-D tensor with one flattened row per
        recorded tensor.
    """
    return {
        layer: torch.cat(
            [recorded.flatten().unsqueeze(0) for recorded in activations[layer][location]],
            dim=0,
        )
        for layer in range(num_layers)
    }

In [8]:
def get_orthog_pcas(pcas, num_layers, num_components):
    """Build, per layer and token, a vector from ``get_orthogonal_vector``.

    For each fitted PCA the first ``num_components`` rows of ``components_``
    are averaged, the average is normalized to unit length, and the result is
    passed to ``get_orthogonal_vector``.

    Returns:
        dict mapping layer index -> list (one entry per token position) of the
        vectors returned by ``get_orthogonal_vector``.
    """
    n_tokens = len(pcas[0])

    def _mean_direction(fitted_pca):
        # Unit-normalized average of the leading principal components.
        leading = fitted_pca.components_[:num_components, :]
        averaged = leading.mean(axis=0)
        return averaged / np.linalg.norm(averaged)

    return {
        layer: [
            get_orthogonal_vector(_mean_direction(pcas[layer][token]))
            for token in range(n_tokens)
        ]
        for layer in range(num_layers)
    }

In [9]:
def get_gpu_memory_usage():
    """Query ``nvidia-smi`` for the memory currently used on each GPU.

    This is a best-effort diagnostic: any failure to run or parse
    ``nvidia-smi`` is reported with ``print`` and yields an empty list, so the
    caller keeps running on machines without an NVIDIA driver.

    Returns:
        list of ints, one per line of ``nvidia-smi`` output (the value of
        ``memory.used`` without units), or ``[]`` on failure.
    """
    command = ['nvidia-smi', '--query-gpu=memory.used', '--format=csv,nounits,noheader']
    try:
        result = subprocess.check_output(command, encoding='utf-8')
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers FileNotFoundError when nvidia-smi is not installed.
        print("Failed to run nvidia-smi: ", e)
        return []

    memory_usage = []
    for line in result.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            memory_usage.append(int(line))
        except ValueError:
            # A partial list would no longer line up with GPU indices.
            print("Unexpected nvidia-smi output: ", line)
            return []
    return memory_usage

In [10]:
def complete_bigrams(model, bigrams):
    """Greedily predict the next token for each bigram string.

    Each string is encoded with the module-level ``tokenizer``, run through
    ``model`` on ``device``, and the arg-max of the last-position logits is
    decoded back to text.

    Args:
        model: language model whose forward output exposes ``.logits``.
        bigrams: iterable of text prompts.

    Returns:
        list of decoded next-token strings, in the order of ``bigrams``.
    """
    completions = []
    # Inference only: avoid building an autograd graph for every prompt.
    with torch.no_grad():
        for bigram in bigrams:
            token_ids = torch.tensor(tokenizer.encode(bigram)).to(device)
            output = model(token_ids)
            # Unbatched input, so index -1 on dim 0 selects the last position.
            prediction = output.logits[-1, :].argmax(dim=-1)
            completions.append(tokenizer.decode(prediction))
    return completions

In [16]:
def cosine_divergence(act1, act2):
    """Return the mean cosine similarity between ``act1`` and ``act2``.

    The similarity is taken along dim 1 and averaged over the remaining
    entries. Despite the name, the value is a similarity (1.0 for identical
    directions), not a divergence.
    """
    similarity = torch.nn.functional.cosine_similarity(act1, act2, dim=1)
    return similarity.mean().item()

def distance_divergence(act1, act2):
    """Return the mean L2 norm (taken along dim 1) of ``act1 - act2``."""
    row_norms = (act1 - act2).norm(p=2, dim=1)
    return row_norms.mean().item()

def get_KL_logit_divergence(perturbed_logits, orig_logits):
    """Return KL(softmax(orig_logits) || softmax(perturbed_logits)).

    Both softmaxes are taken over the last dim; ``reduction='batchmean'``
    divides the summed divergence by the size of dim 0.
    """
    functional = torch.nn.functional
    perturbed_log_probs = functional.log_softmax(perturbed_logits, dim=-1)
    orig_probs = functional.softmax(orig_logits, dim=-1)
    # kl_div expects log-probabilities as input and probabilities as target.
    kl = functional.kl_div(perturbed_log_probs, orig_probs, reduction='batchmean')
    return kl.item()

In [17]:
# ---------------------------------------------------------------------------
# Baseline run: record unperturbed activations/logits and fit the PCAs.
# ---------------------------------------------------------------------------
perturbation_loc = "before_attn"
num_bigrams = 100

# load_model() already moves the model to `device`.
orig_model = load_model("gpt2")
orig_model.eval()
num_layers = len(orig_model.transformer.h)

hook_locations = {"all": [perturbation_loc]}
orig_activations, orig_logits = make_activations(orig_model, hook_locations, num_bigrams)
pcas = get_pcas(orig_activations, orig_model, perturbation_loc)
# From here on `orig_activations` is the flattened per-layer matrix form.
orig_activations = get_activations_matrix(num_layers, orig_activations, perturbation_loc)

#iter starts here

# ---------------------------------------------------------------------------
# Perturbation settings for this iteration.
# ---------------------------------------------------------------------------
perturbation_size = 0.01
num_components = 10
name = "orthogonal_pc"

# get_orthogonal_vector draws from NumPy's global RNG; seed it so the
# perturbation directions (and every metric derived from them) are reproducible.
SEED = 0
np.random.seed(SEED)
orthog_pcas = get_orthog_pcas(pcas, num_layers, num_components)

layers = list(range(num_layers))

act_save_path = f"/om2/user/jackking/MyData/dynamics/activations/{perturbation_loc}/{name}/{perturbation_size}/"
save_path = f"/om2/user/jackking/transformer_xray/data/{perturbation_loc}/{name}/{perturbation_size}/"
# exist_ok avoids the check-then-create race and is a no-op on re-runs.
os.makedirs(act_save_path, exist_ok=True)
os.makedirs(save_path, exist_ok=True)

# Context manager so the JSON file handle is closed instead of leaked.
with open("/om2/user/jackking/transformer_xray/scripts/dynamics/test_bigrams.json") as bigram_file:
    bigram_groups = json.load(bigram_file)
test_bigrams = bigram_groups["interesting"] + bigram_groups["most_variance"] + bigram_groups["least_variance"]

# One row per test bigram; one column per layer is filled in by the layer loop.
bigram_results_df = pd.DataFrame(columns=['bigram'] + layers)
bigram_results_df['bigram'] = test_bigrams

column_names = ["layer_num"] + ["cosine_lyapunov", "distance_lyapunov", "KL_logit_div"] + [f"cosine_sim_{i}" for i in layers] + [f"distance_{i}" for i in layers]
df = pd.DataFrame(columns=column_names)

def perturbation_function(input, layer, token):
    """Return the perturbation vector for a given layer and token position.

    Looks up ``orthog_pcas[layer][token]``, rescales it to unit length and
    multiplies by the module-level ``perturbation_size``. When no direction
    exists for the requested layer/token, a zero scalar tensor is returned.
    Previously the fallback computed ``0 / norm(0)`` and therefore produced
    NaN instead of "no perturbation".

    Args:
        input: unused here; kept because the hook signature passes it.
        layer: key into ``orthog_pcas``.
        token: index into ``orthog_pcas[layer]``.

    Returns:
        A tensor of norm ``perturbation_size``, or a zero scalar tensor.
    """
    try:
        orthog_pc = orthog_pcas[layer][token]
    except (KeyError, IndexError, TypeError):
        # NOTE(review): assumes the hook combines the returned value with the
        # activation so that a scalar zero means "unchanged" — confirm against
        # register_pertubation_hooks.
        return torch.tensor(0.0)
    return torch.tensor(orthog_pc) / np.linalg.norm(orthog_pc) * perturbation_size

# One result row per layer; the DataFrame is rebuilt from this list each
# iteration instead of growing it with repeated pd.concat calls.
records = []

for layer in layers:
    torch.cuda.empty_cache()
    print(f'Running layer {layer}')
    gpu_memory_usage = get_gpu_memory_usage()
    print("GPU Memory Usage (in MB):", gpu_memory_usage)

    # A fresh model is loaded every iteration (presumably so hooks registered
    # in the previous iteration are not carried over). load_model() already
    # moves it to `device`.
    model = load_model("gpt2")
    model.eval()

    num_layers = len(model.transformer.h)

    # NOTE(review): the hooks are registered under the "all" key and the loop
    # variable `layer` is not passed to register_pertubation_hooks, so every
    # iteration appears to install the same perturbation on all layers, while
    # the metrics below only compare layers after `layer`. Confirm whether the
    # key should be `layer`.
    perturbation_hooks = {"all": [(perturbation_loc, "all", perturbation_function)]}
    register_pertubation_hooks(model, perturbation_hooks, device)

    perturbed_activations, perturbed_logits = make_activations(model, hook_locations, num_bigrams)
    perturbed_activations = get_activations_matrix(num_layers, perturbed_activations, perturbation_loc)

    torch.save(perturbed_activations, f'{act_save_path}/{layer}_perturbed_activations.pt')

    completions = complete_bigrams(model, test_bigrams)
    bigram_results_df[layer] = completions
    del model
    gc.collect()
    torch.cuda.empty_cache()

    KL_logit_div = get_KL_logit_divergence(perturbed_logits, orig_logits)

    record = {"layer_num": layer, "KL_logit_div": KL_logit_div}

    # Layers downstream of the current one; empty for the last layer, which
    # replaces the hard-coded `layer == 11` check.
    compare_layers = layers[layer+1:]
    if compare_layers:
        # cosine_divergence returns a similarity, hence the `1 -`.
        cosine_difs = [
            1 - cosine_divergence(orig_activations[compare_layer], perturbed_activations[compare_layer])
            for compare_layer in compare_layers
        ]
        distances = [
            distance_divergence(orig_activations[compare_layer], perturbed_activations[compare_layer])
            for compare_layer in compare_layers
        ]
        # Mean log of (divergence / perturbation_size). The epsilon is added
        # before the division; the earlier `x + 1e-9 / perturbation_size`
        # divided only the epsilon because of operator precedence.
        cosine_lyapunov = np.log((np.abs(np.array(cosine_difs)) + 1e-9) / perturbation_size).mean()
        distance_lyapunov = np.log((np.array(distances) + 1e-9) / perturbation_size).mean()

        record["cosine_lyapunov"] = cosine_lyapunov
        record["distance_lyapunov"] = distance_lyapunov
        for compare_layer, cosine_dif, distance in zip(compare_layers, cosine_difs, distances):
            record[f"cosine_sim_{compare_layer}"] = cosine_dif
            record[f"distance_{compare_layer}"] = distance

    records.append(record)
    # Columns absent from a record (e.g. for the last layer) become NaN.
    df = pd.DataFrame(records, columns=column_names)

    # Results are rewritten after every layer so a crash keeps partial output.
    # NOTE(review): `save_path` ends with "/", so this writes a file literally
    # named ".csv" inside that directory — confirm that is intended.
    df.to_csv(f'{save_path}.csv')
    bigram_results_df.to_csv(f'{save_path}/bigram_results.csv')

100%|██████████| 96/96 [00:01<00:00, 53.20it/s]


Running layer 0
GPU Memory Usage (in MB): [1617]


100%|██████████| 96/96 [00:02<00:00, 35.84it/s]


Running layer 1
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 37.98it/s]


Running layer 2
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 37.01it/s]


Running layer 3
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 37.55it/s]


Running layer 4
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 36.95it/s]


Running layer 5
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 36.67it/s]


Running layer 6
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 37.61it/s]


Running layer 7
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 37.67it/s]


Running layer 8
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 37.82it/s]


Running layer 9
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 37.21it/s]


Running layer 10
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 37.61it/s]


Running layer 11
GPU Memory Usage (in MB): [1159]


100%|██████████| 96/96 [00:02<00:00, 37.56it/s]
