In [1]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
import torch as torch
from torch.utils.data import Dataset
from tqdm import tqdm
import os


# Root directory of the dimensionality-reduction project; later cells build
# their data / activation paths relative to this.
path = "/om2/user/jackking/modular_transformers/scripts/dimensionality_reduction"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained "gpt2" tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_data(datatype, sub_datatype):
    """Load a pickled validation set and return it as ``(data, labels)``.

    The file read is ``{path}/data/{datatype}/valid_data_{sub_datatype}.pkl``,
    where ``path`` is the module-level project root.

    Args:
        datatype: Name of the sub-directory under ``{path}/data``.
        sub_datatype: Suffix of the pickle file name. The value ``"natural"``
            selects the unlabeled layout described below.

    Returns:
        A ``(data, labels)`` tuple.

        * ``sub_datatype == "natural"``: the whole pickle is converted with
          ``torch.tensor`` and every sample gets the label ``0`` (a plain list
          with one entry per sample).
        * otherwise: the pickle is indexed like a mapping; ``"inputs"`` is
          converted with ``torch.tensor`` and ``"labels"`` is returned as
          stored.

    Note:
        ``pickle.load`` can execute arbitrary code, so only point this at
        files you trust.
    """
    pickle_file = f"{path}/data/{datatype}/valid_data_{sub_datatype}.pkl"

    with open(pickle_file, "rb") as handle:
        payload = pickle.load(handle)

    # Labeled layout: inputs and labels are stored side by side.
    if sub_datatype != "natural":
        return torch.tensor(payload["inputs"]), payload["labels"]

    # Unlabeled layout: the pickle *is* the input data; use a dummy label 0.
    tokens = torch.tensor(payload)
    return tokens, [0] * len(tokens)

In [3]:
# Number of label groups; sizes the leading axis of the activation array
# built later in the notebook.
num_labels = 1

# Active run configuration. These names are also used to build the output
# path when the activations are saved.
model_name = "gpt2-xl"
datatype = "natural_language_ud"
sub_datatype = "natural"
model_type = "lm"

# Inactive alternative: a project checkpoint stored under {path}/models.
# To use it, set model_name = "eager-dawn-33" and datatype = "natural_language"
# above and load from model_path instead of the hub name:
# model_path = f"{path}/models/{datatype}/{sub_datatype}/{model_type}/{model_name}/epoch_26"
# model = GPT2LMHeadModel.from_pretrained(model_path)

# NOTE(review): the Transformers docs state that from_pretrained() returns the
# model in evaluation mode (dropout disabled). Call model.eval() explicitly if
# the model is ever constructed another way.
model = GPT2LMHeadModel.from_pretrained(model_name)
model.to(device)

# Inactive alternative data source (the next cell loads the data instead):
# data, labels = load_data(datatype, sub_datatype)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 1600)
    (wpe): Embedding(1024, 1600)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-47): 48 x GPT2Block(
        (ln_1): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((1600,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=1600, out_features=50257, bias=False)
)

In [4]:
# Dataset selection: sentences are stored per source and per sentence length.
datasource = "ud"
sentence_length = 10
dt = f"{datasource}/{sentence_length}_word"
full_path = f"/om2/user/jackking/modular_transformers/scripts/attention_interpretability/data/{dt}"

# Read the pickled sentences inside a context manager so the file handle is
# closed deterministically (the bare open() call used before leaked it).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# Assumes the pickle holds equal-length numeric sequences, since torch.tensor
# needs rectangular input -- TODO confirm.
with open(f"{full_path}/sentences.pkl", "rb") as f:
    data = torch.tensor(pickle.load(f))

# This dataset is unlabeled: give every sample the label 0.
labels = np.zeros(len(data))

In [5]:
embedding_dim = model.config.n_embd
num_layers = model.config.n_layer
context_len = len(data[0])
num_samples = len(data)

# Integer arithmetic for the per-label slot count. The previous float division
# and float modulo could index one past the end of the array whenever
# num_samples was not a multiple of num_labels.
samples_per_label = num_samples // num_labels

# Axes: (label, sample-within-label, layer, token position, embedding).
# The layer axis has num_layers + 1 entries because one hidden state is stored
# per element of output.hidden_states.
# NOTE(review): np.zeros defaults to float64; if the model emits float32 this
# doubles the memory footprint -- consider dtype=np.float32 if downstream code
# allows it.
activations = np.zeros((num_labels, samples_per_label, num_layers + 1, context_len, embedding_dim))

# Inference only: disable autograd so no graph is built for each forward pass.
with torch.no_grad():
    # total= gives tqdm a real progress bar / ETA (zip() has no length).
    for sample_idx, (sample, label) in enumerate(tqdm(zip(data, labels), total=num_samples)):
        sample = sample.to(device)
        # The model is fed one sequence at a time as a batch of size 1.
        output = model(sample.reshape(1, -1), output_hidden_states=True)
        for layer_idx, hidden_state in enumerate(output.hidden_states):
            # hidden_state[0] drops the batch axis added by reshape(1, -1).
            # NOTE(review): the slot index assumes samples are spread evenly
            # across labels; with num_labels == 1 it is simply sample_idx.
            activations[int(label), sample_idx % samples_per_label, layer_idx, :, :] = (
                hidden_state[0].detach().cpu().numpy()
            )

5815it [03:03, 31.73it/s]


In [6]:
# Activations are written to "<activation_path>.pkl", i.e. one file per model
# under {path}/activations/{datatype}/{sub_datatype}/{model_type}/.
activation_path = f"{path}/activations/{datatype}/{sub_datatype}/{model_type}/{model_name}"

# The pickle is a sibling of activation_path, not a file inside it, so only
# the parent directory has to exist. The earlier version created an empty
# directory named after the model. exist_ok=True avoids the check-then-create
# race.
os.makedirs(os.path.dirname(activation_path), exist_ok=True)

with open(f"{activation_path}.pkl", "wb") as f:
    pickle.dump(activations, f)