In [1]:
import torch
import nnsight
import datasets
import text_dataset
from torch.utils.data import DataLoader
import h5py
import os

# Report whether PyTorch can reach a CUDA runtime and, if it can, list the visible GPUs.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)

if not cuda_available:
    print("No CUDA devices found")
else:
    # How many devices torch can address from this process.
    num_gpus = torch.cuda.device_count()
    print("Number of GPUs:", num_gpus)

    # One line per device, using torch's own device indices.
    for gpu_index in range(num_gpus):
        print(f"GPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")

  from .autonotebook import tqdm as notebook_tqdm


CUDA available: True
Number of GPUs: 4
GPU 0: Tesla V100-PCIE-32GB
GPU 1: Tesla V100-PCIE-32GB
GPU 2: Tesla V100-PCIE-32GB
GPU 3: Tesla V100-PCIE-32GB


In [2]:
# Load GPT-2 through nnsight's LanguageModel wrapper so that module inputs/outputs can be traced.
# NOTE(review): device_map='auto' presumably delegates device placement to accelerate, and
# dispatch=True presumably loads the weights right away instead of on the first trace —
# confirm against the installed nnsight version.
model = nnsight.LanguageModel('openai-community/gpt2', device_map='auto', dispatch=True)

In [3]:
# Switch off gradient tracking for the model's parameters; the cells in this notebook only read
# activations. Because this call is the cell's last expression, its return value (the wrapped
# module) is what the notebook displays below.
model.requires_grad_(False)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  (generator): Generator(
    (streamer): Streamer()
  

In [4]:
# Load the 'train' split of the 'Skylion007/openwebtext' dataset via datasets.load_dataset.
dataset = datasets.load_dataset('Skylion007/openwebtext', split='train')

In [5]:
# Wrap the raw text dataset in the project-local TextDataset, tokenising with the model's own
# tokenizer.
# NOTE(review): the positional 40 is presumably the batch size — the storage loop's progress
# advances by 40 * 1023 = 40,920 rows per batch — confirm against TextDataset's signature.
# NOTE(review): seq_len=1023 is one less than GPT-2's 1024 positions; presumably this leaves
# room for a BOS token — confirm.
token_dataset = text_dataset.TextDataset(
    dataset,
    model.tokenizer,
    40,
    drop_last_batch=False,
    seq_len=1023,
)

In [6]:
# Build the batch iterator up front.
# batch_size=None turns off DataLoader's automatic batching, so every item produced by
# token_dataset is passed through as-is (presumably already a full batch — confirm).
# NOTE(review): wrapping the loader in iter() makes this a single-use, stateful object. A cell
# that consumes it and is then re-run continues from where the previous run stopped; re-run
# this cell first to get a fresh iterator.
text_dataset_loader = iter(
    DataLoader(
        token_dataset,
        batch_size=None,
        shuffle=False,
        num_workers=5,
        prefetch_factor=5,
        worker_init_fn=text_dataset.worker_init_fn,
    )
)

Token indices sequence length is longer than the specified maximum sequence length for this model (1561 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1174 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1217 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2459 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2027 > 1024). Running this sequence through the model will result in indexing errors


In [7]:
def extract_activations(model, tokens):
    """Collect per-layer MLP-branch inputs and outputs for one forward pass.

    For every block ``i`` in ``model.transformer.h`` this saves the input of
    ``ln_2`` (the LayerNorm that precedes the MLP) and the output of ``mlp``,
    then stacks everything into a single tensor.

    Args:
        model: nnsight-wrapped GPT-2-style model exposing ``trace``,
            ``transformer.h[i].ln_2`` / ``.mlp`` and ``config.n_layer``.
        tokens: Input handed straight to ``model.trace`` (assumed to be a
            (batch, seq_len) tensor of token ids — TODO confirm).

    Returns:
        Tensor laid out as (batch, seq_len, in/out, n_layer, d_model), assuming
        each saved activation is (batch, seq_len, d_model). Index 0 on axis 2
        is the ``ln_2`` input, index 1 is the ``mlp`` output.
    """
    # Read the depth from the config rather than hard-coding GPT-2 small's 12 blocks,
    # so the same function works for models of any depth.
    n_layer = model.config.n_layer

    with model.trace(tokens):
        mlp_ins = []
        mlp_outs = []
        for i in range(n_layer):
            mlp_in = model.transformer.h[i].ln_2.input.save()
            mlp_ins.append(mlp_in)
            mlp_out = model.transformer.h[i].mlp.output.save()
            mlp_outs.append(mlp_out)

    # Stack the per-layer tensors on a new axis 2: batch seq_len n_layer d_model
    mlp_ins = torch.stack(mlp_ins, dim=2)
    mlp_outs = torch.stack(mlp_outs, dim=2)
    # Insert the in/out axis in front of the layer axis.
    mlp_acts = torch.stack([mlp_ins, mlp_outs], dim=2)
    return mlp_acts  # batch seq_len in/out n_layer d_model

In [8]:
# Where the activation store lives and how many rows (token positions) it holds.
store_path = '/var/local/glang/activations'
filename = 'clt-activations.h5'
store_size = 1000000
actv_size = model.config.n_embd

# h5py does not create missing parent directories; make sure the target directory exists.
os.makedirs(store_path, exist_ok=True)

# Mode "w" creates the file, replacing any existing file at that path.
with h5py.File(os.path.join(store_path, filename), "w") as f:
    # One row per token position, laid out as (in/out, layer, d_model).
    h5_dataset = f.create_dataset(
        'tensor', (store_size, 2, model.config.n_layer, model.config.n_embd), dtype='float32'
    )

    h5_pointer = 0  # index of the next unwritten row
    for batch in text_dataset_loader:
        print(h5_pointer / store_size * 100, "% done")
        # Put BOS at position 0 (important). The roll shifts every row right by one and the
        # wrapped-around last token is then overwritten, so the sequence length is unchanged
        # and the final token of each row is dropped.
        batch = torch.roll(batch, shifts=1, dims=1)
        batch[:, 0] = model.config.bos_token_id

        # extract activations
        mlp_acts = extract_activations(model, batch)

        # store activations: merge batch and sequence axes into one row axis
        mlp_acts = mlp_acts.flatten(0, 1)
        n_acts = mlp_acts.shape[0]

        # Never write past the end of the store; slice before copying to host memory so
        # only the rows that are kept get transferred.
        n_take = min(n_acts, store_size - h5_pointer)
        h5_dataset[h5_pointer : h5_pointer + n_take] = mlp_acts[:n_take].cpu().numpy()
        h5_pointer += n_take

        # Stop as soon as the store is full, so h5_pointer always reflects the rows written
        # and no extra forward pass is run after an exact fill.
        if h5_pointer >= store_size:
            break

    # The loader ran out before the store was filled: make the partial fill visible.
    if h5_pointer < store_size:
        print(
            "Dataset exhausted after", h5_pointer, "of", store_size,
            "rows; remaining rows were left unwritten",
        )

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


0.0 % done
4.092 % done
8.184 % done
12.276 % done
16.368 % done
20.46 % done
24.552 % done
28.644 % done
32.736 % done
36.828 % done
40.92 % done
45.012 % done
49.104 % done
53.196 % done
57.288 % done
61.38 % done
65.472 % done
69.56400000000001 % done
73.656 % done
77.74799999999999 % done
81.84 % done
85.932 % done
90.024 % done
94.116 % done
98.208 % done
