In [14]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
import tensorflow as tf
from tensorboard.plugins import projector
import os
from tqdm import tqdm

# Directory that receives the files written for the TensorBoard projector.
log_dir = './logs/gpt2/vocab/'

# Load the pretrained 'gpt2' model and its matching fast tokenizer.
# The two loads are independent of each other.
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')

In [15]:
# Weight matrix of the model's word-token-embedding (`wte`) layer.
word_embeddings = model.transformer.wte.weight

# Show the matrix dimensions; `.size()` returns the same torch.Size as `.shape`.
print(word_embeddings.size())

torch.Size([50257, 768])


In [16]:
# Build the (token, id) pairs of the vocabulary, ordered by token id, so that
# the i-th line of the metadata file is the token whose id is i.
vocab_list = sorted(tokenizer.vocab.items(), key=lambda x: x[1])

# Create the log directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(log_dir, exist_ok=True)

# Save the tokens as a single-column TSV (one label per line, no header).
# The token strings are written as-is in UTF-8. The previous version wrote
# str(bytes), which produced Python reprs such as "b'?the'" and replaced every
# character outside Latin-1 with '?', so the labels were unreadable.
with open(os.path.join(log_dir, 'metadata.tsv'), "w", encoding="utf-8") as f:
    for word, idx in tqdm(vocab_list):
        f.write("{}\n".format(word))

100%|██████████| 50257/50257 [00:00<00:00, 1108748.97it/s]


In [17]:
# Copy the embedding weights out of the PyTorch graph as a NumPy array and
# wrap them in a TensorFlow variable.
embeddings = tf.Variable(
    initial_value=model.transformer.wte.weight.detach().numpy()
)

# Track the variable under the attribute name `embedding` and write a
# checkpoint into the log directory. The save call stays the last expression
# so the notebook still displays the returned checkpoint path.
checkpoint = tf.train.Checkpoint(embedding=embeddings)
checkpoint.save(file_prefix=os.path.join(log_dir, "embedding.ckpt"))

'./logs/gpt2/vocab/embedding.ckpt-1'

In [18]:
# Projector configuration describing the saved embedding.
config = projector.ProjectorConfig()

# Register one embedding entry, initialising its fields directly in `add()`.
# The tensor name is the checkpoint attribute name (`embedding`) followed by
# the `/.ATTRIBUTES/VARIABLE_VALUE` suffix; `metadata.tsv` is the label file
# written into log_dir.
embedding = config.embeddings.add(
    tensor_name="embedding/.ATTRIBUTES/VARIABLE_VALUE",
    metadata_path='metadata.tsv',
)

# Hand the configuration to the projector plugin for this log directory.
projector.visualize_embeddings(log_dir, config)