In [1]:
from transformers import AutoTokenizer, AutoModel
import tensorflow as tf
from tensorboard.plugins import projector
import os
from tqdm import tqdm

log_dir = './logs/phi1_5/vocab/'

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
model = AutoModel.from_pretrained("microsoft/phi-1_5")

In [2]:
word_embeddings = model.embed_tokens.weight

In [3]:
# Create list of tokens in vocab sorted by their index in vocab

vocab_list = sorted(tokenizer.vocab.items(), key=lambda x:x[1])

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# save them as csv
with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
    for word, idx in tqdm(vocab_list):
        f.write("{}\n".format(str(word.encode(encoding='iso-8859-1', errors='replace'))))

100%|██████████| 50295/50295 [00:00<00:00, 744460.36it/s]


In [4]:
embeddings = tf.Variable(model.embed_tokens.weight.detach().numpy())
checkpoint = tf.train.Checkpoint(embedding=embeddings)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

'./logs/phi1_5/vocab/embedding.ckpt-1'

In [5]:
# Set up config
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)