In [4]:
from transformers import AutoTokenizer, AutoModel
import tensorflow as tf
from tensorboard.plugins import projector
import os
from tqdm import tqdm

log_dir = './logs/phi1_5/vocab/'

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
model = AutoModel.from_pretrained("microsoft/phi-1_5")
print(model)

PhiModel(
  (embed_tokens): Embedding(51200, 2048)
  (embed_dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-23): 24 x PhiDecoderLayer(
      (self_attn): PhiSdpaAttention(
        (q_proj): Linear(in_features=2048, out_features=2048, bias=True)
        (k_proj): Linear(in_features=2048, out_features=2048, bias=True)
        (v_proj): Linear(in_features=2048, out_features=2048, bias=True)
        (dense): Linear(in_features=2048, out_features=2048, bias=True)
        (rotary_emb): PhiRotaryEmbedding()
      )
      (mlp): PhiMLP(
        (activation_fn): NewGELUActivation()
        (fc1): Linear(in_features=2048, out_features=8192, bias=True)
        (fc2): Linear(in_features=8192, out_features=2048, bias=True)
      )
      (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
      (resid_dropout): Dropout(p=0.0, inplace=False)
    )
  )
  (final_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
  (rotary_emb): PhiRotaryEmbed

In [6]:
word_embeddings = model.embed_tokens.weight

print(word_embeddings.shape)

torch.Size([51200, 2048])


In [7]:
# Create list of tokens in vocab sorted by their index in vocab

vocab_list = sorted(tokenizer.vocab.items(), key=lambda x:x[1])

if not os.path.exists(log_dir):
    os.makedirs(log_dir)

# save them as csv
with open(os.path.join(log_dir, 'metadata.tsv'), "w") as f:
    for word, idx in tqdm(vocab_list):
        f.write("{}\n".format(str(word.encode(encoding='iso-8859-1', errors='replace'))))

100%|██████████| 50295/50295 [00:00<00:00, 1007404.51it/s]


In [10]:
print(model.embed_tokens.weight)
embeddings = tf.Variable(model.embed_tokens.weight.detach().numpy())
checkpoint = tf.train.Checkpoint(embedding=embeddings)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

Parameter containing:
tensor([[ 9.7046e-03, -1.5488e-02,  6.0272e-02,  ...,  9.5520e-03,
         -5.4169e-02, -5.8174e-03],
        [ 2.4323e-02,  5.4321e-02,  1.7776e-02,  ...,  2.5421e-02,
         -4.3854e-02,  3.9612e-02],
        [-4.1565e-02,  3.6987e-02, -1.5976e-02,  ...,  4.7394e-02,
         -1.6113e-02,  4.3716e-03],
        ...,
        [-1.5259e-05,  3.0160e-05, -1.6034e-05,  ..., -1.9729e-05,
         -1.3590e-05,  9.2745e-05],
        [-8.0466e-06, -2.6107e-05, -5.1260e-05,  ...,  4.0054e-05,
          4.9233e-05, -1.6689e-05],
        [ 3.2783e-06, -1.7822e-05,  2.4676e-05,  ..., -3.4511e-05,
         -2.0921e-05,  1.6928e-05]], requires_grad=True)


'./logs/phi1_5/vocab/embedding.ckpt-1'

In [11]:
# Set up config
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)