In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

# Make the project checkout importable (presumably where the `model` package
# imported later in this notebook lives — confirm).
# The location can be overridden with the DTG_PROJECT_ROOT environment variable;
# the default is the original hard-coded path, so existing setups keep working.
PROJECT_ROOT = os.environ.get(
    "DTG_PROJECT_ROOT", "/home/vmeshchaninov/DiffusionTextGeneration-cond-ca/"
)
# Guard so that re-running this cell does not append duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import torch
from torch.nn.functional import cross_entropy
from transformers import AutoTokenizer, T5Model, T5ForConditionalGeneration

In [3]:
# Two lowercase, multi-sentence sample passages used as input to the
# tokenizer and encoder cells of this notebook.
texts = [
    "i went well into an interview yesterday. i did not know how to do my homework. i had to wait a long day for the interview to do well. even when i arrived the next morning, the interview had failed. i was disappointed to learn that i could not finish my paper.",
    "jack wanted to cook a cake for his wife. he put the cupboard in the cupboarder. he put a big cupboard in the oven. jack was disappointed when his wife's family was done. the ovenboard turned out looking perfectly perfect!"
]

# T5-base

In [4]:
# Load the pretrained "t5-base" tokenizer through transformers' AutoTokenizer.
t5_tokenizer = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
# Tokenize both sample texts with the tokenizer's default call arguments
# (no padding, truncation or tensor conversion is requested here).
t5_tokenizer(texts)

{'input_ids': [[3, 23, 877, 168, 139, 46, 2772, 4981, 5, 3, 23, 410, 59, 214, 149, 12, 103, 82, 11920, 5, 3, 23, 141, 12, 1749, 3, 9, 307, 239, 21, 8, 2772, 12, 103, 168, 5, 237, 116, 3, 23, 4363, 8, 416, 1379, 6, 8, 2772, 141, 4567, 5, 3, 23, 47, 10978, 12, 669, 24, 3, 23, 228, 59, 1992, 82, 1040, 5, 1], [3, 9325, 1114, 12, 3989, 3, 9, 4340, 21, 112, 2512, 5, 3, 88, 474, 8, 19904, 16, 8, 19904, 49, 5, 3, 88, 474, 3, 9, 600, 19904, 16, 8, 4836, 5, 3, 9325, 47, 10978, 116, 112, 2512, 31, 7, 384, 47, 612, 5, 8, 4836, 1976, 2120, 91, 479, 3923, 626, 55, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [37]:
# Id of the end-of-sequence token, read through the public accessor instead of
# materialising the whole vocab dict for a single lookup.
t5_tokenizer.eos_token_id

1

In [9]:
# Id of the padding token, read through the public accessor rather than the
# private `_pad_token` attribute plus a full vocab-dict lookup.
t5_tokenizer.pad_token_id

0

In [15]:
# Id of the unknown token, read through the public accessor rather than the
# private `_unk_token` attribute plus a full vocab-dict lookup.
t5_tokenizer.unk_token_id

2

In [22]:
# Print the vocabulary pieces whose ids are 3, 23 or 5, in vocab iteration order.
wanted_ids = (3, 23, 5)
for piece, piece_id in t5_tokenizer.vocab.items():
    if piece_id not in wanted_ids:
        continue
    print(piece, piece_id)

i 23
. 5
▁ 3


In [16]:
# Edge case: tokenize a batch containing a single empty string.
t5_tokenizer([""])

{'input_ids': [[1]], 'attention_mask': [[1]]}

In [19]:
# Tokenize a lone lowercase "i" to inspect how it is split into pieces.
t5_tokenizer(["i"])

{'input_ids': [[3, 23, 1]], 'attention_mask': [[1, 1, 1]]}

In [20]:
# Tokenize an uppercase "I" for comparison with the lowercase case.
t5_tokenizer(["I"])

{'input_ids': [[27, 1]], 'attention_mask': [[1, 1]]}

In [23]:
# Decode the ids [3, 23, 1] back to text; special tokens are not skipped,
# so the recorded output includes the "</s>" marker.
t5_tokenizer.batch_decode([[3, 23, 1]])

['i</s>']

In [28]:
# Decode id 23 on its own, i.e. without a preceding id 3 ("▁") piece.
t5_tokenizer.batch_decode([[23]])

['i']

In [25]:
# Tokenize a single sentence taken from the first sample text.
t5_tokenizer(["i did not know how to do my homework."])

{'input_ids': [[3, 23, 410, 59, 214, 149, 12, 103, 82, 11920, 5, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [26]:
# Round trip: decode the id sequence listed here back into text.
t5_tokenizer.batch_decode([[3, 23, 410, 59, 214, 149, 12, 103, 82, 11920, 5, 1]])

['i did not know how to do my homework.</s>']

In [27]:
# Same id sequence with an extra id 3 ("▁") inserted after the first two ids,
# to see how the additional piece is rendered when decoding.
t5_tokenizer.batch_decode([[3, 23, 3, 410, 59, 214, 149, 12, 103, 82, 11920, 5, 1]])

['i  did not know how to do my homework.</s>']

## Encoder

In [5]:
from model.encoder_t5 import T5EncoderModel

In [4]:
# Rebinds `t5_tokenizer` to a freshly loaded "t5-base" tokenizer
# (the same call already appears earlier in this notebook).
t5_tokenizer = AutoTokenizer.from_pretrained("t5-base")

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [8]:
# Instantiate the project's T5EncoderModel (imported from model.encoder_t5)
# from the "t5-base" checkpoint.
# NOTE(review): `enc_normalizer` is an argument of the project class; passing
# None presumably disables normalization of the encoder outputs — confirm.
t5_encoder = T5EncoderModel.from_pretrained(
    "t5-base",
    enc_normalizer=None,
)

Some weights of the model checkpoint at t5-base were not used when initializing T5EncoderModel: ['decoder.block.8.layer.1.EncDecAttention.q.weight', 'decoder.block.9.layer.1.EncDecAttention.k.weight', 'decoder.block.5.layer.2.layer_norm.weight', 'decoder.block.10.layer.2.DenseReluDense.wo.weight', 'decoder.block.3.layer.2.DenseReluDense.wi.weight', 'decoder.block.4.layer.1.layer_norm.weight', 'decoder.block.10.layer.1.EncDecAttention.o.weight', 'decoder.block.8.layer.0.SelfAttention.k.weight', 'decoder.block.6.layer.1.EncDecAttention.v.weight', 'decoder.block.6.layer.1.layer_norm.weight', 'decoder.block.6.layer.1.EncDecAttention.q.weight', 'decoder.block.11.layer.1.EncDecAttention.k.weight', 'decoder.block.5.layer.0.SelfAttention.v.weight', 'decoder.block.10.layer.1.EncDecAttention.v.weight', 'decoder.block.7.layer.0.SelfAttention.o.weight', 'decoder.block.5.layer.0.SelfAttention.q.weight', 'decoder.block.7.layer.1.layer_norm.weight', 'decoder.block.2.layer.0.SelfAttention.o.weight', '

In [7]:
# CPU copy of the encoder's shared embedding weight.
# Reads from `t5_encoder`, the model object this notebook creates; the bare
# name `encoder` is not defined anywhere in the visible notebook and would
# raise NameError on a fresh kernel.
embeddings = t5_encoder.shared.weight.data.cpu()

In [21]:
# Tokenize the sample texts into a fixed-size batch for the encoder.
input_ = t5_tokenizer(
    texts,
    add_special_tokens=True,
    padding="max_length",  # pad every sequence up to `max_length`
    truncation=True,  # cut sequences that exceed `max_length`
    max_length=80,
    return_tensors='pt',  # return PyTorch tensors instead of Python lists
    return_special_tokens_mask=True  # adds the 'special_tokens_mask' entry read by later cells
)

In [30]:
# Run the encoder on the padded batch. This is inference only, so gradient
# tracking is switched off: no autograd graph is kept alive and the results
# derived from `encodings` carry no grad_fn.
with torch.no_grad():
    encodings = t5_encoder(
        input_ids=input_["input_ids"],
        attention_mask=input_["attention_mask"],
    )

In [31]:
# Complement of the tokenizer's special-token mask: 1 at positions that were
# NOT flagged as special tokens, 0 at positions that were.
mask_token = 1 - input_['special_tokens_mask']
mask_token

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [32]:
def norm(z, mask):
    """Return the mask-weighted mean of the L2 norms taken along dim 2 of `z`.

    Args:
        z: tensor with at least three dimensions; norms are computed over dim 2.
        mask: tensor multiplied element-wise with the per-position norms
            (1 to include a position, 0 to leave it out).

    Returns:
        Scalar tensor equal to sum(norms * mask) / sum(mask).
    """
    per_position = torch.norm(z, dim=2)
    weighted_total = torch.sum(per_position * mask)
    return weighted_total / torch.sum(mask)

In [33]:
# Mean encoding norm over regular (non-special) token positions
norm(encodings, mask_token)

tensor(6.9983, grad_fn=<DivBackward0>)

In [34]:
# Mean encoding norm over special-token positions.
# Note: `1 - mask_token` is the tokenizer's special-token mask, so this covers
# the EOS position of each sequence as well as the padding positions, not padding alone.
norm(encodings, 1 - mask_token)

tensor(7.3005, grad_fn=<DivBackward0>)

In [36]:
# Rows of the encoder's shared embedding weight for the EOS and PAD tokens.
embeddings = t5_encoder.shared.weight.data.cpu()
# Use the GPU when one is present (as the original `.cuda()` calls did), but
# fall back to CPU instead of raising on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
eos_emb = embeddings[t5_tokenizer.eos_token_id].to(device)
pad_emb = embeddings[t5_tokenizer.pad_token_id].to(device)

In [40]:
# EOS embedding divided by its own norm.
# NOTE(review): the output recorded for this cell is a single scalar, which does
# not look like the result of this expression — the cell was probably edited
# after it was last run; confirm which quantity was intended.
eos_emb / torch.norm(eos_emb)

tensor(1., device='cuda:0')

In [38]:
# Norm of the PAD token's embedding row.
torch.norm(pad_emb)

tensor(664.7014, device='cuda:0')

In [None]:
# Element-wise product of the special-token mask and the ids: keeps the ids at
# special-token positions and zeroes out all other positions.
input_['special_tokens_mask'] * input_['input_ids']