In [1]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the tokenizer and the seq2seq model from the same pretrained checkpoint name.
CHECKPOINT = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(CHECKPOINT)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [2]:
# Tokenize the task prompt as a batch of one and keep only the token-id tensor.
eeg_prompt = "translate EEG to English: "
input_ids = tokenizer(eeg_prompt, return_tensors="pt").input_ids
input_ids

tensor([[13959,   262,  8579,    12,  1566,    10,     1]])

In [3]:
# Tokenize an English-to-English prompt; this rebinds `input_ids`, replacing the
# value produced from the "translate EEG to English: " prompt.
english_prompt = "translate English to English: "
input_ids = tokenizer(english_prompt, return_tensors="pt").input_ids
input_ids

tensor([[13959,  1566,    12,  1566,    10,     1]])

In [4]:
# Tile the single prompt row 100 times along the first (batch) dimension and
# check the resulting shape.
input_ids.repeat(100, 1).shape

torch.Size([100, 6])

In [5]:
import torch
# Draw a standard-normal tensor of shape (100, 56, 800).
# Call the public `torch.randn` directly; `torch.torch` is not a documented alias.
torch.randn(100, 56, 800)

tensor([[[ 1.3374,  2.5600,  0.7064,  ...,  0.7470,  1.6448, -1.0109],
         [-0.5682, -0.5854,  0.0974,  ..., -0.2271, -0.1595,  0.5075],
         [-1.4115,  0.4484,  0.9115,  ...,  1.5161, -1.9184,  0.5424],
         ...,
         [ 1.4907,  0.8101,  0.8353,  ..., -0.7885,  0.0211, -0.0088],
         [ 0.1107, -0.6868,  0.4876,  ..., -1.5508,  1.9797, -0.0509],
         [-0.0518, -0.3459, -0.0724,  ...,  0.2318,  1.0648,  2.4334]],

        [[-1.6174,  0.6054, -1.9924,  ...,  1.0129, -0.9994, -0.5570],
         [-1.6427,  0.8491, -0.0437,  ..., -1.7448,  0.2897, -0.5853],
         [ 0.0939,  1.4410, -1.2771,  ...,  0.0969, -0.7026,  0.3922],
         ...,
         [-1.0335,  0.0098, -1.3054,  ..., -1.0744,  0.6596,  2.7872],
         [-1.0477,  1.0369, -1.6365,  ...,  0.0083, -1.1186,  0.8101],
         [-0.8990,  0.7169, -0.8073,  ...,  0.7134,  0.4937, -0.7704]],

        [[-0.1738,  0.4841, -1.9050,  ...,  2.1110, -0.9089,  0.6279],
         [ 0.8249,  0.0435, -0.3616,  ..., -0

In [6]:
# NOTE(review): identical to the `from_pretrained("t5-large")` call in the first cell;
# running it rebinds `model` to a second load of the same checkpoint — confirm it is needed.
model = T5ForConditionalGeneration.from_pretrained("t5-large")

In [46]:
# Display the configuration object attached to the loaded model.
model.config

T5Config {
  "_name_or_path": "t5-large",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 4096,
  "d_kv": 64,
  "d_model": 1024,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 24,
  "num_heads": 16,
  "num_layers": 24,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 300,
 

In [7]:
def msize(m):
    """Return the total element count summed over every parameter of ``m``.

    ``m`` must expose ``parameters()`` yielding objects with a ``numel()`` method.
    An object with no parameters yields 0.
    """
    total = 0
    for param in m.parameters():
        total += param.numel()
    return total
# Fraction of all model parameters held by the token embedding (`shared`) and by
# the output projection (`lm_head`).
print(msize(model.shared) / msize(model))   # ~0.0446 for t5-large
print(msize(model.lm_head) / msize(model))  # ~0.0446 for t5-large

0.04459874593790213
0.04459874593790213


In [8]:
# Inspect the model's shared token-embedding module.
model.shared

Embedding(32128, 1024)

In [9]:
# Inspect the output projection layer that maps hidden states to vocabulary logits.
model.lm_head

Linear(in_features=1024, out_features=32128, bias=False)

In [34]:
# Check which device the prompt token ids live on.
input_ids.device

device(type='cpu')

In [11]:
# Shape of the embedding weight matrix: (vocabulary size, embedding width).
model.shared.weight.shape

torch.Size([32128, 1024])

In [28]:
# Random stand-in tensor named `eeg`: 16 items x 56 steps x 1024 features.
# The last dimension matches the 1024-wide token embeddings it is later concatenated with.
eeg_shape = (16, 56, 1024)
eeg = torch.randn(*eeg_shape)

In [13]:
# NOTE(review): same expression as the earlier `model.shared.weight.shape` cell;
# this cell is a duplicate check of the embedding weight shape.
model.shared.weight.shape

torch.Size([32128, 1024])

In [27]:
# Look up embeddings for the prompt ids, then tile them to a batch of 16 and
# check the resulting shape.
prompt_embeds = model.shared(input_ids)
prompt_embeds.repeat(16, 1, 1).shape

torch.Size([16, 6, 1024])

In [33]:
# Embed the prompt ids, tile to a batch of 16, and append `eeg` along the
# sequence dimension (dim=1); display only the combined shape.
tiled_prompt_embeds = model.shared(input_ids).repeat(16, 1, 1)
torch.cat((tiled_prompt_embeds, eeg), dim=1).shape

torch.Size([16, 62, 1024])

In [35]:
# NOTE(review): `attention_mask` is not assigned in any cell shown above, so this
# lookup raises NameError (see the recorded output); define it before this cell.
attention_mask

NameError: name 'attention_mask' is not defined

In [38]:
# Shape check for a 16-row, 10-column block of zeros.
torch.zeros(16, 10).shape

torch.Size([16, 10])

In [40]:
# Build a (16, 16) tensor whose rows are six ones followed by ten zeros.
# NOTE(review): if this is meant as an attention mask for the (16, 62, 1024)
# prompt+eeg input, its width of 16 does not match the 62 steps — confirm.
ones_block = torch.ones(16, 6)
zeros_block = torch.zeros(16, 10)
torch.cat((ones_block, zeros_block), dim=1)

tensor([[1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0