Reference: Hugging Face Transformers RWKV model documentation — https://huggingface.co/docs/transformers/model_doc/rwkv

In [2]:
import torch
from transformers import RwkvModel, AutoTokenizer

In [3]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [4]:
# Only referenced by the commented-out `max_length` tokenizer argument in the visible cells.
max_length = 20

# Demo sentence that gets tokenized and fed to the model.
sequence = "The quick brown fox jumps over the lazy dog."

# Checkpoint identifier passed to both the tokenizer and the model `from_pretrained` calls.
version = "RWKV/rwkv-4-169m-pile"

# AutoTokenizer

In [6]:
# AutoTokenizer is only a factory: from_pretrained() returns the concrete tokenizer
# class for the checkpoint (the repr of this cell shows a GPTNeoXTokenizerFast), so the
# variable is deliberately not annotated as `AutoTokenizer`.
tokenizer = AutoTokenizer.from_pretrained(version)
tokenizer

GPTNeoXTokenizerFast(name_or_path='RWKV/rwkv-4-169m-pile', vocab_size=50254, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50256: AddedToken("                      ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50257: AddedToken("                     ", r

## tokenizer([sequence])

In [10]:
# Tokenize a batch made of the same sentence twice and move the resulting tensors to `device`.
inputs = tokenizer(
    [sequence] * 2,                     # batch of sentences
    truncation = True,                  # truncate whatever exceeds max_length
    # padding = True,                   # padding strategy, one of [True, 'longest', 'max_length', 'do_not_pad']
    # max_length = max_length,          # maximum length; defaults to the model maximum when unset
    add_special_tokens = True,          # add the special tokens to the text
    return_length = True,               # also return the length of each sequence
    return_overflowing_tokens = False,  # when True, the chunks cut off by truncation are returned as well instead of being dropped
    return_tensors = "pt"               # tensor format of the result: np / pt / tf / jax
).to(device)    # device only: ids/masks are integer tensors and must not be cast to float16
                # (background: https://github.com/huggingface/transformers/issues/16359)

print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"]) # 1 where the position holds a real token
print(inputs["length"])         # number of tokens in each sequence

dict_keys(['input_ids', 'attention_mask', 'length'])
tensor([[  510,  3158,  8516, 30013, 27287,   689,   253, 22658,  4370,    15],
        [  510,  3158,  8516, 30013, 27287,   689,   253, 22658,  4370,    15]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([10, 10], device='cuda:0')


In [11]:
# Show the token ids again; this prints the same tensor as the tokenizer cell above.
print(inputs["input_ids"])

tensor([[  510,  3158,  8516, 30013, 27287,   689,   253, 22658,  4370,    15],
        [  510,  3158,  8516, 30013, 27287,   689,   253, 22658,  4370,    15]],
       device='cuda:0')


# RwkvModel

The bare RWKV Model transformer outputting raw hidden-states without any specific head on top.

In [13]:
# Load the bare RWKV backbone. Half precision is used only when running on CUDA;
# on the CPU fallback float32 is kept, because float16 CPU kernels may be missing
# depending on the installed PyTorch build.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32
model: RwkvModel = RwkvModel.from_pretrained(version, torch_dtype=model_dtype).to(device)
model

RwkvModel(
  (embeddings): Embedding(50277, 768)
  (blocks): ModuleList(
    (0): RwkvBlock(
      (pre_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ln1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attention): RwkvSelfAttention(
        (time_shift): ZeroPad2d((0, 0, 1, -1))
        (key): Linear(in_features=768, out_features=768, bias=False)
        (value): Linear(in_features=768, out_features=768, bias=False)
        (receptance): Linear(in_features=768, out_features=768, bias=False)
        (output): Linear(in_features=768, out_features=768, bias=False)
      )
      (feed_forward): RwkvFeedForward(
        (time_shift): ZeroPad2d((0, 0, 1, -1))
        (key): Linear(in_features=768, out_features=3072, bias=False)
        (receptance): Linear(in_features=768, out_features=768, bias=False)
        (value): Linear(in_features=3072, out_features=768, bias=False)
      )
    )
    

In [14]:
# Put the model in evaluation mode, then run a single forward pass with
# autograd disabled and display the returned RwkvOutput.
model.eval()
with torch.inference_mode():
    outputs = model(inputs["input_ids"], attention_mask=inputs["attention_mask"])
outputs

RwkvOutput(last_hidden_state=tensor([[[ 1.5777e-01, -2.7825e-01, -3.6044e-02,  ...,  2.7843e-01,
          -6.8867e-04,  1.5167e-01],
         [ 2.3544e-01, -2.6566e-01, -4.5412e-02,  ...,  4.5729e-01,
          -5.9977e-03,  4.1358e-01],
         [ 1.9137e-01, -3.2281e-01,  4.5989e-01,  ...,  6.7297e-01,
           2.5942e-01,  2.5720e-01],
         ...,
         [-2.2506e-03, -4.2943e-01,  2.6817e-01,  ...,  7.3518e-01,
           4.2219e-01,  2.4915e-01],
         [ 4.0422e-01, -5.8881e-01, -1.1256e-01,  ...,  4.8805e-01,
           1.5218e-01,  9.8003e-02],
         [ 4.2592e-01, -3.9599e-01, -5.2124e-01,  ...,  3.4675e-01,
           1.3662e-02,  2.8590e-01]],

        [[ 1.5777e-01, -2.7825e-01, -3.6044e-02,  ...,  2.7843e-01,
          -6.8867e-04,  1.5167e-01],
         [ 2.3544e-01, -2.6566e-01, -4.5412e-02,  ...,  4.5729e-01,
          -5.9977e-03,  4.1358e-01],
         [ 1.9137e-01, -3.2281e-01,  4.5989e-01,  ...,  6.7297e-01,
           2.5942e-01,  2.5720e-01],
         .

In [15]:
# Output of the last layer; the displayed shape is (2, 10, 768) for the
# 2-sentence batch of 10 tokens each.
outputs.last_hidden_state.shape

torch.Size([2, 10, 768])

In [16]:
# No output is shown when this value is None.
# NOTE(review): presumably only populated when the forward call passes
# output_hidden_states=True — confirm against the RwkvModel docs.
outputs.hidden_states

In [17]:
# No output is shown when this value is None.
# NOTE(review): presumably only populated when the forward call passes
# output_attentions=True — confirm against the RwkvModel docs.
outputs.attentions