In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoConfig

In [2]:
config = AutoConfig.from_pretrained('microsoft/deberta-v3-small')
tokenizer = AutoTokenizer.from_pretrained('malaysia-ai/bpe-tokenizer')
special_tokens_dict = {"mask_token": "[MASK]"}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
config.vocab_size = len(tokenizer)
config.max_position_embeddings = 4096

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

In [3]:
config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 4096,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.35.0",
  "type_vocab_size": 0,
  "vocab_size": 32001
}

In [4]:
model = AutoModelForMaskedLM.from_config(config)

In [5]:
model.save_pretrained('debertav2-small')
tokenizer.save_pretrained('debertav2-small')

('debertav2-small/tokenizer_config.json',
 'debertav2-small/special_tokens_map.json',
 'debertav2-small/tokenizer.json')

In [6]:
!ls -lh debertav2-small

total 262M
-rw-r--r-- 1 ubuntu ubuntu  866 Nov 15 06:35 config.json
-rw-r--r-- 1 ubuntu ubuntu 260M Nov 15 06:35 model.safetensors
-rw-r--r-- 1 ubuntu ubuntu  778 Nov 15 06:35 special_tokens_map.json
-rw-r--r-- 1 ubuntu ubuntu 1.3M Nov 15 06:35 tokenizer.json
-rw-r--r-- 1 ubuntu ubuntu 1.2K Nov 15 06:35 tokenizer_config.json


In [7]:
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import StreamingDataset
import torch
import numpy as np

class UInt16(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint16)

_encodings['uint16'] = UInt16

class DatasetFixed(torch.utils.data.Dataset):
    def __init__(self, local):
        self.dataset = StreamingDataset(local=local)

    def __getitem__(self, idx):
        data = self.dataset[idx]
        data.pop('token_type_ids', None)
        for k in data.keys():
            data[k] = data[k].astype(np.int64)
        return data

    def __len__(self):
        return len(self.dataset)

train_dataset = DatasetFixed(local='tokenized-512')

In [8]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.15,
    pad_to_multiple_of=None,
)

In [9]:
batch = [train_dataset[i] for i in range(3)]
b = data_collator(batch)

You're using a PreTrainedTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [10]:
model(**b)

MaskedLMOutput(loss=tensor(10.5336, grad_fn=<NllLossBackward0>), logits=tensor([[[ 0.0000,  0.0178,  0.7783,  ...,  0.0152, -0.3278,  0.8059],
         [ 0.0000,  0.0064, -1.1060,  ...,  0.5021, -0.4350, -0.1890],
         [ 0.0000,  0.7313,  0.6659,  ...,  0.3666, -0.5069,  0.3751],
         ...,
         [ 0.0000,  0.2071,  0.5059,  ...,  0.4080, -0.3686,  0.3104],
         [ 0.0000,  0.6307,  0.4583,  ...,  0.5337, -0.1931,  0.9158],
         [ 0.0000,  0.1186,  0.3102,  ...,  1.1024, -0.1440,  0.7481]],

        [[ 0.0000,  0.3048,  0.8347,  ..., -1.0490,  0.1279,  0.3387],
         [ 0.0000, -0.1180,  0.0993,  ...,  0.2921,  0.7044, -0.0275],
         [ 0.0000,  0.0527,  0.9007,  ...,  0.3695,  0.3636,  0.5029],
         ...,
         [ 0.0000,  1.0467, -0.2307,  ...,  0.2358, -0.0311,  0.1480],
         [ 0.0000,  0.7975, -0.3127,  ...,  0.0608, -0.4451, -0.5585],
         [ 0.0000, -0.1052, -0.2830,  ...,  0.6167, -0.4749,  0.4427]],

        [[ 0.0000,  0.0829,  0.6659,  ...,  