In [134]:
import torch
from torch import nn
from transformers import BertConfig

config = BertConfig(max_position_embeddings=8)
word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

In [135]:
input_ids = torch.randint(0, 100, (2, 6))
input_shape = input_ids.size()
seq_length = input_shape[1]
input_ids, input_shape, seq_length

(tensor([[96, 15, 27,  4, 76, 86],
         [43, 65, 51, 20, 52, 56]]),
 torch.Size([2, 6]),
 6)

In [136]:
inputs_embeds = word_embeddings(input_ids)
print(inputs_embeds), inputs_embeds.shape

tensor([[[-0.1018, -0.0436, -0.2491,  ..., -1.5319,  0.1579, -0.3988],
         [-1.0177, -0.4927, -0.6918,  ..., -0.2705, -1.0987,  1.4907],
         [ 0.2334, -0.2801,  0.2963,  ...,  1.0481,  0.2028, -0.7454],
         [-0.8064,  1.0662, -0.0228,  ...,  1.0990,  1.8425,  0.3876],
         [-2.9163, -0.2970, -0.1240,  ..., -1.3806,  1.8677,  0.8364],
         [-0.7654, -1.2803, -0.4555,  ..., -0.5180,  0.5751,  0.8081]],

        [[ 2.8438, -1.1507, -0.1720,  ...,  0.7628,  1.2780,  2.2947],
         [ 0.7472, -1.8190,  0.3092,  ...,  0.0908, -0.3583,  2.8664],
         [ 1.5884,  0.5041, -0.5914,  ...,  0.9934, -1.9225,  0.3489],
         [ 0.3765,  0.4048,  0.3921,  ..., -0.6353, -0.9512, -0.0740],
         [-0.4986,  1.1924,  1.2099,  ..., -0.3461, -1.1524,  1.6119],
         [ 0.5000,  0.8541, -0.9925,  ..., -0.5671, -1.0413, -0.3625]]],
       grad_fn=<EmbeddingBackward0>)


(None, torch.Size([2, 6, 768]))

In [137]:
position_ids = torch.arange(config.max_position_embeddings).expand((1, -1))
position_ids

tensor([[0, 1, 2, 3, 4, 5, 6, 7]])

In [138]:
position_ids = position_ids[:, : seq_length]
position_ids

tensor([[0, 1, 2, 3, 4, 5]])

In [139]:
position_embedding = position_embeddings(position_ids)
print(position_embedding), position_embedding.shape

tensor([[[ 1.0620,  0.5503, -0.3741,  ..., -0.4613,  0.8329, -0.7874],
         [-0.2094,  0.8066, -1.0739,  ..., -1.0512,  1.3952,  0.4911],
         [ 1.4946, -0.2448, -0.0059,  ..., -0.5901,  0.7230,  1.5968],
         [-0.1138, -1.2172,  0.2797,  ...,  0.8967, -0.3172, -0.4115],
         [-0.1085, -0.9684, -0.7007,  ...,  0.1512,  0.7826,  0.2522],
         [ 1.3071, -0.8792, -1.1434,  ..., -0.0795, -1.0724, -1.2303]]],
       grad_fn=<EmbeddingBackward0>)


(None, torch.Size([1, 6, 768]))

In [140]:
token_type_ids = torch.zeros(position_ids.size(), dtype=torch.long)
token_type_ids

tensor([[0, 0, 0, 0, 0, 0]])

In [141]:
buffered_token_type_ids = token_type_ids[:, :seq_length]
buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
token_type_ids = buffered_token_type_ids_expanded
token_type_ids, buffered_token_type_ids

(tensor([[0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0]]),
 tensor([[0, 0, 0, 0, 0, 0]]))

In [142]:
token_type_embedding = token_type_embeddings(token_type_ids)
print(token_type_embedding), token_type_embedding.shape

tensor([[[-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845]],

        [[-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845],
         [-0.0810,  0.0851, -0.4403,  ..., -0.6212,  0.0225,  0.2845]]],
       grad_fn=<EmbeddingBackward0>)


(None, torch.Size([2, 6, 768]))

In [143]:
embeddings = inputs_embeds + token_type_embedding + position_embedding
embeddings, embeddings.shape

(tensor([[[ 0.8792,  0.5919, -1.0635,  ..., -2.6145,  1.0133, -0.9017],
          [-1.3081,  0.3991, -2.2060,  ..., -1.9430,  0.3189,  2.2663],
          [ 1.6470, -0.4397, -0.1499,  ..., -0.1632,  0.9483,  1.1359],
          [-1.0012, -0.0658, -0.1833,  ...,  1.3745,  1.5478,  0.2607],
          [-3.1058, -1.1802, -1.2649,  ..., -1.8506,  2.6727,  1.3731],
          [ 0.4607, -2.0743, -2.0393,  ..., -1.2188, -0.4748, -0.1377]],
 
         [[ 3.8248, -0.5153, -0.9863,  ..., -0.3197,  2.1333,  1.7918],
          [ 0.4568, -0.9272, -1.2051,  ..., -1.5816,  1.0593,  3.6420],
          [ 3.0020,  0.3445, -1.0376,  ..., -0.2179, -1.1771,  2.2302],
          [ 0.1817, -0.7273,  0.2316,  ..., -0.3597, -1.2459, -0.2010],
          [-0.6881,  0.3092,  0.0689,  ..., -0.8161, -0.3473,  2.1486],
          [ 1.7261,  0.0600, -2.5762,  ..., -1.2679, -2.0913, -1.3083]]],
        grad_fn=<AddBackward0>),
 torch.Size([2, 6, 768]))

In [144]:
LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
norm_embeddings = LayerNorm(embeddings)
norm_embeddings, norm_embeddings.shape

(tensor([[[ 0.5604,  0.3906, -0.5873,  ..., -1.5035,  0.6396, -0.4917],
          [-0.7050,  0.3186, -1.2434,  ..., -1.0857,  0.2705,  1.4382],
          [ 0.9863, -0.2433, -0.0726,  ..., -0.0804,  0.5746,  0.6851],
          [-0.6119, -0.0720, -0.1398,  ...,  0.7594,  0.8594,  0.1165],
          [-1.7621, -0.6545, -0.7032,  ..., -1.0401,  1.5619,  0.8143],
          [ 0.2480, -1.2471, -1.2264,  ..., -0.7425, -0.3037, -0.1049]],
 
         [[ 2.2425, -0.2528, -0.5236,  ..., -0.1403,  1.2700,  1.0737],
          [ 0.2940, -0.5253, -0.6898,  ..., -0.9127,  0.6507,  2.1797],
          [ 1.7575,  0.2001, -0.6099,  ..., -0.1295, -0.6916,  1.3052],
          [ 0.0708, -0.4751,  0.1007,  ..., -0.2544, -0.7866, -0.1591],
          [-0.4311,  0.1662,  0.0223,  ..., -0.5078, -0.2270,  1.2679],
          [ 1.0954,  0.0811, -1.5238,  ..., -0.7273, -1.2285, -0.7519]]],
        grad_fn=<NativeLayerNormBackward0>),
 torch.Size([2, 6, 768]))

In [173]:
dropout = nn.Dropout(config.hidden_dropout_prob)
out_embeddings = dropout(norm_embeddings)
out_embeddings, out_embeddings.shape

(tensor([[[ 0.6227,  0.4340, -0.6525,  ..., -1.6706,  0.0000, -0.5463],
          [-0.7834,  0.3540, -1.3816,  ..., -1.2063,  0.3006,  1.5980],
          [ 1.0959, -0.2704, -0.0806,  ..., -0.0893,  0.6384,  0.7612],
          [-0.6799, -0.0800, -0.0000,  ...,  0.8438,  0.9549,  0.1294],
          [-1.9579, -0.7272, -0.7814,  ..., -1.1557,  1.7354,  0.9047],
          [ 0.2756, -1.3857, -1.3627,  ..., -0.8250, -0.3375, -0.1165]],
 
         [[ 2.4916, -0.2809, -0.5818,  ..., -0.1559,  1.4111,  1.1929],
          [ 0.3266, -0.5837, -0.7665,  ..., -0.0000,  0.0000,  0.0000],
          [ 1.9528,  0.2223, -0.6777,  ..., -0.1439, -0.7685,  1.4502],
          [ 0.0786, -0.5279,  0.1119,  ..., -0.2827, -0.8740, -0.1768],
          [-0.4790,  0.1847,  0.0248,  ..., -0.0000, -0.2522,  1.4087],
          [ 1.2171,  0.0901, -1.6931,  ..., -0.8081, -1.3651, -0.8354]]],
        grad_fn=<MulBackward0>),
 torch.Size([2, 6, 768]))

In [177]:
attention_mask = torch.ones(2, 8)
attention_mask

tensor([[1., 1., 1., 1., 1., 1., 1., 1.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

In [178]:
extended_attention_mask = attention_mask[:, None, None, :]
extended_attention_mask

tensor([[[[1., 1., 1., 1., 1., 1., 1., 1.]]],


        [[[1., 1., 1., 1., 1., 1., 1., 1.]]]])

In [227]:
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

inputs = tokenizer(["The capital of France is [MASK].", "The capital of"], return_tensors="pt", padding=True)
inputs

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': tensor([[ 101, 1996, 3007, 1997, 2605, 2003,  103, 1012,  102],
        [ 101, 1996, 3007, 1997,  102,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0]])}

In [193]:
batch = {
    "input_ids": [[101, 1996, 3007, 1997, 2605, 2003, 103, 1012, 102]]
}
special_tokens_mask = batch.pop("special_tokens_mask", None)
batch["input_ids"], batch["labels"] = numpy_mask_tokens(
    batch["input_ids"], special_tokens_mask=special_tokens_mask
)

In [202]:
import numpy as np

probability_matrix = np.full((1, 9), 0.15)
probability_matrix

array([[0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15]])

In [203]:
special_tokens_mask = np.array([[1, 0, 0, 0, 0, 1, 0, 0, 1]], dtype=np.bool)
special_tokens_mask

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  special_tokens_mask = np.array([[1, 0, 0, 0, 0, 1, 0, 0, 1]], dtype=np.bool)


array([[ True, False, False, False, False,  True, False, False,  True]])

In [205]:
probability_matrix[special_tokens_mask] = 0
probability_matrix

array([[0.  , 0.15, 0.15, 0.15, 0.15, 0.  , 0.15, 0.15, 0.  ]])

In [210]:
masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(np.bool)
masked_indices

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  masked_indices = np.random.binomial(1, probability_matrix, size=probability_matrix.shape).astype(np.bool)


array([[False, False, False, False, False, False,  True, False, False]])

In [223]:
attention_scores = torch.as_tensor([
    [0.1, 0.1, 0.1, 0.1],
    [0.1, 0.1, 0.1, 0.1],
    [0.1, 0.1, 0.1, 0.1],
    [0.1, 0.1, 0.1, 0.1],
])
attention_mask = torch.as_tensor([[0, 0, 0, -1000]])
extended_attention_mask = attention_mask[:, None, None, :]
extended_attention_mask

tensor([[[[    0,     0,     0, -1000]]]])

In [224]:
final_attention_scores = attention_scores + extended_attention_mask
final_attention_scores

tensor([[[[ 1.0000e-01,  1.0000e-01,  1.0000e-01, -9.9990e+02],
          [ 1.0000e-01,  1.0000e-01,  1.0000e-01, -9.9990e+02],
          [ 1.0000e-01,  1.0000e-01,  1.0000e-01, -9.9990e+02],
          [ 1.0000e-01,  1.0000e-01,  1.0000e-01, -9.9990e+02]]]])

In [225]:
attention_probs = nn.functional.softmax(final_attention_scores, dim=-1)
attention_probs

tensor([[[[0.3333, 0.3333, 0.3333, 0.0000],
          [0.3333, 0.3333, 0.3333, 0.0000],
          [0.3333, 0.3333, 0.3333, 0.0000],
          [0.3333, 0.3333, 0.3333, 0.0000]]]])

In [277]:
sequence = torch.rand((2, 4, 8), dtype=torch.float32, device="cuda")

batch_size, seq_length, dim = sequence.shape
mask_ratio = 0.25
sequence.shape

torch.Size([2, 4, 8])

In [278]:
noise = torch.rand(batch_size, seq_length, device="cuda")
noise.shape

torch.Size([2, 4])

In [279]:
ids_shuffle = torch.argsort(noise, dim=1)
ids_shuffle.shape, ids_shuffle

(torch.Size([2, 4]),
 tensor([[3, 1, 2, 0],
         [2, 3, 1, 0]], device='cuda:0'))

In [280]:
len_keep = int(seq_length * (1 - mask_ratio))
ids_keep = ids_shuffle[:, :len_keep]
ids_keep, ids_keep.shape

(tensor([[3, 1, 2],
         [2, 3, 1]], device='cuda:0'),
 torch.Size([2, 3]))

In [282]:
sequence_masked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, dim))
len_keep, sequence_masked, sequence_masked.shape, sequence

(3,
 tensor([[[0.4018, 0.7857, 0.0249, 0.1331, 0.1187, 0.9118, 0.3672, 0.7224],
          [0.7811, 0.7241, 0.9312, 0.2073, 0.8186, 0.9522, 0.2588, 0.8637],
          [0.7325, 0.7208, 0.3008, 0.2485, 0.1720, 0.9731, 0.7791, 0.8823]],
 
         [[0.4355, 0.2511, 0.7570, 0.3515, 0.3261, 0.3251, 0.1136, 0.0240],
          [0.0359, 0.6223, 0.4285, 0.9332, 0.0145, 0.5448, 0.8405, 0.0043],
          [0.1547, 0.1175, 0.1563, 0.5687, 0.1850, 0.6222, 0.7419, 0.8627]]],
        device='cuda:0'),
 torch.Size([2, 3, 8]),
 tensor([[[0.6399, 0.9781, 0.5286, 0.6497, 0.1382, 0.3921, 0.4539, 0.7756],
          [0.7811, 0.7241, 0.9312, 0.2073, 0.8186, 0.9522, 0.2588, 0.8637],
          [0.7325, 0.7208, 0.3008, 0.2485, 0.1720, 0.9731, 0.7791, 0.8823],
          [0.4018, 0.7857, 0.0249, 0.1331, 0.1187, 0.9118, 0.3672, 0.7224]],
 
         [[0.1781, 0.2082, 0.8804, 0.8122, 0.0999, 0.5284, 0.3716, 0.7465],
          [0.1547, 0.1175, 0.1563, 0.5687, 0.1850, 0.6222, 0.7419, 0.8627],
          [0.4355, 0.2511,

In [284]:
ids_restore = torch.argsort(ids_shuffle, dim=1)
ids_restore.shape, ids_restore

(torch.Size([2, 4]),
 tensor([[3, 1, 2, 0],
         [3, 2, 0, 1]], device='cuda:0'))

In [290]:
mask = torch.ones([batch_size, seq_length], device=sequence.device)
mask[:, :len_keep] = 0
mask, mask.shape

(tensor([[0., 0., 0., 1.],
         [0., 0., 0., 1.]], device='cuda:0'),
 torch.Size([2, 4]))

In [294]:
mask_masked = torch.gather(mask, dim=1, index=ids_restore)
mask_masked, mask_masked.shape

(tensor([[1., 0., 0., 0.],
         [1., 0., 0., 0.]], device='cuda:0'),
 torch.Size([2, 4]))

In [298]:
attention_mask = torch.as_tensor(
    [
        [1] * 3 + [0] * (seq_length - 3),
        [1] * 4 + [0] * (seq_length - 4)
    ]
    , device="cuda"
)
attention_mask, attention_mask.shape

(tensor([[1, 1, 1, 0],
         [1, 1, 1, 1]], device='cuda:0'),
 torch.Size([2, 4]))

In [301]:
attention_mask_masked = torch.gather(attention_mask, dim=1, index=ids_keep)
attention_mask_masked

tensor([[0, 1, 1],
        [1, 1, 1]], device='cuda:0')

In [303]:
extended_attention_mask = attention_mask_masked[:, None, None, :]
extended_attention_mask

tensor([[[[0, 1, 1]]],


        [[[1, 1, 1]]]], device='cuda:0')

In [304]:
extended_attention_mask = extended_attention_mask.to(dtype=torch.float32)
extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(torch.float32).min
extended_attention_mask

tensor([[[[-3.4028e+38, -0.0000e+00, -0.0000e+00]]],


        [[[-0.0000e+00, -0.0000e+00, -0.0000e+00]]]], device='cuda:0')

In [None]:
mask_masked = torch.gather(mask, dim=1, index=ids_restore)
mask_masked, mask_masked.shape