In [1]:
import torch

In [2]:
a = torch.tensor(2.0, requires_grad=True)
b = torch.tensor(3.0, requires_grad=True)

In [3]:
c = a * b

In [4]:
c

tensor(6., grad_fn=<MulBackward0>)

In [10]:
print(a), print(b)

tensor(2., requires_grad=True)
tensor(3., requires_grad=True)


(None, None)

In [11]:
def c_hook(grad):
    print(grad)
    return grad + 2

In [12]:
c.register_hook(c_hook)

<torch.utils.hooks.RemovableHandle at 0x7f29e0e7c8b0>

In [13]:
c.register_hook(lambda grad : print(grad))

<torch.utils.hooks.RemovableHandle at 0x7f29e0e7cfa0>

In [17]:
c.retain_grad() # save the gradients on the forward and backward graph on intermediate nodes

In [5]:
c.backward()

In [6]:
d = torch.tensor(4.0, requires_grad=True)

In [18]:
d.register_hook(lambda grad: grad + 100)

<torch.utils.hooks.RemovableHandle at 0x7f29e0e7ca60>

In [7]:
e = c * d

In [8]:
e

tensor(24., grad_fn=<MulBackward0>)

In [19]:
e.retain_grad()

In [20]:
e.register_hook(lambda grad: grad * 2)

<torch.utils.hooks.RemovableHandle at 0x7f29e0e7d180>

In [22]:
import torch.nn as nn

In [24]:
class MyMultiply(nn.Module):
    def __init__(self):
        super(MyMultiply, self).__init__()
    @staticmethod
    def forward(a, b):
        return a * b

In [27]:
def backward_hook(module, grad_input, grad_output):
    print('module:', module)
    print('grad_input', grad_input)
    print('grad_output', grad_output)

In [33]:
my_multiply = MyMultiply()

In [34]:
my_multiply.register_full_backward_hook(backward_hook)

<torch.utils.hooks.RemovableHandle at 0x7f29bf89a3b0>

In [35]:
c = my_multiply(a, b)
c.backward()

module: MyMultiply()
grad_input (tensor(3.), tensor(2.))
grad_output (tensor(1.),)


In [36]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

In [145]:
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForMaskedLM.from_pretrained('roberta-base')

In [146]:
sentence = "A pleasure to watch. <mask> <mask> great."

In [147]:
token_ids = tokenizer.encode(sentence, return_tensors="pt")
token_ids

tensor([[    0,   250, 10483,     7,  1183,     4, 50264, 50264,   372,     4,
             2]])

In [148]:
masked_position = (token_ids.squeeze() == tokenizer.mask_token_id).nonzero().flatten()

In [149]:
masked_position

tensor([6, 7])

In [150]:
model(token_ids)

MaskedLMOutput(loss=None, logits=tensor([[[32.8936, -3.9042, 18.1085,  ...,  2.8312,  5.9604, 10.9170],
         [ 4.8774, -3.1897, 13.4099,  ..., -0.6604,  2.7859,  5.7515],
         [ 0.8211, -3.8939,  6.6889,  ..., -0.3324,  0.0727, -5.1507],
         ...,
         [-3.0926, -5.2922,  8.6832,  ..., -2.1439, -1.6686, -0.8414],
         [17.6368, -4.6497, 18.7583,  ..., -0.1790,  2.7923,  6.2334],
         [12.5953, -4.0732, 30.0292,  ..., -1.2050, -0.8978,  5.9343]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [151]:
with torch.no_grad():
    output = model(token_ids)

In [152]:
output

MaskedLMOutput(loss=None, logits=tensor([[[32.8936, -3.9042, 18.1085,  ...,  2.8312,  5.9604, 10.9170],
         [ 4.8774, -3.1897, 13.4099,  ..., -0.6604,  2.7859,  5.7515],
         [ 0.8211, -3.8939,  6.6889,  ..., -0.3324,  0.0727, -5.1507],
         ...,
         [-3.0926, -5.2922,  8.6832,  ..., -2.1439, -1.6686, -0.8414],
         [17.6368, -4.6497, 18.7583,  ..., -0.1790,  2.7923,  6.2334],
         [12.5953, -4.0732, 30.0292,  ..., -1.2050, -0.8978,  5.9343]]]), hidden_states=None, attentions=None)

In [153]:
last_hidden_state = output[0].squeeze()

In [154]:
last_hidden_state

tensor([[32.8936, -3.9042, 18.1085,  ...,  2.8312,  5.9604, 10.9170],
        [ 4.8774, -3.1897, 13.4099,  ..., -0.6604,  2.7859,  5.7515],
        [ 0.8211, -3.8939,  6.6889,  ..., -0.3324,  0.0727, -5.1507],
        ...,
        [-3.0926, -5.2922,  8.6832,  ..., -2.1439, -1.6686, -0.8414],
        [17.6368, -4.6497, 18.7583,  ..., -0.1790,  2.7923,  6.2334],
        [12.5953, -4.0732, 30.0292,  ..., -1.2050, -0.8978,  5.9343]])

In [155]:
list_of_list = []
for index, mask_index in enumerate(masked_position):
    print(f"index: {index}, mask_index: {mask_index}")
    mask_hidden_state = last_hidden_state[mask_index]
    idx = torch.topk(mask_hidden_state, k = 5, dim=0)[1]
    print(f"idx: {idx}")
    words = [tokenizer.decode(x.item()).strip() for x in idx]
    list_of_list.append(words)
    print(f"Mask: {index+1}, Guesses: {words}")

index: 0, mask_index: 6
idx: tensor([ 85,  91, 280, 152, 252])
Mask: 1, Guesses: ['It', 'He', 'That', 'This', 'They']
index: 1, mask_index: 7
idx: tensor([  18,   16,   21, 1326,   32])
Mask: 2, Guesses: ["'s", 'is', 'was', 'looks', 'are']


In [156]:
best_guess = ""
for j in list_of_list:
    best_guess = best_guess+" "+j[0]

In [157]:
best_guess

" It 's"

In [114]:
from datasets import load_dataset

In [116]:
sst2 = load_dataset("glue","sst2")

Found cached dataset glue (/home/yz709/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [118]:
k_16_val = sst2['validation'][:16]

In [119]:
k_16_val

{'sentence': ["it 's a charming and often affecting journey . ",
  'unflinchingly bleak and desperate ',
  'allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . ',
  "the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . ",
  "it 's slow -- very , very slow . ",
  'although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . ',
  'a sometimes tedious film . ',
  "or doing last year 's taxes with your ex-wife . ",
  "you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance . ",
  "in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey . ",
  'the mesmerizing performances of the leads keep the film grounded and keep the audience riveted . ',
  'it takes a strange kind of laziness to wa