# Introduction
There have been plenty of well-organized tutorials elaborating on details of the Transformer. This one is inpired by and based on annotated-transformer from the Harvard NLP group, which is a great tutorial showing everything you need to reproduce the transformer model from paper. However, from a beginner's standpoint, it is sometimes easy to get lost when stuck with an unfamiliar concept and need to go for further readings. In this notebook, I try to alleviate this by organizing the codes in a top-down manner. And instead of using texts from the original paper of transfomer, I will explain using my own words and provide links to useful resources for each module if necessary.

In [1]:
import torch
import torch.nn as nn
from tqdm import tqdm
from copy import deepcopy
from feedforward import FeedForwardNetwork
from multiheadattention import MultiHeadAttention
from utils import clone, PositionalEncoding, Embedding, get_subsequent_mask, rate, greedy_decode, Generator
import torch.nn.functional as F

# A simple task
Firstly, we want to know what our task is. We take the same task as in annotated-transformer, which is to memorize the sequence of numbers from 1 to 10. Therefore, the size of our vocabulary should be 10. 

# Overview of the model

In [2]:
class FullModel(nn.Module):
    def __init__(
            self, 
            num_encoder=6, 
            num_decoder=6, 
            d_model=512, 
            vocab_size=13,
            num_head=6,
        ):
        super().__init__()
        c = deepcopy
        ffn = FeedForwardNetwork(d_model)
        attn = MultiHeadAttention(d_model=d_model, num_head=num_head)
        self.d_model = d_model
        self.shared = Embedding(vocab=vocab_size, d_model=d_model)
        self.model = EncoderDecoder(
            Encoder(EncoderLayer(c(attn), c(ffn)), num_layers=num_encoder),
            Decoder(DecoderLayer(c(attn), c(attn), c(ffn)), num_layers=num_decoder),
            nn.Sequential(c(self.shared),
                          PositionalEncoding(d_model=d_model)),
            nn.Sequential(c(self.shared),
                          PositionalEncoding(d_model=d_model)),
            Generator(d_model=d_model, vocab_size=vocab_size),
        )

    def forward(self, src_input, tgt_input, src_mask, tgt_mask):
        logits = self.model(src_input, tgt_input, src_mask, tgt_mask)
        sequence = F.linear(logits, self.shared.embedder.weight)
        return (logits, sequence)

    def generate(self, x):
        return self.model.generator(x)

In [3]:
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, embedder, tgt_embedder, generator):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.embedder = embedder
        self.tgt_embedder = tgt_embedder
        self.generator = generator

    def forward(self, src_input, tgt_input, src_mask, tgt_mask):
        memory = self.encode(src_input, src_mask)
        return self.decode(memory, src_mask, tgt_input, tgt_mask)

    def encode(self, src_input, src_mask):
        return self.encoder(self.embedder(src_input), src_mask)

    def decode(self, memory, src_mask, tgt_input, tgt_mask):
        return self.decoder(memory, src_mask, self.tgt_embedder(tgt_input), tgt_mask)

In [4]:
class Encoder(nn.Module):
    def __init__(self, layer, num_layers):
        super().__init__()
        self.layer_list = clone(layer, num_layers)

    def forward(self, src_embed, src_mask):
        x = src_embed
        for layer in self.layer_list:
            x = layer(x, src_mask)
        return x

class EncoderLayer(nn.Module):
    def __init__(self, attn, ffn):
        super().__init__()
        self.attn = attn
        self.ffn = ffn

    def forward(self, x, mask):
        x = self.attn(x, x, x, mask)
        x = self.ffn(x)
        return x

In [5]:
class Decoder(nn.Module):
    def __init__(self, layer, num_layers):
        super().__init__()
        self.layer_list = clone(layer, num_layers)

    def forward(self, memory, src_mask, tgt_embed, tgt_mask):
        x = tgt_embed
        for layer in self.layer_list:
            x = layer(memory, src_mask, tgt_embed, tgt_mask)
        return x


class DecoderLayer(nn.Module):
    def __init__(self, attn, cross_attn, ffn):
        super().__init__()
        self.attn = attn
        self.cross_attn = cross_attn
        self.ffn = ffn

    def forward(self, m, src_mask, x, tgt_mask):
        x = self.attn(x, x, x, tgt_mask)
        x = self.cross_attn(x, m, m, src_mask, cross=True)
        x = self.ffn(x)
        return x

In [6]:
model = FullModel(
    num_encoder=2,
    num_decoder=2,
    d_model=512,
    vocab_size=11,
    num_head=8
).cuda()
for p in model.parameters():
    if p.dim() > 1:
        print(p)
        nn.init.xavier_uniform_(p)

Parameter containing:
tensor([[ 0.2702,  0.9665,  0.8689,  ...,  0.7461, -1.0133, -1.5127],
        [ 1.6205, -0.3668,  0.3662,  ...,  1.4504,  0.2733,  0.2208],
        [-0.1541,  0.5831,  0.2178,  ..., -0.5265, -2.2544,  0.5922],
        ...,
        [ 1.0133, -0.1850, -0.9212,  ..., -0.0677, -0.0754,  0.2580],
        [-1.3319, -0.5734, -0.7726,  ...,  1.0976,  1.7588, -0.5740],
        [ 0.5696,  0.2808, -0.0281,  ...,  0.7650, -0.3095, -0.6587]],
       device='cuda:0', requires_grad=True)
Parameter containing:
tensor([[-0.0121,  0.0141, -0.0337,  ...,  0.0385,  0.0301, -0.0140],
        [ 0.0023, -0.0131,  0.0261,  ...,  0.0220,  0.0099, -0.0161],
        [ 0.0038, -0.0330, -0.0116,  ...,  0.0045,  0.0382, -0.0126],
        ...,
        [-0.0313,  0.0436, -0.0388,  ..., -0.0292,  0.0174,  0.0217],
        [ 0.0297,  0.0240,  0.0338,  ...,  0.0068, -0.0381, -0.0416],
        [-0.0107, -0.0324,  0.0367,  ..., -0.0309, -0.0391,  0.0053]],
       device='cuda:0', requires_grad=True)


In [7]:
mock_input = torch.LongTensor([[0, 2, 2, 2, 4]])
# decoder_input = torch.LongTensor([[5, 0, 1, 2, 3]])
attention_mask = torch.ones(1, 1, mock_input.size(-1))
greedy_decode(model, mock_input.cuda(), attention_mask.cuda(), 5, 0)

tensor([-0.5635], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([5], device='cuda:0')
tensor([-0.1844], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([1], device='cuda:0')
tensor([-0.2768], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([2], device='cuda:0')
tensor([-0.0318], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([4], device='cuda:0')


tensor([[0, 5, 1, 2, 4]], device='cuda:0')

# Test our model (inference)

In [8]:
mock_input = torch.LongTensor([[0, 1, 1, 1, 1, 1, 1, 2, 3, 4]])
decoder_input = torch.LongTensor([[0, 0, 1, 1, 1, 1, 1, 1, 2, 3]])
attention_mask = torch.ones(1, 1, mock_input.size(-1))

output = model(mock_input.cuda(), decoder_input.cuda(), attention_mask.cuda(), get_subsequent_mask(mock_input.size(-1)).unsqueeze(dim=0).cuda())
output[1].shape

torch.Size([1, 10, 11])

In [9]:
generator = F.linear(output[0], model.shared.embedder.weight)

In [10]:
pred = generator.argmax(dim=-1)

In [11]:
gold = torch.zeros(10, dtype=torch.long)
gold

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
pred

tensor([[ 8,  8,  8,  9,  8,  9, 10,  9, 10,  8]], device='cuda:0')

# Training
I will directly use tools from pytorch to train the model.
Here are the things we need:
- a module to manage and split our data -> Dataset and DataLoader
- a module to optimize our model based on the loss -> optimizer
- a module to manage the learning rate we will use -> scheduler

In [13]:
from torch.utils.data import DataLoader, Dataset
from torch.optim import Adam
from torch.optim.lr_scheduler import LambdaLR

In [14]:
data = torch.randint(1, 11, (20000, 10))
data[:, 0] = 1
src = data.requires_grad_(False).clone().detach()
tgt = data.requires_grad_(False).clone().detach()

In [15]:
# loss_fct = nn.KLDivLoss(reduction='sum')
from utils import LabelSmoothing
# loss_fct = LabelSmoothing(size=5, smoothing=0.1)
loss_fct = nn.CrossEntropyLoss()
optimizer = Adam(
    model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9
)
scheduler = LambdaLR(optimizer, lr_lambda=lambda step: rate(
    step, model_size=model.d_model, factor=1, warmup=400
))

In [16]:
a = torch.rand(2, 6, 5)
test_tgt = torch.randint(0, 5, (2, 6))
test_loss = loss_fct(a.view(-1, a.size(-1)), test_tgt.flatten())
test_loss

tensor(1.7257)

In [17]:
class CopyDataset(Dataset):
    def __init__(self, raw_data):
        super().__init__()
        self.data = raw_data
        # self.bos = torch.tensor([0])
        # self.eos = torch.tensor([6])
        self.pad = torch.tensor([11])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        data_item = self.data[index]
        src = data_item
        # tgt = torch.cat([self.bos, data_item[:-1]], dim=-1)
        # tgt_y = torch.cat([data_item[1:], self.eos], dim=-1)
        tgt = data_item[:-1]
        tgt_y = data_item[1:]

        encoder_attention_mask = torch.ones(1, 1).type_as(src).masked_fill(src == self.pad, 0)
        decoder_pad_mask = torch.ones(1, 1).type_as(tgt).masked_fill(tgt == self.pad, 0)
        decoder_subsequent_mask = get_subsequent_mask(tgt.size(-1))
        decoder_attention_mask = decoder_pad_mask & decoder_subsequent_mask
        return {
            'encoder_input_ids': src,
            'decoder_input_ids': tgt,
            'target_ids': tgt_y,
            'encoder_attention_mask': encoder_attention_mask,
            'decoder_attention_mask': decoder_attention_mask
        }

def split_data(data):
    train_size = int(len(data) * 0.8)
    val_size = len(data) - train_size
    train, val = torch.utils.data.random_split(data, [train_size, val_size])

    train_dataset = CopyDataset(train)
    val_dataset = CopyDataset(val)
    return train_dataset, val_dataset

In [18]:
train_dataset, val_dataset = split_data(data)

In [19]:
train_dataset[0]

{'encoder_input_ids': tensor([1, 2, 6, 5, 1, 6, 5, 9, 4, 2]),
 'decoder_input_ids': tensor([1, 2, 6, 5, 1, 6, 5, 9, 4, 2]),
 'target_ids': tensor([1, 2, 6, 5, 1, 6, 5, 9, 4, 2]),
 'encoder_attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'decoder_attention_mask': tensor([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [20]:
train_loader = DataLoader(train_dataset, batch_size=80, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=20, shuffle=True)

In [21]:
for epoch in range(20):
    pbar = tqdm(total=20)
    print("Epoch #{}".format(epoch))
    model.train()
    for batch in train_loader:
        encoder_input_ids = batch['encoder_input_ids'].cuda()
        decoder_input_ids = batch['decoder_input_ids'].cuda()
        target_ids = batch['target_ids'].cuda()
        encoder_attention_mask = batch['encoder_attention_mask'].cuda()
        decoder_attention_mask = batch['decoder_attention_mask'].cuda()
        logits, pred = model(encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask)

        output = model.generate(logits)

        loss = loss_fct(output.view(-1, output.size(-1)), target_ids.view(target_ids.flatten().size(0)))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        pbar.update(1)
        pbar.set_postfix({'loss': loss, 'lr': optimizer.param_groups[0]["lr"]}, refresh=True)
    model.eval()
    loss = 0
    for batch in val_loader:
        encoder_input_ids = batch['encoder_input_ids'].cuda()
        decoder_input_ids = batch['decoder_input_ids'].cuda()
        target_ids = batch['target_ids'].cuda()
        encoder_attention_mask = batch['encoder_attention_mask'].cuda()
        decoder_attention_mask = batch['decoder_attention_mask'].cuda()
        logits, pred = model(encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask)
    
        loss += loss_fct(pred.view(-1, pred.size(-1)), target_ids.view(target_ids.flatten().size(0))).detach()
    pbar.set_postfix({'val_loss': loss / len(val_loader)}, refresh=True)
        # print(optimizer.param_groups[0]["lr"])

 35%|███▌      | 7/20 [00:00<00:00, 33.56it/s, loss=tensor(2.9739, device='cuda:0', grad_fn=<NllLossBackward0>), lr=1.93e-5]

Epoch #0


200it [00:06, 28.73it/s, val_loss=tensor(4.3297, device='cuda:0')]                                                           



Epoch #1


200it [00:07, 28.14it/s, val_loss=tensor(4.3977, device='cuda:0')]-05, device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000572][A
 35%|███▌      | 7/20 [00:00<00:00, 36.74it/s, loss=tensor(4.0692e-05, device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.0011]

Epoch #2


200it [00:06, 28.69it/s, val_loss=tensor(4.6182, device='cuda:0')]                                                               



Epoch #3


200it [00:12, 15.71it/s, val_loss=tensor(5.2185, device='cuda:0')]-07, device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000898][A
  5%|▌         | 1/20 [00:00<00:02,  8.89it/s, loss=tensor(2.1458e-08, device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000781]

Epoch #4


200it [00:19, 10.47it/s, val_loss=tensor(5.2253, device='cuda:0')]                                                                


Epoch #5


200it [00:08, 23.56it/s, val_loss=tensor(5.3536, device='cuda:0')]
 35%|███▌      | 7/20 [00:00<00:00, 37.98it/s, loss=tensor(1.4901e-10, device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000636]

Epoch #6


200it [00:14, 13.58it/s, val_loss=tensor(5.4419, device='cuda:0')]                                                                


Epoch #7


200it [00:19, 10.20it/s, val_loss=tensor(5.4324, device='cuda:0')]
 10%|█         | 2/20 [00:00<00:01, 12.47it/s, loss=tensor(0., device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000552]

Epoch #8


200it [00:19, 10.32it/s, val_loss=tensor(5.4596, device='cuda:0')]                                                                



Epoch #9


200it [00:19, 10.50it/s, val_loss=tensor(5.4570, device='cuda:0')]ice='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000521][A
 10%|█         | 2/20 [00:00<00:01, 11.64it/s, loss=tensor(0., device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000494]

Epoch #10


200it [00:20,  9.60it/s, val_loss=tensor(5.4795, device='cuda:0')]                                                                


Epoch #11


200it [00:20,  9.54it/s, val_loss=tensor(5.5764, device='cuda:0')]
 10%|█         | 2/20 [00:00<00:01, 12.36it/s, loss=tensor(0., device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000451]

Epoch #12


200it [00:17, 11.30it/s, val_loss=tensor(5.6016, device='cuda:0')]                                                        



Epoch #13


200it [00:07, 28.51it/s, val_loss=tensor(5.5962, device='cuda:0')]ice='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000433][A
 35%|███▌      | 7/20 [00:00<00:00, 40.19it/s, loss=tensor(0., device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000417]

Epoch #14


200it [00:06, 29.36it/s, val_loss=tensor(5.5943, device='cuda:0')]                                                               



Epoch #15


200it [00:06, 29.35it/s, val_loss=tensor(5.5868, device='cuda:0')]ice='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000403][A
 30%|███       | 6/20 [00:00<00:00, 38.16it/s, loss=tensor(0., device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.00039] 

Epoch #16


200it [00:06, 30.05it/s, val_loss=tensor(5.5923, device='cuda:0')]                                                        



Epoch #17


200it [00:06, 29.81it/s, val_loss=tensor(5.5911, device='cuda:0')]ice='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000379][A
 35%|███▌      | 7/20 [00:00<00:00, 36.35it/s, loss=tensor(0., device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000368]

Epoch #18


200it [00:06, 30.63it/s, val_loss=tensor(5.5891, device='cuda:0')]                                                        



Epoch #19


 35%|███▌      | 7/20 [00:00<00:00, 36.26it/s, loss=tensor(0., device='cuda:0', grad_fn=<NllLossBackward0>), lr=0.000358][A



In [22]:
batch = next(iter(val_loader))

In [23]:
print(optimizer.param_groups[0]["lr"])

0.0022097086912079614


In [24]:
batch['target_ids'][0], batch['decoder_input_ids'][0], batch['encoder_input_ids'][0]

(tensor([1, 9, 5, 5, 6, 9, 1, 4, 7, 3]),
 tensor([1, 9, 5, 5, 6, 9, 1, 4, 7, 3]),
 tensor([1, 9, 5, 5, 6, 9, 1, 4, 7, 3]))

In [25]:
batch['decoder_input_ids'][0]

tensor([1, 9, 5, 5, 6, 9, 1, 4, 7, 3])

In [26]:
pred[0].shape

torch.Size([10, 11])

In [27]:
loss = 0
for i in range(pred.size(0)):
    loss += loss_fct(pred[i], target_ids[i])
loss = loss / pred.size(0)
loss, loss_fct(pred.view(-1, pred.size(-1)), target_ids.view(target_ids.flatten().size(0)))

(tensor(3.1544, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(3.1544, device='cuda:0', grad_fn=<NllLossBackward0>))

In [28]:
loss_fct(pred.view(-1, pred.size(-1)), target_ids.view(target_ids.flatten().size(0)))

tensor(3.1544, device='cuda:0', grad_fn=<NllLossBackward0>)

In [29]:
target_ids.view(target_ids.flatten().size(0)).shape

torch.Size([200])

In [30]:
torch.zeros()

TypeError: zeros() received an invalid combination of arguments - got (), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


In [31]:
target_ids.view(target_ids.flatten().size(0))[:20]

tensor([ 1,  7,  7,  7,  8,  1,  2, 10,  4,  9,  1,  5,  6,  8,  6,  4,  4,  6,
         7,  8], device='cuda:0')

In [32]:
loss, loss_fct(torch.cat([item for item in pred]), torch.cat([item for item in target_ids]))

(tensor(3.1544, device='cuda:0', grad_fn=<DivBackward0>),
 tensor(3.1544, device='cuda:0', grad_fn=<NllLossBackward0>))

In [33]:
pred.view(-1, pred.size(-1))[10:12]

tensor([[-1.7735,  1.6937,  1.8926, -1.2290, -2.1162,  0.6710, -0.8189, -1.3710,
          1.7014,  0.1233, -0.0146],
        [-2.7066,  0.5713,  0.7198, -0.1147, -0.8357, -1.7156, -0.9016,  0.3126,
         -0.6881,  1.7899,  2.4643]], device='cuda:0',
       grad_fn=<SliceBackward0>)

In [34]:
mock_input = torch.LongTensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]])
# decoder_input = torch.LongTensor([[5, 0, 1, 2, 3]])
attention_mask = torch.ones(1, 1, mock_input.size(-1))
# decoder_attention_mask = get_subsequent_mask(mock_input.size(-1)).unsqueeze(dim=0)

In [35]:
# decoder_attention_mask.shape

In [36]:
model.eval()

FullModel(
  (shared): Embedding(
    (embedder): Embedding(11, 512)
  )
  (model): EncoderDecoder(
    (encoder): Encoder(
      (layer_list): ModuleList(
        (0): EncoderLayer(
          (attn): MultiHeadAttention(
            (layernorm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
            (w_q): Linear(in_features=512, out_features=512, bias=True)
            (w_k): Linear(in_features=512, out_features=512, bias=True)
            (w_v): Linear(in_features=512, out_features=512, bias=True)
            (w_o): Linear(in_features=512, out_features=512, bias=True)
          )
          (ffn): FeedForwardNetwork(
            (layernorm): LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
            (ffn): Sequential(
              (0): Linear(in_features=512, out_features=2048, bias=True)
              (1): ReLU()
              (2): Dropout(p=0.5, inplace=False)
              (3): Linear(in_features=2048, out_features=512, bias=True)
          

In [37]:
attention_mask, mock_input

(tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]]),
 tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]))

In [38]:
greedy_decode(model, mock_input.cuda(), attention_mask.cuda(), 10, 0)

tensor([-0.3018], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([9], device='cuda:0')
tensor([0.], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([9], device='cuda:0')
tensor([0.], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([9], device='cuda:0')
tensor([0.], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([9], device='cuda:0')
tensor([0.], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([9], device='cuda:0')
tensor([0.], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([9], device='cuda:0')
tensor([0.], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([9], device='cuda:0')
tensor([0.], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([9], device='cuda:0')
tensor([0.], device='cuda:0', grad_fn=<MaxBackward0>)
tensor([9], device='cuda:0')


tensor([[0, 9, 9, 9, 9, 9, 9, 9, 9, 9]], device='cuda:0')

In [39]:
model.shared.embedder.weight

Parameter containing:
tensor([[-0.0083, -0.0345,  0.0599,  ..., -0.0130,  0.0741, -0.0066],
        [ 0.0338,  0.0370, -0.0141,  ...,  0.0455,  0.0997,  0.0742],
        [ 0.0604, -0.0374,  0.0189,  ..., -0.0454,  0.0603, -0.0203],
        ...,
        [ 0.0464, -0.0650,  0.0247,  ...,  0.0020,  0.0803,  0.0235],
        [ 0.0016,  0.0587,  0.0922,  ...,  0.0712, -0.0785, -0.0860],
        [-0.0795,  0.0527, -0.0127,  ...,  0.0185,  0.0922,  0.1018]],
       device='cuda:0', requires_grad=True)

