# PyTorch

In [31]:
import torch

PyTorch matrix multiplication (a batched 3-D tensor times a 2-D weight matrix)

In [39]:
# Seed the global RNG so the random tensors below are identical on every
# Restart & Run All (previously the values changed on each run).
SEED = 0
torch.manual_seed(SEED)

# Dimensions of the toy input used in the matmul demo below.
batch_size = 5
max_len = 20
emb_size = 32

# x has shape (batch_size, max_len, emb_size).
x = torch.randn(batch_size, max_len, emb_size)
# W is square, (emb_size, emb_size), so multiplying x by W keeps x's last dimension.
W = torch.randn(emb_size, emb_size)

In [45]:
# Method form of the matrix product: the last dimension of x (emb_size) is
# contracted with W, so the result keeps x's (5, 20, 32) shape.
z = x.matmul(W)
z.shape

torch.Size([5, 20, 32])

In [47]:
# Operator form of the same matrix product; the resulting shape is unchanged.
z = x @ W
z.shape

torch.Size([5, 20, 32])

In [49]:
from torch import nn

In [52]:
# Encoder/decoder Transformer with 3 encoder layers and 4 attention heads;
# all other hyperparameters are left at the nn.Transformer defaults
# (the printed repr shows 512-wide layers with a 2048-wide feed-forward).
transformer_model = nn.Transformer(
    num_encoder_layers=3,
    nhead=4,
)

In [53]:
# Bare last expression: shows the nested module tree of the Transformer.
transformer_model

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): 

# Transformers

In [1]:
import transformers

# BERT

In [54]:
# Checkpoint identifier passed to the Auto* loaders below.
# Names noted by the author: bert-base-uncased, distilbert-base-uncased.
model_path = 'bert-base-uncased'

In [55]:
# Load the configuration for model_path; AutoConfig picks the concrete config
# class (a BertConfig here, as the displayed output shows).
config = transformers.AutoConfig.from_pretrained(model_path)

In [56]:
# Inspect the loaded configuration (the output lists e.g. hidden_size 768,
# 12 hidden layers and 12 attention heads).
config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.11.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [57]:
# Build the model architecture described by `config`.
# NOTE(review): per the transformers documentation, from_config() instantiates
# the model from the configuration only and does not load pretrained weights.
# If pretrained weights are intended, use
# transformers.AutoModel.from_pretrained(model_path) instead — TODO confirm intent.
model = transformers.AutoModel.from_config(config)

In [58]:
# Show which concrete class AutoModel resolved to (BertModel, per the output).
print(type(model))

<class 'transformers.models.bert.modeling_bert.BertModel'>


In [59]:
# Bare last expression: shows the module tree (embeddings, then the encoder's
# list of BertLayer blocks).
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [8]:
# Load the tokenizer that belongs to model_path; files were downloaded on this
# run, as the progress bars in the output show.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [9]:
# Fixed sequence length passed as max_length to the tokenizer call further down.
MAX_LEN = 20

In [23]:
# Sample sentence fed to the tokenizer in the following cells.
text = 'I want to go home.'

In [24]:
# Split the sentence into tokens only; the output shows lower-cased tokens and
# no special tokens. Print the token list and its length on separate lines.
tokenized_text = tokenizer.tokenize(text)
print(tokenized_text, len(tokenized_text), sep='\n')

['i', 'want', 'to', 'go', 'home', '.']
6


In [13]:
# Encode `text` as PyTorch tensors of exactly MAX_LEN ids: longer inputs are
# truncated and shorter ones padded. The output holds input_ids,
# token_type_ids and attention_mask.
encoded_input = tokenizer(
    text,
    max_length=MAX_LEN,
    padding='max_length',
    truncation='longest_first',
    return_tensors='pt',
)
encoded_input

{'input_ids': tensor([[ 101, 1045, 2215, 2000, 2175, 2188, 1012,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [14]:
# Inference-only forward pass.
# model.eval() turns off dropout (the model was built with from_config and is
# still in training mode), so repeated runs give the same activations for the
# same weights. torch.no_grad() skips building the autograd graph, which is
# not needed here and only costs memory.
model.eval()
with torch.no_grad():
    output = model(**encoded_input)

In [17]:
# Shape of the final hidden states: (1, 20, 768) in the output, i.e. one input
# sentence, MAX_LEN positions, and the config's hidden_size.
output.last_hidden_state.shape

torch.Size([1, 20, 768])