In [1]:
import torch.nn as nn
from transformers import AutoModel
class CustomModel(nn.Module):
    def __init__(self):
        super(CustomModel, self).__init__()
        
        self.base_model = AutoModel.from_pretrained('bert-base-uncased', output_hidden_states = True, output_attentions=True)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, 3)
        
    def forward(self, input_ids, token_type_ids, attn_mask):
        outputs = self.base_model(input_ids, token_type_ids=token_type_ids, attention_mask=attn_mask)
        outputs = self.dropout(outputs[1])
        outputs = self.linear(outputs)
        
        return outputs

model = CustomModel()
model.to('cuda')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CustomModel(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

In [2]:
from copy import deepcopy
def print_layers_and_return_weights(model):
    parameters = deepcopy(model.state_dict())
    print("Model's state_dict:")
    for param_tensor in parameters:
        print(param_tensor, "\t", parameters[param_tensor].size())
    print()
    return parameters


def print_output_innerbert(model, text):
    encoded_input = tokenizer(text, return_tensors='pt').to('cuda')
    output = model(encoded_input['input_ids'], encoded_input['token_type_ids'], encoded_input['attention_mask'])
    print("Text: ", text)
    print("Encoded Input Length: ", len(encoded_input['input_ids'][0]))
    print("Encoded Input: ", encoded_input['input_ids'][0])
    print("Output Size: ", output[0].shape)
    print("Pooled Output Size: ", output[1].shape)
    print("\nOutput Encodings: ", output[0])
    print("\nOutput Pooled Vector: ", output[1])

def print_output_whole(model, text):
    encoded_input = tokenizer(text, return_tensors='pt').to('cuda')
    output = model(encoded_input['input_ids'], encoded_input['token_type_ids'], encoded_input['attention_mask'])
    print("Text: ", text)
    print("Encoded Input Length: ", len(encoded_input['input_ids'][0]))
    print("Encoded Input: ", encoded_input['input_ids'][0])
    print("Output Size: ", output.shape)
    print("Output: ", output)
    """
    print("Output Size: ", output[0].shape)
    print("Pooled Output Size: ", output[1].shape)
    print("\nOutput Encodings: ", output[0])
    print("\nOutput Pooled Vector: ", output[1])
        """

In [3]:
params = print_layers_and_return_weights(model.base_model)

Model's state_dict:
embeddings.position_ids 	 torch.Size([1, 512])
embeddings.word_embeddings.weight 	 torch.Size([30522, 768])
embeddings.position_embeddings.weight 	 torch.Size([512, 768])
embeddings.token_type_embeddings.weight 	 torch.Size([2, 768])
embeddings.LayerNorm.weight 	 torch.Size([768])
embeddings.LayerNorm.bias 	 torch.Size([768])
encoder.layer.0.attention.self.query.weight 	 torch.Size([768, 768])
encoder.layer.0.attention.self.query.bias 	 torch.Size([768])
encoder.layer.0.attention.self.key.weight 	 torch.Size([768, 768])
encoder.layer.0.attention.self.key.bias 	 torch.Size([768])
encoder.layer.0.attention.self.value.weight 	 torch.Size([768, 768])
encoder.layer.0.attention.self.value.bias 	 torch.Size([768])
encoder.layer.0.attention.output.dense.weight 	 torch.Size([768, 768])
encoder.layer.0.attention.output.dense.bias 	 torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.weight 	 torch.Size([768])
encoder.layer.0.attention.output.LayerNorm.bias 	 torch.Si

In [4]:
params['pooler.dense.weight']

tensor([[-0.0013, -0.0381, -0.0158,  ...,  0.0244, -0.0008,  0.0240],
        [ 0.0020,  0.0151,  0.0033,  ...,  0.0180, -0.0023,  0.0231],
        [-0.0386,  0.0145,  0.0621,  ...,  0.0374, -0.0105, -0.0395],
        ...,
        [-0.0111,  0.0136,  0.0541,  ...,  0.0666,  0.0017, -0.0090],
        [ 0.0001,  0.0024, -0.0125,  ...,  0.0046, -0.0014, -0.0079],
        [ 0.0415,  0.0751,  0.0305,  ...,  0.0317,  0.0479,  0.0080]],
       device='cuda:0')

In [5]:
params_whole = print_layers_and_return_weights(model)

Model's state_dict:
base_model.embeddings.position_ids 	 torch.Size([1, 512])
base_model.embeddings.word_embeddings.weight 	 torch.Size([30522, 768])
base_model.embeddings.position_embeddings.weight 	 torch.Size([512, 768])
base_model.embeddings.token_type_embeddings.weight 	 torch.Size([2, 768])
base_model.embeddings.LayerNorm.weight 	 torch.Size([768])
base_model.embeddings.LayerNorm.bias 	 torch.Size([768])
base_model.encoder.layer.0.attention.self.query.weight 	 torch.Size([768, 768])
base_model.encoder.layer.0.attention.self.query.bias 	 torch.Size([768])
base_model.encoder.layer.0.attention.self.key.weight 	 torch.Size([768, 768])
base_model.encoder.layer.0.attention.self.key.bias 	 torch.Size([768])
base_model.encoder.layer.0.attention.self.value.weight 	 torch.Size([768, 768])
base_model.encoder.layer.0.attention.self.value.bias 	 torch.Size([768])
base_model.encoder.layer.0.attention.output.dense.weight 	 torch.Size([768, 768])
base_model.encoder.layer.0.attention.output.dense

In [6]:
params_whole['base_model.pooler.dense.weight']

tensor([[-0.0013, -0.0381, -0.0158,  ...,  0.0244, -0.0008,  0.0240],
        [ 0.0020,  0.0151,  0.0033,  ...,  0.0180, -0.0023,  0.0231],
        [-0.0386,  0.0145,  0.0621,  ...,  0.0374, -0.0105, -0.0395],
        ...,
        [-0.0111,  0.0136,  0.0541,  ...,  0.0666,  0.0017, -0.0090],
        [ 0.0001,  0.0024, -0.0125,  ...,  0.0046, -0.0014, -0.0079],
        [ 0.0415,  0.0751,  0.0305,  ...,  0.0317,  0.0479,  0.0080]],
       device='cuda:0')

In [7]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [8]:
print_output_innerbert(model.base_model, "hello")

Text:  hello
Encoded Input Length:  3
Encoded Input:  tensor([ 101, 7592,  102], device='cuda:0')
Output Size:  torch.Size([1, 3, 768])
Pooled Output Size:  torch.Size([1, 768])

Output Encodings:  tensor([[[-0.4145,  0.1708, -0.0262,  ..., -0.3718,  0.2444,  0.2653],
         [-0.4951,  0.1428,  0.8051,  ..., -0.4704, -0.4935, -0.1657],
         [ 0.7239, -0.0396, -0.2727,  ...,  0.1140, -0.5539, -0.4134]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)

Output Pooled Vector:  tensor([[-0.7012, -0.2238,  0.2767,  0.5913, -0.0885, -0.2030,  0.6863,  0.1025,
          0.3087, -0.9987,  0.1711,  0.4516,  0.9699, -0.3612,  0.8789, -0.2883,
         -0.2684, -0.5411,  0.3246, -0.3785,  0.4412,  0.9559,  0.5557,  0.1000,
          0.3645,  0.6114, -0.5838,  0.8944,  0.9231,  0.6767, -0.6094,  0.0210,
         -0.9818, -0.1986, -0.1888, -0.9822,  0.0713, -0.6321, -0.0322, -0.0437,
         -0.8495,  0.1549,  0.9900, -0.4144,  0.1890, -0.2826, -0.9996,  0.2433,
         -0.8600,

In [9]:
print_output_whole(model, "hello")

Text:  hello
Encoded Input Length:  3
Encoded Input:  tensor([ 101, 7592,  102], device='cuda:0')
Output Size:  torch.Size([1, 3])
Output:  tensor([[ 0.3366,  0.0753, -0.4751]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


In [10]:
encoded_input = tokenizer("hello Ramon", return_tensors='pt').to('cuda')
output = model.base_model(encoded_input['input_ids'],encoded_input['token_type_ids'], encoded_input['attention_mask'])

In [11]:
encoded_input

{'input_ids': tensor([[  101,  7592, 12716,   102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1]], device='cuda:0')}

In [12]:
output.pooler_output.shape

torch.Size([1, 768])