In [1]:
import os
import torch
import pickle
import torch.nn as nn
import data_preprocess
from torch.nn import CrossEntropyLoss
from torch.utils.data import TensorDataset,DataLoader
from transformers import AutoModel,AutoConfig,AutoTokenizer,AdamW,AutoModelForMaskedLM,BertForMaskedLM
from transformers.modeling_bert import BertOnlyMLMHead,MaskedLMOutput
class POS_Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.config = AutoConfig.from_pretrained('bert-base-uncased')
        self.model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
        print(self.model)
        self.pos_embs = nn.Embedding(51, self.config.hidden_size)
        self.mask_predict = nn.Linear(self.config.hidden_size, self.config.vocab_size, bias=False)
        self.cls = BertOnlyMLMHead(self.config)
        

    def forward(self,input_ids,token_type_ids,attention_mask,labels,pos_ids):
        model_embedding = self.model.bert.embeddings
        pos_embedding = self.pos_embs(pos_ids)
        print(model_embedding)
        input_emb = model_embedding(input_ids=input_ids,token_type_ids=token_type_ids,attention_mask=attention_mask)
        print(input_emb.shape)
        print(pos_embedding.shape)
        
        
        
        
        
        outputs = self.model(output_attentions=True,output_hidden_states=True,input_ids=input_ids,
                             token_type_ids=token_type_ids,labels=labels,attention_mask=attention_mask)
        # loss
        sequence_output  =outputs[0]
        
        hidden_states = outputs[2]
        output_attentions=outputs[3]
        
        #　add_pos
        last_hidden=hidden_states[12]+pos_embedding
        
        prediction_scores = self.cls(last_hidden)
        masked_lm_loss = None
        
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=hidden_states,
            attentions=output_attentions
        )


In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# intputs=tokenizer('[CLS] actually , chase tackled him. [SEP] [MASK] to [MASK] after [MASK]',add_special_tokens=False,return_tensors='pt')
# intputs

In [3]:
# intput_token=['[CLS]', 'actually', ',', 'chase', 'tackled', 'him', '.', '[SEP]', '[MASK]', 'to', '[MASK]', 'after', '[MASK]']]
intput_token=[101,101, 2941, 1010, 5252, 26176, 2032, 1012, 102, 103, 2000, 103, 2044, 103]
input_segment=[0,0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
input_attention=[1,1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
input_maskLM=[-100,-100, -100, -100, -100, -100, -100, -100, -100, 100, -100, 5376, -100, 2032]
intput_pos=[50,50, 30, 2, 23, 38, 28, 5, 50, 50, 35, 50, 15, 50]
assert len(intput_token)==len(input_segment)
assert len(intput_token)==len(input_attention)
assert len(intput_token)==len(input_maskLM)
assert len(intput_token)==len(intput_pos)

In [4]:
intput_token_tensor=torch.tensor([intput_token],dtype=torch.long)
input_segment_tensor=torch.tensor([input_segment],dtype=torch.long)
input_attention_tensor=torch.tensor([input_attention],dtype=torch.long)
input_maskLM_tensor=torch.tensor([input_maskLM],dtype=torch.long)
intput_pos_tensor=torch.tensor([intput_pos],dtype=torch.long)

print(intput_token_tensor.shape)
print(input_segment_tensor.shape)
print(input_attention_tensor.shape)
print(input_maskLM_tensor.shape)
print(intput_pos_tensor.shape)


torch.Size([1, 14])
torch.Size([1, 14])
torch.Size([1, 14])
torch.Size([1, 14])
torch.Size([1, 14])


In [5]:
model = POS_Model()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [6]:
logits=model(input_ids=intput_token_tensor,token_type_ids=input_segment_tensor,attention_mask=input_attention_tensor, labels=input_maskLM_tensor,pos_ids=intput_pos_tensor)

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


TypeError: forward() got an unexpected keyword argument 'attention_mask'