In [73]:
from utils import *
from glue_dataset import GlueDataset
from arguments import get_args, DataTrainingArguments, ModelArguments
from transformers import TrainingArguments
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoConfig, default_data_collator
import torch
import datasets

In [74]:
data_args = DataTrainingArguments(
    task_name = 'glue',
    dataset_name = 'qqp'
)

model_args = ModelArguments(
    model_name_or_path = 'bert-base-uncased'
)

training_args = TrainingArguments(
    output_dir = 'output',
    do_train = True,
    do_eval = True,
    do_predict=True
)

In [75]:
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)

gluedata = GlueDataset(tokenizer = tokenizer,
            model_args= model_args,
            data_args = data_args, 
            training_args = training_args)


Running tokenizer on dataset: 100%|██████████| 390965/390965 [01:07<00:00, 5790.49 examples/s]


In [76]:
tokenizer.decode(gluedata.train_dataset['input_ids'][0])

'[CLS] how is the life of a math student? could you describe your own experiences? [SEP] which level of prepration is enough for the exam jlpt5? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [77]:
encoded = tokenizer.encode('my name is minwoo')
tokenizer.decode(encoded)

'[CLS] my name is minwoo [SEP]'

In [78]:
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
roberta_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
roberta_large_tokenizer = AutoTokenizer.from_pretrained('roberta-large')

In [99]:
# tokenizer 가 다르다.
"""
    BERT : 사전크기 30522, WordPiece tokenizer(BPE와 유사, but 빈도가 아닌 가능도(likelihood) 기반)
    RoBERTa : 사전크기 50265, BBPE tokenizer(byte 형태의 sequence)
"""

bert_encoded = bert_tokenizer.encode('yes neutral no true false positive negative')
print(bert_encoded)
print(bert_tokenizer.decode(2748))
print(bert_tokenizer.decode(8699))
print(bert_tokenizer.decode(2053))
print(bert_tokenizer.decode(2995))
print(bert_tokenizer.decode(6270))
print(bert_tokenizer.decode(3893))
print(bert_tokenizer.decode(4997))
print(bert_tokenizer.decode(102))


roberta_encoded = roberta_tokenizer.encode('yes no')
print(roberta_encoded)
print(roberta_tokenizer.decode(10932))
print(roberta_tokenizer.decode(117))

roberta_large_encoded = roberta_large_tokenizer.encode('yes no')
print(roberta_large_encoded)

print(len(bert_tokenizer))
print(len(roberta_tokenizer))


[101, 2748, 8699, 2053, 2995, 6270, 3893, 4997, 102]
yes
neutral
no
true
false
positive
negative
[SEP]
[0, 10932, 117, 2]
yes
 no
[0, 10932, 117, 2]
30522
50265


In [88]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
config = AutoConfig.from_pretrained('bert-base-uncased')
model = AutoModelForMaskedLM.from_pretrained('bert-base-uncased', config = config)

"""
[minwoo] tokenizer 에 의하여 자동으로 [CLS] [MASK] .... 형태를 띄게됨.
내가 원하는 것은 [MASK] [PROMPT] [CLS] ... 이고 싶음.
"""
"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
    the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

"""

text = ["This is a great.", "it is happy [mask]."]
inputs = tokenizer.encode(text, return_tensors="pt", padding= True)

print(inputs)
tokenizer.decode(inputs[0])

with torch.no_grad():

    outputs = model(**inputs)
    print(outputs['logits'], outputs['logits'].shape)
    logits = outputs['logits']
    # 가장큰 로짓에 맞는 토큰을 모두 가져와 문장 복원
    # predicted_index = torch.argmax(outputs['logits'], dim = -1)
    # print(predicted_index)
    
    print(logits[:,0], logits[:,0].shape)
    mask_logits = logits[:,0]
    

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[ 101, 2023, 2003, 1037, 2307, 1012,  102, 2009, 2003, 3407, 1031, 7308,
         1033, 1012,  102]])


TypeError: BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
          (intermediate): BertIntermediate(
            (dense): Linear(in_features=768, out_features=3072, bias=True)
            (intermediate_act_fn): GELUActivation()
          )
          (output): BertOutput(
            (dense): Linear(in_features=3072, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
  )
  (cls): BertOnlyMLMHead(
    (predictions): BertLMPredictionHead(
      (transform): BertPredictionHeadTransform(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (transform_act_fn): GELUActivation()
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      )
      (decoder): Linear(in_features=768, out_features=30522, bias=True)
    )
  )
) argument after ** must be a mapping, not Tensor

In [39]:
print(tokenizer.get_vocab()['[MASK]'])
print(tokenizer)


103
BertTokenizerFast(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


In [40]:
raw_datasets = datasets.load_dataset('glue', 'qqp')
raw_datasets = raw_datasets

In [96]:
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

sentence1_key, sentence2_key = task_to_keys['qqp']

# def preprocess_function(examples):
#         # Tokenize the texts
#         # print(examples)
#         """
        
#         """
#         args = (
#             (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
#         )
        
#         result = tokenizer(*args, padding= 'max_length', max_length= 128 , truncation=True)

#         return result

# [minwoo] on transferability paper : https://github.com/thunlp/Prompt-Transferability/blob/main/Prompt-Transferability-1.0/formatter/SST2PromptFormatter.py
def preprocess_function(examples):
    args = (
        (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
    )
    
    result = tokenizer(*args, padding='max_length', max_length=128, truncation=True)
    
    # special token id
    mask_token_id = tokenizer.mask_token_id
    pad_token_id = tokenizer.pad_token_id
    
    # [minwoo] does not use!
    cls_token_id = tokenizer.cls_token_id
    sep_token_id = tokenizer.sep_token_id
    
    # customize the tokens
    input_ids = result["input_ids"]
    attention_mask = result["attention_mask"]
    token_type_ids = result["token_type_ids"]

    new_input_ids = []
    new_attention_mask = []
    new_token_type_ids = []
    
    for ids, mask, token_type_id in zip(input_ids, attention_mask, token_type_ids):
        new_ids = [mask_token_id] + [pad_token_id] * (pre_seq_len:=20) + ids
        new_mask = [0] + [0] * (pre_seq_len) + mask
        new_token_type_id = [0] + [0] * (pre_seq_len) + token_type_id
        
        new_input_ids.append(new_ids)
        new_attention_mask.append(new_mask)
        new_token_type_ids.append(new_token_type_id)

    result["input_ids"] = new_input_ids
    result["attention_mask"] = new_attention_mask
    result["token_type_ids"] = new_token_type_ids
    return result

    
raw_datasets = raw_datasets.map(
            preprocess_function,
            batched=True,
            desc="Running tokenizer on dataset"
        )


Running tokenizer on dataset:   0%|          | 0/363846 [00:00<?, ? examples/s]

Running tokenizer on dataset: 100%|██████████| 363846/363846 [01:13<00:00, 4958.32 examples/s]
Running tokenizer on dataset: 100%|██████████| 40430/40430 [00:07<00:00, 5180.29 examples/s]
Running tokenizer on dataset: 100%|██████████| 390965/390965 [01:23<00:00, 4672.46 examples/s]


In [48]:
print(tokenizer.decode(raw_datasets['train'][0]['input_ids']))
print(raw_datasets['train'][0])

data_collator = default_data_collator


[MASK] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [CLS] how is the life of a math student? could you describe your own experiences? [SEP] which level of prepration is enough for the exam jlpt5? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
{'question1': 'How is the life of a math student? Could you describe your own experiences?', 'question2': 'Which level of prepration is enough for the exam jlp

In [83]:
sample_dataset = raw_datasets['train'].select(range(2))
print(sample_dataset)
collated = data_collator(sample_dataset)

Dataset({
    features: ['question1', 'question2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2
})


In [95]:
# collated['input_ids']
# collated['attention_mask']
collated['token_type_ids']

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])