In [1]:
# BigBird 모형: 더 긴 입력 시퀀스를 처리할 수 있는 모형
# 2048 토큰 처리(BERT의 4배, 512x4)

In [1]:
from transformers import BigBirdTokenizer, BigBirdForMaskedLM
import torch

# 모델과 토크나이저 불러오기
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
model = BigBirdForMaskedLM.from_pretrained('google/bigbird-roberta-base')
model

BigBirdForMaskedLM(
  (bert): BigBirdModel(
    (embeddings): BigBirdEmbeddings(
      (word_embeddings): Embedding(50358, 768, padding_idx=0)
      (position_embeddings): Embedding(4096, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BigBirdEncoder(
      (layer): ModuleList(
        (0-11): 12 x BigBirdLayer(
          (attention): BigBirdAttention(
            (self): BigBirdBlockSparseAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
            )
            (output): BigBirdSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
           

(decoder): Linear(in_features=768, out_features=50358, bias=True)



In [2]:
inputs = ['I like reading [MASK].',
          'I like driving a [MASK]',
          'The world is facing with a [MASK] [MASK] crisis. We are all suffering from infectious diseases.',]
answers = ['I like reading book.',
           'I like driving a car',
           'The world is facing with a pandemic crisis. We are all suffering from infectious diseases.']

In [3]:
encoded_inputs = []
encoded_labels =  []

for i, l in zip(inputs, answers):
  encoded_inputs.append(tokenizer(i, return_tensors="pt"))
  #                             입력
  encoded_labels.append(tokenizer(l, return_tensors="pt")["input_ids"])

In [4]:
# 추론 모드로 실행
#추론 모드로 실행

for input, label in zip(encoded_inputs, encoded_labels):

  outputs = model(**input, labels=label)
  loss = outputs.loss
  logits = outputs.logits
  #          final hidden

  print(f"loss：{loss.item()}")
  print(f"prediction：{' '.join([tokenizer.decode(logits[0][i].argmax(-1)) for i in range(1, len(logits[0]))])}")
  print(f"answer：{tokenizer.decode(label[0][1:-1])}")
  print('\n')

Attention type 'block_sparse' is not possible if sequence_length: 7 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


loss：11.183554649353027
prediction：i like reading it . i
answer：I like reading book.


loss：8.141962051391602
prediction：much like driving a car much
answer：I like driving a car


loss：4.29605770111084
prediction：the world is facing with a global health crisis . we are all suffering from infectious diseases . .
answer：The world is facing with a pandemic crisis. We are all suffering from infectious diseases.




In [5]:
# pegasus: 문장 요약에 특화된 사전 학습 모형, 구글 2020 발표
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch

# 모델명
model_name = 'google/pegasus-xsum'
device = 'cpu'

# 모델 및 토크나이저 불러오기
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
inputs = ["""
Pretraining large neural language models, such as BERT, has led to impressive gains on many natural language processing (NLP) tasks. However, most pretraining efforts focus on general domain corpora, such as newswire and Web. A prevailing assumption is that even domain-specific pretraining can benefit by starting from general-domain language models. Recent work shows that for domains with abundant unlabeled text, such as biomedicine, pretraining language models from scratch results in substantial gains over continual pretraining of general-domain language models.
"""]

batch = tokenizer(inputs, truncation=True, padding='longest', return_tensors='pt').to(device)

In [7]:
batch

{'input_ids': tensor([[ 3414, 18006,   423, 14849,  1261,  1581,   108,   253,   130,   110,
         62613,   108,   148,  1358,   112,  2745,  6602,   124,   223,   710,
          1261,  2196,   143, 72237,   158,  2722,   107,   611,   108,   205,
          1133, 18006,  1645,   777,   124,   956,  2641,   110, 88758,   108,
           253,   130,   990, 12967,   111,  1892,   107,   202, 19552, 13183,
           117,   120,   254,  2641,   121,  7115,  1133, 18006,   137,  1280,
           141,  1215,   135,   956,   121, 23802,  1261,  1581,   107, 13618,
           201,   939,   120,   118,  9982,   122,  9878,  1596, 53541,  1352,
           108,   253,   130,  5293, 25255,   108,  1133, 18006,  1261,  1581,
           135,  5932,   602,   115,  4844,  6602,   204, 18266,  1133, 18006,
           113,   956,   121, 23802,  1261,  1581,   107,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1,

In [8]:
# 요약 문장 생성

translated = model.generate(**batch)
generated_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
print(generated_text[0])



Pretraining large neural language models can lead to substantial gains over continual pretraining of general-domain language models.


In [9]:
model

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(512, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_nor