In [None]:
# Install third-party packages for this notebook.
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install transformers[sentencepiece] # install any additional required modules (dependencies), if there are any
!pip install sacremoses
!pip install bertviz

In [2]:
import sys
from textwrap import TextWrapper
import datasets
import huggingface_hub
import matplotlib.font_manager as font_manager
import matplotlib.pyplot as plt
import torch
import transformers
import pandas as pd
import numpy as np
import sentencepiece

In [3]:
from transformers import pipeline
import torch.nn as nn
from math import sqrt
import torch.nn.functional as F
from transformers import AutoConfig, AutoModel, AutoTokenizer

In [4]:
# 1. Implement the scaled dot-product attention function
# 2. Implement the attention module as a class
# 3. Implement multi-head attention as a class


# Transformer overview and a brief history
# https://huggingface.co/learn/nlp-course/ko/chapter1/4

# The Illustrated Transformer: diagrams and inner workings
# https://nlpinkorean.github.io/illustrated-transformer/

In [5]:
def scaled_dot_product_attention(query, key, value):
  """Compute scaled dot-product attention for a batch of sequences.

  Args:
    query: tensor of shape (batch, seq_q, dim_k).
    key: tensor of shape (batch, seq_k, dim_k).
    value: tensor of shape (batch, seq_k, dim_v).

  Returns:
    Tensor of shape (batch, seq_q, dim_v): for every query position, the
    attention-weighted average of the value vectors (the "Z" matrix in the
    illustrated-transformer diagrams).
  """
  # Last axis of the query is the per-token feature size; the key has the same size.
  dim_k = query.size(-1)
  # bmm is a batched matrix multiplication: (batch, seq_q, dim_k) x (batch, dim_k, seq_k).
  scores = torch.bmm(query, key.transpose(1, 2)) / sqrt(dim_k)
  # scores axes: 0 = batch, 1 = query position, 2 = key position.
  # Normalise over the key axis (last axis) so each query's weights sum to 1.
  # Normalising over dim=1 would instead make each *column* sum to 1, which is
  # not an attention distribution.
  weights = F.softmax(scores, dim=-1)
  return torch.bmm(weights, value)

In [6]:
class AttentionHead(nn.Module):
  """A single self-attention head.

  Projects the incoming hidden states into query, key and value spaces with
  three independent linear layers and combines them with
  `scaled_dot_product_attention`.
  """

  def __init__(self, embed_dim, head_dim):
    """Create the three projections, each mapping embed_dim -> head_dim."""
    super().__init__()
    # Learned projection matrices W^q, W^k and W^v (created in this order).
    self.q = nn.Linear(embed_dim, head_dim)
    self.k = nn.Linear(embed_dim, head_dim)
    self.v = nn.Linear(embed_dim, head_dim)

  def forward(self, hidden_state):
    """Apply self-attention: the same hidden_state feeds all three projections."""
    query = self.q(hidden_state)
    key = self.k(hidden_state)
    value = self.v(hidden_state)
    return scaled_dot_product_attention(query, key, value)

In [None]:
# "Attention Is All You Need" (Vaswani et al., 2017 paper)
# Seoul AI Summit at COEX, September 13-14

In [20]:
class MultiHeadAttention(nn.Module):
  """Multi-head self-attention built from independent `AttentionHead` modules.

  The embedding width is split evenly across the heads; the head outputs are
  concatenated along the feature axis and passed through a final linear layer.
  """

  def __init__(self, config):
    """Build the heads and the output projection.

    Args:
      config: configuration object exposing `hidden_size` (embedding width)
        and `num_attention_heads` (number of heads).

    Raises:
      ValueError: if `hidden_size` is not a multiple of `num_attention_heads`.
    """
    super().__init__()
    embed_dim = config.hidden_size
    num_heads = config.num_attention_heads
    # Floor division below would silently drop features otherwise, and the
    # concatenated head outputs would no longer match output_linear's input
    # width, surfacing only as a shape error during forward().
    if embed_dim % num_heads != 0:
      raise ValueError(
          f"hidden_size ({embed_dim}) must be a multiple of "
          f"num_attention_heads ({num_heads})"
      )
    head_dim = embed_dim // num_heads
    self.heads = nn.ModuleList(
        [AttentionHead(embed_dim, head_dim) for _ in range(num_heads)]
    )
    self.output_linear = nn.Linear(embed_dim, embed_dim)

  def forward(self, hidden_state):
    """Run every head on hidden_state, concatenate on the last axis, then project."""
    x = torch.cat([h(hidden_state) for h in self.heads], dim=-1)
    x = self.output_linear(x)
    return x

In [21]:
# NOTE(review): `tokenization_rembert_fast` is not referenced in any cell visible here —
# possibly an accidental auto-import; confirm before removing.
from transformers.models.rembert import tokenization_rembert_fast
model_ckpt = "bert-base-uncased"
# "ckpt" is short for checkpoint, i.e. a model saved partway through training
# (commonly stored with a .ckpt extension).
# Fetching a specific model checkpoint -> loading a pretrained model.
config = AutoConfig.from_pretrained(model_ckpt)
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
test = "Time flies like an arrow."
# Tokenize without special tokens and request PyTorch ("pt") tensors.
inputs = tokenizer(test, return_tensors="pt", add_special_tokens=False)
# Freshly initialised embedding table sized from the checkpoint's config
# (these are not the pretrained embedding weights).
token_emb = nn.Embedding(config.vocab_size, config.hidden_size)
inputs_embs = token_emb(inputs.input_ids)

In [22]:
# Run the multi-head attention layer on the token embeddings and show the output size.
mha = MultiHeadAttention(config)
attn_output = mha(inputs_embs)
attn_output.size()

torch.Size([1, 6, 768])

In [23]:
from bertviz import head_view
# Load the checkpoint with output_attentions=True so the forward pass exposes `.attentions`.
model = AutoModel.from_pretrained(model_ckpt, output_attentions=True)
sen1 = "Time flies like an arrow."
sen2 = "Fruit flies like an orange."

# Encode the two sentences together as one paired input.
viz_inputs = tokenizer(sen1, sen2, return_tensors="pt")
attention = model(**viz_inputs).attentions
# Number of positions whose token_type_id is 0; presumably the index at which the
# second sentence starts — verify against what head_view expects.
sen2_start = (viz_inputs.token_type_ids==0).sum(dim=1)
tokens = tokenizer.convert_ids_to_tokens(viz_inputs.input_ids[0])
# Render the attention view, restricted to head index 8.
head_view(attention, tokens, sen2_start, heads=[8])

Output hidden; open in https://colab.research.google.com to view.