In [1]:
!pip install sentencepiece
!pip install wget

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 19.4 MB/s eta 0:00:01[K     |▌                               | 20 kB 23.6 MB/s eta 0:00:01[K     |▉                               | 30 kB 15.0 MB/s eta 0:00:01[K     |█                               | 40 kB 10.6 MB/s eta 0:00:01[K     |█▍                              | 51 kB 7.2 MB/s eta 0:00:01[K     |█▋                              | 61 kB 6.8 MB/s eta 0:00:01[K     |██                              | 71 kB 6.7 MB/s eta 0:00:01[K     |██▏                             | 81 kB 7.6 MB/s eta 0:00:01[K     |██▍                             | 92 kB 6.5 MB/s eta 0:00:01[K     |██▊                             | 102 kB 6.7 MB/s eta 0:00:01[K     |███                             | 112 kB 6.7 MB/s eta 0:00:01[K     |███▎                            | 122 kB 6.7 MB/s eta 0:00:01[K     |███▌      

In [2]:
# Mount Google Drive at /content/drive so the notebook can read files from it.
from google.colab import drive
drive.mount('/content/drive')
# Folder where the data is stored. Adjust this path to match your environment.
data_dir = "/content/drive/My Drive/BERT_DATA"

Mounted at /content/drive


In [3]:
import os
import numpy as np
import math
from random import random, randrange, randint, shuffle, choice
import matplotlib.pyplot as plt
import json
import pandas as pd
from IPython.display import display
from tqdm import tqdm, tqdm_notebook, trange
import sentencepiece as spm
import wget

import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
# Print the name of every entry found in data_dir, one per line.
for f in os.listdir(data_dir):
  print(f)

kowiki.vocab
kowiki_t5.vocab
ratings_test.txt
kowiki.model
kowiki_t5.model
ratings_test.json
ratings_test_t5.json
ratings_train_t5.json
ratings_train.txt
ratings_train.json
save_gpt_pretrain.pth
save_t5_pretrain.pth
save_bert_pretrain.pth
kowiki.csv.gz


In [5]:
# Load the SentencePiece model stored at <data_dir>/kowiki.model into `vocab`.
vocab_file = f"{data_dir}/kowiki.model"
vocab = spm.SentencePieceProcessor()
# load() is the last expression of the cell, so its return value is displayed.
vocab.load(vocab_file)

True

### config 설정하기

BERT는 Encoder만 사용하므로 설정 항목 중 Decoder 관련 부분은 제거했습니다.
BERT Encoder는 기본 입력 외에 Segment 정보도 입력받으므로, Segment 개수를 나타내는 n_seg_type을 추가로 정의했습니다.

In [8]:
""" configuration json을 읽어들이는 class """
class Config(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``cfg.key`` is equivalent to ``cfg["key"]`` for both lookup and
    assignment. Attribute lookup is forwarded to ``dict.__getitem__``, so
    reading a missing attribute raises ``KeyError`` (not ``AttributeError``).
    """
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

    @classmethod
    def load(cls, file):
        """Build a configuration from a JSON file.

        Args:
            file: path of a JSON document whose top level is an object.

        Returns:
            An instance of ``cls`` (so subclasses get their own type back)
            holding the key/value pairs of the JSON object.

        Raises:
            OSError: if the file cannot be opened.
            json.JSONDecodeError: if the content is not valid JSON.
        """
        # JSON text is UTF-8 by specification; do not rely on the platform default.
        with open(file, 'r', encoding='utf-8') as f:
            return cls(json.load(f))

In [9]:
# Hyper-parameters of the encoder; every entry is read as `config.<name>`
# by the model classes defined in this notebook.
config = Config(
    n_enc_vocab=len(vocab),    # rows of the token embedding table
    n_enc_seq=256,             # the position embedding table has n_enc_seq + 1 rows
    n_seg_type=2,              # rows of the segment embedding table
    n_layer=6,                 # number of stacked EncoderLayer modules
    d_hidn=256,                # width of the hidden representation
    i_pad=0,                   # presumably the <pad> token id used for masking; confirm against callers
    d_ff=1024,                 # inner width of the position-wise feed-forward net
    n_head=4,                  # attention heads
    d_head=64,                 # width of each attention head
    dropout=0.1,               # probability handed to every nn.Dropout
    layer_norm_epsilon=1e-12,  # eps handed to every nn.LayerNorm
)
print(config)

{'n_enc_vocab': 8007, 'n_enc_seq': 256, 'n_seg_type': 2, 'n_layer': 6, 'd_hidn': 256, 'i_pad': 0, 'd_ff': 1024, 'n_head': 4, 'd_head': 64, 'dropout': 0.1, 'layer_norm_epsilon': 1e-12}


### Common Class

공통으로 사용되는 Class 및 함수

In [13]:
# sinusoid position encoding

def get_sigusoid_encoding_table(n_seq, d_hidn):
    """Build the sinusoidal position-encoding table.

    Entry ``[pos, i]`` starts as the angle
    ``pos / 10000 ** (2 * (i // 2) / d_hidn)``; even columns then hold the
    sine of that angle and odd columns the cosine.

    The function name (including its "sigusoid" spelling) is kept unchanged
    for existing callers.

    Args:
        n_seq: number of positions, i.e. rows of the table. ``0`` yields an
            empty ``(0, d_hidn)`` table.
        d_hidn: width of the encoding, i.e. columns of the table.

    Returns:
        ``numpy.ndarray`` of shape ``(n_seq, d_hidn)`` and dtype float64.
    """
    # Column vector of positions so the division below broadcasts to (n_seq, d_hidn).
    positions = np.arange(n_seq, dtype=np.float64).reshape(-1, 1)
    # Columns 2k and 2k+1 share one frequency, hence the integer division by 2.
    exponents = 2 * (np.arange(d_hidn) // 2) / d_hidn
    sinusoid_table = positions / np.power(10000, exponents)

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # even index: sin
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # odd index: cos

    return sinusoid_table


""" attention pad mask """
def get_attn_pad_mask(seq_q, seq_k, i_pad):
    """Mark the key positions that hold the padding id.

    Args:
        seq_q: 2-D tensor of query token ids; only its length is used.
        seq_k: 2-D tensor of key token ids.
        i_pad: id that denotes padding.

    Returns:
        Bool tensor of shape ``(batch, len_q, len_k)`` that is True wherever
        the corresponding entry of ``seq_k`` equals ``i_pad``. The batch size
        is taken from ``seq_k``.
    """
    _, n_query = seq_q.size()
    n_batch, n_key = seq_k.size()
    key_is_pad = seq_k.data.eq(i_pad)  # (batch, len_k)
    # Broadcast the per-key flags over every query position.
    return key_is_pad.unsqueeze(1).expand(n_batch, n_query, n_key)


""" attention decoder mask """
def get_attn_decoder_mask(seq):
    """Build a mask that flags every position after the current one.

    Args:
        seq: tensor whose first two dimensions are ``(batch, n_seq)``.

    Returns:
        Tensor of shape ``(batch, n_seq, n_seq)`` with the dtype of ``seq``;
        entries strictly above the main diagonal are 1, all others are 0.
    """
    n_batch, n_seq = seq.size(0), seq.size(1)
    all_ones = torch.ones_like(seq).unsqueeze(-1).expand(n_batch, n_seq, n_seq)
    # Keep only the strictly upper triangular part of each (n_seq, n_seq) matrix.
    return all_ones.triu(diagonal=1)

# scaled dot-product attention
class ScaledDotProductAttention(nn.Module):
    """Attention weights from scaled query/key dot products, applied to the values.

    Reads ``config.dropout`` and ``config.d_head``; the scores are multiplied
    by ``1 / sqrt(d_head)`` before the softmax.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.dropout)
        self.scale = 1 / (self.config.d_head ** 0.5)

    def forward(self, Q, K, V, attn_mask):
        """Return ``(context, attn_prob)``.

        Args:
            Q, K, V: per-head projections, (bs, n_head, n_seq, d_head) as
                produced by the multi-head wrapper in this notebook.
            attn_mask: bool tensor, True where a score must be suppressed.

        Returns:
            context: (bs, n_head, n_q_seq, d_v)
            attn_prob: (bs, n_head, n_q_seq, n_k_seq), after dropout.
        """
        # (bs, n_head, n_q_seq, n_k_seq); scaled and masked in place.
        logits = torch.matmul(Q, K.transpose(-1, -2))
        logits.mul_(self.scale)
        # NOTE(review): -1e9 exceeds the float16 range; confirm before running in half precision.
        logits.masked_fill_(attn_mask, -1e9)
        attn_prob = self.dropout(logits.softmax(dim=-1))
        context = torch.matmul(attn_prob, V)
        return context, attn_prob
""" multi head attention """
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project, attend per head, merge, project back.

    Reads ``config.d_hidn``, ``config.n_head``, ``config.d_head`` and
    ``config.dropout``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.W_Q = nn.Linear(self.config.d_hidn, self.config.n_head * self.config.d_head)
        self.W_K = nn.Linear(self.config.d_hidn, self.config.n_head * self.config.d_head)
        self.W_V = nn.Linear(self.config.d_hidn, self.config.n_head * self.config.d_head)
        self.scaled_dot_attn = ScaledDotProductAttention(self.config)
        self.linear = nn.Linear(self.config.n_head * self.config.d_head, self.config.d_hidn)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, Q, K, V, attn_mask):
        """Return ``(output, attn_prob)``.

        Args:
            Q, K, V: (bs, n_seq, d_hidn) inputs for queries, keys and values.
            attn_mask: (bs, n_q_seq, n_k_seq) bool mask shared by all heads.

        Returns:
            output: (bs, n_q_seq, d_hidn)
            attn_prob: (bs, n_head, n_q_seq, n_k_seq)
        """
        batch_size = Q.size(0)
        n_head, d_head = self.config.n_head, self.config.d_head

        # (bs, n_head, n_q_seq, d_head)
        q_s = self.W_Q(Q).view(batch_size, -1, n_head, d_head).transpose(1, 2)
        # (bs, n_head, n_k_seq, d_head)
        k_s = self.W_K(K).view(batch_size, -1, n_head, d_head).transpose(1, 2)
        # (bs, n_head, n_v_seq, d_head)
        v_s = self.W_V(V).view(batch_size, -1, n_head, d_head).transpose(1, 2)

        # (bs, n_head, n_q_seq, n_k_seq); expand() shares the mask across heads
        # as a view instead of copying it n_head times on every call.
        attn_mask = attn_mask.unsqueeze(1).expand(-1, n_head, -1, -1)

        # (bs, n_head, n_q_seq, d_head), (bs, n_head, n_q_seq, n_k_seq)
        context, attn_prob = self.scaled_dot_attn(q_s, k_s, v_s, attn_mask)
        # Merge the heads back: (bs, n_q_seq, n_head * d_head)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_head * d_head)
        # (bs, n_q_seq, d_hidn)
        output = self.dropout(self.linear(context))
        # (bs, n_q_seq, d_hidn), (bs, n_head, n_q_seq, n_k_seq)
        return output, attn_prob


""" feed forward """
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward net built from two kernel-size-1 convolutions.

    Expands ``config.d_hidn`` channels to ``config.d_ff``, applies GELU,
    projects back to ``config.d_hidn`` and applies dropout.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        d_hidn, d_ff = config.d_hidn, config.d_ff
        self.conv1 = nn.Conv1d(in_channels=d_hidn, out_channels=d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_hidn, kernel_size=1)
        self.active = F.gelu
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, inputs):
        """Map ``(bs, n_seq, d_hidn)`` inputs to an output of the same shape."""
        # Conv1d expects channels first: (bs, d_hidn, n_seq)
        channels_first = inputs.transpose(1, 2)
        # (bs, d_ff, n_seq)
        hidden = self.active(self.conv1(channels_first))
        # back to (bs, n_seq, d_hidn)
        projected = self.conv2(hidden).transpose(1, 2)
        return self.dropout(projected)

### Encoder

표준 Transformer Encoder에 BERT에서 새로 정의한 segment embedding만 추가했습니다.

In [15]:
# Identical to the standard Transformer EncoderLayer.
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by
    a residual connection and layer normalisation.

    Reads ``config.d_hidn`` and ``config.layer_norm_epsilon`` directly and
    passes ``config`` on to the attention and feed-forward sub-modules.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.self_attn = MultiHeadAttention(self.config)
        # The hidden size lives at config.d_hidn; the former `self.config_hidn`
        # does not exist and made construction fail with AttributeError.
        self.layer_norm1 = nn.LayerNorm(self.config.d_hidn, eps=self.config.layer_norm_epsilon)
        self.pos_ffn = PoswiseFeedForwardNet(self.config)
        self.layer_norm2 = nn.LayerNorm(self.config.d_hidn, eps=self.config.layer_norm_epsilon)

    def forward(self, inputs, attn_mask):
        """Return ``(outputs, attn_prob)``.

        Args:
            inputs: tensor used as query, key and value of the self-attention.
            attn_mask: mask forwarded unchanged to the self-attention.

        Returns:
            outputs: result of the second layer normalisation.
            attn_prob: attention probabilities returned by the self-attention.
        """
        attn_outputs, attn_prob = self.self_attn(inputs, inputs, inputs, attn_mask)
        # Residual connection around the attention, then normalise.
        attn_outputs = self.layer_norm1(inputs + attn_outputs)
        ffn_outputs = self.pos_ffn(attn_outputs)
        # Residual connection around the feed-forward net, then normalise.
        ffn_outputs = self.layer_norm2(ffn_outputs + attn_outputs)

        return ffn_outputs, attn_prob

In [17]:
## encoder
class Encoder(nn.Module):
  def __init__(self, config):
    super().__init__()
    self.config=config
    self.enc_emb=nn.Embedding(self.config.n_enc_vocab, self.config.d_hidn)
    self.pos_emb=nn.Embedding(self.config.n_enc_seq+1, self.config.d_hidn)
    self.seg_emb=nn.Embedding(self.config.n_seg_type, self.config.d_hidn)
    self.layers=nn.ModuleList([EncoderLayer(self.config) for _ in range(self.config.n_layer)])