In [2]:
import torch
import torch.nn as nn
import pandas as pd
import math

In [3]:
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention.

  The inputs are projected by three linear layers, split into ``head_num``
  heads, attended independently per head, merged back and passed through a
  final output projection.

  Args:
    dim_num: Size of the last dimension of the inputs and of the output.
      Must be divisible by ``head_num``.
    head_num: Number of attention heads.

  Raises:
    ValueError: If ``dim_num`` is not divisible by ``head_num``.
  """

  def __init__(self, dim_num = 512, head_num = 8):
    super().__init__()
    if dim_num % head_num != 0:
      raise ValueError(f"dim_num ({dim_num}) must be divisible by head_num ({head_num})")
    self.head_num = head_num
    self.dim_num = dim_num

    # Layer allocation: one projection each for query, key and value ...
    self.fc_q = nn.Linear(dim_num, dim_num)
    self.fc_k = nn.Linear(dim_num, dim_num)
    self.fc_v = nn.Linear(dim_num, dim_num)

    # ... and the output projection applied after the heads are merged.
    self.fc_o = nn.Linear(dim_num, dim_num)

  def ScaledDotProductAttention(self, q, k, v, mask = None):
    """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v`` for every head.

    Args:
      q: Queries of shape (batch, head, q_len, head_dim).
      k: Keys of shape (batch, head, k_len, head_dim).
      v: Values of shape (batch, head, k_len, head_dim).
      mask: Optional mask where True marks key positions that must NOT be
        attended to. A 2-D mask is read as (batch, k_len), a 3-D mask as
        (batch, q_len, k_len); any other rank must already broadcast
        against the (batch, head, q_len, k_len) score tensor.

    Returns:
      Tensor of shape (batch, head, q_len, head_dim).
    """
    dk = k.shape[-1]

    # Swap only the last two axes of k so that the product is
    # (batch, head, q_len, k_len); permuting the head and length axes
    # would not produce a transpose.
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(dk)

    if mask is not None:
      mask = mask.to(torch.bool)
      if mask.dim() == 2:
        mask = mask[:, None, None, :]
      elif mask.dim() == 3:
        mask = mask[:, None, :, :]
      # Masked scores must become ~0 probability after the softmax, so they
      # are filled with the most negative finite value (a fill of 0 would
      # still receive weight; -inf would give NaN for fully masked rows).
      scores = scores.masked_fill(mask, torch.finfo(scores.dtype).min)

    weights = torch.softmax(scores, dim = -1)
    return torch.matmul(weights, v)

  def _split_heads(self, projected, batch_size):
    """Reshape (batch, len, dim_num) into (batch, head, len, head_dim)."""
    head_dim = self.dim_num // self.head_num
    return projected.view(batch_size, -1, self.head_num, head_dim).permute(0, 2, 1, 3)

  def forward(self, q, k, v, mask = None):
    """Apply multi-head attention.

    Args:
      q: Query input of shape (batch, q_len, dim_num).
      k: Key input of shape (batch, k_len, dim_num).
      v: Value input of shape (batch, k_len, dim_num).
      mask: Optional mask forwarded to ``ScaledDotProductAttention``.

    Returns:
      Tensor of shape (batch, q_len, dim_num).
    """
    batch_size = q.shape[0]

    q = self._split_heads(self.fc_q(q), batch_size)
    k = self._split_heads(self.fc_k(k), batch_size)
    v = self._split_heads(self.fc_v(v), batch_size)

    output = self.ScaledDotProductAttention(q, k, v, mask)

    # Merge the heads back: (batch, head, q_len, head_dim) -> (batch, q_len, dim_num).
    output = output.permute(0, 2, 1, 3).contiguous()
    output = output.view(batch_size, -1, self.dim_num)
    output = self.fc_o(output)

    return output




In [4]:
class AddLayerNorm(nn.Module):
  """Residual connection followed by layer normalization ("Add & Norm").

  Computes ``layer_norm(residual + input)`` over the last dimension. The
  normalization has no learnable gain or bias.

  Args:
    eps: Small constant added to the variance so that rows with zero
      variance do not cause a division by zero.
  """

  def __init__(self, eps = 1e-5):
    super().__init__()
    self.eps = eps

  def layer_norm(self, input):
    """Normalize ``input`` to zero mean and unit variance along the last axis.

    Uses the biased (population) variance plus ``eps`` under the square
    root, which keeps constant rows finite (they map to zeros).
    """
    mean = torch.mean(input, dim = -1, keepdim = True)
    var = torch.var(input, dim = -1, unbiased = False, keepdim = True)
    return (input - mean) / torch.sqrt(var + self.eps)

  def forward(self, input, residual):
    """Return the normalized sum of the sub-layer output and its residual.

    Args:
      input: Output of the sub-layer (attention or feed-forward).
      residual: The tensor that was fed into that sub-layer.
    """
    # Add first, then normalize, so the result of every block is normalized.
    return self.layer_norm(residual + input)

In [5]:
class FeedForward(nn.Module):
  """Position-wise feed-forward network: Linear -> ReLU -> Linear.

  Args:
    dim_num: Size of the last dimension of the input and output. The hidden
      layer is four times as wide.
  """

  def __init__(self, dim_num = 512):
    super().__init__()
    self.layer1 = nn.Linear(dim_num, dim_num * 4)
    self.layer2 = nn.Linear(dim_num * 4, dim_num)

  def forward(self, input):
    """Apply the two linear layers with a ReLU in between.

    Args:
      input: Tensor whose last dimension is ``dim_num``.

    Returns:
      Tensor with the same shape as ``input``.
    """
    # torch.relu applies the activation; nn.ReLU(...) would only construct
    # a module object instead of transforming the tensor.
    hidden = torch.relu(self.layer1(input))
    return self.layer2(hidden)

In [6]:
class Encoder(nn.Module):
  """One encoder layer: attention and feed-forward, each followed by add-norm.

  Args:
    dim_num: Feature size handed to the attention and feed-forward sub-layers.
  """

  def __init__(self, dim_num = 512):
    super().__init__()
    # Sub-layers are created in the order in which forward() uses them.
    self.multihead = MultiHeadAttention(dim_num=dim_num)
    self.addnorm1 = AddLayerNorm()
    self.feedforward = FeedForward(dim_num=dim_num)
    self.addnorm2 = AddLayerNorm()

  def forward(self, q, k, v):
    """Run the layer; ``q`` also serves as the residual of the attention step."""
    attended = self.addnorm1(self.multihead(q, k, v), q)
    # The feed-forward sub-layer uses its own input as the residual.
    return self.addnorm2(self.feedforward(attended), attended)


In [7]:
class Decoder(nn.Module):
  """One decoder layer: masked self-attention, encoder-decoder attention and
  a feed-forward network, each followed by an add-norm step.

  Args:
    dim_num: Feature size handed to the attention and feed-forward sub-layers.
  """

  def __init__(self, dim_num = 512):
    super().__init__()
    self.masked_multihead = MultiHeadAttention(dim_num=dim_num)
    self.addnorm1 = AddLayerNorm()
    self.multihead = MultiHeadAttention(dim_num=dim_num)
    self.addnorm2 = AddLayerNorm()
    self.feedforward = FeedForward(dim_num=dim_num)
    self.addnorm3 = AddLayerNorm()

  def forward(self, q, k, v, encoder_output, mask):
    """Run the decoder layer.

    Args:
      q, k, v: Decoder-side inputs for the self-attention; ``q`` is also
        used as the residual of that step.
      encoder_output: Output of the encoder stack, attended to by the
        second attention sub-layer.
      mask: Mask passed to the decoder self-attention.

    Returns:
      Tensor with the same sequence length as ``q``.
    """
    masked_multihead_output = self.masked_multihead(q, k, v, mask)
    addnorm1_output = self.addnorm1(masked_multihead_output, q)

    # Encoder-decoder attention: the query comes from the decoder, while the
    # keys and values are both the encoder output (which is why it is passed
    # twice). This keeps the result at the decoder's sequence length so it
    # can be added to the decoder residual below. The target-side mask is
    # not forwarded here because it does not describe encoder positions.
    multihead_output = self.multihead(addnorm1_output, encoder_output, encoder_output)
    addnorm2_output = self.addnorm2(multihead_output, addnorm1_output)

    feedforward_output = self.feedforward(addnorm2_output)
    addnorm3_output = self.addnorm3(feedforward_output, addnorm2_output)

    return addnorm3_output

In [8]:
class Transformer(nn.Module):
  """Encoder-decoder Transformer built from ``Encoder`` and ``Decoder`` layers.

  Args:
    encoder_num: Number of stacked encoder layers.
    decoder_num: Number of stacked decoder layers.
    hidden_dim: Embedding / model dimension.
    max_encoder_seq_length: Number of encoder positions for which the
      positional encoding is precomputed.
    max_decoder_seq_length: Number of decoder positions for which the
      positional encoding is precomputed.
    input_vocab_size: Number of rows of the source embedding table.
      Defaults to ``max_encoder_seq_length`` (the previous behaviour).
    output_vocab_size: Number of rows of the target embedding table and
      width of the final projection. Defaults to
      ``max_decoder_seq_length`` (the previous behaviour).
  """

  def __init__(self, encoder_num = 6, decoder_num = 6, hidden_dim = 512, max_encoder_seq_length = 100, max_decoder_seq_length = 100, input_vocab_size = None, output_vocab_size = None):
    super().__init__()

    self.encoder_num = encoder_num
    self.decoder_num = decoder_num
    self.hidden_dim = hidden_dim
    self.max_encoder_seq_length = max_encoder_seq_length
    self.max_decoder_seq_length = max_decoder_seq_length
    self.input_vocab_size = max_encoder_seq_length if input_vocab_size is None else input_vocab_size
    self.output_vocab_size = max_decoder_seq_length if output_vocab_size is None else output_vocab_size

    # nn.ModuleList (not a plain list) registers the layers, so their
    # parameters appear in .parameters(), .to(device) and state_dict().
    self.input_data_embeded = nn.Embedding(self.input_vocab_size, self.hidden_dim)
    self.Encoders = nn.ModuleList([Encoder(dim_num = hidden_dim) for _ in range(encoder_num)])

    self.output_data_embeded = nn.Embedding(self.output_vocab_size, self.hidden_dim)
    self.Decoders = nn.ModuleList([Decoder(dim_num = hidden_dim) for _ in range(decoder_num)])

    self.last_linear_layer = nn.Linear(self.hidden_dim, self.output_vocab_size)

    # Build the positional encoding once; as a buffer it follows the module
    # across devices instead of being re-registered on every forward pass.
    self.register_buffer('pe', self._build_position_encoding(max(max_encoder_seq_length, max_decoder_seq_length)))

  def _build_position_encoding(self, length):
    """Create the sinusoidal table of shape (1, length, hidden_dim)."""
    position = torch.arange(0, length, dtype = torch.float).unsqueeze(1)
    # div_term[i] = 10000 ** (2i / hidden_dim), one entry per sin/cos pair.
    exponent = torch.arange(0, self.hidden_dim, 2, dtype = torch.float) / self.hidden_dim
    div_term = torch.pow(10000.0, exponent)

    pe = torch.zeros(length, self.hidden_dim)
    pe[:, 0::2] = torch.sin(position / div_term)
    # Slice so that an odd hidden_dim (one fewer cosine column) still fits.
    pe[:, 1::2] = torch.cos(position / div_term[: self.hidden_dim // 2])

    return pe.unsqueeze(0)

  def position_encoding(self, position_max_length = 100):
    """Return the sinusoidal positional encoding for the first positions.

    Even feature indices hold ``sin(pos / 10000**(2i/hidden_dim))`` and odd
    indices the matching cosine, so each position gets a distinct pattern.

    Args:
      position_max_length: Number of positions to return.

    Returns:
      Tensor of shape (1, position_max_length, hidden_dim). It is a slice
      of the cached ``pe`` buffer when that is long enough, otherwise a
      freshly built table.
    """
    cached = getattr(self, 'pe', None)
    if cached is not None and cached.size(1) >= position_max_length:
      return cached[:, :position_max_length]

    pe = self._build_position_encoding(position_max_length)
    if cached is not None:
      pe = pe.to(cached.device)
    return pe

  def forward(self, input, output, mask):
    """Run the encoder stack on ``input`` and the decoder stack on ``output``.

    Args:
      input: Source token indices; assumed to be (batch, src_len) — TODO confirm.
      output: Target token indices; assumed to be (batch, trg_len) — TODO confirm.
      mask: Boolean mask over target positions; True entries have their
        embedding zeroed and the mask is handed to every decoder layer.
        ``None`` disables masking.

    Returns:
      Probabilities over the output vocabulary for every target position
      (softmax over the last dimension).
    """
    input_embeded = self.input_data_embeded(input)
    # Only add as many positions as the sequence actually has.
    input_embeded = input_embeded + self.position_encoding(input_embeded.size(1))

    encoder_output = input_embeded
    for encoder in self.Encoders:
      encoder_output = encoder(encoder_output, encoder_output, encoder_output)

    output_embeded = self.output_data_embeded(output)
    output_embeded = output_embeded + self.position_encoding(output_embeded.size(1))
    if mask is not None:
      output_embeded = output_embeded.masked_fill(mask.unsqueeze(-1), 0)

    decoder_output = output_embeded
    for decoder in self.Decoders:
      decoder_output = decoder(decoder_output, decoder_output, decoder_output, encoder_output, mask)

    # torch.softmax applies the function; nn.Softmax is a module constructor.
    # NOTE: losses such as nn.CrossEntropyLoss expect raw logits, not these
    # probabilities.
    output = torch.softmax(self.last_linear_layer(decoder_output), dim = -1)
    return output


In [10]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/64.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━[0m [32m41.0/64.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.16.0
    Uninstalling torchtext-0.16.0:
      Successfully uninstalled torchtext-0.16.0
Successfully installed torchtext-0.6.0


In [36]:
%%capture
!python -m spacy download en
!python -m spacy download de

In [37]:
import spacy

# Load the en_core_web_sm and de_core_news_sm pipelines (in that order);
# their tokenizers are used by the tokenize_* helpers.
spacy_en, spacy_de = (
  spacy.load(model_name) for model_name in ("en_core_web_sm", "de_core_news_sm")
)

In [38]:
# Quick check of the tokenizer from the spacy_en pipeline: tokenize one
# sample sentence and print every token together with its index.
tokenized = spacy_en.tokenizer('I am a graduate student.')

for i, token in enumerate(tokenized):
  print(f'인덱스 {i}: {token.text}')

인덱스 0: I
인덱스 1: am
인덱스 2: a
인덱스 3: graduate
인덱스 4: student
인덱스 5: .


In [39]:
def tokenize_de(text):
  """Tokenize ``text`` with the ``spacy_de`` tokenizer and return the token strings as a list."""
  pieces = []
  for piece in spacy_de.tokenizer(text):
    pieces.append(piece.text)
  return pieces
def tokenize_en(text):
  """Tokenize ``text`` with the ``spacy_en`` tokenizer and return the token strings as a list."""
  pieces = []
  for piece in spacy_en.tokenizer(text):
    pieces.append(piece.text)
  return pieces


In [40]:
from torchtext.data import Field, BucketIterator

# Source and target fields share every option except the tokenizer:
# <sos>/<eos> markers are added, text is lower-cased and batches are batch-first.
_shared_field_options = dict(init_token="<sos>", eos_token="<eos>", lower=True, batch_first=True)
SRC = Field(tokenize = tokenize_de, **_shared_field_options)
TRG = Field(tokenize = tokenize_en, **_shared_field_options)

In [41]:
from torchtext.datasets import Multi30k

# Build the train/validation/test splits of Multi30k under root='data', pairing the
# ".de" files with the SRC field and the ".en" files with the TRG field.
# NOTE(review): the recorded run of this cell ended with FileNotFoundError for
# 'data/multi30k/train.de', i.e. the corpus files were not present at that path.
# Presumably they must be placed under data/multi30k/ before re-running — TODO confirm.
train_dataset, valid_dataset, test_dataset = Multi30k.splits(exts=(".de", ".en"), fields=(SRC, TRG), root = 'data')

FileNotFoundError: [Errno 2] No such file or directory: 'data/multi30k/train.de'