<a href="https://colab.research.google.com/github/MyeongGuJo/Transformer/blob/main/Transformer_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import numpy as np
import pandas as pd

In [3]:
class Attention(nn.Module):
  def __init__(self, dim, device):
    super().__init__()
    self.q_proj = nn.Linear(dim, dim, device=device)
    self.k_proj = nn.Linear(dim, dim, device=device)
    self.v_proj = nn.Linear(dim, dim, device=device)

  def forward(self, Q, K, V, padding_mask, causal_mask = None):
    query = self.q_proj(Q)
    key = self.k_proj(K)
    value = self.v_proj(V)

    # Q.shape = bs * seq_len * dim

    bs = query.shape[0]

    energy = query.matmul(key.transpose(-1, -2))

    # scaling
    d_k = key.size(-1)**(1/2)
    energy = energy / d_k

    mask = padding_mask.unsqueeze(1)

    if causal_mask is not None:
      mask = mask * causal_mask

    energy = energy.masked_fill(mask == 0, -1e9)

    output = F.softmax(energy, -1)
    output = output.matmul(value)

    return output

In [4]:
class FeedForward(nn.Module):
  def __init__(self, d, device):
    super().__init__()
    self.inner_proj = nn.Linear(d, d*4, bias=False, device=device)
    self.outer_proj = nn.Linear(d*4, d, bias=False, device=device)

  def forward(self, x):
    inner = self.inner_proj(x)
    output = F.relu(inner)

    output = self.outer_proj(output)

    return output

In [5]:
class Encoder(nn.Module):
  def __init__(self, vocab_size, dim, max_seq_len, device):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim, padding_idx=0, device=device)
    self.pos_embedding = nn.Embedding(128, dim, device=device) # max_length = 128
    self.max_seq_len = max_seq_len

    self.layer_norm = nn.LayerNorm(dim, device=device)

    self.self_attention = Attention(dim, device=device)

    self.feed_forward = FeedForward(dim, device=device)

    #self.device = device
    self.register_buffer('pos', torch.arange(0, self.max_seq_len).to(device))

  def forward(self, input, input_mask):
    input_pos = self.pos
    input_seq = self.embedding(input) + self.pos_embedding(input_pos)

    # self attention
    residual = input_seq
    output = self.self_attention(input_seq, input_seq, input_seq, input_mask)

    # Add & Norm
    output = self.layer_norm(output)
    output = residual + output

    # Feed Forward
    residual = output
    output = self.feed_forward(output)

    # Add & Norm
    output = residual + output
    output = self.layer_norm(output)

    return output

In [6]:
class Decoder(nn.Module):
  def __init__(self, vocab_size, dim, max_seq_len, device):
    super().__init__()
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=dim, padding_idx=0, device=device)
    self.pos_embedding = nn.Embedding(max_seq_len, dim, device=device) # max_length = 128
    self.max_seq_len = max_seq_len

    self.layer_norm = nn.LayerNorm(dim, device=device)

    self.self_attention = Attention(dim, device=device)
    self.cross_attention = Attention(dim, device=device)

    self.feed_forward = FeedForward(dim, device=device)

    self.out_proj = nn.Linear(dim, vocab_size, device=device)

    self.device = device
    self.register_buffer('pos', torch.arange(0, self.max_seq_len).to(device))

  def forward(self, input, input_mask, enc_output):
    input_pos = self.pos
    input_seq = self.embedding(input) + self.pos_embedding(input_pos)

    # mask (bs == 16)
    bs, dim = input.shape
    m = torch.tril(torch.ones(bs, self.max_seq_len, dim, dtype=int)).to(self.device)

    # self attention
    residual = input_seq
    output = self.self_attention(input_seq, input_seq, input_seq, input_mask, causal_mask=m)

    # Add & Norm
    output = self.layer_norm(output)
    output = residual + output

    # cross attention
    residual = output
    output = self.cross_attention(output, enc_output, enc_output, input_mask, causal_mask=m)

    # Add & Norm
    output = residual + output
    output = self.layer_norm(output)

    # Feed Forward
    residual = output
    output = self.feed_forward(output)

    # Add & Norm
    output = residual + output
    output = self.layer_norm(output)


    # Linear & Softmax
    logits = self.out_proj(output)

    return logits

In [7]:
class Transformer(nn.Module):
  def __init__(self, vocab_size, input_dim, output_dim, max_seq_len, device):
    super().__init__()
    self.encoder = Encoder(vocab_size, input_dim, max_seq_len, device)
    self.decoder = Decoder(vocab_size, output_dim, max_seq_len, device)

  def forward(self, enc_src, enc_mask, dec_src, dec_mask):
    # enc_src: (bs, seq_len)
    # dec_src: (bs, seq_len)

    enc_output = self.encoder(enc_src , enc_mask)
    logits = self.decoder(dec_src, dec_mask, enc_output)

    return logits

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
encoded = tokenizer('I am an undergraduated', return_tensors='pt')
print(encoded)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

{'input_ids': tensor([[ 101, 1045, 2572, 2019, 8324, 2094,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [9]:
# 데이터 불러오기
import pandas as pd

excel_file = pd.read_excel('/content/drive/MyDrive/kor_eng_excel_dataset/2_대화체_190920.xlsx')
kor_dataset = list(excel_file['한국어'].values)
eng_dataset = list(excel_file['영어'].values)

In [10]:
eng_dataset[:5]

['Do you know which part our test is going to cover? I come to class last week.',
 'Yes. Our test will be from page 100 to page 250.',
 'I see, thanks! Did you have any additional information?',
 'Yes. The professor also told us to look at the presentation materials that he gave us last week.',
 "I'm currently analyzing the level of difficulties of our past exams. What do you think this exam will be like?"]

In [11]:
encoded_kor = tokenizer(kor_dataset, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
encoded_eng = tokenizer(eng_dataset, return_tensors='pt', padding='max_length', truncation=True, max_length=128)

train_data = encoded_kor['input_ids']
train_mask = encoded_kor['attention_mask']
target_data = encoded_eng['input_ids']
target_mask = encoded_eng['attention_mask']

In [12]:
train_data[0]

tensor([  101,  1463, 30019, 29996, 30008, 30021,  1461, 30019, 30005, 30008,
        30023,  1469, 30011, 30020, 29997, 30019,   100,   100,   100,  1463,
        30006, 29997, 30009, 29999, 30013,  1029,  1464, 30009, 29991, 30006,
         1464, 30019, 29992, 30006, 30021, 30000, 30014,  1461, 30014, 29999,
        30008, 30024, 29999, 30017, 30022,   100,  1012,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [13]:
len(tokenizer.vocab), train_mask.shape, target_mask.shape

(30522, torch.Size([11756, 128]), torch.Size([11756, 128]))

In [14]:
encoded_kor['input_ids'].shape, encoded_eng['input_ids'].shape[-1]

(torch.Size([11756, 128]), 128)

In [15]:
vocab_size = len(tokenizer.vocab)
input_dim = 256
output_dim = 256
max_seq_len = 128
batch_size = 64
LEARNING_RATE = 0.0005
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
model = Transformer(vocab_size, input_dim, output_dim, max_seq_len, device)

In [17]:
# initialize model parameters
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 25,178,682 trainable parameters


In [18]:
def initialize_weights(m):
    if hasattr(m, 'weight') and m.weight.dim() > 1:
        nn.init.xavier_uniform_(m.weight.data)

model.apply(initialize_weights)

Transformer(
  (encoder): Encoder(
    (embedding): Embedding(30522, 256, padding_idx=0)
    (pos_embedding): Embedding(128, 256)
    (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (self_attention): Attention(
      (q_proj): Linear(in_features=256, out_features=256, bias=True)
      (k_proj): Linear(in_features=256, out_features=256, bias=True)
      (v_proj): Linear(in_features=256, out_features=256, bias=True)
    )
    (feed_forward): FeedForward(
      (inner_proj): Linear(in_features=256, out_features=1024, bias=False)
      (outer_proj): Linear(in_features=1024, out_features=256, bias=False)
    )
  )
  (decoder): Decoder(
    (embedding): Embedding(30522, 256, padding_idx=0)
    (pos_embedding): Embedding(128, 256)
    (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (self_attention): Attention(
      (q_proj): Linear(in_features=256, out_features=256, bias=True)
      (k_proj): Linear(in_features=256, out_features=256, bias=True)

In [19]:
import torch.optim as optim

# Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

CEL_loss = nn.CrossEntropyLoss(ignore_index=0)

In [20]:
vocab_size, input_dim, output_dim

(30522, 256, 256)

In [21]:
train_data[0].shape

torch.Size([128])

In [22]:
num_it = train_data.shape[0]

In [23]:
def label_shift_left(input_ids):
  bs = input_ids.shape[0]
  new_input_ids = input_ids[:, 1:] # 256->255
  new_input_ids = torch.cat((new_input_ids, torch.LongTensor([[0] for _ in range(bs)]).to(device)), -1)

  return new_input_ids

In [24]:
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

dataset = TensorDataset(train_data, train_mask, target_data, target_mask)

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [25]:
for epoch in range(5):

  model.train()
  epoch_loss = 0

  # gradient descent
  for i, samples in enumerate(dataloader):

    optimizer.zero_grad()

    src, src_mask, trg, trg_mask = samples

    src = src.to(device)
    src_mask = src_mask.to(device)
    trg = trg.to(device)
    trg_mask = trg_mask.to(device)

    trg_shifted = label_shift_left(trg)
    mask_shifted = label_shift_left(trg_mask)

    logits = model(src,
                   src_mask,
                   trg,
                   trg_mask
                   )

    logits = logits.reshape(-1, logits.shape[-1])
    trg_shifted = trg_shifted.reshape(-1)
    loss = CEL_loss(logits,trg_shifted)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
    # print(f'loss: {loss.item()}')
    optimizer.step()

    epoch_loss += loss.item()

  print(f'Epoch: {epoch} | Train Loss: {epoch_loss / len(dataloader)}')

  for i, param in enumerate(model.parameters()):
    if i == 5:
      break
    print(param.reshape(-1)[:5])
  print('')

Epoch: 0 | Train Loss: 5.812035832716071
tensor([-0.0001, -0.0081, -0.0069,  0.0016,  0.0050], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([-0.0969,  0.0204, -0.0484,  0.0125, -0.0105], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0.9985, 0.9926, 0.9963, 0.9934, 0.9945], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([-0.0027,  0.0088, -0.0032,  0.0023, -0.0043], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0.0099, 0.0969, 0.0297, 0.0920, 0.1129], device='cuda:0',
       grad_fn=<SliceBackward0>)

Epoch: 1 | Train Loss: 4.438894725364188
tensor([-0.0001, -0.0081, -0.0069,  0.0016,  0.0050], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([-0.0943,  0.0144, -0.0534,  0.0120, -0.0147], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([0.9997, 0.9913, 0.9886, 0.9914, 0.9899], device='cuda:0',
       grad_fn=<SliceBackward0>)
tensor([-0.0028,  0.0089, -0.0028,  0.0054, -0.0079], device='cuda:0',
       grad_fn=<SliceBackward

In [26]:
train_data[0]

tensor([  101,  1463, 30019, 29996, 30008, 30021,  1461, 30019, 30005, 30008,
        30023,  1469, 30011, 30020, 29997, 30019,   100,   100,   100,  1463,
        30006, 29997, 30009, 29999, 30013,  1029,  1464, 30009, 29991, 30006,
         1464, 30019, 29992, 30006, 30021, 30000, 30014,  1461, 30014, 29999,
        30008, 30024, 29999, 30017, 30022,   100,  1012,   102,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [33]:
model.eval()

input_seq = '어제 수업에 가지 못했는데 다음 수업 때 필요한게 있을까요?'

encoded =  tokenizer(input_seq,
                    return_tensors='pt',
                    padding='max_length',
                    truncation=True,
                    max_length=max_seq_len
                    )

input_ids = encoded['input_ids'].unsqueeze(0).to(device)
input_mask = encoded['attention_mask'].unsqueeze(0).to(device)

default_ids = torch.LongTensor([101]).unsqueeze(0).to(device) # start of sentence token ids
default_mask = torch.LongTensor([1]).unsqueeze(0).to(device)

for i in range(1, max_seq_len):
  pad_ids = torch.LongTensor([0 for _ in range(max_seq_len-i)]).unsqueeze(0).to(device)
  output_ids = torch.cat((default_ids, pad_ids), -1)

  pad_mask = torch.LongTensor([0 for _ in range(max_seq_len-i)]).unsqueeze(0).to(device)
  output_mask = torch.cat((default_mask, pad_mask), -1)

  logits = model(input_ids, input_mask, output_ids, output_mask)

  logits = F.softmax(logits, -1)
  next_token_id = torch.argmax(logits, -1)[0][0][i]

  if next_token_id == 102: # end of sentence token
    break

  default_ids = torch.cat((default_ids, next_token_id.unsqueeze(0).unsqueeze(0)), -1)
  default_mask = torch.cat((default_mask, torch.LongTensor([[1]]).to(device)), -1)

input_seq, ''.join(tokenizer.batch_decode(output_ids, skip_special_tokens=True))

('어제 수업에 가지 못했는데 다음 수업 때 필요한게 있을까요?',
 "how do you think about the of the the of the team's meeting the meeting meeting meeting meeting? meeting meeting?")