### 패키지 설치

In [1]:
!pip install -q torchdata torchtext==0.12 torch

## Transformer Encoder classification

In [2]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP


# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Set to False to skip notebook execution (e.g. for debugging)
RUN_EXAMPLES = True

In [3]:
# Prefer the first GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Part 1: Model Architecture

## Model Architecture  


In [5]:
# Stack of N layers
def clones(module, N):
    """Return an ``nn.ModuleList`` holding ``N`` independent deep copies of ``module``."""
    replicas = nn.ModuleList()
    for _ in range(N):
        replicas.append(copy.deepcopy(module))
    return replicas

In [6]:
"""
Encoder clf
임베딩->인코더->(batch_size,seq_length,d_model)
embedding: positional encoding + embedding
generator->linear(d_model, n_class) -> log_softmax -> (batch, n_class)
"""


class Encoderclf(nn.Module):
    """Encoder-only model: embed the source tokens, then run the encoder stack.

    The ``generator`` head is stored on the module but is NOT applied by
    ``forward``; callers invoke ``model.generator(...)`` on the output
    themselves.
    """

    def __init__(self, encoder, src_embed, generator):
        super().__init__()
        self.encoder = encoder      # the full encoder stack
        self.src_embed = src_embed  # embedding module applied to ``src``
        self.generator = generator  # classification head (applied by the caller)

    def encode(self, src):
        """Embed ``src`` and pass the result through the encoder."""
        embedded = self.src_embed(src)
        return self.encoder(embedded)

    def forward(self, src):
        """Same computation as :meth:`encode`."""
        return self.encode(src)

        

In [29]:
class Generator(nn.Module):
    """Classification head: mean-pool over dim 1, linear projection, log-softmax."""

    def __init__(self, d_model, n_class):
        super().__init__()
        self.proj = nn.Linear(d_model, n_class)

    def forward(self, x):
        """Map encoder output to per-class log-probabilities.

        ``x`` is averaged over dim 1 (every position contributes equally),
        projected to ``n_class`` scores and normalised with log-softmax over
        the last dimension.
        """
        pooled = torch.mean(x, dim=1)          # (batch, d_model)
        logits = self.proj(pooled)             # (batch, n_class)
        return log_softmax(logits, dim=-1)     # (batch, n_class)

In [9]:
class Encoder(nn.Module):
    """A stack of ``N`` copies of ``layer`` followed by a final layer norm."""

    def __init__(self, layer, N):
        super().__init__()
        self.layers = clones(layer, N)
        # ``layer.size`` is the feature width handed to the final norm.
        self.norm = LayerNorm(layer.size)

    def forward(self, x):
        """Feed ``x`` through every layer in order, then normalise the result."""
        hidden = x
        for block in self.layers:
            hidden = block(hidden)
        return self.norm(hidden)


In [8]:
class LayerNorm(nn.Module):
    """Layer normalisation over the last (feature) dimension.

    Each feature vector is shifted by its mean and divided by
    ``std + eps``, then scaled by the learnable gain ``a_2`` and shifted by
    the learnable bias ``b_2``.
    """

    def __init__(self, features, eps=1e-6):
        super().__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain, starts at 1
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias, starts at 0
        self.eps = eps

    def forward(self, x):
        """Normalise ``x`` along its last dimension."""
        centered = x - x.mean(-1, keepdim=True)
        # eps is added to the standard deviation itself, not the variance.
        denom = x.std(-1, keepdim=True) + self.eps
        return self.a_2 * centered / denom + self.b_2

In [10]:
class SublayerConnection(nn.Module):
    """Pre-norm residual wrapper around an arbitrary sublayer.

    Computes ``x + dropout(sublayer(norm(x)))``: the input is normalised
    first, the sublayer output goes through dropout, and the result is added
    back onto the un-normalised input.
    """

    def __init__(self, size, dropout):
        super().__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        """Apply ``sublayer`` (a callable that preserves shape) with a residual connection."""
        normed = self.norm(x)
        update = self.dropout(sublayer(normed))
        return x + update

In [11]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention then feed-forward, each wrapped in a
    :class:`SublayerConnection`."""

    def __init__(self, size, self_attn, feed_forward, dropout):
        super().__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x):
        """Run self-attention followed by the feed-forward network."""
        attn_residual, ffn_residual = self.sublayer

        def self_attention(normed):
            # Query, key and value are all the same tensor.
            return self.self_attn(normed, normed, normed)

        x = attn_residual(x, self_attention)
        return ffn_residual(x, self.feed_forward)

In [12]:
def attention(query, key, value, dropout=None):
    """Scaled dot-product attention.

    Scores are ``query @ key^T`` over the last two dimensions, divided by
    ``sqrt(d_k)`` where ``d_k`` is the size of ``query``'s last dimension,
    and normalised with a softmax over the last dimension. No mask is
    applied.

    Args:
        query, key, value: tensors whose last two dimensions are compatible
            with the matrix products above.
        dropout: optional callable applied to the attention weights.

    Returns:
        ``(weights @ value, weights)``; the returned weights are the
        post-dropout ones when ``dropout`` is given.
    """
    scale = math.sqrt(query.size(-1))
    scores = (query @ key.transpose(-2, -1)) / scale
    weights = torch.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return weights @ value, weights

In [13]:
class MultiHeadedAttention(nn.Module):
    """Multi-head attention with ``h`` heads of width ``d_model // h``.

    Holds four ``d_model -> d_model`` linear layers: three input projections
    (query, key, value) and one output projection. The most recent attention
    weights are kept in ``self.attn``.
    """

    def __init__(self, h, d_model, dropout=0.1):
        """Take in the number of heads and the model size."""
        super().__init__()
        assert d_model % h == 0
        # d_v is assumed to always equal d_k.
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def _split_heads(self, projected, nbatches):
        """Reshape ``(batch, seq, h * d_k)`` into ``(batch, h, seq, d_k)``."""
        # view() infers the dimension given as -1 from the remaining ones;
        # transpose() then swaps the sequence and head dimensions.
        return projected.view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

    def forward(self, query, key, value):
        """Project the inputs, attend per head, merge heads, project the output."""
        nbatches = query.size(0)
        q_proj, k_proj, v_proj, out_proj = self.linears

        # 1) Linear projections, split from d_model into h heads of d_k.
        query = self._split_heads(q_proj(query), nbatches)
        key = self._split_heads(k_proj(key), nbatches)
        value = self._split_heads(v_proj(value), nbatches)

        # 2) Attention over all heads at once.
        x, self.attn = attention(query, key, value, dropout=self.dropout)

        # 3) "Concatenate" the heads with a view, then the final linear layer.
        merged = x.transpose(1, 2).contiguous()
        merged = merged.view(nbatches, -1, self.h * self.d_k)
        return out_proj(merged)

In [14]:
class PositionwiseFeedForward(nn.Module):
    """Two-layer feed-forward network: ``w_2(dropout(relu(w_1(x))))``."""

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = torch.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

In [15]:
class Embeddings(nn.Module):
    """Token embedding lookup scaled by ``sqrt(d_model)``."""

    def __init__(self, d_model, vocab):
        super().__init__()
        self.lut = nn.Embedding(vocab, d_model)  # vocab rows, d_model columns
        self.d_model = d_model

    def forward(self, x):
        """Look up the embeddings of index tensor ``x`` and scale them."""
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

In [16]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to the input, then apply dropout.

    A ``(1, max_len, d_model)`` table is precomputed once and stored as the
    buffer ``pe``: even feature columns hold sines and odd columns hold
    cosines of ``position * div_term``.

    Args:
        d_model: feature width of the inputs (even or odd).
        dropout: dropout probability applied after adding the encodings.
        max_len: number of positions precomputed in the table.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even
        # columns, so only the first d_model // 2 frequencies get a cosine.
        # For even d_model this slice is the full set.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        """Add the encodings for the first ``x.size(1)`` positions to ``x``."""
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [17]:
def make_model(
    src_vocab, target, N=4, d_model=16, d_ff=32, h=4, dropout=0.1
):
    """Build an encoder-only classifier from hyperparameters.

    Args:
        src_vocab: vocabulary size passed to the embedding table.
        target: number of output classes of the ``Generator`` head.
        N: number of encoder layers.
        d_model: embedding / model width.
        d_ff: hidden width of the position-wise feed-forward network.
        h: number of attention heads (``d_model`` must be divisible by it).
        dropout: dropout probability used in every sub-module, including the
            attention weights.

    Returns:
        An ``Encoderclf`` whose parameters with more than one dimension have
        been re-initialised with Xavier/Glorot uniform.
    """
    c = copy.deepcopy
    # Forward ``dropout`` to the attention module as well; otherwise it
    # silently keeps its own default of 0.1 whatever the caller asked for.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = Encoderclf(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        Generator(d_model, target),
    )

    # Initialize weight matrices with Glorot / fan_avg; one-dimensional
    # parameters (biases, layer-norm gain and bias) keep their defaults.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

## Data Loading

In [18]:
from torch.utils.data import DataLoader
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer

# torchtext's built-in 'basic_english' tokenizer.
tokenizer=get_tokenizer('basic_english')
# IMDB training split; yield_tokens below iterates it as (label, text) pairs.
train_iter = IMDB(split='train')

# tokenizing
def yield_tokens(data_iter):
    """Lazily yield the token list of every ``(label, text)`` pair in ``data_iter``.

    The label is ignored; the module-level ``tokenizer`` is applied to the text.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

# Vocabulary (built from the training split only).
vocab_src = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>", '<blank>'],
)
# Tokens that are not in the vocabulary map to the "<unk>" index.
vocab_src.set_default_index(vocab_src["<unk>"])


# Pipelines
def text_pipeline(x):
    """Tokenize raw text ``x`` and return the vocabulary index of each token."""
    return vocab_src(tokenizer(x))


def label_pipeline(x):
    """Encode a sentiment label: ``'pos'`` -> 1.0, anything else -> 0."""
    if x == 'pos':
        return 1.
    return 0

In [19]:
# Vocabulary size, counting the "<unk>" and "<blank>" special tokens.
len(vocab_src)

100684

In [30]:
def collate_batch(
    batch,
    text_pipeline,
    label_pipeline,
    max_padding=1000,
    pad_id=2,
):
    """Turn a list of ``(label, text)`` samples into two stacked int tensors.

    Every text is converted to token ids with ``text_pipeline`` and brought
    to exactly ``max_padding`` ids: shorter sequences are right-padded with
    ``pad_id``; longer ones are cropped on the right, because the pad amount
    becomes negative.

    Args:
        batch: iterable of ``(label, text)`` pairs.
        text_pipeline: callable mapping raw text to a list of token ids.
        label_pipeline: callable mapping a raw label to a number.
        max_padding: fixed output sequence length.
        pad_id: id used for padding. The callers in this notebook always
            pass the "<blank>" index explicitly.

    Returns:
        ``(tgt, src)`` where ``tgt`` has shape ``(batch,)`` and ``src`` has
        shape ``(batch, max_padding)``; both are ``torch.int``.
    """
    encoded_labels = []
    padded_texts = []
    for raw_label, raw_text in batch:
        encoded_labels.append(
            torch.tensor(label_pipeline(raw_label), dtype=torch.int)
        )
        token_ids = torch.tensor(text_pipeline(raw_text), dtype=torch.int)
        right_pad = max_padding - len(token_ids)
        padded_texts.append(pad(token_ids, (0, right_pad), value=pad_id))

    src = torch.stack(padded_texts)
    tgt = torch.stack(encoded_labels)
    return (tgt, src)

In [31]:
from torch.utils.data.dataset import random_split

def create_dataloader(
    vocab_src,
    text_pipeline,
    label_pipeline,
    batch_size=64,
    max_padding=128,
    is_distributed=True,
):
    """Build train / validation ``DataLoader``s from the IMDB training split.

    The training split is converted to a map-style dataset and randomly
    divided 95% / 5% into training and validation subsets.

    Args:
        vocab_src: vocabulary that contains the '<blank>' padding token.
        text_pipeline: callable mapping raw text to token ids.
        label_pipeline: callable mapping a raw label to a number.
        batch_size: batch size of both loaders.
        max_padding: fixed sequence length produced by ``collate_batch``.
        is_distributed: accepted for interface compatibility; currently unused.

    Returns:
        ``(train_dataloader, valid_dataloader)``.
    """
    # Resolve the padding id once. get_stoi() returns the whole
    # token -> index mapping, which is wasteful to rebuild for every batch.
    pad_id = vocab_src.get_stoi()['<blank>']

    def collate_fn(batch):
        return collate_batch(
            batch,
            text_pipeline,
            label_pipeline,
            max_padding=max_padding,
            pad_id=pad_id,
        )

    # Only the training split is needed here.
    train_iter = datasets.IMDB(split='train')

    # A map-style dataset provides len(), which random_split requires.
    train_iter_map = to_map_style_dataset(train_iter)
    num_train = int(len(train_iter_map) * 0.95)
    split_train_, split_valid_ = random_split(
        train_iter_map, [num_train, len(train_iter_map) - num_train]
    )

    train_dataloader = DataLoader(
        split_train_,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )

    # Accuracy does not depend on batch order, so validation is not shuffled.
    valid_dataloader = DataLoader(
        split_valid_,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=collate_fn,
    )

    return train_dataloader, valid_dataloader


## Train

In [32]:
# Binary sentiment classifier: vocabulary-sized embedding table, two output classes.
model = make_model(src_vocab=len(vocab_src), target=2)

In [23]:
import time

def train(dataloader):
    """Run one training pass over ``dataloader``.

    Relies on the notebook-level globals ``model``, ``optimizer`` and
    ``criterion``; ``epoch`` is also read, but only when a progress line is
    printed. Each batch is a ``(label, text)`` pair. Gradients are clipped to
    a total norm of 0.1 before every optimizer step.

    Every 500 batches (never on batch 0) the running accuracy since the
    previous report is printed and the counters are reset.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500
    for idx, (label, text) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model.generator(model(text))
        # .long() keeps the tensor on its current device, unlike
        # .type(torch.LongTensor), which always produces a CPU tensor.
        label = label.long()
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the classification accuracy of the global ``model`` on ``dataloader``.

    The model is put in eval mode and gradients are disabled. Each batch is a
    ``(label, text)`` pair; the predicted class is the argmax over dim 1 of
    ``model.generator(model(text))``.

    Returns:
        Fraction of correctly classified samples, as a float.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.eval()
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model.generator(model(text))
            # .long() keeps the tensor on its current device, unlike
            # .type(torch.LongTensor), which always produces a CPU tensor.
            label = label.long()
            n_correct += (predicted_label.argmax(1) == label).sum().item()
            n_seen += label.size(0)
    return n_correct / n_seen

In [33]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
# Hyperparameters
EPOCHS = 10  # number of passes over the training subset
BATCH_SIZE = 64
lr = 1e-3  # Adam learning rate

# NOTE: the Generator head already returns log-probabilities and
# CrossEntropyLoss applies log-softmax again. A second log-softmax leaves
# log-probabilities unchanged, so the loss value is the same as with NLLLoss.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    (p for p in model.parameters() if p.requires_grad), lr=lr
)

# Each scheduler.step() multiplies the learning rate by 0.1; the training
# loop only calls it after an epoch whose validation accuracy is below the
# best seen so far.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
total_accu = 0.6  # starting value of the best-accuracy tracker
test_iter = IMDB(split='test')

# Pass BATCH_SIZE through so the constant above actually controls the loaders.
train_dataloader, valid_dataloader = create_dataloader(
    vocab_src, text_pipeline, label_pipeline, batch_size=BATCH_SIZE
)


In [34]:
# Seeds torch's RNG for everything drawn from this point on. The model
# weights and the train/validation split were created in earlier cells and
# are not affected by this seed.
torch.manual_seed(0)

for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)

    # Keep the best validation accuracy; decay the learning rate whenever an
    # epoch falls below it.
    improved = total_accu is None or accu_val >= total_accu
    if improved:
        total_accu = accu_val
    else:
        scheduler.step()

    separator = '-' * 59
    print(separator)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print(separator)


-----------------------------------------------------------
| end of epoch   1 | time: 235.06s | valid accuracy    0.781 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time: 235.93s | valid accuracy    0.850 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time: 235.94s | valid accuracy    0.848 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time: 232.80s | valid accuracy    0.853 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time: 232.90s | valid accuracy    0.850 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time: 234

In [37]:
def collate_fn(batch):
    """Collate a batch for the test loader: sequences fixed to 128 token ids,
    padded with the '<blank>' index."""
    return collate_batch(
        batch,
        text_pipeline,
        label_pipeline,
        max_padding=128,
        # Direct index lookup; get_stoi() would return the entire
        # token -> index mapping again for every batch.
        pad_id=vocab_src['<blank>'],
    )

In [39]:
# Map-style copy of the IMDB test split.
test_dataset = to_map_style_dataset(test_iter)

# Accuracy does not depend on batch order, so the test data is not shuffled.
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=False, collate_fn=collate_fn)

In [40]:
# Accuracy of the trained model on the IMDB test split.
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

test accuracy    0.819
