In [1]:
from gensim.models import KeyedVectors
import jieba
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm
import numpy as np
import pandas as pd
import opencc
from ckiptagger import WS
from datetime import datetime,timezone,timedelta
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import math

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Hyper Parameter

In [2]:
DATASET = "SMP2018"
NUM_ROUTING_ITERATIONS = 4
KERNEL_SIZE = 2
HIDDEN_SIZE = 300
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 64
MODEL_PATH = "model/transCapsule" # svae/load model name/path
EPOCHS = 10
MAX_LENGTH = 64
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda:0


## Utility Function

In [3]:
def timestamp(msg=""):
    dt1 = datetime.utcnow().replace(tzinfo=timezone.utc)
    dt2 = dt1.astimezone(timezone(timedelta(hours=8))) # 轉換時區 -> 東八區
    print(str(dt2)[:-13] + '\t' + msg)

In [4]:
# high-level 顯示此模型裡的 modules
def model_info(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f"total params: {total_params}")
#     print(model.device)
    print("""
    name            module
    ----------------------""")
    for name, module in model.named_children():
        if name == "bert" or name=="0":
            for n, _ in module.named_children():
                print(f"{name}:{n}")
    #             print(_)
        else:
            print("{:15} {}".format(name, module))

## Data Preprocess

In [5]:
W2V = KeyedVectors.load('w2v_100d.kv')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [12]:
class ckipDict(dict):
    def __init__(self, vocab, vec):
        self.vocab = vocab
        self.vec = vec
        
    def __getitem__(self, key):
        if key in self.vocab:
            idx = self.vocab.index(key)
            return self.vec[idx]

In [13]:
voc = np.load("../ckiptagger/data/embedding_word/token_list.npy").tolist()
vec = np.load("../ckiptagger/data/embedding_word/vector_list.npy")
W2V = ckipDict(voc, vec)

In [14]:
ws = WS("../ckiptagger/data")
def get_model_data(file_path, hidden_size=HIDDEN_SIZE):
    t2s_converter = opencc.OpenCC("t2s.json")
    df = pd.read_csv(file_path, sep='\t')
    src_labels = sorted(set(df.labels.tolist()))
    num_labels = len(src_labels)
    df["labels"] = [src_labels.index(l) for l in df.labels.tolist()]
    data = []
    texts = df["texts"]

    labels = df["labels"]
    oov = []
    for i, t in (enumerate(texts)):
        seg_texts = []
        label = labels[i]
        sentence_seg = [seg_t for seg_t in ws([t])[0] if seg_t!=' '] # ckip segment
        
        data_dict = dict()
        emb = []

        for seg_t in sentence_seg:
            if seg_t in W2V.vocab:
                emb += [W2V[seg_t].tolist()]
                seg_texts += [seg_t]
                
            # if zh_tw not in w2v vocab try use zh_cn
            elif t2s_converter.convert(seg_t) in W2V.vocab:
                emb += [W2V[t2s_converter.convert(seg_t)].tolist()]
                seg_texts += [seg_t]
                
            # also not in w2v vocab, try jeiba 
            else:
                for sseg_t in jieba.cut(seg_t):
                    if sseg_t == ' ':
                        continue
                    seg_texts += [sseg_t]

                    if sseg_t in W2V.vocab:
                        emb += [W2V[sseg_t]]

                    else: # oov: mean vector of each character
#                         print(f"{sentence_seg} {seg_t} {sseg_t}")
                        temp = []
                        for char in sseg_t:
                            if char in W2V.vocab:
                                temp += [W2V[char]]
                        if len(temp) != 0:
                            emb += [np.stack(temp).mean(axis=0).tolist()]
                        oov += [sseg_t]
        
#         if len(emb) < MAX_LENGTH: # padding
#             emb += [[0]*hidden_size] * ((MAX_LENGTH)-len(emb))
        
        data_dict = {"emb": emb, "label":label, 
                     "src_texts": t, "src_label": src_labels[label],
                     "seg_texts": seg_texts}
        data += [data_dict]
    
    print(f"oov: {len(oov)} {len(set(oov))}, {sorted(set(oov))}")
    
    return data, num_labels # List[Dict[List]] = List[tokenizer output]



In [None]:
data_train, num_labels = get_model_data(f"data/{DATASET}/train.tsv")
data_test, _ = get_model_data(f"data/{DATASET}/test.tsv")

In [16]:
torch.save(data_train, f"bert_data/{DATASET}/transCapsule_train.pt")
torch.save(data_test, f"bert_data/{DATASET}/transCapsule_test.pt")

del ws

In [5]:
num_labels = 41 if DATASET == "base" else 31
data_train = torch.load(f"bert_data/{DATASET}/transCapsule_train.pt")
data_test = torch.load(f"bert_data/{DATASET}/transCapsule_test.pt")

In [6]:
ls = []
for item in data_train:
    ls += [np.array(item["emb"]).shape[0]]
ls = np.array(ls)
print(ls.max(), ls.min(), ls.mean())

31 1 5.193997390169639


## Dataset

In [7]:
class intent_Dataset(Dataset):
    def __init__(self, mode, list_of_bert):
        assert mode in ["train", "test", "dev"]
        self.mode = mode
        self.data = list_of_bert
    def __getitem__(self, idx):
        emb = torch.tensor(self.data[idx]["emb"])
        label = torch.tensor(self.data[idx]["label"])
        return emb, label

    def __len__(self):
        return len(self.data)

In [8]:
def minibatch(sample):
    sample.sort(key=lambda x: x[0].shape[0], reverse=True)
    embs, labels = zip(*sample)
    max_length = MAX_LENGTH
    masks = []
    pad_embs = []
    
    
    for e in embs:
        # padding
        pad_len = max_length - e.shape[0]
        padding = torch.zeros(pad_len, HIDDEN_SIZE)
        pad_embs += [torch.cat((e, padding)).tolist()]
        
        # attn masling
        masking = [False] * e.shape[0] + [True] * pad_len
        masks += [masking]
        
    pad_embs = torch.tensor(pad_embs)
    masks = torch.tensor(masks)
    labels = torch.tensor(labels)
    return pad_embs, masks, labels

## Model

In [9]:
# sorce form: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=MAX_LENGTH):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [10]:
# source from: https://github.com/leftthomas/CapsNet/blob/master/capsule.py
class CapsuleLayer(nn.Module):
    def __init__(self, num_capsules, num_route_nodes, in_channels, out_channels, kernel_size=None, stride=None,
                 num_iterations=NUM_ROUTING_ITERATIONS):
        super(CapsuleLayer, self).__init__()

        self.num_route_nodes = num_route_nodes
        self.num_iterations = num_iterations

        self.num_capsules = num_capsules

        if num_route_nodes != -1: # digit_capsules
            self.route_weights = nn.Parameter(torch.randn(num_capsules, num_route_nodes, in_channels, out_channels))
        else: # primary_capsules
            self.capsules = nn.ModuleList(
                [nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=0) for _ in
                 range(num_capsules)])

    @staticmethod
    def squash(tensor, dim=-1):
        squared_norm = (tensor ** 2).sum(dim=dim, keepdim=True)
        scale = squared_norm / (1 + squared_norm)
        return scale * tensor / torch.sqrt(squared_norm)

    def forward(self, x):
        if self.num_route_nodes != -1: # digit_capsules
            priors = x[None, :, :, None, :] @ self.route_weights[:, None, :, :, :] # random initial hat u_j|i
            logits = Variable(torch.zeros(*priors.size())) # b_ij = 0
            if torch.cuda.is_available():
                logits = logits.cuda()
            for i in range(self.num_iterations):
                probs = F.softmax(logits, dim=2) # c_ij = softmax(b_ij)
                outputs = self.squash((probs * priors).sum(dim=2, keepdim=True)) # squash(sum c_ij * hat u_j|i)

                if i != self.num_iterations - 1:
                    delta_logits = (priors * outputs).sum(dim=-1, keepdim=True) # hat u_j|i * v_j
                    logits = logits + delta_logits #b_ij = b_ij + hat u_j|i * v_j
        else: # primary_capsules
            outputs = [capsule(x).view(x.size(0), -1, 1) for capsule in self.capsules]
            outputs = torch.cat(outputs, dim=-1)
            outputs = self.squash(outputs)

        return outputs

In [11]:
class intent_classifier(nn.Module):
    def __init__(self, kernel_size, num_labels, stride=1, hidden_size=HIDDEN_SIZE):
        super().__init__()
        self.pos_encoder = PositionalEncoding(d_model=hidden_size)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=12, dim_feedforward=300)
        self.transformer = nn.TransformerEncoder(encoder_layer, 1)
        self.primary_capsules = CapsuleLayer(num_capsules=1, num_route_nodes=-1,
                                in_channels=hidden_size, out_channels=100,
                                kernel_size=kernel_size, stride=stride)
        N = int((MAX_LENGTH - kernel_size + 1) / stride)
        self.intent_capsules = CapsuleLayer(num_capsules=num_labels, num_route_nodes=100 * N,
                                            in_channels=1, out_channels=15)
        self.decoder = nn.Sequential(
                        nn.Linear(15 * num_labels, num_labels),
                        nn.Softmax(dim=1))
    
    def forward(self, word_emb, mask):
        word_emb = self.pos_encoder(word_emb)
#         mask = self.generate_square_subsequent_mask(word_emb.shape[0]).to(device)
        word_emb = self.transformer(word_emb, src_key_padding_mask=mask)
        # transpose for conv1d
        # (len, batch, hidden_size) -> (batch, channel, len)
        word_emb = word_emb.transpose(0,1)
        word_emb = word_emb.transpose(1,2)
        
        word_emb = self.primary_capsules(word_emb)
        word_emb = self.intent_capsules(word_emb).squeeze().transpose(0, 1)
        word_emb = word_emb.flatten(start_dim=1)
        intent_class = self.decoder(word_emb)
        
        return intent_class

In [12]:
model = intent_classifier(KERNEL_SIZE, num_labels)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
loss_func = nn.CrossEntropyLoss()
model_info(model)

total params: 3547046

    name            module
    ----------------------
pos_encoder     PositionalEncoding(
  (dropout): Dropout(p=0.1, inplace=False)
)
transformer     TransformerEncoder(
  (layers): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
      )
      (linear1): Linear(in_features=300, out_features=300, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=300, out_features=300, bias=True)
      (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
)
primary_capsules CapsuleLayer(
  (capsules): ModuleList(
    (0): Conv1d(300, 100, kernel_size=(2,), stride=(1,))
  )
)
intent_capsules CapsuleLayer()
decoder         Sequential(
  (0): 

## Train

In [13]:
def get_predictions(model, dataloader, compute_acc):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            word_embs, masks, labels = [t.to(device) for t in data if torch.is_tensor(t)]
            
            intent_cls = model(word_embs.transpose(0,1), masks)
            
            _, pred = torch.max(intent_cls, 1) # _: logits最大數值; pred: 最大數值的 index
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))

    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

In [14]:
trainSet = intent_Dataset("train", data_train)
trainLoader = DataLoader(trainSet, batch_size=TRAIN_BATCH_SIZE, collate_fn=minibatch, shuffle=True)
testSet = intent_Dataset("test", data_test)
testLoader = DataLoader(testSet, batch_size=TEST_BATCH_SIZE, collate_fn=minibatch)

In [15]:
"""tensorboard logger"""
# writer = SummaryWriter(f"runs/{DATASET}/transCapsule/K_{KERNEL_SIZE}_E_{EPOCHS}")

'tensorboard logger'

In [16]:
train_from = 0
model = model.to(device)
model.train()
timestamp(f"start training {MODEL_PATH} from epoch {train_from+1} to {EPOCHS}")
for epoch in range(train_from, EPOCHS):
    running_loss = 0.0
    for data in tqdm(trainLoader):
        word_embs, masks, labels = [t.to(device) for t in data]
#         print(word_emb.shape)
#         break

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(word_embs.transpose(0,1), masks) # transpose for transformer (len, batch, hidden_size)
        
        loss = loss_func(outputs, labels)
        # backward
        loss.backward()
        optimizer.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()

#     torch.save(model.state_dict(), F"{MODEL_PATH}_E_{str(epoch+1)}.pt")
    timestamp(f"[epoch {epoch+1}] loss: {running_loss:.3f}")
#     writer.add_scalar('Loss/cls', running_loss, epoch)

    _, acc = get_predictions(model, trainLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] training acc: {acc:.6f}")
#     writer.add_scalar('Acc/train', acc, epoch)

    _, acc = get_predictions(model, testLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] testing acc: {acc:.6f}")
#     writer.add_scalar('Acc/test', acc, epoch)

  0%|          | 0/72 [00:00<?, ?it/s]

2021-04-20 10:50:06	start training model/transCapsule from epoch 1 to 10


100%|██████████| 72/72 [00:46<00:00,  1.54it/s]


2021-04-20 10:50:53	[epoch 1] loss: 247.264
[epoch 1] training acc: 0.014789


  0%|          | 0/72 [00:00<?, ?it/s]

[epoch 1] testing acc: 0.014286


100%|██████████| 72/72 [00:47<00:00,  1.52it/s]


2021-04-20 10:52:21	[epoch 2] loss: 247.260
[epoch 2] training acc: 0.014789


  0%|          | 0/72 [00:00<?, ?it/s]

[epoch 2] testing acc: 0.014286


100%|██████████| 72/72 [00:47<00:00,  1.52it/s]


2021-04-20 10:53:49	[epoch 3] loss: 247.256
[epoch 3] training acc: 0.030883


  0%|          | 0/72 [00:00<?, ?it/s]

[epoch 3] testing acc: 0.031169


100%|██████████| 72/72 [00:47<00:00,  1.51it/s]


2021-04-20 10:55:18	[epoch 4] loss: 247.250
[epoch 4] training acc: 0.030883


  0%|          | 0/72 [00:00<?, ?it/s]

[epoch 4] testing acc: 0.031169


100%|██████████| 72/72 [00:49<00:00,  1.47it/s]


2021-04-20 10:56:49	[epoch 5] loss: 247.244
[epoch 5] training acc: 0.030883


  0%|          | 0/72 [00:00<?, ?it/s]

[epoch 5] testing acc: 0.031169


  0%|          | 0/72 [00:00<?, ?it/s]


KeyboardInterrupt: 

---

In [36]:
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=128)
src = torch.rand(10, 32, 512)
encoder_layer(src).shape

torch.Size([10, 32, 512])

In [17]:
pos_encoder = PositionalEncoding(d_model=100)
src = torch.rand(32,40,100)
pos_encoder(src).shape

torch.Size([32, 40, 100])

In [44]:
src = torch.rand(15,20)
src.flatten().shape

torch.Size([300])