In [1]:
from gensim.models import KeyedVectors
import jieba
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm
import numpy as np
import pandas as pd
import opencc
from ckiptagger import WS
from datetime import datetime,timezone,timedelta
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import math

## Hyper Parameter

In [16]:
DATASET = "ATIS"
NUM_ROUTING_ITERATIONS = 4
KERNEL_SIZE = 2
HIDDEN_SIZE = 300
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 64
MODEL_PATH = "model/transCapsule" # svae/load model name/path
EPOCHS = 20
MAX_LENGTH = 64
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda:0


## Utility Function

In [3]:
def timestamp(msg=""):
    dt1 = datetime.utcnow().replace(tzinfo=timezone.utc)
    dt2 = dt1.astimezone(timezone(timedelta(hours=8))) # 轉換時區 -> 東八區
    print(str(dt2)[:-13] + '\t' + msg)

In [4]:
# high-level 顯示此模型裡的 modules
def model_info(model):
    total_params = sum(p.numel() for p in model.parameters())
    print(f"total params: {total_params}")
#     print(model.device)
    print("""
    name            module
    ----------------------""")
    for name, module in model.named_children():
        if name == "bert" or name=="0":
            for n, _ in module.named_children():
                print(f"{name}:{n}")
    #             print(_)
        else:
            print("{:15} {}".format(name, module))

## Data Preprocess

In [5]:
# W2V = KeyedVectors.load('w2v_100d.kv')
W2V = KeyedVectors.load('glove_en_300d.kv')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [6]:
class ckipDict(dict):
    def __init__(self, vocab, vec):
        self.vocab = vocab
        self.vec = vec
        
    def __getitem__(self, key):
        if key in self.vocab:
            idx = self.vocab.index(key)
            return self.vec[idx]

In [7]:
voc = np.load("../ckiptagger/data/embedding_word/token_list.npy").tolist()
vec = np.load("../ckiptagger/data/embedding_word/vector_list.npy")
W2V = ckipDict(voc, vec)

In [6]:
# en
def get_model_data(file_path, hidden_size=HIDDEN_SIZE):
    df = pd.read_csv(file_path, sep='\t')
    src_labels = sorted(set(df.labels.tolist()))
    num_labels = len(src_labels)
    df["labels"] = [src_labels.index(l) for l in df.labels.tolist()]
    data = []
    texts = df["texts"]
    labels = df["labels"]
    oov = []
    for i, text in (enumerate(texts)):
        seg_texts = []
        label = labels[i]
        sentence_seg = text.split()
        
        emb = []

        for seg_t in sentence_seg:
            if seg_t in W2V.vocab:
                emb += [W2V[seg_t]]
            else:
                oov += [seg_t]

        data_dict = {"emb": emb, "label":label, 
                     "src_texts": text, "src_label": src_labels[label],
                     "seg_texts": seg_texts}
        data += [data_dict]
    
    print(f"oov: {len(oov)} {len(set(oov))}, {sorted(set(oov))}")
    
    return data, num_labels # List[Dict[List]] = List[tokenizer output]

In [7]:
data_train, num_labels = get_model_data(f"data/{DATASET}/train.tsv")
data_test, _ = get_model_data(f"data/{DATASET}/test.tsv")

oov: 331 31, ['3357', '3724', '497766', "american's", 'ap57', 'ap68', 'ap80', "atlanta's", 'be1', 'd9s', "delta's", 'dh8', "don't", "friday's", "i'd", "i'll", "i'm", "i've", "it's", 'j31', 'l1011', "let's", 'nationair', "one's", "sunday's", "that's", "we're", "what're", "what's", "york's", 'yyz']
oov: 55 10, ['137338', 'ap58', 'ap80', 'be1', 'd9s', "doesn't", "i'd", "i'm", 'nationair', "what's"]


In [8]:
torch.save(data_train, f"bert_data/{DATASET}/transCapsule_train.pt")
torch.save(data_test, f"bert_data/{DATASET}/transCapsule_test.pt")

del ws

In [5]:
num_labels = 41 if DATASET == "base" else 31
data_train = torch.load(f"bert_data/{DATASET}/transCapsule_train.pt")
data_test = torch.load(f"bert_data/{DATASET}/transCapsule_test.pt")

In [8]:
ls = []
for item in data_train:
    ls += [np.array(item["emb"]).shape[0]]
ls = np.array(ls)
print(ls.max(), ls.min(), ls.mean())

46 1 11.23373888628919


## Dataset

In [9]:
class intent_Dataset(Dataset):
    def __init__(self, mode, list_of_bert):
        assert mode in ["train", "test", "dev"]
        self.mode = mode
        self.data = list_of_bert
    def __getitem__(self, idx):
        emb = torch.tensor(self.data[idx]["emb"])
        label = torch.tensor(self.data[idx]["label"])
        return emb, label

    def __len__(self):
        return len(self.data)

In [10]:
def minibatch(sample):
    sample.sort(key=lambda x: x[0].shape[0], reverse=True)
    embs, labels = zip(*sample)
    max_length = MAX_LENGTH
    masks = []
    pad_embs = []
    
    
    for e in embs:
        # padding
        pad_len = max_length - e.shape[0]
        padding = torch.zeros(pad_len, HIDDEN_SIZE)
        pad_embs += [torch.cat((e, padding)).tolist()]
        
        # attn masling
        masking = [False] * e.shape[0] + [True] * pad_len
        masks += [masking]
        
    pad_embs = torch.tensor(pad_embs)
    masks = torch.tensor(masks)
    labels = torch.tensor(labels)
    return pad_embs, masks, labels

## Model

In [11]:
# sorce form: https://pytorch.org/tutorials/beginner/transformer_tutorial.html
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=MAX_LENGTH):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [12]:
# source from: https://github.com/leftthomas/CapsNet/blob/master/capsule.py
class CapsuleLayer(nn.Module):
    def __init__(self, num_capsules, num_route_nodes, in_channels, out_channels, kernel_size=None, stride=None,
                 num_iterations=NUM_ROUTING_ITERATIONS):
        super(CapsuleLayer, self).__init__()

        self.num_route_nodes = num_route_nodes
        self.num_iterations = num_iterations

        self.num_capsules = num_capsules

        if num_route_nodes != -1: # digit_capsules
            self.route_weights = nn.Parameter(torch.randn(num_capsules, num_route_nodes, in_channels, out_channels))
        else: # primary_capsules
            self.capsules = nn.ModuleList(
                [nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=0) for _ in
                 range(num_capsules)])

    @staticmethod
    def squash(tensor, dim=-1):
        squared_norm = (tensor ** 2).sum(dim=dim, keepdim=True)
        scale = squared_norm / (1 + squared_norm)
        return scale * tensor / torch.sqrt(squared_norm)

    def forward(self, x):
        if self.num_route_nodes != -1: # digit_capsules
            priors = x[None, :, :, None, :] @ self.route_weights[:, None, :, :, :] # random initial hat u_j|i
            logits = Variable(torch.zeros(*priors.size())) # b_ij = 0
            if torch.cuda.is_available():
                logits = logits.cuda()
            for i in range(self.num_iterations):
                probs = F.softmax(logits, dim=2) # c_ij = softmax(b_ij)
                outputs = self.squash((probs * priors).sum(dim=2, keepdim=True)) # squash(sum c_ij * hat u_j|i)

                if i != self.num_iterations - 1:
                    delta_logits = (priors * outputs).sum(dim=-1, keepdim=True) # hat u_j|i * v_j
                    logits = logits + delta_logits #b_ij = b_ij + hat u_j|i * v_j
        else: # primary_capsules
            outputs = [capsule(x).view(x.size(0), -1, 1) for capsule in self.capsules]
            outputs = torch.cat(outputs, dim=-1)
            outputs = self.squash(outputs)

        return outputs

In [24]:
class intent_classifier(nn.Module):
    def __init__(self, kernel_size, num_labels, stride=1, hidden_size=HIDDEN_SIZE):
        super().__init__()
        self.pos_encoder = PositionalEncoding(d_model=hidden_size)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_size, nhead=4, dim_feedforward=300)
        self.transformer = nn.TransformerEncoder(encoder_layer, 1)
        self.primary_capsules = CapsuleLayer(num_capsules=100, num_route_nodes=-1,
                                in_channels=hidden_size, out_channels=1,
                                kernel_size=kernel_size, stride=stride)
        N = int((MAX_LENGTH - kernel_size + 1) / stride)
        self.intent_capsules = CapsuleLayer(num_capsules=num_labels, num_route_nodes=1 * N,
                                            in_channels=100, out_channels=15)
        self.decoder = nn.Sequential(
                        nn.Linear(15 * num_labels, num_labels),
                        nn.Softmax(dim=1))
    
    def forward(self, word_emb, mask):
        word_emb = self.pos_encoder(word_emb)
#         mask = self.generate_square_subsequent_mask(word_emb.shape[0]).to(device)
        word_emb = self.transformer(word_emb, src_key_padding_mask=mask)
        # transpose for conv1d
        # (len, batch, hidden_size) -> (batch, channel, len)
        word_emb = word_emb.transpose(0,1)
        word_emb = word_emb.transpose(1,2)
        
        word_emb = self.primary_capsules(word_emb)
        word_emb = self.intent_capsules(word_emb).squeeze().transpose(0, 1)
        word_emb = word_emb.flatten(start_dim=1)
        intent_class = self.decoder(word_emb)
        
        return intent_class

In [25]:
model = intent_classifier(KERNEL_SIZE, num_labels)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
loss_func = nn.CrossEntropyLoss()
model_info(model)

total params: 2213952

    name            module
    ----------------------
pos_encoder     PositionalEncoding(
  (dropout): Dropout(p=0.1, inplace=False)
)
transformer     TransformerEncoder(
  (layers): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): _LinearWithBias(in_features=300, out_features=300, bias=True)
      )
      (linear1): Linear(in_features=300, out_features=300, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=300, out_features=300, bias=True)
      (norm1): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
)
primary_capsules CapsuleLayer(
  (capsules): ModuleList(
    (0): Conv1d(300, 1, kernel_size=(2,), stride=(1,))
    (1): Conv1d(300, 1, kernel_size=(2,), stride=(1,))
    (2): Conv1d(300

## Train

In [20]:
def get_predictions(model, dataloader, compute_acc):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            word_embs, masks, labels = [t.to(device) for t in data if torch.is_tensor(t)]
            
            intent_cls = model(word_embs.transpose(0,1), masks)
            
            _, pred = torch.max(intent_cls, 1) # _: logits最大數值; pred: 最大數值的 index
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))

    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

In [21]:
trainSet = intent_Dataset("train", data_train)
trainLoader = DataLoader(trainSet, batch_size=TRAIN_BATCH_SIZE, collate_fn=minibatch, shuffle=True)
testSet = intent_Dataset("test", data_test)
testLoader = DataLoader(testSet, batch_size=TEST_BATCH_SIZE, collate_fn=minibatch)

In [22]:
"""tensorboard logger"""
# writer = SummaryWriter(f"runs/{DATASET}/transCapsule/K_{KERNEL_SIZE}_E_{EPOCHS}")

'tensorboard logger'

In [26]:
train_from = 0
model = model.to(device)
model.train()
timestamp(f"start training {MODEL_PATH} from epoch {train_from+1} to {EPOCHS}")
for epoch in range(train_from, EPOCHS):
    running_loss = 0.0
    for data in tqdm(trainLoader):
        word_embs, masks, labels = [t.to(device) for t in data]
#         print(word_emb.shape)
#         break

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(word_embs.transpose(0,1), masks) # transpose for transformer (len, batch, hidden_size)
        
        loss = loss_func(outputs, labels)
        # backward
        loss.backward()
        optimizer.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()

#     torch.save(model.state_dict(), F"{MODEL_PATH}_E_{str(epoch+1)}.pt")
    timestamp(f"[epoch {epoch+1}] loss: {running_loss:.3f}")
#     writer.add_scalar('Loss/cls', running_loss, epoch)

    _, acc = get_predictions(model, trainLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] training acc: {acc:.6f}")
#     writer.add_scalar('Acc/train', acc, epoch)

    _, acc = get_predictions(model, testLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] testing acc: {acc:.6f}")
#     writer.add_scalar('Acc/test', acc, epoch)

  0%|          | 2/535 [00:00<00:42, 12.54it/s]

2021-04-22 13:25:02	start training model/transCapsule from epoch 1 to 20


100%|██████████| 535/535 [00:34<00:00, 15.49it/s]


2021-04-22 13:25:37	[epoch 1] loss: 1333.381
[epoch 1] training acc: 0.742396


  0%|          | 2/535 [00:00<00:36, 14.52it/s]

[epoch 1] testing acc: 0.723549


100%|██████████| 535/535 [00:34<00:00, 15.43it/s]


2021-04-22 13:26:26	[epoch 2] loss: 1211.218
[epoch 2] training acc: 0.742396


  0%|          | 2/535 [00:00<00:34, 15.56it/s]

[epoch 2] testing acc: 0.723549


100%|██████████| 535/535 [00:34<00:00, 15.51it/s]


2021-04-22 13:27:14	[epoch 3] loss: 1187.348
[epoch 3] training acc: 0.742396


  0%|          | 2/535 [00:00<00:35, 14.95it/s]

[epoch 3] testing acc: 0.723549


100%|██████████| 535/535 [00:34<00:00, 15.51it/s]


2021-04-22 13:28:03	[epoch 4] loss: 1179.019
[epoch 4] training acc: 0.742396


  0%|          | 2/535 [00:00<00:34, 15.36it/s]

[epoch 4] testing acc: 0.723549


100%|██████████| 535/535 [00:34<00:00, 15.50it/s]


2021-04-22 13:28:52	[epoch 5] loss: 1175.122
[epoch 5] training acc: 0.742396


  0%|          | 2/535 [00:00<00:34, 15.30it/s]

[epoch 5] testing acc: 0.723549


100%|██████████| 535/535 [00:34<00:00, 15.55it/s]


2021-04-22 13:29:40	[epoch 6] loss: 1173.262
[epoch 6] training acc: 0.742396


  0%|          | 2/535 [00:00<00:33, 15.69it/s]

[epoch 6] testing acc: 0.723549


100%|██████████| 535/535 [00:34<00:00, 15.52it/s]


2021-04-22 13:30:29	[epoch 7] loss: 1172.546
[epoch 7] training acc: 0.742396


  0%|          | 2/535 [00:00<00:34, 15.23it/s]

[epoch 7] testing acc: 0.723549


100%|██████████| 535/535 [00:34<00:00, 15.47it/s]


2021-04-22 13:31:17	[epoch 8] loss: 1171.863
[epoch 8] training acc: 0.742396


  0%|          | 2/535 [00:00<00:34, 15.34it/s]

[epoch 8] testing acc: 0.723549


100%|██████████| 535/535 [00:34<00:00, 15.55it/s]


2021-04-22 13:32:06	[epoch 9] loss: 1171.412
[epoch 9] training acc: 0.742396


  0%|          | 2/535 [00:00<00:36, 14.60it/s]

[epoch 9] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 15.22it/s]


2021-04-22 13:32:56	[epoch 10] loss: 1171.103
[epoch 10] training acc: 0.742396


  0%|          | 2/535 [00:00<00:34, 15.25it/s]

[epoch 10] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 15.12it/s]


2021-04-22 13:33:45	[epoch 11] loss: 1170.511
[epoch 11] training acc: 0.742396


  0%|          | 2/535 [00:00<00:35, 14.98it/s]

[epoch 11] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 15.16it/s]


2021-04-22 13:34:35	[epoch 12] loss: 1170.355
[epoch 12] training acc: 0.742396


  0%|          | 2/535 [00:00<00:36, 14.80it/s]

[epoch 12] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 15.06it/s]


2021-04-22 13:35:25	[epoch 13] loss: 1170.242
[epoch 13] training acc: 0.742396


  0%|          | 2/535 [00:00<00:35, 15.09it/s]

[epoch 13] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 15.03it/s]


2021-04-22 13:36:15	[epoch 14] loss: 1170.159
[epoch 14] training acc: 0.742396


  0%|          | 2/535 [00:00<00:35, 14.93it/s]

[epoch 14] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 15.10it/s]


2021-04-22 13:37:05	[epoch 15] loss: 1170.097
[epoch 15] training acc: 0.742396


  0%|          | 2/535 [00:00<00:36, 14.51it/s]

[epoch 15] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 14.98it/s]


2021-04-22 13:37:56	[epoch 16] loss: 1170.426
[epoch 16] training acc: 0.742396


  0%|          | 2/535 [00:00<00:35, 14.84it/s]

[epoch 16] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 14.91it/s]


2021-04-22 13:38:46	[epoch 17] loss: 1170.391
[epoch 17] training acc: 0.742396


  0%|          | 2/535 [00:00<00:35, 15.02it/s]

[epoch 17] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 14.94it/s]


2021-04-22 13:39:37	[epoch 18] loss: 1169.990
[epoch 18] training acc: 0.742396


  0%|          | 2/535 [00:00<00:35, 14.90it/s]

[epoch 18] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 14.93it/s]


2021-04-22 13:40:27	[epoch 19] loss: 1169.971
[epoch 19] training acc: 0.742396


  0%|          | 2/535 [00:00<00:35, 15.19it/s]

[epoch 19] testing acc: 0.723549


100%|██████████| 535/535 [00:35<00:00, 15.01it/s]


2021-04-22 13:41:17	[epoch 20] loss: 1170.706
[epoch 20] training acc: 0.742396
[epoch 20] testing acc: 0.723549


---

In [36]:
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8, dim_feedforward=128)
src = torch.rand(10, 32, 512)
encoder_layer(src).shape

torch.Size([10, 32, 512])

In [17]:
pos_encoder = PositionalEncoding(d_model=100)
src = torch.rand(32,40,100)
pos_encoder(src).shape

torch.Size([32, 40, 100])

In [44]:
src = torch.rand(15,20)
src.flatten().shape

torch.Size([300])