In [1]:
from gensim.models import KeyedVectors
import jieba
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
# from tqdm import tqdm
import numpy as np
import pandas as pd
import opencc
# from ckiptagger import WS
from datetime import datetime,timezone,timedelta
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
import math
from transformers import BertConfig, BertModel, AdamW

## Hyperparameters

In [2]:
# Dataset name and pretrained-embedding set; both are interpolated into data paths in later cells.
DATASET = "ATIS"
PRETRAIN = "embedding_word"
# PRETRAIN = "embedding_character"
# Default number of routing iterations used by CapsuleLayer.
NUM_ROUTING_ITERATIONS = 4
# Passed to intent_classifier as kernel_size.
KERNEL_SIZE = 2
# Embedding width; also the width of the zero padding built in minibatch.
HIDDEN_SIZE = 300
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 64
MODEL_PATH = "model/transCapsule" # save/load model name/path
EPOCHS = 50
# Every batch is padded to this many positions by minibatch.
MAX_LENGTH = 64
# Learning rate handed to the optimizer.
LR = 3e-5
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda:0


## Utility Function

In [3]:
def timestamp(msg=""):
    """Print the current UTC+8 time, a tab, and ``msg``.

    The time is formatted explicitly as ``YYYY-MM-DD HH:MM:SS``. Slicing a fixed
    number of characters off ``str(datetime)`` is unreliable because the
    microsecond field is omitted from the string whenever it is exactly zero.

    Args:
        msg: Text appended after the tab separator.
    """
    # Build a timezone-aware "now" directly in UTC+8.
    local_now = datetime.now(timezone(timedelta(hours=8)))
    print(local_now.strftime("%Y-%m-%d %H:%M:%S") + '\t' + msg)

In [4]:
# High-level overview of the modules contained in a model.
def model_info(model):
    """Print the total parameter count and a table of the model's direct children.

    Children named "bert" or "0" are expanded one level: only the names of
    their own children are listed, as ``parent:child``. Every other child is
    printed with its full module repr.
    """
    param_count = 0
    for param in model.parameters():
        param_count += param.numel()
    print(f"total params: {param_count}")
    print("""
    name            module
    ----------------------""")
    for child_name, child in model.named_children():
        if child_name not in ("bert", "0"):
            print("{:15} {}".format(child_name, child))
            continue
        # Large containers: list sub-module names only.
        for sub_name, _ in child.named_children():
            print(f"{child_name}:{sub_name}")

## Data Preprocessing

In [7]:
# Pretrained word vectors consumed by get_model_data (via W2V.vocab and W2V[token]).
# The file name suggests 300-dimensional GloVe vectors — TODO confirm it matches HIDDEN_SIZE.
# W2V = KeyedVectors.load('../LM/w2v_100d.kv')
W2V = KeyedVectors.load('../LM/glove_en_300d.kv')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [23]:
class ckipDict(dict):
    """Token -> vector lookup over a parallel token list and vector sequence.

    ``vocab[i]`` is the token whose vector is ``vec[i]``. The underlying
    ``dict`` storage is left empty; lookups go through a private position
    index built once at construction, so each lookup is O(1) instead of two
    linear scans of ``vocab``.

    Attributes:
        vocab: The token sequence as passed in (kept so callers can still use
            ``token in obj.vocab``).
        vec: The vector sequence/array, indexable by integer position.

    Note:
        The position index is built in ``__init__``; mutating ``vocab``
        afterwards is not reflected in lookups.
    """

    def __init__(self, vocab, vec):
        super().__init__()
        self.vocab = vocab
        self.vec = vec
        # First occurrence wins, matching the semantics of list.index().
        self._index = {}
        for position, token in enumerate(vocab):
            self._index.setdefault(token, position)

    def __contains__(self, key):
        """Return True when ``key`` is a known token."""
        return key in self._index

    def __getitem__(self, key):
        """Return the vector stored for ``key``.

        Raises:
            KeyError: If ``key`` is not in the vocabulary.
        """
        try:
            idx = self._index[key]
        except KeyError:
            # KeyError is the conventional mapping error and is still caught
            # by any handler written for the broader BaseException.
            raise KeyError(f"key error: {key}") from None
        return self.vec[idx]

In [24]:
# Load the token list and the vector array for the embedding set selected by PRETRAIN.
voc = np.load(f"../ckiptagger/data/{PRETRAIN}/token_list.npy").tolist()
vec = np.load(f"../ckiptagger/data/{PRETRAIN}/vector_list.npy")
# NOTE(review): this rebinds W2V, which another cell assigns from KeyedVectors.load;
# whichever of the two cells ran last decides which vectors get_model_data uses.
W2V = ckipDict(voc, vec)

In [5]:
# en
def get_model_data(file_path, hidden_size=HIDDEN_SIZE, label_list=None):
    """Read a tab-separated intent file and turn each row into an embedded example.

    The file must have ``texts`` and ``labels`` columns. Each text is split on
    whitespace; tokens found in the module-level ``W2V`` vocabulary are mapped
    to their vectors, and the rest are collected as out-of-vocabulary (OOV)
    tokens and reported once at the end.

    Args:
        file_path: Path of the TSV file to read.
        hidden_size: Unused; kept so existing call sites stay valid.
        label_list: Optional ordered list of label names to encode against.
            When omitted, the sorted set of labels found in this file is used,
            so two files with different label sets get different encodings.
            Pass the same list for every split to keep label ids consistent.

    Returns:
        ``(data, num_labels)``. ``data`` is a list with one dict per row:
        ``emb`` (vectors of the in-vocabulary tokens), ``label`` (integer id),
        ``src_texts`` (raw text), ``src_label`` (label name) and ``seg_texts``
        (the in-vocabulary tokens, aligned with ``emb``). ``num_labels`` is
        the size of the label vocabulary that was used.

    Raises:
        ValueError: If ``label_list`` is given and the file contains a label
            that is not in it.
    """
    df = pd.read_csv(file_path, sep='\t')
    raw_labels = df["labels"].tolist()
    src_labels = sorted(set(raw_labels)) if label_list is None else list(label_list)
    num_labels = len(src_labels)

    # One dict lookup per row instead of a linear list.index() scan.
    label_to_id = {}
    for idx, name in enumerate(src_labels):
        label_to_id.setdefault(name, idx)

    unknown = set(raw_labels) - set(label_to_id)
    if unknown:
        raise ValueError(f"labels missing from label_list: {sorted(unknown)}")

    data = []
    oov = []
    for text, raw_label in zip(df["texts"], raw_labels):
        label = label_to_id[raw_label]
        emb = []
        seg_texts = []  # tokens that received an embedding, aligned with emb

        for seg_t in text.split():
            if seg_t in W2V.vocab:
                emb.append(W2V[seg_t])
                seg_texts.append(seg_t)
            else:
                oov.append(seg_t)

        data.append({"emb": emb, "label": label,
                     "src_texts": text, "src_label": src_labels[label],
                     "seg_texts": seg_texts})

    print(f"oov: {len(oov)} {len(set(oov))}, {sorted(set(oov))}")

    return data, num_labels # List[Dict] — one dict per input row

In [8]:
# Build the example lists for the train and test splits of the selected dataset.
data_train, num_labels = get_model_data(f"data/{DATASET}/train.tsv")
# NOTE(review): get_model_data derives the label-to-id mapping from the labels present in
# the file it reads, so test ids only line up with train ids if both files contain the
# same label set — confirm for this dataset.
data_test, _ = get_model_data(f"data/{DATASET}/test.tsv")

oov: 331 31, ['3357', '3724', '497766', "american's", 'ap57', 'ap68', 'ap80', "atlanta's", 'be1', 'd9s', "delta's", 'dh8', "don't", "friday's", "i'd", "i'll", "i'm", "i've", "it's", 'j31', 'l1011', "let's", 'nationair', "one's", "sunday's", "that's", "we're", "what're", "what's", "york's", 'yyz']
oov: 55 10, ['137338', 'ap58', 'ap80', 'be1', 'd9s', "doesn't", "i'd", "i'm", 'nationair', "what's"]


In [27]:
# Cache the preprocessed examples so later sessions can skip the preprocessing cells.
torch.save(data_train, f"bert_data/{DATASET}/transCapsule_train.pt")
torch.save(data_test, f"bert_data/{DATASET}/transCapsule_test.pt")

# Drop the word-segmenter object if one was created in this session. No visible cell
# defines `ws` (the ckiptagger import is commented out), so an unconditional `del`
# would raise NameError on a fresh run.
if "ws" in globals():
    del ws

In [5]:
# Reload the cached examples instead of re-running the preprocessing cells.
# NOTE(review): the label count is hard-coded per dataset here rather than taken from
# get_model_data — confirm that 41 / 31 match the cached files.
num_labels = 41 if DATASET == "base" else 31
# torch.load unpickles the files; only load caches produced by this notebook.
data_train = torch.load(f"bert_data/{DATASET}/transCapsule_train.pt")
data_test = torch.load(f"bert_data/{DATASET}/transCapsule_test.pt")

In [9]:
# Token-count statistics over the training examples: the number of rows in each "emb" entry.
ls = np.array([np.array(item["emb"]).shape[0] for item in data_train])
longest_idx = ls.argmax()
shortest_idx = ls.argmin()
print(ls.max(), ls.min(), ls.mean())
# Show the source text of the longest and of the shortest example.
print(longest_idx, data_train[longest_idx]["src_texts"])
print(shortest_idx, data_train[shortest_idx]["src_texts"])

46 1 11.23373888628919
86 i want to travel from kansas city to chicago round trip leaving wednesday june sixteenth arriving in chicago at around 7 o'clock in the evening and returning the next day arriving in kansas city at around 7 o'clock in the evening which airlines fly that route
18 what's restriction ap68


## Dataset

In [10]:
class intent_Dataset(Dataset):
    """Dataset over a list of example dicts carrying ``"emb"`` and ``"label"`` entries.

    Args:
        mode: One of ``"train"``, ``"test"`` or ``"dev"``; stored but not
            otherwise used by this class.
        list_of_bert: Indexable collection of example dicts.
    """

    def __init__(self, mode, list_of_bert):
        allowed_modes = ["train", "test", "dev"]
        assert mode in allowed_modes
        self.mode, self.data = mode, list_of_bert

    def __len__(self):
        """Number of stored examples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(emb, label)`` for example ``idx``, both converted to tensors."""
        example = self.data[idx]
        return torch.tensor(example["emb"]), torch.tensor(example["label"])

In [11]:
def minibatch(sample):
    """Collate ``(emb, label)`` pairs into fixed-length padded tensors.

    Examples are ordered by sequence length, longest first. Each embedding
    matrix is zero-padded to ``MAX_LENGTH`` rows; longer sequences are
    truncated to ``MAX_LENGTH`` rather than producing a negative pad length.

    Args:
        sample: List of ``(emb, label)`` pairs as produced by the dataset.
            ``emb`` is expected to be ``(seq_len, HIDDEN_SIZE)``; an empty
            ``emb`` yields an all-padding row.

    Returns:
        ``(pad_embs, masks, labels)`` where ``pad_embs`` is a float tensor of
        shape ``(batch, MAX_LENGTH, HIDDEN_SIZE)``, ``masks`` is an integer
        tensor of shape ``(batch, MAX_LENGTH)`` holding 1 for real positions
        and 0 for padding, and ``labels`` is the tensor of labels in the same
        (sorted) order.
    """
    ordered = sorted(sample, key=lambda pair: pair[0].shape[0], reverse=True)
    embs, labels = zip(*ordered)
    max_length = MAX_LENGTH

    # Preallocate the padded outputs and copy each sequence in; this avoids the
    # tensor -> nested list -> tensor round-trip.
    pad_embs = torch.zeros(len(embs), max_length, HIDDEN_SIZE)
    masks = torch.zeros(len(embs), max_length, dtype=torch.long)

    for row, e in enumerate(embs):
        keep = min(e.shape[0], max_length)  # truncate over-long sequences
        if keep:
            pad_embs[row, :keep] = e[:keep]
            masks[row, :keep] = 1  # attention mask: 1 = real token

    labels = torch.tensor(labels)
    return pad_embs, masks, labels

## Model

In [12]:
# source from: https://github.com/leftthomas/CapsNet/blob/master/capsule.py
class CapsuleLayer(nn.Module):
    """Capsule layer that is either a routing layer or a bank of convolutional capsules.

    Args:
        num_capsules: Number of output capsules.
        num_route_nodes: Number of input nodes routed to each output capsule.
            Pass ``-1`` to build the convolutional (primary) variant instead.
        in_channels: Size of each input node vector (routing variant) or the
            number of Conv1d input channels (primary variant).
        out_channels: Size of each output capsule vector (routing variant) or
            the number of Conv1d output channels (primary variant).
        kernel_size: Conv1d kernel size; only used by the primary variant.
        stride: Conv1d stride; only used by the primary variant.
        num_iterations: Number of routing iterations (routing variant).
    """

    def __init__(self, num_capsules, num_route_nodes, in_channels, out_channels, kernel_size=None, stride=None,
                 num_iterations=NUM_ROUTING_ITERATIONS):
        super(CapsuleLayer, self).__init__()

        self.num_route_nodes = num_route_nodes
        self.num_iterations = num_iterations

        self.num_capsules = num_capsules

        if num_route_nodes != -1: # digit_capsules
            self.route_weights = nn.Parameter(torch.randn(num_capsules, num_route_nodes, in_channels, out_channels))
        else: # primary_capsules
            self.capsules = nn.ModuleList(
                [nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=0) for _ in
                 range(num_capsules)])

    @staticmethod
    def squash(tensor, dim=-1, eps=1e-8):
        """Scale vectors along ``dim`` to a length of ``|s|^2 / (1 + |s|^2)``.

        ``eps`` is added under the square root so that an all-zero vector maps
        to zero instead of NaN (0/0) and the gradient stays finite.
        """
        squared_norm = (tensor ** 2).sum(dim=dim, keepdim=True)
        scale = squared_norm / (1 + squared_norm)
        return scale * tensor / torch.sqrt(squared_norm + eps)

    def forward(self, x):
        """Compute the capsule outputs.

        Routing variant: ``x`` is ``(batch, num_route_nodes, in_channels)`` and
        the result is ``(num_capsules, batch, 1, 1, out_channels)``.
        Primary variant: ``x`` is a Conv1d input and the result is
        ``(batch, out_channels * conv_length, num_capsules)``.
        """
        if self.num_route_nodes != -1: # digit_capsules
            # Prediction vectors u_hat_{j|i}: (num_capsules, batch, nodes, 1, out_channels)
            priors = x[None, :, :, None, :] @ self.route_weights[:, None, :, :, :]
            # Routing logits b_ij start at zero, on the same device/dtype as the predictions.
            logits = torch.zeros_like(priors)
            for i in range(self.num_iterations):
                probs = F.softmax(logits, dim=2) # c_ij = softmax(b_ij)
                outputs = self.squash((probs * priors).sum(dim=2, keepdim=True)) # v_j = squash(sum_i c_ij * u_hat_{j|i})

                if i != self.num_iterations - 1:
                    delta_logits = (priors * outputs).sum(dim=-1, keepdim=True) # agreement u_hat_{j|i} . v_j
                    logits = logits + delta_logits # b_ij += u_hat_{j|i} . v_j
        else: # primary_capsules
            # One Conv1d per capsule; flatten each feature map and stack capsules on the last axis.
            outputs = [capsule(x).view(x.size(0), -1, 1) for capsule in self.capsules]
            outputs = torch.cat(outputs, dim=-1)
            outputs = self.squash(outputs)

        return outputs

In [19]:
class intent_classifier(nn.Module):
    """Transformer encoder followed by a routing capsule layer with one capsule per intent.

    The encoder is a ``BertModel`` built from ``./config.json`` (randomly
    initialised, no pooling layer) and fed pre-computed word embeddings. The
    primary-capsule stage is disabled: the encoder output goes straight into
    the intent capsules, with each hidden dimension acting as one route node
    whose vector runs over the ``MAX_LENGTH`` sequence positions.

    Args:
        kernel_size: Unused while the primary-capsule stage is disabled.
        num_labels: Number of intent classes (output capsules).
        stride: Unused while the primary-capsule stage is disabled.
        hidden_size: Unused; the encoder width is read from the config.
    """

    def __init__(self, kernel_size, num_labels, stride=1, hidden_size=HIDDEN_SIZE):
        super().__init__()
        config = BertConfig.from_json_file("./config.json")
        self.transformer = BertModel(config, add_pooling_layer=False)

        # w/o primary capsules: the number of route nodes must equal the encoder's
        # hidden width, so take it from the config instead of hard-coding it.
        self.intent_capsules = CapsuleLayer(num_capsules=num_labels, num_route_nodes=config.hidden_size,
                                            in_channels=MAX_LENGTH, out_channels=15)

    def forward(self, word_emb, mask):
        """Return per-class scores of shape ``(batch, num_labels)``.

        Args:
            word_emb: Input embeddings, ``(batch, MAX_LENGTH, hidden)``.
            mask: Attention mask, ``(batch, MAX_LENGTH)``.
        """
        outputs = self.transformer(inputs_embeds=word_emb, \
                                   attention_mask=mask, return_dict=True)
        word_emb = outputs.last_hidden_state
        # (batch, len, hidden) -> (batch, hidden, len): hidden dims become route nodes
        word_emb = word_emb.transpose(1,2)
        # Capsule output is (num_labels, batch, 1, 1, 15). Drop only the two singleton
        # axes; a bare squeeze() would also drop the batch axis when batch == 1.
        word_emb = self.intent_capsules(word_emb).squeeze(3).squeeze(2).transpose(0, 1)

        # origin capsule network: class score = capsule length, then softmax over classes
        intent_class = (word_emb ** 2).sum(dim=-1) ** 0.5
        intent_class = F.softmax(intent_class, dim=-1)

        return intent_class

In [14]:
class CapsuleLoss(nn.Module):
    """Margin loss over per-class scores, summed across all classes and examples."""

    def __init__(self):
        super().__init__()

    def forward(self, classes, labels):
        """Return the summed margin loss.

        Args:
            classes: Per-class scores.
            labels: One-hot targets with the same shape as ``classes``.
        """
        # Penalty for a true class scoring below 0.9.
        present_penalty = F.relu(0.9 - classes).pow(2)
        # Penalty for a non-target class scoring above 0.1, weighted by 0.5 below.
        absent_penalty = F.relu(classes - 0.1).pow(2)

        per_class = labels * present_penalty + 0.5 * (1. - labels) * absent_penalty
        return per_class.sum()

In [20]:
# Build the classifier, its optimizer and the margin-loss criterion, then print a module summary.
model = intent_classifier(KERNEL_SIZE, num_labels)
# NOTE(review): AdamW here is the one imported from transformers; newer releases point
# users to torch.optim.AdamW instead — confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=LR)
# loss_func = nn.CrossEntropyLoss()
loss_func = CapsuleLoss()
model_info(model)

total params: 11797800

    name            module
    ----------------------
transformer     BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 300, padding_idx=0)
    (position_embeddings): Embedding(64, 300)
    (token_type_embeddings): Embedding(2, 300)
    (LayerNorm): LayerNorm((300,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=300, out_features=300, bias=True)
            (key): Linear(in_features=300, out_features=300, bias=True)
            (value): Linear(in_features=300, out_features=300, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=300, out_features=300, bias=True)
            (LayerNorm): LayerNorm((300,), eps=1e

## Train

In [16]:
def get_predictions(model, dataloader, compute_acc):
    """Run ``model`` over ``dataloader`` and collect the predicted class indices.

    The model is switched to evaluation mode for the duration of the call (so
    dropout is disabled) and its previous train/eval mode is restored
    afterwards. Batches are moved to the device the model's parameters live
    on, or to the CPU when the model has no parameters.

    Args:
        model: Module called as ``model(word_embs, masks)`` and returning
            per-class scores of shape ``(batch, num_classes)``.
        dataloader: Iterable of batches; the tensor entries of each batch must
            be ``(word_embs, masks, labels)`` in that order.
        compute_acc: When true, also return the accuracy against ``labels``.

    Returns:
        ``predictions`` (1-D tensor of class indices, or ``None`` if the
        dataloader yielded nothing), or ``(predictions, acc)`` when
        ``compute_acc`` is true. ``acc`` is ``0.0`` for an empty dataloader.
    """
    try:
        target_device = next(model.parameters()).device
    except StopIteration:
        target_device = torch.device("cpu")

    batch_preds = []
    correct = 0
    total = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Iterate over the whole dataset.
            for data in dataloader:
                word_embs, masks, labels = [t.to(target_device) for t in data if torch.is_tensor(t)]

                intent_cls = model(word_embs, masks)

                # pred: index of the highest-scoring class per example
                _, pred = torch.max(intent_cls, 1)

                # Accumulate the counts used for the classification accuracy.
                if compute_acc:
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                # Keep this batch's predictions.
                batch_preds.append(pred)
    finally:
        # Put the model back in the mode the caller had it in.
        model.train(was_training)

    predictions = torch.cat(batch_preds) if batch_preds else None

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions

In [17]:
# Wrap the preprocessed examples in datasets and batch them with the padding collate function.
trainSet = intent_Dataset("train", data_train)
# Training batches are reshuffled on every pass.
trainLoader = DataLoader(trainSet, batch_size=TRAIN_BATCH_SIZE, collate_fn=minibatch, shuffle=True)
testSet = intent_Dataset("test", data_test)
# The test loader keeps the dataset order (no shuffle argument is passed).
testLoader = DataLoader(testSet, batch_size=TEST_BATCH_SIZE, collate_fn=minibatch)

In [18]:
"""tensorboard logger"""
# The run directory is namespaced by dataset; the last path component names the experiment variant.
# NOTE(review): every writer.add_scalar call in the training cell is commented out, so nothing is logged yet.
writer = SummaryWriter(f"runs/{DATASET}/transCapsule/no_primary&margin_loss")

In [None]:
# Train for EPOCHS epochs; after each epoch report the summed loss and the train/test accuracy.
train_from = 0
# EPOCHS = 20
model = model.to(device)
timestamp(f"start training {MODEL_PATH} from epoch {train_from+1} to {EPOCHS}")
for epoch in range(train_from, EPOCHS):
    # Re-enter training mode every epoch, because evaluation below switches to eval mode.
    model.train()
    running_loss = 0.0
    for data in trainLoader:
        word_embs, masks, labels = [t.to(device) for t in data]

        # Reset the parameter gradients.
        optimizer.zero_grad()

        # forward pass
        outputs = model(word_embs, masks)

        # Margin loss expects one-hot float targets of shape (batch, num_labels).
        labels_onehot = F.one_hot(labels, num_classes=num_labels).float()

        loss = loss_func(outputs, labels_onehot)

        # backward
        loss.backward()
        optimizer.step()

        # Accumulate this batch's loss.
        running_loss += loss.item()

    timestamp(f"[epoch {epoch+1}] loss: {running_loss:.3f}")
#     writer.add_scalar('Loss/cls', running_loss, epoch)

    # Evaluate with dropout disabled so the reported accuracies are deterministic.
    model.eval()
    _, acc = get_predictions(model, trainLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] training acc: {acc:.6f}")
#     writer.add_scalar('Acc/train', acc, epoch)

    _, acc = get_predictions(model, testLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] testing acc: {acc:.6f}")
#     writer.add_scalar('Acc/test', acc, epoch)

2021-05-05 16:30:34	start training model/transCapsule from epoch 1 to 50
2021-05-05 16:30:48	[epoch 1] loss: 3024.156
[epoch 1] training acc: 0.016378
[epoch 1] testing acc: 0.017065
2021-05-05 16:31:11	[epoch 2] loss: 3024.147
[epoch 2] training acc: 0.051006
[epoch 2] testing acc: 0.046075
2021-05-05 16:31:35	[epoch 3] loss: 3024.142
[epoch 3] training acc: 0.015676
[epoch 3] testing acc: 0.022184
2021-05-05 16:31:58	[epoch 4] loss: 3024.134
[epoch 4] training acc: 0.017080
[epoch 4] testing acc: 0.015358
2021-05-05 16:32:21	[epoch 5] loss: 3024.130
[epoch 5] training acc: 0.014272
[epoch 5] testing acc: 0.017065
2021-05-05 16:32:45	[epoch 6] loss: 3024.123
[epoch 6] training acc: 0.015442
[epoch 6] testing acc: 0.015358
2021-05-05 16:33:08	[epoch 7] loss: 3024.118
[epoch 7] training acc: 0.014740
[epoch 7] testing acc: 0.017065
2021-05-05 16:33:32	[epoch 8] loss: 3024.119
[epoch 8] training acc: 0.013804
[epoch 8] testing acc: 0.017065
2021-05-05 16:33:55	[epoch 9] loss: 3024.108
[e

---

In [1]:
# Demo of variable sharing: request a variable named 'x' twice inside the scope 'scope'.
# NOTE(review): tf.variable_scope / tf.get_variable look like the TF1-style API — confirm the installed version provides them.
import tensorflow as tf
with tf.variable_scope('scope'):
    v1 = tf.get_variable('x', [1])
#     v1 = tf.Variable(0, name="x")
# Second request, this time with reuse=True on the same scope name.
with tf.variable_scope('scope', reuse=True):
    v2 = tf.get_variable('x', [1])
#     v2 = tf.Variable(1, name="x")

# The recorded cell output shows the same name, scope/x:0, for both.
print(v1.name, v2.name)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
scope/x:0 scope/x:0


In [6]:
class a():
    """Callable accumulator: each call adds its argument to the stored value."""

    def __init__(self, in_):
        # Starting value of the running total.
        self.a = in_

    def __call__(self, in_):
        """Add ``in_`` to the stored value (in place) and return the updated value."""
        current = self.a
        current += in_
        self.a = current
        return current

    def __str__(self):
        """String form of the stored value."""
        return str(self.a)

print(a(5)(7))

12
