In [None]:
!pip install torch
!pip install transformers
!pip install tqdm
# from IPython.display import clear_output
# clear_output()

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizerFast, BertModel
from tqdm import tqdm
from torch.utils.tensorboard import SummaryWriter

## Hyper parameter

In [11]:
tokenizer_LM = "../bert-base-chinese"
NO = 1
LM_SET = ["../bert-base-chinese", "../ckiplab/bert-base-chinese", "../hfl/chinese-bert-wwm", "../hfl/rbtl3"]
MODEL_NAME_SET = ["base", "ckip", "wwm", "rbtl3"]
LM = LM_SET[NO]
MODEL_NAME = MODEL_NAME_SET[NO] + '_mlp'
DATASET = "SMP2018"
BATCH_SIZE = 16
MODEL_PATH = f"model/{DATASET}/{MODEL_NAME}" # svae/load model name/path
EPOCHS = 50
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda:0


## Utility Function

In [3]:
from datetime import datetime,timezone,timedelta
def timestamp(msg=""):
    dt1 = datetime.utcnow().replace(tzinfo=timezone.utc)
    dt2 = dt1.astimezone(timezone(timedelta(hours=8))) # 轉換時區 -> 東八區
    print(str(dt2)[:-13] + '\t' + msg)

In [4]:
# high-level 顯示此模型裡的 modules
def model_info(model):
#     print(model.device)
    print("""
    name            module
    ----------------------""")
    for name, module in model.named_children():
        if name == "bert" or name=="0":
            for n, _ in module.named_children():
                print(f"{name}:{n}")
    #             print(_)
        else:
            print("{:15} {}".format(name, module))

## Data Preprocess

In [5]:
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_LM)

In [6]:
def get_bert_data(mode, file_path):
    assert mode in ["train", "test", "dev"]
    df = pd.read_csv(file_path, sep='\t')
    src_labels = sorted(set(df.labels.tolist()))
    num_labels = len(src_labels)
    df["labels"] = [src_labels.index(l) for l in df.labels.tolist()]
    bert_data = []
    texts = df["texts"]

    labels = df["labels"]
    for i, t in enumerate(texts):
        label = labels[i]
        bert_dict = {"label": label, "src_texts": t, "src_label": src_labels[label]}
        bert_dict.update(
            tokenizer(t, 
                      max_length=128,
                      padding='max_length',
                      return_token_type_ids=True,
                      truncation=True,
                      ))
        bert_data += [bert_dict]
    torch.save(bert_data, f"bert_data/{mode}.pt")
    if mode == "train":
        return bert_data, num_labels #List[Dict[List]] = List[tokenizer output]
    else:
        return bert_data

In [12]:
"""training data"""
bert_train, num_labels = get_bert_data("train", f"data/{DATASET}/train.tsv")
print(len(bert_train), num_labels)

2299 31


In [13]:
"""testing data"""
bert_test = get_bert_data("test", f"data/{DATASET}/test.tsv")
len(bert_test)

770

In [14]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class intent_Dataset(Dataset):
    def __init__(self, mode, list_of_bert):
        assert mode in ["train", "test", "dev"]
        self.mode = mode
        self.data = list_of_bert
    def __getitem__(self, idx):
        input_ids = torch.tensor(self.data[idx]["input_ids"])
        seg_emb = torch.tensor(self.data[idx]["token_type_ids"])
        att_emb = torch.tensor(self.data[idx]["attention_mask"])
#         if self.mode == "train" or self.mode == "dev":
        label = torch.tensor(self.data[idx]["label"])
        return input_ids, seg_emb, att_emb, label
#         else:
#             return input_ids, seg_emb, att_emb
    def __len__(self):
        return len(self.data)

## Model

In [15]:
class intent_classifier(nn.Module):
    def __init__(self, LM, num_labels):
        super().__init__()
        self.bert = BertModel.from_pretrained(LM, return_dict=True)
        self.mlp = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 512),
            nn.ReLU(),
            nn.Dropout(0.75),
            nn.Linear(512, num_labels),
            nn.Softmax(dim=1)
        )
    def forward(self, 
                input_ids=None,
                token_type_ids=None,
                attention_mask=None):
        bert_outputs = self.bert(input_ids, token_type_ids, attention_mask)
        cls_token = bert_outputs.last_hidden_state[:, 0, :]
        output = self.mlp(cls_token)
        return output

model = intent_classifier(LM, num_labels)
optimizer = torch.optim.SGD(model.mlp.parameters(), lr=0.7) # follow paper fix LM parameter

In [16]:
model_info(model)


    name            module
    ----------------------
bert:embeddings
bert:encoder
bert:pooler
mlp             Sequential(
  (0): Linear(in_features=768, out_features=512, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.75, inplace=False)
  (3): Linear(in_features=512, out_features=31, bias=True)
  (4): Softmax(dim=1)
)


In [17]:
trainSet = intent_Dataset("train", bert_train)
trainLoader = DataLoader(trainSet, batch_size=BATCH_SIZE, shuffle=True)
# devSet = intent_Dataset("dev", bert_dev)
# devLoader = DataLoader(devSet, batch_size=BATCH_SIZE*2)
testSet = intent_Dataset("test", bert_test)
testLoader = DataLoader(testSet, batch_size=BATCH_SIZE*2)

## Train & validate

In [18]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in (dataloader):
            tokens_tensors, segments_tensors, masks_tensors,\
            labels = [t.to(device) for t in data if t is not None]
            
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            _, pred = torch.max(outputs.data, 1) # _: logits最大數值; pred: 最大數值的 index
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

In [19]:
"""tensorboard logger"""
writer = SummaryWriter(f"runs/{DATASET}/{MODEL_NAME}/E_{EPOCHS}")

In [20]:
train_from = 0
if MODEL_PATH.find(".pt") != -1:
    model.load_state_dict(torch.load(MODEL_PATH))
    p = MODEL_PATH.rfind('_')
    train_from = int(MODEL_PATH[p+1 : -3])
    MODEL_PATH = MODEL_PATH[: p-2]
model = model.to(device)
model.train()

timestamp(f"start training {MODEL_PATH} from epoch {train_from+1} to {EPOCHS}")
for epoch in range(train_from, EPOCHS):
    running_loss = 0.0
    for data in (trainLoader):
        tokens_tensors, segments_tensors, masks_tensors, \
        labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids = tokens_tensors, 
                        token_type_ids = segments_tensors, 
                        attention_mask = masks_tensors)
        
        loss_func = nn.CrossEntropyLoss()
        loss = loss_func(outputs, labels)
        # backward
        loss.backward()
        optimizer.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()

#     torch.save(model.state_dict(), F"{MODEL_PATH}_E_{str(epoch+1)}.pt")
    timestamp(f"[epoch {epoch+1}] loss: {running_loss:.3f}")
    writer.add_scalar('Loss/cls', running_loss, epoch)
    
    _, acc = get_predictions(model, trainLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] training acc: {acc:.6f}")
    writer.add_scalar('Acc/train', acc, epoch)

#     _, acc = get_predictions(model, devLoader, compute_acc=True)
#     print(f"[epoch {epoch+1}] validation acc: {acc:.6f}")
    _, acc = get_predictions(model, testLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] testing acc: {acc:.6f}")
    writer.add_scalar('Acc/test', acc, epoch)


2021-04-01 10:56:09	start training model/SMP2018/ckip_mlp from epoch 1 to 50
2021-04-01 10:56:32	[epoch 1] loss: 474.320
[epoch 1] training acc: 0.197912
[epoch 1] testing acc: 0.200000
2021-04-01 10:57:07	[epoch 2] loss: 473.811
[epoch 2] training acc: 0.197912
[epoch 2] testing acc: 0.200000
2021-04-01 10:57:42	[epoch 3] loss: 473.775
[epoch 3] training acc: 0.197912
[epoch 3] testing acc: 0.200000
2021-04-01 10:58:17	[epoch 4] loss: 473.768
[epoch 4] training acc: 0.197912
[epoch 4] testing acc: 0.200000
2021-04-01 10:58:52	[epoch 5] loss: 473.741
[epoch 5] training acc: 0.197912
[epoch 5] testing acc: 0.200000
2021-04-01 10:59:27	[epoch 6] loss: 473.776
[epoch 6] training acc: 0.197912
[epoch 6] testing acc: 0.200000
2021-04-01 11:00:03	[epoch 7] loss: 473.768
[epoch 7] training acc: 0.197912
[epoch 7] testing acc: 0.200000
2021-04-01 11:00:38	[epoch 8] loss: 473.799
[epoch 8] training acc: 0.197912
[epoch 8] testing acc: 0.200000
2021-04-01 11:01:14	[epoch 9] loss: 473.795
[epoch 

## Test

In [None]:
testSet = intent_Dataset("test", bert_test)
testLoader = DataLoader(testSet, batch_size=BATCH_SIZE*2)

In [None]:
for e in range(1,2):
    model.load_state_dict(torch.load(f"{MODEL_PATH}_E_{e}.pt"))
    model.eval()
    model = model.to(device)
    _, acc = get_predictions(model, testLoader, compute_acc=True)
    print(f"[epoch {e}] testing acc: {acc:.6f}")

In [37]:
input = torch.randn(2, 5, 7)
# With Learnable Parameters
m = nn.LayerNorm(input.size()[1:])
# Without Learnable Parameters
# m = nn.LayerNorm(input.size()[1:], elementwise_affine=False)
# Normalize over last two dimensions
# m = nn.LayerNorm([10, 10])
# Normalize over last dimension of size 10
# m = nn.LayerNorm(7)
# Activating the module
output = m(input)
output.shape

torch.Size([2, 5, 7])

In [36]:
input.size()[1:]

torch.Size([5, 7])