In [None]:
!pip install torch
!pip install transformers
!pip install tqdm
# from IPython.display import clear_output
# clear_output()

In [1]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import AdamW
from sentence_transformers import models, InputExample, SentenceTransformer
from tqdm import tqdm
from sklearn.manifold import TSNE
import seaborn as sns
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

## Hyper parameter

In [2]:
tokenizer_LM = "../bert-base-chinese"
NO = 1
LM_SET = ["../bert-base-chinese", "../ckiplab/bert-base-chinese", "../hfl/chinese-bert-wwm", "../hfl/rbtl3"]
MODEL_NAME_SET = ["base", "ckip", "wwm", "rbtl3"]
LM = LM_SET[NO]
MODEL_NAME = f"sbert_cls_{MODEL_NAME_SET[NO]}"

DATASET = "SMP2018"
TRAIN_BATCH_SIZE = 16
TEST_BATCH_SIZE = 64
MODEL_PATH = f"model/sbert_cls_{DATASET}/{MODEL_NAME}" # svae/load model name/path
EPOCHS = 10
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# device="cpu"
print("device:", device)

device: cuda:0


## Utility Function

In [3]:
from datetime import datetime,timezone,timedelta
def timestamp(msg=""):
    dt1 = datetime.utcnow().replace(tzinfo=timezone.utc)
    dt2 = dt1.astimezone(timezone(timedelta(hours=8))) # 轉換時區 -> 東八區
    print(str(dt2)[:-13] + '\t' + msg)
    return (str(dt2)[:-13] + '\t' + msg)

In [4]:
# high-level 顯示此模型裡的 modules
def model_info(model):
#     print(model.device)
    print("""
    name            module
    ----------------------""")
    for name, module in model.named_children():
        if name == "bert" or name=="0":
            for n, _ in module.named_children():
                print(f"{name}:{n}")
    #             print(_)
        else:
            print("{:15} {}".format(name, module))

In [5]:
class dotdict(dict):
    """dot.notation access to dictionary attributes"""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__

In [6]:
def emb_visualize(bert_model):
    df = pd.read_csv(f"data/{DATASET}/train.tsv", sep='\t')
    # assert len(dataloader.dataset) == df.shape[0]
    df = df.sort_values(by="labels", ignore_index=True)
    # _, emb = get_predictions(model, trainLoader, compute_acc=False, get_emb=True)
    emb = bert_model.encode(df["texts"].tolist(), convert_to_numpy=True)
    low_dim_emb = TSNE(n_components=2, perplexity=30).fit_transform(emb)

    df["f1"] = low_dim_emb[:, 0]
    df["f2"] = low_dim_emb[:, 1]
    emb_fig = sns.relplot(
        data=df, x="f1", y="f2",
        hue="labels", alpha=0.7,
        kind="scatter")
    emb_by_cls_fig = sns.relplot(
        data=df, x="f1", y="f2",
        col="labels", col_wrap=4,
        kind="scatter")
    return emb_fig, emb_by_cls_fig

## Data Preprocess

In [7]:
def get_bert_data(mode, file_path):
    assert mode in ["train", "test", "dev"]
    df = pd.read_csv(file_path, sep='\t')
    df = df.sort_values(by=["labels"], ignore_index=True)
    src_labels = sorted(set(df.labels.tolist()))
    num_labels = len(src_labels)
    df["labels"] = [src_labels.index(l) for l in df.labels.tolist()]

    bert_data = []
    texts = df["texts"]
    labels = df["labels"]
    for i, t in enumerate(texts):
        label = labels[i]
        bert_dict = {"texts":t, "label":label}
        bert_data += [bert_dict]
        
    if mode == "train":
        return bert_data, num_labels #List[Dict[List]] = List[tokenizer output]
    else:
        return bert_data

In [8]:
"""training data"""
bert_train, num_labels = get_bert_data("train", f"data/{DATASET}/train.tsv")
print(len(bert_train), num_labels)

2299 31


In [9]:
"""testing data"""
bert_test = get_bert_data("test", f"data/{DATASET}/test.tsv")
len(bert_test)

770

In [10]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class intent_Dataset(Dataset):
    def __init__(self, list_of_bert):
        self.data = list_of_bert
    def __getitem__(self, idx):
        text = self.data[idx]["texts"]
        label = self.data[idx]["label"]
        return text, label
    def __len__(self):
        return len(self.data)

## Model

In [11]:
class intent_classifier(nn.Module):
    def __init__(self, LM, num_labels):
        super().__init__()
        bert = models.Transformer(LM, max_seq_length=128)
        hidden_size = bert.get_word_embedding_dimension()
        pooler = models.Pooling(hidden_size)
        self.bert_model = SentenceTransformer(modules=[bert, pooler])
        self.drop_out = nn.Dropout(0.1)
        self.cls = nn.Linear(hidden_size, num_labels)
        
    def forward(self, 
                src_texts):
        utterance_embedding = self.bert_model.encode(src_texts, convert_to_tensor=True)
        intent_cls = self.drop_out(utterance_embedding)
        intent_cls = self.cls(intent_cls)
        
        return dotdict(
            utterance_emb=utterance_embedding, # batch_size * encoder_hidden_size
            intent_cls=intent_cls) # batch_size * num_labels

In [12]:
model = intent_classifier(LM, num_labels=num_labels)
optimizer = AdamW(model.parameters(), lr=3e-5) # AdamW = BertAdam

In [13]:
model_info(model)


    name            module
    ----------------------
bert_model      SentenceTransformer(
  (0): Transformer(
    (auto_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21128, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
              

In [14]:
trainSet = intent_Dataset(bert_train)
trainLoader = DataLoader(trainSet, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
testSet = intent_Dataset(bert_test)
testLoader = DataLoader(testSet, batch_size=TEST_BATCH_SIZE)

## Train & validate

In [15]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in tqdm(dataloader):
            texts, labels = [t for t in data]
            
            outputs = model(texts)
            
            logits = outputs.intent_cls
            _, pred = torch.max(logits.data, 1) # _: logits最大數值; pred: 最大數值的 index
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                total += labels.size(0)
                correct += (pred == labels.to(device)).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

In [16]:
"""tensorboard logger"""
writer = SummaryWriter(f"runs/{DATASET}/{MODEL_NAME}/E_{EPOCHS}")

In [17]:
train_from = 0
if MODEL_PATH.find(".pt") != -1:
    model.load_state_dict(torch.load(MODEL_PATH))
    p = MODEL_PATH.rfind('_')
    train_from = int(MODEL_PATH[p+1 : -3])
    MODEL_PATH = MODEL_PATH[: p-2]
model = model.to(device)
model.train()

timestamp(f"start training {MODEL_PATH} from epoch {train_from+1} to {EPOCHS}")
for epoch in range(train_from, EPOCHS):
    running_loss = 0.0
    for data in tqdm(trainLoader):
        texts, labels = [t for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(texts)
        
        loss_func = nn.CrossEntropyLoss()
        loss = loss_func(outputs.intent_cls, labels.to(device))
        # backward
        loss.backward()
        optimizer.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()

#     torch.save(model.state_dict(), F"{MODEL_PATH}_E_{str(epoch+1)}.pt")
    timestamp(f"[epoch {epoch+1}] loss: {running_loss:.3f}")
    writer.add_scalar('Loss/cls', running_loss, epoch)

    _, acc = get_predictions(model, trainLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] training acc: {acc:.6f}")
    writer.add_scalar('Acc/train', acc, epoch)

    _, acc = get_predictions(model, testLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] testing acc: {acc:.6f}")
    writer.add_scalar('Acc/test', acc, epoch)

  3%|▎         | 5/144 [00:00<00:02, 49.86it/s]

2021-03-30 13:59:50	start training model/sbert_cls_SMP2018/sbert_cls_ckip from epoch 1 to 10


100%|██████████| 144/144 [00:02<00:00, 58.19it/s]
  5%|▍         | 7/144 [00:00<00:02, 61.76it/s]

2021-03-30 13:59:52	[epoch 1] loss: 475.414


100%|██████████| 144/144 [00:02<00:00, 62.87it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.49it/s]

[epoch 1] training acc: 0.193562


100%|██████████| 13/13 [00:00<00:00, 24.43it/s]
  4%|▍         | 6/144 [00:00<00:02, 54.67it/s]

[epoch 1] testing acc: 0.198701


100%|██████████| 144/144 [00:02<00:00, 59.62it/s]
  5%|▍         | 7/144 [00:00<00:02, 61.44it/s]

2021-03-30 13:59:58	[epoch 2] loss: 430.498


100%|██████████| 144/144 [00:02<00:00, 62.42it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.40it/s]

[epoch 2] training acc: 0.277947


100%|██████████| 13/13 [00:00<00:00, 24.40it/s]
  4%|▍         | 6/144 [00:00<00:02, 56.20it/s]

[epoch 2] testing acc: 0.280519


100%|██████████| 144/144 [00:02<00:00, 58.56it/s]
  5%|▍         | 7/144 [00:00<00:02, 61.82it/s]

2021-03-30 14:00:03	[epoch 3] loss: 395.930


100%|██████████| 144/144 [00:02<00:00, 63.22it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.48it/s]

[epoch 3] training acc: 0.313180


100%|██████████| 13/13 [00:00<00:00, 24.38it/s]
  5%|▍         | 7/144 [00:00<00:02, 61.60it/s]

[epoch 3] testing acc: 0.307792


100%|██████████| 144/144 [00:02<00:00, 59.02it/s]
  5%|▍         | 7/144 [00:00<00:02, 60.61it/s]

2021-03-30 14:00:08	[epoch 4] loss: 370.041


100%|██████████| 144/144 [00:02<00:00, 62.44it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.55it/s]

[epoch 4] training acc: 0.343193


100%|██████████| 13/13 [00:00<00:00, 24.47it/s]
  4%|▍         | 6/144 [00:00<00:02, 53.10it/s]

[epoch 4] testing acc: 0.331169


100%|██████████| 144/144 [00:02<00:00, 59.76it/s]
  5%|▍         | 7/144 [00:00<00:02, 62.62it/s]

2021-03-30 14:00:13	[epoch 5] loss: 348.784


100%|██████████| 144/144 [00:02<00:00, 62.37it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.41it/s]

[epoch 5] training acc: 0.373641


100%|██████████| 13/13 [00:00<00:00, 23.69it/s]
  4%|▍         | 6/144 [00:00<00:02, 56.88it/s]

[epoch 5] testing acc: 0.372727


100%|██████████| 144/144 [00:02<00:00, 59.54it/s]
  5%|▍         | 7/144 [00:00<00:02, 62.81it/s]

2021-03-30 14:00:19	[epoch 6] loss: 329.999


100%|██████████| 144/144 [00:02<00:00, 62.08it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.33it/s]

[epoch 6] training acc: 0.414093


100%|██████████| 13/13 [00:00<00:00, 24.29it/s]
  4%|▍         | 6/144 [00:00<00:02, 57.27it/s]

[epoch 6] testing acc: 0.420779


100%|██████████| 144/144 [00:02<00:00, 60.29it/s]
  5%|▍         | 7/144 [00:00<00:01, 68.94it/s]

2021-03-30 14:00:24	[epoch 7] loss: 311.404


100%|██████████| 144/144 [00:02<00:00, 62.69it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.44it/s]

[epoch 7] training acc: 0.461940


100%|██████████| 13/13 [00:00<00:00, 24.40it/s]
  5%|▍         | 7/144 [00:00<00:02, 65.94it/s]

[epoch 7] testing acc: 0.445455


100%|██████████| 144/144 [00:02<00:00, 59.49it/s]
  5%|▍         | 7/144 [00:00<00:02, 62.41it/s]

2021-03-30 14:00:29	[epoch 8] loss: 295.965


100%|██████████| 144/144 [00:02<00:00, 62.53it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.52it/s]

[epoch 8] training acc: 0.501522


100%|██████████| 13/13 [00:00<00:00, 24.09it/s]
  4%|▍         | 6/144 [00:00<00:02, 53.20it/s]

[epoch 8] testing acc: 0.487013


100%|██████████| 144/144 [00:02<00:00, 59.90it/s]
  5%|▍         | 7/144 [00:00<00:02, 63.09it/s]

2021-03-30 14:00:34	[epoch 9] loss: 281.019


100%|██████████| 144/144 [00:02<00:00, 62.65it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.40it/s]

[epoch 9] training acc: 0.529361


100%|██████████| 13/13 [00:00<00:00, 24.25it/s]
  4%|▍         | 6/144 [00:00<00:02, 59.54it/s]

[epoch 9] testing acc: 0.514286


100%|██████████| 144/144 [00:02<00:00, 59.89it/s]
  5%|▍         | 7/144 [00:00<00:01, 69.05it/s]

2021-03-30 14:00:40	[epoch 10] loss: 267.970


100%|██████████| 144/144 [00:02<00:00, 62.63it/s]
 15%|█▌        | 2/13 [00:00<00:00, 17.14it/s]

[epoch 10] training acc: 0.560244


100%|██████████| 13/13 [00:00<00:00, 24.17it/s]

[epoch 10] testing acc: 0.541558





## Test

In [None]:
testSet = intent_Dataset("test", bert_test)
testLoader = DataLoader(testSet, batch_size=BATCH_SIZE*2)

In [None]:
for e in range(1,2):
    model.load_state_dict(torch.load(f"{MODEL_PATH}_E_{e}.pt"))
    model.eval()
    model = model.to(device)
    _, acc = get_predictions(model, testLoader, compute_acc=True)
    print(f"[epoch {e}] testing acc: {acc:.6f}")

In [37]:
input = torch.randn(2, 5, 7)
# With Learnable Parameters
m = nn.LayerNorm(input.size()[1:])
# Without Learnable Parameters
# m = nn.LayerNorm(input.size()[1:], elementwise_affine=False)
# Normalize over last two dimensions
# m = nn.LayerNorm([10, 10])
# Normalize over last dimension of size 10
# m = nn.LayerNorm(7)
# Activating the module
output = m(input)
output.shape

torch.Size([2, 5, 7])

In [36]:
input.size()[1:]

torch.Size([5, 7])