In [1]:
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizerFast, BertConfig, BertForSequenceClassification, AdamW
from tqdm import tqdm

## Hyper parameter

In [2]:
tokenizer_LM = "../bert-base-chinese"
LM = "../ckiplab/bert-base-chinese"
# LM = "../hfl/chinese-bert-wwm"
# LM = "../hfl/rbtl3"
# LM = "../bert-base-chinese"
BATCH_SIZE = 8
MODEL_PATH = "model/base/bert_base_chinese" # svae/load model name/path
EPOCHS = 5
DATASET = "IVR"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda:0


## Utility Function

In [3]:
from datetime import datetime,timezone,timedelta
def timestamp(msg=""):
    dt1 = datetime.utcnow().replace(tzinfo=timezone.utc)
    dt2 = dt1.astimezone(timezone(timedelta(hours=8))) # 轉換時區 -> 東八區
    print(str(dt2)[:-13] + '\t' + msg)

## Data Preprocess

In [4]:
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_LM)

In [5]:
def get_bert_data(mode, file_path):
    assert mode in ["train", "test", "dev"]
    df = pd.read_csv(file_path, sep='\t')
    src_labels = sorted(set(df.labels.tolist()))
    num_labels = len(src_labels)
    df["labels"] = [src_labels.index(l) for l in df.labels.tolist()]
    bert_data = []
    texts = df["texts"]

    labels = df["labels"]
    for i, t in enumerate(texts):
        label = labels[i]
        bert_dict = {"label": label, "src_texts": t, "src_label": src_labels[label]}
        bert_dict.update(
            tokenizer(t, 
                      max_length=128,
                      padding='max_length',
                      return_token_type_ids=True,
                      truncation=True,
                      ))
        bert_data += [bert_dict]
    torch.save(bert_data, f"bert_data/{mode}.pt")
    if mode == "train":
        return bert_data, num_labels #List[Dict[List]] = List[tokenizer output]
    else:
        return bert_data

In [6]:
"""training data"""
bert_train, num_labels = get_bert_data("train", f"data/{DATASET}/train.tsv")
print(len(bert_train), num_labels)

2144 63


In [None]:
"""devlopment data"""
bert_dev = get_bert_data("dev", f"data/{DATASET}/valid.tsv")
len(bert_dev)

In [8]:
"""testing data"""
bert_test = get_bert_data("test", f"data/{DATASET}/test.tsv")
len(bert_test)

548

In [9]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
class intent_Dataset(Dataset):
    def __init__(self, mode, list_of_bert):
        assert mode in ["train", "test", "dev"]
        self.mode = mode
        self.data = list_of_bert
    def __getitem__(self, idx):
        input_ids = torch.tensor(self.data[idx]["input_ids"])
        seg_emb = torch.tensor(self.data[idx]["token_type_ids"])
        att_emb = torch.tensor(self.data[idx]["attention_mask"])
#         if self.mode == "train" or self.mode == "dev":
        label = torch.tensor(self.data[idx]["label"])
        return input_ids, seg_emb, att_emb, label
#         else:
#             return input_ids, seg_emb, att_emb
    def __len__(self):
        return len(self.data)

## Model

In [10]:
model = BertForSequenceClassification.from_pretrained(LM, num_labels=num_labels, return_dict=True)
optimizer = AdamW(model.parameters(), lr=3e-5) # AdamW = BertAdam

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../ckiplab/bert-base-chinese and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# high-level 顯示此模型裡的 modules
total_params = sum(p.numel() for p in model.parameters())
print(f"total params: {total_params}")
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
#             print(_)
    else:
        print("{:15} {}".format(name, module))

total params: 102316095

name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=63, bias=True)


In [12]:
trainSet = intent_Dataset("train", bert_train)
trainLoader = DataLoader(trainSet, batch_size=BATCH_SIZE, shuffle=True)
# devSet = intent_Dataset("dev", bert_dev)
# devLoader = DataLoader(devSet, batch_size=BATCH_SIZE*2)
testSet = intent_Dataset("test", bert_test)
testLoader = DataLoader(testSet, batch_size=BATCH_SIZE*2)

## Train & validate

In [13]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in tqdm(dataloader):
            tokens_tensors, segments_tensors, masks_tensors,\
            labels = [t.to(device) for t in data if t is not None]
            
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs.logits
            _, pred = torch.max(logits.data, 1) # _: logits最大數值; pred: 最大數值的 index
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions

In [14]:
train_from = 0
if MODEL_PATH.find(".pt") != -1:
    model.load_state_dict(torch.load(MODEL_PATH))
    p = MODEL_PATH.rfind('_')
    train_from = int(MODEL_PATH[p+1 : -3])
    MODEL_PATH = MODEL_PATH[: p-2]
model = model.to(device)
model.train()

timestamp(f"start training {MODEL_PATH} from epoch {train_from+1} to {EPOCHS}")
for epoch in range(train_from, EPOCHS):
    running_loss = 0.0
    for data in tqdm(trainLoader):
        tokens_tensors, segments_tensors, masks_tensors, \
        labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids = tokens_tensors, 
                        token_type_ids = segments_tensors, 
                        attention_mask = masks_tensors,
                        labels = labels)
        
        loss = outputs.loss
        # backward
        loss.backward()
        optimizer.step()

        # 紀錄當前 batch loss
        running_loss += loss.item()

#     torch.save(model.state_dict(), F"{MODEL_PATH}_E_{str(epoch+1)}.pt")
    timestamp(f"[epoch {epoch+1}] loss: {running_loss:.3f}")
    
    _, acc = get_predictions(model, trainLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] training acc: {acc:.6f}")
#     _, acc = get_predictions(model, devLoader, compute_acc=True)
#     print(f"[epoch {epoch+1}] validation acc: {acc:.6f}")
    _, acc = get_predictions(model, testLoader, compute_acc=True)
    print(f"[epoch {epoch+1}] testing acc: {acc:.6f}")

  0%|          | 1/268 [00:00<00:39,  6.78it/s]

2021-04-20 14:48:20	start training model/base/bert_base_chinese from epoch 1 to 5


100%|██████████| 268/268 [00:31<00:00,  8.63it/s]
  1%|          | 2/268 [00:00<00:14, 18.36it/s]

2021-04-20 14:48:51	[epoch 1] loss: 737.685


100%|██████████| 268/268 [00:09<00:00, 28.85it/s]
  6%|▌         | 2/35 [00:00<00:01, 16.94it/s]

[epoch 1] training acc: 0.765392


100%|██████████| 35/35 [00:02<00:00, 16.96it/s]
  0%|          | 1/268 [00:00<00:31,  8.53it/s]

[epoch 1] testing acc: 0.718978


100%|██████████| 268/268 [00:31<00:00,  8.57it/s]
  1%|          | 3/268 [00:00<00:09, 28.92it/s]

2021-04-20 14:49:34	[epoch 2] loss: 290.483


100%|██████████| 268/268 [00:09<00:00, 28.80it/s]
  6%|▌         | 2/35 [00:00<00:01, 16.66it/s]

[epoch 2] training acc: 0.896922


100%|██████████| 35/35 [00:02<00:00, 16.90it/s]
  0%|          | 1/268 [00:00<00:31,  8.52it/s]

[epoch 2] testing acc: 0.830292


100%|██████████| 268/268 [00:31<00:00,  8.55it/s]
  1%|          | 3/268 [00:00<00:09, 28.94it/s]

2021-04-20 14:50:17	[epoch 3] loss: 141.237


100%|██████████| 268/268 [00:09<00:00, 28.76it/s]
  6%|▌         | 2/35 [00:00<00:01, 16.59it/s]

[epoch 3] training acc: 0.943097


 34%|███▍      | 12/35 [00:00<00:01, 15.23it/s]


KeyboardInterrupt: 

## Test

In [None]:
testSet = intent_Dataset("test", bert_test)
testLoader = DataLoader(testSet, batch_size=BATCH_SIZE*2)

In [None]:
for e in range(1,2):
    model.load_state_dict(torch.load(f"{MODEL_PATH}_E_{e}.pt"))
    model.eval()
    model = model.to(device)
    _, acc = get_predictions(model, testLoader, compute_acc=True)
    print(f"[epoch {e}] testing acc: {acc:.6f}")

In [37]:
input = torch.randn(2, 5, 7)
# With Learnable Parameters
m = nn.LayerNorm(input.size()[1:])
# Without Learnable Parameters
# m = nn.LayerNorm(input.size()[1:], elementwise_affine=False)
# Normalize over last two dimensions
# m = nn.LayerNorm([10, 10])
# Normalize over last dimension of size 10
# m = nn.LayerNorm(7)
# Activating the module
output = m(input)
output.shape

torch.Size([2, 5, 7])

In [36]:
input.size()[1:]

torch.Size([5, 7])

In [17]:
for data in trainLoader:
    print(data[0].shape)
    break

torch.Size([8, 128])
