<a href="https://colab.research.google.com/github/Huang-23/PRfinal_BERT/blob/main/nlp_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 準備原始文本數據

In [19]:
import os
import pandas as pd


df_train = pd.read_csv("/content/drive/MyDrive/nlp/nlp_BERT/train.csv")

df_train.head()
empty_title = ((df_train['title1_zh'].isnull())|df_train['title2_zh'].isnull())
df_train = df_train[~empty_title]

MAX_LENGTH = 30
df_train = df_train[~(df_train.title1_zh.apply(lambda x: len(x)) > MAX_LENGTH)]
df_train = df_train[~(df_train.title2_zh.apply(lambda x: len(x)) > MAX_LENGTH)]

len(df_train)

265695

In [20]:
# 從32萬筆訓練數據裡隨機抽取？%訓練
SAMPLE_FRAC = 0.03
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=927)
# 去除不必要的欄位並重新命名兩標題的欄位名
df_train = df_train.reset_index()
df_train = df_train.loc[:, ['title1_zh', 'title2_zh', 'label']]
df_train.columns = ['text_a', 'text_b', 'label']
df_train

Unnamed: 0,text_a,text_b,label
0,免费分享生活小技巧 教你用手机免费观看全网VIP视频,打开微信直接观看高清免费电视直播，不仅流畅，还不占内存,agreed
1,农村的这种树枝可以抑制白发生长，再也不用染发，和白发说再见,到60岁不用去染发，洗发水和它混合洗头，头上一根白发都找不到,agreed
2,网传海湖新区发生坠楼事件！真相来了...,辟谣丨西宁五四小学门口有男子给两个女孩吃药片？看警方回应！,unrelated
3,新密某游乐场打死人？造谣者昨日删除视频，现已被警方拘留,河北一网民造谣“警察打死人”，已被拘留！,unrelated
4,红枣千万不要这样吃，等于慢性自杀，千万要注意！,韭菜和它一起吃，等于慢性自杀，伤肠伤胃甚至食物中毒,unrelated
...,...,...,...
7966,猪身上有3个部位的肉，卖多便宜也不吃，别给自己惹出一身病,猪身上3个部位的肉，不管多便宜也别吃，省得给自己惹一身病,agreed
7967,它被誉为“高血压克星”，早晚来一杯，告别高血压气色棒,高血压的“克星”终于被发现了，每天一杯，血压平稳，胜过降压药,agreed
7968,新综艺破10亿播放量！与三男友同过520？张翰两字疑表白郑爽,吴昕潘玮柏，我们相爱吧香港版，内地未播片段大公开,unrelated
7969,一波绿色护眼保护视力壁纸,绿色背景真的能保护视力吗？,unrelated


In [21]:
# idempotence, 將處理結果另存成 tsv 供 PyTorch 使用
df_train.to_csv("/content/drive/MyDrive/nlp/nlp_BERT/train.tsv", sep="\t", index=False)
#df_train.to_csv("train_test.csv", index=False)

print("訓練樣本數：", len(df_train))
df_train.head()

訓練樣本數： 7971


Unnamed: 0,text_a,text_b,label
0,免费分享生活小技巧 教你用手机免费观看全网VIP视频,打开微信直接观看高清免费电视直播，不仅流畅，还不占内存,agreed
1,农村的这种树枝可以抑制白发生长，再也不用染发，和白发说再见,到60岁不用去染发，洗发水和它混合洗头，头上一根白发都找不到,agreed
2,网传海湖新区发生坠楼事件！真相来了...,辟谣丨西宁五四小学门口有男子给两个女孩吃药片？看警方回应！,unrelated
3,新密某游乐场打死人？造谣者昨日删除视频，现已被警方拘留,河北一网民造谣“警察打死人”，已被拘留！,unrelated
4,红枣千万不要这样吃，等于慢性自杀，千万要注意！,韭菜和它一起吃，等于慢性自杀，伤肠伤胃甚至食物中毒,unrelated


In [22]:
!pip install torch



In [23]:
!pip install transformers tqdm boto3 requests regex -q

In [24]:
import torch
from transformers import BertTokenizer
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-chinese"  # 指定繁簡中文 BERT-BASE 預訓練模型

# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch 版本：", torch.__version__)

PyTorch 版本： 1.10.0+cu111


# 將原始文本轉換成 BERT 相容的輸入格式

In [25]:
from torch.utils.data import Dataset
 
    
class FakeNewsDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]  # 一般訓練你會需要 dev set
        self.mode = mode
        # 大數據你會需要用 iterator=True
        self.df = pd.read_csv("/content/drive/MyDrive/nlp/nlp_BERT/"+mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            text_a, text_b = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            text_a, text_b, label = self.df.iloc[idx, :].values
            # 將 label 文字也轉換成索引方便轉換成 tensor
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        
        # 第二個句子的 BERT tokens
        tokens_b = self.tokenizer.tokenize(text_b)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, 
                                        dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞
trainset = FakeNewsDataset("train", tokenizer=tokenizer)

In [26]:
# 選擇第一個樣本
sample_idx = 0

# 將原始文本拿出做比較
text_a, text_b, label = trainset.df.iloc[sample_idx].values

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

# 渲染前後差異，毫無反應就是個 print。可以直接看輸出結果
print(f"""[原始文本]
句子 1：{text_a}
句子 2：{text_b}
分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")

[原始文本]
句子 1：免费分享生活小技巧 教你用手机免费观看全网VIP视频
句子 2：打开微信直接观看高清免费电视直播，不仅流畅，还不占内存
分類  ：agreed

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([ 101, 1048, 6589, 1146,  775, 4495, 3833, 2207, 2825, 2341, 3136,  872,
        4500, 2797, 3322, 1048, 6589, 6225, 4692, 1059, 5381,  100, 6228, 7574,
         102, 2802, 2458, 2544,  928, 4684, 2970, 6225, 4692, 7770, 3926, 1048,
        6589, 4510, 6228, 4684, 3064, 8024,  679,  788, 3837, 4517, 8024, 6820,
         679, 1304, 1079, 2100,  102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1])

label_tensor   ：0

--------------------

[還原 tokens_tensors]
[CLS]免费分享生活小技巧教你用手机免费观看全网[UNK]视频[SEP]打开微信直接观看高清免费电视直播，不仅流畅，还不占内存[SEP]



In [27]:
!pip install pysnooper -q

In [28]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 吃我們上面定義的 `FakeNewsDataset`，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# 初始化一個每次回傳 64 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 64
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [29]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([64, 62]) 
tensor([[ 101, 1048, 6589,  ...,    0,    0,    0],
        [ 101, 1093, 3333,  ..., 1168,  102,    0],
        [ 101, 5381,  837,  ...,    0,    0,    0],
        ...,
        [ 101, 5741, 3362,  ...,    0,    0,    0],
        [ 101, 3805, 2336,  ...,    0,    0,    0],
        [ 101, 5811, 7440,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([64, 62])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([64, 62])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
label_ids.shape        

# 在 BERT 之上加入新 layer 成下游任務模型

In [30]:
# 載入一個可以做中文多分類任務的模型，n_class = 3
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 3

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=3, bias=True)


In [31]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cuda:0
classification acc: 0.4960481746330448


In [32]:
def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]
     
model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)

print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")


整個分類模型的參數量：102269955
線性分類器的參數量：2307



# 訓練該下游任務模型

In [33]:

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 6
for epoch in range(EPOCHS):
    
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

[epoch 1] loss: 66.867, acc: 0.855
[epoch 2] loss: 42.441, acc: 0.899
[epoch 3] loss: 29.484, acc: 0.931
[epoch 4] loss: 23.948, acc: 0.953
[epoch 5] loss: 18.962, acc: 0.929
[epoch 6] loss: 12.402, acc: 0.974


# 對新樣本做推論

In [35]:
# 建立測試集。這邊我們可以用跟訓練時不同的 batch_size，看你 GPU 多大
testset = FakeNewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, 
                        collate_fn=create_mini_batch)

# 用分類模型預測測試集
predictions = get_predictions(model, testloader)

# 用來將預測的 label id 轉回 label 文字
index_map = {v: k for k, v in testset.label_map.items()}

# 生成 Kaggle 繳交檔案
df = pd.DataFrame({"Category": predictions.tolist()})
df['Category'] = df.Category.apply(lambda x: index_map[x])
df_pred = pd.concat([testset.df.loc[:, ["Id"]], 
                          df.loc[:, 'Category']], axis=1)
df_pred.to_csv('/content/drive/MyDrive/nlp/nlp_BERT/submission.csv', index=False)
df_pred.head()

Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,unrelated
3,321193,unrelated
4,321191,unrelated
