## Import Libraries

In [1]:
import os
import csv
import itertools
import pickle

import numpy as np
import pandas as pd

from tqdm import tqdm_notebook, trange
from IPython.display import clear_output

# Raise pandas' display limits so long / wide review frames are not elided.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

## Preprocess Dataset - step1

```
 資料預處理步驟 ->
 1. 資料的 row 是以 reviews, label 輪流交替出現，且會有資料沒對齊的狀況，若沒對齊 -> 刪除。
 2. 為了節省記憶體空間，整串句子的長度若超過 200，會被截斷為前 200 個字元（並非刪除）。
    length -> 使用 BERT 斷詞後的 WordPiece token 去計算個數。
 3. 輸出 dataframe，其中 columns 分別為單一句子，整串句子concatenation 以及 label。
```

In [3]:
from transformers import BertTokenizer
# Name of the pretrained checkpoint handed to BertTokenizer.from_pretrained.
PRETRAINED_MODEL_NAME = "bert-base-chinese"
# Module-level tokenizer; data_processing() below uses it to count tokens.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

def data_processing(file, max_len=200, tokenize=None):
    """Parse the alternating review/label TSV file into a flat DataFrame.

    The file is read as tab-separated rows: every even row (0, 2, ...) holds
    the sentences of one review and the odd row after it holds one label per
    sentence.

    Args:
        file: Path of the TSV file to read (opened as UTF-8).
        max_len: When the concatenated review has more than ``max_len`` tokens
            it is cut down to its first ``max_len`` characters.
        tokenize: Callable turning a string into a list of tokens. Defaults to
            the ``tokenize`` method of the module-level ``tokenizer``.

    Returns:
        pandas.DataFrame with one row per sentence and the columns
        ``reviews`` (the sentence), ``row_reviews`` (all sentences of the same
        review joined together, possibly truncated) and ``label``.
    """
    if tokenize is None:
        tokenize = tokenizer.tokenize

    # newline='' is what the csv module expects for files it parses itself.
    with open(file, encoding='utf-8', newline='') as f:
        rows = list(csv.reader(f, delimiter='\t', quotechar='"'))

    # Even rows are reviews, odd rows are labels. zip() ignores a trailing
    # review row that has no label row. Pairs of different length are
    # misaligned and dropped; filtering into a new list (instead of popping
    # from the lists being iterated) guarantees that every pair is checked.
    aligned_pairs = [
        (sentences, labels)
        for sentences, labels in zip(rows[0::2], rows[1::2])
        if len(sentences) == len(labels)
    ]

    reviews = []
    row_reviews = []
    label = []
    for sentences, labels in aligned_pairs:
        whole_review = "".join(sentences)
        # Length is measured in tokens, but the cut itself is made on characters.
        if len(tokenize(whole_review)) > max_len:
            whole_review = whole_review[:max_len]

        reviews.extend(sentences)
        # Every sentence carries the full review it belongs to as context.
        row_reviews.extend([whole_review] * len(sentences))
        label.extend(labels)

    return pd.DataFrame({
        'reviews': reviews,
        'row_reviews': row_reviews,
        'label': label,
    })

# Raw training file, given as a path relative to the working directory.
file_name = 'training_set.tsv'
df_bert = data_processing(file_name)
df_bert.head()

## Preprocess sentence words - step2

``` 
 資料預處理步驟 ->
 1. 移除 punctuations
 2. 移除不需要的字詞、空白、相對不重要的英文數字序號。
 3. 替換一些不常見的字詞。
```

In [2]:
import re
from zhon.hanzi import punctuation
import string

# Append ASCII punctuation to the set imported from zhon.hanzi (this rebinds the imported name).
punctuation =  punctuation + string.punctuation

ModuleNotFoundError: No module named 'zhon'

In [3]:
# re.escape() keeps characters such as `]`, `\`, `^` and `-` literal inside the
# character class; unescaped, the backslash of string.punctuation would only
# act as an escape for the `]` after it and never be removed itself.
remove_punctuation = re.compile('[{}]'.format(re.escape(punctuation)))
df_bert.reviews = df_bert.reviews.apply(lambda x: remove_punctuation.sub("", x))

def remove_init_num(x):
    """Drop one leading ASCII digit from ``x`` if the second character is not a digit.

    Strings shorter than two characters, and strings whose second character is
    a digit, are returned unchanged.
    """
    if len(x) < 2 or x[1].isdigit():
        return x
    # Same effect as deleting a match of the pattern ^[0-9].
    return x[1:] if x[0] in '0123456789' else x

def remove_space(x):
    """Return ``x`` with every run of whitespace removed."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub('', x)

def sub_unk_eng(x, sub_word):
    """Replace each ASCII letter followed by at least one ASCII letter or digit with ``sub_word``.

    A lone letter (no letter/digit after it) is left untouched.
    """
    return re.sub(r'[A-Za-z][0-9A-Za-z]+', sub_word, x)


# Whitespace is stripped before the leading-digit rule is applied.
df_bert.reviews = df_bert.reviews.apply(remove_space).apply(remove_init_num)
# Keep the cleaned sentence and store the variant with English/alphanumeric words replaced separately.
df_bert['sub_reviews'] = df_bert['reviews'].apply(lambda text: sub_unk_eng(text, '产品'))
df_bert.head()

NameError: name 'punctuation' is not defined

In [5]:
# Same three cleaning steps as for the single sentences, applied in the same order.
df_bert.row_reviews = (
    df_bert.row_reviews
    .apply(remove_space)
    .apply(remove_init_num)
    .apply(lambda text: sub_unk_eng(text, '产品'))
)
df_bert.head()

Unnamed: 0,reviews,row_reviews,label,sub_reviews
0,千呼万唤始出来,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,neutral,千呼万唤始出来
1,尼康的APSC小相机终于发布了,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,neutral,尼康的产品小相机终于发布了
2,COOLPIXA你怎么看呢,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,neutral,产品你怎么看呢
3,我看尼康是挤牙膏挤惯了啊,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,neutral,我看尼康是挤牙膏挤惯了啊
4,外观既没有V1时尚,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,negative,外观既没有产品时尚


## Preprocess Dataset - step3

```
---> 替換部分經 BERT tokenize 後會變成 [UNK] 的字眼。
```

In [6]:
import torch
from transformers import BertTokenizer
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-chinese"  # pretrained BERT-base checkpoint for traditional/simplified Chinese

# Load the tokenizer that belongs to this pretrained model
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()

```
Note:
查看有哪些 token 被轉換為 [UNK]
```

In [7]:
def to_token_ids(x):
    """Return the result of ``tokenizer.tokenize(x)`` for the module-level tokenizer.

    Despite the name, no id conversion happens here; the caller below compares
    the returned items against the string "[UNK]".
    """
    return tokenizer.tokenize(x)
    
df_bert['reviews_tokens'] = df_bert['sub_reviews'].apply(to_token_ids)
reviews_tokens = df_bert['reviews_tokens'].tolist()

# Positions of the rows that contain at least one [UNK] token. Each row is
# recorded once, so a sentence with several [UNK] tokens is not repeated in
# df_unk_bert.
unk_idx = [idx for idx, review in enumerate(reviews_tokens) if "[UNK]" in review]

df_unk_bert = df_bert.iloc[unk_idx]

```
Note:
替換部分 [UNK] 為對應的產品。
```

In [8]:
# Substring -> replacement text, passed to sub_unknown() below.
# Most keys are product/model codes, mapped to a descriptive phrase.
replace_dict = {
    '1D': '佳能相機',
    '5D': '佳能相機',
    '6D': '佳能相機',
    '7D': '佳能相機',
    '40D': '佳能相機',
    '50D': '佳能相機',
    '60D': '佳能相機',
    '70D': '佳能相機',
    '350D': '佳能相機',
    '500D': '佳能相機',
    '550D': '佳能相機',
    '650D': '佳能相機',
    'P系列': '尼康相機',
    'D300': '尼康相機',
    'D200': '尼康相機',
    'ＬＣＤ': '液晶顯示器',
    '徕卡M': '徕卡相機',
    '徕卡X': '徕卡相機',
    '1855Ⅱ': '鏡頭',
    'K卡口': '卡口',
    '廋廋': '瘦瘦',
}

In [9]:
# Plain-list copies of both text columns; they are passed to sub_unknown() below.
sub_reviews = df_bert['sub_reviews'].tolist()
row_reviews = df_bert['row_reviews'].tolist()

def sub_unknown(sub_dict, sub_list, df, col):
    """Replace every ``sub_dict`` key found in ``sub_list`` and store the result in ``df[col]``.

    Args:
        sub_dict: Mapping of substring -> replacement text.
        sub_list: Strings to process; item ``i`` belongs to the ``i``-th row
            of ``df``.
        df: DataFrame that is updated in place.
        col: Name of the column of ``df`` that receives the replaced strings.

    Returns:
        None. Only rows whose text actually changed are written.
    """
    if not sub_dict:
        return

    # One alternation over all keys, longest first, so that a key such as
    # '350D' wins over its own substring '50D'. Keys are escaped because they
    # are literal text, not regular expressions.
    keys = sorted(sub_dict, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in keys))

    for idx, review in enumerate(sub_list):
        if not isinstance(review, str):
            continue
        replaced = pattern.sub(lambda match: sub_dict[match.group(0)], review)
        if replaced != review:
            # idx is a position in sub_list; translate it to the row label.
            df.loc[df.index[idx], col] = replaced
        
        
sub_unknown(replace_dict, sub_reviews, df_bert, 'sub_reviews')
sub_unknown(replace_dict, row_reviews, df_bert, 'row_reviews')

# The helper columns are no longer needed once the substitutions are done.
df_bert = df_bert.drop(columns=['reviews', 'reviews_tokens'])

# Rotate the last column to the front; every other column keeps its order.
column_order = df_bert.columns.tolist()
cols = column_order[-1:] + column_order[:-1]
df_bert = df_bert.loc[:, cols]
df_bert.head()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




Unnamed: 0,sub_reviews,row_reviews,label
0,千呼万唤始出来,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,neutral
1,尼康的产品小相机终于发布了,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,neutral
2,产品你怎么看呢,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,neutral
3,我看尼康是挤牙膏挤惯了啊,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,neutral
4,外观既没有产品时尚,千呼万唤始出来，尼康的产品小相机终于发布了，产品.你怎么看呢？我看，尼康是挤牙膏挤惯了啊，1...,negative


## Viewing the proportion of different labels

In [10]:
# Label ids used further down: negative = 0, neutral = 1, positive = 2.
# Inverse relative frequency of every label (the rarer the label, the larger the value).
label_share = df_bert['label'].value_counts() / len(df_bert)
1 / label_share

neutral      1.283216
positive     7.062500
negative    12.639821
Name: label, dtype: float64

---

## Split train test dataset

```
我們僅有 training dataset，但要在 12/25當天實際預測 testing dataset。
故預先切分以調整測試。
```

In [None]:
from sklearn.model_selection import train_test_split
train_df_bert, test_df_bert = train_test_split(
    df_bert,
    test_size=0.2,
    random_state=42,
)

# Write both splits as headered TSV files; ReviewsDataset reads them back as "<mode>.tsv".
for split_path, split_frame in (('train.tsv', train_df_bert), ('test.tsv', test_df_bert)):
    split_frame.to_csv(split_path, sep='\t', index=False)

## 建構一個用來讀取訓練 / 測試集的 Dataset object

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import BertTokenizer

# Checkpoint name and tokenizer; the tokenizer is passed to ReviewsDataset further down.
PRETRAINED_MODEL_NAME = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

class ReviewsDataset(Dataset):
    """Sentence-pair dataset read from ``train.tsv`` or ``test.tsv``.

    The first column of the file becomes BERT segment A and the second column
    becomes segment B. In ``"train"`` mode the third column is mapped to a
    class id through ``label_map``; in ``"test"`` mode no label is returned.
    """

    def __init__(self, mode, tokenizer):
        """Load ``<mode>.tsv`` from the working directory.

        Args:
            mode: Either ``"train"`` or ``"test"``.
            tokenizer: Object providing ``tokenize`` and
                ``convert_tokens_to_ids``.
        """
        assert mode in ["train", "test"]
        self.mode = mode
        # Missing cells become empty strings so they can still be tokenized.
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {"negative": 0, "neutral": 1, "positive": 2}
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return ``(tokens_tensor, segments_tensor, label_tensor)`` for row ``idx``.

        ``label_tensor`` is ``None`` in ``"test"`` mode.
        """
        if self.mode == "test":
            review_a, review_b = self.df.iloc[idx, :2].values
            label_tensor = None  # no label is used for prediction
        else:
            review_a, review_b, label = self.df.iloc[idx, :3].values
            label_tensor = torch.tensor(self.label_map[label])

        # Segment A: [CLS] + first sentence + [SEP]
        segment_a = ["[CLS]"] + self.tokenizer.tokenize(str(review_a)) + ["[SEP]"]
        # Segment B: second sentence + [SEP]
        segment_b = self.tokenizer.tokenize(str(review_b)) + ["[SEP]"]

        ids = self.tokenizer.convert_tokens_to_ids(segment_a + segment_b)
        tokens_tensor = torch.tensor(ids)

        # Segment ids: 0 for all of segment A (including [CLS] and its [SEP]),
        # 1 for segment B. The lengths are taken from the finished segments so
        # the special tokens are counted on the correct side.
        segments_tensor = torch.tensor(
            [0] * len(segment_a) + [1] * len(segment_b),
            dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Number of rows read from the file."""
        return self.len
    
clear_output()  # IPython.display helper: clears the output of this cell

In [None]:
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def create_mini_batch(samples):
    """Collate ``(tokens, segments, label)`` samples into padded batch tensors.

    Args:
        samples: Sequence of 3-item samples; the label item is ``None`` for
            unlabeled (test mode) samples.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)`` where
        the first three are zero-padded to the longest sequence of the batch
        and ``label_ids`` is ``None`` when the first sample has no label.
    """
    token_seqs = []
    segment_seqs = []
    for sample in samples:
        token_seqs.append(sample[0])
        segment_seqs.append(sample[1])

    # The first sample decides whether this batch carries labels.
    has_labels = samples[0][2] is not None
    label_ids = torch.stack([sample[2] for sample in samples]) if has_labels else None

    # Zero-pad every sequence to the longest one in the batch.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 where the token id is non-zero, 0 on the zero padding.
    masks_tensors = (tokens_tensors != 0).to(torch.long)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

## 使用 DataLoader 拆分 trainset 成數個 batch

In [None]:
# Initialise the training dataset and its batch loader.
# NOTE(review): no `shuffle` argument is passed, so the loader uses
# DataLoader's default ordering — confirm that this is intended for training.
trainset = ReviewsDataset("train", tokenizer=tokenizer)
BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

## 下游任務學習

In [None]:
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = 'bert-base-chinese'
NUM_LABELS = 3  # negative / neutral / positive, see ReviewsDataset.label_map

# Pretrained checkpoint loaded into a sequence-classification model with NUM_LABELS output classes.
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

## 建構預測 label 之 function

In [None]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` and return the predicted class ids.

    Args:
        model: Module called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask``; item 0 of its output is used as the logits.
        dataloader: Iterable of batches ``(tokens, segments, masks, labels)``;
            ``labels`` may be ``None`` when ``compute_acc`` is False.
        compute_acc: When True, also compare the predictions with the labels.

    Returns:
        A 1-D tensor with one predicted class id per sample (``None`` for an
        empty dataloader). With ``compute_acc=True`` a tuple
        ``(predictions, accuracy)`` is returned; the accuracy is 0.0 when no
        sample was seen.
    """
    predictions = None
    correct = 0
    total = 0

    # Follow the model's own device instead of hard-coding "cuda:0".
    device = next(model.parameters()).device

    # Dropout has to be switched off while predicting. The previous mode is
    # restored afterwards so a surrounding training loop keeps training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for data in tqdm_notebook(dataloader):
                # Labels are None in test mode and are passed through as-is.
                data = [t.to(device) if t is not None else None for t in data]

                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(
                        input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors
                )

                logits = outputs[0]
                _, pred = torch.max(logits, 1)  # index of the largest logit per sample

                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)  # number of samples seen so far
                    correct += (pred == labels).sum().item()

                if predictions is None:
                    predictions = pred
                else:
                    predictions = torch.cat((predictions, pred))
    finally:
        model.train(was_training)

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc

    return predictions

## 建構 BERT Model 並查看在未訓練狀況下之 accuracy

In [None]:
# Use the first GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)
# Accuracy on the training loader before any fine-tuning step has been taken.
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

## Train BERT model

In [None]:
# training
optimizer = torch.optim.RMSprop(model.parameters(),
                                weight_decay = 1e-6,
                                lr=1e-5)

# LogSoftmax followed by NLLLoss gives a class-weighted cross entropy on the logits.
m = nn.LogSoftmax(dim=1)
# One weight per class id (0 = negative, 1 = neutral, 2 = positive).
criterion = nn.NLLLoss(weight=torch.tensor([12.84, 1.27, 7.14]).to(device))
EPOCHS = 12
for epoch in tqdm_notebook(range(EPOCHS)):

    # Enter training mode at the start of every epoch, because the accuracy
    # check at the end of the epoch switches the model to evaluation mode.
    model.train()

    running_loss = 0.0
    for data in trainloader:

        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

        # reset the parameter gradients
        optimizer.zero_grad()

        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        # outputs[1] holds the logits because labels were passed to the model;
        # the model's own loss is ignored in favour of the weighted criterion.
        loss = criterion(m(outputs[1]), labels)
        # backward
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()

        # accumulate the loss of the current batch
        running_loss += loss.item()

    # Measure accuracy with dropout disabled. After the final epoch the model
    # therefore stays in evaluation mode for the prediction cells that follow.
    model.eval()
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[Epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

In [None]:
# Ask PyTorch to release the GPU memory it holds in its cache.
torch.cuda.empty_cache()

## 使用 DataLoader 拆分 testset 成數個 batch 並進行預測

In [None]:
# testing: in "test" mode the dataset returns None as the label of every sample
testset = ReviewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=64, 
                        collate_fn=create_mini_batch,
                        shuffle=False)

# Predict the class ids of the test split
predictions = get_predictions(model, testloader)

## 抓出 ground truth，並預測訓練後之結果

In [None]:
# ground truth
def get_test_accuracy(pred, truth):
    """Return the fraction of positions where ``pred`` equals ``truth``.

    ``pred`` is moved to the CPU before the comparison.
    """
    hits = pred.cpu().eq(truth)
    return hits.sum().item() / len(truth)

label_mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
    } 
# test.tsv is written with the columns (sub_reviews, row_reviews, label), so
# the gold labels are read from the `label` column; column 1 holds review text.
ground_truth = torch.tensor([label_mapping[i] for i in testset.df['label'].values])

get_test_accuracy(predictions, ground_truth)

## 因資料之 label 為不平衡資料，故也查看 precision/recall/f1_score

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

test_Y = ground_truth
pred_Y = predictions.cpu()

# All per-class metrics are macro-averaged.
accuracy = accuracy_score(test_Y, pred_Y)
precision, recall, fscore = (
    precision_score(test_Y, pred_Y, average='macro'),
    recall_score(test_Y, pred_Y, average='macro'),
    f1_score(test_Y, pred_Y, average='macro'),
)

metric_values = (accuracy, precision, recall, fscore)
print("Accuracy: %g\tPrecision: %g\tRecall: %g\tF-score: %g" % metric_values)