<a href="https://colab.research.google.com/github/Jace-Yang/Multiclass_Sentiment_Classification_Chinese/blob/main/Evaluating.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set up

### Set up for Colab

In [1]:
# For runing notebook in colab
from google.colab import drive
drive.mount('/content/drive')
import os
root_of_repository = '/content/drive/MyDrive/ADL/Project/'
os.chdir(root_of_repository)

Mounted at /content/drive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 45.3 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 63.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


### Packages

In [3]:
import torch
from transformers import BertModel, BertTokenizer
from transformers import logging
logging.set_verbosity_error()
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

import numpy as np
import pandas as pd
import json
import copy
import time
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

In [17]:
from model_utils import test

In [13]:
# class SentimentModel(nn.Module):
class Model(nn.Module):
    def __init__(self, num_classes, model_name, pretrain_path, hidden_size):
        '''
        pretrain_path: local or hugging-face path, e.g '/roberta-wwm-ext pretrain/'
        '''
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(pretrain_path, return_dict=False)
        for param in self.bert.parameters():
            param.requires_grad = True  # Allow all parameters to be updated
        self.fc = nn.Linear(hidden_size, num_classes)   # A layer to calculate logits of 6 ouput classes from 768 (hidden size of BERT)
            # Note: We are going to use Cross-EntropyLoss with a softmax “embedded”.
    def forward(self, x, token_type_ids, attention_mask):
        context = x  # Input sentence
        segments = token_type_ids
        mask = attention_mask  # Only mask the padding part
        _, pooled = self.bert(context, token_type_ids=segments, attention_mask=mask)
        logits = self.fc(pooled) # probability of 6 classes
        return logits

## Data Preprocessing

In [5]:
SEQ_LENGTH = 128
BATCH_SIZE = 8
LABEL_DICT = {'fear':0, 'neutral':1, 'sad':2, 'surprise':3, 'angry':4, 'happy':5} # Mapping label code and meaning
TOKENIZER = BertTokenizer.from_pretrained("chinese_wwm_ext_pytorch") # Hugging face BertTokenizer to load pretrain model

#tokenizer = BertTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")
#model = BertModel.from_pretrained("hfl/chinese-bert-wwm-ext")


DEVELOPMENT_SET_PATH = 'data/usual_train.txt'
TEST_SET_PATH = 'data/usual_test_labeled.txt'

def convert_text_to_token(tokenizer, sentence, seq_length):
    """Tokenize sentence

    Args:
        tokenizer (PreTrainedTokenizer): a pretrained tokenizer with special token set to 
            {'unk_token': '[UNK]', 'sep_token': '[SEP]', 
             'pad_token': '[PAD]', 'cls_token': '[CLS]', 
             'mask_token': '[MASK]'}
        sentence (str): 
        seq_length (int): length of maximum input sentence accepted
    
    Returns: tuple(word_ids, segments, attention_masks)
        word_ids (list): tokenized sentence
        segments (list): label segmentation of original sentence and padding
        attention_masks (list): label whether the word is masked
    """ 
    tokens = tokenizer.tokenize(sentence) # Tokenize the sentence
    tokens = ["[CLS]"] + tokens + ["[SEP]"] # Add [CLS] before token and [SEP] after token
    word_ids = tokenizer.convert_tokens_to_ids(tokens) # Generate list of word id
    segments = [0] * len(word_ids) # Label whether it is segmented
    attention_masks = [1] * len(word_ids) # Label whether the word is masked
    # Chop or pad the sentence into a single length - seq_length
    if len(word_ids) < seq_length: # Padding
        length_to_pad = seq_length - len(word_ids)
        word_ids += [0] * length_to_pad # [0] is the index of word "PAD" in the vocabulary table
        segments += [1] * length_to_pad # [1] denotes that this part of words are PAD
        attention_masks += [0] * length_to_pad # Change attention mask of PAD part as [0]
    else: # Chopping
        word_ids = word_ids[:seq_length]
        segments = segments[:seq_length]
        attention_masks = attention_masks[:seq_length]
    assert len(word_ids) == len(segments) == len(attention_masks)
    return word_ids, segments, attention_masks

In [6]:
def genDataLoader(data_type):
    '''Construct dataset loader

    Args:
        data_type (str): 'train' in training, 'val' in validating, 'test' in testing
    '''
    if data_type == 'test':
        with open(TEST_SET_PATH, encoding='utf8') as file:
            data = json.load(file)
    else:
        with open(DEVELOPMENT_SET_PATH, encoding='utf8') as file:
            data = json.load(file)
            # TESTING_STAGE
            if TESTING:
                dev_set, _ = train_test_split(data, train_size=160, random_state=4995)
                train_set, val_set = train_test_split(dev_set, test_size=0.2, random_state=4995)
            else:
                train_set, val_set = train_test_split(data, test_size=0.2, random_state=4995)
            data = train_set if data_type == 'train' else val_set
    ids_pool = []
    segments_pool = []
    masks_pool = []
    target_pool = []
    count = 0
    # Process all the sentences
    for each in data:
        cur_ids, cur_type, cur_mask = convert_text_to_token(TOKENIZER, each['content'], seq_length = SEQ_LENGTH)
        ids_pool.append(cur_ids)
        segments_pool.append(cur_type)
        masks_pool.append(cur_mask)
        cur_target = LABEL_DICT[each['label']]
        target_pool.append([cur_target])
        count += 1
        if count % 2000 == 0:
            print(f'Processed {count} sentences for {data_type}')
    # Construct Data Generater
    data_gen = TensorDataset(torch.LongTensor(np.array(ids_pool)),
                             torch.LongTensor(np.array(segments_pool)),
                             torch.LongTensor(np.array(masks_pool)),
                             torch.LongTensor(np.array(target_pool)))
    sampler = RandomSampler(data_gen)
    loader = DataLoader(data_gen, sampler=sampler, batch_size=BATCH_SIZE)
    return loader

In [7]:
TESTING = True
train_datagen = genDataLoader('train')
val_datagen = genDataLoader('val')
test_datagen = genDataLoader('test')

Processed 2000 sentences for test
Processed 4000 sentences for test


In [8]:
BEST_MODEL_FOLDER = 'result/model/'  # Path to save best model
TRAINING_LOGS_FOLDER = 'result/training/'  # Path to save training logs
TESTING_LOGS_FOLDER = 'result/testing/'  # Path to save testing logs

## Evaluating

In [9]:
# All pretrain models in chinese
MODELS_PATHS_UNITS = {
    'BERT': ('bert-base-chinese', 768),
    'BERT-wwm': ('hfl/chinese-bert-wwm-ext', 768),
    'RoBERTa': ('uer/chinese_roberta_L-12_H-768', 768),
    'RoBERTa-wwm': ('hfl/chinese-roberta-wwm-ext', 768),
    'RoBERTa-wwm-large': ('hfl/chinese-roberta-wwm-ext-large', 1024),
    'Re-trained RoBERTa-wwm': ('hfl/rbt3', 768),
    'Re-trained RoBERTa-wwm-large': ('hfl/rbtl3', 1024),
}

### Training Performance

In [10]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cpu')

In [14]:
training_results = []
for model_name in tqdm(MODELS_PATHS_UNITS.keys()):
    log_path = f'{TRAINING_LOGS_FOLDER}{model_name}.pickle'
    if os.path.exists(log_path):
        training_log = pd.read_pickle(log_path)
        training_results.append(training_log)
training_result = pd.concat(training_results, axis=0)
training_result.to_excel('output/training_result.xlsx', index=False)

  0%|          | 0/7 [00:00<?, ?it/s]

### Testing Performance

In [None]:
REPLACE_EXIST = True

# Evaluate each pretrain model
for model_name in tqdm(MODELS_PATHS_UNITS.keys()):
    print('-'*10, model_name, '-'*10)
    result_log_path = f'{TESTING_LOGS_FOLDER}{model_name}.pickle'

    if not os.path.exists(result_log_path) or REPLACE_EXIST:
        model_path = f'{BEST_MODEL_FOLDER}best_{model_name}.pth' if not TESTING else f'{BEST_MODEL_FOLDER}best_testing_{model_name}.pth'
        # Initialize a model
        sentiment_classifier = Model(num_classes=6,
                                     model_name=model_name, 
                                     pretrain_path=MODELS_PATHS_UNITS[model_name][0],
                                     hidden_size=MODELS_PATHS_UNITS[model_name][1]).to(DEVICE)

        # Load model parameters
        sentiment_classifier.load_state_dict(torch.load(model_path, map_location=DEVICE))

        # Evaluate on testset
        loss, accuracy, f1, inference_time = test(sentiment_classifier, test_datagen, device=DEVICE)

        # Logging
        result = pd.DataFrame([[model_name, loss, accuracy, inference_time]], columns=['model_name', 'loss', 'accuracy', 'inference_time'])
        result.to_pickle(result_log_path)

  0%|          | 0/7 [00:00<?, ?it/s]

---------- BERT ----------


In [None]:
testing_results = []
for model_name in tqdm(MODELS_PATHS_UNITS.keys()):
    log_path = f'{TESTING_LOGS_FOLDER}{model_name}.pickle'
    if os.path.exists(log_path):
        testing_log = pd.read_pickle(log_path)
        testing_results.append(testing_log)
testing_result = pd.concat(testing_results, axis=0)
testing_result.to_excel('output/testing_result.xlsx', index=False)
testing_result

  0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,model_name,loss,accuracy,inference_time
0,BERT,0.6476,0.510827,35.820117
0,BERT-wwm,0.6998,0.63303,34.321145
0,RoBERTa,0.707,0.635746,36.553144
0,RoBERTa-wwm,0.6844,0.591163,37.308538
0,RoBERTa-wwm-large,0.6802,0.577993,129.985601
0,Re-trained RoBERTa-wwm,0.603,0.475079,9.177177
0,Re-trained RoBERTa-wwm-large,0.6298,0.531702,16.389246


In [None]:
sentiment_classifier = copy.deepcopy(pretrained_model)
sentiment_classifier.load_state_dict(torch.load(BEST_MODEL_PATH))
print('模型加载完毕')

def pred(word, model):
    cur_ids, cur_type, cur_mask = convert_text_to_token(TOKENIZER, word, seq_length=SEQ_LENGTH)
    cur_ids, cur_type, cur_mask = torch.LongTensor(np.array([cur_ids])).to(DEVICE), torch.LongTensor(np.array([cur_type])).to(DEVICE), torch.LongTensor(np.array([cur_mask])).to(DEVICE) # 数据构造成tensor形式
    with torch.no_grad():
        y_ = model(cur_ids, token_type_ids=cur_type, attention_mask=cur_mask)
        pred = y_.max(-1, keepdim=True)[1]  # 取最大值
        # cur_pre = LABEL_DICT[int(pred[0][0].cuda().data.cpu().numpy())] # 预测的情绪
        cur_pre = LABEL_DICT[int(pred[0][0].data.cpu().numpy())] # 预测的情绪
        print(cur_pre)


NameError: ignored

In [None]:
pred('草泥马好可爱', MODEL)

angry


In [None]:
pred('草泥马是什么神仙物种', MODEL)

angry


In [None]:
pred('草！我爱死你了！！！！！！！！！！！', MODEL)

angry


In [None]:
pred('世界上五大最可爱动物:草泥马第二', MODEL)

happy
