## Set up

### Set up for Colab

In [4]:
# For running the notebook in Colab: mount Google Drive and switch the working
# directory to the project folder so the relative paths used later resolve.
from google.colab import drive
drive.mount('/content/drive')
import os
# NOTE(review): user-specific Drive location — adjust to wherever the repository lives.
root_of_repository = '/content/drive/MyDrive/ADL/Project/'
os.chdir(root_of_repository)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Install Hugging Face transformers (the recorded run resolved to transformers 4.25.1).
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 4.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 53.3 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 77.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


### Packages

In [5]:
import torch
from transformers import BertModel, BertTokenizer
from transformers import logging
logging.set_verbosity_error()
import torch.nn as nn
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, RandomSampler

import numpy as np
import pandas as pd
import json
import copy
import time
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

## Data Preprocessing

In [6]:
# --- Preprocessing configuration ---
SEQ_LENGTH = 128  # Every sentence is padded or chopped to this many token ids
BATCH_SIZE = 8  # Number of sentences per mini-batch

# Emotion label as written in the data files -> integer class index
LABEL_DICT = {
    'fear': 0,
    'neutral': 1,
    'sad': 2,
    'surprise': 3,
    'angry': 4,
    'happy': 5,
}

# Hugging Face BertTokenizer loaded from the "chinese_wwm_ext_pytorch" path.
# Hub alternative left by the author: BertTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")
TOKENIZER = BertTokenizer.from_pretrained("chinese_wwm_ext_pytorch")

# Data files, given relative to the working directory
DEVELOPMENT_SET_PATH = 'data/usual_train.txt'
TEST_SET_PATH = 'data/usual_test_labeled.txt'

def convert_text_to_token(tokenizer, sentence, seq_length):
    """Tokenize one sentence into fixed-length BERT inputs.

    The sentence is wrapped as ``[CLS] tokens [SEP]`` and then padded or
    truncated so that the three returned lists all have ``seq_length``
    entries. When the sentence is too long, the word-piece tokens are cut
    (never the special tokens), so a truncated sequence still ends with
    ``[SEP]``.

    Args:
        tokenizer (PreTrainedTokenizer): a pretrained tokenizer with special token set to
            {'unk_token': '[UNK]', 'sep_token': '[SEP]',
             'pad_token': '[PAD]', 'cls_token': '[CLS]',
             'mask_token': '[MASK]'}
        sentence (str): raw text to encode
        seq_length (int): length of maximum input sentence accepted,
            counting the [CLS] and [SEP] positions

    Returns: tuple(word_ids, segments, attention_masks)
        word_ids (list): token ids, padded with the tokenizer's pad id
            (0 when the tokenizer does not expose ``pad_token_id``)
        segments (list): 0 for sentence positions, 1 for padding positions
        attention_masks (list): 1 for real tokens, 0 for padding positions
    """
    # Reserve two positions for [CLS] and [SEP] so truncation never drops [SEP]
    max_tokens = max(seq_length - 2, 0)
    tokens = tokenizer.tokenize(sentence)[:max_tokens]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    # The extra slice only has an effect when seq_length < 2
    word_ids = tokenizer.convert_tokens_to_ids(tokens)[:seq_length]
    segments = [0] * len(word_ids)
    attention_masks = [1] * len(word_ids)

    # Prefer the tokenizer's own pad id; fall back to 0 (index of [PAD] in the BERT vocabulary)
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    # Pad up to seq_length (a non-positive length_to_pad appends nothing)
    length_to_pad = seq_length - len(word_ids)
    word_ids += [pad_id] * length_to_pad
    segments += [1] * length_to_pad  # [1] denotes that this part is padding
    attention_masks += [0] * length_to_pad  # Padding must not be attended to
    assert len(word_ids) == len(segments) == len(attention_masks)
    return word_ids, segments, attention_masks

In [7]:
def genDataLoader(data_type):
    '''Construct a shuffled DataLoader for one data split.

    The test split is read from TEST_SET_PATH. The train and validation
    splits are both cut from DEVELOPMENT_SET_PATH with a fixed random_state,
    so separate calls for 'train' and 'val' see the same partition. When the
    notebook-level flag TESTING is true, only a 320-sentence subset of the
    development set is used.

    Args:
        data_type (str): 'train' in training, 'val' in validating, 'test' in testing

    Returns:
        DataLoader: yields batches of (word_ids, segments, attention_masks, target)
            LongTensors; target has shape (batch, 1).

    Raises:
        ValueError: if data_type is not 'train', 'val' or 'test'
    '''
    # Reject typos early instead of silently treating them as the validation split
    if data_type not in ('train', 'val', 'test'):
        raise ValueError(f"data_type must be 'train', 'val' or 'test', got {data_type!r}")

    if data_type == 'test':
        with open(TEST_SET_PATH, encoding='utf8') as file:
            data = json.load(file)
    else:
        with open(DEVELOPMENT_SET_PATH, encoding='utf8') as file:
            data = json.load(file)
        # TESTING_STAGE: shrink the development set for quick smoke runs
        if TESTING:
            dev_set, _ = train_test_split(data, train_size=320, random_state=4995)
            train_set, val_set = train_test_split(dev_set, test_size=0.2, random_state=4995)
        else:
            train_set, val_set = train_test_split(data, test_size=0.2, random_state=4995)
        data = train_set if data_type == 'train' else val_set

    ids_pool = []
    segments_pool = []
    masks_pool = []
    target_pool = []
    # Process all the sentences
    for count, each in enumerate(data, start=1):
        cur_ids, cur_type, cur_mask = convert_text_to_token(TOKENIZER, each['content'], seq_length=SEQ_LENGTH)
        ids_pool.append(cur_ids)
        segments_pool.append(cur_type)
        masks_pool.append(cur_mask)
        # Wrapped in a list so the stacked targets have shape (N, 1)
        target_pool.append([LABEL_DICT[each['label']]])
        if count % 2000 == 0:
            print(f'Processed {count} sentences for {data_type}')

    # Construct Data Generator
    data_gen = TensorDataset(torch.LongTensor(np.array(ids_pool)),
                             torch.LongTensor(np.array(segments_pool)),
                             torch.LongTensor(np.array(masks_pool)),
                             torch.LongTensor(np.array(target_pool)))
    sampler = RandomSampler(data_gen)
    loader = DataLoader(data_gen, sampler=sampler, batch_size=BATCH_SIZE)
    return loader

In [8]:
# Set TESTING = True to smoke-test the pipeline on a 320-sentence subset of the development set
TESTING = False
# Build the three loaders in the order train -> val -> test
train_datagen, val_datagen, test_datagen = [
    genDataLoader(split) for split in ('train', 'val', 'test')
]

Processed 2000 sentences for train
Processed 4000 sentences for train
Processed 6000 sentences for train
Processed 8000 sentences for train
Processed 10000 sentences for train
Processed 12000 sentences for train
Processed 14000 sentences for train
Processed 16000 sentences for train
Processed 18000 sentences for train
Processed 20000 sentences for train
Processed 22000 sentences for train
Processed 2000 sentences for val
Processed 4000 sentences for val
Processed 2000 sentences for test
Processed 4000 sentences for test


## Modeling

In [9]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
DEVICE

device(type='cuda')


### Load Pretrain Model

- `wwm` stands for whole word masking; the `ext` suffix marks models pre-trained on an extended (EXT) dataset

In [10]:
# Registry of the Chinese pre-trained checkpoints to fine-tune:
# display name -> (Hugging Face model id, hidden size fed to the classifier head)
MODELS_PATHS_UNITS = {
    name: (hub_id, hidden_units)
    for name, hub_id, hidden_units in (
        ('BERT', 'bert-base-chinese', 768),
        ('BERT-wwm', 'hfl/chinese-bert-wwm-ext', 768),
        ('RoBERTa', 'uer/chinese_roberta_L-12_H-768', 768),
        ('RoBERTa-wwm', 'hfl/chinese-roberta-wwm-ext', 768),
        ('RoBERTa-wwm-large', 'hfl/chinese-roberta-wwm-ext-large', 1024),
        ('Re-trained RoBERTa-wwm', 'hfl/rbt3', 768),
        ('Re-trained RoBERTa-wwm-large', 'hfl/rbtl3', 1024),
    )
}

In [11]:
class Model(nn.Module):
    """Pre-trained BERT-style encoder followed by a linear classification head.

    Args:
        num_classes (int): number of output classes (size of the logits vector)
        model_name (str): key of MODELS_PATHS_UNITS selecting the checkpoint
            and the hidden size used as the head's input width
    """

    def __init__(self, num_classes, model_name):
        super().__init__()
        checkpoint, hidden_units = MODELS_PATHS_UNITS[model_name]
        # return_dict=False makes the encoder return a plain tuple, unpacked in forward()
        self.bert = BertModel.from_pretrained(checkpoint, return_dict=False)
        # Fine-tune the whole encoder: every parameter is allowed to be updated
        self.bert.requires_grad_(True)
        # Linear head producing raw logits; the softmax is embedded in the
        # cross-entropy loss used for training
        self.fc = nn.Linear(hidden_units, num_classes)

    def forward(self, x, token_type_ids, attention_mask):
        """Return the class logits for a batch of tokenized sentences.

        Args:
            x: word ids of the input sentences
            token_type_ids: segment labels passed through to the encoder
            attention_mask: mask passed through to the encoder (padding is masked out)

        Returns:
            Output of the linear head applied to the encoder's pooled output
            (unnormalized scores, not probabilities).
        """
        _sequence_output, pooled = self.bert(
            x,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        return self.fc(pooled)

### Fine-tuning

In [12]:
def train(model, model_name, train_loader, test_loader, optimizer, device=DEVICE):
    '''Fine-tune the model and keep the weights with the best validation accuracy.

    Runs NUM_EPOCHS epochs over train_loader. After every epoch the model is
    evaluated on test_loader with test(), and its state dict is saved under
    BEST_MODEL_FOLDER whenever the validation accuracy improves. Reads the
    notebook-level settings NUM_EPOCHS, BEST_MODEL_FOLDER and TESTING.

    Args:
        model (nn.Module): classifier called as model(word_ids, token_type_ids=..., attention_mask=...)
        model_name (str): name used in the checkpoint file name and in the log
        train_loader (DataLoader): batches of (word_ids, token_types, attention_masks, y)
        test_loader (DataLoader): loader evaluated after every epoch
        optimizer (torch.optim.Optimizer): optimizer over the model's parameters
        device (torch.device): device the batches are moved to

    Returns:
        pandas.DataFrame: one row per epoch with training/validation loss,
            accuracy, macro F1 and the training time in seconds.
    '''
    best_acc = 0.0
    training_loss = []
    training_acc = []
    training_f1 = []
    validation_loss = []
    validation_acc = []
    validation_f1 = []
    time_usage = []
    epochs = list(range(1, NUM_EPOCHS + 1))
    for epoch in tqdm(epochs):
        # test() switches the model to eval mode at the end of every epoch, so
        # training mode (dropout etc.) has to be re-enabled each time.
        model.train()
        batch_idx = 0
        running_loss = 0
        training_start_time = time.time()
        pred = []
        y_train = []
        for (word_ids, token_types, attention_masks, y) in tqdm(train_loader):
            word_ids, token_types, attention_masks, y = word_ids.to(device), token_types.to(device), attention_masks.to(device), y.to(device)
            # Flatten the targets to 1-D. Unlike squeeze(), this keeps a batch
            # of a single sample as shape (1,) instead of a 0-d tensor.
            y = y.reshape(-1)
            y_pred = model(word_ids, token_type_ids=token_types, attention_mask=attention_masks)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)  # Calculate Loss
            loss.backward()
            optimizer.step()
            # Logging the loss and accuracy
            running_loss += loss.item()
            pred += y_pred.argmax(dim=1).tolist()  # Class with the largest logit
            y_train += y.tolist()
            batch_idx += 1
            # Print every 1000 batches
            if batch_idx % 1000 == 0:
                print('Epoch: {} [{}/{} ({:.2f}%)]\tBatch Loss: {:.6f}\tAvg Loss: {:.6f}\t'.format(
                    epoch,
                    batch_idx * len(word_ids),
                    len(train_loader.dataset),
                    100. * batch_idx / len(train_loader),
                    loss.item(),
                    running_loss / batch_idx))
        # Compute time cost (training only, evaluation is not included)
        time_cost = time.time() - training_start_time
        time_usage.append(time_cost)
        print(f'Epoch {epoch} finished, took {time_cost:.1f}s')

        # Logging loss and accuracy, averaged over every update (batch) in the training stage
        training_loss.append(running_loss / len(train_loader))
        training_acc.append(accuracy_score(y_train, pred))
        training_f1.append(f1_score(y_train, pred, average='macro'))

        # Evaluate performance on the held-out loader, on the same device as training
        val_loss, val_acc, val_f1, _ = test(model, test_loader, device)
        validation_loss.append(val_loss)
        validation_acc.append(val_acc)
        validation_f1.append(val_f1)

        # Keep best model
        if best_acc < val_acc:
            model_path = f'{BEST_MODEL_FOLDER}best_{model_name}.pth' if not TESTING else f'{BEST_MODEL_FOLDER}best_testing_{model_name}.pth'
            torch.save(model.state_dict(), model_path)
            best_acc = val_acc

    # Output logs after all epochs
    progress_log = pd.DataFrame({'Model': model_name,
                                 'Epoch': epochs,
                                 'training_loss': training_loss,
                                 'training_acc': training_acc,
                                 'training_f1': training_f1,
                                 'validation_loss': validation_loss,
                                 'validation_acc': validation_acc,
                                 'validation_f1': validation_f1,
                                 'time_usage': time_usage
                                 })
    return progress_log

def test(model, test_loader, device=DEVICE):
    '''Evaluate the model on a loader and print a one-line summary.

    Note: the model is switched to eval mode and left in that mode.

    Args:
        model (nn.Module): classifier called as model(word_ids, token_type_ids=..., attention_mask=...)
        test_loader (DataLoader): batches of (word_ids, token_types, attention_masks, y)
        device (torch.device): device the batches are moved to

    Returns: tuple(test_loss, test_acc, test_f1, inference_time)
        test_loss (float): cross-entropy averaged over batches
        test_acc (float): accuracy over all samples
        test_f1 (float): macro-averaged F1 over all samples
        inference_time (float): seconds spent in the evaluation loop
    '''
    model.eval()
    test_loss = 0.0
    y_test = []
    pred = []
    inference_start = time.time()
    # No gradients are needed anywhere in the evaluation loop
    with torch.no_grad():
        for (word_ids, token_types, attention_masks, y) in test_loader:
            word_ids, token_types, attention_masks, y = word_ids.to(device), token_types.to(device), attention_masks.to(device), y.to(device)
            # Flatten the targets to 1-D. Unlike squeeze(), this keeps a batch
            # of a single sample as shape (1,) instead of a 0-d tensor.
            y = y.reshape(-1)
            y_ = model(word_ids, token_type_ids=token_types, attention_mask=attention_masks)
            test_loss += F.cross_entropy(y_, y).item()
            y_test += y.tolist()
            pred += y_.argmax(dim=1).tolist()  # Class with the largest logit
    inference_time = time.time() - inference_start
    test_loss /= len(test_loader)
    test_correct = accuracy_score(y_test, pred, normalize=False)
    test_acc = accuracy_score(y_test, pred)
    test_f1 = f1_score(y_test, pred, average='macro')
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%), Macro F1: {:.2f}'.format(
          test_loss, test_correct, len(test_loader.dataset),
          100. * test_acc,
          test_f1))
    return test_loss, test_acc, test_f1, inference_time

In [13]:
# Setting up
REPLACE_EXIST = False  # Set to True to re-train models whose checkpoint and log already exist
NUM_EPOCHS = 30
BEST_MODEL_FOLDER = 'result/model/'  # Path to save best model
TRAINING_LOGS_FOLDER = 'result/training/'  # Path to save training logs

# Create the output folders up front so a finished epoch is never lost to a failed save
os.makedirs(BEST_MODEL_FOLDER, exist_ok=True)
os.makedirs(TRAINING_LOGS_FOLDER, exist_ok=True)

# Fine-tune each pretrained model, skipping those whose checkpoint and log both exist
for model_name in tqdm(MODELS_PATHS_UNITS.keys()):
    print('-'*10, model_name, '-'*10)
    model_path = f'{BEST_MODEL_FOLDER}best_{model_name}.pth' if not TESTING else f'{BEST_MODEL_FOLDER}best_testing_{model_name}.pth'
    log_path = f'{TRAINING_LOGS_FOLDER}{model_name}.pickle'
    if not os.path.exists(model_path) or not os.path.exists(log_path) or REPLACE_EXIST:
        # Drop every reference to the previously trained model before loading
        # the next one, so two models are never held in memory at once.
        pretrained_model_ = sentiment_classifier = optimizer = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        pretrained_model_ = Model(num_classes=len(LABEL_DICT), model_name=model_name)
        sentiment_classifier = pretrained_model_.to(DEVICE)
        optimizer = torch.optim.Adam(sentiment_classifier.parameters(), lr=2e-5)
        training_log = train(sentiment_classifier, model_name, train_datagen, val_datagen, optimizer)
        training_log.to_pickle(log_path)
    else:
        print('already done')

  0%|          | 0/7 [00:00<?, ?it/s]

---------- BERT ----------
already done
---------- BERT-wwm ----------
already done
---------- RoBERTa ----------
already done
---------- RoBERTa-wwm ----------


Downloading:   0%|          | 0.00/689 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/412M [00:00<?, ?B/s]

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 1 finished, took 539.2s
Test set: Average loss: 0.6342, Accuracy: 4270/5554 (76.88%), Macro F1: 0.75


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 2 finished, took 528.0s
Test set: Average loss: 0.6809, Accuracy: 4244/5554 (76.41%), Macro F1: 0.73


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 3 finished, took 528.2s
Test set: Average loss: 0.8045, Accuracy: 4215/5554 (75.89%), Macro F1: 0.73


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 4 finished, took 527.8s
Test set: Average loss: 0.9579, Accuracy: 4228/5554 (76.13%), Macro F1: 0.73


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 5 finished, took 529.3s
Test set: Average loss: 1.2172, Accuracy: 4169/5554 (75.06%), Macro F1: 0.71


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 6 finished, took 529.3s
Test set: Average loss: 1.1012, Accuracy: 4178/5554 (75.23%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 7 finished, took 529.7s
Test set: Average loss: 1.1452, Accuracy: 4244/5554 (76.41%), Macro F1: 0.73


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 8 finished, took 529.6s
Test set: Average loss: 1.2902, Accuracy: 4173/5554 (75.14%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 9 finished, took 529.5s
Test set: Average loss: 1.4050, Accuracy: 4206/5554 (75.73%), Macro F1: 0.73


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 10 finished, took 529.2s
Test set: Average loss: 1.2751, Accuracy: 4234/5554 (76.23%), Macro F1: 0.73


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 11 finished, took 528.0s
Test set: Average loss: 1.2636, Accuracy: 4121/5554 (74.20%), Macro F1: 0.70


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 12 finished, took 528.4s
Test set: Average loss: 1.2594, Accuracy: 4202/5554 (75.66%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 13 finished, took 527.7s
Test set: Average loss: 1.3448, Accuracy: 4207/5554 (75.75%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 14 finished, took 527.5s
Test set: Average loss: 1.3108, Accuracy: 4251/5554 (76.54%), Macro F1: 0.73


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 15 finished, took 527.6s
Test set: Average loss: 1.2378, Accuracy: 4217/5554 (75.93%), Macro F1: 0.73


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 16 finished, took 528.9s
Test set: Average loss: 1.3847, Accuracy: 4207/5554 (75.75%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 17 finished, took 527.6s
Test set: Average loss: 1.4591, Accuracy: 4215/5554 (75.89%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 18 finished, took 527.6s
Test set: Average loss: 1.5570, Accuracy: 4151/5554 (74.74%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 19 finished, took 527.5s
Test set: Average loss: 1.5082, Accuracy: 4139/5554 (74.52%), Macro F1: 0.71


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 20 finished, took 527.7s
Test set: Average loss: 1.3888, Accuracy: 4146/5554 (74.65%), Macro F1: 0.71


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 21 finished, took 528.1s
Test set: Average loss: 1.3191, Accuracy: 4128/5554 (74.32%), Macro F1: 0.70


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 22 finished, took 528.8s
Test set: Average loss: 1.3407, Accuracy: 4162/5554 (74.94%), Macro F1: 0.71


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 23 finished, took 529.3s
Test set: Average loss: 1.4736, Accuracy: 4169/5554 (75.06%), Macro F1: 0.71


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 24 finished, took 529.1s
Test set: Average loss: 1.4347, Accuracy: 4181/5554 (75.28%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 25 finished, took 528.9s
Test set: Average loss: 1.3142, Accuracy: 4075/5554 (73.37%), Macro F1: 0.69


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 26 finished, took 528.9s
Test set: Average loss: 1.3952, Accuracy: 4187/5554 (75.39%), Macro F1: 0.73


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 27 finished, took 529.1s
Test set: Average loss: 1.3608, Accuracy: 4142/5554 (74.58%), Macro F1: 0.71


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 28 finished, took 529.3s
Test set: Average loss: 1.5831, Accuracy: 4159/5554 (74.88%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 29 finished, took 529.4s
Test set: Average loss: 1.4510, Accuracy: 4156/5554 (74.83%), Macro F1: 0.72


  0%|          | 0/2777 [00:00<?, ?it/s]

Epoch 30 finished, took 529.0s
Test set: Average loss: 1.4628, Accuracy: 4177/5554 (75.21%), Macro F1: 0.71
---------- RoBERTa-wwm-large ----------
already done
---------- Re-trained RoBERTa-wwm ----------
already done
---------- Re-trained RoBERTa-wwm-large ----------
already done
