# Method 1: Prompt+Pre-trained BERT

## Design a hard prompt

In [1]:
# define a hard prompt and record the character offset of its [MASK] placeholder
def get_prompt(x):
    """Wrap a comment in the hard prompt template and locate its mask.

    Args:
        x: the raw comment text to append after the template prefix.

    Returns:
        dict with
            'prompt': the templated string (prefix containing '[MASK]', then ``x``),
            'mask_id': character offset of '[MASK]' inside that string
                (a string offset from ``str.find``, not a token index).
    """
    templated = f'总体上来说很[MASK]: {x}'
    # The template prefix holds the first '[MASK]', so find() always lands on it
    # even if the comment itself happens to contain the same text.
    mask_char_offset = templated.find('[MASK]')
    return dict(prompt=templated, mask_id=mask_char_offset)

# map each class label to a label word in the tokenizer's vocabulary (the verbalizer)
# (words with similar meanings have similar embeddings, which is why the loss can be computed on these label words)
def map_label(tokenizer):
    """Build the verbalizer: class label -> label word and its vocabulary id.

    Args:
        tokenizer: object exposing ``convert_tokens_to_ids(token)``.

    Returns:
        dict keyed by the label strings '1' and '0'; each value is
        ``{'token': <label word>, 'id': <id returned by the tokenizer>}``.
    """
    # (label, label word) pairs, kept in the original insertion order.
    label_words = (('1', '好'), ('0', '差'))
    return {
        label: {'token': word, 'id': tokenizer.convert_tokens_to_ids(word)}
        for label, word in label_words
    }


In [3]:
# a quick test with a Hugging Face tokenizer
from transformers import AutoTokenizer

# Sanity check: build the prompt for one comment and confirm where [MASK] ends up
# after tokenization.
checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

comment = '这个宾馆比较陈旧了，特价的房间也很一般。'

print('verbalizer:', map_label(tokenizer))

prompt_data = get_prompt(comment)
# 'mask_id' is a character offset into the prompt string (from str.find), not a token index
prompt, mask_offset = prompt_data['prompt'], prompt_data['mask_id']

encoding = tokenizer(prompt, truncation=True)
tokens = encoding.tokens()
# translate the character offset of [MASK] into its position in the token sequence
mask_idx = encoding.char_to_token(mask_offset)

print('prompt:', prompt)
print('prompt tokens:', tokens)
print('mask idx:', mask_idx)

verbalizer: {'1': {'token': '好', 'id': 1962}, '0': {'token': '差', 'id': 2345}}
prompt: 总体上来说很[MASK]: 这个宾馆比较陈旧了，特价的房间也很一般。
prompt tokens: ['[CLS]', '总', '体', '上', '来', '说', '很', '[MASK]', ':', '这', '个', '宾', '馆', '比', '较', '陈', '旧', '了', '，', '特', '价', '的', '房', '间', '也', '很', '一', '般', '。', '[SEP]']
mask idx: 7


## Upload dataset combined with prompts

In [5]:
from torch.utils.data import Dataset

class upload_dataset(Dataset):
    """Map-style dataset of ``comment<TAB>label`` lines, each paired with its hard prompt.

    The whole file is read eagerly in the constructor and kept in ``self.data``,
    a dict keyed by the 0-based line number.
    """

    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Parse ``data_file`` (UTF-8 text) into a dict of samples.

        Every line must consist of exactly two tab-separated fields, the comment
        and an integer label; anything else trips the assertion.

        Returns:
            dict mapping line number -> {'comment', 'label', 'prompt', 'mask_id'},
            where 'prompt' and 'mask_id' come from ``get_prompt(comment)``.
        """
        samples = {}
        with open(data_file, 'rt', encoding='utf-8') as handle:
            for row_no, raw_line in enumerate(handle):
                fields = raw_line.strip().split('\t')
                # Validate the line layout before unpacking it.
                assert len(fields) == 2
                comment_text, label_text = fields
                templated = get_prompt(comment_text)
                samples[row_no] = dict(
                    comment=comment_text,
                    label=int(label_text),
                    prompt=templated['prompt'],
                    mask_id=templated['mask_id'],
                )
        return samples

    def __len__(self):
        """Number of loaded samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample dict stored under line number ``idx``."""
        return self.data[idx]

In [6]:
# Load the three splits (each file is read eagerly by upload_dataset) and print
# their sizes plus the first training sample as a smoke test.
# train_data / valid_data / test_data are wrapped in DataLoaders by a later cell.
train_data = upload_dataset('chnsenticorp/train/part.0')
valid_data = upload_dataset('chnsenticorp/dev/part.0')
test_data = upload_dataset('chnsenticorp/test/part.0')
print(f'train set size: {len(train_data)}')
print(f'valid set size: {len(valid_data)}')
print(f'test set size: {len(test_data)}')
print(next(iter(train_data)))

train set size: 9600
valid set size: 1200
test set size: 1200
{'comment': '选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般', 'label': 1, 'prompt': '总体上来说很[MASK]: 选择珠江花园的原因就是方便，有电动扶梯直接到达海边，周围餐馆、食廊、商场、超市、摊位一应俱全。酒店装修一般，但还算整洁。 泳池在大堂的屋顶，因此很小，不过女儿倒是喜欢。 包的早餐是西式的，还算丰富。 服务吗，一般', 'mask_id': 6}


## Data preprocessing

In [31]:
import torch
import numpy as np
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
checkpoint = 'bert-base-chinese'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # module-level global: collate() below reads it
# look up the vocabulary ids of the two label words defined by the verbalizer
label = map_label(tokenizer)
pos_id, neg_id = label['1']['id'], label['0']['id']  # '1' -> positive label word, '0' -> negative label word
# Dataloader, in NLP we usually use collate_fn to do the padding to make sure samples have the same sequence length
def collate(batch_samples): # operate on each batch
    """Tokenize a batch of prompt samples and locate each sample's [MASK] token.

    Reads the module-level ``tokenizer``, ``neg_id`` and ``pos_id``.

    Args:
        batch_samples: list of dataset items; each must provide 'prompt' (str),
            'mask_id' (character offset of '[MASK]' inside the prompt) and 'label'.

    Returns:
        dict with
            'batch_inputs': tokenizer output for the whole batch as "pt" tensors,
                padded to the longest prompt in the batch,
            'batch_mask_id': list holding the token index of [MASK] for every sample,
            'label_id': ``[neg_id, pos_id]`` - vocabulary ids of the label words,
                ordered so that list position equals the class label (0, 1),
            'labels': list of the samples' labels.
    """
    batch_sentence = [sample['prompt'] for sample in batch_samples]
    batch_label = [sample['label'] for sample in batch_samples]

    # A single tokenizer pass is enough: padding=True already pads dynamically to
    # the longest sequence of the batch, so there is no need to tokenize every
    # prompt separately first just to measure its length.
    batch_inputs = tokenizer(batch_sentence, padding=True, truncation=True, return_tensors="pt")

    # Convert the character offset of [MASK] in each prompt into its token index
    # in the corresponding row of the batch encoding.
    # (with a prefix prompt the index is the same for every sample)
    batch_mask_id = [
        batch_inputs.char_to_token(row, sample['mask_id'])
        for row, sample in enumerate(batch_samples)
    ]

    label_id = [neg_id, pos_id]
    return {
        'batch_inputs': batch_inputs,
        'batch_mask_id': batch_mask_id,
        'label_id': label_id,
        'labels': batch_label
    }


In [32]:
# a testing for dataloader
# batch_data = next(iter(train_loader))
# print('batch_X shape:', {k: v.shape for k, v in batch_data['batch_inputs'].items()})
# print(batch_data['batch_inputs'])
# print(batch_data['batch_mask_id'])
# print(batch_data['label_id'])
# print(batch_data['labels'])

batch_X shape: {'input_ids': torch.Size([32, 194]), 'token_type_ids': torch.Size([32, 194]), 'attention_mask': torch.Size([32, 194])}
{'input_ids': tensor([[ 101, 2600,  860,  ...,    0,    0,    0],
        [ 101, 2600,  860,  ...,    0,    0,    0],
        [ 101, 2600,  860,  ...,    0,    0,    0],
        ...,
        [ 101, 2600,  860,  ...,    0,    0,    0],
        [ 101, 2600,  860,  ...,    0,    0,    0],
        [ 101, 2600,  860,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,

## Build the model

Look into the structure of the pre-trained model

In [33]:
from transformers import AutoModelForMaskedLM, BertModel # the difference is that the former class contains a head layer for specific MLM task;
# BertModel is the base class, and we need to define the head layer e.g. the softmax

checkpoint = 'bert-base-chinese'
# Bare encoder without a task head; Model.forward reads this module-level global.
pre_trained_model = BertModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
vocab_size = len(tokenizer)  # passed to Model later as the output width of its linear head
print(pre_trained_model)
print(vocab_size)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

Freeze all the pre-trained parameters + add a trainable classification layer

In [14]:
# Switch off gradient tracking for every weight of the pre-trained encoder so
# that it stays fixed during training.
for encoder_weight in pre_trained_model.parameters():
    encoder_weight.requires_grad = False

In [27]:
class Model(torch.nn.Module):
    """Trainable linear head on top of the frozen, module-level ``pre_trained_model``.

    The encoder is deliberately NOT registered as a sub-module: it is read from
    the notebook's global namespace and always run under ``torch.no_grad()``, so
    ``model.parameters()`` only yields the weights of ``fc``.

    Args:
        vocab_size: output width of the linear head (one logit per vocabulary entry).
        hidden_size: width of the encoder's hidden states; defaults to 768.
    """

    def __init__(self, vocab_size, hidden_size=768):
        super().__init__()
        self.fc = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, batch_data):
        """Score the label words at each sample's [MASK] position.

        Args:
            batch_data: dict with
                'batch_inputs': keyword arguments forwarded to the encoder,
                'batch_mask_id': per-sample token index of [MASK],
                'label_id': vocabulary ids of the label words to keep.

        Returns:
            Tensor of shape (batch, len(label_id)) with the head's logits for
            the label words, in the order given by 'label_id'.
        """
        input_data = batch_data['batch_inputs']
        with torch.no_grad():
            out = pre_trained_model(**input_data)

        hidden = out.last_hidden_state  # indexed below as (batch, seq_len, hidden_size)
        # Pick, for every row, the hidden state at that row's own mask position.
        # Using only the first sample's index would silently read the wrong token
        # as soon as the mask position differs between samples.
        mask_positions = torch.as_tensor(
            batch_data['batch_mask_id'], dtype=torch.long, device=hidden.device
        )
        rows = torch.arange(hidden.size(0), device=hidden.device)
        mask_hidden = hidden[rows, mask_positions]  # (batch, hidden_size)

        logits = self.fc(mask_hidden)  # (batch, vocab_size)
        # we only want to explore the prediction on the label_id
        return logits[:, batch_data['label_id']]

In [34]:
from transformers import AdamW
from tqdm import tqdm
# Hyper-parameters
model = Model(vocab_size)
# model.parameters() only contains the linear head registered on Model, so that is all the optimizer updates
optimizer = AdamW(model.parameters(), lr=5e-4)
loss_fun = torch.nn.CrossEntropyLoss()
num_epoch = 1
batch_size = 32
# wrap each split in a DataLoader; collate tokenizes and pads every batch
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
# shuffle=True here as well: the training loop draws one batch with next(iter(valid_loader)),
# so shuffling makes each validation check see a different batch
valid_loader = DataLoader(valid_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True, collate_fn=collate)
# def to_device(batch_data):
#     new_batch_data = {}
#     for k, v in batch_data.items():
#         if k == 'batch_inputs':
#             new_batch_data[k] = {
#                 k_: v_.to(device) for k_, v_ in v.items()
#             }
#         elif k == 'label_word_id':
#             new_batch_data[k] = v
#         else:
#             new_batch_data[k] = torch.tensor(v).to(device)
#     return new_batch_data
    
# if torch.cuda.is_available():
#     device = torch.device("cuda") 
#     model = model.to(device)
# else: 'cpu'




In [36]:


# Train the linear head (the encoder runs under no_grad inside Model.forward).
# Training loss/accuracy of the current batch is printed every 10 batches and a
# single validation batch is evaluated every 30 batches.
for epoch in range(num_epoch):
    total_loss = 0  # running sum of the per-batch training losses of this epoch
    i = 1  # 1-based batch counter that drives the logging / validation schedule
    for batch_data in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epoch}"):
        model.train() # back to training mode (the validation branch below switches to eval)
        output = model(batch_data)
        # labels arrive as a plain list from collate, hence the tensor conversion
        loss = loss_fun(output, torch.tensor(batch_data['labels']))

        loss.backward()
        total_loss+=loss.item()

        optimizer.step()
        optimizer.zero_grad()# clear the gradients once the weights have been updated


        if i % 10 == 0: # record every 10 training batches
            # predicted class = index of the highest-scoring label-word column
            output = output.argmax(dim=1)
            acc = (output == torch.tensor(batch_data['labels'])).sum().item()/ len(batch_data['labels'])
            print(f'Training stage:Batch {i}, Loss: {loss.item()}, Accuracy: {acc}')

        if i % 30 == 0: # validate every 30 batches
            model.eval()
            # a fresh iterator each time: evaluates on ONE batch of the validation set, not all of it
            batch_val = next(iter(valid_loader))
            with torch.no_grad():
                output = model(batch_val)
            # NOTE: `loss`, `output` and `acc` are reused here; the training loss was
            # already added to total_loss above, so the epoch average is unaffected
            loss = loss_fun(output, torch.tensor(batch_val['labels']))
            output = output.argmax(dim=1)
            acc = (output == torch.tensor(batch_val['labels'])).sum().item()/ len(batch_val['labels'])
            print(f'Validation stage:Batch {i}, Loss: {loss.item()}, Accuracy: {acc}')

        i+=1

    # mean training loss over all batches of the epoch
    average_loss = total_loss/len(train_loader)
    print(f"Training Epoch {epoch + 1}/{num_epoch}, Average Loss: {average_loss:.4f}")

    #



Epoch 1/1:   3%|▎         | 10/300 [01:40<50:22, 10.42s/it]

Training stage:Batch 10, Loss: 0.6216623187065125, Accuracy: 0.6875


Epoch 1/1:   7%|▋         | 20/300 [02:58<35:41,  7.65s/it]

Training stage:Batch 20, Loss: 0.5000836253166199, Accuracy: 0.84375


Epoch 1/1:  10%|▉         | 29/300 [04:29<45:24, 10.05s/it]

Training stage:Batch 30, Loss: 0.4135119318962097, Accuracy: 0.96875


Epoch 1/1:  10%|█         | 30/300 [04:49<59:31, 13.23s/it]

Validation stage:Batch 30, Loss: 0.5504613518714905, Accuracy: 0.78125


Epoch 1/1:  13%|█▎        | 40/300 [06:39<49:46, 11.49s/it]

Training stage:Batch 40, Loss: 0.39707937836647034, Accuracy: 0.84375


Epoch 1/1:  17%|█▋        | 50/300 [08:10<37:13,  8.93s/it]

Training stage:Batch 50, Loss: 0.2865910232067108, Accuracy: 0.96875


Epoch 1/1:  20%|█▉        | 59/300 [09:18<30:42,  7.64s/it]

Training stage:Batch 60, Loss: 0.40223997831344604, Accuracy: 0.875


Epoch 1/1:  20%|██        | 60/300 [09:38<44:54, 11.23s/it]

Validation stage:Batch 60, Loss: 0.49976465106010437, Accuracy: 0.84375


Epoch 1/1:  23%|██▎       | 70/300 [11:03<42:21, 11.05s/it]

Training stage:Batch 70, Loss: 0.3674204647541046, Accuracy: 0.875


Epoch 1/1:  27%|██▋       | 80/300 [12:21<20:32,  5.60s/it]

Training stage:Batch 80, Loss: 0.4557051658630371, Accuracy: 0.8125


Epoch 1/1:  30%|██▉       | 89/300 [13:50<37:42, 10.72s/it]

Training stage:Batch 90, Loss: 0.4064907729625702, Accuracy: 0.8125


Epoch 1/1:  30%|███       | 90/300 [14:17<54:48, 15.66s/it]

Validation stage:Batch 90, Loss: 0.5552137494087219, Accuracy: 0.65625


Epoch 1/1:  33%|███▎      | 100/300 [15:39<30:20,  9.10s/it]

Training stage:Batch 100, Loss: 0.43643707036972046, Accuracy: 0.78125


Epoch 1/1:  37%|███▋      | 110/300 [16:57<28:45,  9.08s/it]

Training stage:Batch 110, Loss: 0.38890618085861206, Accuracy: 0.875


Epoch 1/1:  40%|███▉      | 119/300 [18:15<29:01,  9.62s/it]

Training stage:Batch 120, Loss: 0.3738325834274292, Accuracy: 0.90625


Epoch 1/1:  40%|████      | 120/300 [18:33<36:51, 12.29s/it]

Validation stage:Batch 120, Loss: 0.3671298921108246, Accuracy: 0.84375


Epoch 1/1:  43%|████▎     | 130/300 [21:17<52:05, 18.38s/it]  

Training stage:Batch 130, Loss: 0.35306087136268616, Accuracy: 0.875


Epoch 1/1:  47%|████▋     | 140/300 [22:41<25:16,  9.48s/it]

Training stage:Batch 140, Loss: 0.3177770674228668, Accuracy: 0.875


Epoch 1/1:  50%|████▉     | 149/300 [24:23<29:16, 11.64s/it]

Training stage:Batch 150, Loss: 0.5328782796859741, Accuracy: 0.71875


Epoch 1/1:  50%|█████     | 150/300 [24:42<35:05, 14.04s/it]

Validation stage:Batch 150, Loss: 0.43746045231819153, Accuracy: 0.75


Epoch 1/1:  53%|█████▎    | 160/300 [26:38<31:07, 13.34s/it]

Training stage:Batch 160, Loss: 0.21509599685668945, Accuracy: 0.9375


Epoch 1/1:  57%|█████▋    | 170/300 [28:03<23:35, 10.88s/it]

Training stage:Batch 170, Loss: 0.307746559381485, Accuracy: 0.90625


Epoch 1/1:  60%|█████▉    | 179/300 [29:27<19:02,  9.44s/it]

Training stage:Batch 180, Loss: 0.28111857175827026, Accuracy: 0.875


Epoch 1/1:  60%|██████    | 180/300 [29:55<30:16, 15.13s/it]

Validation stage:Batch 180, Loss: 0.3439118266105652, Accuracy: 0.78125


Epoch 1/1:  63%|██████▎   | 190/300 [31:55<20:54, 11.41s/it]

Training stage:Batch 190, Loss: 0.16184115409851074, Accuracy: 0.96875


Epoch 1/1:  67%|██████▋   | 200/300 [35:02<26:26, 15.87s/it]

Training stage:Batch 200, Loss: 0.3493075966835022, Accuracy: 0.875


Epoch 1/1:  70%|██████▉   | 209/300 [37:42<30:15, 19.95s/it]

Training stage:Batch 210, Loss: 0.45072853565216064, Accuracy: 0.75


Epoch 1/1:  70%|███████   | 210/300 [38:46<49:51, 33.24s/it]

Validation stage:Batch 210, Loss: 0.1629774570465088, Accuracy: 0.96875


Epoch 1/1:  73%|███████▎  | 220/300 [41:57<17:28, 13.10s/it]

Training stage:Batch 220, Loss: 0.22232332825660706, Accuracy: 0.90625


Epoch 1/1:  77%|███████▋  | 230/300 [44:28<13:58, 11.98s/it]

Training stage:Batch 230, Loss: 0.34698835015296936, Accuracy: 0.84375


Epoch 1/1:  80%|███████▉  | 239/300 [47:03<13:41, 13.47s/it]

Training stage:Batch 240, Loss: 0.3418276906013489, Accuracy: 0.8125


Epoch 1/1:  80%|████████  | 240/300 [47:50<23:38, 23.65s/it]

Validation stage:Batch 240, Loss: 0.2533036768436432, Accuracy: 0.90625


Epoch 1/1:  83%|████████▎ | 250/300 [50:47<12:22, 14.85s/it]

Training stage:Batch 250, Loss: 0.3286125957965851, Accuracy: 0.84375


Epoch 1/1:  87%|████████▋ | 260/300 [53:12<12:17, 18.44s/it]

Training stage:Batch 260, Loss: 0.33326736092567444, Accuracy: 0.90625


Epoch 1/1:  90%|████████▉ | 269/300 [56:18<09:17, 17.99s/it]

Training stage:Batch 270, Loss: 0.34367135167121887, Accuracy: 0.84375


Epoch 1/1:  90%|█████████ | 270/300 [57:04<13:03, 26.13s/it]

Validation stage:Batch 270, Loss: 0.41093870997428894, Accuracy: 0.84375


Epoch 1/1:  93%|█████████▎| 280/300 [59:18<03:45, 11.27s/it]

Training stage:Batch 280, Loss: 0.24305881559848785, Accuracy: 0.9375


Epoch 1/1:  97%|█████████▋| 290/300 [1:03:25<04:16, 25.63s/it]

Training stage:Batch 290, Loss: 0.4430398941040039, Accuracy: 0.8125


Epoch 1/1: 100%|█████████▉| 299/300 [1:06:51<00:19, 19.34s/it]

Training stage:Batch 300, Loss: 0.2576771080493927, Accuracy: 0.9375


Epoch 1/1: 100%|██████████| 300/300 [1:07:05<00:00, 13.42s/it]

Validation stage:Batch 300, Loss: 0.28407829999923706, Accuracy: 0.90625
Training Epoch 1/1, Average Loss: 0.3794



