# An example of adapting CKGA for BERT

In [1]:
import torch 
import torch.nn as nn

import argparse

from adapter_models import CONTROLER, ADAPTER, MHA

from transformers import BertModel, BertTokenizer, BertConfig

Suppose the dataset has three sentences where the aspects are surrounded by [SEP]

The entity_idx represents the entity number of the aspect in sub-DBpedia.

The labels is the sentiment labels of the dataset, which are tri-categorized.

In [2]:

datasets = ["Easy to [SEP] carry [SEP] , can be taken anywhere, can be hooked up to printers,headsets.",
            "Very good [SEP] quality [SEP] and well made.",
            "They are by far the easiest [SEP] systems [SEP] to actually learn about computers with."]
entity_idx = torch.randint(0, 1000, (len(datasets),))
labels = torch.randint(0, 3, (len(datasets),))
print('entity_idx : ',entity_idx, '\n', 'labels : ', labels)

entity_idx :  tensor([118, 404, 657]) 
 labels :  tensor([0, 1, 1])


#### CKGA requires additional hyperparameter.

In [24]:
def get_args(jupyter=False):
    def str2bool(v):
        if isinstance(v, bool):
            return v
        if v.lower() in ('yes', 'true', 't', 'y', '1'):
            return True
        elif v.lower() in ('no', 'false', 'f', 'n', '0'):
            return False
        else:
            raise argparse.ArgumentTypeError('Boolean value expected.')
    ###################################################################
    # parameters of original models
    parser = argparse.ArgumentParser()
    parser.add_argument('--bert_model', type=str, default='bert-base-uncased')
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    ###################################################################
    # parameters of adapters
    parser.add_argument('--adapter_gcn_hid_dim', type=int, default=300)
    parser.add_argument('--adapter_score', type=int, default=0, choices=[i for i in range(-10,12,2)])
    parser.add_argument('--adapter_gcn_out_dim', type=int, default=768)
    parser.add_argument('--adapter_dropout', type=float, default=0.5)
    parser.add_argument('--adapter_layer_num', type=int, default=2)
    parser.add_argument('--adapter_freeze_emb', type=str2bool, default=True)
    parser.add_argument('--adapter_mode', type=str, default='adapter',choices=['adapter', 'origin'])
    parser.add_argument('--adapter_kge', type=str, default='transh',choices=['transe', 'transh','transr','rotate'])
    parser.add_argument('--adapter_norm', type=str2bool, default='False')
    
    parser.add_argument('--fuse_mode', type=str, default='p', choices=['p','c'], help='plus or concatenate')
    parser.add_argument('--train_model', type=str, default='d', choices=['j','d'], help='joint or dependent')
    parser.add_argument('--origin_model_path', type=str, default='./best_origin_state_dict/laptop_acc_0.7665_f1_0.7202.pkl')
    parser.add_argument('--origin_model_lr', type=float, default=1e-3)
    
    ###################################################################
    if jupyter is True:
        return parser.parse_args(args=[])
    return parser.parse_args()


#### Define a BERT model as original model

In [25]:
class ORIGINAL_MODEL(nn.Module):
    def __init__(self, args) -> None:
        super().__init__()
        self.bert = BertModel.from_pretrained(args.bert_model)
        self.dense = nn.Linear(768, 3)
    
    def forward(self, inputs):
        x = self.bert(inputs['input_ids'])[1]
        logits = self.dense(x)
        # in this example, the aspect_emb is 
        return {'emb': x, 'classification':logits, 'aspect_emb':x}

In [26]:
args = get_args(jupyter=True)
args.device = 'cpu'
tokenizer = BertTokenizer.from_pretrained(args.bert_model)
original_model = ORIGINAL_MODEL(args)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### In the normal calculation, the output of Bert looks like this

In [7]:
inputs = tokenizer.encode_plus(datasets[0], return_tensors='pt')
'''
outputs:
emb:    torch.Size([1, 768])
classification: torch.Size([1, 3])
aspect_emb: torch.Size([1, 768])
'''
outputs = original_model(inputs)
for key in outputs:
    print(key, ' : ', outputs[key].size())

emb  :  torch.Size([1, 768])
classification  :  torch.Size([1, 3])
aspect_emb  :  torch.Size([1, 768])


### Create the CKGA with path of the dataset.
### User CONTROLER to package the original model (bert) and CKGA.

In [27]:
adapter = ADAPTER(f'./graph/laptop', args)
mha = MHA(emb1_dim=768, emb2_dim=args.adapter_gcn_out_dim, hdim=args.adapter_gcn_out_dim, n_head=2)
if args.fuse_mode == 'p':
    controler = CONTROLER(original_model, adapter, mha, 768, 3)
elif args.fuse_mode =='c':
    controler = CONTROLER(original_model, adapter, mha, 768+args.adapter_gcn_out_dim, 3)
model = controler

#### After adding CKGA, the operation is as follows:

In [11]:
adapter(None, gcn=True)
original_model_inputs = inputs
adapter_inputs = torch.tensor(entity_idx[0])
outputs = model(original_model_inputs, 
                adapter_inputs, 
                model=args.adapter_mode, 
                mode=args.fuse_mode)
print(outputs)

tensor([[-0.1960, -0.2256,  0.2316]], grad_fn=<AddmmBackward0>)


  adapter_inputs = torch.tensor(entity_idx[0])


### Since a sample uses only a little entities from sub-DBpedia, we do not want to update the parameters of CKGA frequently. For this reason, we set two optimizers so that the parameters of CKGA and the original model can be updated separately.

In [29]:
params1 = [
    {"params": [p for p in model.origin_model.parameters() if p.requires_grad], "lr":args.origin_model_lr},
    {"params": [p for p in model.classification.parameters() if p.requires_grad], "lr":args.learning_rate}, 
    {"params": [p for p in model.mha.parameters() if p.requires_grad], "lr":args.learning_rate}, 
]
params2 = [
    {"params": [p for p in model.adapter.parameters() if p.requires_grad], "lr":args.learning_rate}
]
optimizer1 = torch.optim.Adam(params1, lr=args.learning_rate, weight_decay=0)
optimizer2 = torch.optim.Adam(params2, lr=args.learning_rate, weight_decay=0)


### When training the model, the computational steps for each epoch are as follows:
1. Let CKGA do one graph convolution operation.
2. For each batch, instead of computing the graph convolution again, we directly take the result after the graph convolution operation, which can significantly reduce the frequency of graph convolutions operation.
3. After each batch is computed, update the parameters of the original model
4. After each epoch, update the parameters of CKGA, which can reduce the frequency of parameter update.

In [30]:
# epoch
criterion = nn.CrossEntropyLoss()
model.train()
for _ in range(2):
    optimizer2.zero_grad()
    model.adapter(None, gcn=True)
    for i in range(len(datasets)):
        optimizer1.zero_grad()
        original_model_inputs = tokenizer.encode_plus(datasets[i], return_tensors='pt')
        adapter_inputs = torch.tensor(entity_idx[i])
        targets = torch.tensor([labels[i]])
        outputs = model(original_model_inputs, 
                        adapter_inputs, 
                        model=args.adapter_mode, 
                        mode=args.fuse_mode)
        loss = criterion(outputs, targets)
        loss.backward(retain_graph=True)
        optimizer1.step()
    optimizer2.step()

  adapter_inputs = torch.tensor(entity_idx[i])
