In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import (AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig)
import evaluate
import torch.nn as nn
import torch
import numpy as np
from utils import PROD_TOK, AUX_TOK, RewardModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
#We initialize our custom reward model
rm = RewardModel("VCNC/bert_piezas3", hidden_size=768, classes=20).to(device)
rm.load_state_dict(torch.load('rm_dict.pt'))

<All keys matched successfully>

In [4]:
#We initialize 2 BERT pieces models, one to train and the other for reference
model_new = AutoModelForSequenceClassification.from_pretrained("VCNC/bert_piezas3").to(device)
model_ref = AutoModelForSequenceClassification.from_pretrained("VCNC/bert_piezas3").to(device)

In [9]:
#We check whether the models were loaded correctly and are ready to inference
#For the reward model
input = torch.tensor([[2, 2, 1, 3, 4, 1], [3, 2, 1, 3, 4, 4]]).to(device)
token_type = torch.tensor([[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]]).to(device)
print(f"Reward model's output: {rm(input, token_type_ids=token_type, train=False)}")

#For the pieces models
input = torch.tensor([[2, 2, 1, 3, 4], [3, 2, 1, 3, 4]]).to(device)
token_type = torch.tensor([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]).to(device)
attention_mask = torch.tensor([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]).to(device)
with torch.no_grad():
    print(f"""
    New model's output: {model_new(input_ids=input, token_type_ids=token_type, attention_mask=attention_mask)}
    Reference model's output: {model_ref(input_ids=input, token_type_ids=token_type, attention_mask=attention_mask)}""")

Reward model's output: SequenceClassifierOutput(loss=None, logits=tensor([[-10.7145],
        [-10.8042]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

    New model's output: SequenceClassifierOutput(loss=None, logits=tensor([[ -2.5363,  -3.1314,  -1.2427,  -1.6860,  -3.0604,  -2.0583, -21.7888,
          -4.1208,  -2.3355,  -7.4675,  -2.8129,  -5.3138,  -2.3228,  -3.1468,
         -21.6905,  -5.4794, -21.6313,  -4.1415,  -5.0822,  -5.6226],
        [ -2.5363,  -3.1314,  -1.2427,  -1.6860,  -3.0604,  -2.0583, -21.7888,
          -4.1208,  -2.3355,  -7.4675,  -2.8129,  -5.3138,  -2.3228,  -3.1468,
         -21.6905,  -5.4794, -21.6313,  -4.1415,  -5.0822,  -5.6226]],
       device='cuda:0'), hidden_states=None, attentions=None)
    Reference model's output: SequenceClassifierOutput(loss=None, logits=tensor([[ -2.5363,  -3.1314,  -1.2427,  -1.6860,  -3.0604,  -2.0583, -21.7888,
          -4.1208,  -2.3355,  -7.4675,  -2.8129,  -5.3138,  -2.3228,  -3.

In [5]:
#We define the PPO loss as in this hf post (https://huggingface.co/blog/deep-rl-ppo)
def PPO_loss(eps, model_new, model_ref, reward_model, inp):
    #We first compute the output given by the new and reference model
    new_out = model_new(inp)
    ref_out = model_ref(inp)
    print(f"New model's output: {mod_out}")
    print(f"Reference model's output: {ref_out}")

    #We compute the ratio of the two probability distributions
    ratio = mod_out/ref_out

    #We get the reward assigned by our reward model
    true_out = torch.max(new_out, 1).indices
    reward_inp = torch.cat()
    reward = reward_model(mod_out)
    print(f"The reward for this action and state is: {reward[0]}")

    #The entropy is computed to balance explotation and exploration 
    entropy = -torch.dot(mod_out, torch.log(mod_out))
    print(f"The current entropy is: {entropy}")

    expression = torch.tensor([torch.dot(ratio, reward), torch.dot(torch.clamp(ratio, min=1-eps, max=1+eps), reward)])

    return torch.min(expression)+entropy

In [10]:
#We can see that the results do not vary, indicating that our functions are correct
input = torch.tensor([[2, 2, 1, 3, 4, 1], [3, 2, 1, 3, 4, 4]]).to(device)
token_type = torch.tensor([[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]]).to(device)
rm(input, token_type_ids=token_type, train=False)

SequenceClassifierOutput(loss=None, logits=tensor([[0.1322],
        [0.0090]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [11]:
#In this case we will use the product dataset
data = df_hist_piez.to_numpy()
keys = np.unique(np.array(list(AUX_TOK.values()))) #Get unique tokens only
dataset = np.zeros((len(data)*(len(keys)-1), 3), dtype=object) #The number of pairs is the number of examples at the beginning times 1 minus keys

#Iterate through the dataset to generate sets (correct, wrong, input)
for i in range(len(data)):
    internal_cont = 0
    for j in range(len(keys)):
        if j != data[i, 1][0]:
            dataset[i*(len(keys)-1)+internal_cont, 0] = data[i, 1][0]
            dataset[i*(len(keys)-1)+internal_cont, 1] = j
            dataset[i*(len(keys)-1)+internal_cont, 2] = data[i, 0]
            internal_cont += 1
dataset

array([[5, 0,
        list([5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10, 10, 10, 13, 8, 2, 2, 13, 0, 10, 5, 5, 8, 2, 2, 10, 7, 7, 0, 1, 0, 18, 10, 12, 10, 10, 10, 13, 13, 13, 13, 1, 5, 5, 17, 7, 8, 8, 5])],
       [5, 1,
        list([5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10, 10, 10, 13, 8, 2, 2, 13, 0, 10, 5, 5, 8, 2, 2, 10, 7, 7, 0, 1, 0, 18, 10, 12, 10, 10, 10, 13, 13, 13, 13, 1, 5, 5, 17, 7, 8, 8, 5])],
       [5, 2,
        list([5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10, 10, 10, 13, 8, 2, 2, 13, 0, 10, 5, 5, 8, 2, 2, 10, 7, 7, 0, 1, 0, 18, 10, 12, 10, 10, 10, 13, 13, 13, 13, 1, 5, 5, 17, 7, 8, 8, 5])],
       ...,
       [10, 16,
        list([10, 10, 10, 13, 13, 8, 2, 2, 0, 0, 10, 10, 5, 10, 5, 8, 8, 2, 2, 10, 2, 2, 17, 5, 5, 5, 5, 8, 8, 8, 8, 8, 0, 5, 0, 0, 0, 19, 17, 17, 0, 15, 0, 10, 10, 10, 10, 7, 7, 19, 12, 12, 12, 12, 2, 13, 10, 8, 5, 5, 13, 13, 13, 13, 5, 4, 4, 8, 8, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 12, 5, 10, 10, 10, 4, 4, 5, 11, 5, 8, 8, 0, 0, 2, 1, 13, 13, 0, 0

## Data formatting and train/test splitting

In [12]:
MAX_LEN = max([len(i) for i in dataset[:, 2]])
PAD = max(list(AUX_TOK.values()))+1

input_ids = []
attention_mask = []
token_type_ids = []
train = []
for i in range(len(dataset)):
    pad_len = MAX_LEN-len(dataset[:, 2][i])
    right_seq = dataset[:, 2][i]+[dataset[i, 0]]+[PAD]*pad_len
    wrong_seq = dataset[:, 2][i]+[dataset[i, 1]]+[PAD]*pad_len
    input_ids.append([right_seq, wrong_seq]) #The input_ids is a concatenation of the right and wrong example
    attention_mask.append([1]*(len(dataset[:, 2][i])+1)+[0]*pad_len)
    token_type_ids.append([0]*len(dataset[:, 2][i])+[1]*(pad_len+1)) #The +1 is because the output selected by the human changes the token type
    train.append(True) #In RLHF every example is given as a training example REVISAR!!!
input_ids[0], attention_mask[0], token_type_ids[0], train[0] #Get first element

([[5,
   8,
   8,
   8,
   8,
   8,
   8,
   5,
   8,
   2,
   10,
   8,
   8,
   8,
   10,
   10,
   10,
   13,
   8,
   2,
   2,
   13,
   0,
   10,
   5,
   5,
   8,
   2,
   2,
   10,
   7,
   7,
   0,
   1,
   0,
   18,
   10,
   12,
   10,
   10,
   10,
   13,
   13,
   13,
   13,
   1,
   5,
   5,
   17,
   7,
   8,
   8,
   5,
   5,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20],
  [5,
   8,
   8,
   8,
   8,
   8,
   8,
   5,
   8,
   2,
   10,
   8,
   8,
   8,
   10,
   10,
   10,
   13,
   8,
   2,
   2,
   13,
   0,
   10,
   5,
   5,
   8,
   2,
   2,
   10,
   7,
   7,
   0,
   1,
   0,
   18,
   10,
   12,
   10,
   10,
   10,
   13,
   13,
   1

In [13]:
#We create a df to store the data properly
data = pd.DataFrame(data={'input_ids':input_ids, 'attention_mask':attention_mask, 'token_type_ids':token_type_ids, 'train':train})
data.head()

Unnamed: 0,input_ids,attention_mask,token_type_ids,train
0,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True
1,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True
2,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True
3,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True
4,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True


## DatasetDict creation

In [14]:
#We turn our datasets into DatasetDicts
train = Dataset.from_pandas(data.loc[data.train == True].reset_index(drop=True))
test = Dataset.from_pandas(data.loc[data.train == False].reset_index(drop=True))

ds = DatasetDict()

ds['train'] = train
ds['validation'] = test

ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'token_type_ids', 'train'],
        num_rows: 1764
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'token_type_ids', 'train'],
        num_rows: 0
    })
})

In [15]:
#We redefine a custom loss that maximizes the difference between the rewards of positive and negative examples
def custom_loss(out_right:torch.tensor, out_wrong:torch.tensor, return_outputs:bool=False):
    #The idea behind the equations is better explained in the following article: https://medium.com/towards-generative-ai/reward-model-training-2209d1befb5f
    diff = out_right-out_wrong
    loss = torch.sum(-torch.log(torch.sigmoid(out_right-out_wrong)))
    return (loss, diff) if return_outputs else loss

In [16]:
#Finally, we train the model
retrain = True #This variable only indicates if the training will be performed on a new or a pre-trained model
model_path = "/content/drive/MyDrive/Colab Notebooks/Auto-CNC/rm_dict.pt"
if retrain:
  rm.load_state_dict(torch.load(model_path))
  rm.train(mode=True)
else:
  rm.train(mode=True)

In [17]:
#We use Adam optimizer
optimizer = torch.optim.Adam(params=rm.parameters(), lr=3e-07)

In [18]:
EPOCHS = 2
batch_size = 60
batches_per_epoch = len(train) // batch_size
last_batch = len(train)%batch_size


for epoch in range(EPOCHS):
    for i in range(batches_per_epoch):
        start = i * batch_size

        # Take a batch
        input_batch = train[start:start+batch_size]
        inputs = np.array(input_batch['input_ids'])

        # Format input_ids to match model requirements
        input_ids = torch.tensor(np.array([[inputs[:, 0], inputs[:, 1]]])).to(device)
        attention_mask = torch.tensor(input_batch['attention_mask']).to(device)
        token_type_ids = torch.tensor(input_batch['token_type_ids']).to(device)

        # Forward pass through the model
        out = rm(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, train = True)
        loss = custom_loss(out[0].logits, out[1].logits)

        if i%(batches_per_epoch//2) == 0:
            #print(f"Correct reward: {out[0].logits} Wrong reward: {out[1].logits}")
            print(f"Loss epoch {epoch} batch {i}: {loss}")

        # Backpropagate
        loss.backward()

        # Update weights
        optimizer.step()

    #Take the last training examples that fall out of the batches
    # Take a batch
    input_batch = train[-last_batch:]
    inputs = np.array(input_batch['input_ids'])

    # Format input_ids to match model requirements
    input_ids = torch.tensor(np.array([[inputs[:, 0], inputs[:, 1]]])).to(device)
    attention_mask = torch.tensor(input_batch['attention_mask']).to(device)
    token_type_ids = torch.tensor(input_batch['token_type_ids']).to(device)

    # Forward pass through the model
    out = rm(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, train = True)
    loss = custom_loss(out[0].logits, out[1].logits)

    # Backpropagate
    loss.backward()

    # Update weights
    optimizer.step()

Loss epoch 0 batch 0: 2.5394420623779297
Loss epoch 0 batch 1: 0.5584986209869385
Loss epoch 0 batch 2: 2.657935619354248
Loss epoch 0 batch 3: 0.0013255055528134108
Loss epoch 0 batch 4: 0.013485319912433624
Loss epoch 0 batch 5: 0.020154066383838654
Loss epoch 0 batch 6: 6.212632179260254
Loss epoch 0 batch 7: 0.703912079334259
Loss epoch 0 batch 8: 2.662889003753662
Loss epoch 0 batch 9: 0.04050937294960022
Loss epoch 0 batch 10: 0.07783440500497818
Loss epoch 0 batch 11: 0.013892030343413353
Loss epoch 0 batch 12: 0.009468009695410728
Loss epoch 0 batch 13: 2.516263723373413
Loss epoch 0 batch 14: 4.688385963439941
Loss epoch 0 batch 15: 0.7963013648986816
Loss epoch 0 batch 16: 0.5163571834564209
Loss epoch 0 batch 17: 2.39184308052063
Loss epoch 0 batch 18: 3.694889783859253
Loss epoch 0 batch 19: 3.792034387588501
Loss epoch 0 batch 20: 6.204624176025391
Loss epoch 0 batch 21: 2.208420515060425
Loss epoch 0 batch 22: 1.6280598640441895
Loss epoch 0 batch 23: 2.4737799167633057
L

KeyboardInterrupt: ignored

In [None]:
loss

tensor(42.0421, device='cuda:0', grad_fn=<SumBackward0>)

In [19]:
rm.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21, 768, padding_idx=20)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [20]:
#Evaluate if training was done correctly taking some training examples
inp = train[0:1000:100]
inputs = np.array(inp['input_ids'])
input_ids = torch.tensor(np.array([[inputs[:, 0], inputs[:, 1]]])).to(device)
attention_mask = torch.tensor(inp['attention_mask']).to(device)
token_type_ids = torch.tensor(inp['token_type_ids']).to(device)
rm(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, train = True)

[SequenceClassifierOutput(loss=None, logits=tensor([[ 0.6461],
         [ 2.2590],
         [ 0.0599],
         [-0.2282],
         [-3.7392],
         [ 3.4150],
         [ 2.2236],
         [ 6.2790],
         [ 1.5751],
         [ 8.0019]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 SequenceClassifierOutput(loss=None, logits=tensor([[-11.0226],
         [-10.8747],
         [-11.0418],
         [-11.0268],
         [-11.0430],
         [ -6.8721],
         [-11.0320],
         [ -8.7046],
         [  1.6969],
         [  6.4221]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]

In [21]:
#Save model
torch.save(rm.state_dict(), 'drive/MyDrive/Colab Notebooks/Auto-CNC/rm_dict.pt')

In [None]:
#Load saved model
load_model = RewardModel("VCNC/bert_piezas3", hidden_size=768, classes=20).to(device)
load_model.load_state_dict(torch.load('rm/rm_dict.pt'))

<All keys matched successfully>

In [None]:
load_model.eval()

In [None]:
#Evaluate if model was loaded succesfully
inp = train[0:3]
inputs = np.array(inp['input_ids'])
input_ids = torch.tensor(np.array([[inputs[:, 0], inputs[:, 1]]])).to(device)
attention_mask = torch.tensor(inp['attention_mask']).to(device)
token_type_ids = torch.tensor(inp['token_type_ids']).to(device)
load_model(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, train = True)

[SequenceClassifierOutput(loss=None, logits=tensor([[0.7354],
         [0.7354],
         [0.7354]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 SequenceClassifierOutput(loss=None, logits=tensor([[0.7354],
         [0.7354],
         [0.7354]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]