In [None]:
!pip install transformers[torch]
!pip install evaluate

Collecting transformers[torch]
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m106.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m87.4 MB/s

In [None]:
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import (AutoModelForSequenceClassification, Trainer, TrainingArguments)
import evaluate
import torch.nn as nn
import torch
import numpy as np
from utils import PROD_TOK, AUX_TOK

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load and save data

## Split data

In [None]:
# Load the dataset stored under key 'df' in 'human_data.h5' and preview its first rows.
# NOTE(review): the list-valued columns are presumably stored pickled by PyTables
# (the same columns trigger that warning when re-saved below), so only open HDF5
# files that come from a trusted source.
df = pd.read_hdf('human_data.h5', key='df')
df.head()

Unnamed: 0,Input_seq,Output,Type
0,"[5, 7, 4, 4, 16, 5, 5, 6, 5, 2, 17, 0, 13, 6, ...",[5],Piezas
1,"[1, 6, 8, 4, 1, 9, 11, 7, 2, 5, 8, 10, 1, 12, ...",[5],Productos
2,"[2, 1]",[3],Productos
3,"[9, 5, 6, 9, 11, 1, 2, 6, 8, 12, 0, 6, 7, 10, ...",[7],Productos


In [None]:
# Partition the rows by their 'Type' label: 'Productos' (products) vs. 'Piezas' (pieces).
is_product = df['Type'] == 'Productos'
is_piece = df['Type'] == 'Piezas'
df_prod = df.loc[is_product]
df_piez = df.loc[is_piece]

In [None]:
# Persist both split datasets for later use; each one goes to its own HDF5 file and key.
for frame, path, key in (
    (df_prod, 'products_human.h5', 'df_prod'),
    (df_piez, 'pieces_human.h5', 'df_piez'),
):
    frame.to_hdf(path, key=key, index=False)

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['Input_seq', 'Output', 'Type'], dtype='object')]

  df_prod.to_hdf('products_human.h5', key='df_prod', index=False)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['Input_seq', 'Output', 'Type'], dtype='object')]

  df_piez.to_hdf('pieces_human.h5', key='df_piez', index=False)


## Process SQL data to suit format

In [None]:
# Load the pieces history from its CSV export (the section title describes it as SQL data).
# The next cell reads its 'SIMUL' and 'DES' columns, both holding lists serialised as strings.
df_hist_piez = pd.read_csv('rl_piezas.csv')

In [None]:
# Turn the stringified integer lists into real Python lists (formatting).
#
# Duplicated 'SIMUL' rows are removed BEFORE parsing. `DataFrame.insert` aligns a
# Series on index labels, and `ignore_index=True` renumbers the surviving rows,
# so Series parsed from the un-deduplicated frame would be attached to the wrong
# rows (row i would receive the values of the i-th ORIGINAL row).
def _parse_int_list(text):
    """Convert a string such as '[5, 8, 2]' into [5, 8, 2]; a one-element '[5]' works too."""
    return [int(tok) for tok in text[1:-1].split(', ')]

# NOTE(review): de-duplicating on 'SIMUL' alone keeps only the first 'DES' seen for
# each input sequence; confirm that discarding the other answers is intended.
hist_unique = df_hist_piez.drop_duplicates('SIMUL', ignore_index=True)
X_series = hist_unique['SIMUL'].apply(_parse_int_list)
y_series = hist_unique['DES'].apply(lambda x: [int(x[1:-1])])  # single answer wrapped in a list

# Rebind instead of mutating in place; downstream cells keep using `df_hist_piez`.
df_hist_piez = hist_unique.drop(['SIMUL', 'DES'], axis=1)
df_hist_piez.insert(0, 'X', X_series)
df_hist_piez.insert(1, 'y', y_series)
df_hist_piez.head()

Unnamed: 0,X,y
0,"[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10...",[5]
1,"[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10...",[8]
2,"[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10...",[8]
3,"[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10...",[8]
4,"[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10...",[8]


In [None]:
# Save the processed data. Columns 'X' and 'y' hold Python lists, which PyTables
# stores pickled (hence the performance warning in the cell output).
df_hist_piez.to_hdf('pieces_hist.h5', key='df_hist_piez')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['X', 'y'], dtype='object')]

  df_hist_piez.to_hdf('pieces_hist.h5', key='df_hist_piez')


## Load data

In [None]:
# Reload the two split datasets from the HDF5 files written in the "Split data"
# section, so this section can run without redoing the split.
df_prod = pd.read_hdf('products_human.h5', key='df_prod')
df_piez = pd.read_hdf('pieces_human.h5', key='df_piez')


In [None]:
# Load the processed SQL dataset under the name the preprocessing cell actually
# reads (`df_hist_piez`). The original assignment used the misspelled name
# `df_hist_pez`, so a fresh kernel that skipped the processing section failed
# later with a NameError.
df_hist_piez = pd.read_hdf('pieces_hist.h5', key='df_hist_piez')
df_hist_pez = df_hist_piez  # deprecated alias kept only for code that still uses the old spelling

# Load and build model

In [None]:
#We define a custom model as our reward model
class RewardModel (torch.nn.Module):
    """Scalar reward model wrapping a pretrained sequence-classification network.

    The pretrained network is stored in ``self.base`` and its ``classifier``
    head is replaced by a freshly initialised one-output linear layer, so the
    model emits a single unbounded score per sequence.
    """

    def __init__(self, model_name, hidden_size,*model_args, **kwargs):
        """Load the pretrained network and swap in the regression head.

        Args:
            model_name: checkpoint identifier forwarded to
                ``AutoModelForSequenceClassification.from_pretrained``.
            hidden_size: number of input features of the new head; it has to
                match the width of the features the base model feeds to its
                classifier.
            *model_args, **kwargs: forwarded unchanged to ``from_pretrained``.
        """
        super().__init__()
        self.base = AutoModelForSequenceClassification.from_pretrained(model_name, *model_args, **kwargs)

        #Last layer must be changed for it to be a regression problem
        self.base.classifier =  torch.nn.Linear(in_features=hidden_size, out_features=1, bias=True)
        # The former trailing `self = self.base` was removed: it only rebound the
        # local name inside __init__ and never affected the constructed object.

    #Our custom model should take as input a pair of right/wrong answers with a fixed sequence
    def forward(self, input_ids, attention_mask=None,
                token_type_ids=None, train:bool=False):
        """Score one batch, or a right/wrong pair of batches.

        Args:
            input_ids: with ``train=False`` it is passed to the base model as is.
                With ``train=True`` it must be indexable as ``input_ids[0][0]``
                (batch of right examples) and ``input_ids[0][1]`` (batch of
                wrong examples).
            attention_mask: forwarded to the base model; in paired mode the same
                mask is used for both passes.
            token_type_ids: forwarded to the base model; shared by both passes
                in paired mode.
            train: selects the paired input layout. It does NOT toggle the
                module's training mode; use ``.train()`` / ``.eval()`` for that.

        Returns:
            ``[out_right, out_wrong]`` (two base-model outputs) when ``train`` is
            true, otherwise a single base-model output.
        """
        #Whether the model is being trained or tested is important when inferencing
        if train:
            #The input_ids are structured in a way in which the first chunk corresponds to right examples and the second one to wrong ones
            input_right, input_wrong  = input_ids[0][0], input_ids[0][1]
            out_right = self.base(input_ids=input_right, attention_mask=attention_mask, token_type_ids=token_type_ids)
            out_wrong = self.base(input_ids=input_wrong, attention_mask=attention_mask, token_type_ids=token_type_ids)
            return [out_right, out_wrong]
        else:
            return self.base(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

In [None]:
# Instantiate the reward model from the "VCNC/bert_piezas3" checkpoint and move it to the selected device.
# hidden_size=768 sizes the replacement regression head and must equal the checkpoint's hidden width.
rm = RewardModel("VCNC/bert_piezas3", hidden_size=768).to(device)

In [None]:
# Example of running the model on a batch of (right, wrong) pairs.
# The tensor is laid out as (1, 2, batch, seq_len): index [0][0] holds the right
# sequences and [0][1] the wrong ones, which is what `train=True` expects.
# (Named `example_pairs` rather than `input` so the builtin is not shadowed.)
example_pairs = torch.tensor([[[[2, 2, 1, 3, 4, 1], [3, 2, 1, 3, 4, 4]], [[2, 2, 1, 3, 4, 3], [3, 2, 1, 3, 4, 1]]]]).to(device)
token_type = torch.tensor([[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]]).to(device)
rm(example_pairs, token_type_ids=token_type, train=True)

[SequenceClassifierOutput(loss=None, logits=tensor([[0.3812],
         [0.3812]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 SequenceClassifierOutput(loss=None, logits=tensor([[0.3812],
         [0.3812]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]

In [None]:
# Feed the "right" batch of the paired example through the plain (train=False)
# path; its logits should equal the first output of the paired call, which shows
# that both code paths agree.
# NOTE(review): in the recorded output every logit is identical although the
# sequences differ; confirm the model is actually sensitive to its input.
# (Named `example_ids` rather than `input` so the builtin is not shadowed.)
example_ids = torch.tensor([[2, 2, 1, 3, 4, 1], [3, 2, 1, 3, 4, 4]]).to(device)
token_type = torch.tensor([[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]]).to(device)
rm(example_ids, token_type_ids=token_type, train=False)

SequenceClassifierOutput(loss=None, logits=tensor([[0.3812],
        [0.3812]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

# Preprocess data

In [None]:
# Build (right answer, wrong answer, input sequence) triples from the historical
# pieces dataset: column 0 of each row is the input sequence, column 1 a
# one-element list holding the right answer.
data = df_hist_piez.to_numpy()
keys = np.unique(np.array(list(AUX_TOK.values()))) #Get unique tokens only

# Every example is paired with each OTHER token id as the wrong answer.
# The candidates are the token ids themselves (the values in `keys`), not their
# positions in the array: the two only coincide when the ids are exactly
# 0..len(keys)-1, and using positions would otherwise emit ids that are not in
# the vocabulary and skip real ones.
triples = []
for row in data:
    sequence, right = row[0], row[1][0]
    for candidate in keys:
        if candidate != right:
            triples.append((right, int(candidate), sequence))  # int(): keep plain Python ints

# Object array filled cell by cell, so the variable-length sequence lists stay
# intact in column 2 instead of being broadcast by NumPy.
dataset = np.empty((len(triples), 3), dtype=object)
for row_idx, (right, wrong, sequence) in enumerate(triples):
    dataset[row_idx, 0] = right
    dataset[row_idx, 1] = wrong
    dataset[row_idx, 2] = sequence
dataset

array([[5, 0,
        list([5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10, 10, 10, 13, 8, 2, 2, 13, 0, 10, 5, 5, 8, 2, 2, 10, 7, 7, 0, 1, 0, 18, 10, 12, 10, 10, 10, 13, 13, 13, 13, 1, 5, 5, 17, 7, 8, 8, 5])],
       [5, 1,
        list([5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10, 10, 10, 13, 8, 2, 2, 13, 0, 10, 5, 5, 8, 2, 2, 10, 7, 7, 0, 1, 0, 18, 10, 12, 10, 10, 10, 13, 13, 13, 13, 1, 5, 5, 17, 7, 8, 8, 5])],
       [5, 2,
        list([5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 10, 10, 10, 13, 8, 2, 2, 13, 0, 10, 5, 5, 8, 2, 2, 10, 7, 7, 0, 1, 0, 18, 10, 12, 10, 10, 10, 13, 13, 13, 13, 1, 5, 5, 17, 7, 8, 8, 5])],
       ...,
       [10, 16,
        list([10, 10, 10, 13, 13, 8, 2, 2, 0, 0, 10, 10, 5, 10, 5, 8, 8, 2, 2, 10, 2, 2, 17, 5, 5, 5, 5, 8, 8, 8, 8, 8, 0, 5, 0, 0, 0, 19, 17, 17, 0, 15, 0, 10, 10, 10, 10, 7, 7, 19, 12, 12, 12, 12, 2, 13, 10, 8, 5, 5, 13, 13, 13, 13, 5, 4, 4, 8, 8, 2, 2, 2, 2, 2, 2, 4, 4, 4, 2, 12, 5, 10, 10, 10, 4, 4, 5, 11, 5, 8, 8, 0, 0, 2, 1, 13, 13, 0, 0

## Data formatting and train/test splitting

In [None]:
# Every sequence is right-padded to the length of the longest one; the padding
# id is the first integer after the largest token id in AUX_TOK.
MAX_LEN = max(len(seq) for seq in dataset[:, 2])
PAD = max(AUX_TOK.values()) + 1

input_ids, attention_mask, token_type_ids, train = [], [], [], []
for right_tok, wrong_tok, seq in dataset:
    seq_len = len(seq)
    pad_len = MAX_LEN - seq_len
    padding = [PAD] * pad_len
    # One entry holds the right and the wrong completion of the same sequence.
    input_ids.append([seq + [right_tok] + padding, seq + [wrong_tok] + padding])
    # Attend to the sequence plus the appended answer token, not to the padding.
    attention_mask.append([1] * (seq_len + 1) + [0] * pad_len)
    # Segment 0 = input sequence; segment 1 = answer token (and the padding after it).
    token_type_ids.append([0] * seq_len + [1] * (pad_len + 1))
    # TODO(review): every example is flagged as training data, so the validation split ends up empty.
    train.append(True)
input_ids[0], attention_mask[0], token_type_ids[0], train[0] #Get first element

([[5,
   8,
   8,
   8,
   8,
   8,
   8,
   5,
   8,
   2,
   10,
   8,
   8,
   8,
   10,
   10,
   10,
   13,
   8,
   2,
   2,
   13,
   0,
   10,
   5,
   5,
   8,
   2,
   2,
   10,
   7,
   7,
   0,
   1,
   0,
   18,
   10,
   12,
   10,
   10,
   10,
   13,
   13,
   13,
   13,
   1,
   5,
   5,
   17,
   7,
   8,
   8,
   5,
   5,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20,
   20],
  [5,
   8,
   8,
   8,
   8,
   8,
   8,
   5,
   8,
   2,
   10,
   8,
   8,
   8,
   10,
   10,
   10,
   13,
   8,
   2,
   2,
   13,
   0,
   10,
   5,
   5,
   8,
   2,
   2,
   10,
   7,
   7,
   0,
   1,
   0,
   18,
   10,
   12,
   10,
   10,
   10,
   13,
   13,
   1

In [None]:
# Gather the per-example lists into a DataFrame: one column per model input plus the train flag.
model_inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'token_type_ids': token_type_ids,
    'train': train,
}
data = pd.DataFrame(model_inputs)
data.head()

Unnamed: 0,input_ids,attention_mask,token_type_ids,train
0,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True
1,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True
2,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True
3,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True
4,"[[5, 8, 8, 8, 8, 8, 8, 5, 8, 2, 10, 8, 8, 8, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",True


## DatasetDict creation

In [None]:
# Split the frame on its 'train' flag and wrap both parts in a DatasetDict.
train_rows = data.loc[data['train'] == True].reset_index(drop=True)
validation_rows = data.loc[data['train'] == False].reset_index(drop=True)

train = Dataset.from_pandas(train_rows)
test = Dataset.from_pandas(validation_rows)

ds = DatasetDict({'train': train, 'validation': test})

ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'token_type_ids', 'train'],
        num_rows: 1764
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'token_type_ids', 'train'],
        num_rows: 0
    })
})

# Model training

In [None]:
#We redefine a custom loss that maximizes the difference between the rewards of positive and negative examples
def custom_loss(out_right:torch.Tensor, out_wrong:torch.Tensor, return_outputs:bool=False):
    """Pairwise ranking loss: the sum over all elements of -log(sigmoid(right - wrong)).

    The idea behind the equations is better explained in the following article:
    https://medium.com/towards-generative-ai/reward-model-training-2209d1befb5f

    Args:
        out_right: rewards predicted for the right examples.
        out_wrong: rewards predicted for the wrong examples; must broadcast
            against ``out_right``.
        return_outputs: when true, also return the reward differences.

    Returns:
        The scalar loss, or ``(loss, diff)`` when ``return_outputs`` is true,
        where ``diff = out_right - out_wrong``.
    """
    diff = out_right - out_wrong
    # logsigmoid is the numerically stable form of log(sigmoid(x)): the naive
    # composition underflows to log(0) = -inf for large negative margins, which
    # turns the loss (and its gradients) into inf/nan.
    loss = -torch.nn.functional.logsigmoid(diff).sum()
    return (loss, diff) if return_outputs else loss

In [None]:
# Switch the model to training mode (this enables its dropout layers).
# No optimisation happens here; the training loop itself is in a later cell.
rm.train()

RewardModel(
  (base): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(21, 768, padding_idx=13)
        (position_embeddings): Embedding(1024, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias

In [None]:
# Adam over every parameter of the reward model (the pretrained base network and
# the new regression head alike), with a learning rate of 3e-5.
optimizer = torch.optim.Adam(params=rm.parameters(), lr=3e-05)

In [None]:
EPOCHS = 2
batch_size = 40
# Ceiling division, so the final (smaller) batch is trained on as well; floor
# division silently dropped the last len(train) % batch_size examples.
batches_per_epoch = (len(train) + batch_size - 1) // batch_size

# NOTE(review): batches are taken in dataset order, and consecutive rows share
# the same input sequence; consider shuffling `train` at the start of each epoch.
for epoch in range(EPOCHS):
    for i in range(batches_per_epoch):
        start = i * batch_size
        # take a batch (slicing past the end simply yields the remaining rows)
        input_batch = train[start:start+batch_size]
        inputs = np.array(input_batch['input_ids'])
        # Format input_ids to match model requirements: (1, 2, batch, seq_len),
        # right sequences at [0][0] and wrong sequences at [0][1].
        # Batch tensors get their own names so the notebook-level lists
        # `input_ids` / `attention_mask` / `token_type_ids` are not overwritten.
        batch_input_ids = torch.tensor(np.array([[inputs[:, 0], inputs[:, 1]]])).to(device)
        batch_attention_mask = torch.tensor(input_batch['attention_mask']).to(device)
        batch_token_type_ids = torch.tensor(input_batch['token_type_ids']).to(device)

        # forward pass through the model (train=True selects the paired input layout)
        out = rm(input_ids = batch_input_ids, attention_mask = batch_attention_mask, token_type_ids = batch_token_type_ids, train = True)
        loss = custom_loss(out[0].logits, out[1].logits)
        # backpropagate
        optimizer.zero_grad()
        loss.backward()
        # update weights
        optimizer.step()

In [None]:
# Loss of the last batch processed by the training loop (summed over that batch, not averaged).
loss

tensor(28.4680, device='cuda:0', grad_fn=<SumBackward0>)

In [None]:
#Evaluate if training was done correctly taking some training examples
inp = train[0:3]
inputs = np.array(inp['input_ids'])
input_ids = torch.tensor(np.array([[inputs[:, 0], inputs[:, 1]]])).to(device)
attention_mask = torch.tensor(inp['attention_mask']).to(device)
token_type_ids = torch.tensor(inp['token_type_ids']).to(device)

# Score in evaluation mode and without autograd: with dropout active the
# rewards are noisy, and building a graph is wasted work for a sanity check.
# `train=True` below only selects the paired (right, wrong) input layout.
was_training = rm.training
rm.eval()
with torch.no_grad():
    eval_out = rm(input_ids = input_ids, attention_mask = attention_mask, token_type_ids = token_type_ids, train = True)
rm.train(was_training)  # restore the previous mode so later training cells are unaffected

# First element: rewards of the right answers; second: rewards of the wrong ones.
eval_out

[SequenceClassifierOutput(loss=None, logits=tensor([[0.3829],
         [0.3502],
         [0.4553]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 SequenceClassifierOutput(loss=None, logits=tensor([[0.1543],
         [0.3762],
         [0.3471]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)]