In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import (AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoConfig)
import evaluate
import torch.nn as nn
import torch
import numpy as np
from utils import PROD_TOK, AUX_TOK, RewardModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [108]:
#We initialize our custom reward model
rm = RewardModel("VCNC/bert_piezas3", hidden_size=768, classes=20).to(device)
rm.load_state_dict(torch.load('rm_dict.pt'))
rm.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21, 768, padding_idx=20)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [110]:
#We initialize 2 BERT pieces models, one to train and the other for reference
model_new = AutoModelForSequenceClassification.from_pretrained("VCNC/bert_piezas3").to(device)
model_ref = AutoModelForSequenceClassification.from_pretrained("VCNC/bert_piezas3").to(device)
model_new.train()
model_ref.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21, 768, padding_idx=13)
      (position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [111]:
#We check whether the models were loaded correctly and are ready to inference
#For the reward model
input = torch.tensor([[2, 2, 1, 3, 4, 1], [3, 2, 1, 3, 4, 4]]).to(device)
token_type = torch.tensor([[0, 0, 0, 0, 0, 1], [0, 0, 0, 0, 0, 1]]).to(device)
    
print(f"Reward model's output: {rm(input, token_type_ids=token_type, train=False)}")

#For the pieces models
input = torch.tensor([[2, 2, 1, 3, 4], [3, 2, 1, 3, 4]]).to(device)
token_type = torch.tensor([[0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]).to(device)
attention_mask = torch.tensor([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]).to(device)
with torch.no_grad():
    print(f"""
    New model's output: {model_new(input_ids=input, token_type_ids=token_type, attention_mask=attention_mask)}
    Reference model's output: {model_ref(input_ids=input, token_type_ids=token_type, attention_mask=attention_mask)}""")

Reward model's output: SequenceClassifierOutput(loss=None, logits=tensor([[-10.9299],
        [-10.8700]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

    New model's output: SequenceClassifierOutput(loss=None, logits=tensor([[ -2.5976,  -3.0268,  -1.3479,  -1.5660,  -3.1606,  -1.8852, -21.8073,
          -4.2567,  -2.3831,  -7.3893,  -2.6168,  -5.6001,  -2.4642,  -3.2108,
         -21.7702,  -5.2638, -21.3716,  -4.2614,  -5.2135,  -5.3150],
        [ -2.4703,  -3.0965,  -1.3549,  -2.0352,  -2.7437,  -2.0568, -21.8894,
          -4.3856,  -2.4865,  -7.5653,  -2.6749,  -5.6142,  -2.2101,  -2.9694,
         -21.9664,  -5.5138, -21.6562,  -4.1202,  -4.9875,  -5.6576]],
       device='cuda:0'), hidden_states=None, attentions=None)
    Reference model's output: SequenceClassifierOutput(loss=None, logits=tensor([[ -2.5363,  -3.1314,  -1.2427,  -1.6860,  -3.0604,  -2.0583, -21.7888,
          -4.1208,  -2.3355,  -7.4675,  -2.8129,  -5.3138,  -2.3228,  -3.

In [120]:
#We define the PPO loss as in this hf post (https://huggingface.co/blog/deep-rl-ppo)
def PPO_loss(eps:float, model_new, model_ref, reward_model, inp:torch.tensor, verbose:bool=True):
    #We first compute the output given by the new and reference model
    new_out = nn.functional.softmax(model_new(inp).logits, dim=1)
    ref_out = nn.functional.softmax(model_ref(inp).logits, dim=1)

    #We compute the ratio of the two outputs
    new_out_prob = torch.max(new_out, 1).values
    ref_out_prob = torch.max(ref_out, 1).values
    ratio = new_out_prob/ref_out_prob

    #We format the input for the reward model
    max_out = torch.max(new_out, 1).indices
    max_out = torch.reshape(max_out, (max_out.size(dim=0), 1))
    reward_inp = torch.cat((inp, max_out), 1)
    
    reward_tokens = torch.zeros_like(reward_inp)
    reward_tokens[:, -1] = 1 

    #We get the reward assigned by our reward model
    reward = reward_model(reward_inp, token_type_ids=reward_tokens, train=False).logits
    reward = torch.reshape(reward, (1, reward.size(dim=0)))[0]
    
    #The entropy is computed to balance explotation and exploration
    entropy = torch.tensor([-torch.dot(new_out[i], torch.log2(new_out[i])) for i in range(len(new_out))]).to(device)

    #This is the Lclip term presented in the hf post
    expression = torch.tensor([(ratio*reward).tolist(), (torch.clamp(ratio, min=1-eps, max=1+eps)*reward).tolist()])

    loss = -(torch.sum((torch.min(expression, 0).values.to(device)+entropy)))
    
    if verbose:
        print(f"New model's output: {new_out}")
        print(f"Reference model's output: {ref_out}")
        print(reward_inp, reward_tokens)
        print(f"The reward for this action and state is: {reward}")
        print(f"The current entropy is: {entropy}")
        print(f"The final loss is: {loss}")

    return torch.autograd.Variable(loss, requires_grad = True)

In [124]:
#Sample of a forward pass using PPO loss function
input = torch.tensor([[2, 2, 1, 3, 4], [3, 2, 1, 3, 4]]).to(device)
PPO_loss(0.2, model_new, model_ref, rm, input)

New model's output: tensor([[7.9668e-02, 3.7430e-02, 2.9483e-01, 1.5748e-01, 3.9525e-02, 1.0632e-01,
         2.2878e-10, 1.1194e-02, 9.3476e-02, 5.2020e-04, 3.1839e-02, 3.5076e-03,
         7.9789e-02, 4.0205e-02, 3.2621e-10, 3.0629e-03, 3.3976e-10, 1.2742e-02,
         5.2590e-03, 3.1531e-03],
        [7.3784e-02, 4.0047e-02, 2.6574e-01, 1.5518e-01, 4.1266e-02, 1.3042e-01,
         3.9035e-10, 1.3299e-02, 8.7210e-02, 6.0468e-04, 4.8939e-02, 5.1721e-03,
         6.8708e-02, 4.7920e-02, 2.7663e-10, 3.2591e-03, 3.2098e-10, 1.1070e-02,
         4.6500e-03, 2.7291e-03]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Reference model's output: tensor([[7.0636e-02, 3.8957e-02, 2.5755e-01, 1.6532e-01, 4.1825e-02, 1.1393e-01,
         3.0746e-10, 1.4484e-02, 8.6342e-02, 5.0985e-04, 5.3568e-02, 4.3934e-03,
         8.7449e-02, 3.8361e-02, 3.3922e-10, 3.7228e-03, 3.5992e-10, 1.4187e-02,
         5.5383e-03, 3.2260e-03],
        [7.0636e-02, 3.8957e-02, 2.5755e-01, 1.6532e-01, 4.1825e-02, 1.1393e-

tensor(17.4308, device='cuda:0', requires_grad=True)

In [125]:
#We use Adam optimizer
optimizer = torch.optim.Adam(params=model_new.parameters(), lr=3e-07)

In [127]:
EPOCHS = 1000
BATCH_SIZE = 2
MAX_LEN = 15

for epoch in range(EPOCHS):
    # Generate random sequence as training example
    inp_len = np.random.randint(2, MAX_LEN+1)
    input = torch.tensor(np.random.randint(0, 20, size=(BATCH_SIZE, inp_len))).to(device)

    # Compute loss (does forward pass also)
    loss = PPO_loss(0.2, model_new, model_ref, rm, input, verbose=False)

    if epoch%50 == 0:
        #print(f"Correct reward: {out[0].logits} Wrong reward: {out[1].logits}")
        print(f"Loss epoch {epoch}: {loss}")

    # Backpropagate
    loss.backward()

    # Update weights
    optimizer.step()

Loss epoch 0: 13.797601699829102
Loss epoch 2: 13.895196914672852
Loss epoch 4: 6.327188014984131
Loss epoch 6: 16.44588851928711
Loss epoch 8: 16.842594146728516
Loss epoch 10: 16.496747970581055
Loss epoch 12: 9.753284454345703
Loss epoch 14: 17.926841735839844
Loss epoch 16: 16.47771453857422
Loss epoch 18: 14.808004379272461
Loss epoch 20: 14.898300170898438
Loss epoch 22: 14.517597198486328
Loss epoch 24: 11.079215049743652
Loss epoch 26: 13.42514419555664
Loss epoch 28: 11.610021591186523
Loss epoch 30: 18.30743980407715
Loss epoch 32: 16.235687255859375
Loss epoch 34: 17.545480728149414
Loss epoch 36: 14.82377815246582
Loss epoch 38: 14.335840225219727
Loss epoch 40: 13.071252822875977
Loss epoch 42: 15.559148788452148
Loss epoch 44: 16.331436157226562
Loss epoch 46: 15.651971817016602
Loss epoch 48: 12.946578979492188
Loss epoch 50: 16.716609954833984
Loss epoch 52: 14.01222038269043
Loss epoch 54: 14.119315147399902
Loss epoch 56: 14.016378402709961
Loss epoch 58: 13.211669921

In [128]:
#Test that reward and reference model are still the same
input = torch.tensor([[2, 2, 1, 3, 4], [3, 2, 1, 3, 4]]).to(device)
PPO_loss(0.2, model_new, model_ref, rm, input)

New model's output: tensor([[6.5049e-02, 2.6806e-02, 2.2435e-01, 1.3317e-01, 3.2102e-02, 1.5782e-01,
         3.2575e-10, 1.7956e-02, 8.3287e-02, 5.2001e-04, 6.4405e-02, 4.4030e-03,
         1.2668e-01, 3.7096e-02, 3.2886e-10, 3.5549e-03, 3.1342e-10, 1.3814e-02,
         5.7693e-03, 3.2172e-03],
        [8.0917e-02, 4.3634e-02, 2.6163e-01, 1.2973e-01, 3.8722e-02, 1.1113e-01,
         2.6086e-10, 1.1044e-02, 9.9301e-02, 5.3719e-04, 4.9703e-02, 4.7165e-03,
         9.8419e-02, 4.5927e-02, 3.0808e-10, 3.3190e-03, 2.6936e-10, 1.2594e-02,
         5.3902e-03, 3.2824e-03]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Reference model's output: tensor([[7.0636e-02, 3.8957e-02, 2.5755e-01, 1.6532e-01, 4.1825e-02, 1.1393e-01,
         3.0746e-10, 1.4484e-02, 8.6342e-02, 5.0985e-04, 5.3568e-02, 4.3934e-03,
         8.7449e-02, 3.8361e-02, 3.3922e-10, 3.7228e-03, 3.5992e-10, 1.4187e-02,
         5.5383e-03, 3.2260e-03],
        [7.0636e-02, 3.8957e-02, 2.5755e-01, 1.6532e-01, 4.1825e-02, 1.1393e-

tensor(14.1056, device='cuda:0', requires_grad=True)