In [1]:
!pip install -q datasets
!pip install -q trl
!pip install wandb -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
kfp 2.5.0 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 1.44.0 which is incompatible.[0m[31m
[0m

In [2]:
import os
import random
import numpy as np
import pandas as pd
import csv
import torch
import transformers
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import gc
import torch.nn.functional as F
import wandb

from typing import Optional, Generator, Dict, Type, Any
from math import acos, degrees, sin
from copy import deepcopy
from datasets import load_dataset, IterableDataset, Dataset, load_from_disk, concatenate_datasets
from torch import Tensor
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoModelForCausalLM, GPT2LMHeadModel, TrainingArguments
from tqdm.auto import tqdm
from trl import ModelConfig, RewardConfig, RewardTrainer, get_peft_config, get_quantization_config
from torch.distributions.categorical import Categorical

2024-08-05 19:25:24.890678: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-05 19:25:24.890846: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-05 19:25:25.039702: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
class CFG:
  seed = 42
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  batch_size = 64

In [5]:
def seed_env(seed: int = CFG.seed) -> None:
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)

def seed_torch(seed: int = CFG.seed) -> None:
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

def seed_everything() -> None:
  """Set seeds"""
  seed_torch()
  seed_env()

def init_wandb() -> None:
  wandb.init(project='Alignment_experiments', entity='lulim')

In [7]:
class ImdbDatasetHandler:
  def __init__(self) -> None:
    self.train_split = load_dataset("stanfordnlp/imdb", split="train",)
    self.train_split = self.train_split.filter(lambda x: len(x["text"]) > 500, batched=False)

    self.test_split = load_dataset("stanfordnlp/imdb", split="test",)
    self.test_split = self.test_split.filter(lambda x: len(x["text"]) > 500, batched=False)
    self.all_pairs_train = []
    self.all_pairs_ds_train = None
    self.all_pairs_after_tokenize_train = None

    self.all_pairs_test = []
    self.all_pairs_ds_test = None
    self.all_pairs_after_tokenize_test = None

  def create_train_pairs(self) -> None:
    """Create train dataset of positive and negative examples"""
    positive_samples = self.train_split.filter(lambda x: x['label'] == 1)[:300]
    negative_samples = self.train_split.filter(lambda x: x['label'] == 0)[:300]
    for positive_item in tqdm(positive_samples['text']):
      for negative_item in negative_samples['text']:
        self.all_pairs_train.append({'positive': positive_item, 'negative': negative_item})
      self.all_pairs_ds_train = Dataset.from_list(self.all_pairs_train)

  def create_test_pairs(self) -> None:
    """Create test dataset of positive and negative examples"""
    positive_samples = self.test_split.filter(lambda x: x['label'] == 1)[-50:]
    negative_samples = self.test_split.filter(lambda x: x['label'] == 0)[-50:]
    for positive_item in tqdm(positive_samples['text']):
      for negative_item in negative_samples['text']:
        self.all_pairs_test.append({'positive': positive_item, 'negative': negative_item})
      self.all_pairs_ds_test = Dataset.from_list(self.all_pairs_test)

  def create_train_for_warp(self) -> Dataset:
    """Create train dataset that can use in WARP"""
    samples_zeros = Dataset.from_dict(self.train_split.filter(lambda x: x['label'] == 0)[500:550])
    samples_ones = Dataset.from_dict(self.train_split.filter(lambda x: x['label'] == 1)[600:650])
    return concatenate_datasets([samples_zeros, samples_ones]).shuffle(seed=42).remove_columns(['label'])

  def create_test_for_warp(self) -> Dataset:
    """Create test dataset that can use in WARP result validation"""
    samples_zeros = Dataset.from_dict(self.test_split.filter(lambda x: x['label'] == 0)[300:350])
    samples_ones = Dataset.from_dict(self.test_split.filter(lambda x: x['label'] == 1)[400:450])
    return concatenate_datasets([samples_zeros, samples_ones]).shuffle(seed=42)



In [None]:
class CustomModelConfig:
  def __init__(self)
  ## TL;DR model config

In [8]:
class DistilBertModel:
  def __init__(self, max_length: Optional[int] = 512,
               path_to_save_checkpoint: Optional[str] = '/kaggle/working/reward_model_bert/checkpoint-2812') -> None:
    self.tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-cased")
    self.model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-cased", num_labels=1)
    self.max_length = max_length
    self.path_to_save_checkpoint = path_to_save_checkpoint

  def load_from_checkpoint(self, path_to_checkpoint: str) -> None:
    self.tokenizer = AutoTokenizer.from_pretrained(path_to_checkpoint)
    self.model = AutoModelForSequenceClassification.from_pretrained(path_to_checkpoint)

  def preprocess_function(self, examples: Dataset) -> Dict[str, list]:
    """Create specific dict for reward model training"""
    new_examples = {
        'input_ids_chosen': [],
        'attention_mask_chosen': [],
        'input_ids_rejected': [],
        'attention_mask_rejected': []
    }
    for chosen, rejected in tqdm(zip(examples['positive'], examples['negative'])):
      tokenized_positive = self.tokenizer(chosen, truncation=True, max_length=self.max_length)
      tokenized_negative = self.tokenizer(rejected, truncation=True, max_length=self.max_length)

      if len(tokenized_positive['input_ids']) <= self.max_length \
          and len(tokenized_negative['input_ids']) <= self.max_length:
        new_examples['input_ids_chosen'].append(tokenized_positive['input_ids'])
        new_examples['attention_mask_chosen'].append(tokenized_positive['attention_mask'])
        new_examples['input_ids_rejected'].append(tokenized_negative['input_ids'])
        new_examples['attention_mask_rejected'].append(tokenized_negative['attention_mask'])

      gc.collect()
    return new_examples

  def reward_train(self, train_dataset: Dataset) -> None:
    training_arg = RewardConfig(
        output_dir='reward_model_bert',
        per_device_train_batch_size=16,
        num_train_epochs=1,
        gradient_accumulation_steps=2,
        gradient_checkpointing=True,
        learning_rate=1.41e-5,
        remove_unused_columns=False,
        optim='adamw_torch',
        max_length=512
    )
    trainer = RewardTrainer(
        model=self.model,
        tokenizer=self.tokenizer,
        args=training_arg,
        train_dataset=train_dataset
    )
    trainer.train()
    trainer.save_model(self.path_to_save_checkpoint)


In [9]:
class Warp:
    def __init__(self, reward_model_class: Type[DistilBertModel], prompt_dataset: DataLoader, optimizer: Any, I: int, M: int,
                 T: int, mu: float, lambd: float, eta: float, batch_size: int,
                 checkpoint_theta_dir: Optional[str] = '/content/drive/MyDrive/Alignment_project/train_checkpoints_theta_init',
                 checkpoint_final_dir: Optional[str] = '/content/drive/MyDrive/Alignment_project/train_checkpoints_final',
                 checkpoint_ema_dir: Optional[str] = '/content/drive/MyDrive/Alignment_project/train_checkpoints_ema') -> None:
        self.sft_tokenizer = AutoTokenizer.from_pretrained('lvwerra/gpt2-imdb')
        self.sft_model = GPT2LMHeadModel.from_pretrained('lvwerra/gpt2-imdb').to(CFG.device)

        self.reward_tokenizer = reward_model_class.tokenizer
        self.reward_model = reward_model_class.model.to(CFG.device)

        self.prompt_dataset = prompt_dataset
        self.opt = optimizer
        self.I = I
        self.M = M
        self.T = T
        self.mu = mu
        self.lambd = lambd
        self.eta = eta
        self.batch_size = batch_size

        self.beta = 0.1  # from paper

        self.checkpoint_theta_dir = checkpoint_theta_dir
        self.checkpoint_final_dir = checkpoint_final_dir
        self.checkpoint_ema_dir = checkpoint_ema_dir

        # want to add b_ma from paper
    def sft_tokenize_func(self, sample: Any) -> tuple[Any, Any]:
        tokenized = self.sft_tokenizer(sample, truncation=True, max_length=15, return_tensors='pt').to(CFG.device)
        return tokenized['input_ids'], tokenized['attention_mask']

    def reward_tokenize_func(self, sample: Any) -> tuple[Any, Any]:
        tokenized = self.reward_tokenizer(sample, truncation=True, max_length=15, return_tensors='pt').to(CFG.device)
        return tokenized['input_ids'], tokenized['attention_mask']

    def compute_reward(self, y_input_ids_theta: Any, y_input_attention_mask_theta: Any, theta_m_model: Type[GPT2LMHeadModel],
                       theta_m_ema_model: Type[GPT2LMHeadModel]) -> tuple[Tensor, Tensor, Categorical]:
        """Compute KL regularized reward and KL divergence"""

        # calculate KL regularized component
        theta_m_logits = theta_m_model(input_ids=y_input_ids_theta, attention_mask=y_input_attention_mask_theta,
                                       output_hidden_states=True).logits[:, -1, :]
        theta_m_probs = Categorical(logits=theta_m_logits)

        theta_m_ema_logits = theta_m_ema_model(input_ids=y_input_ids_theta, attention_mask=y_input_attention_mask_theta,
                                               output_hidden_states=True).logits[:, -1, :]
        theta_m_ema_probs = Categorical(logits=theta_m_ema_logits)

        kl_div = torch.distributions.kl_divergence(theta_m_probs, theta_m_ema_probs).mean()

        # generate completion
        y_theta_m = theta_m_model.generate(input_ids=y_input_ids_theta, attention_mask=y_input_attention_mask_theta, max_length=50)
        y_theta_m = self.sft_tokenizer.decode(y_theta_m[0], skip_special_tokens=True)
        y_input_ids_reward, y_input_attention_mask_reward = self.reward_tokenize_func(y_theta_m)

        # reward for theta model output
        reward_logits = self.reward_model(input_ids=y_input_ids_reward, attention_mask=y_input_attention_mask_reward).logits

        # calculate KL regularized reward
        reward = reward_logits - self.beta * kl_div
        return reward, kl_div, theta_m_probs

    def policy_gradient_update(self, theta_m_model: Type[GPT2LMHeadModel], reward: Any, theta_m_probs: Type[GPT2LMHeadModel]) -> Tensor:
        """Compute loss and update weights of model"""
        policy_prod_loss = -torch.mean(theta_m_probs.log_prob(theta_m_probs.sample())) * torch.sum(reward)

        opt = torch.optim.Adam(theta_m_model.parameters(), lr=1e-6)  # from paper
        opt.zero_grad()
        policy_prod_loss.backward()
        opt.step()
        return policy_prod_loss

    def ema_update_weights(self, first_model: Type[GPT2LMHeadModel], second_model: Type[GPT2LMHeadModel], c: int) -> Type:
        """EMA for two models with coef c"""
        for first_param, second_param in zip(first_model.parameters(), second_model.parameters()):
            first_param.data = (1 - c) * first_param.data + c * second_param.data
        return first_model

    def get_angle_between_models(self, model_1: Type[GPT2LMHeadModel], model_2: Type[GPT2LMHeadModel]) -> float:
        """Compute angle between task vectors for slerp"""
        weights_1 = []
        weights_2 = []

        for param in model_1.parameters():
            weights_1.append(param.data.view(-1))
        weights_1 = torch.cat(weights_1)

        for param in model_2.parameters():
            weights_2.append(param.data.view(-1))
        weights_2 = torch.cat(weights_2)

        dot_prod = torch.dot(weights_1, weights_2)
        angle_rad = acos(dot_prod)
        return angle_rad

    def slerp(self, theta_init_model: Type[GPT2LMHeadModel], thetas: list, lambd_param: float) -> Type[GPT2LMHeadModel]:
        """Run slerp for two models"""
        result_model, delta_1, delta_2 = deepcopy(theta_init_model), deepcopy(thetas[0]), deepcopy(thetas[1])
        for delta_1_param, delta_2_param, theta_init_param in zip(delta_1.parameters(), delta_2.parameters(),
                                                                  theta_init_model.parameters()):
            delta_1_param.data -= theta_init_param.data
            delta_1_param.requires_grad = True
            delta_2_param.data -= theta_init_param.data
            delta_2_param.requires_grad = True

        omega = self.get_angle_between_models(delta_1, delta_2)

        for result_model_param, theta_init_model_param, delta_1_param, delta_2_param in zip(result_model.parameters(),
                                                                                            theta_init_model.parameters(),
                                                                                            delta_1.parameters(),
                                                                                            delta_2.parameters()):
            result_model_param.data = theta_init_model_param.data + sin((1 - lambd_param) * omega) / sin(omega) * delta_1_param.data + \
                                 sin(lambd_param * omega) / sin(omega) * delta_2_param.data
            result_model_param.requires_grad = True

        return result_model

    def slerpm(self, theta_init_model: Type[GPT2LMHeadModel], thetas: list) -> Type[GPT2LMHeadModel]:
        """Run slerpm"""
        m = len(thetas)
        if m == 2:
            return self.slerp(theta_init_model, thetas, self.lambd)
        else:
            return self.slerp(self.slerpm(theta_init_model, thetas[:-2]), thetas[:-1], self.lambd)

    def run_warp(self) -> tuple[Type[GPT2LMHeadModel], list[float], list[float], list[float]]:
        """Run WARP algorithm"""
        theta_init_model = deepcopy(self.sft_model)
        for i in range(self.I):
            theta_m_models = []
            rewards, kl_divs, policy_losses = [], [], []
            for m in range(self.M):
                theta_m_model, theta_m_ema_model = deepcopy(theta_init_model), deepcopy(theta_init_model)
                for _ in tqdm(range(self.T)):
                    for batch in self.prompt_dataset:
                        y_batch = batch['text']
                        y_input_ids_theta, y_input_attention_mask_theta = self.sft_tokenize_func(y_batch)
                        reward, kl_div, theta_m_probs = self.compute_reward(y_input_ids_theta, y_input_attention_mask_theta,
                                                                            theta_m_model, theta_m_ema_model)
                        rewards.append(reward)
                        kl_divs.append(kl_div)

                        policy_loss = self.policy_gradient_update(theta_m_model, reward, theta_m_probs)
                        policy_losses.append(policy_loss.item())

                        wandb.log({
                            'reward': reward,
                            'kl_div': kl_div,
                            'policy_loss': policy_loss
                        })

                        theta_m_ema_model = self.ema_update_weights(theta_m_ema_model, theta_m_model, self.mu)
                        theta_m_ema_model.save_pretrained(save_directory=self.checkpoint_ema_dir)
                        gc.collect()
                theta_m_models.append(theta_m_model)

            slerp_model = self.slerpm(theta_init_model, theta_m_models)
            theta_init_model = self.ema_update_weights(theta_init_model, slerp_model, self.eta)
            theta_init_model.save_pretrained(save_directory=self.checkpoint_theta_dir)
            gc.collect()

        final_model = self.ema_update_weights(self.sft_model, theta_init_model, self.eta)
        final_model.save_pretrained(save_directory=self.checkpoint_final_dir)

        return final_model, policy_losses, rewards, kl_divs


In [10]:
class ValidationResultsHandler:
    def __init__(self, reward_model: Type[DistilBertModel], prompt_dataset: Dataset, path_to_final_weights: str):
        self.final_theta_tokenizer = AutoTokenizer.from_pretrained('lvwerra/gpt2-imdb') # equal tokenizer with sft_model
        self.final_theta_model = GPT2LMHeadModel.from_pretrained(path_to_final_weights).to(CFG.device)

        self.sft_model = GPT2LMHeadModel.from_pretrained('lvwerra/gpt2-imdb').to(CFG.device)

        self.reward_tokenizer = reward_model.tokenizer
        self.reward_model = reward_model.model.to(CFG.device)

        self.prompt_dataloader = DataLoader(prompt_dataset, batch_size=CFG.batch_size, shuffle=False)

        self.beta = 0.1 # from paper

    def final_theta_tokenize_func(self, sample: Any) -> tuple[Any, Any]:
        tokenized = self.final_theta_tokenizer(sample, truncation=True, max_length=15, return_tensors='pt').to(CFG.device)
        return tokenized['input_ids'], tokenized['attention_mask']

    def reward_tokenize_func(self, sample: Any) -> tuple[Any, Any]:
        tokenized = self.reward_tokenizer(sample, truncation=True, max_length=15, return_tensors='pt').to(CFG.device)
        return tokenized['input_ids'], tokenized['attention_mask']

    def compute_reward(self, y_input_ids_theta: Any, y_input_attention_mask_theta: Any) -> tuple[float, float, float, float]:
        theta_final_logits = self.final_theta_model(input_ids=y_input_ids_theta, attention_mask=y_input_attention_mask_theta,
                                                    output_hidden_states=True).logits[:, -1, :]  # last token
        theta_final_probs = Categorical(logits=theta_final_logits)

        theta_sft_logits = self.sft_model(input_ids=y_input_ids_theta, attention_mask=y_input_attention_mask_theta,
                                               output_hidden_states=True).logits[:, -1, :]  # last token
        theta_sft_probs = Categorical(logits=theta_sft_logits)

        kl_div = torch.distributions.kl_divergence(theta_final_probs, theta_sft_probs).mean()

        y_theta_final_model = self.final_theta_model.generate(input_ids=y_input_ids_theta,
                                                              attention_mask=y_input_attention_mask_theta,
                                                              max_length=50)
        y_theta_final_model = self.final_theta_tokenizer.decode(y_theta_final_model[0], skip_special_tokens=True)
        y_input_ids_reward, y_input_attention_mask_reward = self.reward_tokenize_func(y_theta_final_model)
        reward_logits_final_model = self.reward_model(input_ids=y_input_ids_reward,
                                                      attention_mask=y_input_attention_mask_reward).logits

        y_theta_sft_model = self.sft_model.generate(input_ids=y_input_ids_theta,
                                                    attention_mask=y_input_attention_mask_theta,
                                                    max_length=50)
        y_theta_sft_model = self.final_theta_tokenizer.decode(y_theta_sft_model[0], skip_special_tokens=True)
        y_input_ids_reward_sft, y_input_attention_mask_reward_sft = self.reward_tokenize_func(y_theta_sft_model)
        reward_logits_sft_model = self.reward_model(input_ids=y_input_ids_reward_sft,
                                                    attention_mask=y_input_attention_mask_reward_sft).logits

        reward = reward_logits_final_model - self.beta * kl_div
        return reward, kl_div, reward_logits_final_model, reward_logits_sft_model

    def run_validation(self) -> tuple[list[float], list[float], list[float], list[float]]:
        """Run validation process"""
        rewards, kl_divs, rewads_final_model, rewards_sft_model = [], [], [], []
        with torch.no_grad():
            for batch in tqdm(self.prompt_dataloader):
                y_batch = batch['text']
                y_input_ids_theta, y_input_attention_mask_theta = self.final_theta_tokenize_func(y_batch)
                reward, kl_div, reward_logits_final_model, reward_logits_sft_model = self.compute_reward(y_input_ids_theta, y_input_attention_mask_theta)
                rewards.append(reward)
                kl_divs.append(kl_div)
                rewads_final_model.append(reward_logits_final_model)
                rewards_sft_model.append(reward_logits_sft_model)
        return rewards, kl_divs, rewads_final_model, rewards_sft_model

    def check_weights(self):
        for param_final, param_sft in zip(self.final_theta_model.parameters(), self.sft_model.parameters()):
            if not torch.equal(param_final, param_sft):
                print(f'yes')


In [11]:
seed_everything()
init_wandb()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [12]:
imdb_ds = ImdbDatasetHandler()

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
from google.colab import drive
drive.mount('/content/drive')

imdb_ds.create_train_pairs()

Mounted at /content/drive


  0%|          | 0/300 [00:00<?, ?it/s]

In [11]:
path = '/kaggle/input/train-300'
# imdb_ds.load_dataset_pairs_after_tokenize_train_from_csv(path)
imdb_ds.all_pairs_after_tokenize_train = load_from_disk(path)
imdb_ds.all_pairs_after_tokenize_train = imdb_ds.all_pairs_after_tokenize_train.remove_columns(['positive', 'negative'])
print(imdb_ds.all_pairs_after_tokenize_train)

Dataset({
    features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 90000
})


In [13]:
distil_bert_model = DistilBertModel()

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# if need to create dataset for train reward model and save this dataset to drive
path_300 = '/kaggle/working/reward_model_checkpoints_300'
imdb_ds.all_pairs_after_tokenize_train = imdb_ds.all_pairs_ds_train.map(
    distil_bert_model.preprocess_function,
    batched=True,
    num_proc=4
)
imdb_ds.all_pairs_after_tokenize_train = imdb_ds.all_pairs_after_tokenize_train.filter(
    lambda x: len(x['input_ids_chosen']) <= 512 and len(x['input_ids_rejected']) <= 512
)
imdb_ds.all_pairs_after_tokenize_train.save_to_disk(path_300)


In [13]:
distil_bert_model.reward_train(imdb_ds.all_pairs_after_tokenize_train) # if need train reward model

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
500,0.0466
1000,0.0001
1500,0.0
2000,0.0
2500,0.0




In [16]:
path_to_checkpoint = '/kaggle/input/reward-model-200'
distil_bert_model.load_from_checkpoint(path_to_checkpoint)

In [14]:
path_to_checkpoint = '/kaggle/working/reward_model_bert/checkpoint-2812'
distil_bert_model.load_from_checkpoint(path_to_checkpoint)

In [15]:
# create ds of prompts X for WARP
prompt_dataloader = DataLoader(imdb_ds.create_train_for_warp(), batch_size=CFG.batch_size, shuffle=True)
# next(iter(prompt_dataloader))

Filter:   0%|          | 0/22578 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22578 [00:00<?, ? examples/s]

In [20]:
next(iter(prompt_dataloader))['text'][1]

"After the superb AANKHEN(2002) which was a remake of a Gujarati play he comes with WAQT which too looks like a stage play<br /><br />In stage plays, we have characters shouting, overacting here too the same<br /><br />The first half shows Amitabh almost kidding the 40+ Akshay Kumar who acts too funny like a small nerd<br /><br />The film has a good message how not to spoil your son but sadly the way Amitabh wants to make Akki responsible is absolutely fake<br /><br />Even his reason for hiding his sickness, his runnign from the hospital and the melodramatic speech by Akki is a put off<br /><br />Some emotions do touch you but most are too over the top<br /><br />Rajpal's comedy is hilarious but too stretched in second half <br /><br />Direction by Vipul Shah is too overdone though some scenes are good Music is okay<br /><br />Amongst actors Amitabh overdoes it in the first half but is superb in emotional scenes Akshay Kumar too does his part well but looks umcomfortable in some too we

In [16]:
# add logging params for warp at wandb

wandb.log({
    'I': 2,
    'M': 2,
    'T': 100,
    'mu': 0.01,
    'lambd': 0.7,
    'eta': 0.5,
    'batch_size': 64
})

optimizer = torch.optim.Adam
warp = Warp(
    distil_bert_model,
    prompt_dataloader,
    optimizer,
    I=2,
    M=2,
    T=100,
    mu=0.01,
    lambd=0.7,
    eta=0.5,
    batch_size=CFG.batch_size
) #TL;DR optimizer

tokenizer_config.json:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/577 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [30]:
gc.collect()

51

In [None]:
# Create tokenized dataset X of prompts
# warp.create_prompts_dataset()
# warp.prompt_dataset[6]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Dataset({

    features: ['input_ids', 'attention_mask'],

    num_rows: 100

})


In [17]:
final_model, policy_losses, rewards, kl_divs = warp.run_warp()

  0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

  0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Angle in rad: 1.5609325015180175


  0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

  0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Angle in rad: 1.5356760112227192


In [18]:
wandb.run.summary['mean_loss_train'] = np.mean(policy_losses)
wandb.run.summary['mean_rewards_train'] = np.mean(rewards)
wandb.run.summary['mean_KL_train'] = np.mean(kl_divs)

In [19]:
test_ds = imdb_ds.create_test_for_warp()
path_to_final_model = '/kaggle/working/train_checkpoints_final_300'
validation_handler = ValidationResultsHandler(distil_bert_model, test_ds, path_to_final_model)
rewards, kl_divs, rewards_final_model, rewards_sft_model = validation_handler.run_validation()

Filter:   0%|          | 0/22439 [00:00<?, ? examples/s]

Filter:   0%|          | 0/22439 [00:00<?, ? examples/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [20]:
# I want that this neednt
rewards_2 = [rew.detach().item() for rew in rewards]
policy = [polices for polices in policy_losses]
kl_divs_2 = [kl_div.detach().item() for kl_div in kl_divs]
rewards_sft_model = [reward.cpu().item() for reward in rewards_sft_model]
rewards_final_model = [reward.cpu().item() for reward in rewards_final_model]

In [21]:
rewards_sft_model

[3.2220163345336914, -1.3637049198150635]

In [22]:
rewards_final_model

[3.2220163345336914, -1.3637049198150635]

In [23]:
print(f'Mean reward final_model: {np.mean(rewards_final_model)}')
print(f'Mean  reward sft: {np.mean(rewards_sft_model)}')
print(f'Mean rewards: {np.mean(rewards_2)}')
print(f'Mean KL: {np.mean(kl_divs_2)}')

Mean reward final_model: 0.929155707359314
Mean  reward sft: 0.929155707359314
Mean rewards: 0.9291521310806274
Mean KL: 3.638561065599788e-05


In [24]:
wandb.run.summary['mean_reward_final_model'] = np.mean(rewards_final_model)
wandb.run.summary['mean_reward_sft_model'] = np.mean(rewards_sft_model)
wandb.run.summary['mean_reward'] = np.mean(rewards)
wandb.run.summary['mean_kl'] = np.mean(kl_divs)