In [19]:
import sys

sys.path.append("..")

import warnings

import pandas as pd
import torch
from datasets import load_dataset
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
)

from datasets import load_from_disk
import os

from scripts.utils.data import df_self_product, prepare_reward_dataset
from scripts.utils.misc import seed_everything
from scripts.constants import DATASET_DIR, CONFIG_DIR
from pathlib import Path

warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

device = "cuda" if torch.cuda.is_available() else "cpu"
device

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


'cpu'

In [48]:
# Local checkpoint directory (resolved under DATASET_DIR) used for both the
# tokenizer and the sequence-classification model.
model_name = Path(DATASET_DIR, "lvwerra/distilbert-imdb-cased")

# `model_max_length` is the tokenizer attribute that bounds truncation/padding.
# The previous `max_length=512` keyword is not a tokenizer init argument, so it
# was silently ignored and no 512-token limit was actually configured.
reward_tokenizer = DistilBertTokenizerFast.from_pretrained(
    model_name, model_max_length=512
)

# NOTE(review): this model is later handed to trl's RewardTrainer, whose docs
# recommend a single-logit head (`num_labels=1`). The head size here comes from
# the checkpoint's config -- confirm it is what the reward setup expects.
reward_model = DistilBertForSequenceClassification.from_pretrained(
    model_name
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at /home/ivanov.dko/projects/test/rl/artifacts/datasets/lvwerra/distilbert-imdb-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Load the IMDB train/test splits and turn each into a pandas DataFrame.
# `Dataset.to_pandas()` is the library's own conversion; it replaces
# `pd.DataFrame(dataset)`, which makes pandas consume the dataset as a generic
# Python iterable.
train, test = (
    split.to_pandas()
    for split in load_dataset("imdb", split=["train", "test"])
)
train

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [31]:
# Number of rows to draw and the seed that makes the draw repeatable.
N_PAIRS = 100
SAMPLE_SEED = 42

# Build the self-product of `train` partitioned by label, draw a fixed-size
# sample, and rename the two text columns to the `chosen` / `rejected` names
# used for reward training.
# The unseeded `.sample(100)` drew different rows on every run. `seed=` is the
# polars spelling -- `sets` is consumed with the polars-style
# `to_dict(as_series=False)` in the next cell. NOTE(review): confirm that
# `df_self_product` returns a polars DataFrame.
sets = (
    df_self_product(train, partition_col="label")
    .sample(N_PAIRS, seed=SAMPLE_SEED)
    .rename({"text_0": "chosen", "text_1": "rejected"})
)

In [32]:
# Hand the sampled pairs (as a plain column -> list mapping) to the project
# helper together with the reward tokenizer; the helper logs when it starts
# tokenizing the `chosen` and `rejected` columns.
reward_dataset = prepare_reward_dataset(
    sets.to_dict(as_series=False),
    verbose=False,
    tokenizer=reward_tokenizer,
)

[32m2024-08-03 14:35:03.285[0m | [1mINFO    [0m | [36mscripts.utils.data[0m:[36mprepare_reward_dataset[0m:[36m123[0m - [1mStarting tokenizing `chosen`[0m


[32m2024-08-03 14:35:03.316[0m | [1mINFO    [0m | [36mscripts.utils.data[0m:[36mprepare_reward_dataset[0m:[36m123[0m - [1mStarting tokenizing `rejected`[0m


In [15]:
# Seed for the train/test split so a rebuilt cache matches the previous one.
SPLIT_SEED = 42

# On-disk cache for the split reward dataset.
# If the cache directory already exists it wins: the `reward_dataset` built in
# the previous cell is discarded and the saved split is loaded instead. Delete
# the directory to force a rebuild from the freshly tokenized data.
dataset_path = Path(DATASET_DIR, "reward_dataset")
if dataset_path.exists():
    reward_dataset = load_from_disk(dataset_path)
else:
    # 80/20 split; seeded so the split is reproducible across runs.
    reward_dataset = reward_dataset.train_test_split(
        test_size=0.2, seed=SPLIT_SEED
    )
    reward_dataset.save_to_disk(dataset_path)

In [49]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardTrainer, RewardConfig
from omegaconf import OmegaConf

import os

def _read_config_section(relative_path, section):
    """Load a YAML file under CONFIG_DIR and return one top-level section.

    The OmegaConf object is converted to plain Python containers before the
    requested `section` key is looked up.
    """
    loaded = OmegaConf.load(Path(CONFIG_DIR, relative_path))
    return OmegaConf.to_container(loaded)[section]


# LoRA settings come from the `peft` section of reward/peft.yaml.
peft_params = _read_config_section("reward/peft.yaml", "peft")
peft_config = LoraConfig(**peft_params)

# Trainer arguments come from the `trainer` section of reward/trainer.yaml.
reward_trainer_params = _read_config_section("reward/trainer.yaml", "trainer")
reward_config = RewardConfig(**reward_trainer_params)

# Wire the model, tokenizer, dataset splits and PEFT config into the trainer.
trainer = RewardTrainer(
    args=reward_config,
    model=reward_model,
    peft_config=peft_config,
    tokenizer=reward_tokenizer,
    eval_dataset=reward_dataset["test"],
    train_dataset=reward_dataset["train"],
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss


TrainOutput(global_step=1, training_loss=0.044092923402786255, metrics={'train_runtime': 23.4826, 'train_samples_per_second': 0.852, 'train_steps_per_second': 0.043, 'total_flos': 0.0, 'train_loss': 0.044092923402786255, 'epoch': 1.0})