In [1]:
import sys

sys.path.append("..")

import warnings
from collections import defaultdict
from typing import Dict, Optional

import numpy as np
import pandas as pd
import polars as pl
import torch
from datasets import Dataset, load_dataset
from scipy.stats import entropy
from torch import nn
from tqdm.auto import tqdm, trange
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    TrainingArguments,
)
from trl import DPOTrainer

from scripts.utils.data import df_self_product
from scripts.utils.misc import seed_everything

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported above.
warnings.filterwarnings("ignore")

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [2]:
# Checkpoint used as the starting point of the reward model. The recorded
# output of this cell shows that the classification head is newly initialised
# for this checkpoint, so the model has to be trained before its scores are
# meaningful.
model_name = "lvwerra/distilbert-imdb-cased"

# `model_max_length` is the tokenizer argument that sets the length limit used
# when truncating; a plain `max_length` passed to `from_pretrained` does not
# configure that limit.
reward_tokenizer = DistilBertTokenizerFast.from_pretrained(
    model_name, model_max_length=512
)
reward_model = DistilBertForSequenceClassification.from_pretrained(
    model_name
).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at lvwerra/distilbert-imdb-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Checkpoint of the language model that serves as the SFT policy.
# NOTE(review): this rebinds `model_name`, which the previous cell used for
# the reward checkpoint.
model_name = "lvwerra/gpt2-imdb"

# `model_max_length` is the tokenizer argument that sets the length limit used
# when truncating; a plain `max_length` passed to `from_pretrained` does not
# configure that limit.
sft_tokenizer = GPT2TokenizerFast.from_pretrained(
    model_name, model_max_length=512
)
sft_model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

In [5]:
# Request the IMDB train and test splits in one call, then turn each split
# into a pandas frame; the train frame is displayed as the cell output.
imdb_splits = load_dataset("imdb", split=["train", "test"])
train, test = (pd.DataFrame(split) for split in imdb_splits)
train

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0
...,...,...
24995,A hit at the time but now better categorised a...,1
24996,I love this movie like no other. Another time ...,1
24997,This film and it's sequel Barry Mckenzie holds...,1
24998,'The Adventures Of Barry McKenzie' started lif...,1


In [6]:
# Pair training texts across the two `label` partitions, keep a random 10000
# pairs and rename the two text columns to the chosen/rejected convention.
# NOTE(review): `n_samples` is not passed, so (assuming the imported
# `df_self_product` matches the definition shown later in this notebook) the
# whole cross product is collected before `.sample(10000)` runs.
# NOTE(review): the sample is unseeded, so a re-run yields different pairs.
# NOTE(review): "text_0" (the label-0 partition) becomes "chosen" — confirm
# this is the intended preference direction.
sets = (
    df_self_product(train, partition_col="label")
    .sample(10000)
    .rename({"text_0": "chosen", "text_1": "rejected"})
)

In [None]:
import gc
from typing import Sequence, Literal, Tuple, List, Dict

import numpy as np
import pandas as pd
import polars as pl
from tqdm.notebook import tqdm, trange

from joblib import Parallel, delayed


def multi_join(
    df_list: Sequence[pl.DataFrame | pl.LazyFrame],
    on: Sequence[str | None] | str | None,
    how: pl._typing.JoinStrategy = "inner",
) -> pl.DataFrame | pl.LazyFrame:
    """
    Fold a sequence of N frames into a single frame with N-1 joins.

    Parameters
    ----------
    df_list
        Frames to join, in order. The sequence itself is not modified.
    on
        Join key(s). A list or tuple supplies one key per join and needs at
        least N-1 entries (extra entries are ignored). Anything else (a
        string, None, ...) is treated as one key reused for every join.
        Ignored when `how == "cross"`, because a cross join takes no keys.
    how
        Join strategy forwarded to every `join` call.

    Returns
    -------
    The first frame joined, in turn, with each of the remaining frames.

    Raises
    ------
    IndexError
        If `df_list` is empty.
    ValueError
        If `on` is a list/tuple with fewer than N-1 keys.
    """
    # Work on copies so neither the caller's frame list nor its key list is
    # mutated.
    frames = list(df_list)
    n_joins = len(frames) - 1

    # A bare string is itself a `Sequence`, so only list/tuple count as
    # "one key per join"; every other value is broadcast.
    if isinstance(on, (list, tuple)):
        keys = list(on)
    else:
        keys = [on] * max(n_joins, 0)

    if len(keys) < n_joins:
        raise ValueError(
            f"expected at least {n_joins} join keys, got {len(keys)}"
        )

    joined = frames[0]
    # zip stops after N-1 joins, so the i-th join uses the i-th key.
    for frame, key in zip(frames[1:], keys):
        joined = joined.join(
            frame, on=None if how == "cross" else key, how=how
        )

    return joined


def sample_it(
    s: pl.Series,
    n_samples: int | float | None,
    sample_mode: Literal["exact", "approximate"] = "exact",
) -> pl.Series:
    """
    Build a random boolean mask with one entry per element of `s`.

    Custom replacement for `pl.LazyFrame.sample`, using either a shuffle or
    binomial sampling technique.

    Parameters
    ----------
    s
        Series whose length determines the length of the mask; its values
        are not read.
    n_samples
        Number of True entries to aim for. A float is interpreted as a
        fraction of `s.len()`, None means "keep everything". The resulting
        count is clamped to the range [0, s.len()].
    sample_mode
        "exact" shuffles a mask holding exactly `n_samples` True values;
        "approximate" draws every entry independently with probability
        `n_samples / s.len()`.

    Returns
    -------
    Boolean `pl.Series` of length `s.len()`.

    Raises
    ------
    ValueError
        If `sample_mode` is not one of the two supported values.
    """
    n_rows = s.len()

    if isinstance(n_samples, float):
        n_samples = int(n_rows * n_samples)
    elif n_samples is None:
        n_samples = n_rows
    # Asking for more rows than exist (or a negative count) would otherwise
    # crash inside numpy.
    n_samples = min(max(n_samples, 0), n_rows)

    if sample_mode == "exact":
        mask = np.random.permutation(
            np.hstack([np.ones(n_samples), np.zeros(n_rows - n_samples)])
        )
    elif sample_mode == "approximate":
        # Guard the empty series: `n_samples / 0` would raise.
        keep_probability = n_samples / n_rows if n_rows else 0.0
        mask = np.random.binomial(1, keep_probability, n_rows)
    else:
        raise ValueError(
            f"unknown sample_mode {sample_mode!r}; "
            "expected 'exact' or 'approximate'"
        )

    # Hand polars a real boolean array instead of relying on a numeric cast.
    return pl.Series(
        values=mask.astype(bool),
        dtype=pl.Boolean,
    )


def df_self_product(
    dataset: pd.DataFrame | pl.DataFrame,
    partition_col: str,
    fields: Sequence[str] | str | None = None,
    n_samples: int | float | None = None,
    sample_mode: Literal["exact", "approximate"] = "approximate",
) -> pl.DataFrame:
    """
    Cross product of a frame's partitions, with optional row sampling.

    The frame is split by `partition_col`; every column of each partition
    gets the suffix `_<key>` (the partition column itself is dropped) and the
    partitions are cross-joined lazily.

    Parameters
    ----------
    dataset
        Input frame; it is passed through `pl.DataFrame(...)` first.
    partition_col
        Column whose distinct values define the partitions.
    fields
        Kept for backward compatibility and currently unused: it used to be
        forwarded as join keys, but a cross join takes no keys.
    n_samples
        If given, rows of the product are filtered with the mask produced by
        `sample_it` (count, or fraction when a float).
    sample_mode
        Forwarded to `sample_it`.

    Returns
    -------
    The collected cross product as a `pl.DataFrame`.
    """
    partitions = pl.DataFrame(dataset).partition_by(
        partition_col, as_dict=True, include_key=False
    )
    # Partition keys are indexed with [0], i.e. they are expected to be tuples
    # holding the single `partition_col` value.
    lazy_partitions = [
        partition.select(pl.all().name.suffix(f"_{key[0]}")).lazy()
        for key, partition in partitions.items()
    ]
    # No join keys: a cross join pairs every row with every row.
    product = multi_join(lazy_partitions, on=None, how="cross")

    # pl.LazyFrame has no efficient method of sampling,
    # the block below is a placeholder for the future
    if n_samples is not None:
        product = (
            product.with_columns(
                sample=pl.first().map_batches(
                    lambda x: sample_it(
                        x, n_samples=n_samples, sample_mode=sample_mode
                    )
                )
            )
            .filter(pl.col("sample"))
            .drop("sample")
        )

    return product.collect(streaming=True)


def prepare_reward_dataset(
    examples: dict[str, list[str]],
    tokenizer,
    verbose: bool = False,
    max_length: int = 512,
    truncation: bool = True,
) -> Dataset:
    """
    Tokenize chosen/rejected text pairs into a torch-formatted `Dataset`.

    Parameters
    ----------
    examples
        Container indexable by "chosen" and "rejected", each giving an
        equally long, integer-indexable collection of texts.
    tokenizer
        Callable invoked as `tokenizer(text, truncation=..., max_length=...)`
        whose result exposes "input_ids" and "attention_mask".
    verbose
        Wrap the pair indices in a tqdm progress bar.
    max_length, truncation
        Forwarded to every tokenizer call.

    Returns
    -------
    Dataset with the columns `input_ids_chosen`, `attention_mask_chosen`,
    `input_ids_rejected` and `attention_mask_rejected`, format set to "torch".

    Raises
    ------
    ValueError
        If "chosen" and "rejected" differ in length.
    """
    dataset_keys = [
        "input_ids_chosen",
        "attention_mask_chosen",
        "input_ids_rejected",
        "attention_mask_rejected",
    ]

    # Size the buffers by the number of pairs. `len(examples)` would be the
    # number of keys when `examples` is a dict, which is not the pair count.
    n_pairs = len(examples["chosen"])
    if len(examples["rejected"]) != n_pairs:
        raise ValueError(
            "`chosen` and `rejected` must contain the same number of texts"
        )
    new_examples = {key: [0] * n_pairs for key in dataset_keys}

    def tokenize_pair(idx):
        # Each call writes only slot `idx` of every buffer, so the worker
        # threads never touch the same element.
        tokenized_chosen = tokenizer(
            examples["chosen"][idx],
            truncation=truncation,
            max_length=max_length,
        )
        tokenized_rejected = tokenizer(
            examples["rejected"][idx],
            truncation=truncation,
            max_length=max_length,
        )

        new_examples["input_ids_chosen"][idx] = tokenized_chosen["input_ids"]
        new_examples["attention_mask_chosen"][idx] = tokenized_chosen[
            "attention_mask"
        ]
        new_examples["input_ids_rejected"][idx] = tokenized_rejected[
            "input_ids"
        ]
        new_examples["attention_mask_rejected"][idx] = tokenized_rejected[
            "attention_mask"
        ]

    pair_indices = range(n_pairs)
    if verbose:
        pair_indices = tqdm(pair_indices)

    Parallel(n_jobs=-1, prefer="threads")(
        delayed(tokenize_pair)(idx) for idx in pair_indices
    )
    dataset = Dataset.from_dict(new_examples)
    dataset.set_format(type="torch")
    return dataset


# Tokenize the sampled chosen/rejected pairs with the reward model's
# tokenizer, showing a progress bar.
reward_dataset = prepare_reward_dataset(sets, reward_tokenizer, verbose=True)

NameError: name 'Dataset' is not defined

In [48]:
import gc
from typing import Sequence, Literal, Tuple, List, Dict

import numpy as np
import pandas as pd
import polars as pl
from tqdm.notebook import tqdm, trange

from joblib import Parallel, delayed


def multi_join(
    df_list: Sequence[pl.DataFrame | pl.LazyFrame],
    on: Sequence[str | None] | str | None,
    how: pl._typing.JoinStrategy = "inner",
) -> pl.DataFrame | pl.LazyFrame:
    """
    Fold a sequence of N frames into a single frame with N-1 joins.

    Parameters
    ----------
    df_list
        Frames to join, in order. The sequence itself is not modified.
    on
        Join key(s). A list or tuple supplies one key per join and needs at
        least N-1 entries (extra entries are ignored). Anything else (a
        string, None, ...) is treated as one key reused for every join.
        Ignored when `how == "cross"`, because a cross join takes no keys.
    how
        Join strategy forwarded to every `join` call.

    Returns
    -------
    The first frame joined, in turn, with each of the remaining frames.

    Raises
    ------
    IndexError
        If `df_list` is empty.
    ValueError
        If `on` is a list/tuple with fewer than N-1 keys.
    """
    # Work on copies so neither the caller's frame list nor its key list is
    # mutated.
    frames = list(df_list)
    n_joins = len(frames) - 1

    # A bare string is itself a `Sequence`, so only list/tuple count as
    # "one key per join"; every other value is broadcast.
    if isinstance(on, (list, tuple)):
        keys = list(on)
    else:
        keys = [on] * max(n_joins, 0)

    if len(keys) < n_joins:
        raise ValueError(
            f"expected at least {n_joins} join keys, got {len(keys)}"
        )

    joined = frames[0]
    # zip stops after N-1 joins, so the i-th join uses the i-th key.
    for frame, key in zip(frames[1:], keys):
        joined = joined.join(
            frame, on=None if how == "cross" else key, how=how
        )

    return joined


def sample_it(
    s: pl.Series,
    n_samples: int | float | None,
    sample_mode: Literal["exact", "approximate"] = "exact",
) -> pl.Series:
    """
    Build a random boolean mask with one entry per element of `s`.

    Custom replacement for `pl.LazyFrame.sample`, using either a shuffle or
    binomial sampling technique.

    Parameters
    ----------
    s
        Series whose length determines the length of the mask; its values
        are not read.
    n_samples
        Number of True entries to aim for. A float is interpreted as a
        fraction of `s.len()`, None means "keep everything". The resulting
        count is clamped to the range [0, s.len()].
    sample_mode
        "exact" shuffles a mask holding exactly `n_samples` True values;
        "approximate" draws every entry independently with probability
        `n_samples / s.len()`.

    Returns
    -------
    Boolean `pl.Series` of length `s.len()`.

    Raises
    ------
    ValueError
        If `sample_mode` is not one of the two supported values.
    """
    n_rows = s.len()

    if isinstance(n_samples, float):
        n_samples = int(n_rows * n_samples)
    elif n_samples is None:
        n_samples = n_rows
    # Asking for more rows than exist (or a negative count) would otherwise
    # crash inside numpy.
    n_samples = min(max(n_samples, 0), n_rows)

    if sample_mode == "exact":
        mask = np.random.permutation(
            np.hstack([np.ones(n_samples), np.zeros(n_rows - n_samples)])
        )
    elif sample_mode == "approximate":
        # Guard the empty series: `n_samples / 0` would raise.
        keep_probability = n_samples / n_rows if n_rows else 0.0
        mask = np.random.binomial(1, keep_probability, n_rows)
    else:
        raise ValueError(
            f"unknown sample_mode {sample_mode!r}; "
            "expected 'exact' or 'approximate'"
        )

    # Hand polars a real boolean array instead of relying on a numeric cast.
    return pl.Series(
        values=mask.astype(bool),
        dtype=pl.Boolean,
    )


def df_self_product(
    dataset: pd.DataFrame | pl.DataFrame,
    partition_col: str,
    fields: Sequence[str] | str | None = None,
    n_samples: int | float | None = None,
    sample_mode: Literal["exact", "approximate"] = "approximate",
) -> pl.DataFrame:
    """
    Cross product of a frame's partitions, with optional row sampling.

    The frame is split by `partition_col`; every column of each partition
    gets the suffix `_<key>` (the partition column itself is dropped) and the
    partitions are cross-joined lazily.

    Parameters
    ----------
    dataset
        Input frame; it is passed through `pl.DataFrame(...)` first.
    partition_col
        Column whose distinct values define the partitions.
    fields
        Kept for backward compatibility and currently unused: it used to be
        forwarded as join keys, but a cross join takes no keys.
    n_samples
        If given, rows of the product are filtered with the mask produced by
        `sample_it` (count, or fraction when a float).
    sample_mode
        Forwarded to `sample_it`.

    Returns
    -------
    The collected cross product as a `pl.DataFrame`.
    """
    partitions = pl.DataFrame(dataset).partition_by(
        partition_col, as_dict=True, include_key=False
    )
    # Partition keys are indexed with [0], i.e. they are expected to be tuples
    # holding the single `partition_col` value.
    lazy_partitions = [
        partition.select(pl.all().name.suffix(f"_{key[0]}")).lazy()
        for key, partition in partitions.items()
    ]
    # No join keys: a cross join pairs every row with every row.
    product = multi_join(lazy_partitions, on=None, how="cross")

    # pl.LazyFrame has no efficient method of sampling,
    # the block below is a placeholder for the future
    if n_samples is not None:
        product = (
            product.with_columns(
                sample=pl.first().map_batches(
                    lambda x: sample_it(
                        x, n_samples=n_samples, sample_mode=sample_mode
                    )
                )
            )
            .filter(pl.col("sample"))
            .drop("sample")
        )

    return product.collect(streaming=True)


def prepare_reward_dataset(
    examples: dict[str, list[str]],
    tokenizer,
    max_length: int = 512,
    truncation: bool = True,
    verbose: bool = True,
) -> Dataset:
    """
    Tokenize every text column of `examples` into a torch-formatted `Dataset`.

    Each key of `examples` is tokenized in one batched call and every field
    of the tokenizer output is stored as `<field>_<key>` (for instance
    `input_ids_chosen`).

    Parameters
    ----------
    examples
        Mapping from column name to the list of texts in that column.
    tokenizer
        Callable invoked as `tokenizer(text=..., truncation=...,
        max_length=..., padding=True, return_tensors="pt")`.
    max_length, truncation
        Forwarded to the tokenizer.
    verbose
        Enables (True) or disables (False) loguru messages from `__main__`.
        The switch is global and stays in effect after the function returns.

    Returns
    -------
    Dataset holding the tokenized columns, format set to "torch".
    """
    # Imported here so the function also works on a fresh kernel: the
    # notebook only imports loguru in a later cell.
    from loguru import logger

    if verbose:
        logger.enable("__main__")
    else:
        logger.disable("__main__")

    token_kwargs = dict(
        truncation=truncation,
        max_length=max_length,
        padding=True,
        return_tensors="pt",
    )
    new_examples = dict()

    for column, texts in examples.items():
        logger.info(f"Starting tokenizing `{column}`")
        tokenized = tokenizer(text=texts, **token_kwargs)
        # Suffix each tokenizer field with the column it came from.
        new_examples.update(
            {field + "_" + column: value for field, value in tokenized.items()}
        )

    dataset = Dataset.from_dict(new_examples)
    dataset.set_format(type="torch")
    return dataset


# Hand the pairs over as a plain dict of lists (one entry per column of
# `sets`); the function tokenizes every key it receives.
reward_dataset = prepare_reward_dataset(
    sets.to_dict(as_series=False), reward_tokenizer, verbose=True
)

[32m2024-08-03 13:12:22.204[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_reward_dataset[0m:[36m120[0m - [1mStarting tokenizing `chosen`[0m
[32m2024-08-03 13:12:26.350[0m | [1mINFO    [0m | [36m__main__[0m:[36mprepare_reward_dataset[0m:[36m120[0m - [1mStarting tokenizing `rejected`[0m


In [37]:
from loguru import logger

In [52]:
# Run `get_reward` over the whole training frame, converted to a dict of
# lists, with `max_length=10`.
# NOTE(review): `get_reward` is not defined or imported in any cell shown
# here, so this cell cannot run on a fresh kernel as the notebook stands.
what = get_reward(
    pl.DataFrame(train).to_dict(as_series=False),
    reward_tokenizer,
    max_length=10,
)

[32m2024-08-03 13:12:53.920[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_reward[0m:[36m10[0m - [1mStarting tokenizing `text`[0m
[32m2024-08-03 13:12:57.020[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_reward[0m:[36m20[0m - [1mStarting reward estimation of `text`[0m
[32m2024-08-03 13:13:27.871[0m | [1mINFO    [0m | [36m__main__[0m:[36mget_reward[0m:[36m28[0m - [1mEstimating finished
[0m


In [None]:
# Inference only: skip autograd bookkeeping, and move the inputs to the device
# `reward_model` was placed on (tensors built with `torch.tensor` start on the
# CPU, which fails against a model on the GPU).
with torch.no_grad():
    reward_outputs = reward_model(
        input_ids=torch.tensor(what["input_ids"]).squeeze(1).to(device),
        attention_mask=torch.tensor(what["attention_mask"])
        .squeeze(1)
        .to(device),
    )
reward_outputs

In [None]:
# Scratch check: the input ids as a tensor with dimension 1 squeezed out
# (squeeze only removes it when it has size 1).
torch.tensor(what["input_ids"]).squeeze(1)

In [None]:
# Scratch check: look at the entry at index 100 of the `get_reward` result.
what[100]

In [None]:
from datasets import load_from_disk
import os

# Reuse the train/test split cached on disk when there is one; otherwise
# split the in-memory dataset 80/20 and cache the result for the next run.
if os.path.exists("../artifacts/reward_dataset.hf"):
    reward_dataset = load_from_disk("../artifacts/reward_dataset.hf")
else:
    reward_dataset = reward_dataset.train_test_split(test_size=0.2)
    reward_dataset.save_to_disk("../artifacts/reward_dataset.hf")

In [None]:
# Split only while `reward_dataset` is still a single `Dataset`. The caching
# cell may already have produced or loaded a train/test `DatasetDict`, which
# cannot be split again, and re-running this cell must not fail either.
if isinstance(reward_dataset, Dataset):
    reward_dataset = reward_dataset.train_test_split(test_size=0.2)
reward_dataset

In [None]:
def get_test_prompts(dataset, n_samples: int = 10):
    """
    Draw random prompts from the label-1 training reviews.

    A prompt is the text of a review up to (not including) its first ".".

    Parameters
    ----------
    dataset
        Mapping with a "train" entry that iterates over rows exposing
        "text" and "label".
    n_samples
        Number of prompts to draw (with replacement, via the global numpy
        random state). The default of 10 is arbitrary.

    Returns
    -------
    numpy array holding `n_samples` prompt strings.

    Raises
    ------
    ValueError
        If the training split contains no row with label 1.
    """
    imdb_prompts = [
        row["text"].split(".")[0]
        for row in dataset["train"]
        if row["label"] == 1
    ]
    if not imdb_prompts:
        raise ValueError("no rows with label 1 to draw prompts from")
    return np.random.choice(imdb_prompts, n_samples)

In [None]:
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardTrainer, RewardConfig
from omegaconf import OmegaConf

import os

# Convert the loaded OmegaConf nodes into plain dicts/lists before handing
# them to the config classes, so those classes receive builtin containers
# rather than OmegaConf node objects.
peft_params = OmegaConf.load("../scripts/configs/peft_reward.yaml")
peft_config = LoraConfig(**OmegaConf.to_container(peft_params, resolve=True))

reward_trainer_params = OmegaConf.load("../scripts/configs/config_reward_trainer.yaml")
reward_config = RewardConfig(
    **OmegaConf.to_container(reward_trainer_params, resolve=True)
)

# NOTE(review): `peft_config` used to be built but never used; it is now
# passed to the trainer. Drop the argument if full fine-tuning of the reward
# model was intended.
trainer = RewardTrainer(
    model=reward_model,
    args=reward_config,
    tokenizer=reward_tokenizer,
    train_dataset=reward_dataset["train"],
    eval_dataset=reward_dataset["test"],
    peft_config=peft_config,
)

trainer.train()

In [None]:
from omegaconf import OmegaConf

# Load the reward configuration file.
# NOTE(review): this is a different file from the two loaded in the trainer
# cell (`peft_reward.yaml`, `config_reward_trainer.yaml`) — confirm which one
# is current.
conf = OmegaConf.load("../scripts/configs/reward_config.yaml")

In [None]:
# Display the loaded configuration as plain Python containers.
OmegaConf.to_container(conf)