In [3]:
!pip install trl
!pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [9]:
from datasets import load_dataset
import torch
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, AutoModelForSeq2SeqLM, pipeline
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import DPOTrainer, create_reference_model
from numpy import percentile

In [3]:
# Load the preference data from CSV; the (rows, columns) shape is shown as the cell output.
df = pd.read_csv('small_Rlhf.csv')
df.shape  # last expression of the cell, so it is displayed
#df.head(2)
#df = df.sample(100)

(1000, 3)

In [4]:
# Wrap the pandas frame in a `datasets.Dataset` so it can be split, filtered and fed to the trainer.
dataset = Dataset.from_pandas(df)

In [5]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 1000
})

In [6]:
# Peek at the first prompt to see the raw input format.
dataset['prompt'][0]

"SUBREDDIT: r/relationship_advice\nTITLE: [19/M] My Girlfriend [20/F] Thinks I'm Inexperienced and Thinks I Deserve More.\nPOST: Throwaway because I have a few friends that know my account and I don't want the wrong idea getting out.\n\nI've been with my girlfriend for 2.5 years and she is my first serious girlfriend, I've had a lot of my romantic firsts with her. Her on the other hand, she has been with about 5 guys before me.\n\nThroughout our relationship she has told me that she's afraid she's holding me back from experiencing college and a life I'll wish I had when I'm out of college. She says that she already had the experiences she wanted and that she would wait for me if I wanted to get everything out of my system, but I've told her on multiple occasions that she's all I want and need.\n\nRecently (about two weeks ago) the conversation came up again and I brushed it off like I normally do. The problem is that since bringing it up again I have had it stuck in my head and am star

In [7]:
# First entry of the 'chosen' column for the same row.
dataset['chosen'][0]

"TL;DR: Girlfriend thinks she's holding me back from experiencing life because she was my first."

In [8]:
# First entry of the 'rejected' column for the same row.
dataset['rejected'][0]

"TL;DR: Girlfriend wants me to experience college without her, I don't know if it's fair of me to ask her to leave her life and wait for me."

In [9]:
# Positional (unshuffled) split: the leading 80% of rows train the model,
# the trailing 20% are kept for evaluation.
split_ratio = 0.8
num_train_samples = int(split_ratio * len(dataset))
row_indices = range(len(dataset))
train_dataset = dataset.select(row_indices[:num_train_samples])
eval_dataset = dataset.select(row_indices[num_train_samples:])

In [10]:
# Display both splits to confirm their features and row counts.
train_dataset, eval_dataset

(Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 800
 }),
 Dataset({
     features: ['prompt', 'chosen', 'rejected'],
     num_rows: 200
 }))

In [15]:
# Load the seq2seq checkpoint that will be fine-tuned, and the tokenizer
# that belongs to the same checkpoint id.
DPO_model_id = "pszemraj/led-base-book-summary"
DPO_model = AutoModelForSeq2SeqLM.from_pretrained(DPO_model_id)
DPO_tokenizer = AutoTokenizer.from_pretrained(DPO_model_id)

In [12]:
# Build a reference model from the current `DPO_model` using TRL's helper.
# NOTE(review): presumably a frozen copy of the weights at this point — confirm against
# the installed TRL version's `create_reference_model` documentation.
ref_model = create_reference_model(DPO_model)

In [13]:
# Token-length statistics over the training split. The 100th percentile is taken,
# i.e. the largest observed length in each list.
prompt_token_counts = [len(DPO_tokenizer(text)["input_ids"]) for text in train_dataset["prompt"]]
chosen_token_counts = [
    len(DPO_tokenizer(row["prompt"] + row["chosen"])["input_ids"]) for row in train_dataset
]
rejected_token_counts = [
    len(DPO_tokenizer(row["prompt"] + row["rejected"])["input_ids"]) for row in train_dataset
]

prompt_length = int(percentile(prompt_token_counts, 100))
max_seq_length_chosen = int(percentile(chosen_token_counts, 100))
max_seq_length_rejected = int(percentile(rejected_token_counts, 100))
# Overall budget: the longer of the two prompt+completion maxima.
max_seq_length = max(max_seq_length_chosen, max_seq_length_rejected)
prompt_length, max_seq_length_chosen, max_seq_length_rejected, max_seq_length

(506, 563, 561, 563)

In [14]:
# Filter both splits to remove samples that are too long.
# `max_seq_length` was derived from prompt+chosen AND prompt+rejected, so both
# concatenations are checked here; measuring only the chosen side would let a pair
# with an over-long rejected completion through (relevant for the eval split, whose
# lengths were not part of the statistics).
def _within_max_seq_length(example):
    """Return True when prompt+chosen and prompt+rejected each fit in `max_seq_length` tokens."""
    return all(
        len(DPO_tokenizer(example["prompt"] + example[completion])["input_ids"]) <= max_seq_length
        for completion in ("chosen", "rejected")
    )


train_dataset = train_dataset.filter(_within_max_seq_length)
eval_dataset = eval_dataset.filter(_within_max_seq_length)
print(f"len(train_dataset): {len(train_dataset)}")
print(f"len(eval_dataset): {len(eval_dataset)}")

Filter:   0%|          | 0/800 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

len(train_dataset): 800
len(eval_dataset): 200


In [15]:
# Round both lengths up to the nearest even number:
# an odd value gains 1, an even value is left unchanged.
prompt_length += prompt_length % 2
max_seq_length += max_seq_length % 2
print(f"p100 prompt length: {prompt_length}")
print(f"p100 prompt + chosen length: {max_seq_length}")

p100 prompt length: 506
p100 prompt + chosen length: 564


In [22]:
# # LoRA config based on QLoRA paper & Sebastian Raschka experiment
# peft_config = LoraConfig(
#         lora_alpha=32,
#         lora_dropout=0.05,
#         r=8,
#         bias="none",
#         #target_modules="all-linear",
#         task_type="SEQ_2_SEQ_LM"
# )

In [23]:
#from transformers import TrainingArguments

# Trainer configuration for the DPO run.
# Checkpointing is aligned with evaluation (once per epoch). Saving on every optimizer
# step would write a full model checkpoint hundreds of times over the run, and a
# step-based `eval_steps` has no effect while the evaluation strategy is "epoch".
args = TrainingArguments(
    output_dir="dpo_summarize",             # directory for checkpoints and the final model
    num_train_epochs=30,                    # number of training epochs
    per_device_train_batch_size=12,         # batch size per device during training
    per_device_eval_batch_size=4,           # batch size per device during evaluation
    gradient_accumulation_steps=3,          # micro-batches accumulated before each optimizer update
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    #optim="adamw_torch_fused",              # use fused adamw optimizer
    learning_rate=5e-5,                     # peak learning rate
    max_grad_norm=0.3,                      # gradient clipping threshold
    warmup_ratio=0.1,                       # fraction of steps used for LR warmup
    lr_scheduler_type="cosine",             # use cosine learning rate scheduler
    logging_steps=1,                        # log every optimizer step
    save_strategy="epoch",                  # checkpoint once per epoch (must match evaluation_strategy)
    save_total_limit=2,                     # limit the total amount of checkpoints kept on disk
    evaluation_strategy="epoch",            # evaluate at the end of every epoch
    load_best_model_at_end=True,            # restore the checkpoint with the best eval metric after training
    metric_for_best_model="eval_loss",      # select the best checkpoint by validation loss ...
    greater_is_better=False,                # ... where lower is better
    #bf16=True,                              # use bfloat16 precision
    #tf32=True,                              # use tf32 precision
    push_to_hub=False,                      # do not push the model to the hub
    #report_to="wandb",                      # report metrics to Weights & Biases
)

# DPO-specific hyperparameters, passed to DPOTrainer separately from TrainingArguments.
dpo_args = {
    "beta": 0.1,                            # The beta factor in DPO loss. Higher beta means less divergence
    "loss_type": "sigmoid"                  # The loss type for DPO.
}


In [24]:
# Pin the two length budgets to fixed values (max prompt tokens, max total tokens)
# so the trainer cell can run without re-deriving them.
prompt_length, max_seq_length = 506, 564

In [26]:
# Build the DPO trainer (`DPOTrainer` is imported in the notebook's import cell).
# No PEFT adapter is configured (`peft_config` is commented out), so a separate
# reference model is required. The one created earlier with
# `create_reference_model(DPO_model)` is passed explicitly so the trainer does not
# have to build yet another copy of the model in memory.
trainer = DPOTrainer(
    model=DPO_model,                      # policy model that gets updated
    ref_model=ref_model,                  # reference model for the DPO loss
    #peft_config=peft_config,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=DPO_tokenizer,
    max_length=max_seq_length,            # budget for prompt + completion tokens
    max_prompt_length=prompt_length,      # budget for prompt tokens
    beta=dpo_args["beta"],
    loss_type=dpo_args["loss_type"],
)



Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [27]:
# Run training; the returned TrainOutput (global step, loss, runtime metrics) is the cell output.
trainer.train()

Input ids are automatically padded from 506 to 1024 to be a multiple of `config.attention_window`: 1024
Could not estimate the number of tokens of the input, floating-point operations will not be computed
Input ids are automatically padded from 495 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 499 to 1024 to be a multiple of `config.attention_window`: 1024


Epoch,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/rejected,Logps/chosen,Logits/rejected,Logits/chosen
0,0.936,0.776837,0.145371,0.004046,0.525,0.141324,-131.05545,-131.561462,-4.390668,-4.538873
1,0.5085,0.925029,1.010943,0.816193,0.515,0.19475,-122.933983,-122.905762,-4.453427,-4.578654
3,0.116,1.245865,-0.49429,-0.90017,0.53,0.40588,-140.09761,-137.958084,-3.883238,-4.036222
4,0.1378,1.245979,-1.484988,-2.223435,0.575,0.738448,-153.330276,-147.865067,-4.735804,-4.843361
6,0.2581,1.715216,-2.513293,-2.888822,0.5,0.375528,-159.984116,-158.148117,-4.217726,-4.307676
7,0.064,1.486289,-2.640747,-3.237808,0.545,0.597062,-163.473999,-159.422653,-4.244321,-4.332305
9,0.1779,1.383885,-1.428633,-1.857887,0.53,0.429254,-149.674789,-147.301514,-4.513829,-4.570256
10,0.0038,1.565322,-1.271551,-1.892981,0.56,0.621431,-150.025726,-145.730682,-4.129722,-4.175605
12,0.0038,1.675921,-1.711895,-2.335452,0.535,0.623558,-154.450439,-150.134125,-3.843406,-3.904327
13,0.2646,1.675812,-1.321376,-2.014751,0.58,0.693374,-151.243423,-146.228943,-3.906543,-3.975127


Non-default generation parameters: {'max_length': 1024, 'min_length': 8, 'early_stopping': True, 'num_beams': 4, 'repetition_penalty': 3.5, 'length_penalty': 0.8, 'no_repeat_ngram_size': 3}
Input ids are automatically padded from 476 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 505 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 483 to 1024 to be a multiple of `config.attention_window`: 1024
Non-default generation parameters: {'max_length': 1024, 'min_length': 8, 'early_stopping': True, 'num_beams': 4, 'repetition_penalty': 3.5, 'length_penalty': 0.8, 'no_repeat_ngram_size': 3}
Input ids are automatically padded from 486 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 488 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 484 to 1024 to be a multiple of `config.attention_window`: 1

TrainOutput(global_step=660, training_loss=0.11249273650516416, metrics={'train_runtime': 27985.745, 'train_samples_per_second': 0.858, 'train_steps_per_second': 0.024, 'total_flos': 0.0, 'train_loss': 0.11249273650516416, 'epoch': 29.55})

In [28]:
# Persist the trained model through the trainer. No path is given here;
# presumably it is written to `args.output_dir` ("dpo_summarize"), which is the
# directory the reload cell reads from — confirm.
trainer.save_model()

Non-default generation parameters: {'max_length': 1024, 'min_length': 8, 'early_stopping': True, 'num_beams': 4, 'repetition_penalty': 3.5, 'length_penalty': 0.8, 'no_repeat_ngram_size': 3}


In [29]:
# Write the tokenizer files to a standalone 'DPO_tokenizer' directory.
# NOTE(review): the reload cell later loads the tokenizer from "dpo_summarize",
# not from this directory — check whether this extra copy is still needed.
DPO_tokenizer.save_pretrained('DPO_tokenizer')

('DPO_tokenizer/tokenizer_config.json',
 'DPO_tokenizer/special_tokens_map.json',
 'DPO_tokenizer/vocab.json',
 'DPO_tokenizer/merges.txt',
 'DPO_tokenizer/added_tokens.json',
 'DPO_tokenizer/tokenizer.json')

In [5]:
# Reload the fine-tuned model and its tokenizer from the local "dpo_summarize" directory.
# Note that this rebinds `DPO_model_id` and `DPO_tokenizer`, which previously referred
# to the original hub checkpoint.
DPO_model_id = "dpo_summarize"
DPO_model_trained = AutoModelForSeq2SeqLM.from_pretrained(DPO_model_id)
DPO_tokenizer = AutoTokenizer.from_pretrained(DPO_model_id)

In [6]:
# Display the reloaded tokenizer's configuration (class, vocab size, special tokens).
DPO_tokenizer

LEDTokenizerFast(name_or_path='dpo_summarize', vocab_size=50265, model_max_length=16384, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}

In [33]:
from trl.core import LengthSampler

# Keyword arguments intended for `model.generate`: sampling is switched on, with
# temperature, top-k and top-p given explicitly and a minimum output length.
generation_kwargs = dict(
    temperature=1.0,
    min_length=5,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
)

# Lower and upper bounds handed to the LengthSampler used for choosing output lengths.
output_min_length, output_max_length = 100, 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

In [42]:
# Select the CUDA device when one is available, otherwise fall back to the CPU,
# then ask PyTorch to release its cached GPU memory.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
torch.cuda.empty_cache()

In [51]:
# def generate_summary(prompt: str, model, tokenizer, generation_kwargs, output_length_sampler) -> str:
#     """
#     Generate a summary for a given prompt using a trained policy model.
    
#     Args:
#     - prompt (str): The input text for which a summary needs to be generated.
#     - model: The trained policy model.
#     - tokenizer: The tokenizer used for the policy model.
#     - generation_kwargs (dict): Arguments used for response generation.
#     - output_length_sampler (func): Function to sample the length of the output.

#     Returns:
#     - str: Generated summary.
#     """

#     # Tokenize the prompt
#     prompt_tensor = tokenizer.encode(prompt, return_tensors='pt')
    
#     # Ensure it's only one tensor and check its shape
#     assert prompt_tensor.dim() == 2, f"Unexpected tensor shape: {prompt_tensor.shape}"
    
#     # Set the generation arguments
#     max_new_tokens = output_length_sampler()
#     generation_kwargs["max_new_tokens"] = max_new_tokens
    
#     # Generate a summary
#     summary_tensor = model.generate(input_ids=prompt_tensor, **generation_kwargs)
    
#     # Decode and return the summary
#     summary = tokenizer.decode(summary_tensor[0], skip_special_tokens=True)
#     return summary

In [7]:
# Load the prompts used for the qualitative comparison and display the frame
# (the output shows 'prompt' and 'label' columns plus a leftover 'Unnamed: 0' index column).
testing_sample = pd.read_csv('testing_samples.csv')
testing_sample

Unnamed: 0,prompt,label
0,SUBREDDIT: r/AskReddit\nTITLE: I need your hel...,American Family Insurance is screwing me with ...
1,SUBREDDIT: r/relationships\nTITLE: My boyfrien...,Boyfriend of 3 years started a business withou...
2,SUBREDDIT: r/AskReddit\nTITLE: Can someone hel...,Grandpa had a light bulb he could light up by ...
3,SUBREDDIT: r/travel\nTITLE: If I don't do this...,"I'm an American, bored with my career, wanting..."
4,SUBREDDIT: r/tifu\nTITLE: TIFU By Showing My H...,Made a bet with teacher to watch Vader vs Hitl...
5,SUBREDDIT: r/dating_advice\nTITLE: Should I [1...,Have a bit of a crush on a guy who I see every...
6,SUBREDDIT: r/relationship_advice\nTITLE: When ...,"If we both know we like each other, and have r..."
7,SUBREDDIT: r/relationships\nTITLE: I [18 M] ha...,Interested in a girl i sit with next to in cla...
8,SUBREDDIT: r/Advice\nTITLE: Freaking out about...,Freaking out about college being too much and ...
9,SUBREDDIT: r/personalfinance\nTITLE: 25 y/o lo...,forces out of home. I have $400 and $6000 debt...


In [13]:
# Inspect the first test prompt in full.
testing_sample['prompt'][0]

"SUBREDDIT: r/AskReddit\nTITLE: I need your help Reddit. I'm getting screwed by American Family Insurance.\nPOST: I need your help Reddit.\n\nI was recently in a car accident where the other driver was at fault. They turned through a stopped lane of traffic turning into a parking lot and hit me in the second lane. Their insurance agency accepted fault for the accident. They arranged for me to get a rental car and are paying for the auto body repair at the shop my dealer recommended. \n\nNow here is where I ran into a problem. \n\nI have limited liability insurance on my car because I worked like a boss and saved my money up and paid for it out right so I would not go into debt. I took the risk of not covering the cost of damage to my car for when I am at fault because I have a clean record of driving and am willing to replace my car with a Junker if I wreck it. The amount I have saved not covering my car is enough to buy a replacement car. Anyways, when I picked up the rental car that 

In [11]:
# Summarize every test prompt with the fine-tuned checkpoint, with sampling enabled.
# Each list entry is the raw pipeline result for one prompt.
pipe = pipeline("summarization", model=DPO_model_trained, tokenizer=DPO_tokenizer)
sampling_options = dict(temperature=1.0, min_length=5, top_k=0.0, top_p=1.0, do_sample=True, max_length=150)
DPO_model_summary = [pipe(prompt_text, **sampling_options) for prompt_text in testing_sample['prompt']]

In [12]:
# Display the summaries generated by the fine-tuned model.
DPO_model_summary

[[{'summary_text': 'I was recently in a car accident where the other driver was at fault because I have a clean record of driving and am willing to replace my car with a Junker if I wreck it. I contacted the at fault insurance company and explained that this expense is a result of the accident that their client caused but they do not have to cover. My car will not be out of the shop for another 2 weeks and I will end up getting stuck with a $300 and something bill for the insurance. Please tell me if I have any options.'}],
 [{'summary_text': 'My boyfriend of 3 years keeps making huge decisions without communicating me any info about his car at all. How do I approach this problem with him? He keeps promising to keep me informed and keeps failing to tell me that he was going to lunch with anyone today. Is this normal?'}],
 [{'summary_text': "If anyone knows where I find a lightbulb that my grandpa would put into his mouth and somehow light it up. He wanted them touch it and look at it, 

In [16]:
# Baseline: summarize the same prompts with the ORIGINAL, un-tuned checkpoint.
# `DPO_model` is the very object that was handed to the trainer, so once training has
# run in this kernel it holds fine-tuned weights. Reload the hub checkpoint to get a
# genuine baseline (the id is spelled out because `DPO_model_id` is rebound to the
# local "dpo_summarize" directory earlier in the notebook).
base_model = AutoModelForSeq2SeqLM.from_pretrained("pszemraj/led-base-book-summary")
pipe = pipeline("summarization", model=base_model, tokenizer=DPO_tokenizer)
Base_model_summary = []
for i in testing_sample['prompt']:
    # Same generation settings as the fine-tuned run so the outputs are comparable.
    output = pipe(i,temperature =  1.0, min_length = 5, top_k = 0.0, top_p = 1.0, do_sample = True, max_length=150)
    Base_model_summary.append(output)

In [17]:
# Display the summaries generated by the baseline model.
Base_model_summary

[[{'summary_text': 'Redditor r/AskReddit explains how he\'s been hit by American Family Insurance. He has limited liability insurance, meaning that he doesn\'t have to pay for damage to his car when the other driver is at fault. However, because of this, he\'s now stuck with a $300 and something bill due to the insurance company. Reddit also tells us that there\'s one redditor who asks Reddit if they can help him figure out how their insurance company pays for "rental insurance" when the client isn\'t supposed to be paying for it. The redditor says yes, but not until he figures out what the heck his insurance company does.'}],
 [{'summary_text': 'A Reddit user has been asking a question: "Why is my boyfriend [27 M] of 3 years keeps making huge decisions without communicating with me [23 F] at all, is this normal?" This question relates to a relationship between a graduate student and his business partner. The couple works as a consultant for a consulting firm, but the college senior re

In [18]:
# Attach both sets of generated summaries to the test prompts and save the comparison.
testing_sample['base_model'] = Base_model_summary
testing_sample['Dpo'] = DPO_model_summary
DPO_result = testing_sample
# index=False: the frame already carries a previously written index as the
# 'Unnamed: 0' column; writing the index again would add one more unnamed column
# every time the file is saved and re-read.
DPO_result.to_csv('DPO_result_new.csv', index=False)

In [46]:
# Base_model_summary = []
# for i in testing_sample['prompt']:
#     prompt = "summarize: " + i
#     generated_summary = generate_summary(prompt, DPO_model, DPO_tokenizer, generation_kwargs, output_length_sampler)
#     Base_model_summary.append(generated_summary)

Input ids are automatically padded from 372 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 194 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 251 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 259 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 221 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 361 to 1024 to be a multiple of `config.attention_window`: 1024
Input ids are automatically padded from 278 to 1024 to be a multiple of `config.attention_window`: 1024


In [53]:
# DPO_model_summary = []
# for i in testing_sample['prompt']:
#     prompt = "summarize: " + i
#     generated_summary = generate_summary(prompt, DPO_model_trained, DPO_tokenizer, generation_kwargs, output_length_sampler)
#     DPO_model_summary.append(generated_summary)