In [4]:
!pip install dataset



In [7]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [8]:
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset

# Load the dataset
dataset = load_dataset("OpenAssistant/oasst1")

# Filter English messages
english_data = dataset['train'].filter(lambda x: x['lang'] == 'en')
print(f"✅ English Dataset Loaded: {len(english_data)} samples")

# Extract prompt-response pairs
pairs = []
for conversation in english_data:
    if isinstance(conversation, dict) and 'messages' in conversation:
        messages = conversation['messages']
        for i in range(len(messages) - 1):
            if messages[i]['role'] == 'user' and messages[i + 1]['role'] == 'assistant':
                pairs.append({
                    'prompt': messages[i]['text'],
                    'response': messages[i + 1]['text'],
                    'quality': messages[i + 1].get('quality', 0)  # Default quality if missing
                })

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Example: GPT-2 model
tokenizer.pad_token = tokenizer.eos_token  # Fix padding issue

# Tokenize dataset
tokenized_pairs = []
for pair in pairs:
    tokenized_input = tokenizer(pair['prompt'], truncation=True, padding='max_length', max_length=128)
    tokenized_output = tokenizer(pair['response'], truncation=True, padding='max_length', max_length=128)
    tokenized_pairs.append({
        'input_ids': tokenized_input['input_ids'],
        'attention_mask': tokenized_input['attention_mask'],
        'labels': tokenized_output['input_ids'],
        'quality': pair['quality']
    })

# Create a PyTorch dataset
class PreferenceDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            'input_ids': torch.tensor(item['input_ids']),
            'attention_mask': torch.tensor(item['attention_mask']),
            'labels': torch.tensor(item['labels']),
            'quality': torch.tensor(item['quality'])
        }

# Initialize dataset
train_dataset = PreferenceDataset(tokenized_pairs)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/10.2k [00:00<?, ?B/s]

(…)-00000-of-00001-b42a775f407cee45.parquet:   0%|          | 0.00/39.5M [00:00<?, ?B/s]

(…)-00000-of-00001-134b8fd0c89408b6.parquet:   0%|          | 0.00/2.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/84437 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4401 [00:00<?, ? examples/s]

Filter:   0%|          | 0/84437 [00:00<?, ? examples/s]

✅ English Dataset Loaded: 39283 samples


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [11]:
import torch

def dpo_loss(policy_model, ref_model, batch, beta, device):
    """
    Calculates the Direct Preference Optimization (DPO) loss.
    """

    batch = {k: v.to(device) for k, v in batch.items()}

    chosen_input_ids = batch["input_ids"]
    chosen_attention_mask = batch["attention_mask"]
    rejected_input_ids = batch["input_ids"]  # Using the same input_ids as chosen for now
    rejected_attention_mask = batch["attention_mask"]  # Using the same attention_mask as chosen for now

    with torch.no_grad():
        ref_logits = ref_model(input_ids=chosen_input_ids, attention_mask=chosen_attention_mask).logits[:, -1, :]

    policy_logits = policy_model(input_ids=chosen_input_ids, attention_mask=chosen_attention_mask).logits[:, -1, :]
    chosen_log_probs = torch.log_softmax(policy_logits, dim=-1)
    chosen_probs = torch.exp(chosen_log_probs)  # Calculate probabilities from log_probs

    with torch.no_grad():
        ref_log_probs = torch.log_softmax(ref_logits, dim=-1)

    # Compute the DPO loss
    loss = -torch.sum(chosen_probs * (chosen_log_probs - ref_log_probs), dim=-1).mean()

    return loss

In [16]:
!pip install datasets transformers torch peft accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.m

In [17]:
from datasets import load_dataset
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset

# ✅ Load the dataset
dataset = load_dataset("OpenAssistant/oasst1")

# ✅ Filter English messages
english_data = dataset['train'].filter(lambda x: x['lang'] == 'en')
print(f"✅ English Dataset Loaded: {len(english_data)} samples")

✅ English Dataset Loaded: 39283 samples


In [18]:
# ✅ Extract prompt-response pairs
pairs = []
for conversation in english_data:
    if isinstance(conversation, dict) and 'messages' in conversation:
        messages = conversation['messages']
        for i in range(len(messages) - 1):
            if messages[i]['role'] == 'user' and messages[i + 1]['role'] == 'assistant':
                pairs.append({
                    'prompt': messages[i]['text'],
                    'response': messages[i + 1]['text'],
                    'quality': messages[i + 1].get('quality', 0)  # Default quality if missing
                })
# ✅ Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Example: GPT-2 model
tokenizer.pad_token = tokenizer.eos_token  # Fix padding issue

# ✅ Tokenize dataset
tokenized_pairs = []
for pair in pairs:
    tokenized_input = tokenizer(pair['prompt'], truncation=True, padding='max_length', max_length=128)
    tokenized_output = tokenizer(pair['response'], truncation=True, padding='max_length', max_length=128)
    tokenized_pairs.append({
        'input_ids': tokenized_input['input_ids'],
        'attention_mask': tokenized_input['attention_mask'],
        'labels': tokenized_output['input_ids'],
        'quality': pair['quality']
    })
# ✅ Create a PyTorch dataset
class PreferenceDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        return {
            'input_ids': torch.tensor(item['input_ids']),
            'attention_mask': torch.tensor(item['attention_mask']),
            'labels': torch.tensor(item['labels']),
            'quality': torch.tensor(item['quality'])
        }

# ✅ Initialize dataset
train_dataset = PreferenceDataset(tokenized_pairs)

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset

# ✅ Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.cuda.empty_cache()  # Free CUDA memory
print(f"🔹 Using device: {device}")

🔹 Using device: cuda


In [20]:
# ✅ Load dataset
dataset = load_dataset("openai/summarize_from_feedback", "axis", split="test")  # Use test split
print(f"✅ Dataset Loaded: {len(dataset)} samples")

README.md:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

summarize_from_feedback.py:   0%|          | 0.00/9.38k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/2.53M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/2.07M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/6312 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8585 [00:00<?, ? examples/s]

✅ Dataset Loaded: 6312 samples


In [21]:
# ✅ Load a pre-trained Transformer model (GPT-2 for reward modeling)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Fix the tokenizer padding issue (GPT models don’t have a default pad token)
tokenizer.pad_token = tokenizer.eos_token  # Use EOS as padding token
reward_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
reward_model.config.pad_token_id = tokenizer.pad_token_id  # Ensure model uses pad_token_id

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
# ✅ Preprocessing function
def preprocess_data(example):
    """Processes dataset to extract chosen and rejected responses for reward training."""
    return {
        "prompt": str(example.get("info", {}).get("post", "")),  # Convert to string
        "chosen": str(example.get("summary", "")),  # Preferred summary
        "rejected": str(example.get("summary", ""))  # Duplicate since dataset only has 'summary'
    }

# ✅ Apply preprocessing safely
dataset = dataset.map(preprocess_data, remove_columns=["info", "summary", "worker", "batch", "split"], features=None)
# ✅ Convert dataset to list format
prompts = [entry["prompt"] for entry in dataset]
chosen_responses = [entry["chosen"] for entry in dataset]
rejected_responses = [entry["rejected"] for entry in dataset]

# ✅ Tokenize inputs
chosen_outputs = tokenizer(chosen_responses, padding=True, truncation=True, max_length=128, return_tensors="pt")
rejected_outputs = tokenizer(rejected_responses, padding=True, truncation=True, max_length=128, return_tensors="pt")

# ✅ Ensure tensors are non-empty
print("Chosen Outputs Shape:", chosen_outputs["input_ids"].shape)
print("Rejected Outputs Shape:", rejected_outputs["input_ids"].shape)

Map:   0%|          | 0/6312 [00:00<?, ? examples/s]

Chosen Outputs Shape: torch.Size([6312, 128])
Rejected Outputs Shape: torch.Size([6312, 128])


In [23]:
# ✅ Define optimizer & loss function
optimizer = optim.AdamW(reward_model.parameters(), lr=1e-5)
criterion = nn.BCEWithLogitsLoss()

# ✅ Ensure labels match the shape of chosen/rejected outputs
num_samples = len(chosen_outputs["input_ids"])
labels = torch.cat([
    torch.ones(num_samples, dtype=torch.float32),
    torch.zeros(num_samples, dtype=torch.float32)
]).to(device)
BATCH_SIZE = 4

# ✅ Training function
def train_reward_model(model, optimizer, chosen_outputs, rejected_outputs, labels, epochs=3):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()

        for i in range(0, len(chosen_outputs["input_ids"]), BATCH_SIZE):
            # Select batch
            batch_chosen = {k: v[i:i+BATCH_SIZE].to(device) for k, v in chosen_outputs.items()}
            batch_rejected = {k: v[i:i+BATCH_SIZE].to(device) for k, v in rejected_outputs.items()}
            batch_labels = labels[i:i+BATCH_SIZE]

            # ✅ Ensure label shape matches model output
            chosen_scores = model(**batch_chosen).logits.squeeze()
            rejected_scores = model(**batch_rejected).logits.squeeze()

            # ✅ Check for empty tensors before computing loss
            if chosen_scores.shape[0] == 0 or rejected_scores.shape[0] == 0:
                print("Skipping empty batch...")
                continue

            # ✅ Fix label slicing to match logits shape
            loss = criterion(chosen_scores, batch_labels[:chosen_scores.shape[0]]) + criterion(rejected_scores, batch_labels[:rejected_scores.shape[0]])

            loss.backward()
            optimizer.step()

            # Free unused memory
            torch.cuda.empty_cache()

        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# ✅ Train the reward model
train_reward_model(reward_model, optimizer, chosen_outputs, rejected_outputs, labels)

Epoch 1, Loss: 0.0000
Epoch 2, Loss: 0.0000
Epoch 3, Loss: 0.0000


In [24]:
# ✅ Save trained reward model
torch.save(reward_model.state_dict(), "reward_model.pth")
print("✅ Reward model training complete & saved!")

✅ Reward model training complete & saved!


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader, TensorDataset

# ✅ Set device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔹 Using device: {device}")

🔹 Using device: cuda


In [26]:
# ✅ Load trained reward model from Task 2
model_name = "gpt2"
reward_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
reward_model.load_state_dict(torch.load("reward_model.pth", map_location=device))
reward_model.config.pad_token_id = reward_model.config.eos_token_id  # Ensure pad token is set

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  reward_model.load_state_dict(torch.load("reward_model.pth", map_location=device))


In [27]:
# ✅ Load dataset (Using OpenAI Summarization Feedback dataset)
dataset = load_dataset("openai/summarize_from_feedback", "axis", split="test")  # Use test split
print(f"✅ Dataset Loaded: {len(dataset)} samples")

# ✅ Load tokenizer & set padding
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Use EOS as padding token

✅ Dataset Loaded: 6312 samples


In [28]:
# ✅ Preprocessing function for DPO
def preprocess_data(example):
    """Processes dataset for DPO training."""
    return {
        "prompt": str(example.get("info", {}).get("post", "")),  # Extract original post
        "chosen": str(example.get("summary", "")),  # Preferred summary
        "rejected": str(example.get("summary", ""))  # Currently duplicating since dataset only has 'summary'
    }

# ✅ Apply preprocessing
dataset = dataset.map(preprocess_data, remove_columns=["info", "summary", "worker", "batch", "split"], features=None)
# ✅ Convert dataset to list format
prompts = [entry["prompt"] for entry in dataset]
chosen_responses = [entry["chosen"] for entry in dataset]
rejected_responses = [entry["rejected"] for entry in dataset]

# ✅ Tokenize inputs
chosen_outputs = tokenizer(chosen_responses, padding=True, truncation=True, max_length=128, return_tensors="pt")
rejected_outputs = tokenizer(rejected_responses, padding=True, truncation=True, max_length=128, return_tensors="pt")

# ✅ Convert dataset to PyTorch tensors
inputs = tokenizer(prompts, padding=True, truncation=True, max_length=128, return_tensors="pt").to(device)
chosen_inputs = {k: v.to(device) for k, v in chosen_outputs.items()}
rejected_inputs = {k: v.to(device) for k, v in rejected_outputs.items()}
# ✅ Define DPO loss function
def dpo_loss(chosen_scores, rejected_scores, beta=0.1):
    """
    Implements Direct Preference Optimization (DPO) loss.

    Args:
    - chosen_scores: Model outputs for chosen responses
    - rejected_scores: Model outputs for rejected responses
    - beta: Temperature parameter for softmax scaling

    Returns:
    - Loss value
    """
    logits_diff = chosen_scores - rejected_scores
    loss = -torch.log(torch.sigmoid(logits_diff / beta)).mean()
    return loss
# ✅ Define optimizer
optimizer = optim.AdamW(reward_model.parameters(), lr=1e-5)

# ✅ Reduce batch size to prevent CUDA OOM
BATCH_SIZE = 4

# ✅ Training function using DPO
def train_dpo(model, optimizer, chosen_inputs, rejected_inputs, epochs=10):
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()

        for i in range(0, len(chosen_inputs["input_ids"]), BATCH_SIZE):
            # Select batch
            batch_chosen = {k: v[i:i+BATCH_SIZE] for k, v in chosen_inputs.items()}
            batch_rejected = {k: v[i:i+BATCH_SIZE] for k, v in rejected_inputs.items()}

            # Compute logits
            chosen_scores = model(**batch_chosen).logits.squeeze()
            rejected_scores = model(**batch_rejected).logits.squeeze()

            # Ensure correct tensor dimensions
            if chosen_scores.shape[0] == 0 or rejected_scores.shape[0] == 0:
                print("Skipping empty batch...")
                continue

            # Compute DPO loss
            loss = dpo_loss(chosen_scores, rejected_scores)

            loss.backward()
            optimizer.step()

            # Free unused memory
            torch.cuda.empty_cache()

        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

# ✅ Train the model with DPO
train_dpo(reward_model, optimizer, chosen_inputs, rejected_inputs)

Map:   0%|          | 0/6312 [00:00<?, ? examples/s]

Epoch 1, Loss: 1.2823
Epoch 2, Loss: 1.2868
Epoch 3, Loss: 0.6913
Epoch 4, Loss: 0.6197
Epoch 5, Loss: 0.9458
Epoch 6, Loss: 0.9186
Epoch 7, Loss: nan
Epoch 8, Loss: nan
Epoch 9, Loss: nan
Epoch 10, Loss: nan


In [29]:
# ✅ Save fine-tuned reward model
torch.save(reward_model.state_dict(), "reward_model_dpo.pth")
print("✅ DPO Fine-Tuning Complete & Model Saved!")

✅ DPO Fine-Tuning Complete & Model Saved!
