In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# Prefer Apple's Metal backend (MPS) when this machine exposes it; fall back to CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Read the pairwise-preference table from disk.
df_preferences = pd.read_csv("../tests/preferences.csv")

# --- 90% / 5% / 5% train / val / test split ---
# Draw the training rows first; everything not drawn is held out.
train_df = df_preferences.sample(frac=0.90, random_state=42)
temp_df = df_preferences.drop(index=train_df.index)
# Halve the held-out rows: one half validates, the remainder tests.
val_df = temp_df.sample(frac=0.50, random_state=42)
test_df = temp_df.drop(index=val_df.index)

# Define the Preference Dataset class
class PreferenceDataset(Dataset):
    """Map-style dataset over a DataFrame of pairwise preferences.

    The frame must provide the columns ``x_better``, ``y_better``,
    ``x_worse``, ``y_worse`` and ``preference``. Indexing yields a 3-tuple of
    float32 tensors: the ``(x_better, y_better)`` pair, the
    ``(x_worse, y_worse)`` pair, and the ``preference`` value.
    """

    def __init__(self, df):
        better_cols = ["x_better", "y_better"]
        worse_cols = ["x_worse", "y_worse"]
        # Keep the data as NumPy arrays; tensors are created on access.
        self.x_better = df[better_cols].values
        self.x_worse = df[worse_cols].values
        self.labels = df["preference"].values

    def __len__(self):
        # One sample per row of the source frame.
        return len(self.labels)

    def __getitem__(self, idx):
        sources = (self.x_better, self.x_worse, self.labels)
        return tuple(
            torch.tensor(source[idx], dtype=torch.float32) for source in sources
        )

# Define the Policy Network with configurable depth and hidden size
class PolicyNetwork(nn.Module):
    """Fully connected ReLU network that maps each input row to one scalar.

    Args:
        input_dim: Number of input features per sample.
        hidden_dim: Width of every hidden layer.
        num_layers: Number of hidden ``Linear`` + ``ReLU`` blocks. At least
            one block is always built, even for values below 1.
    """

    def __init__(self, input_dim=2, hidden_dim=128, num_layers=2):
        super().__init__()
        # Input projection, always present.
        stack = [nn.Linear(input_dim, hidden_dim), nn.ReLU()]
        # Remaining hidden-to-hidden blocks (none when num_layers <= 1).
        for _ in range(1, num_layers):
            stack.extend((nn.Linear(hidden_dim, hidden_dim), nn.ReLU()))
        # Scalar output head, no activation.
        stack.append(nn.Linear(hidden_dim, 1))
        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.network(x)

# DPO loss function
def dpo_loss(model, x_better, x_worse):
    """Pairwise logistic preference loss over a batch.

    Scores both inputs with ``model`` and returns the mean of
    ``-log(sigmoid(score_better - score_worse))``. There is no
    reference-model term or temperature in this formulation.

    Args:
        model: Callable that maps a batch tensor to a tensor of scores.
        x_better: Batch passed to ``model`` as the preferred inputs.
        x_worse: Batch passed to ``model`` as the non-preferred inputs.

    Returns:
        A scalar tensor holding the mean loss.
    """
    margin = model(x_better) - model(x_worse)
    # logsigmoid is evaluated in a numerically stable way. The naive
    # log(sigmoid(m)) underflows to log(0) = -inf for strongly negative
    # margins, which makes the loss infinite and poisons the gradients.
    return -nn.functional.logsigmoid(margin).mean()

# Training parameters
batch_size = 128
epochs = 30
learning_rate = 1e-2
hidden_dim = 128
num_layers = 5

# Create Dataset objects for train, val, and test
train_dataset = PreferenceDataset(train_df)
val_dataset = PreferenceDataset(val_df)
test_dataset = PreferenceDataset(test_df)

# Create DataLoaders (only the training data is shuffled)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Initialize model, optimizer, and scheduler
model = PolicyNetwork(hidden_dim=hidden_dim, num_layers=num_layers).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Halve the learning rate after 3 epochs without validation improvement.
# The scheduler's `verbose` flag is deprecated since PyTorch 2.2, so it is
# not passed; learning-rate changes are reported explicitly in the loop.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Lists to store losses per epoch
train_losses = []
val_losses = []

# Training loop
for epoch in range(epochs):
    # --- Training Phase ---
    model.train()
    epoch_train_loss = 0.0
    for x_better, x_worse, _ in train_dataloader:
        x_better, x_worse = x_better.to(device), x_worse.to(device)
        optimizer.zero_grad()
        loss = dpo_loss(model, x_better, x_worse)
        loss.backward()
        optimizer.step()
        # Weight each batch mean by its size so that a short final batch
        # does not count as much as a full one.
        epoch_train_loss += loss.item() * x_better.size(0)

    # Per-sample mean over the whole training set.
    avg_train_loss = epoch_train_loss / len(train_dataset)
    train_losses.append(avg_train_loss)

    # --- Validation Phase ---
    model.eval()
    epoch_val_loss = 0.0
    with torch.no_grad():
        for x_better, x_worse, _ in val_dataloader:
            x_better, x_worse = x_better.to(device), x_worse.to(device)
            loss = dpo_loss(model, x_better, x_worse)
            epoch_val_loss += loss.item() * x_better.size(0)
    avg_val_loss = epoch_val_loss / len(val_dataset)
    val_losses.append(avg_val_loss)

    # Step the scheduler with the validation loss and report any change
    # in learning rate.
    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(avg_val_loss)
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after != lr_before:
        print(f"Epoch {epoch+1}: reducing learning rate "
              f"from {lr_before:.4e} to {lr_after:.4e}.")

    print(f"Epoch {epoch+1}/{epochs}, "
          f"Train Loss: {avg_train_loss:.4f}, "
          f"Val Loss: {avg_val_loss:.4f}")

# --- Plot losses ---
# One curve per loss history, drawn on a shared figure.
plt.figure(figsize=(8, 5))
for history, legend_label, line_color in (
    (train_losses, "Training Loss", "blue"),
    (val_losses, "Validation Loss", "orange"),
):
    plt.plot(history, label=legend_label, color=line_color)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss")
plt.legend()
plt.show()

# --- Test Phase: compute accuracy on test set ---
# Accuracy is the fraction of test pairs where the "better" input receives a
# strictly higher score than the "worse" one.
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for x_better, x_worse, _ in test_dataloader:
        x_better = x_better.to(device)
        x_worse = x_worse.to(device)
        r_better, r_worse = model(x_better), model(x_worse)
        correct += (r_better > r_worse).sum().item()
        total += x_better.shape[0]

test_accuracy = correct / total
print(f"Test accuracy: {test_accuracy:.4f}")

# Persist only the learned weights (state dict), not the whole module object.
torch.save(model.state_dict(), "dpo_policy.pth")
print("Training completed and model saved!")



Epoch 1/30, Loss: 0.0666
Epoch 2/30, Loss: 0.0616
Epoch 3/30, Loss: 0.0613
Epoch 4/30, Loss: 0.0608
Epoch 5/30, Loss: 0.0602
Epoch 6/30, Loss: 0.0602
Epoch 7/30, Loss: 0.0605
Epoch 8/30, Loss: 0.0602
Epoch 9/30, Loss: 0.0600
Epoch 10/30, Loss: 0.0597
Epoch 11/30, Loss: 0.0601
Epoch 12/30, Loss: 0.0597
Epoch 13/30, Loss: 0.0597
Epoch 14/30, Loss: 0.0593
Epoch 15/30, Loss: 0.0598
Epoch 16/30, Loss: 0.0593
Epoch 17/30, Loss: 0.0595
Epoch 18/30, Loss: 0.0597
Epoch 19/30, Loss: 0.0597
Epoch 20/30, Loss: 0.0592
Epoch 21/30, Loss: 0.0595
Epoch 22/30, Loss: 0.0594
Epoch 23/30, Loss: 0.0591
Epoch 24/30, Loss: 0.0593
Epoch 25/30, Loss: 0.0591
Epoch 26/30, Loss: 0.0594
Epoch 27/30, Loss: 0.0594
Epoch 28/30, Loss: 0.0586
Epoch 29/30, Loss: 0.0585
Epoch 30/30, Loss: 0.0585
Training completed and model saved!
