# DPO Training Pipeline for Restaurant Recommendation

This notebook implements Direct Preference Optimization (DPO) to train a language model for personalized restaurant recommendations using user profiles and business data.

## 0. Environment Setup

Install required packages and configure Google Drive access.

In [None]:
!pip install trl
!pip install bitsandbytes
!pip install huggingface_hub

Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.25.1-py3-none-any.whl (465 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m465.5/465.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.25.1
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [None]:
# Mount Google Drive at /content/drive/ so the project folders used below are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
import sys
import torch
import json
import argparse
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
from trl import DPOTrainer, DPOConfig
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import wandb

# Add the shared Drive project folder to the import search path.
# The membership check keeps the cell idempotent: re-running it no longer
# appends the same entry to sys.path again.
project_path_llm4rec = '/content/drive/MyDrive/CS329H_DiningbyDesign'
if project_path_llm4rec not in sys.path:
    sys.path.append(project_path_llm4rec)

## Experiment Configuration

Set the key parameters for dataset size and training constraints.


In [None]:
# Hugging Face model id of the base model that is fine-tuned and evaluated.
MODEL_NAME = "LiquidAI/LFM2-350M"
# Root folder under which the generated data, adapters and results are stored.
BASE_PROJECT_PATH = "/content/drive/MyDrive/CS329H_DiningbyDesign/yanzhen_final_single"
# Weights & Biases project the training run is logged to.
WANDB_PROJECT = "LLM4Rec-DPO-Single"
# Seed handed to the train/test split of the preference pairs.
RANDOM_SEED = 42

# Number of test pairs kept for evaluation; a value of 0 disables evaluation during training.
test_data_size = 200
# Minimum star gap (best rating - worst rating) required for a pair to enter the training set.
train_gap = 2
# Number of training pairs kept after loading the training file.
train_data_size = 2000

# Number of passes over the training pairs.
NUM_EPOCHS = 2

## 1. Preference Dataset Creation

Define functions to create DPO preference pairs from user reviews. For each user, the restaurant from their highest-rated review (chosen) is paired with the restaurant from their lowest-rated review (rejected); users whose ratings do not differ by at least a minimum star gap are skipped.

In [None]:
def load_jsonl(filepath):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Lines that are empty or contain only whitespace are ignored; every other
    line is decoded with ``json.loads`` and kept in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [json.loads(raw_line) for raw_line in handle if raw_line.strip()]


def create_business_lookup(business_data):
    """Index business records by their ``business_id``.

    Returns a dict mapping each id to its full record. When an id occurs more
    than once, the record that appears last in ``business_data`` is kept.
    """
    lookup = {}
    for record in business_data:
        lookup[record['business_id']] = record
    return lookup


def find_best_and_worst_reviews(reviews, min_gap=1):
    """
    Pick the highest-starred and lowest-starred review of one user.

    Returns ``(best_review, worst_review)``. ``(None, None)`` is returned when
    there are fewer than two reviews or when the star difference between the
    two picks is smaller than ``min_gap``.
    """
    if not reviews or len(reviews) < 2:
        return None, None

    def stars_of(review):
        return review['stars']

    # Tie-breaking mirrors a stable ascending sort: the worst review is the
    # first one with the minimum rating, the best review is the last one with
    # the maximum rating (hence the reversed scan for the maximum).
    worst_review = min(reviews, key=stars_of)
    best_review = max(reversed(reviews), key=stars_of)

    if stars_of(best_review) - stars_of(worst_review) < min_gap:
        return None, None
    return best_review, worst_review

def _write_jsonl(records, filepath):
    """Write ``records`` to ``filepath`` as UTF-8 JSON Lines, one object per line."""
    with open(filepath, 'w', encoding='utf-8') as f:
        for item in records:
            json.dump(item, f, ensure_ascii=False)
            f.write('\n')


def create_dpo_dataset(user_profiles_path, business_path, output_dir, train_min_gap=2, test_min_gap=1, test_size=0.2, random_state=42):
    """
    Create DPO preference dataset with train/test split.

    For every user the highest- and lowest-starred reviews are paired as
    (chosen, rejected). Users are skipped when no pair with a star gap of at
    least 1 exists, or when either business is missing from the business file.
    The pairs are split first and filtered by star gap afterwards, so the
    train/test assignment of a user does not depend on the gap thresholds.

    Args:
        user_profiles_path: Path to user profiles JSONL
        business_path: Path to business JSONL
        output_dir: Directory to save output files
        train_min_gap: Minimum star gap for training set (default: 2)
        test_min_gap: Minimum star gap for test set (default: 1)
        test_size: Fraction of data for test set (default: 0.2)
        random_state: Random seed for reproducibility

    Returns:
        ``(train_data, test_data)``: the two lists of preference-pair dicts that
        were written to ``dpo_train_gap{train_min_gap}.jsonl`` and
        ``dpo_test_gap{test_min_gap}.jsonl`` inside ``output_dir``.

    Raises:
        ValueError: if fewer than two preference pairs could be built, since
            nothing can be split in that case.
    """
    users = load_jsonl(user_profiles_path)
    businesses = load_jsonl(business_path)
    business_lookup = create_business_lookup(businesses)
    all_dpo_data = []
    skipped_no_variance = 0
    missing_business_count = 0

    for user in tqdm(users, desc="Processing users"):
        user_id = user['user_id']
        user_profile = user.get('profile', '')
        reviews = user.get('reviews', [])

        # A gap of 1 is the loosest threshold applied here; the stricter
        # train/test thresholds are enforced after the split below.
        best_review, worst_review = find_best_and_worst_reviews(reviews, min_gap=1)

        if best_review is None or worst_review is None:
            skipped_no_variance += 1
            continue

        best_business_id = best_review['business_id']
        worst_business_id = worst_review['business_id']

        best_business = business_lookup.get(best_business_id)
        worst_business = business_lookup.get(worst_business_id)

        if not best_business or not worst_business:
            missing_business_count += 1
            continue

        best_business_profile = best_business.get('profile', '')
        worst_business_profile = worst_business.get('profile', '')

        gap = best_review['stars'] - worst_review['stars']

        dpo_example = {
            'user_id': user_id,
            'user_profile': user_profile,
            'star_gap': gap,
            'chosen': {
                'business_id': best_business_id,
                'business_name': best_review['name'],
                'business_profile': best_business_profile,
                'text': f"{user_profile}\n\n{best_business_profile}",
                'rating': best_review['stars'],
                'review_text': best_review['text']
            },
            'rejected': {
                'business_id': worst_business_id,
                'business_name': worst_review['name'],
                'business_profile': worst_business_profile,
                'text': f"{user_profile}\n\n{worst_business_profile}",
                'rating': worst_review['stars'],
                'review_text': worst_review['text']
            }
        }

        all_dpo_data.append(dpo_example)

    # Surface how much data was dropped; these counters used to be computed
    # and then silently discarded.
    print(
        f"Skipped {skipped_no_variance} users without a usable rating gap and "
        f"{missing_business_count} users with missing business records"
    )

    if len(all_dpo_data) < 2:
        raise ValueError(
            f"Only {len(all_dpo_data)} preference pair(s) could be built from "
            f"{len(users)} users; at least 2 are required for a train/test split."
        )

    train_data_all, test_data_all = train_test_split(
        all_dpo_data,
        test_size=test_size,
        random_state=random_state
    )

    train_data = [ex for ex in train_data_all if ex['star_gap'] >= train_min_gap]
    test_data = [ex for ex in test_data_all if ex['star_gap'] >= test_min_gap]

    print(f"Created {len(train_data)} training and {len(test_data)} test examples")

    os.makedirs(output_dir, exist_ok=True)

    train_path = os.path.join(output_dir, f"dpo_train_gap{train_min_gap}.jsonl")
    test_path = os.path.join(output_dir, f"dpo_test_gap{test_min_gap}.jsonl")

    _write_jsonl(train_data, train_path)
    _write_jsonl(test_data, test_path)

    return train_data, test_data

## 2. Model Training Setup

Define data loading, formatting, and utility functions for DPO training.

In [None]:
def load_dpo_data(filepath):
    """Load DPO preference dataset from JSONL file.

    The file format is identical to every other JSONL file in this notebook,
    so the parsing is delegated to ``load_jsonl`` instead of being duplicated.

    Args:
        filepath: Path to a UTF-8 JSONL file with one preference pair per line.

    Returns:
        List of decoded records in file order; blank lines are skipped.
    """
    return load_jsonl(filepath)

def format_preference_data(dpo_data):
    """Turn raw preference pairs into prompt / chosen / rejected text records.

    Each output record holds the user-profile prompt, the two candidate
    completions (business name on the first line, business profile below it),
    the user id and both star ratings.
    """
    def render_candidate(side):
        # Completion text: restaurant name, newline, restaurant profile.
        return f"{side['business_name']}\n{side['business_profile']}"

    formatted_data = []
    for pair in tqdm(dpo_data, desc="Formatting data"):
        chosen_side = pair['chosen']
        rejected_side = pair['rejected']
        record = {
            'prompt': f"User Profile:\n{pair['user_profile']}\n\nRecommended Restaurant:\n",
            'chosen': render_candidate(chosen_side),
            'rejected': render_candidate(rejected_side),
            'user_id': pair['user_id'],
            'chosen_rating': chosen_side['rating'],
            'rejected_rating': rejected_side['rating'],
        }
        formatted_data.append(record)
    return formatted_data

def data_formulate(data, tokenizer):
    """Render one example's prompt through the tokenizer's chat template.

    The conversation consists of a fixed system instruction followed by the
    example's ``'prompt'`` as the user turn. The template is returned as text
    (not token ids) with the generation prompt appended.
    """
    system_message = {
        "role": "system",
        "content": (
            "You are a restaurant recommendation assistant. "
            "Given a user's dining preferences, recommend a restaurant that matches their taste."
        ),
    }
    user_message = {"role": "user", "content": data['prompt']}

    return tokenizer.apply_chat_template(
        [system_message, user_message],
        tokenize=False,
        add_generation_prompt=True,
    )

def print_trainable_parameters(model):
    """Print the number of trainable parameters.

    Sums ``numel()`` over ``model.named_parameters()`` and reports the
    trainable count (``requires_grad`` is true), the total count and the
    trainable share in percent. A model without parameters reports 0% instead
    of raising ``ZeroDivisionError``.

    NOTE(review): with 4-bit quantized weights ``numel()`` presumably counts
    packed storage elements, so the total may understate the real parameter
    count - confirm before quoting these numbers.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    percentage = 100 * trainable_params / all_param if all_param else 0.0
    print(f"\n🚀 Trainable parameters: {trainable_params:,}")
    print(f"📦 Total parameters:     {all_param:,}")
    print(f"📈 Percentage:           {percentage:.4f}%\n")

## 3. Training Configuration

Configure model, LoRA parameters, training hyperparameters, and logging settings.


In [None]:
# Secrets are read from the environment so they never have to be typed into
# (and saved with) the notebook. An unset variable yields "", which matches
# the previous placeholder value.
HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Optimisation hyperparameters.
LEARNING_RATE = 1e-5
BATCH_SIZE = 4
EVAL_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_STEPS = 0
MAX_LENGTH = 2048

# LoRA adapter shape.
LORA_R = 64
LORA_ALPHA = 128
LORA_DROPOUT = 0.05

# Load the base model in 4-bit NF4 when True, in bfloat16 otherwise.
USE_4BIT = True

# Logging / evaluation / checkpoint cadence, all measured in optimizer steps.
LOGGING_STEPS = 4
EVAL_STEPS = 250
SAVE_STEPS = 250
SAVE_TOTAL_LIMIT = 2

USE_WANDB = True
WANDB_RUN_NAME = f"dpo-{MODEL_NAME.split('/')[-1]}-gap{train_gap}-n{train_data_size}-epoch{NUM_EPOCHS}-lr{LEARNING_RATE}-bs{BATCH_SIZE}"
# Same rule as HF_TOKEN: taken from the environment, "" when absent (login is then skipped).
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")

# Module-name suffixes that receive LoRA adapters. The list covers several
# naming schemes; presumably only the names present in the loaded model are
# matched - verify against the model's module names.
TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "fc1", "fc2",
    "w1", "w2", "w3",
]

## 4. Load Model and Tokenizer

Initialize the base model with optional 4-bit quantization for efficient training.


In [None]:
# An empty token string is treated as "no token" so the hub client can fall
# back to a cached login or anonymous access instead of sending a blank credential.
hub_token = HF_TOKEN or None

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=hub_token)
# Only borrow EOS as the pad token when the tokenizer ships without one.
# Overwriting an existing pad token makes padding indistinguishable from
# end-of-sequence and forces the model/generation config to be rewritten.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

if USE_4BIT:
    # NF4 weights with double quantization; matmuls are computed in bfloat16.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4'
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        token=hub_token,
        quantization_config=quantization_config
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        token=hub_token,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

## 5. Prepare Dataset

Create the DPO preference dataset from user profiles and business data, then split into training and test sets.

In [None]:
# Folder holding the raw input files (user profiles and business records).
# NOTE(review): this lives under a different Drive root (329H_Final_project) than
# BASE_PROJECT_PATH (CS329H_DiningbyDesign) - confirm both locations are intended.
BASE_PATH = "/content/drive/MyDrive/329H_Final_project/LLM4Rec/data"
# Generated preference files are written below the project folder.
OUTPUT_DIR = f"{BASE_PROJECT_PATH}/data"

user_profiles_path = f"{BASE_PATH}/user_profiles.jsonl"
business_path = f"{BASE_PATH}/business.jsonl"

# Minimum star gap for test pairs; the training gap comes from the experiment configuration.
test_gap = 1

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Builds the pairs, splits them with RANDOM_SEED and writes
# dpo_train_gap{train_gap}.jsonl / dpo_test_gap{test_gap}.jsonl into OUTPUT_DIR.
train_data, test_data = create_dpo_dataset(
    user_profiles_path=user_profiles_path,
    business_path=business_path,
    output_dir=OUTPUT_DIR,
    train_min_gap=train_gap,
    test_min_gap=test_gap,
    test_size=0.2,
    random_state=RANDOM_SEED
)

Processing users:   0%|          | 0/20000 [00:00<?, ?it/s]

Created 11140 training and 3504 test examples


## 6. Load and Format Training Data

Load the generated preference datasets and format them for DPO training.


In [None]:
# Reload the training pairs from disk, keep the first `train_data_size` of them
# and convert them to prompt/chosen/rejected text records.
# Note: `train_data` / `test_data` are rebound here and from now on hold the
# formatted records, not the raw pairs returned by create_dpo_dataset.
full_train_data_path = f"{BASE_PROJECT_PATH}/data/dpo_train_gap{train_gap}.jsonl"
train_preference_data = load_dpo_data(full_train_data_path)
train_data = format_preference_data(train_preference_data[:train_data_size])

# Same procedure for the test pairs, truncated to `test_data_size`.
full_test_data_path = f"{BASE_PROJECT_PATH}/data/dpo_test_gap{test_gap}.jsonl"
test_preference_data = load_dpo_data(full_test_data_path)
test_data = format_preference_data(test_preference_data[:test_data_size])

Formatting data:   0%|          | 0/2000 [00:00<?, ?it/s]

Formatting data:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
def build_preference_dataset(formatted_examples, tokenizer, desc, min_rating_gap=1):
    """Build a ``Dataset`` with ``prompt`` / ``chosen`` / ``rejected`` columns.

    Args:
        formatted_examples: Records produced by ``format_preference_data``.
        tokenizer: Tokenizer whose chat template renders the prompt.
        desc: Label shown on the progress bar.
        min_rating_gap: Examples whose chosen rating does not exceed the
            rejected rating by at least this much are dropped.

    Returns:
        A ``datasets.Dataset`` with one row per kept example.

    The rating-gap filter is applied once, before any column is built.
    Previously only the prompt column was filtered, so a single example below
    the threshold would have produced columns of different lengths.
    """
    kept = [
        example for example in formatted_examples
        if example['chosen_rating'] - example['rejected_rating'] >= min_rating_gap
    ]
    return Dataset.from_dict({
        'prompt': [data_formulate(example, tokenizer) for example in tqdm(kept, desc=desc)],
        'chosen': [example['chosen'] for example in kept],
        'rejected': [example['rejected'] for example in kept],
    })


train_dataset = build_preference_dataset(train_data, tokenizer, desc="Formatting training")
test_dataset = build_preference_dataset(test_data, tokenizer, desc="Formatting test")

Formatting training:   0%|          | 0/2000 [00:00<?, ?it/s]

Formatting test:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
# Sanity check: show the first training prompt exactly as it will be fed to the trainer.
print(train_dataset[0]["prompt"])

<|startoftext|><|im_start|>system
You are a restaurant recommendation assistant. Given a user's dining preferences, recommend a restaurant that matches their taste.<|im_end|>
<|im_start|>user
User Profile:
Porca is a laid‑back foodie who gravitates toward classic American comfort fare and iconic regional specialties, especially cheesesteaks, pizza, and beer‑friendly venues. He prefers hearty, budget‑to‑mid‑range dishes (price range 1‑2) and values cheap, filling meals over upscale dining. His favorite spots are casual, lively, and unpretentious – often family‑friendly, noisy enough to feel bustling, and sometimes touristy but authentic, with outdoor seating and bike‑parking being pluses. He doesn’t mind waiting in line for a legendary sandwich and is forgiving of slower service as long as the food lives up to tradition. Practical perks he looks for include street or bike parking, take‑out or delivery options, no‑reservation policies, and venues that accommodate 

## 7. Initialize DPO Trainer

Set up Weights & Biases logging, configure the trainer, and initialize the DPO training process.

In [None]:
# Log in to Weights & Biases only when logging is enabled and a key was provided.
# The key is also exported through the WANDB_API_KEY environment variable.
if USE_WANDB and WANDB_API_KEY:
    os.environ["WANDB_API_KEY"] = WANDB_API_KEY
    wandb.login(key=WANDB_API_KEY)

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myanzhen4_stanford[0m ([33myanzhen4_stanford-stanford-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
def save_adapter(
    base_output_dir,
    model_name,
    train_data_size,
    train_gap,
    num_epochs,
    batch_size,
    learning_rate
):
    """Create (if necessary) and return the run-specific adapter directory.

    Despite its name this function writes no weights: it only derives a
    descriptive folder name from the run settings, makes sure the folder
    exists below ``base_output_dir`` and returns its path.
    """
    name_parts = [
        model_name.rsplit("/", 1)[-1],  # model id without the organisation prefix
        f"gap{train_gap}",
        f"n{train_data_size}",
        f"ep{num_epochs}",
        f"bs{batch_size}",
        f"lr{learning_rate}",
    ]
    output_dir = os.path.join(base_output_dir, "_".join(name_parts))
    os.makedirs(output_dir, exist_ok=True)
    return output_dir


# All adapters of this project are stored below <BASE_PROJECT_PATH>/models.
base_output_dir = f"{BASE_PROJECT_PATH}/models"

os.makedirs(base_output_dir, exist_ok=True)

# Run-specific folder whose name encodes the model and the training settings;
# it is used as the trainer's output directory and as the adapter path for evaluation.
model_output_dir = save_adapter(
    base_output_dir=base_output_dir,
    model_name=MODEL_NAME,
    train_data_size=train_data_size,
    train_gap=train_gap,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE
)

In [None]:
# Evaluation during training is only enabled when test pairs were requested.
run_evaluation = test_data_size > 0

training_args = DPOConfig(
    output_dir=model_output_dir,
    logging_steps=LOGGING_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    save_only_model=True,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    warmup_steps=WARMUP_STEPS,
    eval_strategy="steps" if run_evaluation else "no",
    eval_steps=EVAL_STEPS if run_evaluation else None,
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    report_to="wandb" if USE_WANDB else "none",
    run_name=WANDB_RUN_NAME if USE_WANDB else None,
    remove_unused_columns=False,
    max_length=MAX_LENGTH,
    # Tie the trainer's shuffling/initialisation seed to the notebook-wide seed
    # instead of relying on the library default.
    seed=RANDOM_SEED,
)

if USE_WANDB:
    # The logged config includes the dataset settings so that runs remain
    # comparable without having to parse the run name.
    wandb.init(
        project=WANDB_PROJECT,
        name=WANDB_RUN_NAME,
        config={
            "model": MODEL_NAME,
            "learning_rate": LEARNING_RATE,
            "batch_size": BATCH_SIZE,
            "epochs": NUM_EPOCHS,
            "lora_r": LORA_R,
            "lora_alpha": LORA_ALPHA,
            "lora_dropout": LORA_DROPOUT,
            "use_4bit": USE_4BIT,
            "train_gap": train_gap,
            "train_data_size": train_data_size,
            "test_data_size": test_data_size,
            "max_length": MAX_LENGTH,
            "seed": RANDOM_SEED,
        }
    )

peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
)

# Passing peft_config lets the trainer wrap the base model with LoRA adapters.
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset if run_evaluation else None,
    processing_class=tokenizer,
    peft_config=peft_config,
)

print_trainable_parameters(dpo_trainer.model)

[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Extracting prompt in train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]


🚀 Trainable parameters: 19,267,584
📦 Total parameters:     230,096,640
📈 Percentage:           8.3737%



## 8. Train the Model

Execute the DPO training process with the configured parameters.

In [None]:
# Run DPO training with the configuration above; the returned TrainOutput is displayed as the cell result.
dpo_trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 7}.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
250,0.5559,0.573475,-1.38748,-2.12702,0.71,0.73954,-647.173401,-646.904358,-2.607921,-2.636134
500,0.6325,0.55378,-0.250235,-1.249489,0.715,0.999254,-635.801025,-638.129089,-2.658036,-2.690053
750,0.1673,0.558086,-0.325598,-1.514973,0.725,1.189376,-636.554565,-640.783936,-2.595055,-2.627477
1000,0.3956,0.578989,-1.192053,-2.444018,0.71,1.251965,-645.219116,-650.074402,-2.560474,-2.592735


TrainOutput(global_step=1000, training_loss=0.39931700602173803, metrics={'train_runtime': 374.3869, 'train_samples_per_second': 10.684, 'train_steps_per_second': 2.671, 'total_flos': 0.0, 'train_loss': 0.39931700602173803, 'epoch': 2.0})

## 9. Save the Model

Save the trained LoRA adapter and tokenizer to disk.

In [None]:
# Save the trained model and the tokenizer together into the run's output directory,
# which is also the path the evaluation section loads the adapter from.
final_model_path = model_output_dir
dpo_trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Model saved to: {final_model_path}")

Model saved to: /content/drive/MyDrive/329H_Final_project/LLM4Rec/yanzhen_final_single/models/LFM2-350M_gap2_n2000_ep2_bs4_lr1e-05


## 10. Model Evaluation

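Compare the base and the trained model on the test set: for every preference pair, compute the perplexity of the chosen and of the rejected restaurant given the user prompt, then report the average perplexities and how often the chosen restaurant receives the lower perplexity (pairwise accuracy).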

In [None]:
import numpy as np
import torch.nn.functional as F
from peft import PeftModel

def load_model_for_inference(base_model_name, adapter_path=None):
    """Load model for inference.

    Args:
        base_model_name: Hugging Face id (or local path) of the base model.
        adapter_path: Optional folder with a trained LoRA adapter. When given,
            the adapter is loaded on top of the base model and merged into it.

    Returns:
        ``(model, tokenizer)``; the model is loaded in bfloat16 with
        ``device_map="auto"``.
    """
    inf_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    # Keep the tokenizer's own pad token when it has one; EOS is only a
    # fallback for tokenizers that ship without a pad token.
    if inf_tokenizer.pad_token is None:
        inf_tokenizer.pad_token = inf_tokenizer.eos_token

    inf_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    if adapter_path:
        inf_model = PeftModel.from_pretrained(inf_model, adapter_path)
        # Fold the LoRA weights into the base weights so inference runs on a plain model.
        inf_model = inf_model.merge_and_unload()

    return inf_model, inf_tokenizer

def calculate_perplexity(model, tokenizer, prompt, response):
    """Calculate perplexity of a response given a prompt.

    The prompt and response are concatenated and scored in one forward pass;
    only the next-token losses of the response positions are averaged.

    Args:
        model: Causal LM exposing ``.device`` and returning ``.logits``.
        tokenizer: Tokenizer used for both the prompt and the full text.
        prompt: Conditioning text.
        response: Text whose perplexity is measured.

    Returns:
        ``(perplexity, avg_loss)`` where ``perplexity == exp(avg_loss)``.
        Both are NaN when the response contributes no tokens.

    NOTE(review): the prompt length is measured by tokenizing ``prompt`` on its
    own, which assumes the tokenization of ``prompt + response`` starts with
    exactly those tokens - confirm for the tokenizer in use.
    """
    inputs = tokenizer(prompt + response, return_tensors="pt").to(model.device)
    prompt_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = prompt_inputs['input_ids'].shape[1]

    with torch.no_grad():
        # No `labels=` here: the per-token losses are computed below, so asking
        # the model for its own full-sequence loss was wasted work.
        # Logits are upcast to float32 so the loss is not limited by bf16 precision.
        logits = model(**inputs).logits.float()

        # Position t predicts token t+1.
        shift_logits = logits[..., :-1, :]
        shift_labels = inputs['input_ids'][..., 1:]

        losses = F.cross_entropy(
            shift_logits.reshape(-1, shift_logits.size(-1)),
            shift_labels.reshape(-1),
            reduction='none',
        ).view(shift_labels.shape)

        # The first response token is predicted at index prompt_length - 1.
        # Clamp at 0 so an empty prompt scores every token instead of slicing
        # from -1, which would keep only the last one.
        response_losses = losses[:, max(prompt_length - 1, 0):]
        if response_losses.numel() == 0:
            return float('nan'), float('nan')

        avg_loss = response_losses.mean().item()

    perplexity = np.exp(avg_loss)
    return perplexity, avg_loss

def evaluate_business_perplexity(model, tokenizer, formatted_prompt_str, business_name, business_profile):
    """Score one restaurant as a completion of an already formatted prompt.

    The completion is the business name, a newline, then the business profile.
    Returns the ``(perplexity, avg_loss)`` pair from ``calculate_perplexity``.
    """
    candidate_text = "{}\n{}".format(business_name, business_profile)
    scores = calculate_perplexity(model, tokenizer, formatted_prompt_str, candidate_text)
    perplexity, avg_loss = scores
    return perplexity, avg_loss

def generate_recommendation(model, tokenizer, formatted_prompt_str, max_new_tokens=512):
    """Generate a restaurant recommendation for a given user profile using an already formatted prompt.

    Args:
        model: Causal LM exposing ``.device`` and ``.generate``.
        tokenizer: Tokenizer matching the model.
        formatted_prompt_str: Prompt text, already rendered with the chat template.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        ``(response, formatted_prompt_str)`` where ``response`` contains only
        the newly generated text, stripped of surrounding whitespace.

    Sampling is enabled (temperature 0.7, top-p 0.9), so repeated calls can
    return different text.
    """
    inputs = tokenizer(formatted_prompt_str, return_tensors="pt").to(model.device)
    prompt_token_count = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only what was generated after the prompt. The previous approach
    # decoded the whole sequence and searched for an "assistant" marker, but
    # the special-token marker is removed by skip_special_tokens=True and the
    # fallback looked for a literal backslash-n, so the prompt was never
    # stripped from the response.
    generated_ids = outputs[0][prompt_token_count:]
    response = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    return response, formatted_prompt_str

In [None]:
def _score_pair(model, tokenizer, sample):
    """Return ``(chosen_perplexity, rejected_perplexity)`` of one test pair for one model.

    The prompt is rendered with ``data_formulate`` so that the model is scored
    under the same system prompt and chat template it was trained with.
    Previously the raw ``sample['prompt']`` was used, which did not match the
    training format.

    NOTE(review): if the chat template already emits a BOS token and the
    tokenizer prepends another one during scoring, the prompt carries two -
    confirm for this tokenizer. It affects chosen and rejected equally.
    """
    formatted_prompt = data_formulate(sample, tokenizer)
    perplexities = []
    for side in ('chosen', 'rejected'):
        # Completions are stored as "<name>\n<profile>"; a missing profile becomes "".
        business_name, _, business_profile = sample[side].partition('\n')
        perplexity, _ = evaluate_business_perplexity(
            model, tokenizer, formatted_prompt, business_name.strip(), business_profile.strip()
        )
        perplexities.append(perplexity)
    return perplexities[0], perplexities[1]


adapter_path = model_output_dir
base_model, base_tokenizer = load_model_for_inference(MODEL_NAME, adapter_path=None)
trained_model, trained_tokenizer = load_model_for_inference(MODEL_NAME, adapter_path=adapter_path)

base_chosen_perplexities = []
base_rejected_perplexities = []
trained_chosen_perplexities = []
trained_rejected_perplexities = []

# A pair counts as correct when the chosen restaurant has the lower perplexity.
base_correct = 0
trained_correct = 0

for sample in tqdm(test_data, desc="Evaluating"):
    chosen_perp_base, rejected_perp_base = _score_pair(base_model, base_tokenizer, sample)
    base_chosen_perplexities.append(chosen_perp_base)
    base_rejected_perplexities.append(rejected_perp_base)
    if chosen_perp_base < rejected_perp_base:
        base_correct += 1

    chosen_perp_trained, rejected_perp_trained = _score_pair(trained_model, trained_tokenizer, sample)
    trained_chosen_perplexities.append(chosen_perp_trained)
    trained_rejected_perplexities.append(rejected_perp_trained)
    if chosen_perp_trained < rejected_perp_trained:
        trained_correct += 1

`torch_dtype` is deprecated! Use `dtype` instead!


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
import numpy as np

# Calculate average perplexities over every evaluated sample.
# The averages used to be taken over the first 100 samples only, while the
# accuracies below were computed over all of them; both now cover the same set.
average_base_chosen_perplexity = np.mean(base_chosen_perplexities)
average_base_rejected_perplexity = np.mean(base_rejected_perplexities)
average_trained_chosen_perplexity = np.mean(trained_chosen_perplexities)
average_trained_rejected_perplexity = np.mean(trained_rejected_perplexities)

# Positive difference = the rejected restaurant is, on average, the less likely completion.
base_ppl_diff = average_base_rejected_perplexity - average_base_chosen_perplexity
trained_ppl_diff = average_trained_rejected_perplexity - average_trained_chosen_perplexity

num_samples = len(test_data)
# Guard against an empty test set instead of failing with ZeroDivisionError.
base_accuracy = base_correct / num_samples if num_samples else 0.0
trained_accuracy = trained_correct / num_samples if num_samples else 0.0

print("\n" + "="*60)
print("EVALUATION RESULTS")
print("="*60)
print("\n📊 Base Model:")
print(f"  Chosen Perplexity:   {average_base_chosen_perplexity:.2f}")
print(f"  Rejected Perplexity: {average_base_rejected_perplexity:.2f}")
print(f"  PPL Difference:      {base_ppl_diff:.2f} (rejected - chosen)")
print(f"  Accuracy:            {base_accuracy:.2%} ({base_correct}/{num_samples})")

print("\n🎯 Trained Model:")
print(f"  Chosen Perplexity:   {average_trained_chosen_perplexity:.2f}")
print(f"  Rejected Perplexity: {average_trained_rejected_perplexity:.2f}")
print(f"  PPL Difference:      {trained_ppl_diff:.2f} (rejected - chosen)")
print(f"  Accuracy:            {trained_accuracy:.2%} ({trained_correct}/{num_samples})")

print("\n📈 Improvement:")
print(f"  Accuracy Gain:       {(trained_accuracy - base_accuracy):.2%}")
print(f"  PPL Diff Gain:       {(trained_ppl_diff - base_ppl_diff):.2f}")
print("="*60 + "\n")

def save_evaluation_results(
    adapter_path,
    base_chosen_ppl,
    base_rejected_ppl,
    base_ppl_diff,
    base_accuracy,
    trained_chosen_ppl,
    trained_rejected_ppl,
    trained_ppl_diff,
    trained_accuracy,
    results_dir=None
):
    """Append one evaluation report to ``results.txt``.

    The report lists the adapter path, the base-model and trained-model
    metrics and the gains between them, followed by a separator line. The file
    lives in ``results_dir``; when that is ``None`` the ``result`` folder below
    ``BASE_PROJECT_PATH`` is used. The directory is created if missing.
    """
    target_dir = results_dir if results_dir is not None else f"{BASE_PROJECT_PATH}/result"
    os.makedirs(target_dir, exist_ok=True)
    results_path = os.path.join(target_dir, "results.txt")

    accuracy_gain = trained_accuracy - base_accuracy
    ppl_diff_gain = trained_ppl_diff - base_ppl_diff

    report_lines = [
        f"{adapter_path}\n",
        "Base Model:\n",
        f"  Chosen PPL:     {base_chosen_ppl:.2f}\n",
        f"  Rejected PPL:   {base_rejected_ppl:.2f}\n",
        f"  PPL Difference: {base_ppl_diff:.2f}\n",
        f"  Accuracy:       {base_accuracy:.2%}\n",
        "\nTrained Model:\n",
        f"  Chosen PPL:     {trained_chosen_ppl:.2f}\n",
        f"  Rejected PPL:   {trained_rejected_ppl:.2f}\n",
        f"  PPL Difference: {trained_ppl_diff:.2f}\n",
        f"  Accuracy:       {trained_accuracy:.2%}\n",
        "\nImprovement:\n",
        f"  Accuracy Gain:  {accuracy_gain:.2%}\n",
        f"  PPL Diff Gain:  {ppl_diff_gain:.2f}\n",
        "-" * 50 + "\n",
    ]

    # Append mode: earlier reports in the file are kept.
    with open(results_path, 'a') as f:
        f.writelines(report_lines)

    print(f"Results appended to: {results_path}")


# Append this run's metrics to the shared results file (default location: <BASE_PROJECT_PATH>/result/results.txt).
save_evaluation_results(
    adapter_path=adapter_path,
    base_chosen_ppl=average_base_chosen_perplexity,
    base_rejected_ppl=average_base_rejected_perplexity,
    base_ppl_diff=base_ppl_diff,
    base_accuracy=base_accuracy,
    trained_chosen_ppl=average_trained_chosen_perplexity,
    trained_rejected_ppl=average_trained_rejected_perplexity,
    trained_ppl_diff=trained_ppl_diff,
    trained_accuracy=trained_accuracy
)


EVALUATION RESULTS

📊 Base Model:
  Chosen Perplexity:   28.93
  Rejected Perplexity: 28.81
  PPL Difference:      -0.12 (rejected - chosen)
  Accuracy:            47.50% (95/200)

🎯 Trained Model:
  Chosen Perplexity:   29.96
  Rejected Perplexity: 31.18
  PPL Difference:      1.21 (rejected - chosen)
  Accuracy:            54.00% (108/200)

📈 Improvement:
  Accuracy Gain:       6.50%
  PPL Diff Gain:       1.33

Results appended to: /content/drive/MyDrive/329H_Final_project/LLM4Rec/yanzhen_final_single/result/results.txt


## 11. Push Model to Hugging Face

Upload the folder containing the trained LoRA adapter and tokenizer to the Hugging Face Hub. The merged model is not uploaded by this section.

In [None]:
from huggingface_hub import upload_folder, create_repo

HF_USERNAME = "HannahGrj"
# The repository name is derived from the run settings so that it always
# describes the adapter that was actually trained in this notebook.
REPO_NAME = f"dpo-{MODEL_NAME.split('/')[-1].lower()}-single-gap{train_gap}-n{train_data_size}"

# The token is read from the environment rather than stored in the notebook.
HF_TOKEN = os.environ.get("HF_TOKEN", "")
hub_token = HF_TOKEN or None  # empty string -> fall back to a cached login
repo_id = f"{HF_USERNAME}/{REPO_NAME}"

# Upload the adapter saved by this run instead of a hard-coded path on one
# particular machine.
LOCAL_MODEL_PATH = model_output_dir
if not os.path.isdir(LOCAL_MODEL_PATH):
    raise FileNotFoundError(f"Adapter folder not found: {LOCAL_MODEL_PATH}")

try:
    # exist_ok=True makes this a no-op when the repository already exists.
    create_repo(
        repo_id=repo_id,
        token=hub_token,
        repo_type="model",
        private=False,
        exist_ok=True
    )
    print(f"✅ Repository ready: https://huggingface.co/{repo_id}")
except Exception as e:
    # Best effort: the upload below still reports a hard failure if the
    # repository is really unusable.
    print(f"ℹ️  Repository could not be created or verified: {e}")


upload_folder(
    folder_path=LOCAL_MODEL_PATH,
    repo_id=repo_id,
    token=hub_token,
    commit_message=f"Upload DPO adapter (gap{train_gap}, n={train_data_size})"
)

print(f"✅ Model uploaded to: https://huggingface.co/{repo_id}")

✅ Repository created: https://huggingface.co/HannahGrj/dpo-lfm2-350m-single-gap2-n5000


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

✅ Model uploaded to: https://huggingface.co/HannahGrj/dpo-lfm2-350m-single-gap2-n5000
