# DPO Training Pipeline for Restaurant Recommendation

This notebook implements Direct Preference Optimization (DPO) to train a language model for personalized restaurant recommendations using user profiles and business data.

## 0. Environment Setup

Install required packages and configure Google Drive access.

In [None]:
!pip install trl
!pip install bitsandbytes
!pip install huggingface_hub

Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.25.1-py3-none-any.whl (465 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m465.5/465.5 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.25.1
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [None]:
# Mount Google Drive under /content/drive/ so the project paths used below
# (data, models, results) are reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')
print("Google Drive remounted successfully.")

Mounted at /content/drive/
Google Drive remounted successfully.


In [None]:
import os
import sys
import torch
import json
import argparse
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
from trl import DPOTrainer, DPOConfig
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import wandb
import random

# Add the project folder on Drive to the import path so modules stored there
# can be imported from this notebook.
project_path_llm4rec = '/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec'
sys.path.append(project_path_llm4rec)

In [None]:
# Base model and project locations.
MODEL_NAME = "LiquidAI/LFM2-350M"
BASE_PROJECT_PATH = "/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/yanzhen_final_list"
WANDB_PROJECT = "LLM4Rec-DPO-List"
RANDOM_SEED = 42

# Dataset sizing: number of test / train examples kept after loading, and the
# minimum star gap required between chosen and rejected reviews for training.
test_data_size = 200
train_gap = 2
train_data_size = 2000

NUM_EPOCHS = 2

# SECURITY: never hardcode access tokens in a notebook. The token is read from
# the HF_TOKEN environment variable (in Colab, set it from the Secrets panel
# before running); None means the Hub is accessed anonymously.
# The token that used to be committed here must be treated as leaked and revoked.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Seed Python's global RNG, which drives the random candidate ordering below.
random.seed(RANDOM_SEED)

## Experiment Configuration

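The configuration cell above sets the key experiment parameters: the base model, project paths, train/test dataset sizes, the minimum star gap for training pairs, the number of epochs, and the random seed.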


Code Doc: https://github.com/HannahGuan/CS329H_DiningbyDesign/tree/main/LLM4Rec

## 1. Preference Dataset Creation

Define functions to create DPO preference pairs from user reviews. The dataset pairs high-rated restaurants (chosen) with low-rated restaurants (rejected) for each user, based on their review history.

In [None]:
# DPO Dataset Creation Functions

def load_jsonl(filepath):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed with ``json.loads``; lines that are empty
    or contain only whitespace are skipped.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [json.loads(raw_line) for raw_line in handle if raw_line.strip()]

def create_business_lookup(business_data):
    """Index business records by their ``business_id``.

    If several records share an id, the last one in ``business_data`` wins.
    """
    lookup = {}
    for record in business_data:
        lookup[record['business_id']] = record
    return lookup

def find_best_and_worst_reviews(reviews, min_gap=1):
    """
    Pick the highest-starred and lowest-starred review from a list.

    Returns ``(best_review, worst_review)``. Returns ``(None, None)`` when
    fewer than two reviews are given or when the star difference between the
    two picks is smaller than ``min_gap``.
    """
    no_pair = (None, None)

    if not reviews:
        return no_pair
    if len(reviews) < 2:
        return no_pair

    # Stable ascending sort: among ties, the earliest low-star review ends up
    # first and the latest high-star review ends up last.
    by_stars = sorted(reviews, key=lambda review: review['stars'])
    lowest, highest = by_stars[0], by_stars[-1]

    if highest['stars'] - lowest['stars'] < min_gap:
        return no_pair

    return highest, lowest

def _write_jsonl(path, records):
    """Write ``records`` to ``path`` as UTF-8 JSON Lines (one object per line)."""
    with open(path, 'w', encoding='utf-8') as f:
        for record in records:
            json.dump(record, f, ensure_ascii=False)
            f.write('\n')


def _preference_side(user_profile, review, business_id, business_profile):
    """Build the 'chosen' / 'rejected' sub-record for one reviewed business."""
    return {
        'business_id': business_id,
        'business_name': review['name'],
        'business_profile': business_profile,
        'text': f"{user_profile}\n\n{business_profile}",
        'rating': review['stars'],
        'review_text': review['text']
    }


def create_dpo_dataset(user_profiles_path, business_path, output_dir, train_min_gap=2, test_min_gap=1, test_size=0.2, random_state=42):
    """
    Create DPO preference dataset with train/test split.

    For every user, the highest-rated review becomes the "chosen" side and the
    lowest-rated review the "rejected" side. Users with fewer than two reviews
    or no star difference are skipped, as are users whose best or worst
    business is missing from the business file. The remaining examples are
    split into train/test first and only then filtered by star gap, so
    changing a gap threshold does not move a user between splits.

    Args:
        user_profiles_path: Path to user profiles JSONL
        business_path: Path to business JSONL
        output_dir: Directory to save output files (created if missing)
        train_min_gap: Minimum star gap for training set (default: 2)
        test_min_gap: Minimum star gap for test set (default: 1)
        test_size: Fraction of data for test set (default: 0.2)
        random_state: Random seed for reproducibility

    Returns:
        (train_data, test_data): the filtered example lists, which are also
        written to ``dpo_train_gap{train_min_gap}.jsonl`` and
        ``dpo_test_gap{test_min_gap}.jsonl`` inside ``output_dir``.
    """
    users = load_jsonl(user_profiles_path)
    business_lookup = create_business_lookup(load_jsonl(business_path))

    all_dpo_data = []
    skipped_no_variance = 0
    missing_business_count = 0

    for user in tqdm(users, desc="Processing users"):
        user_id = user['user_id']
        user_profile = user.get('profile', '')
        reviews = user.get('reviews', [])

        # Use min_gap=1 to get all potential examples; the per-split gap
        # thresholds are applied after the train/test split below.
        best_review, worst_review = find_best_and_worst_reviews(reviews, min_gap=1)

        if best_review is None or worst_review is None:
            skipped_no_variance += 1
            continue

        best_business_id = best_review['business_id']
        worst_business_id = worst_review['business_id']

        best_business = business_lookup.get(best_business_id)
        worst_business = business_lookup.get(worst_business_id)

        if not best_business or not worst_business:
            missing_business_count += 1
            continue

        all_dpo_data.append({
            'user_id': user_id,
            'user_profile': user_profile,
            'star_gap': best_review['stars'] - worst_review['stars'],
            'chosen': _preference_side(
                user_profile, best_review, best_business_id, best_business.get('profile', '')
            ),
            'rejected': _preference_side(
                user_profile, worst_review, worst_business_id, worst_business.get('profile', '')
            ),
        })

    train_data_all, test_data_all = train_test_split(
        all_dpo_data,
        test_size=test_size,
        random_state=random_state
    )

    train_data = [ex for ex in train_data_all if ex['star_gap'] >= train_min_gap]
    test_data = [ex for ex in test_data_all if ex['star_gap'] >= test_min_gap]

    os.makedirs(output_dir, exist_ok=True)

    train_path = os.path.join(output_dir, f"dpo_train_gap{train_min_gap}.jsonl")
    test_path = os.path.join(output_dir, f"dpo_test_gap{test_min_gap}.jsonl")

    _write_jsonl(train_path, train_data)
    _write_jsonl(test_path, test_data)

    # Report how many users were dropped so data-quality problems are visible.
    print(
        f"Skipped {skipped_no_variance} users without a usable rating gap and "
        f"{missing_business_count} users with missing business records"
    )
    print(f"Created {len(train_data)} training and {len(test_data)} test examples")

    return train_data, test_data

## 2. Model Training Setup

Set the training hyperparameters, LoRA and quantization settings, and logging options, then load the tokenizer and the base model.

In [None]:
# Training configuration
LEARNING_RATE = 1e-5
BATCH_SIZE = 4  # Reduce to 2 or 1 if you run out of memory
EVAL_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_STEPS = 0
MAX_LENGTH = 2048

# LoRA configuration
LORA_R = 64
LORA_ALPHA = 128
LORA_DROPOUT = 0.05

# Quantization
USE_4BIT = True

# Logging
LOGGING_STEPS = 4
EVAL_STEPS = 250
SAVE_STEPS = 250
SAVE_TOTAL_LIMIT = 2

# Weights & Biases
USE_WANDB = True
WANDB_RUN_NAME = f"dpo-{MODEL_NAME.split('/')[-1]}-gap{train_gap}-n{train_data_size}-epoch{NUM_EPOCHS}-lr{LEARNING_RATE}-bs{BATCH_SIZE}"
# SECURITY: never hardcode API keys in a notebook. The key is read from the
# WANDB_API_KEY environment variable; an empty string means "no key provided",
# in which case the explicit login step below is skipped.
# The key that used to be committed here must be treated as leaked and revoked.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")

# Target modules for LoRA
TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "fc1", "fc2",
    "w1", "w2", "w3",
]

In [None]:
# Tokenizer: the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token

# Base model: either plain bfloat16 weights or a 4-bit NF4-quantized load.
if not USE_4BIT:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        token=HF_TOKEN,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )
else:
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        token=HF_TOKEN,
        quantization_config=quantization_config
    )

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/999 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/709M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

## 3. Data Loading and Utility Functions

Define functions to load and format DPO preference datasets for training.

In [None]:
def load_dpo_data(filepath):
    """Read a DPO preference dataset stored as JSON Lines.

    Returns one parsed object per non-blank line, in file order.
    """
    with open(filepath, 'r', encoding='utf-8') as source:
        non_blank = (row for row in source if row.strip())
        return list(map(json.loads, non_blank))

def format_preference_data(dpo_data):
    """
    Turn DPO preference records into raw prompt / chosen / rejected strings.

    For every record, the two candidate restaurants are listed in the user
    message in random order (decided by ``random.random()``, so seed the global
    RNG for reproducibility). The message contains no chat-template tokens;
    ``data_formulate`` applies the chat template afterwards.

    The target strings are two-element lists of business names: the chosen
    answer ranks the preferred business first, the rejected answer ranks it
    last. They are serialised with ``json.dumps`` so that names containing
    quotes or backslashes still form a parseable list.

    Returns:
        List of dicts with keys ``prompt``, ``chosen`` and ``rejected``.
    """
    formatted_data = []

    chosen_first_count = 0
    chosen_second_count = 0

    for item in tqdm(dpo_data, desc="Formatting data"):
        chosen_name = item['chosen']['business_name']
        rejected_name = item['rejected']['business_name']

        user_profile = item['user_profile']
        chosen_profile = item['chosen']['business_profile']
        rejected_profile = item['rejected']['business_profile']

        # Randomise presentation order so position in the prompt carries no
        # information about which candidate is preferred.
        candidates = [(chosen_name, chosen_profile), (rejected_name, rejected_profile)]
        if random.random() < 0.5:
            chosen_first_count += 1
        else:
            candidates.reverse()
            chosen_second_count += 1
        (first_name, first_profile), (second_name, second_profile) = candidates

        # Raw user message only; chat template formatting is applied by data_formulate()
        user_msg = (
            f"User Profile:\n{user_profile}\n\n"
            f"Restaurant Candidates:\n"
            f"- {first_name}: {first_profile}\n\n"
            f"- {second_name}: {second_profile}\n\n"
            f"Rank these restaurants from best to worst match for this user."
        )

        # json.dumps yields the same '["A", "B"]' layout as before for plain
        # names, and escapes embedded quotes/backslashes instead of producing
        # a broken list literal.
        chosen_output = json.dumps([chosen_name, rejected_name], ensure_ascii=False)
        rejected_output = json.dumps([rejected_name, chosen_name], ensure_ascii=False)

        formatted_data.append({
            "prompt": user_msg,  # Raw user message, no chat template tokens
            "chosen": chosen_output,
            "rejected": rejected_output,
        })

    print("Chosen first count: ", chosen_first_count)
    print("Chosen second count: ", chosen_second_count)

    return formatted_data

def data_formulate(data, tokenizer):
    """
    Wrap a raw ranking request in the tokenizer's chat template.

    ``data['prompt']`` is used as the user turn and is preceded by a fixed
    system instruction describing the ranking task and the required output
    format. The template is rendered as text (not token ids) with the
    generation prompt appended. The first rendered prompt is printed once for
    inspection; a ``printed`` attribute on this function records that.
    """
    instruction_parts = [
        "You are a preference-aware ranking assistant. ",
        "Given a user's dining preferences and descriptions of candidate restaurants, ",
        "your job is to rank the restaurants from most to least suitable.\n\n",
        "IMPORTANT: Your final output MUST ONLY contain the business names in a Python list, ",
        "ranked in descending order of suitability.\n\n",
        "The REQUIRED output format is exactly:\n",
        '["business_name_1", "business_name_2"]\n\n',
        "Do NOT include numbering, line breaks, explanations, or any other text.",
    ]
    system_prompt = "".join(instruction_parts)

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": data['prompt']},
    ]
    rendered = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True
    )

    # Debug aid: show the fully rendered prompt only on the first call.
    if hasattr(data_formulate, 'printed'):
        return rendered

    print("Sample Final Prompt: ", rendered)
    data_formulate.printed = True
    return rendered

def print_trainable_parameters(model):
    """Print trainable vs. total parameter counts of ``model``.

    Counts are summed over ``model.named_parameters()``; a parameter counts as
    trainable when its ``requires_grad`` flag is set. The percentage is
    reported as 0 for a model without parameters instead of dividing by zero.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    percentage = 100 * trainable_params / all_param if all_param else 0.0

    print(f"\n🚀 Trainable parameters: {trainable_params:,}")
    print(f"📦 Total parameters:     {all_param:,}")
    print(f"📈 Percentage:           {percentage:.4f}%\n")

In [None]:
# Destination for the generated DPO train/test JSONL files.
OUTPUT_DIR = f"{BASE_PROJECT_PATH}/data"

# Source files shared across the project (user profiles and business records).
user_profiles_path = f"{project_path_llm4rec}/data/user_profiles.jsonl"
business_path = f"{project_path_llm4rec}/data/business.jsonl"

for label, folder in (("Models", "models"), ("Results", "result")):
    print(f"{label} folder: {BASE_PROJECT_PATH}/{folder}")

# Build the preference pairs and write the train/test splits to OUTPUT_DIR.
dataset_options = dict(
    user_profiles_path=user_profiles_path,
    business_path=business_path,
    output_dir=OUTPUT_DIR,
    train_min_gap=train_gap,
    test_min_gap=1,
    test_size=0.2,
    random_state=RANDOM_SEED,
)
train_data, test_data = create_dpo_dataset(**dataset_options)

Models folder: /content/drive/MyDrive/329H_Final_project/LLM4Rec/yanzhen_final_list/models
Results folder: /content/drive/MyDrive/329H_Final_project/LLM4Rec/yanzhen_final_list/result


Processing users:   0%|          | 0/20000 [00:00<?, ?it/s]

Created 11140 training and 3504 test examples


In [None]:
# Locate the JSONL splits written by create_dpo_dataset.
dpo_data_dir = f"{BASE_PROJECT_PATH}/data"
full_train_data_path = f"{dpo_data_dir}/dpo_train_gap{train_gap}.jsonl"
full_test_data_path = f"{dpo_data_dir}/dpo_test_gap1.jsonl"

train_preference_data = load_dpo_data(full_train_data_path)
test_preference_data = load_dpo_data(full_test_data_path)

# Keep only the first N examples of each split. Training is formatted before
# test so the random candidate ordering consumes the RNG in the same sequence.
train_subset = train_preference_data[:train_data_size]
test_subset = test_preference_data[:test_data_size]
train_data = format_preference_data(train_subset)
test_data = format_preference_data(test_subset)

Formatting data:   0%|          | 0/2000 [00:00<?, ?it/s]

Chosen first count:  969
Chosen second count:  1031


Formatting data:   0%|          | 0/200 [00:00<?, ?it/s]

Chosen first count:  94
Chosen second count:  106


In [None]:
# Training split: chat-templated prompts plus the chosen / rejected answers.
train_columns = {
    'prompt': [data_formulate(row, tokenizer) for row in tqdm(train_data, desc="Formatting training")],
    'chosen': [row['chosen'] for row in train_data],
    'rejected': [row['rejected'] for row in train_data],
}
prompt_list = train_columns['prompt']
chosen_list = train_columns['chosen']
rejected_list = train_columns['rejected']
train_dataset = Dataset.from_dict(train_columns)

# Test split, built the same way.
test_columns = {
    'prompt': [data_formulate(row, tokenizer) for row in tqdm(test_data, desc="Formatting test")],
    'chosen': [row['chosen'] for row in test_data],
    'rejected': [row['rejected'] for row in test_data],
}
test_prompt_list = test_columns['prompt']
test_chosen_list = test_columns['chosen']
test_rejected_list = test_columns['rejected']
test_dataset = Dataset.from_dict(test_columns)

Formatting training:   0%|          | 0/2000 [00:00<?, ?it/s]

Sample Final Prompt:  <|startoftext|><|im_start|>system
You are a preference-aware ranking assistant. Given a user's dining preferences and descriptions of candidate restaurants, your job is to rank the restaurants from most to least suitable.

IMPORTANT: Your final output MUST ONLY contain the business names in a Python list, ranked in descending order of suitability.

The REQUIRED output format is exactly:
["business_name_1", "business_name_2"]

Do NOT include numbering, line breaks, explanations, or any other text.<|im_end|>
<|im_start|>user
User Profile:
Porca is a laid‚Äëback foodie who gravitates toward classic American comfort fare and iconic regional specialties, especially cheesesteaks, pizza, and beer‚Äëfriendly venues. He prefers hearty, budget‚Äëto‚Äëmid‚Äërange dishes (price range 1‚Äë2) and values cheap, filling meals over upscale dining. His favorite spots are casual, lively, and unpretentious ‚Äì often family‚Äëfriendly, noisy enough to feel bustling, and sometimes tour

Formatting test:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
# Sanity check: show the "chosen" ranking string of the first test example.
print(test_dataset[0]["chosen"])

["Street- Taco and Beer Co.", "Trident Grill III"]


## 4. Initialize DPO Trainer

Set up Weights & Biases logging, configure the trainer, and initialize the DPO training process.

In [None]:
# Log in to Weights & Biases only when logging is enabled and a key is set.
# The key is also exported through the environment before the explicit login.
if USE_WANDB and WANDB_API_KEY:
    os.environ["WANDB_API_KEY"] = WANDB_API_KEY
    wandb.login(key=WANDB_API_KEY)

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myanzhen4_stanford[0m ([33myanzhen4_stanford-stanford-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
def save_adapter(
    base_output_dir,
    model_name,
    train_data_size,
    train_gap,
    num_epochs,
    batch_size,
    learning_rate
):
    """Create and return the output directory for a training run.

    Despite its name, this function does not save any weights: it derives a
    descriptive folder name from the run settings, creates that folder under
    ``base_output_dir`` if needed, and returns its path.
    """
    # Drop the organisation prefix, e.g. "org/model" -> "model".
    short_name = model_name.rsplit("/", 1)[-1]
    run_tag = "_".join([
        short_name,
        f"gap{train_gap}",
        f"n{train_data_size}",
        f"ep{num_epochs}",
        f"bs{batch_size}",
        f"lr{learning_rate}",
    ])
    run_dir = os.path.join(base_output_dir, run_tag)
    os.makedirs(run_dir, exist_ok=True)
    return run_dir

# All runs are stored under <project>/models, one sub-folder per configuration.
base_output_dir = f"{BASE_PROJECT_PATH}/models"

# Create (if needed) the run-specific folder; it is used below as the trainer's
# output directory and as the location of the saved adapter.
model_output_dir = save_adapter(
    base_output_dir=base_output_dir,
    model_name=MODEL_NAME,
    train_data_size=train_data_size,
    train_gap=train_gap,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE
)

In [None]:
# Trainer arguments, taken from the configuration constants defined earlier.
# Evaluation is only scheduled when a non-empty test subset was requested.
training_args = DPOConfig(
    output_dir=model_output_dir,
    logging_steps=LOGGING_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    save_only_model=True,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    warmup_steps=WARMUP_STEPS,
    eval_strategy="steps" if test_data_size > 0 else "no",
    eval_steps=EVAL_STEPS if test_data_size > 0 else None,
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    report_to="wandb" if USE_WANDB else "none",
    run_name=WANDB_RUN_NAME if USE_WANDB else None,
    remove_unused_columns=False,
    max_length=MAX_LENGTH,
)

# Start the W&B run explicitly so the LoRA / quantization settings are
# recorded in the run config alongside the main hyperparameters.
if USE_WANDB:
    wandb.init(
        project=WANDB_PROJECT,
        name=WANDB_RUN_NAME,
        config={
            "model": MODEL_NAME,
            "learning_rate": LEARNING_RATE,
            "batch_size": BATCH_SIZE,
            "epochs": NUM_EPOCHS,
            "lora_r": LORA_R,
            "lora_alpha": LORA_ALPHA,
            "lora_dropout": LORA_DROPOUT,
            "use_4bit": USE_4BIT,
        }
    )

# LoRA adapter settings, passed to the trainer via peft_config.
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
)

# Build the DPO trainer on the prompt/chosen/rejected datasets; the eval
# dataset is omitted when no test examples were requested.
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset if test_data_size > 0 else None,
    processing_class=tokenizer,
    peft_config=peft_config,
)

# Report trainable vs. total parameters of the trainer's model.
print_trainable_parameters(dpo_trainer.model)

[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Extracting prompt in train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]


üöÄ Trainable parameters: 19,267,584
üì¶ Total parameters:     230,096,640
üìà Percentage:           8.3737%



## 5. Train the Model

Execute the DPO training process with the configured parameters.

In [None]:
# Run DPO fine-tuning with the trainer configured in the previous section.
dpo_trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 7}.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
250,0.6625,0.704732,-0.208039,-0.211047,0.48,0.003008,-21.763189,-21.411318,-1.330446,-1.332307
500,0.7593,0.711585,-0.722706,-0.735803,0.52,0.013097,-26.909863,-26.658882,-1.207471,-1.208429
750,0.607,0.720248,-0.878601,-0.907468,0.545,0.028866,-28.468813,-28.375526,-1.190576,-1.195058
1000,0.6716,0.738908,-0.727516,-0.739064,0.505,0.011547,-26.957964,-26.691484,-1.14091,-1.145836


TrainOutput(global_step=1000, training_loss=0.6455441553592682, metrics={'train_runtime': 369.8513, 'train_samples_per_second': 10.815, 'train_steps_per_second': 2.704, 'total_flos': 0.0, 'train_loss': 0.6455441553592682, 'epoch': 2.0})

## 6. Save the Model

Save the trained LoRA adapter and tokenizer to disk.

In [None]:
# Save the trained model and the tokenizer into the run-specific folder so
# both can be reloaded together for inference.
final_model_path = model_output_dir
dpo_trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Model saved to: {final_model_path}")

Model saved to: /content/drive/MyDrive/329H_Final_project/LLM4Rec/yanzhen_final_list/models/LFM2-350M_gap2_n2000_ep2_bs4_lr1e-05


## 7. Model Evaluation

Test the trained model by comparing perplexity scores between base and trained models on the test set.

In [None]:
import numpy as np
import torch.nn.functional as F
from peft import PeftModel

def load_model_for_inference(base_model_name, adapter_path=None):
    """Load a bfloat16 model and tokenizer for evaluation.

    The tokenizer reuses EOS as its padding token. When ``adapter_path`` is
    given, the adapter stored there is applied to the base model and merged
    into its weights; otherwise the plain base model is returned.

    Returns:
        (model, tokenizer)
    """
    inf_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    inf_tokenizer.pad_token = inf_tokenizer.eos_token

    load_options = {"torch_dtype": torch.bfloat16, "device_map": "auto"}
    inf_model = AutoModelForCausalLM.from_pretrained(base_model_name, **load_options)

    if not adapter_path:
        return inf_model, inf_tokenizer

    with_adapter = PeftModel.from_pretrained(inf_model, adapter_path)
    return with_adapter.merge_and_unload(), inf_tokenizer

def calculate_perplexity(model, tokenizer, prompt, response):
    """Calculate the perplexity of ``response`` conditioned on ``prompt``.

    ``prompt + response`` is scored in one forward pass, and only the token
    positions after the prompt contribute to the average loss.

    The per-token cross-entropy is computed in float32: the model may run in
    bfloat16, and averaging losses at that precision is too coarse to compare
    two candidate responses whose scores are close.

    Args:
        model: Causal LM; called with the tokenizer output, must return an
            object with ``.logits``.
        tokenizer: Callable returning ``input_ids`` tensors.
        prompt: Prompt text that precedes the response.
        response: Response text to score.

    Returns:
        (perplexity, avg_loss) where ``perplexity == exp(avg_loss)``.

    Raises:
        ValueError: if the response contributes no tokens, which would
            otherwise yield a silent NaN.
    """
    full_text = prompt + response
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)

    # NOTE(review): the prompt is tokenized separately to find where the
    # response starts; this assumes tokenizing prompt+response does not merge
    # tokens across the boundary - confirm for the tokenizer in use.
    prompt_length = tokenizer(prompt, return_tensors="pt")['input_ids'].shape[1]

    with torch.no_grad():
        # No labels are passed: the model's own full-sequence loss is not
        # needed, only the logits.
        logits = model(**inputs).logits

    input_ids = inputs['input_ids']
    # Position t predicts token t+1, hence the one-step shift.
    shift_logits = logits[..., :-1, :]
    shift_labels = input_ids[..., 1:]

    token_losses = torch.nn.functional.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)).float(),
        shift_labels.reshape(-1),
        reduction='none',
    ).view(shift_labels.shape)

    # After the shift, the first response token sits at index prompt_length - 1.
    response_losses = token_losses[:, prompt_length - 1:]
    if response_losses.numel() == 0:
        raise ValueError("Cannot compute perplexity: the response contains no tokens.")

    avg_loss = response_losses.mean().item()
    perplexity = np.exp(avg_loss)

    return perplexity, avg_loss

def evaluate_list_perplexity(model, tokenizer, formatted_prompt_str, list_response):
    """Score a ranked-list response against a prompt that is already fully formatted.

    Thin wrapper around ``calculate_perplexity``; returns ``(perplexity, avg_loss)``.
    """
    scores = calculate_perplexity(model, tokenizer, formatted_prompt_str, list_response)
    ppl, mean_loss = scores
    return ppl, mean_loss

In [None]:
# Display the evaluation dataset summary (column names and number of rows).
test_dataset

Dataset({
    features: ['prompt', 'chosen', 'rejected'],
    num_rows: 200
})

In [None]:
# Load two fresh bfloat16 copies of the model: the untouched base model and
# the base model with the trained adapter merged in.
adapter_path = model_output_dir
base_model, base_tokenizer = load_model_for_inference(MODEL_NAME, adapter_path=None)
trained_model, trained_tokenizer = load_model_for_inference(MODEL_NAME, adapter_path=adapter_path)

base_chosen_perplexities = []
base_rejected_perplexities = []
trained_chosen_perplexities = []
trained_rejected_perplexities = []

base_correct = 0
trained_correct = 0

# Score the chat-templated prompts in `test_dataset`, i.e. the exact prompt
# format used during training. `test_data` holds only the raw user message
# (no system prompt, no chat template), so scoring against it would evaluate
# both models on a prompt format they were never trained or asked to answer in.
for sample in tqdm(test_dataset, desc="Evaluating"):
    full_prompt_for_inference = sample['prompt']
    chosen_response = sample['chosen']
    rejected_response = sample['rejected']

    # Base model
    chosen_perp_base, _ = evaluate_list_perplexity(
        base_model, base_tokenizer, full_prompt_for_inference, chosen_response
    )
    rejected_perp_base, _ = evaluate_list_perplexity(
        base_model, base_tokenizer, full_prompt_for_inference, rejected_response
    )
    base_chosen_perplexities.append(chosen_perp_base)
    base_rejected_perplexities.append(rejected_perp_base)

    # A sample counts as correct when the preferred ranking is less surprising
    # (lower perplexity) than the reversed ranking.
    if chosen_perp_base < rejected_perp_base:
        base_correct += 1

    # Trained model
    chosen_perp_trained, _ = evaluate_list_perplexity(
        trained_model, trained_tokenizer, full_prompt_for_inference, chosen_response
    )
    rejected_perp_trained, _ = evaluate_list_perplexity(
        trained_model, trained_tokenizer, full_prompt_for_inference, rejected_response
    )
    trained_chosen_perplexities.append(chosen_perp_trained)
    trained_rejected_perplexities.append(rejected_perp_trained)

    if chosen_perp_trained < rejected_perp_trained:
        trained_correct += 1

`torch_dtype` is deprecated! Use `dtype` instead!


Evaluating:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
import numpy as np

# Use the number of samples that were actually scored as the denominator, so
# the accuracies stay correct even if the evaluation loop covered a different
# number of rows than the raw test list.
num_samples = len(base_chosen_perplexities)
if num_samples == 0:
    raise ValueError("No evaluation samples were scored; run the evaluation loop first.")

# Calculate average perplexities
average_base_chosen_perplexity = np.mean(base_chosen_perplexities)
average_base_rejected_perplexity = np.mean(base_rejected_perplexities)
average_trained_chosen_perplexity = np.mean(trained_chosen_perplexities)
average_trained_rejected_perplexity = np.mean(trained_rejected_perplexities)

# Positive difference = the preferred ranking is, on average, less surprising
# to the model than the reversed ranking.
base_ppl_diff = average_base_rejected_perplexity - average_base_chosen_perplexity
trained_ppl_diff = average_trained_rejected_perplexity - average_trained_chosen_perplexity

base_accuracy = base_correct / num_samples
trained_accuracy = trained_correct / num_samples

print("\n" + "="*60)
print("EVALUATION RESULTS")
print("="*60)
print(f"\n📊 Base Model:")
print(f"  Chosen Perplexity:   {average_base_chosen_perplexity:.2f}")
print(f"  Rejected Perplexity: {average_base_rejected_perplexity:.2f}")
print(f"  PPL Difference:      {base_ppl_diff:.2f} (rejected - chosen)")
print(f"  Accuracy:            {base_accuracy:.2%} ({base_correct}/{num_samples})")

print(f"\n🎯 Trained Model:")
print(f"  Chosen Perplexity:   {average_trained_chosen_perplexity:.2f}")
print(f"  Rejected Perplexity: {average_trained_rejected_perplexity:.2f}")
print(f"  PPL Difference:      {trained_ppl_diff:.2f} (rejected - chosen)")
print(f"  Accuracy:            {trained_accuracy:.2%} ({trained_correct}/{num_samples})")

print(f"\n📈 Improvement:")
print(f"  Accuracy Gain:       {(trained_accuracy - base_accuracy):.2%}")
print(f"  PPL Diff Gain:       {(trained_ppl_diff - base_ppl_diff):.2f}")
print("="*60 + "\n")

def save_evaluation_results(
    adapter_path,
    base_chosen_ppl,
    base_rejected_ppl,
    base_ppl_diff,
    base_accuracy,
    trained_chosen_ppl,
    trained_rejected_ppl,
    trained_ppl_diff,
    trained_accuracy,
    results_dir=None
):
    """Append one evaluation report to ``results.txt``.

    The report lists the adapter path, the base and trained model metrics,
    and the gains of the trained model over the base model, followed by a
    separator line. ``results_dir`` defaults to ``<BASE_PROJECT_PATH>/result``
    and is created when missing.
    """
    target_dir = f"{BASE_PROJECT_PATH}/result" if results_dir is None else results_dir
    os.makedirs(target_dir, exist_ok=True)
    results_path = os.path.join(target_dir, "results.txt")

    accuracy_gain = trained_accuracy - base_accuracy
    ppl_diff_gain = trained_ppl_diff - base_ppl_diff

    report_lines = [
        f"{adapter_path}\n",
        "Base Model:\n",
        f"  Chosen PPL:     {base_chosen_ppl:.2f}\n",
        f"  Rejected PPL:   {base_rejected_ppl:.2f}\n",
        f"  PPL Difference: {base_ppl_diff:.2f}\n",
        f"  Accuracy:       {base_accuracy:.2%}\n",
        "\nTrained Model:\n",
        f"  Chosen PPL:     {trained_chosen_ppl:.2f}\n",
        f"  Rejected PPL:   {trained_rejected_ppl:.2f}\n",
        f"  PPL Difference: {trained_ppl_diff:.2f}\n",
        f"  Accuracy:       {trained_accuracy:.2%}\n",
        "\nImprovement:\n",
        f"  Accuracy Gain:  {accuracy_gain:.2%}\n",
        f"  PPL Diff Gain:  {ppl_diff_gain:.2f}\n",
        "-" * 50 + "\n",
    ]

    # Append mode: earlier reports in the file are preserved.
    with open(results_path, 'a') as report_file:
        report_file.writelines(report_lines)

    print(f"Results appended to: {results_path}")


# Append this run's metrics to the shared results file (default results folder).
save_evaluation_results(
    adapter_path=adapter_path,
    base_chosen_ppl=average_base_chosen_perplexity,
    base_rejected_ppl=average_base_rejected_perplexity,
    base_ppl_diff=base_ppl_diff,
    base_accuracy=base_accuracy,
    trained_chosen_ppl=average_trained_chosen_perplexity,
    trained_rejected_ppl=average_trained_rejected_perplexity,
    trained_ppl_diff=trained_ppl_diff,
    trained_accuracy=trained_accuracy
)



EVALUATION RESULTS

üìä Base Model:
  Chosen Perplexity:   11.76
  Rejected Perplexity: 10.90
  PPL Difference:      -0.87 (rejected - chosen)
  Accuracy:            47.50% (95/200)

üéØ Trained Model:
  Chosen Perplexity:   11.85
  Rejected Perplexity: 11.16
  PPL Difference:      -0.69 (rejected - chosen)
  Accuracy:            45.50% (91/200)

üìà Improvement:
  Accuracy Gain:       -2.00%
  PPL Diff Gain:       0.17

Results appended to: /content/drive/MyDrive/329H_Final_project/LLM4Rec/yanzhen_final_list/result/results.txt


In [None]:
import numpy as np
import torch.nn.functional as F
from peft import PeftModel

def load_model_for_inference(base_model_name, adapter_path=None):
    """Load a bfloat16 model and tokenizer for inference.

    NOTE(review): a function with this name is also defined in an earlier
    evaluation cell; once this cell runs, this definition is the one in
    effect. Consider keeping a single definition.

    The tokenizer reuses EOS as its padding token. When ``adapter_path`` is
    given, the adapter stored there is applied to the base model and merged
    into its weights.

    Returns:
        (model, tokenizer)
    """
    inf_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    inf_tokenizer.pad_token = inf_tokenizer.eos_token

    inf_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    if adapter_path:
        # Apply the adapter, then fold it into the base weights.
        inf_model = PeftModel.from_pretrained(inf_model, adapter_path)
        inf_model = inf_model.merge_and_unload()

    return inf_model, inf_tokenizer

def calculate_perplexity(model, tokenizer, prompt, response):
    """Calculate the perplexity of ``response`` conditioned on ``prompt``.

    ``prompt + response`` is scored in one forward pass, and only the token
    positions after the prompt contribute to the average loss.

    The per-token cross-entropy is computed in float32: the model may run in
    bfloat16, and averaging losses at that precision is too coarse to compare
    two candidate responses whose scores are close.

    Args:
        model: Causal LM; called with the tokenizer output, must return an
            object with ``.logits``.
        tokenizer: Callable returning ``input_ids`` tensors.
        prompt: Prompt text that precedes the response.
        response: Response text to score.

    Returns:
        (perplexity, avg_loss) where ``perplexity == exp(avg_loss)``.

    Raises:
        ValueError: if the response contributes no tokens, which would
            otherwise yield a silent NaN.
    """
    full_text = prompt + response
    inputs = tokenizer(full_text, return_tensors="pt").to(model.device)

    # NOTE(review): the prompt is tokenized separately to find where the
    # response starts; this assumes tokenizing prompt+response does not merge
    # tokens across the boundary - confirm for the tokenizer in use.
    prompt_length = tokenizer(prompt, return_tensors="pt")['input_ids'].shape[1]

    with torch.no_grad():
        # No labels are passed: the model's own full-sequence loss is not
        # needed, only the logits.
        logits = model(**inputs).logits

    input_ids = inputs['input_ids']
    # Position t predicts token t+1, hence the one-step shift.
    shift_logits = logits[..., :-1, :]
    shift_labels = input_ids[..., 1:]

    token_losses = torch.nn.functional.cross_entropy(
        shift_logits.reshape(-1, shift_logits.size(-1)).float(),
        shift_labels.reshape(-1),
        reduction='none',
    ).view(shift_labels.shape)

    # After the shift, the first response token sits at index prompt_length - 1.
    response_losses = token_losses[:, prompt_length - 1:]
    if response_losses.numel() == 0:
        raise ValueError("Cannot compute perplexity: the response contains no tokens.")

    avg_loss = response_losses.mean().item()
    perplexity = np.exp(avg_loss)

    return perplexity, avg_loss

def evaluate_list_perplexity(model, tokenizer, formatted_prompt_str, list_response):
    """Score a list recommendation against a prompt that is already formatted.

    Delegates to ``calculate_perplexity`` and hands back its
    ``(perplexity, avg_loss)`` pair unchanged.
    """
    return calculate_perplexity(model, tokenizer, formatted_prompt_str, list_response)

# -------------------------------------------------------
# Generation-based Ranking Accuracy Functions
# -------------------------------------------------------
def generate_ranked_list(model, tokenizer, prompt, max_new_tokens=128, verbose=False):
    """
    Generate a ranked list from the model using greedy decoding.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Callable tokenizer that also provides ``decode`` and
            ``eos_token_id``.
        prompt: Prompt text to complete.
        max_new_tokens: Upper bound on the number of generated tokens.
        verbose: If True, print the prompt, the decoded completion and the
            extracted list.

    Returns:
        The first ``[...]`` span found in the completion, or the whole
        completion when no such span exists, with surrounding whitespace
        stripped.
    """
    import re

    if verbose:
        print("Prompt: ", prompt)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        # do_sample=False already selects greedy decoding. No temperature is
        # passed: it only matters when sampling, and supplying it alongside
        # do_sample=False made transformers warn that the flag is not valid.
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the new tokens (exclude the input prompt)
    new_tokens = outputs[0][len(inputs.input_ids[0]):]
    decoded = tokenizer.decode(new_tokens, skip_special_tokens=True)

    if verbose:
        print("Full model output:", decoded)

    # Extract just the JSON list (shortest span from the first [ to the next ])
    match = re.search(r'\[.*?\]', decoded, re.DOTALL)
    if match:
        decoded = match.group(0)
        if verbose:
            print("Extracted JSON:", decoded)

    return decoded.strip()

def extract_ranked_businesses(output_str):
    """
    Turn raw model output into a list of normalized business names.

    Two strategies are tried in order:
      1. Parse the whole text as a Python/JSON-style list literal. Only
         non-empty string entries are kept, each passed through
         ``normalize_name``.
      2. Otherwise take the first ``[...]`` span, split it on commas, strip
         quotes and whitespace, normalize, and drop entries that end up empty.

    Note that names normalizing to an empty string are dropped only by the
    second strategy. Returns an empty list when neither strategy applies.
    """
    import ast
    import re

    text = output_str.strip()

    # Strategy 1: the text is itself a list literal.
    try:
        literal = ast.literal_eval(text)
    except Exception:
        literal = None

    if isinstance(literal, list) and literal:
        return [
            normalize_name(entry)
            for entry in literal
            if entry and isinstance(entry, str)
        ]

    # Strategy 2: salvage whatever sits between the first pair of brackets.
    bracketed = re.search(r"\[([^\]]+)\]", text)
    if bracketed is None:
        # Model didn't follow the expected format at all.
        return []

    cleaned = (
        normalize_name(piece.strip().strip('"').strip("'"))
        for piece in bracketed.group(1).split(",")
    )
    return [name for name in cleaned if name]

def normalize_name(name):
    """
    Canonicalize a business name for loose comparison.

    The name is lowercased and every character outside a-z is removed
    (spaces, digits, punctuation, accented letters). Anything that is not a
    string yields an empty string.
    """
    import re

    if isinstance(name, str):
        # Lowercase first so that A-Z survive the a-z filter.
        return re.sub(r'[^a-z]', '', name.lower())
    return ""

def evaluate_ranking_accuracy(model, tokenizer, test_dataset, verbose=False):
    """
    Measure how often the model's generated list starts with the chosen business.

    For each sample, the first entry of the reference 'chosen' list is
    compared (after ``normalize_name``) with the first entry of the list the
    model generates for the prompt. The first entry of 'rejected' is parsed
    too, but it only appears in the verbose debug output and does not affect
    the score.

    Samples whose references cannot be parsed, or whose generation yields no
    names, are left out of the denominator.

    Args:
        model: The model to evaluate
        tokenizer: The tokenizer
        test_dataset: HF Dataset, or the dict of columns obtained by slicing
            one, with 'prompt', 'chosen', 'rejected' keys
        verbose: If True, print detailed debug info for the first three samples

    Returns:
        Tuple ``(accuracy, correct, total)``; ``(0.0, 0, 0)`` when no sample
        could be scored.
    """
    import ast

    # len() of a sliced-dataset dict would count its columns, so size that
    # case by the length of one column instead.
    if isinstance(test_dataset, dict):
        num_samples = len(test_dataset['prompt'])
    else:
        num_samples = len(test_dataset)

    prompts = test_dataset['prompt']
    chosen_outputs = test_dataset['chosen']
    rejected_outputs = test_dataset['rejected']

    hits = 0
    scored = 0

    for idx in tqdm(range(num_samples), desc="Evaluating ranking accuracy"):
        prompt = prompts[idx]
        chosen_output = chosen_outputs[idx]
        rejected_output = rejected_outputs[idx]

        # References are stored as text such as '["name1", "name2"]'.
        try:
            chosen_list = ast.literal_eval(chosen_output)
            rejected_list = ast.literal_eval(rejected_output)
            chosen_name_raw, rejected_name_raw = chosen_list[0], rejected_list[0]
        except Exception as e:
            print(f"Warning: Could not parse chosen/rejected for sample {idx}: {e}")
            continue

        chosen_name = normalize_name(chosen_name_raw)
        rejected_name = normalize_name(rejected_name_raw)

        # Debug printing is limited to the first three samples.
        show_verbose = verbose and idx < 3
        output = generate_ranked_list(model, tokenizer, prompt, verbose=show_verbose)
        ranked = extract_ranked_businesses(output)

        if show_verbose:
            print(f"\n{'='*60}")
            print(f"Sample {idx} Debug Info:")
            print(f"{'='*60}")
            print(f"Chosen (raw): {chosen_name_raw}")
            print(f"Chosen (normalized): {chosen_name}")
            print(f"Rejected (raw): {rejected_name_raw}")
            print(f"Rejected (normalized): {rejected_name}")
            print(f"Model output: {output}")
            print(f"Ranked (normalized): {ranked}")
            print(f"{'='*60}\n")

        # A generation with no recoverable names is not counted at all.
        if not ranked:
            continue

        scored += 1
        # Top-1 check: the chosen restaurant must be the first generated entry.
        if ranked[0] == chosen_name:
            hits += 1

    if scored == 0:
        print("Warning: No valid samples to evaluate!")
        return 0.0, 0, 0

    return hits / scored, hits, scored

In [None]:
# Compare base and trained models on generation-based top-1 ranking accuracy.
# NOTE(review): relies on test_dataset, base_model/base_tokenizer and
# trained_model/trained_tokenizer being defined by earlier cells.
print("\n" + "="*60)
print("GENERATION-BASED RANKING ACCURACY")
print("="*60)

# Set to True to print per-sample debug output for the first few samples.
VERBOSE_EVAL = False

# Evaluate on a subset of test dataset for generation
eval_size = min(200, len(test_dataset))

print(f"\nEvaluating on {eval_size} samples...")
# Section headers: the emoji were mis-encoded in the string literals; restored.
print("\n📊 Base Model (Generation):")
base_gen_accuracy, base_gen_correct, base_gen_total = evaluate_ranking_accuracy(
    base_model, base_tokenizer, test_dataset[:eval_size], verbose=VERBOSE_EVAL
)
print(f"  Ranking Accuracy: {base_gen_accuracy:.2%} ({base_gen_correct}/{base_gen_total})")

print("\n🎯 Trained Model (Generation):")
trained_gen_accuracy, trained_gen_correct, trained_gen_total = evaluate_ranking_accuracy(
    trained_model, trained_tokenizer, test_dataset[:eval_size], verbose=VERBOSE_EVAL
)
print(f"  Ranking Accuracy: {trained_gen_accuracy:.2%} ({trained_gen_correct}/{trained_gen_total})")

# The gain is the difference of the two accuracies, shown in percentage points.
print("\n📈 Improvement:")
print(f"  Accuracy Gain:  {(trained_gen_accuracy - base_gen_accuracy):.2%}")
print("="*60 + "\n")

# Save generation accuracy results to separate file
def save_generation_accuracy_results(
    adapter_path,
    base_gen_accuracy,
    base_gen_correct,
    base_gen_total,
    trained_gen_accuracy,
    trained_gen_correct,
    trained_gen_total,
    results_dir=None
):
    """Append a base-vs-trained generation accuracy report to result_accuracy.txt.

    Args:
        adapter_path: Written as the first line of the report entry.
        base_gen_accuracy, base_gen_correct, base_gen_total: Accuracy fraction
            and its numerator/denominator for the base model.
        trained_gen_accuracy, trained_gen_correct, trained_gen_total: Same
            three values for the trained model.
        results_dir: Directory holding the report file; created if missing.
            Defaults to ``{BASE_PROJECT_PATH}/result``.
    """
    target_dir = f"{BASE_PROJECT_PATH}/result" if results_dir is None else results_dir
    os.makedirs(target_dir, exist_ok=True)
    results_path = os.path.join(target_dir, "result_accuracy.txt")

    accuracy_gain = trained_gen_accuracy - base_gen_accuracy
    # Assemble the whole entry first, then append it in one write.
    report = "".join([
        f"{adapter_path}\n",
        "Generation-Based Ranking Accuracy:\n",
        "\nBase Model:\n",
        f"  Accuracy: {base_gen_accuracy:.2%} ({base_gen_correct}/{base_gen_total})\n",
        "\nTrained Model:\n",
        f"  Accuracy: {trained_gen_accuracy:.2%} ({trained_gen_correct}/{trained_gen_total})\n",
        "\nImprovement:\n",
        f"  Accuracy Gain: {accuracy_gain:.2%}\n",
        "-" * 50 + "\n",
    ])

    with open(results_path, 'a') as f:
        f.write(report)

    print(f"Generation accuracy results appended to: {results_path}")

# Append this run's base-vs-trained generation accuracy to the results file.
# results_dir is not passed, so the function's default location is used.
# NOTE(review): adapter_path is not defined in this cell; it must come from an
# earlier cell.
save_generation_accuracy_results(
    adapter_path=adapter_path,
    base_gen_accuracy=base_gen_accuracy,
    base_gen_correct=base_gen_correct,
    base_gen_total=base_gen_total,
    trained_gen_accuracy=trained_gen_accuracy,
    trained_gen_correct=trained_gen_correct,
    trained_gen_total=trained_gen_total
)



GENERATION-BASED RANKING ACCURACY

Evaluating on 200 samples...

📊 Base Model (Generation):


Evaluating ranking accuracy:   0%|          | 0/200 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


  Ranking Accuracy: 56.50% (113/200)

🎯 Trained Model (Generation):


Evaluating ranking accuracy:   0%|          | 0/200 [00:00<?, ?it/s]

  Ranking Accuracy: 52.50% (105/200)

📈 Improvement:
  Accuracy Gain:  -4.00%

Generation accuracy results appended to: /content/drive/MyDrive/329H_Final_project/LLM4Rec/yanzhen_final_list/result/result_accuracy.txt
