### 0. Setting

In [1]:
!pip install trl
!pip install bitsandbytes
!pip install huggingface_hub

Collecting trl
  Downloading trl-0.25.1-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.25.1-py3-none-any.whl (465 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m465.5/465.5 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: trl
Successfully installed trl-0.25.1
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl (59.4 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.2


In [2]:
from google.colab import drive
drive.mount('/content/drive/')
print("Google Drive remounted successfully.")

Mounted at /content/drive/
Google Drive remounted successfully.


In [3]:
import os
import sys
import torch
import json
import argparse
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
from trl import DPOTrainer, DPOConfig
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
import wandb
import random

# CHANGE IT BASED ON YOUR GOOGLE DRIVE STRUCTURE
project_path_llm4rec = '/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec'
print(f"Listing contents of '{project_path_llm4rec}':")
!ls {project_path_llm4rec}

sys.path.append(project_path_llm4rec)

Listing contents of '/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec':
hannah_GPT4o_baseline  justin_LFM_fulllist   yanzhen_final_single
hannah_qwen_list       justin_Qwen_fulllist
hannah_qwen_single     yanzhen_final_list


Code Doc: https://github.com/HannahGuan/CS329H_DiningbyDesign/tree/main/LLM4Rec

### 1. Create Preference Dataset and Setup Training

In [4]:
from datasets import load_dataset


def process_to_raw_dpo_pushHF(min_gap = 2):
  data = load_dataset("zetianli/CS329H_Project_user_profiles",split="train")

  def get_dpo_sample(row):
    reviews = row['reviews']
    sorted_reviews = sorted(reviews, key=lambda x: x['rating'], reverse=True)
    star_gap = sorted_reviews[0]['stars'] - sorted_reviews[-1]['stars']
    # suffle reviews
    random.shuffle(reviews)
    business_profiles = [r['profile'] for r in reviews]

    chosen_rating = [r['stars'] for r in sorted_reviews]
    rejected_rating = [r['stars'] for r in reviews]

    chosen_business_id = [r['business_id'] for r in sorted_reviews]
    rejected_business_id = [r['business_id'] for r in reviews]

    chosen_business_name = [r['name'] for r in sorted_reviews]
    rejected_business_name = [r['name'] for r in reviews]

    chosen = {
        "business_id": chosen_business_id,
        "business_name": chosen_business_name,
        "rating": chosen_rating
    }

    rejected = {
        "business_id": rejected_business_id,
        "business_name": rejected_business_name,
        "rating": rejected_rating
    }

    rating_map = json.dumps({r['name']: r['stars'] for r in sorted_reviews})
    user_profile = row['profile']
    user_id = row['user_id']

    sample = {
        "user_id": user_id,
        "user_profile": user_profile,
        "star_gap": star_gap,
        "chosen": chosen,
        "rejected": rejected,
        "rating_map": rating_map,
        "business_profiles": business_profiles,
    }
    return sample

  dpo_data = []
  for row in data:
    sample = get_dpo_sample(row)
    if sample['star_gap'] < min_gap:
      continue
    dpo_data.append(sample)

  dpo_dataset = Dataset.from_list(dpo_data)
  dpo_dataset.push_to_hub("zetianli/CS329H_Project_dpo")


def format_messages(row):
  system_prompt = (
        "You are a restaurant recommendation expert. Rank restaurants based on user preferences.\n"
        "If NameA ranked 1, NameB ranked 2, NameC ranked 3, NameD ranked 4, and NameE ranked 5, you should ONLY output the a json object as the following:\n"
        "{'1': NameA, '2': NameB, '3': NameC, '4': NameD, '5': NameE}\n\n"
        "Please DO NOT include other text!"
  )

  restaurant_candidates =""
  for i in range(len(row['business_profiles'])):
    restaurant_candidates += f"{i+1}. {row["rejected"]['business_name'][i]}: {row['business_profiles'][i]}\n\n"

  user_prompt = (
      "User Profile:\n"
      f"{row['user_profile']}\n\n"
      f"Restaurant Candidates:\n"
      f"{restaurant_candidates}\n"
  )

  messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_prompt},
  ]

  return messages



def data_formulate_to_HF(tokenizer):
  """format the raw dpo data, then push to HF"""
  dataset = load_dataset("zetianli/CS329H_Project_dpo",split="train")
  formatted_data = []
  for row in dataset:
    prompt = tokenizer.apply_chat_template(
        format_messages(row),
        tokenize=False,
        add_generation_prompt=True
    )

    chosen = json.dumps({i+1: row['chosen']['business_name'][i] for i in range(len(row['chosen']['business_name']))})
    rejected = json.dumps({i+1: row['rejected']['business_name'][i] for i in range(len(row['rejected']['business_name']))})
    stars_mapping = row['rating_map']
    ground_truth_rating_rank = row['chosen']['rating']

    formatted_sample = {
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected,
        "stars_mapping": stars_mapping,
        "ground_truth_rating_rank": ground_truth_rating_rank
    }
    formatted_data.append(formatted_sample)

  hf_dataset = Dataset.from_list(formatted_data)
  # push
  hf_dataset.push_to_hub("zetianli/CS329H_Project_dpo_formatted")

def print_trainable_parameters(model):
    """Print the number of trainable parameters."""
    trainable_params = 0
    all_param = 0
    for name, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(f"\nüöÄ Trainable parameters: {trainable_params:,}")
    print(f"üì¶ Total parameters:     {all_param:,}")
    print(f"üìà Percentage:           {100 * trainable_params / all_param:.4f}%\n")

In [5]:
# Model configuration
MODEL_NAME = "Qwen/Qwen3-0.6B" #"LiquidAI/LFM2-350M"
HF_TOKEN="" # put your token
# Data configuration
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Training configuration
LEARNING_RATE = 1e-5
BATCH_SIZE = 4  # Reduce to 2 or 1 if you run out of memory
EVAL_BATCH_SIZE = 4
NUM_EPOCHS = 4
GRADIENT_ACCUMULATION_STEPS = 1
WARMUP_STEPS = 0
MAX_LENGTH = 2048

# LoRA configuration
LORA_R = 64
LORA_ALPHA = 128
LORA_DROPOUT = 0.05

# Quantization
USE_4BIT = True  # Set to False if you have enough GPU memory

# Logging
LOGGING_STEPS = 4
EVAL_STEPS = 500
SAVE_STEPS = 250
SAVE_TOTAL_LIMIT = 2

# Weights & Biases (optional)
USE_WANDB = True  # Set to True if you want to use W&B
WANDB_PROJECT = "LLM4Rec-DPO"
WANDB_RUN_NAME = f"dpo-{MODEL_NAME.split('/')[-1]}-lr{LEARNING_RATE}-bs{BATCH_SIZE}"
WANDB_API_KEY = "b53a8b344440d37f80e675cd93858227bab887a7"  # Set your W&B API key if using: "xxxxx"

# Target modules for LoRA
TARGET_MODULES = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
    "fc1", "fc2",
    "w1", "w2", "w3",
]

print("Configuration set!")
print(f"Model: {MODEL_NAME}")
print(f"Epochs: {NUM_EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Using 4-bit quantization: {USE_4BIT}")

Configuration set!
Model: Qwen/Qwen3-0.6B
Epochs: 4
Batch size: 4
Learning rate: 1e-05
Using 4-bit quantization: True


In [6]:
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
tokenizer.pad_token = tokenizer.eos_token

print("Loading model...")
if USE_4BIT:
    print("Using 4-bit quantization...")
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type='nf4'
    )
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        token=HF_TOKEN,
        quantization_config=quantization_config
    )
else:
    print("Loading full precision model...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        token=HF_TOKEN,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

print("Model loaded successfully!")

Loading tokenizer...


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Loading model...
Using 4-bit quantization...


config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Model loaded successfully!


##### 2. Get Training and Test Data

In [7]:
# Set your paths
BASE_PATH = "/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/data"
OUTPUT_DIR = "/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/data"


# load train and test
test_data_size = 200
train_data_size = 5000
train_dataset = load_dataset("zetianli/CS329H_DPO_FullList_train", split="train")
test_dataset = load_dataset("zetianli/CS329H_DPO_FullList_test", split="train")

# create train and test set
"""
process_to_raw_dpo_pushHF() # process from raw yelp data into raw dpo
data = load_dataset("zetianli/CS329H_Project_dpo_formatted",split="train")
data = data.shuffle(seed=42)
train_dataset = data.select(range(train_data_size))
test_dataset = data.select(range(train_data_size, train_data_size + test_data_size))

train_dataset.push_to_hub("zetianli/CS329H_DPO_FullList_train")
test_dataset.push_to_hub("zetianli/CS329H_DPO_FullList_test")
"""

README.md:   0%|          | 0.00/467 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5000 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/442 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/569k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200 [00:00<?, ? examples/s]

'\nprocess_to_raw_dpo_pushHF() # process from raw yelp data into raw dpo\ndata = load_dataset("zetianli/CS329H_Project_dpo_formatted",split="train")\ndata = data.shuffle(seed=42)\ntrain_dataset = data.select(range(train_data_size))\ntest_dataset = data.select(range(train_data_size, train_data_size + test_data_size))\n\ntrain_dataset.push_to_hub("zetianli/CS329H_DPO_FullList_train")\ntest_dataset.push_to_hub("zetianli/CS329H_DPO_FullList_test")\n'

##### 3- initialize DPO training

In [8]:
# Setup Wandb if requested
if USE_WANDB:
    if WANDB_API_KEY:
        os.environ["WANDB_API_KEY"] = WANDB_API_KEY
        wandb.login(key=WANDB_API_KEY)
        print(f"Weights & Biases project: {WANDB_PROJECT}")
        print(f"Run name: {WANDB_RUN_NAME}")
    else:
        print("Warning: Wandb requested but no API key provided. Disabling wandb.")
        USE_WANDB = False
else:
    print("Weights & Biases logging disabled")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33myanzhen4_stanford[0m ([33myanzhen4_stanford-stanford-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Weights & Biases project: LLM4Rec-DPO
Run name: dpo-Qwen3-0.6B-lr1e-05-bs4


In [9]:
def save_adapter(
    base_output_dir,
    model_name,
    train_data_size,
    train_gap,
    num_epochs,
    batch_size,
    learning_rate
):
    """Save LoRA adapter with descriptive folder name."""

    # Create folder name with key hyperparameters
    model_short_name = model_name.split("/")[-1]
    folder_name = f"{model_short_name}_gap{train_gap}_n{train_data_size}_ep{num_epochs}_bs{batch_size}_lr{learning_rate}"
    output_dir = os.path.join(base_output_dir, folder_name)

    # Save adapter
    os.makedirs(output_dir, exist_ok=True)

    print(f"Adapter saved to: {output_dir}")

    return output_dir


# Usage
base_output_dir = "/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/model"

model_output_dir = save_adapter(
    base_output_dir=base_output_dir,
    model_name=MODEL_NAME,
    train_data_size=train_data_size,
    train_gap=2,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE
)

Adapter saved to: /content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/model/Qwen3-0.6B_gap2_n5000_ep4_bs4_lr1e-05


In [10]:
base_output_dir = "/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/model"

In [11]:
print("Setting up training configuration...")

training_args = DPOConfig(
    output_dir=model_output_dir,
    logging_steps=LOGGING_STEPS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    save_only_model=True,
    learning_rate=LEARNING_RATE,
    num_train_epochs=NUM_EPOCHS,
    warmup_steps=WARMUP_STEPS,
    eval_strategy="steps" if test_data_size > 0 else "no",
    eval_steps=EVAL_STEPS if test_data_size > 0 else None,
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_TOTAL_LIMIT,
    report_to="wandb" if USE_WANDB else "none",
    run_name=WANDB_RUN_NAME if USE_WANDB else None,
    remove_unused_columns=False,
    max_length=MAX_LENGTH,
)

if USE_WANDB:
    wandb.init(
        project=WANDB_PROJECT,
        name=WANDB_RUN_NAME,
        config={
            "model": MODEL_NAME,
            "learning_rate": LEARNING_RATE,
            "batch_size": BATCH_SIZE,
            "epochs": NUM_EPOCHS,
            "lora_r": LORA_R,
            "lora_alpha": LORA_ALPHA,
            "lora_dropout": LORA_DROPOUT,
            "use_4bit": USE_4BIT,
        }
    )

peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
)

print("Configuration complete!")

print("Initializing DPO trainer...")
dpo_trainer = DPOTrainer(
    model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset if test_data_size > 0 else None,
    processing_class=tokenizer,
    peft_config=peft_config,
)

print_trainable_parameters(dpo_trainer.model)
print("Trainer initialized!")

Setting up training configuration...


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Configuration complete!
Initializing DPO trainer...


Extracting prompt in train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/200 [00:00<?, ? examples/s]


üöÄ Trainable parameters: 40,370,176
üì¶ Total parameters:     416,219,136
üìà Percentage:           9.6993%

Trainer initialized!


##### 4- train the model

In [12]:
"""
print("="*80)
print("Starting DPO Training...")
print("="*80)

# Train the model
dpo_trainer.train()

print("\n" + "="*80)
print("Training Complete!")
print("="*80)
"""

'\nprint("="*80)\nprint("Starting DPO Training...")\nprint("="*80)\n\n# Train the model\ndpo_trainer.train()\n\nprint("\n" + "="*80)\nprint("Training Complete!")\nprint("="*80)\n'

##### 5- save the model

In [13]:
# save locally
final_model_path = '/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/model/Qwen3-0.6B_gap1_n5000_ep4_bs4_lr1e-05fs'
print(f"Saving final model to {final_model_path}...")
dpo_trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print("\nTraining complete! Model saved successfully.")
print(f"Model location: {final_model_path}")

Saving final model to /content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/model/Qwen3-0.6B_gap1_n5000_ep4_bs4_lr1e-05fs...

Training complete! Model saved successfully.
Model location: /content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/model/Qwen3-0.6B_gap1_n5000_ep4_bs4_lr1e-05fs


### Test the Model via Inference

In [14]:
import numpy as np
import re
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import torch
import os

# -------------------------------------------------------
# Load Model for Inference
# -------------------------------------------------------
def load_model_for_inference(base_model_name, adapter_path=None):
    print(f"Loading tokenizer from {base_model_name}...")
    inf_tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    inf_tokenizer.pad_token = inf_tokenizer.eos_token

    print(f"Loading base model from {base_model_name}...")
    inf_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    if adapter_path:
        print(f"Loading LoRA adapter from {adapter_path}...")
        inf_model = PeftModel.from_pretrained(inf_model, adapter_path)
        inf_model = inf_model.merge_and_unload()

    return inf_model, inf_tokenizer

def remove_think_blocks(text):
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
# -------------------------------------------------------
# Generation: model must output ranked list only
# -------------------------------------------------------
def generate_ranked_json(model, tokenizer, prompt, max_new_tokens=256):
    """
    FIXED: Added stop tokens and better extraction logic.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # FIXED: Add stop tokens to prevent rambling
    stop_tokens = ["<|im_end|>", "\n\n\n"]
    stop_token_ids = [tokenizer.encode(t, add_special_tokens=False) for t in stop_tokens]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            temperature=0.0,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    try:
      decoded = tokenizer.decode(outputs[0], skip_special_tokens=False)[len(inputs):]
      decoded = remove_think_blocks(decoded)
     # print(decoded)

      # FIXED: Better extraction - get only the assistant's response
      if "<|im_start|>assistant" in decoded:
          response = decoded.split("<|im_start|>assistant")[-1]
          # Remove end token if present
          if "<|im_end|>" in response:
              response = response.split("<|im_end|>")[0]
          decoded = response.strip()
      if decoded[-1] != "}":
        decoded = decoded + "}"
      json_obj = re.search(r"\{.*?\}", decoded, flags=re.DOTALL).group(0)


      return json_obj
    except:
      return ""



In [17]:
MODEL_NAME = "Qwen/Qwen3-0.6B"
adapter_path = '/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/model/Qwen3-0.6B_gap1_n5000_ep4_bs4_lr1e-05fs'
print("Loading base model...")
base_model, base_tokenizer = load_model_for_inference(MODEL_NAME, adapter_path=None)

print("\nLoading trained model...")
trained_model, trained_tokenizer = load_model_for_inference(MODEL_NAME, adapter_path=adapter_path)

Loading base model...
Loading tokenizer from Qwen/Qwen3-0.6B...
Loading base model from Qwen/Qwen3-0.6B...


`torch_dtype` is deprecated! Use `dtype` instead!



Loading trained model...
Loading tokenizer from Qwen/Qwen3-0.6B...
Loading base model from Qwen/Qwen3-0.6B...
Loading LoRA adapter from /content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/model/Qwen3-0.6B_gap1_n5000_ep4_bs4_lr1e-05fs...


In [18]:
# inference
"""
trained_response = []
base_response = []
for sample in tqdm(test_dataset):
  prompt = sample["prompt"].replace("Rank these restaurants from best to worst match for this user.", "Rank these restaurants from best to worst match for this user. /no_think") # turn off thinking to speed up
  trained_response.append(generate_ranked_json(trained_model, trained_tokenizer, prompt))
  base_response.append(generate_ranked_json(base_model, base_tokenizer, prompt))
"""

'\ntrained_response = []\nbase_response = []\nfor sample in tqdm(test_dataset):\n  prompt = sample["prompt"].replace("Rank these restaurants from best to worst match for this user.", "Rank these restaurants from best to worst match for this user. /no_think") # turn off thinking to speed up\n  trained_response.append(generate_ranked_json(trained_model, trained_tokenizer, prompt))\n  base_response.append(generate_ranked_json(base_model, base_tokenizer, prompt))\n'

In [19]:
# get test output dataset
#test_dataset = test_dataset.remove_columns("base_response")
#test_dataset = test_dataset.remove_columns("trained_response")
#test_dataset = test_dataset.add_column("trained_response", trained_response)
#test_dataset = test_dataset.add_column("base_response", base_response)

In [20]:
test_dataset = load_dataset("zetianli/CS329H_DPO_Qwen_test_output", split='train')

README.md:   0%|          | 0.00/529 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/605k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200 [00:00<?, ? examples/s]

# Evaluation

In [21]:
import re
import json
import ast
from datasets import load_dataset

In [22]:
def get_order(scores):
    scores = np.array(scores)
    # negative because argsort = ascending
    return list(np.argsort(-scores))


def map_at_k(gold_scores, pred_scores, k=None, rel_threshold=4):
    if k is None:
        k = len(gold_scores)

    pred_order = get_order(pred_scores)
    relevant = [i for i, s in enumerate(gold_scores) if s >= rel_threshold]

    hits = 0
    precisions = []

    for rank, i in enumerate(pred_order[:k], start=1):
        if i in relevant:
            hits += 1
            precisions.append(hits / rank)

    if not relevant:
        return 0.0
    #print(f"map: {sum(precisions)}/{len(relevant)}= {sum(precisions) / len(relevant)}")
    return sum(precisions) / len(relevant)


import math

def ndcg_at_k(gold_scores, pred_scores, k=None):
    if k is None:
        k = len(gold_scores)

    # reorder predicted scores by predicted ranking
    pred_order = get_order(pred_scores)
    pred_rels = [gold_scores[i] for i in pred_order[:k]]  # use gold relevance

    def dcg(rels):
        return sum(
            (2**rel - 1) / math.log2(i + 2)        # i starts at 0 ‚Üí rank i+1
            for i, rel in enumerate(rels)
        )

    dcg_pred = dcg(pred_rels)

    # ideal DCG
    ideal_rels = sorted(gold_scores, reverse=True)[:k]
    idcg = dcg(ideal_rels)

    #print(f"ndcg: {dcg_pred}/{idcg} = {dcg_pred / idcg if idcg > 0 else 0.0}")
    return dcg_pred / idcg if idcg > 0 else 0.0


def pairwise_accuracy(gold_scores, pred_scores):
    n = len(gold_scores)
    pred_order = get_order(pred_scores)
    pos = {idx: rank for rank, idx in enumerate(pred_order)}  # position map

    correct = 0
    total = 0

    for i in range(n):
        for j in range(i + 1, n):
            if gold_scores[i] == gold_scores[j]:
                continue  # skip ties

            gold_pref = 1 if gold_scores[i] > gold_scores[j] else -1
            pred_pref = 1 if pos[i] < pos[j] else -1

            total += 1
            if gold_pref == pred_pref:
                correct += 1
    #print(f"pair: {correct}/{total} = {correct / total if total > 0 else 0.0}")
    return correct / total if total > 0 else 0.0


import torch
import math
def perplexity(text, model, tokenizer):
  model.eval()
  enc = tokenizer(text, return_tensors="pt")
  input_ids = enc.input_ids.to(model.device)
  with torch.no_grad():
      outputs = model(input_ids, labels=input_ids)
      loss = outputs.loss
  return math.exp(loss.item())

In [23]:
import json, ast, re

def safe_parse_llm_dict(text):
    # Step 1: extract the {...} block
    m = re.search(r"\{[\s\S]*\}", text)
    if not m:
        raise ValueError("No JSON-like object found.")
    block = m.group(0)

    # Step 2: try strict JSON first
    try:
        return json.loads(block)
    except:
        pass

    # Step 3: fix unescaped single quotes ‚Üí convert to JSON
    # replace single quotes around keys/values with double quotes
    fixed = re.sub(r"'([^']*)'\s*:", r'"\1":', block)    # keys
    fixed = re.sub(r":\s*'([^']*)'", r': "\1"', fixed)   # values

    # Step 4: try JSON again
    try:
        return json.loads(fixed)
    except:
        pass

    # Step 5: last resort: try literal_eval AFTER escaping inner quotes
    try:
        safe = block.replace("\\'", "'")  # remove useless escapes
        return ast.literal_eval(safe)
    except:
        pass

    raise ValueError("Could not parse LLM dict.")



In [24]:
def super_parse(text):
    """
    Extract and parse LLM dicts containing messy quotes, double escapes, etc.
    Returns: a clean Python dict.
    """

    if isinstance(text, dict):
        return text

    # 1. Extract first {...} block
    m = re.search(r"\{[\s\S]*\}", text)
    if not m:
        raise ValueError("No dict-like object found.")
    block = m.group(0)

    # 2. Fix double escapes (\\' ‚Üí ')
    block = block.replace("\\\\", "\\")   # reduce double escaping
    block = block.replace("\\'", "'")     # fix '\'' inside values

    # 3. Try literal_eval directly
    try:
        return ast.literal_eval(block)
    except:
        pass

    # 4. Convert Python dict ‚Üí JSON (replace single quotes with double quotes)
    block_json = re.sub(r"'([^']*)'\s*:", r'"\1":', block)    # keys
    block_json = re.sub(r":\s*'([^']*)'", r': "\1"', block_json)  # values

    # 5. Remove remaining stray escape characters
    block_json = block_json.replace("\\'", "'")

    # 6. Try JSON parse
    try:
        return json.loads(block_json)
    except:
        pass

    # 7. Final fallback ‚Äî brute-force cleaning
    cleaned = block.replace("'", '"')
    try:
        return json.loads(cleaned)
    except:
        pass

    raise ValueError("Could not parse dict after all cleaning steps.")

In [25]:
import numpy as np
def compute_scores(test_dataset, model_base, tokenizer_base, model_trained, tokenizer_trained, total_candidates = 5, if_local_model=True):
  total_points = len(test_dataset)

  #Pair Wise
  trained_pairwise = 0
  base_pairwise = 0

  # MAP
  trained_mcp = 0
  base_mcp = 0

  # NDCG
  trained_ndcg = 0
  base_ndcg = 0


  # perplexity
  trained_perplexity_chosen = 0
  base_perplexity_chosen = 0
  trained_perplexity_rejected = 0
  base_perplexity_rejected = 0

  for sample in test_dataset:
    gound_truth_scores = np.array(sample["ground_truth_rating_rank"])
    trained_response = {}
    base_response = {}
    score_maper = json.loads(sample['stars_mappoing'])
    if if_local_model:
      trained_perplexity_chosen += perplexity(sample['chosen'], model_trained, tokenizer_trained)
      trained_perplexity_rejected += perplexity(sample['rejected'], model_trained, tokenizer_trained)
      base_perplexity_chosen += perplexity(sample['chosen'], model_base, tokenizer_base)
      base_perplexity_rejected += perplexity(sample['rejected'], model_base, tokenizer_base)
    try:
      trained_response = safe_parse_llm_dict(sample["generated_response"])
      if "ranks" in trained_response:
        trained_response = trained_response["ranks"]
      if "rankings" in trained_response:
        trained_response = trained_response["rankings"]
      trained_response = {int(k): v for k, v in trained_response.items()}
    except Exception as e:
      trained_response = {}


    try:
      base_response = safe_parse_llm_dict(sample["base_response"])
      if "ranks" in trained_response:
        base_response = base_response["ranks"]
      if "rankings" in base_response:
        base_response = base_response["rankings"]
      base_response = {int(k): v for k, v in base_response.items()}
    except:
      base_response = {}

    mapped_trained_scores = []
    mapped_base_scores = []
    for i in range(1, total_candidates + 1):
      train_response_candidate = trained_response.get(i, "")
      mapped_score = score_maper.get(train_response_candidate, 0)
      mapped_trained_scores.append(mapped_score)

      base_response_candidate = base_response.get(i, "")
      mapped_score = score_maper.get(base_response_candidate, 0)
      mapped_base_scores.append(mapped_score)

    mapped_trained_scores = np.array(mapped_trained_scores)
    mapped_base_scores = np.array(mapped_base_scores)

    #print(mapped_trained_scores, mapped_base_scores)
    # map
    trained_mcp += map_at_k(gound_truth_scores, mapped_trained_scores)
    base_mcp += map_at_k(gound_truth_scores, mapped_base_scores)

    # ndcg
    trained_ndcg += ndcg_at_k(gound_truth_scores, mapped_trained_scores)
    base_ndcg += ndcg_at_k(gound_truth_scores, mapped_base_scores)

    # pairwise
    trained_pairwise += pairwise_accuracy(gound_truth_scores, mapped_trained_scores)
    base_pairwise += pairwise_accuracy(gound_truth_scores, mapped_base_scores)




    #print(f"Trained = {trained_response}")
    #print(f"Base = {base_response}")
    #print(f"Ground Truth = {gound_truth_scores}")


  trained_scores = {
      "map": trained_mcp / total_points,
      "ndcg": float(trained_ndcg / total_points),
      "pairwise": trained_pairwise / total_points,
      "perplexity_chosen": trained_perplexity_chosen / total_points,
      "perplexity_rejected": trained_perplexity_rejected / total_points,
      "perplexity_diff": (trained_perplexity_chosen - trained_perplexity_rejected) / total_points
  }

  base_scores = {
      "map": base_mcp / total_points,
      "ndcg": float(base_ndcg / total_points),
      "pairwise": base_pairwise / total_points,
      "perplexity_chosen": base_perplexity_chosen / total_points,
      "perplexity_rejected": base_perplexity_rejected / total_points,
      "perplexity_diff": (base_perplexity_chosen - base_perplexity_rejected) / total_points
  }

  final_scores = {
      "trained": trained_scores,
      "base": base_scores
  }

  return final_scores


In [26]:
final_scores = compute_scores(test_dataset,
                              model_base = base_model,
                              tokenizer_base = base_tokenizer,
                              model_trained = trained_model,
                              tokenizer_trained = trained_tokenizer)

In [27]:
final_scores

{'trained': {'map': 0.98,
  'ndcg': 1.0,
  'pairwise': 1.0,
  'perplexity_chosen': 37.9281628713691,
  'perplexity_rejected': 38.84784112697867,
  'perplexity_diff': -0.919678255609565},
 'base': {'map': 0.8143680555555557,
  'ndcg': 0.8967812519841963,
  'pairwise': 0.6865773809523807,
  'perplexity_chosen': 37.9281628713691,
  'perplexity_rejected': 38.84784112697867,
  'perplexity_diff': -0.919678255609565}}

In [28]:
# -------------------------------------------------------
# Load models
# -------------------------------------------------------
#adapter_path = model_output_dir
import os
adapter_path = model_output_dir
# -------------------------------------------------------
# Save results
# -------------------------------------------------------
def save_evaluation_results(
    adapter_path, final_scores,
    results_dir="/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/results"
):
    os.makedirs(results_dir, exist_ok=True)
    results_path = os.path.join(results_dir, "results.txt")
    with open(results_path, 'a') as f:
        f.write(f"{adapter_path}\n")
        f.write(f"Base Accuracy: {json.dumps(final_scores['base'], indent=4)}\n")
        f.write(f"Trained Accuracy: {json.dumps(final_scores['trained'], indent=4)}\n")
        f.write("-" * 50 + "\n")

    print(f"Results appended to: {results_path}")


# Usage
save_evaluation_results(
    adapter_path=adapter_path,
    final_scores=final_scores,
)


Results appended to: /content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/results/results.txt


# Run Score for GPT4o-mini


In [29]:
test_dataset = load_dataset("HannahGrj/LLM4Rec_DPO_List_test_with_responses", split="train")

README.md:   0%|          | 0.00/489 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/587k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/200 [00:00<?, ? examples/s]

In [30]:
final_scores = compute_scores(test_dataset,
                              model_base = "",
                              tokenizer_base = "",
                              model_trained = "",
                              tokenizer_trained = "",
                              if_local_model = False)

In [31]:
final_scores

{'trained': {'map': 0.9263263888888887,
  'ndcg': 0.9675447618775059,
  'pairwise': 0.8766170634920634,
  'perplexity_chosen': 0.0,
  'perplexity_rejected': 0.0,
  'perplexity_diff': 0.0},
 'base': {'map': 0.98,
  'ndcg': 1.0,
  'pairwise': 1.0,
  'perplexity_chosen': 0.0,
  'perplexity_rejected': 0.0,
  'perplexity_diff': 0.0}}

In [32]:
# -------------------------------------------------------
# Load models
# -------------------------------------------------------
#adapter_path = model_output_dir
import os
adapter_path = "GPT4o-mini"
# -------------------------------------------------------
# Save results
# -------------------------------------------------------
def save_evaluation_results(
    adapter_path, final_scores,
    results_dir="/content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/results"
):
    os.makedirs(results_dir, exist_ok=True)
    results_path = os.path.join(results_dir, "results.txt")
    with open(results_path, 'a') as f:
        f.write(f"{adapter_path}\n")
        f.write(f"Base Accuracy: {json.dumps(final_scores['base'], indent=4)}\n")
        f.write(f"Trained Accuracy: {json.dumps(final_scores['trained'], indent=4)}\n")
        f.write("-" * 50 + "\n")

    print(f"Results appended to: {results_path}")


# Usage
save_evaluation_results(
    adapter_path=adapter_path,
    final_scores=final_scores,
)

Results appended to: /content/drive/MyDrive/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_fulllist/results/results.txt


## 11. Push Model to Hugging Face

Upload the trained model adapter and optionally the merged model to Hugging Face Hub.

In [None]:
from huggingface_hub import upload_folder, create_repo

HF_USERNAME = "HannahGrj"
REPO_NAME = "dpo-qwen-fullList-gap2-n5000"

HF_TOKEN = ""
repo_id = f"{HF_USERNAME}/{REPO_NAME}"

LOCAL_MODEL_PATH = "/Users/guanruijia/Desktop/Stanford/CS329H/CS329H_DiningbyDesign/LLM4Rec/justin_Qwen_Fulllist/model/Qwen3-0.6B_gap1_n5000_ep4_bs4_lr1e-05fs"

try:
    create_repo(
        repo_id=repo_id,
        token=HF_TOKEN,
        repo_type="model",
        private=False,
        exist_ok=True
    )
    print(f"‚úÖ Repository created: https://huggingface.co/{repo_id}")
except Exception as e:
    print(f"‚ÑπÔ∏è  Repository may already exist or error: {e}")


upload_folder(
    folder_path=LOCAL_MODEL_PATH,
    repo_id=repo_id,
    token=HF_TOKEN,
    commit_message=f"Upload DPO adapter (gap2, n=5000)"
)

print(f"‚úÖ Model uploaded to: https://huggingface.co/{repo_id}")

‚úÖ Repository created: https://huggingface.co/HannahGrj/dpo-qwen-fullList-gap2-n5000


Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

‚úÖ Model uploaded to: https://huggingface.co/HannahGrj/dpo-qwen-fullList-gap2-n5000
