In [1]:
# Step 1: Imports and environment setup
import os
import subprocess
from training.trainer import RLHFTrainer
from training.human_feedback_collector import convert_preferences_to_dataset
from inference.render_inference import render_trajectory_video


# Shared trainer object; the code below uses its `ppo_trainer`, `env`,
# `train_reward_model()` and `train_step()` members.
trainer = RLHFTrainer(save_dir="saved_models")

# Create the data folders up front (no error if they already exist).
for _output_dir in ("data/clips", "data/trajectories"):
    os.makedirs(_output_dir, exist_ok=True)

NUM_ITERATIONS = 3  # number of passes through the main loop
PREFERENCES_PER_ITER = 1  # number of preferences to collect before training

def convert_to_h264(video_path):
    """Re-encode a video to H.264 in place using ffmpeg.

    The video is transcoded into a sibling temporary file
    (``<name>_h264<ext>``) which then replaces ``video_path``. The original
    file is only overwritten after ffmpeg has exited successfully, so a failed
    conversion never destroys the source clip.

    Args:
        video_path: Path of the video file to re-encode.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        OSError: If ffmpeg cannot be started (e.g. it is not installed) or the
            final file replacement fails.
    """
    # Derive the temp name from the extension only, so ".mp4" occurring
    # elsewhere in the path is left alone and output never equals input.
    root, ext = os.path.splitext(video_path)
    h264_path = f"{root}_h264{ext or '.mp4'}"

    # Argument list (no shell): paths with spaces or shell metacharacters
    # are passed to ffmpeg verbatim.
    command = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vcodec", "libx264",
        "-crf", "23",
        h264_path,
    ]
    try:
        subprocess.run(command, check=True)
    except (OSError, subprocess.CalledProcessError):
        # Drop any partial output; the original file stays untouched.
        if os.path.exists(h264_path):
            os.remove(h264_path)
        raise

    # Single-step swap of the re-encoded file over the original.
    os.replace(h264_path, video_path)


def generate_and_save_trajectories(iteration):
    """Collect two trajectories and render each one to a video clip.

    Both trajectories are collected from ``trainer.ppo_trainer`` first, then
    each is passed to ``render_trajectory_video`` together with
    ``trainer.env`` and a clip prefix derived from ``iteration``.

    Args:
        iteration: Integer iteration index; zero-padded to three digits in
            the clip prefixes.

    Returns:
        Tuple ``(left_prefix, right_prefix)`` of the prefixes handed to the
        renderer, e.g. ``("left_clip_000", "right_clip_000")``.
    """
    # Collect both rollouts before any rendering takes place.
    rollouts = [trainer.ppo_trainer.collect_trajectory() for _ in range(2)]

    clip_prefixes = tuple(
        f"{side}_clip_{iteration:03d}" for side in ("left", "right")
    )

    for rollout, clip_prefix in zip(rollouts, clip_prefixes):
        render_trajectory_video(rollout, trainer.env, clip_prefix)

    return clip_prefixes

# Start the Streamlit labelling UI once, in the background. The process
# handle is kept so the server can be shut down when the pipeline finishes;
# otherwise every run of this cell leaves another Streamlit server running.
print("🚀 Launching Streamlit UI for preference collection...")
streamlit_proc = subprocess.Popen(["streamlit", "run", "scripts/visualize_trajectories.py"])

# Step 2: Main Preference Training Loop
try:
    for iteration in range(NUM_ITERATIONS):
        print(f"\n🔁 Iteration {iteration + 1}/{NUM_ITERATIONS}")

        # 1. Generate two trajectories with the current PPO policy
        prefix_a, prefix_b = generate_and_save_trajectories(iteration)

        # 2. Block until the user has labelled the new clips in the
        #    already-running Streamlit app
        input("📝 Label your preferences in the Streamlit app. Press Enter here when done...")

        # 3. Convert preference logs to training dataset
        print("📦 Converting preferences to dataset...")
        convert_preferences_to_dataset()

        # 4. Train reward model using new preferences
        print("🎯 Training reward model...")
        trainer.train_reward_model()

        # 5. Train PPO with updated reward predictor
        print("🤖 Training PPO using updated reward function...")
        trainer.train_step()
finally:
    # Stop the UI server whether the loop finished, failed or was
    # interrupted: ask politely first, force-kill if it does not exit.
    if streamlit_proc.poll() is None:
        streamlit_proc.terminate()
        try:
            streamlit_proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            streamlit_proc.kill()
            streamlit_proc.wait()

print("\n✅ Done. Preference-based RL pipeline completed.")

pybullet build time: Sep  3 2024 12:54:04


Version = 4.1 ATI-6.1.13
Vendor = ATI Technologies Inc.
Renderer = AMD Radeon Pro 5300M OpenGL Engine
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started


2025-05-09 08:52:19.661 python[95118:7553806] +[IMKClient subclass]: chose IMKClient_Modern
2025-05-09 08:52:19.661 python[95118:7553806] +[IMKInputSession subclass]: chose IMKInputSession_Modern


🚀 Launching Streamlit UI for preference collection...

🔁 Iteration 1/3

  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8505
  Network URL: http://192.168.1.113:8505

  For better performance, install the Watchdog module:

  $ xcode-select --install
  $ pip install watchdog
            


2025-05-09 08:52:21.626 python[95118:7553806] _TIPropertyValueIsValid called with 4 on nil context!
2025-05-09 08:52:21.626 python[95118:7553806] imkxpc_getApplicationProperty:reply: called with incorrect property value 4, bailing.
2025-05-09 08:52:21.626 python[95118:7553806] Text input context does not respond to _valueForTIProperty:
2025-05-09 08:52:21.626 python[95118:7553806] _TIPropertyValueIsValid called with 4 on nil context!
2025-05-09 08:52:21.626 python[95118:7553806] imkxpc_getApplicationProperty:reply: called with incorrect property value 4, bailing.
2025-05-09 08:52:21.626 python[95118:7553806] Text input context does not respond to _valueForTIProperty:
ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/7.1.1_2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gn

🎥 Saved: data/clips/left_clip_000.mp4


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/7.1.1_2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --

🎥 Saved: data/clips/right_clip_000.mp4
📦 Converting preferences to dataset...
Saved 4 pairs to data/reward_training_data.pt
🎯 Training reward model...
No preferences to train on!
🤖 Training PPO using updated reward function...

🔁 Iteration 2/3


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/7.1.1_2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --

🎥 Saved: data/clips/left_clip_001.mp4


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/7.1.1_2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --

🎥 Saved: data/clips/right_clip_001.mp4
📦 Converting preferences to dataset...
Saved 4 pairs to data/reward_training_data.pt
🎯 Training reward model...
No preferences to train on!
🤖 Training PPO using updated reward function...

🔁 Iteration 3/3


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/7.1.1_2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --

🎥 Saved: data/clips/left_clip_002.mp4


ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
  built with Apple clang version 16.0.0 (clang-1600.0.26.6)
  configuration: --prefix=/usr/local/Cellar/ffmpeg/7.1.1_2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --

🎥 Saved: data/clips/right_clip_002.mp4
📦 Converting preferences to dataset...
Saved 4 pairs to data/reward_training_data.pt
🎯 Training reward model...
No preferences to train on!
🤖 Training PPO using updated reward function...

✅ Done. Preference-based RL pipeline completed.
