In [None]:
# Step 1: Imports and environment setup
import os
import subprocess
from training.trainer import RLHFTrainer
from training.human_feedback_collector import convert_preferences_to_dataset
from inference.render_inference import render_trajectory_video


# Shared trainer instance used by every step of this cell.
trainer = RLHFTrainer(save_dir="saved_models")

# Make sure the working directories exist before anything is written.
for data_dir in ("data/clips", "data/trajectories"):
    os.makedirs(data_dir, exist_ok=True)

# Number of collect -> label -> train rounds performed by the main loop.
NUM_ITERATIONS = 3
# Number of preferences to collect before training.
# NOTE(review): not referenced anywhere else in this cell - confirm it is
# consumed elsewhere or remove it.
PREFERENCES_PER_ITER = 1

def convert_to_h264(video_path):
    """Re-encode ``video_path`` to H.264 in place using ffmpeg.

    The video is transcoded (libx264, CRF 23) into a sibling temporary file
    named ``<stem>_h264.mp4``; only after ffmpeg reports success is that file
    moved over ``video_path``. If ffmpeg is missing or fails, the original
    video is left untouched.

    Args:
        video_path: Path of the video file to re-encode.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    # splitext only touches the final extension, unlike str.replace which
    # would rewrite every ".mp4" occurring anywhere in the path.
    stem, _ext = os.path.splitext(video_path)
    h264_path = f"{stem}_h264.mp4"

    # Argument list (no shell) so paths containing spaces or shell
    # metacharacters are passed to ffmpeg verbatim.
    command = [
        "ffmpeg", "-y",
        "-i", video_path,
        "-vcodec", "libx264",
        "-crf", "23",
        h264_path,
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError:
        # Drop any partial output so a failed run leaves no stray files.
        if os.path.exists(h264_path):
            os.remove(h264_path)
        raise

    # Swap in the transcoded file only now that it is known to exist.
    os.replace(h264_path, video_path)


def generate_and_save_trajectories(iteration):
    """Collect two trajectories and render each one as a video clip.

    Both trajectories are collected from ``trainer.ppo_trainer`` before any
    rendering starts. Each is then passed to ``render_trajectory_video``
    together with ``trainer.env`` and a clip prefix of the form
    ``left_clip_NNN`` / ``right_clip_NNN``, where ``NNN`` is ``iteration``
    zero-padded to three digits.

    Args:
        iteration: Integer index of the current loop iteration.

    Returns:
        A ``(left_prefix, right_prefix)`` tuple of the clip prefixes used.
    """
    # NOTE(review): where the rendered files end up is decided inside
    # render_trajectory_video - presumably under data/clips; confirm there.
    rollouts = [trainer.ppo_trainer.collect_trajectory() for _ in range(2)]
    clip_prefixes = tuple(
        f"{side}_clip_{iteration:03d}" for side in ("left", "right")
    )

    for rollout, clip_prefix in zip(rollouts, clip_prefixes):
        render_trajectory_video(rollout, trainer.env, clip_prefix)

    return clip_prefixes

print("🚀 Launching Streamlit UI for preference collection...")
# Keep the process handle: the UI runs in the background for the whole loop
# and must be stopped afterwards, otherwise every run of this cell leaves
# another Streamlit server behind.
streamlit_proc = subprocess.Popen(["streamlit", "run", "scripts/visualize_trajectories.py"])

# Step 2: Main Preference Training Loop
try:
    for iteration in range(NUM_ITERATIONS):
        print(f"\n🔁 Iteration {iteration + 1}/{NUM_ITERATIONS}")

        # 1. Generate two trajectories with the current PPO policy.
        # The returned prefixes are not used below.
        # NOTE(review): presumably the Streamlit app discovers the rendered
        # clips on its own - confirm.
        prefix_a, prefix_b = generate_and_save_trajectories(iteration)

        # 2. Ask the user to label preference (blocks until Enter is pressed)
        input("📝 Label your preferences in the Streamlit app. Press Enter here when done...")

        # 3. Convert preference logs to training dataset
        print("📦 Converting preferences to dataset...")
        convert_preferences_to_dataset()

        # 4. Train reward model using new preferences
        print("🎯 Training reward model...")
        trainer.train_reward_model()

        # 5. Train PPO with updated reward predictor
        print("🤖 Training PPO using updated reward function...")
        trainer.train_step()
finally:
    # Stop the UI whether the loop finished or raised; escalate to kill()
    # if it does not exit promptly after terminate().
    streamlit_proc.terminate()
    try:
        streamlit_proc.wait(timeout=10)
    except subprocess.TimeoutExpired:
        streamlit_proc.kill()
        streamlit_proc.wait()

print("\n✅ Done. Preference-based RL pipeline completed.")

pybullet build time: Apr 13 2025 15:37:52


Version = 4.1 Metal - 89.3
Vendor = Apple


2025-05-09 12:08:09.214 python[44832:10993571] +[IMKClient subclass]: chose IMKClient_Modern
2025-05-09 12:08:09.214 python[44832:10993571] +[IMKInputSession subclass]: chose IMKInputSession_Modern


Renderer = Apple M1
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started
🚀 Launching Streamlit UI for preference collection...

🔁 Iteration 1/3

  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8502
  Network URL: http://192.168.1.194:8502

  For better performance, install the Watchdog module:

  $ xcode-select --install
  $ pip install watchdog
            


sh: ffmpeg: command not found


FileNotFoundError: [Errno 2] No such file or directory: 'data/clips/left_clip_000_h264.mp4' -> 'data/clips/left_clip_000.mp4'

: 