# 🧠 RLHF Pipeline: Trajectory Generation, Labeling, and Training

In [1]:
# ✅ Step 1: Set up imports and environment
import os
import sys
sys.path.append(os.path.abspath("."))

from training.trainer import RLHFTrainer

trainer = RLHFTrainer()

pybullet build time: Apr 13 2025 15:37:52


Version = 4.1 Metal - 89.3
Vendor = Apple
Renderer = Apple M1
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started


2025-05-05 14:33:48.724 python[12561:9914370] +[IMKClient subclass]: chose IMKClient_Modern
2025-05-05 14:33:48.724 python[12561:9914370] +[IMKInputSession subclass]: chose IMKInputSession_Modern


In [18]:
# ✅ Step 2: Generate initial trajectories and save them as clips
import numpy as np
import cv2
import pybullet as p

def save_trajectory_video_and_data(env, trajectory, filename_prefix):
    width, height = 320, 240
    video_path = f"data/clips/{filename_prefix}.mp4"
    data_path = f"data/trajectories/{filename_prefix}.npz"

    # Setup PyBullet camera
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(video_path, fourcc, 30.0, (width, height))

    env.reset()
    for i in range(len(trajectory['actions'])):
        print(i)
        p.stepSimulation()
        view_matrix = p.computeViewMatrixFromYawPitchRoll(cameraTargetPosition=[0.5, 0, 0.5],
                                                          distance=1.0,
                                                          yaw=50,
                                                          pitch=-35,
                                                          roll=0,
                                                          upAxisIndex=2)
        proj_matrix = p.computeProjectionMatrixFOV(fov=60,
                                                   aspect=width/height,
                                                   nearVal=0.1,
                                                   farVal=100.0)
        (_, _, px, _, _) = p.getCameraImage(width=width,
                                            height=height,
                                            viewMatrix=view_matrix,
                                            projectionMatrix=proj_matrix,
                                            renderer=p.ER_BULLET_HARDWARE_OPENGL)
        frame = np.reshape(px, (height, width, 4))[:, :, :3].astype(np.uint8)
        frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
        video_writer.write(frame)
    video_writer.release()

    np.savez(data_path, obs=trajectory['observations'], act=trajectory['actions'])

os.makedirs("data/clips", exist_ok=True)
os.makedirs("data/trajectories", exist_ok=True)

for i in range(2):
    print(i)
    traj = trainer.ppo_trainer.collect_trajectory()
    save_trajectory_video_and_data(trainer.env, traj, f"left_clip_{i:03d}" if i % 2 == 0 else f"right_clip_{i:03d}")

0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
1
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99


In [11]:
# ✅ Step 3: Launch labeling UI (Streamlit)
import subprocess
subprocess.run(["streamlit", "run", "scripts/visualize_trajectories.py"])


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8503
  Network URL: http://10.48.21.228:8503

  For better performance, install the Watchdog module:

  $ xcode-select --install
  $ pip install watchdog
            
  Stopping...


KeyboardInterrupt: 

In [19]:
# ✅ Step 4: Convert labeled preferences to training data
!python training/human_feedback_collector.py

Saved 1 trajectory preferences to data/reward_training_data.pt


In [20]:
# ✅ Step 5: Train reward model and PPO using collected preferences
!python training/trainer.py

pybullet build time: Apr 13 2025 15:37:52
Version = 4.1 Metal - 89.3
Vendor = Apple
Renderer = Apple M1
b3Printf: Selected demo: Physics Server
startThreads creating 1 threads.
starting thread 0
started thread 0 
MotionThreadFunc thread started
2025-05-05 14:59:03.544 python[13543:9935772] +[IMKClient subclass]: chose IMKClient_Modern
2025-05-05 14:59:03.544 python[13543:9935772] +[IMKInputSession subclass]: chose IMKInputSession_Modern

Iteration 1
Collecting human preferences...
2025-05-05 14:59:08.772 
  command:

    streamlit run training/trainer.py [ARGUMENTS]
2025-05-05 14:59:08.773 Session state does not function when running a script without `streamlit run`
Policy Loss: -0.0432, Value Loss: 474.2726, Entropy: 1.4197

Iteration 2
Policy Loss: -0.0324, Value Loss: 427.0591, Entropy: 1.4202

Iteration 3
Policy Loss: -0.0300, Value Loss: 374.4110, Entropy: 1.4208

Iteration 4
Policy Loss: -0.0306, Value Loss: 464.8276, Entropy: 1.4217

Iteration 5
Policy Loss: -0.0242, Value Loss: