In [5]:
# Install dependencies
# !pip install stable-baselines3 gym==0.26.2 gym-notices pybullet pybullet-gym imageio opencv-python tensorboard

In [6]:
LOG_DIR = f"logs"
MODEL_DIR = f"models"
DEMO_DIR = f"demos"
TB_DIR = f"tensorboard"

TOTAL_TIMESTEPS = 200_000        # 1_000_000
CHECKPOINT_FREQ = 50_000
IMG_W, IMG_H = 84, 84
SEED = 0

In [7]:
# Cell 2 — Imports & folders
import os, time
import gym
import pybullet_envs  # registers pybullet envs
import numpy as np
import cv2
import imageio
import matplotlib.pyplot as plt
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv, VecTransposeImage
from stable_baselines3.common.callbacks import CheckpointCallback

# Create directories
for d in [PROJECT_DIR, LOG_DIR, MODEL_DIR, DEMO_DIR, TB_DIR]:
    os.makedirs(d, exist_ok=True)

print("Folders created:", PROJECT_DIR, LOG_DIR, MODEL_DIR, DEMO_DIR, TB_DIR)

ModuleNotFoundError: No module named 'gym'

In [8]:
# Cell 3 — RGBObservationWrapper
import gym
from gym import spaces

class RGBObservationWrapper(gym.ObservationWrapper):
    """
    Replaces observations with a resized RGB image from env.render(mode='rgb_array').
    Returns shape (H, W, 3) uint8. SB3 VecTransposeImage converts to (C,H,W).
    """
    def __init__(self, env, width=IMG_W, height=IMG_H):
        super().__init__(env)
        self.width = width
        self.height = height
        # observation space: H,W,3 uint8
        self.observation_space = spaces.Box(
            low=0, high=255, shape=(self.height, self.width, 3), dtype=np.uint8
        )

    def observation(self, obs):
        # render returns HxWx3 RGB array
        img = self.env.render(mode='rgb_array')
        if img is None:
            # fallback: return black image if render unsupported
            img = np.zeros((self.height, self.width, 3), dtype=np.uint8)
        else:
            img = cv2.resize(img, (self.width, self.height))
        return img

ModuleNotFoundError: No module named 'gym'

In [9]:
# Cell 4 — Make a test env and show one frame
def make_env(seed=SEED):
    env = gym.make("ReacherBulletEnv-v0")  # lightweight reach task
    env.seed(seed)
    env = RGBObservationWrapper(env, width=IMG_W, height=IMG_H)
    env = Monitor(env)  # records to monitor file
    return env

# quick render test
env = make_env()
obs = env.reset()
frame = env.render(mode='rgb_array')
print("Frame shape (raw):", None if frame is None else frame.shape)
# display inline if in notebook
from IPython.display import display, Image
if frame is not None:
    # convert to PNG bytes and display
    _, buf = cv2.imencode('.png', cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
    display(Image(data=buf.tobytes()))
env.close()

NameError: name 'gym' is not defined

In [10]:
# Cell 5 — Training PPO from pixels
from stable_baselines3.common.vec_env import VecFrameStack

def train(total_timesteps=TOTAL_TIMESTEPS):
    def _make():
        return make_env()
    venv = DummyVecEnv([_make])
    # SB3 expects channel-first (C,H,W). VecTransposeImage handles H,W,3 -> C,H,W
    venv = VecTransposeImage(venv, (3, IMG_H, IMG_W))
    # optional: frame stacking (improves temporal awareness) — uncomment if desired
    # venv = VecFrameStack(venv, n_stack=4)

    model = PPO("CnnPolicy", venv, verbose=1, tensorboard_log=TB_DIR)
    checkpoint_cb = CheckpointCallback(save_freq=CHECKPOINT_FREQ, save_path=MODEL_DIR, name_prefix="ppo_rgb")
    t0 = time.time()
    model.learn(total_timesteps=total_timesteps, callback=checkpoint_cb)
    t1 = time.time()
    model_path = os.path.join(MODEL_DIR, "ppo_rgb_final")
    model.save(model_path)
    print(f"Training finished in {t1-t0:.1f}s. Model saved to {model_path}")
    return model_path

# Run training (this cell will take time)
model_path = train()

ModuleNotFoundError: No module named 'stable_baselines3'

In [11]:
# Cell 6 — Evaluate policy and get average episodic reward
from stable_baselines3 import PPO

def evaluate_model(model_path, episodes=20, render_size=256, record=False, record_path=None):
    env = gym.make("ReacherBulletEnv-v0")
    env = RGBObservationWrapper(env, width=render_size, height=render_size)
    model = PPO.load(model_path)
    episode_rewards = []
    frames = []
    for ep in range(episodes):
        obs = env.reset()
        done = False
        ep_rew = 0.0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, rew, done, info = env.step(action)
            ep_rew += rew
            if record:
                frm = env.render(mode='rgb_array')
                if frm is not None:
                    frames.append(frm)
        episode_rewards.append(ep_rew)
    env.close()
    if record and record_path and frames:
        imageio.mimwrite(record_path, frames, fps=30)
    avg = float(np.mean(episode_rewards))
    std = float(np.std(episode_rewards))
    return {"avg_reward": avg, "std_reward": std, "episode_rewards": episode_rewards}

# Example usage (set model_path to returned path after training)
# result = evaluate_model(model_path, episodes=10, record=True, record_path=f"{DEMO_DIR}/policy_demo.mp4")
# print(result)

ModuleNotFoundError: No module named 'stable_baselines3'

In [12]:
# Cell 7 — Record demonstration video (one episode, larger resolution)
def record_one_demo(model_path, out_file=f"{DEMO_DIR}/policy_demo.mp4", render_size=256):
    r = evaluate_model(model_path, episodes=1, render_size=render_size, record=True, record_path=out_file)
    print("Recorded demo:", out_file, " | result:", r)
    return out_file

# Example (uncomment to run after training)
# demo_file = record_one_demo(model_path)

In [13]:
# Cell 8 — Plot reward curve from Monitor files
import glob
import pandas as pd

def load_monitor_files(log_dir=LOG_DIR):
    # SB3 Monitor often writes monitor files to current working directory or provided path.
    # Search recursively in project dir
    files = glob.glob(f"{PROJECT_DIR}/**/monitor.csv", recursive=True) + glob.glob(f"{LOG_DIR}/**/monitor.csv", recursive=True)
    files = list(set(files))
    return files

def plot_rewards_from_monitors(save_path=f"{PROJECT_DIR}/reward_curve.png", window=10):
    files = load_monitor_files()
    if not files:
        print("No monitor.csv found. If you used Monitor, check where logs were written.")
        return
    all_rewards = []
    for f in files:
        df = pd.read_csv(f, comment='#')
        if 'r' in df.columns:
            all_rewards.append(df['r'].values)
    # Concatenate and plot moving average
    rewards = np.concatenate([r for r in all_rewards]) if all_rewards else np.array([])
    if rewards.size == 0:
        print("No reward entries found.")
        return
    movavg = np.convolve(rewards, np.ones(window)/window, mode='valid')
    plt.figure(figsize=(8,4))
    plt.plot(movavg)
    plt.xlabel("episodes (smoothed)")
    plt.ylabel("episodic reward (moving avg)")
    plt.title("Training reward curve (smoothed)")
    plt.tight_layout()
    plt.savefig(save_path, dpi=200)
    plt.show()
    print("Saved reward curve to", save_path)

# Example
# plot_rewards_from_monitors()

NameError: name 'PROJECT_DIR' is not defined

In [None]:
# Cell 9 — Save report (~400 words) for submission
report_text = """Integrating visual perception into policy learning...
(PLACE the 400-word report text here — replace this placeholder with the report content provided earlier.)
"""
report_path = f"{PROJECT_DIR}/report.md"
with open(report_path, "w") as f:
    f.write(report_text.strip())
print("Report saved to", report_path)

In [None]:
# Cell 9 — Save report (~400 words) for submission
report_text = """Integrating visual perception into policy learning...
(PLACE the 400-word report text here — replace this placeholder with the report content provided earlier.)
"""
report_path = f"{PROJECT_DIR}/report.md"
with open(report_path, "w") as f:
    f.write(report_text.strip())
print("Report saved to", report_path)