In [None]:
pip install swig

Collecting swig
  Downloading swig-4.3.1.post0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (3.5 kB)
Downloading swig-4.3.1.post0-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: swig
Successfully installed swig-4.3.1.post0


In [None]:
pip install "gymnasium[box2d]"

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Downloading box2d-py-2.3.5.tar.gz (374 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/374.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━[0m [32m317.4/374.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.4/374.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp312-cp312-linux_x86_64.whl size=2409532 sha256=82a7a8f71b71315f928671ed6da7770ac2f53e42268e3f3efd48d5a914fdf253
  Stored in directory: /root/.cache/pip/wheels/2a/e9/60/774da0bcd07f7dc7761a8590fa2d065e4069568e78dcdc3318
Successfully built box2d-py
Installing collected packages: box2d-py
Successfully ins

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
from gymnasium import spaces
from collections import deque
import random
import matplotlib.pyplot as plt
import os
import pickle
from datetime import datetime

# Deterministic Environment from Assignment 1

In [None]:
class DeterministicWarehouseRobotEnv(gym.Env):
    """Deterministic 6x6 warehouse grid world.

    The agent starts at (0, 0), must pick an item up at ``pickup_loc`` and
    drop it at ``dropoff_loc``. Positions are ``[row, col]``.

    Actions: 0 up, 1 down, 2 left, 3 right, 4 pick-up, 5 drop-off.
    Observation: ``[row, col, carrying]`` as an int32 array.
    Rewards: -1 per step, 25 for a valid pick-up, 100 for a valid drop-off
    (which terminates the episode), and an extra -20 when a move is blocked
    by an obstacle. Moves that would leave the grid keep the agent in place.
    """

    metadata = {"render_modes": ["human", "rgb_array"]}

    def __init__(self, stochastic=False, max_steps=100):
        """Set up the grid layout and spaces.

        Args:
            stochastic: Accepted but not read anywhere in this class.
            max_steps: Step count at which the episode is flagged truncated.
        """
        super().__init__()
        self.grid_size = 6
        self.obstacles = [(1, 1), (2, 3), (4, 4)]
        self.pickup_loc = (0, 5)
        self.dropoff_loc = (5, 0)

        self.action_space = spaces.Discrete(6)
        side = self.grid_size
        self.observation_space = spaces.MultiDiscrete([side, side, 2])

        self.max_steps = max_steps

    def reset(self, seed=None, options=None):
        """Put the agent back at (0, 0), empty-handed, and clear episode flags.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.agent_pos = [0, 0]
        self.carrying = 0
        self.current_step = 0
        self.terminated = self.truncated = False
        return self._get_obs(), {}

    def step(self, action):
        """Apply one action and return ``(obs, reward, terminated, truncated, info)``."""
        self.current_step += 1
        reward = -1

        row, col = self.agent_pos
        here = (row, col)
        target = [row, col]

        # Row/column offsets indexed by action code 0..3 (up, down, left, right).
        step_offsets = ((-1, 0), (1, 0), (0, -1), (0, 1))
        for code, (d_row, d_col) in enumerate(step_offsets):
            if action == code:
                target = [row + d_row, col + d_col]
                break
        else:
            # Not a movement action: the agent stays where it is.
            if action == 4:      # Pick-up
                if here == self.pickup_loc and self.carrying == 0:
                    self.carrying = 1
                    reward = 25
            elif action == 5:    # Drop-off
                if here == self.dropoff_loc and self.carrying == 1:
                    reward = 100
                    self.terminated = True

        # Out-of-grid targets are ignored; obstacle targets cost an extra 20.
        inside_grid = (0 <= target[0] < self.grid_size
                       and 0 <= target[1] < self.grid_size)
        if inside_grid:
            if tuple(target) in self.obstacles:
                reward -= 20
            else:
                self.agent_pos = target

        if self.current_step >= self.max_steps:
            self.truncated = True

        return self._get_obs(), reward, self.terminated, self.truncated, {}

    def _get_obs(self):
        """Return ``[row, col, carrying]`` as an int32 NumPy array."""
        row, col = self.agent_pos[0], self.agent_pos[1]
        return np.array([row, col, self.carrying], dtype=np.int32)

    def render(self, mode="human"):
        """Draw the grid with matplotlib.

        Args:
            mode: ``"human"`` shows the figure; ``"rgb_array"`` returns the
                canvas RGBA buffer as a NumPy array. Any other value just
                builds and closes the figure.
        """
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        upper = self.grid_size - 0.5
        ax.set_xlim(-0.5, upper)
        ax.set_ylim(-0.5, upper)
        ticks = np.arange(self.grid_size)
        ax.set_xticks(ticks)
        ax.set_yticks(ticks)
        ax.grid(True, which="both", color="gray", linestyle="-", linewidth=0.5)
        ax.set_facecolor('#D3D3D3')

        # Cells are drawn as unit squares; x is the column, y is the row.
        tiles = [(cell, 'black') for cell in self.obstacles]   # obstacles
        tiles.append((self.pickup_loc, 'blue'))                # pickup
        tiles.append((self.dropoff_loc, 'yellow'))             # drop-off
        for (tile_row, tile_col), shade in tiles:
            ax.add_patch(plt.Rectangle((tile_col - 0.5, tile_row - 0.5), 1, 1, facecolor=shade))

        # Agent: red while empty-handed, green while carrying.
        agent_row, agent_col = self.agent_pos
        agent_color = 'green' if self.carrying != 0 else 'red'
        ax.add_patch(plt.Circle((agent_col, agent_row), 0.3, facecolor=agent_color))

        status = 'Carrying' if self.carrying else 'Not Carrying'
        plt.title(f"Step {self.current_step} - {status}")
        plt.gca().invert_yaxis()

        if mode == "rgb_array":
            fig.canvas.draw()
            img = np.array(fig.canvas.renderer.buffer_rgba())
            plt.close()
            return img
        if mode == "human":
            plt.show()
        plt.close()

# DQN Network

In [None]:
class DQNNet(nn.Module):
    """Fully connected Q-network: input -> 128 -> 64 -> output, ReLU between layers."""

    def __init__(self, input_size, output_size):
        """Build the layer stack.

        Args:
            input_size: Number of input features.
            output_size: Number of outputs (one Q-value per action).
        """
        super().__init__()
        widths = (input_size, 128, 64)
        layers = []
        # Hidden layers: Linear followed by ReLU for each consecutive width pair.
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Output head has no activation.
        layers.append(nn.Linear(widths[-1], output_size))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input batch ``x``."""
        q_values = self.net(x)
        return q_values

# Replay Buffer

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        """Create an empty buffer holding at most ``capacity`` transitions."""
        # A deque with maxlen drops its oldest entry when a new one overflows it.
        self.buffer = deque([], maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` tuple."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random."""
        drawn = random.sample(self.buffer, k=batch_size)
        return drawn

    def __len__(self):
        """Number of transitions currently stored."""
        return self.buffer.__len__()

# DQN Agent

In [None]:
class DQNAgent:
    """Double-DQN agent with an experience replay buffer and a soft-updated target network.

    Actions are chosen epsilon-greedily. Each ``train_step`` samples a batch,
    lets the online network pick the next action and the target network
    evaluate it, takes one optimizer step, blends the target network towards
    the online network, and decays epsilon.
    """

    def __init__(self, state_dims, action_size,
                 lr=1e-3, gamma=0.99, epsilon=1.0,
                 epsilon_min=0.01, epsilon_decay=0.995,
                 buffer_size=10000, batch_size=64):
        """Create the networks, optimizer and replay buffer.

        Args:
            state_dims: Number of input features fed to the Q-network.
            action_size: Number of discrete actions.
            lr: Adam learning rate.
            gamma: Discount factor.
            epsilon: Initial exploration probability.
            epsilon_min: Floor for epsilon while it decays.
            epsilon_decay: Multiplicative decay applied once per training step.
            buffer_size: Replay buffer capacity.
            batch_size: Transitions sampled per training step.
        """
        self.state_dims = state_dims
        self.action_size = action_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.batch_size = batch_size

        self.memory = ReplayBuffer(buffer_size)
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        input_size = state_dims
        self.model = DQNNet(input_size, action_size).to(self.device)
        self.target_model = DQNNet(input_size, action_size).to(self.device)
        # Start the target network as an exact copy of the online network.
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.criterion = nn.SmoothL1Loss()

        # Not read by train_step, which soft-updates every step; kept as an attribute.
        self.update_target_every = 1000
        self.step_count = 0

    def _as_tensor(self, values, dtype):
        """Convert a sequence or array to a tensor of ``dtype`` on the agent's device."""
        return torch.as_tensor(np.asarray(values), dtype=dtype, device=self.device)

    def select_action(self, state):
        """Return a random action with probability epsilon, else the greedy action."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state_tensor = self._as_tensor(state, torch.float32).flatten().unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state_tensor)
        return q_values.argmax().item()

    def store(self, state, action, reward, next_state, done):
        """Add one transition to the replay buffer."""
        self.memory.push(state, action, reward, next_state, done)

    def soft_update(self, tau=0.005):
        """Blend target parameters towards the online ones: ``tau*online + (1-tau)*target``."""
        with torch.no_grad():
            for target_param, param in zip(self.target_model.parameters(), self.model.parameters()):
                target_param.copy_(tau * param + (1.0 - tau) * target_param)

    def train_step(self):
        """Run one optimization step on a sampled batch.

        Returns:
            The scalar loss as a float, or ``None`` when the buffer holds
            fewer than ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return None

        batch = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        states = self._as_tensor(np.stack(states), torch.float32)
        actions = self._as_tensor(actions, torch.int64)
        rewards = self._as_tensor(rewards, torch.float32)
        next_states = self._as_tensor(np.stack(next_states), torch.float32)
        dones = self._as_tensor(dones, torch.float32)

        current_q = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # The target is a constant for the loss, so build it without autograd tracking.
        with torch.no_grad():
            next_actions = self.model(next_states).argmax(1, keepdim=True)
            next_q = self.target_model(next_states).gather(1, next_actions).squeeze(1)
            target_q = rewards + (1 - dones) * self.gamma * next_q

        loss = self.criterion(current_q, target_q)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        self.step_count += 1
        self.soft_update(tau=0.01)

        # Decay epsilon, clamping so the last multiplication cannot fall below the floor.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

        return loss.item()

    def save_model(self, path):
        """Save the online network's state dict to ``path``."""
        torch.save(self.model.state_dict(), path)

# Training Function

In [None]:
def train_dqn(env, env_name, episodes=1000, max_steps=500):
    """Train a fresh DQNAgent on ``env`` and save its weights.

    Args:
        env: Gymnasium-style environment with a discrete action space.
        env_name: Label used in log messages and in the saved model filename.
        episodes: Number of training episodes.
        max_steps: Per-episode cap on steps taken by this loop, applied in
            addition to the environment's own termination and truncation.

    Returns:
        ``(rewards_history, epsilon_history, model_path)``: total reward per
        episode, the agent's epsilon at the end of each episode, and the path
        the model state dict was written to.
    """
    print(f"\n=== Training DQN on {env_name} ===")

    # Observations are flattened before storage, so the network input is the
    # total number of observation entries (3 for the warehouse MultiDiscrete space).
    state_dim = int(np.prod(env.observation_space.shape))

    agent = DQNAgent(state_dim, env.action_space.n)

    rewards_history = []
    epsilon_history = []

    for ep in range(episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        step = 0

        while not done and step < max_steps:
            action = agent.select_action(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            # Store only true termination as the terminal flag. A time-limit
            # truncation is not a terminal state, so the target should still
            # bootstrap from next_state in that case.
            agent.store(np.asarray(state).ravel(), action, reward,
                        np.asarray(next_state).ravel(), terminated)
            agent.train_step()
            state = next_state
            total_reward += reward
            step += 1

        rewards_history.append(total_reward)
        epsilon_history.append(agent.epsilon)

        if (ep+1) % 100 == 0:
            avg_reward = np.mean(rewards_history[-100:])
            print(f"Episode {ep+1}/{episodes} | Avg Reward (last 100): {avg_reward:.2f} | Epsilon: {agent.epsilon:.3f}")

    # Save model
    model_path = f"a2_part_2_dqn_{env_name.lower().replace('-', '')}_TEAMMATE1_TEAMMATE2.pth"
    agent.save_model(model_path)
    print(f"Model saved: {model_path}")

    return rewards_history, epsilon_history, model_path

# Evaluation (Greedy)

In [None]:
def evaluate_greedy(env, model_path, state_dims, episodes=5):
    """Run a saved DQN greedily (no exploration) and report episode returns.

    Args:
        env: Environment to evaluate on; each episode runs until it reports
            terminated or truncated.
        model_path: Path to a state dict saved from a ``DQNNet``.
        state_dims: Observation shape (or feature count) used to size the
            network input. Ignored when the observation space exposes
            ``nvec``, where the input size is the number of ``nvec`` entries.
        episodes: Number of evaluation episodes.

    Returns:
        List with the total reward of each episode.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if hasattr(env.observation_space, "nvec"):
        # One input feature per MultiDiscrete component (3 for the warehouse env).
        input_size = int(np.size(env.observation_space.nvec))
    else:
        input_size = int(np.prod(state_dims))

    model = DQNNet(input_size, env.action_space.n).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    rewards = []
    for ep in range(episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            s = torch.as_tensor(np.asarray(state), dtype=torch.float32, device=device).flatten().unsqueeze(0)
            with torch.no_grad():
                action = model(s).argmax().item()
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward
            state = next_state
        rewards.append(total_reward)
        print(f"Episode {ep+1}: Total Reward = {total_reward}")

    print(f"\nAverage Reward over {episodes} eval episodes: {np.mean(rewards):.2f}")
    return rewards

# Render and Save video

In [None]:
import imageio
from IPython.display import Video, display
def render_and_save_video(env, model_path, state_dims, filename="warehouse_eval.mp4", fps=5):
    """Roll out one greedy episode, save it as a video, and show it inline.

    Frames come from ``env.render(mode="rgb_array")``, the render signature
    of the warehouse environment in this notebook. The episode is written to
    ``filename`` as MP4; if that fails, a GIF with the same stem is written
    instead. The environment is closed before returning.

    Args:
        env: Environment to roll out.
        model_path: Path to a state dict saved from a ``DQNNet``.
        state_dims: Observation shape (or feature count) used to size the
            network input. Ignored when the observation space exposes
            ``nvec``, where the input size is the number of ``nvec`` entries.
        filename: Output video path.
        fps: Frames per second of the saved video.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if hasattr(env.observation_space, "nvec"):
        # One input feature per MultiDiscrete component (3 for the warehouse env).
        input_size = int(np.size(env.observation_space.nvec))
    else:
        input_size = int(np.prod(state_dims))

    # Load model
    model = DQNNet(input_size, env.action_space.n).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    can_render = callable(getattr(env, "render", None))
    frames = []

    def _capture_frame():
        """Append the current rgb_array frame, if the env can render one."""
        frame = env.render(mode="rgb_array") if can_render else None
        if frame is not None:
            frames.append(frame)

    try:
        state, _ = env.reset()
        done = False

        while not done:
            _capture_frame()
            s = torch.as_tensor(np.asarray(state), dtype=torch.float32, device=device).flatten().unsqueeze(0)
            with torch.no_grad():
                action = model(s).argmax().item()
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            state = next_state

        # Also capture the state reached by the last step so the video shows the ending.
        _capture_frame()

        # Save video (use imageio + ffmpeg). The context manager closes the
        # writer even when appending a frame fails.
        saved_path = filename
        try:
            with imageio.get_writer(filename, fps=fps, macro_block_size=None) as writer:
                for f in frames:
                    arr = np.asarray(f)
                    if arr.dtype != np.uint8:
                        arr = (255 * np.clip(arr, 0, 1)).astype(np.uint8)
                    writer.append_data(arr)
            print(f"✅ Video saved: {filename}")
        except Exception as e:
            fallback = filename.rsplit(".", 1)[0] + ".gif"
            imageio.mimsave(fallback, frames, fps=fps)
            saved_path = fallback
            print(f"Saved as GIF instead ({fallback}) because MP4 failed. Error: {e}")

        # Show inline whichever file was actually written. Display is best-effort,
        # but report why it failed instead of silently swallowing the error.
        try:
            if saved_path.endswith(".gif"):
                from IPython.display import Image
                display(Image(filename=saved_path))
            else:
                display(Video(saved_path, embed=True, width=480))
        except Exception as exc:
            print(f"Could not display {saved_path} inline: {exc}")
    finally:
        env.close()


# Training

In [None]:
# 1. Warehouse -- custom grid world, episodes truncated after 200 steps
warehouse_env = DeterministicWarehouseRobotEnv(max_steps=200)
warehouse_rewards, warehouse_eps, warehouse_model = train_dqn(
    env=warehouse_env,
    env_name="Warehouse",
    episodes=800,
)


=== Training DQN on Warehouse ===
Episode 100/800 | Avg Reward (last 100): -182.62 | Epsilon: 0.010
Episode 200/800 | Avg Reward (last 100): -190.50 | Epsilon: 0.010
Episode 300/800 | Avg Reward (last 100): -165.68 | Epsilon: 0.010
Episode 400/800 | Avg Reward (last 100): 99.51 | Epsilon: 0.010
Episode 500/800 | Avg Reward (last 100): 61.72 | Epsilon: 0.010
Episode 600/800 | Avg Reward (last 100): 42.69 | Epsilon: 0.010
Episode 700/800 | Avg Reward (last 100): 97.16 | Epsilon: 0.010
Episode 800/800 | Avg Reward (last 100): 86.88 | Epsilon: 0.010
Model saved: a2_part_2_dqn_warehouse_TEAMMATE1_TEAMMATE2.pth


In [None]:
# 2. CartPole -- Gymnasium built-in, created with rgb_array rendering
cartpole_env = gym.make("CartPole-v1", render_mode="rgb_array")
cartpole_rewards, cartpole_eps, cartpole_model = train_dqn(
    env=cartpole_env,
    env_name="CartPole-v1",
    episodes=500,
)


=== Training DQN on CartPole-v1 ===
Episode 100/500 | Avg Reward (last 100): 13.98 | Epsilon: 0.010
Episode 200/500 | Avg Reward (last 100): 12.04 | Epsilon: 0.010
Episode 300/500 | Avg Reward (last 100): 99.49 | Epsilon: 0.010
Episode 400/500 | Avg Reward (last 100): 211.10 | Epsilon: 0.010
Episode 500/500 | Avg Reward (last 100): 198.53 | Epsilon: 0.010
Model saved: a2_part_2_dqn_cartpolev1_TEAMMATE1_TEAMMATE2.pth


In [None]:
# 3. LunarLander -- Box2D environment, created with rgb_array rendering
lander_env = gym.make("LunarLander-v2", render_mode="rgb_array")
lander_rewards, lander_eps, lander_model = train_dqn(
    env=lander_env,
    env_name="LunarLander-v2",
    episodes=800,
)



=== Training DQN on LunarLander-v2 ===
Episode 100/800 | Avg Reward (last 100): -86.93 | Epsilon: 0.010
Episode 200/800 | Avg Reward (last 100): 50.43 | Epsilon: 0.010
Episode 300/800 | Avg Reward (last 100): 117.08 | Epsilon: 0.010
Episode 400/800 | Avg Reward (last 100): 172.41 | Epsilon: 0.010
Episode 500/800 | Avg Reward (last 100): 84.06 | Epsilon: 0.010
Episode 600/800 | Avg Reward (last 100): 216.22 | Epsilon: 0.010
Episode 700/800 | Avg Reward (last 100): 243.21 | Epsilon: 0.010
Episode 800/800 | Avg Reward (last 100): 237.53 | Epsilon: 0.010
Model saved: a2_part_2_dqn_lunarlanderv2_TEAMMATE1_TEAMMATE2.pth


In [None]:
# ------------------- 9. Evaluation -------------------
# Every call passes the observation-space shape as state_dims. The warehouse
# network takes 3 input features, not prod(nvec) == 72 (the number of discrete states).
print("\n=== Evaluation (Greedy Policy) ===")
warehouse_eval = evaluate_greedy(warehouse_env, warehouse_model, warehouse_env.observation_space.shape)
cartpole_eval = evaluate_greedy(cartpole_env, cartpole_model, cartpole_env.observation_space.shape)
lander_eval = evaluate_greedy(lander_env, lander_model, lander_env.observation_space.shape)

print(f"Warehouse Eval (5 eps): {warehouse_eval} | Avg: {np.mean(warehouse_eval):.2f}")
print(f"CartPole Eval (5 eps): {cartpole_eval} | Avg: {np.mean(cartpole_eval):.2f}")
print(f"LunarLander Eval (5 eps): {lander_eval} | Avg: {np.mean(lander_eval):.2f}")


=== Evaluation (Greedy Policy) ===
Episode 1: Total Reward = -200
Episode 2: Total Reward = -200
Episode 3: Total Reward = -200
Episode 4: Total Reward = -200
Episode 5: Total Reward = -200

Average Reward over 5 eval episodes: -200.00
Episode 1: Total Reward = 139.0
Episode 2: Total Reward = 129.0
Episode 3: Total Reward = 198.0
Episode 4: Total Reward = 141.0
Episode 5: Total Reward = 130.0

Average Reward over 5 eval episodes: 147.40
Episode 1: Total Reward = 258.184149972611
Episode 2: Total Reward = 25.79503780883431
Episode 3: Total Reward = 287.6463769103842
Episode 4: Total Reward = 271.0297323584044
Episode 5: Total Reward = 31.820248904891685

Average Reward over 5 eval episodes: 174.90
Warehouse Eval (5 eps): [-200, -200, -200, -200, -200] | Avg: -200.00
CartPole Eval (5 eps): [139.0, 129.0, 198.0, 141.0, 130.0] | Avg: 147.40
LunarLander Eval (5 eps): [np.float64(258.184149972611), np.float64(25.79503780883431), np.float64(287.6463769103842), np.float64(271.0297323584044), 

In [None]:
# ------------------- 10. Save Video (Warehouse) -------------------
# Pass the observation shape (3 features) as state_dims rather than nvec,
# whose product (72) is the number of discrete states, not the network input size.
render_and_save_video(warehouse_env, warehouse_model, warehouse_env.observation_space.shape,
                        "a2_part_2_warehouse_eval_TEAMMATE1_TEAMMATE2.mp4")

✅ Video saved: a2_part_2_warehouse_eval_TEAMMATE1_TEAMMATE2.mp4


In [None]:
# ------------------- 11. Plotting -------------------
# Top row: total reward per training episode. Bottom row: epsilon recorded at
# the end of each episode. Every panel gets axis labels so it reads on its own.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

reward_panels = [
    (warehouse_rewards, "Warehouse - Reward per Episode"),
    (cartpole_rewards, "CartPole - Reward per Episode"),
    (lander_rewards, "LunarLander - Reward per Episode"),
]
epsilon_panels = [
    (warehouse_eps, "Epsilon Decay (Warehouse)"),
    (cartpole_eps, "Epsilon Decay (CartPole)"),
    (lander_eps, "Epsilon Decay (LunarLander)"),
]

for ax, (series, panel_title) in zip(axes[0], reward_panels):
    ax.plot(series)
    ax.set(title=panel_title, xlabel="Episode", ylabel="Total Reward")

for ax, (series, panel_title) in zip(axes[1], epsilon_panels):
    ax.plot(series)
    ax.set(title=panel_title, xlabel="Episode", ylabel="Epsilon")

fig.tight_layout()
fig.savefig("a2_part_2_training_plots_TEAMMATE1_TEAMMATE2.png")
plt.show()

In [None]:
# Persist training curves and evaluation returns for the report
with open("a2_part_2_rewards_TEAMMATE1_TEAMMATE2.pkl", "wb") as f:
    pickle.dump(
        dict(
            warehouse=warehouse_rewards,
            cartpole=cartpole_rewards,
            lander=lander_rewards,
            eval=dict(
                warehouse=warehouse_eval,
                cartpole=cartpole_eval,
                lander=lander_eval,
            ),
        ),
        f,
    )