In [2]:
!pip install gymnasium[atari,accept-rom-license] autorom[accept-rom-license]






In [3]:
!pip install torch torchvision numpy matplotlib



In [4]:
!pip install gym



In [5]:
import gym
print("Gym version:", gym.__version__)

Gym version: 0.26.2


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


In [6]:
%run assignment3_utils.py

In [7]:
# 1) IMPORTS & CONFIG
import random, collections, math, numpy as np
import torch, torch.nn as nn, torch.optim as optim
import matplotlib.pyplot as plt
from collections import deque

import gymnasium as gym  # use classic 'gym' if you prefer; adjust reset/step returns

# ---- your helper (placed in the same folder) ----
from assignment3_utils import process_frame   # crop -> downsample -> grayscale -> normalize

# -------------------------
# GLOBAL CONFIG
# -------------------------
ENV_ID = "PongDeterministic-v4"
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

GAMMA = 0.95
EPS_START = 1.0
EPS_DECAY = 0.995
EPS_MIN = 0.05

BATCH_SIZE = 8             # (experiment: also try 16)
TARGET_UPDATE_EP = 10      # (experiment: also try 3)
LR = 1e-4
REPLAY_CAPACITY = 100_000
MIN_REPLAY_TO_LEARN = 20 * BATCH_SIZE

STACK_N = 4
FRAME_H, FRAME_W = 84, 80  # per your utils pipeline
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)


<torch._C.Generator at 0x1b92097b730>

In [8]:
# 2) FRAME STACKING HELPERS
def make_initial_stack(env, image_shape=(FRAME_H, FRAME_W)):
    obs, _ = env.reset(seed=SEED)
    f = process_frame(obs, image_shape=image_shape)  # -> (1, H, W, 1)
    f = f.squeeze(0).squeeze(-1)                     # -> (H, W)
    stack = deque([f for _ in range(STACK_N)], maxlen=STACK_N)
    return stack

def append_frame(stack, obs, image_shape=(FRAME_H, FRAME_W)):
    f = process_frame(obs, image_shape=image_shape)  # (1,H,W,1)
    f = f.squeeze(0).squeeze(-1)                     # (H,W)
    stack.append(f)

def stack_to_tensor(stack):
    s = np.stack(stack, axis=0)        # (4, H, W)
    s = torch.from_numpy(s).float()    # float32
    return s.to(DEVICE)



In [9]:
# 3) REPLAY BUFFER
Transition = collections.namedtuple("Transition",
    ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, capacity):
        self.buf = collections.deque(maxlen=capacity)
    def __len__(self):
        return len(self.buf)
    def push(self, *args):
        self.buf.append(Transition(*args))
    def sample(self, batch_size):
        batch = random.sample(self.buf, batch_size)
        return Transition(*zip(*batch))



In [10]:
# 4) DQN MODEL
class DQN(nn.Module):
    def __init__(self, num_actions):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(STACK_N, 32, kernel_size=8, stride=4), nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),       nn.ReLU(inplace=True),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),       nn.ReLU(inplace=True),
        )
        # infer conv output size
        with torch.no_grad():
            dummy = torch.zeros(1, STACK_N, FRAME_H, FRAME_W)
            conv_out = self.features(dummy).view(1, -1).shape[1]
        self.head = nn.Sequential(
            nn.Linear(conv_out, 512), nn.ReLU(inplace=True),
            nn.Linear(512, num_actions)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.flatten(start_dim=1)
        return self.head(x)



In [11]:
# 6) TRAINING STEP
def train_step(qnet, tgt, optimizer, replay, batch_size, gamma):
    batch = replay.sample(batch_size)

    state_batch = torch.stack(batch.state).to(DEVICE)       # (B, 4, 84, 80)
    next_batch  = torch.stack(batch.next_state).to(DEVICE)  # (B, 4, 84, 80)
    action_batch = torch.tensor(batch.action, device=DEVICE).long()   # (B,)
    reward_batch = torch.tensor(batch.reward, device=DEVICE).float()  # (B,)
    done_batch   = torch.tensor(batch.done,   device=DEVICE).float()  # (B,)

    q_sa = qnet(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)

    with torch.no_grad():
        next_q_max = tgt(next_batch).max(dim=1)[0]
        target = reward_batch + gamma * next_q_max * (1.0 - done_batch)

    loss = nn.functional.smooth_l1_loss(q_sa, target)
    optimizer.zero_grad()
    loss.backward()
    nn.utils.clip_grad_norm_(qnet.parameters(), 10.0)
    optimizer.step()
    return float(loss.item()

SyntaxError: incomplete input (3856864595.py, line 22)

In [None]:
# 7) TRAINING LOOP
def run_training(episodes=20, batch_size=BATCH_SIZE, target_update_ep=TARGET_UPDATE_EP,
                 render=False):
    env = gym.make(ENV_ID, render_mode="human" if render else None)
    num_actions = env.action_space.n

    qnet = DQN(num_actions).to(DEVICE)
    tgt  = DQN(num_actions).to(DEVICE)
    tgt.load_state_dict(qnet.state_dict()); tgt.eval()

    optimizer = optim.Adam(qnet.parameters(), lr=LR)
    replay = ReplayBuffer(REPLAY_CAPACITY)

    epsilon = EPS_START
    scores, avg5, losses = [], [], []

    for ep in range(1, episodes + 1):
        stack = make_initial_stack(env)
        state_t = stack_to_tensor(stack)
        done = False
        ep_reward = 0.0
        ep_losses = []

        while not done:
            action = select_action(qnet, state_t, epsilon, num_actions)
            obs, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            ep_reward += reward

            append_frame(stack, obs)
            next_t = stack_to_tensor(stack)

            replay.push(state_t, action, reward, next_t, float(done))
            state_t = next_t

            if len(replay) >= max(MIN_REPLAY_TO_LEARN, batch_size):
                ep_losses.append(train_step(qnet, tgt, optimizer, replay, batch_size, GAMMA))

        epsilon = max(EPS_MIN, epsilon * EPS_DECAY)
        if ep % target_update_ep == 0:
            tgt.load_state_dict(qnet.state_dict())

        scores.append(ep_reward)
        avg5.append(np.mean(scores[-5:]))
        losses.append(np.mean(ep_losses) if ep_losses else 0.0)

        print(f"Ep {ep:03d} | score={ep_reward:.1f} | avg5={avg5[-1]:.2f} | "
              f"eps={epsilon:.3f} | loss={losses[-1]:.4f}")

    env.close()
    return {"scores": scores, "avg5": avg5, "losses": losses, "model": qnet.state_dict()}



In [None]:
# 8) PLOTTING HELPERS
def plot_metric(values, title, ylabel, xlabel="Episode"):
    plt.figure()
    plt.plot(values)
    plt.title(title)
    plt.xlabel(xlabel); plt.ylabel(ylabel)
    plt.grid(True)
    plt.tight_layout()



In [None]:
import os
import pandas as pd

# 10) SAVE RESULTS
def save_results_csv(folder, name, scores, avg5, losses):
    """Save episode metrics to CSV for report inclusion."""
    os.makedirs(folder, exist_ok=True)
    df = pd.DataFrame({
        "Episode": range(1, len(scores)+1),
        "Score": scores,
        "Avg5": avg5,
        "Loss": losses
    })
    csv_path = os.path.join(folder, f"{name}.csv")
    df.to_csv(csv_path, index=False)
    print(f" Results saved to {csv_path}")

def save_plot_png(folder, name, values, title, ylabel, xlabel="Episode"):
    """Save metric plot as PNG."""
    os.makedirs(folder, exist_ok=True)
    plt.figure()
    plt.plot(values, label=ylabel)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.legend()
    plt.tight_layout()
    img_path = os.path.join(folder, f"{name}.png")
    plt.savefig(img_path)
    plt.close()
    print(f" Plot saved to {img_path}")



In [None]:
def run_all_experiments():
    os.makedirs("results", exist_ok=True)

    # ====== Baseline ======
    print("\n== Baseline: batch=8, target_update=10 ==")
    res_base = run_training(episodes=20, batch_size=8, target_update_ep=10)
    save_results_csv("results", "baseline_b8_t10", res_base["scores"], res_base["avg5"], res_base["losses"])
    save_plot_png("results", "baseline_b8_t10_score", res_base["scores"],
                  "Score per Episode (batch=8, target=10)", "Score")
    save_plot_png("results", "baseline_b8_t10_avg5", res_base["avg5"],
                  "Avg Reward (last 5) (batch=8, target=10)", "Avg(5) Reward")

    # ====== Batch Size Experiment ======
    print("\n== Batch Size Experiment: batch=16, target_update=10 ==")
    res_b16 = run_training(episodes=20, batch_size=16, target_update_ep=10)
    save_results_csv("results", "batch16_t10", res_b16["scores"], res_b16["avg5"], res_b16["losses"])
    save_plot_png("results", "batch16_t10_score", res_b16["scores"],
                  "Score per Episode (batch=16, target=10)", "Score")
    save_plot_png("results", "batch16_t10_avg5", res_b16["avg5"],
                  "Avg Reward (last 5) (batch=16, target=10)", "Avg(5) Reward")

    # ====== Target Update Experiment ======
    print("\n== Target Update Experiment: batch=8, target_update=3 ==")
    res_t3 = run_training(episodes=20, batch_size=8, target_update_ep=3)
    save_results_csv("results", "batch8_t3", res_t3["scores"], res_t3["avg5"], res_t3["losses"])
    save_plot_png("results", "batch8_t3_score", res_t3["scores"],
                  "Score per Episode (batch=8, target=3)", "Score")
    save_plot_png("results", "batch8_t3_avg5", res_t3["avg5"],
                  "Avg Reward (last 5) (batch=8, target=3)", "Avg(5) Reward")

    print("\n All results saved under the 'results' folder!")



In [None]:
# =============================================================
#  TASK: Plot effects of deliberate parameter changes
# =============================================================

# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------------------------------------------
# Step 1: Load the results from previous experiments
# -------------------------------------------------------------
# Make sure the CSV files are in your working directory
df_base = pd.read_csv("D:/2 Level\Reinforcement/Assignment 3/Results/baseline_b8_t10.csv")   # Default setup
df_b16 = pd.read_csv("D:/2 Level/Reinforcement/Assignment 3/Results/batch16_t10.csv")        # Changing batch size
df_t3 = pd.read_csv("D:/2 Level/Reinforcement/Assignment 3/Results/batch8_t3.csv")           # Changing target update rate

# -------------------------------------------------------------
# Step 2: Plot 1 — Effect of changing mini-batch size [8 → 16]
# -------------------------------------------------------------

# (a) Score per Episode
plt.figure(figsize=(10,6))
plt.plot(df_base["Episode"], df_base["Score"], label="Batch Size = 8 (Default)", marker='o')
plt.plot(df_b16["Episode"], df_b16["Score"], label="Batch Size = 16", marker='x')
plt.title("Effect of Changing Mini-Batch Size on Score per Episode")
plt.xlabel("Episode")
plt.ylabel("Score per Episode")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("mini_batch_score_comparison.png")
plt.show()

# (b) Average Reward (Last 5 Episodes)
plt.figure(figsize=(10,6))
plt.plot(df_base["Episode"], df_base["Avg5"], label="Batch Size = 8 (Default)", marker='o')
plt.plot(df_b16["Episode"], df_b16["Avg5"], label="Batch Size = 16", marker='x')
plt.title("Effect of Changing Mini-Batch Size on Average Cumulative Reward (Last 5 Episodes)")
plt.xlabel("Episode")
plt.ylabel("Average Reward (5-Episode Window)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("mini_batch_avg5_comparison.png")
plt.show()

# -------------------------------------------------------------
# Step 3: Plot 2 — Effect of changing target update rate [3 → 10]
# -------------------------------------------------------------

# (a) Score per Episode
plt.figure(figsize=(10,6))
plt.plot(df_base["Episode"], df_base["Score"], label="Target Update = 10 (Default)", marker='o')
plt.plot(df_t3["Episode"], df_t3["Score"], label="Target Update = 3", marker='s')
plt.title("Effect of Changing Target Update Rate on Score per Episode")
plt.xlabel("Episode")
plt.ylabel("Score per Episode")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("target_update_score_comparison.png")
plt.show()

# (b) Average Reward (Last 5 Episodes)
plt.figure(figsize=(10,6))
plt.plot(df_base["Episode"], df_base["Avg5"], label="Target Update = 10 (Default)", marker='o')
plt.plot(df_t3["Episode"], df_t3["Avg5"], label="Target Update = 3", marker='s')
plt.title("Effect of Changing Target Update Rate on Average Cumulative Reward (Last 5 Episodes)")
plt.xlabel("Episode")
plt.ylabel("Average Reward (5-Episode Window)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("target_update_avg5_comparison.png")
plt.show()


  df_base = pd.read_csv("D:/2 Level\Reinforcement/Assignment 3/Results/baseline_b8_t10.csv")   # Default setup
