<a href="https://colab.research.google.com/github/Helena26-ai/Rainforcement_project/blob/Helena/Dueling_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install gymnasium[atari,accept-rom-license]==0.29.1

Collecting gymnasium==0.29.1 (from gymnasium[accept-rom-license,atari]==0.29.1)
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting autorom~=0.4.2 (from autorom[accept-rom-license]~=0.4.2; extra == "accept-rom-license"->gymnasium[accept-rom-license,atari]==0.29.1)
  Downloading AutoROM-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Collecting shimmy<1.0,>=0.1.0 (from shimmy[atari]<1.0,>=0.1.0; extra == "atari"->gymnasium[accept-rom-license,atari]==0.29.1)
  Downloading Shimmy-0.2.1-py3-none-any.whl.metadata (2.3 kB)
Collecting AutoROM.accept-rom-license (from autorom[accept-rom-license]~=0.4.2; extra == "accept-rom-license"->gymnasium[accept-rom-license,atari]==0.29.1)
  Downloading AutoROM.accept-rom-license-0.6.1.tar.gz (434 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m434.7/434.7 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  

In [None]:
!pip install stable-baselines3 tensorboard matplotlib -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m184.5/184.5 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m119.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m40.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# DUELING DQN implementation for Pong
import os
import gymnasium as gym
from gymnasium import spaces
import collections
import typing as tt
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.tensorboard.writer import SummaryWriter
from gymnasium.wrappers import RecordVideo
import matplotlib.pyplot as plt
from stable_baselines3.common.atari_wrappers import AtariWrapper
from dataclasses import dataclass

In [None]:
# Create the folders used for logs/checkpoints and for recorded videos (no-op if present).
for out_dir in ("my_runs", "my_runs/video"):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
# ==== DUELING DQN MODEL ====
class DuelingDQN(nn.Module):
    """Convolutional Q-network with separate advantage and value heads.

    Both heads read the same flattened convolutional features; ``forward``
    merges them as ``V + (A - mean(A))``.

    Args:
        input_shape: Shape of one observation without the batch axis;
            ``input_shape[0]`` is used as the number of input channels.
        n_actions: Number of outputs of the advantage head.
    """

    def __init__(self, input_shape: tt.Tuple[int, ...], n_actions: int):
        super().__init__()
        in_channels = input_shape[0]
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Push one all-zero observation through the conv stack to find out
        # how many features the flattened output has.
        probe = torch.zeros(1, *input_shape)
        n_features = self.conv(probe).shape[1]

        # Advantage head is built before the value head (keeps the parameter
        # creation order, and therefore seeded initialisation, unchanged).
        self.fc_adv = self._make_head(n_features, n_actions)
        self.fc_val = self._make_head(n_features, 1)

    @staticmethod
    def _make_head(in_features, out_features):
        """Builds a Linear(256) -> ReLU -> Linear(out_features) head."""
        return nn.Sequential(
            nn.Linear(in_features, 256),
            nn.ReLU(),
            nn.Linear(256, out_features),
        )

    def forward(self, x):
        """Returns Q-values of shape (batch, n_actions) for a batch of observations.

        The input is cast to float and divided by 255 before the convolutions.
        """
        scaled = x.float() / 255.0
        features = self.conv(scaled)
        advantage = self.fc_adv(features)
        value = self.fc_val(features)
        # Subtract the per-sample mean advantage before adding the state value.
        centered = advantage - advantage.mean(dim=1, keepdim=True)
        return value + centered

    #def adv_val(self, x):
     #   x = x.float() / 255.0
      #  conv_out = self.conv(x)
       # return self.fc_adv(conv_out), self.fc_val(conv_out)

In [None]:
# === WRAPPERS ===
class ImageToPyTorch(gym.ObservationWrapper):
    """Observation wrapper that moves axis 2 of every observation to the front.

    The advertised observation space is rebuilt with the permuted shape
    ``(shape[2], shape[0], shape[1])``, using the smallest ``low`` and the
    largest ``high`` of the wrapped space as scalar bounds.
    """

    def __init__(self, env):
        super().__init__(env)
        src_space = self.observation_space
        src_shape = src_space.shape
        permuted_shape = (src_shape[2], src_shape[0], src_shape[1])
        self.observation_space = spaces.Box(
            low=src_space.low.min(),
            high=src_space.high.max(),
            shape=permuted_shape,
            dtype=src_space.dtype,
        )

    def observation(self, observation):
        """Returns the observation with its third axis moved to position 0."""
        return np.moveaxis(observation, source=2, destination=0)

class BufferWrapper(gym.ObservationWrapper):
    """Observation wrapper that concatenates the last ``n_steps`` observations on axis 0.

    Args:
        env: Environment to wrap.
        n_steps: Number of consecutive observations kept in the rolling buffer.
    """

    def __init__(self, env, n_steps):
        super().__init__(env)
        # The deque drops the oldest observation automatically once it is full.
        self.buffer = collections.deque(maxlen=n_steps)
        inner_space = env.observation_space
        stacked_low = np.repeat(inner_space.low, n_steps, axis=0)
        stacked_high = np.repeat(inner_space.high, n_steps, axis=0)
        self.observation_space = spaces.Box(
            low=stacked_low, high=stacked_high, dtype=inner_space.dtype
        )

    def reset(self, **kwargs):
        """Resets the wrapped env and returns ``(stacked_observation, info)``.

        The buffer is pre-filled with ``maxlen - 1`` copies of the wrapped
        space's lower bound so the first stacked observation is full-sized.
        """
        self.buffer.extend(
            self.env.observation_space.low for _ in range(self.buffer.maxlen - 1)
        )
        first_obs, info = self.env.reset(**kwargs)
        return self.observation(first_obs), info

    def observation(self, observation):
        """Pushes the new observation and returns the concatenated buffer."""
        self.buffer.append(observation)
        return np.concatenate(tuple(self.buffer))

def make_env(env_name: str, **kwargs):
    """Creates the environment and applies the preprocessing wrappers.

    Wrapper order: ``AtariWrapper`` (reward clipping off, ``noop_max=0``),
    then ``ImageToPyTorch``, then ``BufferWrapper`` with ``n_steps=4``.

    Args:
        env_name: Environment id passed to ``gym.make``.
        **kwargs: Extra keyword arguments forwarded to ``gym.make``.

    Returns:
        The fully wrapped environment.
    """
    base_env = gym.make(env_name, **kwargs)
    atari_env = AtariWrapper(base_env, clip_reward=False, noop_max=0)
    return BufferWrapper(ImageToPyTorch(atari_env), n_steps=4)

In [None]:
# === PARAMS ===
# Environment and stopping criterion
DEFAULT_ENV_NAME = "PongNoFrameskip-v4"
MEAN_REWARD_BOUND = 19  # training stops once the mean of the last 100 games exceeds this

# Optimisation
GAMMA = 0.99  # discount factor used in the bootstrap target
BATCH_SIZE = 32  # transitions sampled per gradient step
LEARNING_RATE = 0.0001  # Adam learning rate

# Replay buffer and target network
REPLAY_SIZE = 10_000  # capacity of the experience buffer
REPLAY_START_SIZE = 10_000  # transitions collected before gradient steps begin
SYNC_TARGET_FRAMES = 1_000  # frames between copies of the online weights into the target net

# Linear epsilon-greedy schedule
EPSILON_START = 1.0
EPSILON_FINAL = 0.01
EPSILON_DECAY_LAST_FRAME = 150_000  # divisor of frame_idx in the linear decay

@dataclass
class Experience:
    """One environment transition as stored in the replay buffer.

    Instances are created positionally by ``Agent.play_step``, so the field
    order below is part of the interface.
    """
    # Observation the action was selected from.
    state: np.ndarray
    # Action passed to ``env.step``.
    action: int
    # Reward returned by ``env.step`` for this action.
    reward: float
    # ``done or trunc`` from ``env.step``: truthy when the episode ended on this step.
    done_trunc: bool
    # Observation returned by ``env.step``.
    new_state: np.ndarray

class ExperienceBuffer:
    """Bounded FIFO replay memory with uniform sampling.

    Args:
        capacity: Maximum number of stored items; once full, appending
            discards the oldest item.
    """

    def __init__(self, capacity: int):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Number of items currently stored."""
        stored = self.buffer
        return len(stored)

    def append(self, experience: Experience):
        """Adds one item at the newest end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size: int) -> list:
        """Returns ``batch_size`` distinct stored items picked uniformly at random.

        ``np.random.choice`` is called with ``replace=False``, so it raises
        ``ValueError`` when ``batch_size`` exceeds the number of stored items.
        """
        population = len(self.buffer)
        picked = np.random.choice(population, batch_size, replace=False)
        batch = []
        for position in picked:
            batch.append(self.buffer[position])
        return batch

class Agent:
    """Steps through the environment and records every transition.

    Args:
        env: Environment to interact with.
        exp_buffer: Buffer that receives one ``Experience`` per step.
    """

    def __init__(self, env: gym.Env, exp_buffer: ExperienceBuffer):
        self.env = env
        self.exp_buffer = exp_buffer
        self.state = None
        self.total_reward = 0.0
        self._reset()

    def _reset(self):
        """Starts a new episode and zeroes the running episode reward."""
        first_obs, _info = self.env.reset()
        self.state = first_obs
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net: DuelingDQN, device: torch.device, epsilon=0.0):
        """Performs one epsilon-greedy step and stores the transition.

        Args:
            net: Network used to score actions when not exploring.
            device: Device the observation tensor is created on.
            epsilon: Probability of taking a random action.

        Returns:
            The accumulated episode reward if this step ended the episode
            (the environment is then reset), otherwise ``None``.
        """
        explore = np.random.random() < epsilon
        if explore:
            action = self.env.action_space.sample()
        else:
            # Wrap the single observation in a batch of size one.
            obs_batch = torch.tensor(np.array([self.state]), device=device)
            action = torch.argmax(net(obs_batch)).item()

        next_state, reward, done, trunc, _ = self.env.step(action)
        episode_over = done or trunc
        self.total_reward += reward
        transition = Experience(self.state, action, reward, episode_over, next_state)
        self.exp_buffer.append(transition)
        self.state = next_state

        if not episode_over:
            return None

        # Capture the return before the reset wipes the running total.
        episode_return = self.total_reward
        self._reset()
        return episode_return

def batch_to_tensors(batch, device):
    """Converts a list of transitions into five tensors on ``device``.

    Args:
        batch: Iterable-once-safe sequence of objects exposing ``state``,
            ``action``, ``reward``, ``done_trunc`` and ``new_state``.
        device: Target device for every returned tensor.

    Returns:
        ``(states, actions, rewards, dones, next_states)`` where actions are
        ``torch.long``, rewards ``torch.float32``, dones ``torch.bool`` and
        both state tensors keep the dtype of the stacked NumPy arrays.
    """

    def column(attr):
        # One attribute of every transition, in batch order.
        return [getattr(item, attr) for item in batch]

    def stacked(attr):
        # Stack per-transition arrays in NumPy first, then convert once.
        return torch.tensor(np.array(column(attr)), device=device)

    states = stacked("state")
    actions = torch.tensor(column("action"), dtype=torch.long, device=device)
    rewards = torch.tensor(column("reward"), dtype=torch.float32, device=device)
    dones = torch.tensor(column("done_trunc"), dtype=torch.bool, device=device)
    next_states = stacked("new_state")
    return states, actions, rewards, dones, next_states

def calc_loss(batch, net, tgt_net, device):
    """Computes the one-step TD mean-squared-error loss for a batch.

    Args:
        batch: Transitions accepted by ``batch_to_tensors``.
        net: Online network; gradients flow through its predictions.
        tgt_net: Target network; evaluated without gradient tracking.
        device: Device the batch tensors are created on.

    Returns:
        Scalar MSE between ``Q(s, a)`` of ``net`` and
        ``reward + GAMMA * max_a' Q_target(s', a')``, with the bootstrap term
        zeroed for transitions flagged as episode end.
    """
    states, actions, rewards, dones, next_states = batch_to_tensors(batch, device)

    # Q-value of the action that was actually taken in each transition.
    action_index = actions.unsqueeze(-1)
    q_taken = net(states).gather(1, action_index).squeeze(-1)

    with torch.no_grad():
        best_next_q = tgt_net(next_states).max(dim=1).values
        # No bootstrapping past the end of an episode.
        best_next_q.masked_fill_(dones, 0.0)

    targets = rewards + GAMMA * best_next_q
    loss_fn = nn.MSELoss()
    return loss_fn(q_taken, targets)

In [None]:
# === TRAIN ===
# Trains the online network with epsilon-greedy play and a periodically synced
# target network until the mean reward of the last 100 games exceeds
# MEAN_REWARD_BOUND. Scalars are logged to TensorBoard and the weights are
# saved to my_runs/ whenever the mean reward improves.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
env = make_env(DEFAULT_ENV_NAME)
net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
tgt_net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
# The two networks are initialised independently. Copy the online weights now
# so the target never bootstraps from unrelated random parameters, regardless
# of how REPLAY_START_SIZE and SYNC_TARGET_FRAMES happen to line up.
tgt_net.load_state_dict(net.state_dict())
buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
writer = SummaryWriter(log_dir="my_runs")

epsilon = EPSILON_START
frame_idx = 0
best_mean_reward = None
rewards = []
ts = time.time()
ts_frame = 0

try:
    while True:
        frame_idx += 1
        # Linear decay from EPSILON_START, clamped at EPSILON_FINAL.
        epsilon = max(EPSILON_FINAL, EPSILON_START - frame_idx / EPSILON_DECAY_LAST_FRAME)
        reward = agent.play_step(net, device, epsilon)
        if reward is not None:
            # An episode just finished: update statistics and logs.
            rewards.append(reward)
            speed = (frame_idx - ts_frame) / (time.time() - ts)
            ts_frame = frame_idx
            ts = time.time()
            m_reward = np.mean(rewards[-100:])
            print(f"{frame_idx}: games {len(rewards)}, reward {m_reward:.3f}, eps {epsilon:.2f}, speed {speed:.2f} f/s")
            writer.add_scalar("epsilon", epsilon, frame_idx)
            writer.add_scalar("speed", speed, frame_idx)
            writer.add_scalar("reward_100", m_reward, frame_idx)
            writer.add_scalar("reward", reward, frame_idx)

            if best_mean_reward is None or best_mean_reward < m_reward:
                torch.save(net.state_dict(), f"my_runs/best_{m_reward:.0f}.dat")
                best_mean_reward = m_reward
            if m_reward > MEAN_REWARD_BOUND:
                print("Solved!")
                break

        # Only collect experience until the replay buffer has warmed up.
        if len(buffer) < REPLAY_START_SIZE:
            continue

        if frame_idx % SYNC_TARGET_FRAMES == 0:
            tgt_net.load_state_dict(net.state_dict())

        batch = buffer.sample(BATCH_SIZE)
        optimizer.zero_grad()
        loss = calc_loss(batch, net, tgt_net, device)
        loss.backward()
        optimizer.step()
finally:
    # Flush and close the event file even when training is interrupted or
    # fails, so everything logged so far is on disk for later cells.
    writer.close()

852: games 1, reward -21.000, eps 0.99, speed 587.81 f/s
1730: games 2, reward -21.000, eps 0.99, speed 824.56 f/s
2739: games 3, reward -20.667, eps 0.98, speed 818.86 f/s
3563: games 4, reward -20.750, eps 0.98, speed 819.11 f/s
4431: games 5, reward -20.600, eps 0.97, speed 814.95 f/s
5696: games 6, reward -20.333, eps 0.96, speed 782.37 f/s
6486: games 7, reward -20.429, eps 0.96, speed 787.73 f/s
7489: games 8, reward -20.375, eps 0.95, speed 779.50 f/s
8435: games 9, reward -20.222, eps 0.94, speed 772.21 f/s
9272: games 10, reward -20.300, eps 0.94, speed 790.76 f/s
10212: games 11, reward -20.364, eps 0.93, speed 319.74 f/s
11052: games 12, reward -20.333, eps 0.93, speed 148.70 f/s
11992: games 13, reward -20.385, eps 0.92, speed 148.28 f/s
12783: games 14, reward -20.429, eps 0.91, speed 149.53 f/s
13792: games 15, reward -20.400, eps 0.91, speed 146.60 f/s
14610: games 16, reward -20.438, eps 0.90, speed 147.08 f/s
15538: games 17, reward -20.412, eps 0.90, speed 147.35 f/s


In [None]:
# === RECORD VIDEO ===
# Plays one greedy episode with the best saved checkpoint, records it to
# VIDEO_PATH and embeds the resulting video in the notebook.
from gymnasium.wrappers import RecordVideo
import glob
from IPython.display import Video, display

VIDEO_PATH = "my_runs/video"

# Checkpoints are named best_<rounded mean reward>.dat, so a lexicographic sort
# ranks e.g. "best_9.dat" above "best_19.dat". The training loop only writes a
# checkpoint when the mean reward improves, so the most recently written file
# is the best one.
checkpoint_files = glob.glob("my_runs/*.dat")
if not checkpoint_files:
    raise FileNotFoundError("No checkpoint (*.dat) found in my_runs; run the training cell first.")
MODEL_PATH = max(checkpoint_files, key=os.path.getmtime)

env = make_env(DEFAULT_ENV_NAME, render_mode="rgb_array")
env = RecordVideo(env, video_folder=VIDEO_PATH, episode_trigger=lambda x: True)

net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
net.load_state_dict(torch.load(MODEL_PATH, map_location=device))
net.eval()

state, _ = env.reset()
total_reward = 0.0

try:
    # Pure inference: no autograd graph is needed for action selection.
    with torch.no_grad():
        while True:
            state_v = torch.tensor(np.expand_dims(state, 0)).to(device)
            q_vals = net(state_v)
            action = torch.argmax(q_vals).item()
            state, reward, done, trunc, _ = env.step(action)
            total_reward += reward
            if done or trunc:
                break
finally:
    # Closing the RecordVideo wrapper finalises the video file, also on errors.
    env.close()
print(f"Total reward: {total_reward}")

# Show the newest recording; the folder may still contain videos from earlier runs.
video_files = glob.glob(f"{VIDEO_PATH}/*.mp4")
if not video_files:
    raise FileNotFoundError(f"No video (*.mp4) was written to {VIDEO_PATH}.")
video_file = max(video_files, key=os.path.getmtime)
display(Video(video_file, embed=True))

  logger.warn(
  logger.warn(
  """


Moviepy - Building video /content/my_runs/video/rl-video-episode-0.mp4.
Moviepy - Writing video /content/my_runs/video/rl-video-episode-0.mp4



                                                                  

Moviepy - Done !
Moviepy - video ready /content/my_runs/video/rl-video-episode-0.mp4
Total reward: 19.0




In [None]:
# === Plots from TensorBoard ===
# Reads the scalars logged during training from the newest TensorBoard event
# file and saves one PNG per tag into the working directory.
from tensorboard.backend.event_processing import event_accumulator

# `writer` exists only if the training cell ran in this kernel session;
# otherwise fall back to the default log folder instead of raising NameError.
try:
    log_dir = writer.log_dir
except (NameError, AttributeError):
    log_dir = "my_runs"

event_files = [
    os.path.join(root, file_name)
    for root, _dirs, file_names in os.walk(log_dir)
    for file_name in file_names
    if file_name.startswith("events.out.tfevents")
]

if event_files:
    # Every SummaryWriter creates its own event file; plot the latest run
    # rather than whichever file the directory walk happens to list first.
    latest_event_file = max(event_files, key=os.path.getmtime)
    ea = event_accumulator.EventAccumulator(latest_event_file)
    ea.Reload()

    for tag, title, fname in [
        ("reward", "Nagroda", "reward_plot.png"),
        ("reward_100", "Średnia nagroda (100 gier)", "reward_100_plot.png"),
        ("speed", "Prędkość (kl/s)", "speed_plot.png"),
        ("epsilon", "Epsilon", "epsilon_plot.png")
    ]:
        # Only the tag lookup is expected to fail; keep plotting errors visible.
        try:
            scalars = ea.Scalars(tag)
        except KeyError:
            print(f"⚠️ Brak danych dla tagu: {tag}")
            continue

        steps = [s.step for s in scalars]
        values = [s.value for s in scalars]
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.plot(steps, values)
        ax.set_xlabel("Iteracja")
        ax.set_ylabel(title)
        ax.set_title(f"Trening – {title}")
        ax.grid()
        fig.tight_layout()
        fig.savefig(fname)
        # Close explicitly so figures do not accumulate in memory.
        plt.close(fig)
else:
    print("⚠️ Nie znaleziono plików TensorBoard.")