<a href="https://colab.research.google.com/github/Helena26-ai/Rainforcement_project/blob/Helena/Noisy_DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Install PyTorch with CUDA 12.4 support
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124


In [11]:
# Install Gymnasium pinned to 0.29.1 with the `atari` and `accept-rom-license` extras
!pip install gymnasium[atari,accept-rom-license]==0.29.1



In [12]:
# Install RL libraries and tooling
!pip install torchrl
!pip install stable-baselines3 tensorboard matplotlib -q



In [13]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import collections
import gymnasium as gym
from gymnasium import spaces
from torchrl.modules import NoisyLinear
from dataclasses import dataclass
import imageio.v2 as imageio
from stable_baselines3.common.atari_wrappers import AtariWrapper

In [14]:
# === Wrappers ===
class ImageToPyTorch(gym.ObservationWrapper):
    """Observation wrapper that moves axis 2 of each observation to the front.

    The declared observation space is rebuilt with the permuted shape
    (presumably converting an HWC image layout to the CHW layout expected by
    PyTorch convolutions -- confirm against the wrapped environment).
    """

    def __init__(self, env):
        super().__init__(env)
        src_space = self.observation_space
        src_shape = src_space.shape
        # Scalar bounds: the overall min/max of the wrapped space's bounds.
        self.observation_space = spaces.Box(
            low=src_space.low.min(),
            high=src_space.high.max(),
            shape=(src_shape[2], src_shape[0], src_shape[1]),
            dtype=src_space.dtype,
        )

    def observation(self, observation):
        """Return `observation` with its third axis moved to position 0."""
        return np.moveaxis(observation, source=2, destination=0)

class BufferWrapper(gym.ObservationWrapper):
    """Observation wrapper that concatenates the last `n_steps` observations along axis 0."""

    def __init__(self, env, n_steps):
        super().__init__(env)
        # Bounded deque: appending beyond `n_steps` drops the oldest entry.
        self.buffer = collections.deque(maxlen=n_steps)
        base_space = env.observation_space
        stacked_low = np.repeat(base_space.low, n_steps, axis=0)
        stacked_high = np.repeat(base_space.high, n_steps, axis=0)
        self.observation_space = spaces.Box(
            low=stacked_low, high=stacked_high, dtype=base_space.dtype)

    def reset(self, **kwargs):
        """Reset the wrapped env and return the first stacked observation.

        The history is pre-filled with `maxlen - 1` copies of the wrapped
        space's lower bound, so the first real observation completes the stack.
        """
        padding_frame = self.env.observation_space.low
        self.buffer.extend(padding_frame for _ in range(self.buffer.maxlen - 1))
        first_obs, info = self.env.reset(**kwargs)
        return self.observation(first_obs), info

    def observation(self, observation):
        """Push `observation` into the history and return the concatenated stack."""
        self.buffer.append(observation)
        return np.concatenate(self.buffer)

def make_env(env_name: str, **kwargs):
    """Build the wrapped environment used for both training and evaluation.

    Args:
        env_name: environment id forwarded to `gym.make`.
        **kwargs: extra keyword arguments forwarded to `gym.make`
            (e.g. `render_mode`).

    Returns:
        The environment wrapped, in order, by `AtariWrapper` (reward clipping
        off, `noop_max=0`), `ImageToPyTorch` and a 4-step `BufferWrapper`.
    """
    base_env = gym.make(env_name, **kwargs)
    atari_env = AtariWrapper(base_env, clip_reward=False, noop_max=0)
    channel_first_env = ImageToPyTorch(atari_env)
    return BufferWrapper(channel_first_env, n_steps=4)

In [15]:
# === Parameters ===
ENV_NAME = "PongNoFrameskip-v4"  # environment id passed to make_env
MEAN_REWARD_BOUND = 19  # training stops once the 100-game mean reward exceeds this
GAMMA = 0.99  # discount factor applied to the next-state value in calc_loss
BATCH_SIZE = 32  # number of transitions sampled per optimisation step
REPLAY_SIZE = 100_000  # capacity of the experience buffer
LEARNING_RATE = 1e-4  # Adam learning rate (alternative value noted by the author: 7.14e-5)
SYNC_TARGET_FRAMES = 1_000  # target network is synced when frame_idx is a multiple of this
REPLAY_START_SIZE = 10_000  # optimisation starts once the buffer holds this many transitions
TEST_VIDEO_PATH = "noisy_dqn_test_run.mp4"  # output file of the evaluation recording
MODEL_SAVE_PATH = "noisy_dqn_best_model.dat"  # checkpoint of the best-scoring weights

In [16]:
# === Noisy DQN network ===
class NoisyDQN(nn.Module):
    """Convolutional Q-network whose two fully connected layers are `NoisyLinear`.

    Args:
        input_shape: shape of one observation; `input_shape[0]` is used as the
            number of input channels of the first convolution.
        n_actions: number of outputs of the final layer (one Q-value per action).
    """

    def __init__(self, input_shape, n_actions):
        super().__init__()
        in_channels = input_shape[0]
        feature_layers = [
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.conv = nn.Sequential(*feature_layers)
        # Probe the conv stack with a dummy input to learn the flattened feature size.
        probe = torch.zeros(1, *input_shape)
        conv_out_size = self.conv(probe).shape[1]
        self.fc1 = NoisyLinear(conv_out_size, 512)
        self.fc2 = NoisyLinear(512, n_actions)

    def forward(self, x):
        """Return the output of the final layer for a batch of observations.

        The input is cast to float and divided by 255 before the convolutions.
        """
        scaled = x.float() / 255.0
        features = self.conv(scaled)
        hidden = torch.relu(self.fc1(features))
        return self.fc2(hidden)

    def reset_noise(self):
        """Call `reset_noise()` on both noisy layers."""
        for noisy_layer in (self.fc1, self.fc2):
            noisy_layer.reset_noise()

In [17]:
# === Experience buffer ===
@dataclass
class Experience:
    """One environment transition as recorded by `Agent.play_step`."""
    state: np.ndarray      # observation the action was chosen from
    action: int            # index of the action that was taken
    reward: float          # reward returned by `env.step` for that action
    done: bool             # True when the step terminated or truncated the episode
    new_state: np.ndarray  # observation returned by `env.step`

class ExperienceBuffer:
    """Bounded FIFO store of experiences with uniform sampling without replacement."""

    def __init__(self, capacity):
        # Once `capacity` items are stored, each append evicts the oldest one.
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, exp):
        """Add one experience to the buffer."""
        self.buffer.append(exp)

    def sample(self, batch_size):
        """Return a list of `batch_size` distinct stored experiences chosen at random.

        Raises:
            ValueError: from `np.random.choice` when `batch_size` exceeds the
                number of stored experiences.
        """
        stored = len(self.buffer)
        picked = np.random.choice(stored, batch_size, replace=False)
        return [self.buffer[position] for position in picked]

# === Agent ===
class Agent:
    """Steps an environment greedily with a Q-network and records transitions.

    Args:
        env: environment exposing `reset()` and `step(action)`; it is reset
            once on construction.
        buffer: object with an `append` method that receives one `Experience`
            per step.
    """

    def __init__(self, env, buffer):
        self.env = env
        self.buffer = buffer
        self.state, _ = env.reset()
        self.total_reward = 0.0

    def play_step(self, net, device):
        """Take one greedy step in the environment and store the transition.

        Args:
            net: callable mapping a batch of observations to per-action values.
            device: device on which the observation tensor is created.

        Returns:
            The accumulated episode reward when this step ended the episode
            (the environment is then reset), otherwise `None`.
        """
        state_v = torch.tensor(np.array([self.state]), device=device)
        # Action selection is pure inference; disabling autograd avoids
        # building a graph on every environment step.
        with torch.no_grad():
            q_vals = net(state_v)
        action = torch.argmax(q_vals, dim=1).item()

        new_state, reward, terminated, truncated, _ = self.env.step(action)
        done = terminated or truncated
        exp = Experience(self.state, action, reward, done, new_state)
        self.buffer.append(exp)
        self.total_reward += reward
        self.state = new_state

        if done:
            episode_reward = self.total_reward
            self.state, _ = self.env.reset()
            self.total_reward = 0.0
            return episode_reward
        return None

# === Batch processing ===
def batch_to_tensors(batch, device):
    """Convert a list of experiences into per-field tensors on `device`.

    Args:
        batch: iterable-with-length of objects exposing `state`, `action`,
            `reward`, `done` and `new_state` attributes.
        device: target device for every returned tensor.

    Returns:
        Tuple `(states, actions, rewards, dones, next_states)`. Actions are
        `torch.long`, rewards `torch.float32`, dones `torch.bool`; the two
        state tensors keep the dtype of the stacked arrays.
    """
    def _stacked(field):
        # Stack through NumPy first so the tensor is built from one array.
        return torch.tensor(np.array([getattr(exp, field) for exp in batch]), device=device)

    def _column(field, dtype):
        return torch.tensor([getattr(exp, field) for exp in batch], dtype=dtype, device=device)

    states = _stacked("state")
    actions = _column("action", torch.long)
    rewards = _column("reward", torch.float32)
    dones = _column("done", torch.bool)
    next_states = _stacked("new_state")
    return states, actions, rewards, dones, next_states

def calc_loss(batch, net, tgt_net, device):
    """Compute the mean-squared TD error of `net` on one batch.

    Args:
        batch: experiences accepted by `batch_to_tensors`.
        net: network being trained; its value for the taken action is the prediction.
        tgt_net: network used (without gradients) for the next-state values.
        device: device on which the batch tensors are created.

    Returns:
        Scalar loss tensor: MSE between the predicted action values and
        `reward + GAMMA * max_a tgt_net(next_state)`, where the next-state
        term is zeroed for transitions flagged as done.
    """
    states, actions, rewards, dones, next_states = batch_to_tensors(batch, device)
    q_all = net(states)
    q_taken = q_all.gather(1, actions.unsqueeze(-1)).squeeze(-1)
    with torch.no_grad():
        next_q_max = tgt_net(next_states).max(dim=1).values
        # No bootstrapping past the end of an episode.
        next_q_max = next_q_max.masked_fill(dones, 0.0)
    td_target = rewards + GAMMA * next_q_max
    return nn.functional.mse_loss(q_taken, td_target)

In [18]:
# === Main training loop ===
# Plays the environment with the online network, fills the replay buffer and,
# once it holds REPLAY_START_SIZE transitions, performs one optimisation step
# per environment step. Stops when the mean reward of the last 100 games
# exceeds MEAN_REWARD_BOUND.
if __name__ == "__main__":
    import time
    from torch.utils.tensorboard import SummaryWriter
    import matplotlib.pyplot as plt

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = make_env(ENV_NAME)

    # Online and target networks start from identical weights.
    net = NoisyDQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net = NoisyDQN(env.observation_space.shape, env.action_space.n).to(device)
    tgt_net.load_state_dict(net.state_dict())

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    writer = SummaryWriter()
    best_mean_reward = None
    rewards = []
    speeds = []
    snr_1_list = []
    snr_2_list = []
    frame_idx = 0
    ts = time.time()
    ts_frame = 0

    # try/finally guarantees the TensorBoard writer is flushed and closed even
    # when training is interrupted (e.g. the cell is stopped manually).
    try:
        while True:
            frame_idx += 1
            reward = agent.play_step(net, device)
            net.reset_noise()

            # `reward` is not None only on the step that finished an episode.
            if reward is not None:
                rewards.append(reward)
                # Guard against a zero time delta on very coarse clocks.
                elapsed = max(time.time() - ts, 1e-9)
                speed = (frame_idx - ts_frame) / elapsed
                speeds.append(speed)
                ts_frame = frame_idx
                ts = time.time()
                m_reward = np.mean(rewards[-100:])
                writer.add_scalar("reward", reward, frame_idx)
                writer.add_scalar("reward_100", m_reward, frame_idx)
                writer.add_scalar("speed", speed, frame_idx)

                print(f"{frame_idx}: games {len(rewards)}, reward {m_reward:.3f}, speed {speed:.2f} f/s")
                # Checkpoint whenever the running mean improves.
                if best_mean_reward is None or best_mean_reward < m_reward:
                    torch.save(net.state_dict(), MODEL_SAVE_PATH)
                    best_mean_reward = m_reward
                if m_reward > MEAN_REWARD_BOUND:
                    print("Solved!")
                    break

            # Keep collecting experience until the buffer is warm.
            if len(buffer) < REPLAY_START_SIZE:
                continue

            if frame_idx % SYNC_TARGET_FRAMES == 0:
                tgt_net.load_state_dict(net.state_dict())

            batch = buffer.sample(BATCH_SIZE)
            loss = calc_loss(batch, net, tgt_net, device)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            writer.add_scalar("loss", loss.item(), frame_idx)

            # SNR monitoring: RMS of weight_mu over RMS of weight_sigma per noisy
            # layer. Monitoring only, so no autograd graph is needed.
            with torch.no_grad():
                snr_1 = (net.fc1.weight_mu.pow(2).mean().sqrt() / net.fc1.weight_sigma.pow(2).mean().sqrt()).item()
                snr_2 = (net.fc2.weight_mu.pow(2).mean().sqrt() / net.fc2.weight_sigma.pow(2).mean().sqrt()).item()
            writer.add_scalar("SNR/layer1", snr_1, frame_idx)
            writer.add_scalar("SNR/layer2", snr_2, frame_idx)
            snr_1_list.append(snr_1)
            snr_2_list.append(snr_2)
    finally:
        writer.close()

762: games 1, reward -21.000, speed 411.41 f/s
1524: games 2, reward -21.000, speed 417.69 f/s
2364: games 3, reward -20.667, speed 441.49 f/s
3246: games 4, reward -20.750, speed 439.78 f/s
4008: games 5, reward -20.800, speed 442.72 f/s
4830: games 6, reward -20.833, speed 429.59 f/s
5592: games 7, reward -20.857, speed 433.75 f/s
6354: games 8, reward -20.875, speed 434.75 f/s
7116: games 9, reward -20.889, speed 438.45 f/s
7878: games 10, reward -20.900, speed 445.38 f/s
8718: games 11, reward -20.818, speed 440.12 f/s
9480: games 12, reward -20.833, speed 443.70 f/s
10242: games 13, reward -20.846, speed 230.51 f/s
11306: games 14, reward -20.857, speed 119.31 f/s
12202: games 15, reward -20.800, speed 118.74 f/s
13114: games 16, reward -20.812, speed 118.97 f/s
14250: games 17, reward -20.765, speed 118.40 f/s
15109: games 18, reward -20.722, speed 117.77 f/s
15950: games 19, reward -20.684, speed 117.91 f/s
16786: games 20, reward -20.650, speed 118.80 f/s
17835: games 21, rewar

In [23]:
    # === Testing and recording ===
    # Plays one episode with the best saved weights and writes the rendered
    # frames to TEST_VIDEO_PATH.
    test_env = make_env(ENV_NAME, render_mode="rgb_array")
    # map_location lets a checkpoint saved on one device load on another.
    net.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
    net.eval()

    video_frames = []
    obs, _ = test_env.reset()
    done = False

    try:
        # Evaluation is inference only, so autograd is switched off.
        with torch.no_grad():
            while not done:
                # Go through np.array first: building a tensor from a list of
                # ndarrays is slow; this also matches Agent.play_step.
                obs_v = torch.tensor(np.array([obs]), device=device)
                q_vals = net(obs_v)
                action = torch.argmax(q_vals, dim=1).item()
                obs, _, terminated, truncated, _ = test_env.step(action)
                done = terminated or truncated
                frame = test_env.render()
                video_frames.append(frame)
    finally:
        test_env.close()

    # Named `video_writer` so the global `writer` (the TensorBoard
    # SummaryWriter created by the training cell) is not overwritten.
    with imageio.get_writer(TEST_VIDEO_PATH, fps=30, codec='libx264') as video_writer:
        for frame in video_frames:
            video_writer.append_data(frame)

    print(f"✅ Zapisano nagranie testu agenta do pliku: {TEST_VIDEO_PATH}")



✅ Zapisano nagranie testu agenta do pliku: noisy_dqn_test_run.mp4


In [24]:
# === Plots ===
# Reads the scalars logged to TensorBoard and saves one PNG per metric.
from tensorboard.backend.event_processing import event_accumulator
# Imported here as well so this cell does not rely on another cell's import.
import matplotlib.pyplot as plt

# Use the log directory of the global `writer` when it has one; fall back to
# "runs" when `writer` is undefined or is an object without `log_dir`.
log_dir = getattr(globals().get("writer"), "log_dir", "runs")
event_files = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(log_dir)
    for file in files
    if file.startswith("events.out.tfevents")
]

if len(event_files) > 0:
    # os.walk order is arbitrary and the directory may hold older runs:
    # read the most recently modified event file.
    latest_event_file = max(event_files, key=os.path.getmtime)
    ea = event_accumulator.EventAccumulator(latest_event_file)
    ea.Reload()

    for tag, title, fname in [
        ("reward_100", "Średnia nagroda (100 gier)", "reward_mean_plot.png"),
        ("reward", "Nagroda", "reward_plot.png"),
        ("speed", "Prędkość (klatki/s)", "speed_plot.png"),
    ]:
        try:
            scalars = ea.Scalars(tag)
            steps = [s.step for s in scalars]
            values = [s.value for s in scalars]
            plt.figure(figsize=(12, 5))
            plt.plot(steps, values)
            plt.xlabel("Iteracja")
            plt.ylabel(title)
            plt.title(f"Trening – {title}")
            plt.grid()
            plt.tight_layout()
            plt.savefig(fname)
            plt.close()
        except KeyError:
            # The tag was never logged (e.g. training stopped before any episode ended).
            print(f"⚠️ Nie znaleziono danych dla tagu: {tag}")

    # SNR of noisy layers 1 and 2 on a single plot
    try:
        snr1 = ea.Scalars("SNR/layer1")
        snr2 = ea.Scalars("SNR/layer2")
        steps1 = [s.step for s in snr1]
        values1 = [s.value for s in snr1]
        steps2 = [s.step for s in snr2]
        values2 = [s.value for s in snr2]

        plt.figure(figsize=(12, 5))
        plt.plot(steps1, values1, label="SNR warstwa 1")
        plt.plot(steps2, values2, label="SNR warstwa 2")
        plt.xlabel("Iteracja")
        plt.ylabel("SNR")
        plt.title("Trening – SNR warstw Noisy DQN")
        plt.legend()
        plt.grid()
        plt.tight_layout()
        plt.savefig("snr_plot.png")
        plt.close()
    except KeyError:
        print("⚠️ Nie znaleziono danych SNR dla warstw noisy.")
else:
    print("⚠️ Brak plików logów TensorBoard do wygenerowania wykresów.")