<a href="https://colab.research.google.com/github/ManupatiEshwar/reniforecement/blob/main/Assignment6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

!pip install gymnasium==0.29.1 torch torchvision



import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
from collections import deque, namedtuple


# Lightweight record for a single environment transition; the five fields
# mirror the arguments accepted by ReplayBuffer.push.
Transition = namedtuple(
    "Transition",
    ("state", "action", "reward", "next_state", "done"),
)

class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions backed by preallocated NumPy arrays.

    Entries are written at a cursor that wraps around modulo ``capacity``, so
    once the buffer is full each new push overwrites the oldest stored entry.
    Sampling draws indices uniformly with replacement from the filled region.
    """

    def __init__(self, capacity, obs_shape, device):
        """Allocate storage for ``capacity`` transitions.

        Args:
            capacity: Maximum number of transitions kept; must be positive.
            obs_shape: Shape of a single observation (sequence of ints).
            device: Torch device (or device string) that sampled tensors are
                moved to.

        Raises:
            ValueError: If ``capacity`` is not positive.
        """
        # A zero/negative capacity would otherwise only fail later, on push.
        if capacity <= 0:
            raise ValueError(f"capacity must be positive, got {capacity}")
        self.capacity = capacity
        self.device = device
        self.pos = 0        # index the next push writes to
        self.full = False   # becomes True once the cursor has wrapped
        self.obs_buf = np.zeros((capacity, *obs_shape), dtype=np.float32)
        self.next_obs_buf = np.zeros((capacity, *obs_shape), dtype=np.float32)
        self.acts = np.zeros((capacity,), dtype=np.int64)
        self.rews = np.zeros((capacity,), dtype=np.float32)
        self.dones = np.zeros((capacity,), dtype=np.bool_)

    def push(self, state, action, reward, next_state, done):
        """Store one transition at the cursor and advance it (wrapping)."""
        self.obs_buf[self.pos] = state
        self.acts[self.pos] = action
        self.rews[self.pos] = reward
        self.next_obs_buf[self.pos] = next_state
        self.dones[self.pos] = done
        self.pos = (self.pos + 1) % self.capacity
        # The cursor returning to 0 means every slot has been written once.
        self.full = self.full or self.pos == 0

    def __len__(self):
        """Return the number of transitions currently stored."""
        return self.capacity if self.full else self.pos

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions uniformly, with replacement.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of torch
            tensors on ``self.device`` with dtypes float32, int64, float32,
            float32 and bool respectively.

        Raises:
            ValueError: If the buffer holds no transitions.
        """
        stored = len(self)
        if stored == 0:
            # np.random.randint(0, 0) would raise an opaque "low >= high".
            raise ValueError("cannot sample from an empty ReplayBuffer")
        idxs = np.random.randint(0, stored, size=batch_size)
        s = torch.from_numpy(self.obs_buf[idxs]).to(self.device)
        ns = torch.from_numpy(self.next_obs_buf[idxs]).to(self.device)
        a = torch.from_numpy(self.acts[idxs]).to(self.device)
        r = torch.from_numpy(self.rews[idxs]).to(self.device)
        d = torch.from_numpy(self.dones[idxs]).to(self.device)
        return s, a, r, ns, d


class MLPQ(nn.Module):
    """Q-network: an MLP with two 128-unit ReLU hidden layers.

    Maps an input whose last dimension is ``obs_dim`` to ``n_actions`` outputs.
    """

    def __init__(self, obs_dim, n_actions):
        super().__init__()
        hidden = 128
        # Layers are created input-to-output and registered under ``net`` so
        # parameter names remain ``net.0``, ``net.2`` and ``net.4``.
        layers = [
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the network output for input ``x``."""
        q_values = self.net(x)
        return q_values


def train_dqn(env_name="CartPole-v1", total_steps=20_000, batch_size=64, gamma=0.99, lr=1e-3,
              buffer_size=50_000, start_steps=1000, update_every=50, target_update=1000,
              eps_min=0.01, eps_decay_steps=20_000):
    """Train a DQN agent with a target network and uniform experience replay.

    Args:
        env_name: Gymnasium environment id; the code assumes a flat
            observation vector and a discrete action space.
        total_steps: Number of environment steps to run.
        batch_size: Number of transitions per gradient update.
        gamma: Discount factor used in the TD target.
        lr: Adam learning rate.
        buffer_size: Replay buffer capacity.
        start_steps: No gradient updates happen until this many steps have run.
        update_every: One gradient update is performed every this many steps.
        target_update: The target network is synced every this many steps.
        eps_min: Lower bound of the exploration rate.
        eps_decay_steps: Number of steps over which epsilon decays linearly
            from 1 towards ``eps_min``.

    Returns:
        List with the total reward of every episode completed during training.

    Raises:
        ValueError: If ``eps_decay_steps`` is not positive.
    """
    if eps_decay_steps <= 0:
        raise ValueError(f"eps_decay_steps must be positive, got {eps_decay_steps}")

    env = gym.make(env_name)
    # try/finally so the environment is released even if training raises.
    try:
        obs_shape = env.observation_space.shape
        n_actions = env.action_space.n
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        policy_net = MLPQ(obs_dim=obs_shape[0], n_actions=n_actions).to(device)
        target_net = MLPQ(obs_dim=obs_shape[0], n_actions=n_actions).to(device)
        target_net.load_state_dict(policy_net.state_dict())

        optimizer = optim.Adam(policy_net.parameters(), lr=lr)
        loss_fn = nn.MSELoss()  # built once instead of on every update
        buffer = ReplayBuffer(buffer_size, obs_shape, device)

        obs, _ = env.reset()
        episode_reward = 0
        rewards = []

        for step in range(1, total_steps + 1):
            # epsilon-greedy with linear decay
            epsilon = max(eps_min, 1 - step / eps_decay_steps)
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_vals = policy_net(torch.tensor(obs, dtype=torch.float32, device=device))
                    action = q_vals.argmax().item()

            next_obs, reward, terminated, truncated, _ = env.step(action)
            # Store only true termination as `done`: a time-limit truncation is
            # not a terminal state, so the TD target must still bootstrap from
            # next_obs for those transitions.
            buffer.push(obs, action, reward, next_obs, terminated)
            obs = next_obs
            episode_reward += reward

            # The episode ends for bookkeeping purposes in either case.
            if terminated or truncated:
                rewards.append(episode_reward)
                obs, _ = env.reset()
                episode_reward = 0

            # train step
            if step > start_steps and step % update_every == 0 and len(buffer) > batch_size:
                s, a, r, ns, d = buffer.sample(batch_size)
                # squeeze(1) keeps the batch dimension even when batch_size == 1,
                # so q_values always matches the shape of target.
                q_values = policy_net(s).gather(1, a.unsqueeze(1)).squeeze(1)
                with torch.no_grad():
                    max_next_q = target_net(ns).max(1)[0]
                    target = r + gamma * max_next_q * (1 - d.float())
                loss = loss_fn(q_values, target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # update target net
            if step % target_update == 0:
                target_net.load_state_dict(policy_net.state_dict())

            if step % 5000 == 0:
                avg_r = np.mean(rewards[-10:]) if rewards else 0
                print(f"Step {step}, Avg Reward (last 10 episodes): {avg_r}")
    finally:
        env.close()

    return rewards


# Run the CartPole experiment for 20,000 environment steps, then report the
# mean total reward over the last ten completed episodes.
rewards = train_dqn("CartPole-v1", total_steps=20_000)

final_avg_reward = np.mean(rewards[-10:])
print("Training finished. Final average reward:", final_avg_reward)


Step 5000, Avg Reward (last 10 episodes): 19.0
Step 10000, Avg Reward (last 10 episodes): 19.0
Step 15000, Avg Reward (last 10 episodes): 50.7
Step 20000, Avg Reward (last 10 episodes): 64.1
Training finished. Final average reward: 64.1
