<a href="https://colab.research.google.com/github/Ilaharshith/Reinforcement-Learning-/blob/main/Untitled16.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
import math
import numpy as np
import random
import argparse  # for command-line arguments

# Try Gymnasium first; fallback to Gym
try:
    import gymnasium as gym
    GYMN = "gymnasium"
except Exception:
    import gym
    GYMN = "gym"

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque

# ---------------------------
# Q-Network
# ---------------------------
class QNetwork(nn.Module):
    """Fully connected network mapping a state vector to one Q-value per action.

    Two hidden layers of 128 units, each followed by a ReLU, feed a linear
    output layer of width ``action_dim``.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_dim)

    def forward(self, x):
        """Return the raw (un-activated) Q-value estimates for input ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))
        return self.fc3(hidden)

# ---------------------------
# Replay Buffer
# ---------------------------
class ReplayBuffer:
    """Bounded FIFO store of transitions used for experience replay.

    Backed by a ``deque`` created with ``maxlen=capacity``, so once it is
    full every new transition evicts the oldest one.
    """

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions at random, without replacement.

        Returns a 5-tuple of NumPy arrays (states, actions, rewards,
        next states, done flags), one entry per sampled transition.
        """
        transitions = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one array per field.
        states, actions, rewards, next_states, dones = (
            np.array(field) for field in zip(*transitions)
        )
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

# ---------------------------
# Training Loop
# ---------------------------
def train(env_id, total_steps, start_learning, buffer_size, batch_size,
          gamma, lr, target_update, eps_start, eps_end, eps_decay_steps):
    """Train a DQN agent with epsilon-greedy exploration and a target network.

    The environment is expected to follow the Gymnasium-style API used below:
    ``reset()`` returns ``(observation, info)`` and ``step()`` returns
    ``(observation, reward, terminated, truncated, info)``. The observation
    space must expose ``shape[0]`` and the action space must expose ``n``.

    Args:
        env_id: Environment identifier passed to ``gym.make``.
        total_steps: Number of environment steps to run.
        start_learning: Gradient updates begin only after this many steps.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions sampled per gradient update.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate of the Adam optimizer.
        target_update: The target network is synced with the online network
            every ``target_update`` steps.
        eps_start: Initial exploration probability.
        eps_end: Floor that the exploration probability decays towards.
        eps_decay_steps: Number of steps over which epsilon is decayed
            linearly from ``eps_start`` to ``eps_end``. Values below 1 are
            treated as 1 to avoid a division by zero.

    Returns:
        None. The accumulated reward of every finished episode is printed.
    """
    env = gym.make(env_id)
    # try/finally guarantees the environment is closed even if training fails.
    try:
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        q_net = QNetwork(state_dim, action_dim).to(device)
        target_net = QNetwork(state_dim, action_dim).to(device)
        target_net.load_state_dict(q_net.state_dict())
        target_net.eval()

        optimizer = optim.Adam(q_net.parameters(), lr=lr)
        replay_buffer = ReplayBuffer(buffer_size)

        epsilon = eps_start
        epsilon_decay = (eps_start - eps_end) / max(1, eps_decay_steps)

        state, _ = env.reset()
        total_reward = 0

        for step in range(1, total_steps + 1):
            # Epsilon-greedy action selection
            if random.random() < epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    state_tensor = torch.as_tensor(
                        state, dtype=torch.float32, device=device
                    ).unsqueeze(0)
                    action = q_net(state_tensor).argmax().item()

            next_state, reward, terminated, truncated, _ = env.step(action)

            # Store only true termination as the "done" flag. A truncated
            # episode (e.g. a time limit) did not reach a terminal state, so
            # the TD target must still bootstrap from next_state.
            replay_buffer.push(state, action, reward, next_state, terminated)

            state = next_state
            total_reward += reward

            # The episode is over for the agent in either case: report, reset.
            if terminated or truncated:
                print(f"Step {step}, Episode Reward: {total_reward}")
                state, _ = env.reset()
                total_reward = 0

            # Training step
            if step > start_learning and len(replay_buffer) >= batch_size:
                states, actions, rewards, next_states, dones = replay_buffer.sample(batch_size)

                states = torch.as_tensor(states, dtype=torch.float32, device=device)
                actions = torch.as_tensor(actions, dtype=torch.int64, device=device)
                rewards = torch.as_tensor(rewards, dtype=torch.float32, device=device)
                next_states = torch.as_tensor(next_states, dtype=torch.float32, device=device)
                dones = torch.as_tensor(dones, dtype=torch.float32, device=device)

                # Q(s, a) of the actions that were actually taken.
                q_values = q_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)
                with torch.no_grad():
                    max_next_q = target_net(next_states).max(1)[0]
                    # (1 - dones) removes the bootstrap term for terminal states.
                    target = rewards + gamma * max_next_q * (1 - dones)

                loss = F.mse_loss(q_values, target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Update target network
            if step % target_update == 0:
                target_net.load_state_dict(q_net.state_dict())

            # Decay epsilon linearly, never going below eps_end
            if epsilon > eps_end:
                epsilon = max(eps_end, epsilon - epsilon_decay)
    finally:
        env.close()

# ---------------------------
# Main with argparse
# ---------------------------
if __name__ == "__main__":
    # Each hyperparameter of train() is exposed as a command-line flag.
    parser = argparse.ArgumentParser()

    # Environment and run length
    parser.add_argument("--env", default="CartPole-v1", type=str)
    parser.add_argument("--total-steps", default=10000, type=int)
    parser.add_argument("--start-learning", default=1000, type=int)

    # Replay buffer and optimisation
    parser.add_argument("--buffer-size", default=10000, type=int)
    parser.add_argument("--batch-size", default=32, type=int)
    parser.add_argument("--gamma", default=0.99, type=float)
    parser.add_argument("--lr", default=1e-3, type=float)
    parser.add_argument("--target-update", default=1000, type=int)

    # Exploration schedule
    parser.add_argument("--eps-start", default=1.0, type=float)
    parser.add_argument("--eps-end", default=0.1, type=float)
    parser.add_argument("--eps-decay-steps", default=10000, type=int)

    # ✅ Fix for Jupyter/Colab: parse_known_args() collects unrecognised
    # arguments in `unknown` instead of exiting with an error.
    args, unknown = parser.parse_known_args()

    # Every parsed attribute matches a train() parameter name except
    # `env`, which train() calls `env_id`.
    train(
        env_id=args.env,
        **{name: value for name, value in vars(args).items() if name != "env"}
    )


Step 27, Episode Reward: 27.0
Step 40, Episode Reward: 13.0
Step 56, Episode Reward: 16.0
Step 74, Episode Reward: 18.0
Step 114, Episode Reward: 40.0
Step 127, Episode Reward: 13.0
Step 147, Episode Reward: 20.0
Step 199, Episode Reward: 52.0
Step 211, Episode Reward: 12.0
Step 227, Episode Reward: 16.0
Step 255, Episode Reward: 28.0
Step 265, Episode Reward: 10.0
Step 277, Episode Reward: 12.0
Step 287, Episode Reward: 10.0
Step 306, Episode Reward: 19.0
Step 342, Episode Reward: 36.0
Step 376, Episode Reward: 34.0
Step 391, Episode Reward: 15.0
Step 411, Episode Reward: 20.0
Step 424, Episode Reward: 13.0
Step 444, Episode Reward: 20.0
Step 462, Episode Reward: 18.0
Step 489, Episode Reward: 27.0
Step 517, Episode Reward: 28.0
Step 534, Episode Reward: 17.0
Step 565, Episode Reward: 31.0
Step 581, Episode Reward: 16.0
Step 594, Episode Reward: 13.0
Step 606, Episode Reward: 12.0
Step 637, Episode Reward: 31.0
Step 655, Episode Reward: 18.0
Step 671, Episode Reward: 16.0
Step 704, Ep