<a href="https://colab.research.google.com/github/ManupatiEshwar/reniforecement/blob/main/Lab7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# REINFORCE on CartPole-v1 (PyTorch + gymnasium)
# Save as reinforce_cartpole.py or run in a notebook cell.

import math
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
from collections import deque

# ---------- Hyperparameters ----------
ENV_NAME = "CartPole-v1"  # environment id passed to gym.make()
GAMMA = 0.99  # discount factor; default `gamma` of discount_rewards()
LR = 1e-3  # learning rate given to the Adam optimizer in train()
HIDDEN = 128  # default width of both hidden layers in PolicyNet
BATCH_SIZE = 1          # one episode per update; NOTE: not referenced by any code visible in this file
MAX_EPISODES = 2000  # upper bound on the number of training episodes
MAX_STEPS_PER_EP = 1000  # per-episode step cap for the rollout loop in train()
REWARD_TO_GO = True     # True: use reward-to-go; False: use full-episode return
ENTROPY_COEF = 0.0      # weight of the entropy bonus in the loss; 0 disables it
SOLVED_SCORE = 475.0    # train() stops once the average reward over the last 100 episodes reaches this
LOG_INTERVAL = 10  # train() logs (and runs its checkpoint/solved checks) every N episodes
# -------------------------------------

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Simple policy network -> returns action probabilities
class PolicyNet(nn.Module):
    """MLP policy for a discrete action space.

    The network is Linear -> ReLU -> Linear -> ReLU -> Linear, mapping an
    observation vector of size ``obs_dim`` to ``action_dim`` unnormalised
    scores.

    Args:
        obs_dim: Size of the observation vector (input features).
        action_dim: Number of discrete actions (output features).
        hidden: Width of both hidden layers.
    """

    def __init__(self, obs_dim, action_dim, hidden=HIDDEN):
        super().__init__()
        # Layers are created in input-to-output order and wrapped in a single
        # Sequential stored as `net`, so parameter names are `net.<index>.*`.
        layers = [
            nn.Linear(obs_dim, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, action_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return ``(probs, logits)`` for observation tensor ``x``.

        ``logits`` are the raw network outputs; ``probs`` is their softmax
        over the last dimension.
        """
        scores = self.net(x)
        action_probs = scores.softmax(dim=-1)
        return action_probs, scores

def discount_rewards(rewards, gamma=GAMMA):
    """Compute the per-step return weights for one episode.

    The mode is selected by the module-level ``REWARD_TO_GO`` flag:

    * ``True``  -- reward-to-go: ``G_t = sum_{t'=t..T-1} gamma^(t'-t) * r_{t'}``.
    * ``False`` -- the discounted return of the whole episode,
      ``sum_t gamma^t * r_t``, repeated for every step.

    Args:
        rewards: Sequence of per-step rewards (must support ``len`` and
            indexing).
        gamma: Discount factor.

    Returns:
        A float32 NumPy array with one entry per reward (empty for an empty
        episode).
    """
    n = len(rewards)

    if REWARD_TO_GO:
        rtg = np.zeros(n, dtype=np.float32)
        running = 0.0
        # Walk backwards so each step reuses the already-discounted tail:
        # G_t = r_t + gamma * G_{t+1}.
        for i in reversed(range(n)):
            running = rewards[i] + gamma * running
            rtg[i] = running
        return rtg

    # Full-episode mode: one discounted sum from the start of the episode,
    # shared by every step.
    total = 0.0
    discount = 1.0
    for r in rewards:
        total += discount * r
        discount *= gamma
    return np.array([total] * n, dtype=np.float32)

def train():
    """Train a PolicyNet on ENV_NAME with REINFORCE (one update per episode).

    Each iteration samples one full episode from the current policy, turns
    its rewards into per-step returns with ``discount_rewards``, and takes a
    single Adam step on ``-sum_t log pi(a_t | s_t) * G_t`` (minus an optional
    entropy bonus weighted by ``ENTROPY_COEF``).

    Every ``LOG_INTERVAL`` episodes the average reward over the last (up to)
    100 episodes is printed; when it improves on the best value seen so far,
    the weights are saved to ``reinforce_policy.pth``. Training stops early
    once that average reaches ``SOLVED_SCORE`` with a full 100-episode
    window, otherwise after ``MAX_EPISODES`` episodes.

    Returns:
        The trained ``PolicyNet`` (located on ``device``).
    """
    env = gym.make(ENV_NAME, render_mode=None)
    # try/finally so the environment is released even if training raises.
    try:
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        policy = PolicyNet(obs_dim, action_dim).to(device)
        optimizer = optim.Adam(policy.parameters(), lr=LR)

        running_rewards = deque(maxlen=100)
        best_avg = -float('inf')

        for episode in range(1, MAX_EPISODES + 1):
            obs, _ = env.reset()

            log_probs = []
            entropies = []
            rewards = []

            # ---- collect one episode with the current policy ----
            for _ in range(MAX_STEPS_PER_EP):
                obs_t = torch.tensor(obs, dtype=torch.float32, device=device)
                probs, _ = policy(obs_t.unsqueeze(0))   # shape (1, action_dim)
                m = torch.distributions.Categorical(probs.squeeze(0))
                action = m.sample()

                log_probs.append(m.log_prob(action))
                if ENTROPY_COEF > 0:
                    # Only pay for the entropy when the bonus is enabled.
                    entropies.append(m.entropy())

                obs, reward, terminated, truncated, _ = env.step(int(action.item()))
                rewards.append(float(reward))

                if terminated or truncated:
                    break

            episode_reward = sum(rewards)
            running_rewards.append(episode_reward)

            # ---- returns ----
            returns = discount_rewards(rewards, gamma=GAMMA)
            # Normalise for stability (common trick), but only when the
            # returns actually vary. If they are all equal (always the case
            # for full-episode returns, and for one-step episodes),
            # subtracting the mean would zero every weight and erase the
            # learning signal, so the raw returns are used instead.
            if returns.max() > returns.min():
                returns = (returns - returns.mean()) / (returns.std() + 1e-8)
            returns = torch.tensor(returns, dtype=torch.float32, device=device)

            # ---- loss: -sum_t log_prob_t * return_t ----
            policy_loss = -(torch.stack(log_probs) * returns).sum()
            if ENTROPY_COEF > 0:
                policy_loss = policy_loss - ENTROPY_COEF * torch.stack(entropies).sum()

            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

            # ---- logging / checkpointing / early stop ----
            if episode % LOG_INTERVAL == 0:
                avg100 = np.mean(running_rewards) if running_rewards else 0.0
                print(f"Episode {episode}\tEpisode reward: {episode_reward:.2f}\tAvg100: {avg100:.2f}")

                if avg100 > best_avg:
                    best_avg = avg100
                    # Keep the weights of the best running average so far.
                    torch.save(policy.state_dict(), "reinforce_policy.pth")

                # Require a full window so an early lucky streak cannot
                # trigger the stop.
                if avg100 >= SOLVED_SCORE and len(running_rewards) >= 100:
                    print(f"Solved! Avg reward {avg100:.2f} over 100 episodes. Stopping.")
                    break
    finally:
        env.close()

    return policy

def evaluate(policy, episodes=10, render=False):
    """Run greedy evaluation episodes on ENV_NAME and print the rewards.

    At every step the action with the highest probability under ``policy``
    is taken (no sampling). The reward of each episode is printed, followed
    by the average over all episodes.

    Args:
        policy: Callable returning ``(probs, logits)`` for a batched
            observation tensor, e.g. a trained ``PolicyNet``.
        episodes: Number of evaluation episodes. With ``episodes <= 0``
            nothing is run and no average is printed.
        render: If True, create the environment with
            ``render_mode="human"``; otherwise with no rendering.
    """
    env = gym.make(ENV_NAME, render_mode="human" if render else None)
    total = 0.0
    # try/finally so the environment is released even if a step raises.
    try:
        # Pure inference: no gradients are needed, so skip autograd tracking.
        with torch.no_grad():
            for ep in range(episodes):
                obs, _ = env.reset()
                ep_r = 0.0
                # Hard step cap as a safety net in case the environment
                # never reports terminated/truncated.
                for _ in range(1000):
                    obs_v = torch.tensor(obs, dtype=torch.float32, device=device)
                    probs, _ = policy(obs_v.unsqueeze(0))
                    action = torch.argmax(probs, dim=-1).item()
                    obs, reward, terminated, truncated, _ = env.step(action)
                    ep_r += reward
                    if terminated or truncated:
                        break
                total += ep_r
                print(f"Eval episode {ep+1}: reward = {ep_r}")
    finally:
        env.close()

    # Guard against division by zero when no episodes were requested.
    if episodes > 0:
        print(f"Average eval reward: {total / episodes:.2f}")

if __name__ == "__main__":
    # Script / notebook-cell entry point: train a policy, then run a short
    # greedy evaluation (5 episodes) without rendering.
    trained_policy = train()
    print("Evaluation (no render):")
    evaluate(trained_policy, episodes=5, render=False)


Episode 10	Episode reward: 33.00	Avg100: 21.30
Episode 20	Episode reward: 36.00	Avg100: 26.55
Episode 30	Episode reward: 12.00	Avg100: 27.87
Episode 40	Episode reward: 88.00	Avg100: 33.00
Episode 50	Episode reward: 163.00	Avg100: 41.28
Episode 60	Episode reward: 96.00	Avg100: 43.47
Episode 70	Episode reward: 196.00	Avg100: 51.90
Episode 80	Episode reward: 39.00	Avg100: 56.66
Episode 90	Episode reward: 87.00	Avg100: 59.92
Episode 100	Episode reward: 203.00	Avg100: 72.72
Episode 110	Episode reward: 133.00	Avg100: 93.01
Episode 120	Episode reward: 500.00	Avg100: 113.04
Episode 130	Episode reward: 110.00	Avg100: 121.27
Episode 140	Episode reward: 64.00	Avg100: 125.45
Episode 150	Episode reward: 47.00	Avg100: 124.41
Episode 160	Episode reward: 227.00	Avg100: 134.01
Episode 170	Episode reward: 30.00	Avg100: 139.85
Episode 180	Episode reward: 50.00	Avg100: 136.43
Episode 190	Episode reward: 73.00	Avg100: 133.04
Episode 200	Episode reward: 198.00	Avg100: 126.60
Episode 210	Episode reward: 131.