In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import sys
import os
# Add the project root directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../..')))

from src.ParObsSnakeEnv import ParObsSnakeEnv
from src.FullObsSnakeEnv import FullObsSnakeEnv

In [3]:
class PolicyNetwork(nn.Module):
    """Two-layer MLP that maps a state vector to a distribution over actions.

    Architecture: ``Linear(input_size, hidden_size) -> ReLU ->
    Linear(hidden_size, output_size) -> Softmax``.

    Args:
        input_size: Number of features in one state vector.
        hidden_size: Width of the single hidden layer.
        output_size: Number of discrete actions.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        # Normalise over the last (action) axis. For a 2-D batch this is the
        # same as dim=1, but it also accepts a single un-batched state of
        # shape (input_size,), for which dim=1 raises IndexError.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x):
        """Return action probabilities for ``x``.

        Args:
            x: Float tensor whose last dimension has ``input_size`` entries,
                e.g. ``(input_size,)`` or ``(batch, input_size)``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of ``output_size`` whose entries sum to 1.
        """
        hidden = self.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return self.softmax(logits)

In [None]:
class REINFORCEAgent:
    """Monte-Carlo policy-gradient (REINFORCE) agent backed by ``PolicyNetwork``.

    Args:
        state_dim: Number of features in one state vector.
        action_dim: Number of discrete actions.
        lr: Learning rate for the Adam optimizer.
        gamma: Discount factor used when computing returns.
        device: Torch device on which the policy and tensors live.
        hidden_size: Width of the policy's hidden layer.
    """

    def __init__(self, state_dim, action_dim, lr=0.001, gamma=0.99, device='cpu',
                 hidden_size=16):
        self.device = device
        # PolicyNetwork takes (input_size, hidden_size, output_size); omitting
        # the hidden width raised TypeError on construction.
        self.policy = PolicyNetwork(state_dim, hidden_size, action_dim).to(self.device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma

    def act(self, state):
        """Sample an action index from the policy's distribution for ``state``.

        Args:
            state: One un-batched state (sequence or array of floats).

        Returns:
            Index of the sampled action, drawn with ``np.random.choice``.
        """
        # Add a batch dimension because the policy is evaluated on a batch of one.
        state = torch.as_tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        # Sampling needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            action_probs = self.policy(state)
        # reshape(-1) instead of squeeze(): squeeze() turns a one-action
        # output into a 0-d array, on which len() fails.
        action_probs = action_probs.detach().cpu().numpy().reshape(-1)
        action = np.random.choice(len(action_probs), p=action_probs)
        return action

    def compute_returns(self, rewards):
        """Compute discounted returns for an episode.

        Args:
            rewards: Per-step rewards in chronological order.

        Returns:
            1-D float tensor of discounted returns, one per step. When there
            is more than one step and the spread is non-negligible, the
            returns are standardised to zero mean and unit variance.
        """
        discounted = []
        running = 0.0
        # Walk backwards so each return reuses the one after it; append and
        # reverse once instead of insert(0, ...), which is quadratic.
        for reward in reversed(rewards):
            running = reward + self.gamma * running
            discounted.append(running)
        discounted.reverse()
        returns = torch.tensor(discounted, dtype=torch.float32, device=self.device)
        # Normalize returns to improve training stability
        if len(returns) > 1:
            std = returns.std()
            if std > 1e-5:
                returns = (returns - returns.mean()) / (std + 1e-5)
        return returns

    def update_policy(self, log_probs, returns):
        """Perform one policy-gradient update.

        Args:
            log_probs: List of log-probability tensors, one per step, each
                holding a single element and attached to the policy graph.
            returns: Tensor of per-step returns aligned with ``log_probs``.
        """
        # Flatten both operands so that log-probs carrying a batch axis,
        # i.e. shape (T, 1), are multiplied element-wise with the (T,)
        # returns instead of silently broadcasting to a (T, T) matrix.
        stacked = torch.stack(log_probs).reshape(-1)
        loss = -torch.sum(stacked * returns.reshape(-1))  # Negative log-prob * return
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

# Train the REINFORCE Agent
def train_reinforce(env, agent, episodes=1000):
    """Train ``agent`` on ``env`` with one policy update per episode.

    Args:
        env: Environment exposing ``reset() -> state`` and
            ``step(action) -> (next_state, reward, done, info)``.
        agent: Object exposing ``act``, ``policy``, ``device``,
            ``compute_returns`` and ``update_policy``.
        episodes: Number of episodes to run.

    Returns:
        List with the undiscounted total reward of each episode, in order.
    """
    episode_rewards = []
    for episode in range(episodes):
        state = env.reset()
        log_probs = []
        rewards = []
        total_reward = 0
        done = False

        while not done:
            action = agent.act(state)
            # Evaluate the policy on a batch of one and index [0, action];
            # passing the bare 1-D state fails when the policy's softmax
            # normalises along dim 1.
            state_tensor = torch.as_tensor(
                state, dtype=torch.float32, device=agent.device
            ).unsqueeze(0)
            action_prob = agent.policy(state_tensor)[0, action]
            # Clamp before the log so an underflowed probability cannot give
            # -inf, which would turn the gradients (and then the weights) NaN.
            log_prob = torch.log(action_prob.clamp_min(1e-8))
            log_probs.append(log_prob)

            next_state, reward, done, _ = env.step(action)
            rewards.append(reward)
            total_reward += reward
            state = next_state

        # Compute returns and update policy
        returns = agent.compute_returns(rewards)
        agent.update_policy(log_probs, returns)
        episode_rewards.append(total_reward)

        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}")

    return episode_rewards

In [7]:
# --- Hyperparameters -------------------------------------------------------
grid_size = 10  # forwarded to the environment constructor below

# Policy network dimensions: input features, hidden units, output actions
input_size, hidden_size, output_size = 11, 16, 4

# Optimisation settings
learning_rate, gamma = 0.001, 0.99
num_episodes = 1000

# --- Objects used by the training cell ---------------------------------------
env = ParObsSnakeEnv(grid_size=grid_size, interact=False)

policy = PolicyNetwork(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
)

optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

In [8]:
# Train the agent
# NOTE(review): `reinforce` is not defined in any cell visible in this
# notebook; the training entry point that is visible is
# `train_reinforce(env, agent, episodes)`. Confirm which one is intended
# before relying on Restart & Run All.
episode_rewards = reinforce(env, policy, optimizer, num_episodes, gamma)

Episode 1, Total Reward: -73.00
Episode 2, Total Reward: -80.00
Episode 3, Total Reward: -74.00
Episode 4, Total Reward: -80.00
Episode 5, Total Reward: -70.00
Episode 6, Total Reward: -78.00
Episode 7, Total Reward: -74.00
Episode 8, Total Reward: -77.00
Episode 9, Total Reward: -73.00
Episode 10, Total Reward: -68.00
Episode 11, Total Reward: -79.00
Episode 12, Total Reward: -75.00
Episode 13, Total Reward: -75.00
Episode 14, Total Reward: -75.00
Episode 15, Total Reward: -83.00
Episode 16, Total Reward: -76.00
Episode 17, Total Reward: 2.00
Episode 18, Total Reward: -2.00
Episode 19, Total Reward: -75.00


  returns = (returns - returns.mean()) / (returns.std() + 1e-9)


ValueError: Expected parameter probs (Tensor of shape (1, 4)) of distribution Categorical(probs: torch.Size([1, 4])) to satisfy the constraint Simplex(), but found invalid values:
tensor([[nan, nan, nan, nan]], grad_fn=<DivBackward0>)