In [5]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt
torch.__version__, torch.cuda.is_available()

('2.5.1+cu118', True)

In [7]:
episode_rewards = []

In [9]:
class PolicyNetwork(nn.Module):
    def __init__(self):
        super(PolicyNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(4, 128),
            nn.ReLU(),
            nn.Linear(128, 2),
            nn.Softmax(dim=-1),
        )

    def forward(self, x):
        return self.fc(x)

In [11]:
def compute_discounted_rewards(rewards, gamma=0.99):
    discounted_rewards = []
    R = 0
    for r in reversed(rewards):
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    discounted_rewards = torch.tensor(discounted_rewards)
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-5)
    return discounted_rewards

In [13]:
def train(env, policy, optimizer, episodes=1000):
    for episode in range(episodes):
        state = env.reset()
        log_probs = []
        rewards = []
        done = False

        while not done:
            state = torch.FloatTensor(state).unsqueeze(0)
            probs = policy(state)
            m = Categorical(probs)
            action = m.sample()
            state, reward, done, _ = env.step(action.item())

            log_probs.append(m.log_prob(action))
            rewards.append(reward)
            # Inside the train function, after an episode ends:

            if done:
                episode_rewards.append(sum(rewards))
                discounted_rewards = compute_discounted_rewards(rewards)
                policy_loss = []
                for log_prob, Gt in zip(log_probs, discounted_rewards):
                    policy_loss.append(-log_prob * Gt)
                optimizer.zero_grad()
                policy_loss = torch.cat(policy_loss).sum()
                policy_loss.backward()
                optimizer.step()

                if episode % 50 == 0:
                    print(f"Episode {episode}, Total Reward: {sum(rewards)}")
                break

env = gym.make('CartPole-v1')
policy = PolicyNetwork()
optimizer = optim.Adam(policy.parameters(), lr=1e-2)

train(env, policy, optimizer)

  state = torch.FloatTensor(state).unsqueeze(0)


ValueError: expected sequence of length 4 at dim 1 (got 0)

In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import numpy as np

# Define environment parameters
N = 5  # Grid size (N x N)
A = (0, 0)  # Start point
B = (4, 4)  # Target point

# Possible actions: up, down, left, right
actions = ["up", "down", "left", "right"]
action_to_idx = {"up": 0, "down": 1, "left": 2, "right": 3}
num_actions = len(actions)

# Hyperparameters
epsilon = 0.1  # Exploration rate
gamma = 0.99  # Discount factor
learning_rate = 0.01
episodes = 500
max_steps = 50

# Simple helper functions to interact with the environment
def take_action(state, action):
    x, y = state
    if action == "up":
        x = max(0, x - 1)
    elif action == "down":
        x = min(N - 1, x + 1)
    elif action == "left":
        y = max(0, y - 1)
    elif action == "right":
        y = min(N - 1, y + 1)
    return (x, y)


def get_reward(state):
    return 10 if state == B else -1


# Q-network for state-action mapping
class QNetwork(nn.Module):
    def __init__(self):
        super(QNetwork, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(2, 32),
            nn.ReLU(),
            nn.Linear(32, num_actions)
        )

    def forward(self, state):
        return self.fc(state)


# Initialize the Q-network and optimizer
q_net = QNetwork()
optimizer = optim.Adam(q_net.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()

# Training loop
for episode in range(episodes):
    state = A
    total_reward = 0

    for step in range(max_steps):
        # Select action using epsilon-greedy policy
        if random.random() < epsilon:
            action = random.choice(actions)
        else:
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            q_values = q_net(state_tensor)
            action_idx = torch.argmax(q_values).item()
            action = actions[action_idx]

        # Take action, get next state and reward
        next_state = take_action(state, action)
        reward = get_reward(next_state)
        total_reward += reward

        # Compute target Q value
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
        target_q = q_net(state_tensor).clone().detach()
        next_q_values = q_net(next_state_tensor).detach()

        target_q[0, action_to_idx[action]] = reward + gamma * torch.max(next_q_values)

        # Update Q-network
        optimizer.zero_grad()
        loss = loss_fn(q_net(state_tensor), target_q)
        loss.backward()
        optimizer.step()

        # Check if goal is reached
        if next_state == B:
            break

        state = next_state

    # Print episode result
    if (episode + 1) % 50 == 0:
        print(f"Episode {episode + 1}, Total Reward: {total_reward}")

print("Training complete.")

# Test the learned policy
state = A
print("Testing policy from start state:")
path = [state]

for step in range(max_steps):
    state_tensor = torch.FloatTensor(state).unsqueeze(0)
    action_idx = torch.argmax(q_net(state_tensor)).item()
    action = actions[action_idx]
    state = take_action(state, action)
    path.append(state)
    if state == B:
        break

print("Path taken by the agent:")
print(path)

Episode 50, Total Reward: 3
Episode 100, Total Reward: -50
Episode 150, Total Reward: -50
Episode 200, Total Reward: -50
Episode 250, Total Reward: -50
Episode 300, Total Reward: -12
Episode 350, Total Reward: -50
Episode 400, Total Reward: -50
Episode 450, Total Reward: -50
Episode 500, Total Reward: -38
Training complete.
Testing policy from start state:
Path taken by the agent:
[(0, 0), (1, 0), (2, 0), (2, 1), (2, 2), (3, 2), (3, 3), (4, 3), (4, 4)]
