In [None]:
!pip install gym
!pip install torch



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym
import numpy as np

# Define the policy model (policy network)
class PolicyNetwork(nn.Module):
    """Two-layer MLP that maps a state to a probability distribution over actions.

    Args:
        state_size: Number of features in a state vector.
        action_size: Number of discrete actions (size of the output).
        hidden_size: Width of the hidden layer. Defaults to 128.
    """

    def __init__(self, state_size, action_size, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return action probabilities for ``x``.

        The softmax is taken over the last dimension, so the same code works
        for a single state vector and for a batch of states.
        """
        hidden = torch.relu(self.fc1(x))
        return torch.softmax(self.fc2(hidden), dim=-1)

# Define the baseline model (value network)
class ValueNetwork(nn.Module):
    """Two-layer MLP that maps a state to a single scalar value estimate.

    Args:
        state_size: Number of features in a state vector.
        hidden_size: Width of the hidden layer. Defaults to 128.
    """

    def __init__(self, state_size, hidden_size=128):
        super().__init__()
        self.fc1 = nn.Linear(state_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the value estimate for ``x``; the last dimension has size 1."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)

# Function to compute the returns
def compute_returns(rewards, gamma=0.99):
    """Compute the discounted return for every step of one episode.

    For step ``t`` the return is
    ``rewards[t] + gamma * rewards[t+1] + gamma**2 * rewards[t+2] + ...``.

    Args:
        rewards: Sequence of per-step rewards in chronological order.
        gamma: Discount factor applied per step.

    Returns:
        A list with the same length as ``rewards`` where element ``t`` is the
        discounted return from step ``t`` onward. Empty input gives ``[]``.
    """
    returns = []
    R = 0
    # Walk backwards so each return reuses the one after it. Appending and
    # reversing once at the end keeps this linear; inserting at index 0 on
    # every step would be quadratic in the episode length.
    for r in reversed(rewards):
        R = r + gamma * R
        returns.append(R)
    returns.reverse()
    return returns

# Set up the environment, then each network together with its own optimizer
env = gym.make('CartPole-v1')
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Policy network and its optimizer
policy = PolicyNetwork(state_size, action_size)
policy_optimizer = optim.Adam(policy.parameters(), lr=0.01)

# Baseline (value) network and its optimizer
value = ValueNetwork(state_size)
value_optimizer = optim.Adam(value.parameters(), lr=0.01)

# Vanilla Policy Gradient with a learned value baseline
num_episodes = 1000
gamma = 0.99  # discount factor

for episode in range(num_episodes):
    # Older gym returns only the observation from reset(); gym >= 0.26 and
    # gymnasium return an (observation, info) tuple. CartPole observations
    # are arrays, so a tuple here can only be the newer API.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    states, actions, rewards = [], [], []
    done = False

    while not done:
        # Sampling an action needs no gradients; the log-probabilities used
        # for the update are recomputed from the stored states below.
        with torch.no_grad():
            action_probs = policy(torch.FloatTensor(state))
        action = np.random.choice(action_size, p=action_probs.numpy())

        # Accept both the 4-tuple (obs, reward, done, info) and the 5-tuple
        # (obs, reward, terminated, truncated, info) step signatures.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result

        # Store the transition
        states.append(state)
        actions.append(action)
        rewards.append(reward)

        state = next_state

    # Compute returns and process the whole episode as one batch
    returns_tensor = torch.FloatTensor(compute_returns(rewards, gamma))
    states_tensor = torch.as_tensor(np.array(states), dtype=torch.float32)
    actions_tensor = torch.as_tensor(np.array(actions), dtype=torch.long)

    # Value estimates, one per visited state
    value_estimates = value(states_tensor).squeeze(-1)

    # Advantage = return - baseline. The baseline is detached so the policy
    # loss does not send gradients into the value network.
    advantages = returns_tensor - value_estimates.detach()

    # Log-probability of the action actually taken at each step
    taken_probs = policy(states_tensor).gather(1, actions_tensor.unsqueeze(1)).squeeze(1)
    log_probs = torch.log(taken_probs)

    # Update policy
    policy_optimizer.zero_grad()
    policy_loss = -(log_probs * advantages).sum()
    policy_loss.backward()
    policy_optimizer.step()

    # Update baseline (value network) with the summed squared error to the returns
    value_optimizer.zero_grad()
    value_loss = ((value_estimates - returns_tensor) ** 2).sum()
    value_loss.backward()
    value_optimizer.step()

    if episode % 100 == 0:
        print(f"Episode {episode}, Total Reward: {sum(rewards)}")

env.close()

  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 0, Total Reward: 18.0
Episode 100, Total Reward: 500.0
Episode 200, Total Reward: 338.0
Episode 300, Total Reward: 482.0
Episode 400, Total Reward: 368.0
Episode 500, Total Reward: 500.0
Episode 600, Total Reward: 500.0
Episode 700, Total Reward: 500.0
Episode 800, Total Reward: 500.0
Episode 900, Total Reward: 500.0
