<a href="https://colab.research.google.com/github/JSJeong-me/AI-Innovation-2024/blob/main/RL/6-3-Policy-Gradient.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Policy Network 정의 (정책 신경망)
class PolicyNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        action_probs = torch.softmax(self.fc3(x), dim=-1)  # 각 행동의 확률
        return action_probs

# 에이전트의 행동을 선택하는 함수 (정책을 통해 행동 선택)
def select_action(state, policy_network):
    state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)  # 배치 차원을 맞추기 위해 추가
    action_probs = policy_network(state)
    action = np.random.choice(np.arange(action_size), p=action_probs.detach().numpy().squeeze())
    return action, action_probs[0, action]

# 학습을 위한 하이퍼파라미터
learning_rate = 0.001
gamma = 0.99  # 할인율
state_size = 4  # CartPole의 상태 크기
action_size = 2  # CartPole의 행동 공간 크기

# CartPole 환경 설정
env = gym.make('CartPole-v1')
policy_network = PolicyNetwork(state_size, action_size)
optimizer = optim.Adam(policy_network.parameters(), lr=learning_rate)

# 에피소드에서 수집한 보상을 할인해서 반환하는 함수
def discount_rewards(rewards, gamma):
    discounted_rewards = np.zeros_like(rewards)
    cumulative = 0
    for i in reversed(range(len(rewards))):
        cumulative = cumulative * gamma + rewards[i]
        discounted_rewards[i] = cumulative
    return discounted_rewards

# REINFORCE 알고리즘을 사용한 학습 루프
episodes = 1000

for episode in range(episodes):
    state = env.reset()
    log_probs = []
    rewards = []
    total_reward = 0
    done = False

    while not done:
        # 정책에 따라 행동 선택
        action, log_prob = select_action(state, policy_network)

        # 선택된 행동을 실행하여 다음 상태, 보상 등을 얻음
        next_state, reward, done, _ = env.step(action)

        # 로그 확률과 보상을 저장
        log_probs.append(torch.log(log_prob))
        rewards.append(reward)
        total_reward += reward

        # 상태 업데이트
        state = next_state

    # 에피소드가 끝난 후 보상을 할인하여 얻음
    discounted_rewards = discount_rewards(rewards, gamma)
    discounted_rewards = torch.tensor(discounted_rewards, dtype=torch.float32)

    # 보상의 표준화를 통해 학습의 안정성 개선
    discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9)

    # 정책 네트워크 업데이트 (정책 경사)
    policy_gradient = []
    for log_prob, reward in zip(log_probs, discounted_rewards):
        policy_gradient.append(-log_prob * reward)

    # 역전파 및 최적화
    optimizer.zero_grad()
    policy_gradient = torch.cat(policy_gradient).sum()  # 모든 손실을 합산
    policy_gradient.backward()  # 손실을 기준으로 역전파 수행
    optimizer.step()  # 정책 네트워크의 가중치 업데이트

    # 에피소드 결과 출력
    print(f"Episode {episode}, Total Reward: {total_reward}")

# 학습 완료 후 에이전트 테스트
for i in range(10):
    state = env.reset()
    done = False
    while not done:
        env.render()
        action, _ = select_action(state, policy_network)
        next_state, _, done, _ = env.step(action)
        state = next_state
env.close()


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated