<a href="https://colab.research.google.com/github/JSJeong-me/AI-Innovation-2024/blob/main/RL/6-3-Actor-Critic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# Actor 네트워크 정의
class Actor(nn.Module):
    """Policy network: maps a state to a probability distribution over actions.

    Two 128-unit fully connected layers with ReLU activations feed a linear
    output layer whose logits are normalised by a softmax over the last
    dimension.

    Args:
        state_size: Number of input features per state.
        action_size: Number of outputs, one probability per action.
    """

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, state):
        """Return action probabilities for ``state`` (last dimension sums to 1)."""
        hidden = self.fc1(state).relu()
        hidden = self.fc2(hidden).relu()
        logits = self.fc3(hidden)
        return logits.softmax(dim=-1)

# Critic 네트워크 정의
class Critic(nn.Module):
    """State-value network: maps a state to a single scalar estimate.

    Two 128-unit fully connected layers with ReLU activations feed a linear
    output layer of width 1; no activation is applied to the output.

    Args:
        state_size: Number of input features per state.
    """

    def __init__(self, state_size):
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, 1)

    def forward(self, state):
        """Return the estimated value of ``state`` with a trailing dimension of 1."""
        hidden = self.fc1(state).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

# 환경 설정 및 하이퍼파라미터 정의
# Training hyperparameters: optimizer step size and reward discount factor.
gamma = 0.99
learning_rate = 0.001

# Environment and the dimensions the networks are sized from.
env = gym.make('CartPole-v1')
action_size = env.action_space.n
state_size = env.observation_space.shape[0]

# Build the actor before the critic so parameter initialisation consumes the
# random number generator in a fixed order.
actor = Actor(state_size, action_size)
critic = Critic(state_size)

# One independent Adam optimizer per network, both using the same step size.
optimizer_actor, optimizer_critic = (
    optim.Adam(network.parameters(), lr=learning_rate)
    for network in (actor, critic)
)

# 정책에 따라 행동을 선택하는 함수
def choose_action(state):
    """Sample an action for a single state from the actor's current policy.

    Args:
        state: One observation, convertible by ``torch.tensor`` to float32.
            A leading batch dimension of size 1 is added before the forward
            pass.

    Returns:
        A tuple ``(action, action_prob)``: the action index drawn with
        ``np.random.choice`` according to the actor's output, and the
        probability of that action as a tensor still attached to the actor's
        autograd graph.
    """
    batched_state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
    action_probs = actor(batched_state)

    # NumPy sampling needs a plain array, so use a detached copy as weights.
    sampling_weights = action_probs.detach().numpy().squeeze()
    candidate_actions = np.arange(action_size)
    action = np.random.choice(candidate_actions, p=sampling_weights)

    # Index the original tensor so gradients can flow back to the actor.
    return action, action_probs[0, action]

# 학습 루프
# Training loop: one-step actor-critic. Both networks are updated after every
# environment transition using the TD(0) error as the learning signal.
episodes = 1000

for episode in range(episodes):
    # Gym >= 0.26 / Gymnasium return (observation, info) from reset();
    # earlier Gym releases return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    total_reward = 0
    done = False

    while not done:
        # Sample an action from the current policy.
        action, action_prob = choose_action(state)

        # Advance the environment by one step. The 5-tuple API reports
        # termination and truncation separately; the 4-tuple API folds them
        # into `done` and flags a time-limit cut-off in `info`.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
        else:
            next_state, reward, done_flag, info = step_result
            truncated = bool(info.get("TimeLimit.truncated", False))
            terminated = bool(done_flag) and not truncated
        done = bool(terminated or truncated)
        total_reward += reward

        # Critic estimate for the current state (carries gradients).
        state_value = critic(torch.tensor(state, dtype=torch.float32).unsqueeze(0))

        # Bootstrapped TD target. It is computed without gradient tracking so
        # the critic loss only differentiates through V(state) (semi-gradient
        # TD); otherwise backward() would also push gradients through the
        # bootstrap estimate V(next_state).
        with torch.no_grad():
            if terminated:
                # A true terminal state has no future return.
                next_state_value = torch.zeros_like(state_value)
            else:
                # Also taken on time-limit truncation: the episode was cut
                # short, so the next state's value is still meaningful.
                next_state_value = critic(
                    torch.tensor(next_state, dtype=torch.float32).unsqueeze(0)
                )
            target_value = reward + gamma * next_state_value
        td_error = target_value - state_value

        # Critic update: minimise the squared TD error.
        critic_loss = td_error.pow(2).mean()
        optimizer_critic.zero_grad()
        critic_loss.backward()
        optimizer_critic.step()

        # Actor update: policy gradient weighted by the TD error, which is
        # detached so the actor loss never back-propagates into the critic.
        # .sum() reduces the (1, 1) product to a scalar for backward().
        actor_loss = (-torch.log(action_prob) * td_error.detach()).sum()
        optimizer_actor.zero_grad()
        actor_loss.backward()
        optimizer_actor.step()

        # Move on to the next state.
        state = next_state

    print(f"Episode {episode}, Total Reward: {total_reward}")

# 학습 완료 후 에이전트 테스트
# Roll out the trained policy for 10 rendered episodes. The try/finally
# guarantees the environment is closed even if rendering or stepping raises.
try:
    for _ in range(10):
        # Gym >= 0.26 / Gymnasium return (observation, info) from reset();
        # earlier Gym releases return the observation alone. Accept both.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        while not done:
            env.render()
            # No parameters are updated here, so skip autograd bookkeeping.
            with torch.no_grad():
                action, _ = choose_action(state)
            # Accept both the 5-tuple (terminated, truncated) and the older
            # 4-tuple (done) step results.
            step_result = env.step(action)
            if len(step_result) == 5:
                next_state, _, terminated, truncated, _ = step_result
                done = bool(terminated or truncated)
            else:
                next_state, _, done, _ = step_result
            state = next_state
finally:
    env.close()


  deprecation(
  deprecation(
  if not isinstance(terminated, (bool, np.bool8)):


Episode 0, Total Reward: 21.0
Episode 1, Total Reward: 25.0
Episode 2, Total Reward: 24.0
Episode 3, Total Reward: 65.0
Episode 4, Total Reward: 39.0
Episode 5, Total Reward: 34.0
Episode 6, Total Reward: 16.0
Episode 7, Total Reward: 52.0
Episode 8, Total Reward: 14.0
Episode 9, Total Reward: 22.0
Episode 10, Total Reward: 58.0
Episode 11, Total Reward: 17.0
Episode 12, Total Reward: 92.0
Episode 13, Total Reward: 44.0
Episode 14, Total Reward: 87.0
Episode 15, Total Reward: 20.0
Episode 16, Total Reward: 30.0
Episode 17, Total Reward: 50.0
Episode 18, Total Reward: 65.0
Episode 19, Total Reward: 93.0
Episode 20, Total Reward: 32.0
Episode 21, Total Reward: 23.0
Episode 22, Total Reward: 25.0
Episode 23, Total Reward: 23.0
Episode 24, Total Reward: 24.0
Episode 25, Total Reward: 31.0
Episode 26, Total Reward: 35.0
Episode 27, Total Reward: 106.0
Episode 28, Total Reward: 63.0
Episode 29, Total Reward: 40.0
Episode 30, Total Reward: 25.0
Episode 31, Total Reward: 40.0
Episode 32, Total