# 바닐라 Actor-Critic

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import gym

In [2]:
class Actor(nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super(Actor, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.tanh(self.fc2(x))
        return x

class Critic(nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = self.fc2(x)
        return x

In [7]:
def train_actor_critic(env, actor, critic, actor_optimizer, critic_optimizer, gamma, num_episodes, print_interval=10):
    episode_rewards = []
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        total_reward = 0
        while not done:
            state = torch.FloatTensor(state)
            action_prob = actor(state)
            action_prob = torch.softmax(action_prob, dim=-1)
            action_dist = torch.distributions.Categorical(action_prob)
            action = action_dist.sample()
            next_state, reward, done, _ = env.step(action.item())
            next_state = torch.FloatTensor(next_state)

            critic_value = critic(state)
            next_critic_value = critic(next_state)
            td_target = reward + gamma * next_critic_value * (1 - done)
            td_error = td_target - critic_value

            actor_loss = -action_dist.log_prob(action) * td_error.detach()
            critic_loss = td_error.pow(2)

            actor_optimizer.zero_grad()
            critic_optimizer.zero_grad()
            actor_loss.backward()
            critic_loss.backward()
            actor_optimizer.step()
            critic_optimizer.step()

            state = next_state
            total_reward += reward

        episode_rewards.append(total_reward)

        if (episode + 1) % print_interval == 0:
            mean_reward = sum(episode_rewards[-print_interval:]) / print_interval
            print(f"Episode {episode + 1}/{num_episodes}, Mean Reward: {mean_reward}")


In [8]:
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
hidden_dim = 128

actor = Actor(state_dim, action_dim, hidden_dim)
critic = Critic(state_dim, hidden_dim)
actor_optimizer = optim.Adam(actor.parameters(), lr=0.001)
critic_optimizer = optim.Adam(critic.parameters(), lr=0.001)
gamma = 0.99
num_episodes = 500

train_actor_critic(env, actor, critic, actor_optimizer, critic_optimizer, gamma, num_episodes)


Episode 10/1000, Mean Reward: 32.0
Episode 20/1000, Mean Reward: 31.7
Episode 30/1000, Mean Reward: 29.6
Episode 40/1000, Mean Reward: 38.5
Episode 50/1000, Mean Reward: 50.3
Episode 60/1000, Mean Reward: 67.5
Episode 70/1000, Mean Reward: 106.3
Episode 80/1000, Mean Reward: 74.9
Episode 90/1000, Mean Reward: 37.2
Episode 100/1000, Mean Reward: 73.0
Episode 110/1000, Mean Reward: 93.7
Episode 120/1000, Mean Reward: 95.7
Episode 130/1000, Mean Reward: 99.1
Episode 140/1000, Mean Reward: 77.5
Episode 150/1000, Mean Reward: 116.5
Episode 160/1000, Mean Reward: 85.4
Episode 170/1000, Mean Reward: 75.3
Episode 180/1000, Mean Reward: 71.5
Episode 190/1000, Mean Reward: 156.2
Episode 200/1000, Mean Reward: 172.1
Episode 210/1000, Mean Reward: 66.6
Episode 220/1000, Mean Reward: 83.9
Episode 230/1000, Mean Reward: 95.2
Episode 240/1000, Mean Reward: 189.8
Episode 250/1000, Mean Reward: 165.7
Episode 260/1000, Mean Reward: 103.9
Episode 270/1000, Mean Reward: 202.4
Episode 280/1000, Mean Reward

In [9]:
torch.save(actor.state_dict(), 'actor_model.pth')
torch.save(critic.state_dict(), 'critic_model.pth')

In [10]:
actor = Actor(state_dim, action_dim, hidden_dim)
actor.load_state_dict(torch.load('actor_model.pth'))
actor.eval()  # 평가 모드로 설정 (드롭아웃 및 배치 정규화 등을 평가 모드로 설정)

critic = Critic(state_dim, hidden_dim)
critic.load_state_dict(torch.load('critic_model.pth'))
critic.eval()  # 평가 모드로 설정 (드롭아웃 및 배치 정규화 등을 평가 모드로 설정)

Critic(
  (fc1): Linear(in_features=4, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

In [12]:
import gym
import torch
import numpy as np
import imageio

# 환경 설정
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
hidden_dim = 128

# 모델 불러오기
actor = Actor(state_dim, action_dim, hidden_dim)
actor.load_state_dict(torch.load('actor_model.pth'))
actor.eval()

# 테스트를 위한 함수 정의
def test_actor(env, actor, num_episodes=10):
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        frames = []  # 프레임을 저장할 리스트
        while not done:
            frames.append(env.render(mode='rgb_array'))  # 시각화용 프레임 저장
            state = torch.FloatTensor(state)
            action_prob = actor(state)
            action_prob = torch.softmax(action_prob, dim=-1)
            action = torch.argmax(action_prob).item()
            state, _, done, _ = env.step(action)
        env.close()
        save_video(frames, f'test_episode_{episode}.gif')  # 영상 저장

# 영상 저장 함수 정의
def save_video(frames, filename, fps=30):
    imageio.mimsave(filename, [np.array(frame) for frame in frames], fps=fps)

# 테스트 실행
test_actor(env, actor)




In [19]:
from IPython.display import display, Image
import os
import ipywidgets as widgets

def display_gif_sequentially(folder_path):
    filenames = sorted(file for file in os.listdir(folder_path) if file.endswith('.gif'))
    image_widget = widgets.Image()
    display(image_widget)

    index = 0

    def update_image(change):
        nonlocal index
        index += 1
        if index >= len(filenames):
            index = 0
        image_path = os.path.join(folder_path, filenames[index])
        with open(image_path, 'rb') as f:
            image_widget.value = f.read()

    next_button = widgets.Button(description='Next GIF')
    next_button.on_click(update_image)
    display(next_button)

# 저장된 GIF 파일이 있는 폴더 경로
folder_path = '/content/'

# GIF 파일을 순차적으로 보여줌
display_gif_sequentially(folder_path)

Image(value=b'')

Button(description='Next GIF', style=ButtonStyle())

# Advantage Actor-Critic

In [27]:
import gym
import torch
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

In [34]:
# Actor 신경망 정의
class Actor(torch.nn.Module):
    def __init__(self, state_dim, action_dim, hidden_dim):
        super(Actor, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        action_prob = F.softmax(x, dim=-1)  # softmax 함수를 이용하여 각 행동에 대한 확률 생성
        return action_prob


# Critic 신경망 정의
class Critic(torch.nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super(Critic, self).__init__()
        self.fc1 = torch.nn.Linear(state_dim, hidden_dim)
        self.fc2 = torch.nn.Linear(hidden_dim, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [30]:
def compute_gae(next_value, rewards, masks, values, gamma, lam):
    values = values + [next_value]
    gae = 0
    returns = []

    for step in reversed(range(len(rewards))):
        delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
        gae = delta + gamma * lam * masks[step] * gae
        returns.insert(0, gae + values[step])

    return returns

In [42]:
def train_actor_critic_with_gae(env, actor, critic, actor_optimizer, critic_optimizer, gamma, lam, num_episodes):
    for episode in range(num_episodes):
        log_probs = []
        values = []
        rewards = []
        masks = []
        entropy = 0
        episode_reward = 0  # 각 에피소드의 총 보상을 기록하기 위한 변수

        state = env.reset()
        state = torch.FloatTensor(state)

        for t in range(MAX_EPISODE_LENGTH):
            action_prob = actor(state)
            value = critic(state)

            action_dist = Categorical(action_prob)
            action = action_dist.sample()
            next_state, reward, done, _ = env.step(action.item())

            log_prob = action_dist.log_prob(action)
            entropy += action_dist.entropy().mean()

            log_probs.append(log_prob.unsqueeze(0))  # 텐서를 리스트에 추가할 때 unsqueeze를 사용하여 차원을 추가
            values.append(value)
            rewards.append(reward)
            masks.append(1 - done)

            episode_reward += reward  # 에피소드의 총 보상 업데이트

            state = torch.FloatTensor(next_state)

            if done:
                print(f"Episode {episode}: Total Reward = {episode_reward}")  # 각 에피소드의 총 보상 출력
                next_value = torch.tensor([0.0], device=device)
                break

        next_value = critic(state)
        returns = compute_gae(next_value, rewards, masks, values, gamma, lam)

        # 리스트를 텐서로 변환 후 연결
        log_probs = torch.cat(log_probs, dim=0)
        returns = torch.cat(returns, dim=0).detach()
        values = torch.cat(values, dim=0)

        advantage = returns - values

        actor_loss = -(log_probs * advantage.detach()).mean()
        critic_loss = F.mse_loss(returns, values)

        actor_optimizer.zero_grad()
        critic_optimizer.zero_grad()
        actor_loss.backward()
        critic_loss.backward()
        actor_optimizer.step()
        critic_optimizer.step()


In [43]:
# 하이퍼파라미터 설정
MAX_EPISODE_LENGTH = 1000
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 환경 설정
env = gym.make('CartPole-v1')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
hidden_dim = 128

actor_optimizer = optim.Adam(actor.parameters(), lr=0.01) </br>
critic_optimizer = optim.Adam(critic.parameters(), lr=0.01) </br>
해당 코드 부분에서 lr을 0.0001, 0.001, 0.01, 0.1로 학습해보았고 0.001은 느리지만 학습이 되었고 0.0001, 0.1은 학습 아예 안됬다 0.01은 빠르게 학습할 수 있어 best다.</br>
덕분에 에피소드를 1000에서 150까지 줄일 수 있었다. </br>
감마의 경우 미래의 보상이 더 중점을 두어 0.99로 </br>
람다의 경우 advantage의 가중치이므로 0.95로 </br>
각각 설정했지만 0~1사이값으로 테스트를 해보았다. </br>
하지만 0.9 아래의 값으로 내려가자 둘다 학습이 전혀 안되어 0.9 ~ 1 사이값이 가장 좋다는 것을 알았다.

In [53]:
# 모델 및 옵티마이저 초기화
actor = Actor(state_dim, action_dim, hidden_dim).to(device)
critic = Critic(state_dim, hidden_dim).to(device)
actor_optimizer = optim.Adam(actor.parameters(), lr=0.01)
critic_optimizer = optim.Adam(critic.parameters(), lr=0.01)
gamma = 0.95
lam = 0.95
num_episodes = 150

# 학습 진행
train_actor_critic_with_gae(env, actor, critic, actor_optimizer, critic_optimizer, gamma, lam, num_episodes)

Episode 0: Total Reward = 20.0
Episode 1: Total Reward = 14.0
Episode 2: Total Reward = 34.0
Episode 3: Total Reward = 15.0
Episode 4: Total Reward = 16.0
Episode 5: Total Reward = 12.0
Episode 6: Total Reward = 14.0
Episode 7: Total Reward = 11.0
Episode 8: Total Reward = 13.0
Episode 9: Total Reward = 17.0
Episode 10: Total Reward = 16.0
Episode 11: Total Reward = 11.0
Episode 12: Total Reward = 19.0
Episode 13: Total Reward = 16.0
Episode 14: Total Reward = 15.0
Episode 15: Total Reward = 23.0
Episode 16: Total Reward = 18.0
Episode 17: Total Reward = 21.0
Episode 18: Total Reward = 25.0
Episode 19: Total Reward = 20.0
Episode 20: Total Reward = 13.0
Episode 21: Total Reward = 59.0
Episode 22: Total Reward = 44.0
Episode 23: Total Reward = 37.0
Episode 24: Total Reward = 22.0
Episode 25: Total Reward = 70.0
Episode 26: Total Reward = 28.0
Episode 27: Total Reward = 25.0
Episode 28: Total Reward = 52.0
Episode 29: Total Reward = 54.0
Episode 30: Total Reward = 25.0
Episode 31: Total 