In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym
import numpy as np
import matplotlib.pyplot as plt
import sys, os
from IPython.display import clear_output
from typing import List

In [35]:
class Network(nn.Module):
    def __init__(self, obs_dim: int, act_dim: int):
        super(Network, self).__init__()

        self.obs_dim = obs_dim
        self.act_dim = act_dim
        self.fc1 = nn.Linear(self.obs_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, self.act_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return x

In [91]:
class ReplayBuffer:
    def __init__(self, memory_size: int, batch_size: int):
        self.size = 0
        self.transition_size = 5    ## transition s, a, r, s', d
        self.batch_size = batch_size
        self.max_size = memory_size
        self.replay = np.zeros([memory_size, self.transition_size], dtype=np.float32)
        self.sample_index = []
        self.train_batch = []
        self.ptr = 0
        

    def store(self, 
              prev_obs: List[float],
              action: float,
              reward: float,
              obs: List[float],
              done: bool,
              ):
        transition = [prev_obs, action, reward, obs, done]
        self.replay[self.ptr] = transition   ####### .insert() 제거함
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)


    def sample_batch(self) -> List[int]:
        sample_index = np.random.randint(low=0, high=self.size, size=self.batch_size) ##### self 제거함
        train_batch = [self.replay[idx] for idx in sample_index] ##### self 제거함
        return train_batch
    

In [92]:
class Agent:
    def __init__(self, 
                 env: gym.Env,
                 memory_size: int,
                 batch_size: int,
                 target_update: int,
                 epsilon_decay: float,
                 max_epsilon: float = 1.0,
                 min_epsilon: float = 0.1,
                 gamma: float = 0.99):
        self.env = env
        self.batch_size = batch_size
        self.memory = ReplayBuffer(memory_size, batch_size)
        self.target_update = target_update
        self.epsilon = max_epsilon
        self.epsilon_decay = epsilon_decay
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.gamma = gamma

        self.obs_dim = env.observation_space.shape[0]
        self.act_dim = env.action_space.n

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  ####### 

        self.dqn = Network(self.obs_dim, self.act_dim).to(self.device)
        self.dqn_target = Network(self.obs_dim, self.act_dim).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()

        self.optimizer = optim.Adam(self.dqn.parameters())  ####### optimizer 선언을 안 해줬었네


    def update_model(self):
        train_batch = self.memory.sample_batch()

        loss = self.compute_loss(train_batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.item()


    def compute_loss(self, train_batch):

        with torch.no_grad(): #########
            s = torch.FloatTensor(np.array([i[0] for i in train_batch])).to(self.device)
            a = torch.LongTensor(np.array([i[1] for i in train_batch])).view([-1, 1]).to(self.device)
            r = torch.FloatTensor(np.array([i[2] for i in train_batch])).view([-1, 1]).to(self.device)
            next_s = torch.FloatTensor(np.array([i[4] for i in train_batch])).to(self.device)
            d = torch.BoolTensor(np.array([i[3] for i in train_batch])).view([-1, 1]).to(self.device)
            
            curr_value = self.dqn(s).gather(1, a)
            next_value = self.dqn_target(next_s).max(dim=1, keepdim=True)[0].detach()

            mask = 1 - done
            target = (r + self.gamma * next_value * mask).to(self.device)

        loss = F.smooth_l2_loss(curr_value, target)

        return loss


    def train(self, num_episodes: int, plot_interval: int):
        
        update_cnt = 0
        losses = []
        score = 0
        scores = []

        for iter in range(num_episodes):

            s = self.env.reset()
            d = False
            
            while (not d):
                #self.env.render() #######

                if self.epsilon > np.random.rand(): ### rand() 로 변경
                    a = self.env.action_space.sample()
                else:
                    a = self.dqn.forward(torch.FloatTensor(s).to(self.device)).argmax() #######
                #   a = self.dqn(torch.FloatTensor(s).to(self.device)).argmax()
                    a = a.detach().cpu().numpy()    

                next_s, r, d, _ = self.env.step(a)
                self.memory.store(s, a, r, next_s, d)
                s = next_s
                score += r
                
                if d:
                    scores.append(score)
                    score = 0

                if self.memory.size >= self.batch_size:  ## len 제거함
                    loss = self.update_model()
                    losses.append(loss)
                    update_cnt += 1

                    self.epsilon = max(self.min_epsilon,
                                    self.epsilon - self.epsilon_decay * (self.max_epsilon - min_epsilon))

                    if update_cnt % self.target_update == 0:
                        self.dqn_target.load_state_dict(self.dqn.state_dict())

                if iter % plot_interval == 0:
                    clear_output(True)
                    plt.figure(figsize=(20, 5))
                    plt.subplot(121)
                    plt.title('iter %s. score: %s' % (iter, np.mean(scores[-10:]))) ## 가장 최근 10개 에피소드 평균 score 출력
                    plt.plot(scores)
                    plt.subplot(122)
                    plt.title('loss')
                    plt.plot(losses)
                    plt.show()

In [93]:
env = gym.make('CartPole-v0')
memory_size = 1000
batch_size = 32
target_update = 100
epsilon_decay = 1 / 2000

agent = Agent(env, memory_size, batch_size, target_update, epsilon_decay)

In [94]:
num_episodes = 100
plot_interval = 200

agent.train(num_episodes, plot_interval)

ValueError: ignored

In [97]:
size = 10
tran_size = 5

replay = [[0]* size for _ in range(tran_size)]

print(replay)

s0 = [[1, 2], 3, 4, [5, 6], True] 
s1 = [[2, 3], 4, 5, [6, 7], False]

replay[0] = s0
replay[1] = s1

print(replay)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
[[[1, 2], 3, 4, [5, 6], True], [[2, 3], 4, 5, [6, 7], False], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
