In [13]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import time  # 시간 측정을 위한 모듈

#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 50
T_horizon     = 100

class PPO(nn.Module):
    def __init__(self):
        super(PPO, self).__init__()
        self.data = []
        
        self.fc1   = nn.Linear(4,256)
        self.fc_pi = nn.Linear(256,2)
        self.fc_v  = nn.Linear(256,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim = 0 ):# softmax_dim 0 (inference) or 1(training, batch state input)
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob
    
    def v(self, x):
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v
      
    def put_data(self, transition):
        self.data.append(transition)
        
    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
        for transition in self.data:
            s, a, r, s_prime, prob_a, done = transition
            
            s_lst.append(s) # shape 처리 시 유의 (numpy array)
            a_lst.append([a]) # value
            r_lst.append([r]) # value
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            done_mask = 0 if done else 1
            done_lst.append([done_mask])
            
        s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
                                          torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
                                          torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)
        self.data = []
        return s, a, r, s_prime, done_mask, prob_a # batch 출력
        
    def train_net(self):
        s, a, r, s_prime, done_mask, prob_a = self.make_batch()

        for i in range(K_epoch):# 업데이트를 K_epoch 만큼 진행  
            # ==== GAE 구현 ==== #
            td_target = r + gamma * self.v(s_prime) * done_mask # 여기를 배치로 처리하는 것이 핵심
            delta = td_target - self.v(s) # NN (v)를 한번에 불러서 계산 (속도향상)
            delta = delta.detach().numpy()

            advantage_lst = []
            advantage = 0.0
            for delta_t in delta[::-1]: # delta vector에서 마지막부터 꺼내어씀 
                advantage = gamma * lmbda * advantage + delta_t[0] # NN 호출이 없으니 for loop 
                advantage_lst.append([advantage])
            advantage_lst.reverse() 
            advantage = torch.tensor(advantage_lst, dtype=torch.float)
            # ====================#
            

            # ==== Clipped Loss ==== #
            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1,a)
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))  # a/b == exp(log(a)-log(b))

            surr1 = ratio * advantage # surrogate loss (tensor)
            surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach())
            # ====================#

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

In [14]:
env = gym.make('CartPole-v1')
model = PPO()
score = 0.0
print_interval = 20

for n_epi in range(10000):
    start_time = time.time()  # 에피소드 시작 시간 기록
    s, _ = env.reset()
    done = False
    while not done:
        for t in range(T_horizon):            

            prob = model.pi(torch.from_numpy(s).float())
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, truncated, info = env.step(a)

            model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done)) 
            s = s_prime

            score += r
            if done:
                break

        model.train_net()
        
    end_time = time.time()  # 에피소드 종료 시간 기록
    episode_duration = end_time - start_time  # 소요 시간 계산

    if n_epi%print_interval==0 and n_epi!=0:
        print("# of episode: {}, avg score: {:.1f}, duration: {:.2f} sec".format(n_epi, score/print_interval, episode_duration))
        score = 0.0

env.close()

# of episode: 20, avg score: 21.4, duration: 0.04 sec
# of episode: 40, avg score: 17.2, duration: 0.04 sec
# of episode: 60, avg score: 14.9, duration: 0.03 sec
# of episode: 80, avg score: 13.5, duration: 0.04 sec
# of episode: 100, avg score: 12.3, duration: 0.04 sec
# of episode: 120, avg score: 13.1, duration: 0.04 sec
# of episode: 140, avg score: 12.8, duration: 0.04 sec
# of episode: 160, avg score: 13.2, duration: 0.03 sec
# of episode: 180, avg score: 14.4, duration: 0.04 sec
# of episode: 200, avg score: 14.1, duration: 0.03 sec
# of episode: 220, avg score: 11.9, duration: 0.04 sec
# of episode: 240, avg score: 14.6, duration: 0.04 sec
# of episode: 260, avg score: 23.7, duration: 0.04 sec
# of episode: 280, avg score: 25.2, duration: 0.04 sec
# of episode: 300, avg score: 83.8, duration: 0.12 sec
# of episode: 320, avg score: 200.8, duration: 0.17 sec
# of episode: 340, avg score: 367.9, duration: 0.68 sec
# of episode: 360, avg score: 1052.0, duration: 0.54 sec
# of episo

KeyboardInterrupt: 

# GPU버전으로 수정

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 50
T_horizon     = 100

class PPO(nn.Module):
    def __init__(self):
        super(PPO, self).__init__()
        self.data = []
        
        self.fc1   = nn.Linear(4,256)
        self.fc_pi = nn.Linear(256,2)
        self.fc_v  = nn.Linear(256,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim = 0 ):# softmax_dim 0 (inference) or 1(training, batch state input)
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob
    
    def v(self, x):
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v
      
    def put_data(self, transition):
        self.data.append(transition)
        
    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
        for transition in self.data:
            s, a, r, s_prime, prob_a, done = transition

            # 여기서 s와 s_prime을 PyTorch 텐서로 변환합니다.
            s = torch.from_numpy(s).float() if isinstance(s, np.ndarray) else s
            s_prime = torch.from_numpy(s_prime).float() if isinstance(s_prime, np.ndarray) else s_prime

            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            done_mask = 0 if done else 1
            done_lst.append([done_mask])

        s, a, r, s_prime, done_mask, prob_a = torch.stack(s_lst, dim=0).to(device), \
                                              torch.tensor(a_lst, dtype=torch.int64).to(device), \
                                              torch.tensor(r_lst, dtype=torch.float).to(device), \
                                              torch.stack(s_prime_lst, dim=0).to(device), \
                                              torch.tensor(done_lst, dtype=torch.float).to(device), \
                                              torch.tensor(prob_a_lst, dtype=torch.float).to(device)
        self.data = []
        return s, a, r, s_prime, done_mask, prob_a

        
    def train_net(self):
        s, a, r, s_prime, done_mask, prob_a = self.make_batch()

        for i in range(K_epoch):# 업데이트를 K_epoch 만큼 진행  
            # ==== GAE 구현 ==== #
            td_target = r + gamma * self.v(s_prime) * done_mask # 여기를 배치로 처리하는 것이 핵심
            delta = td_target - self.v(s) # NN (v)를 한번에 불러서 계산 (속도향상)
            delta = delta.detach().cpu().numpy()

            advantage_lst = []
            advantage = 0.0
            for delta_t in delta[::-1]: # delta vector에서 마지막부터 꺼내어씀 
                advantage = gamma * lmbda * advantage + delta_t[0] # NN 호출이 없으니 for loop 
                advantage_lst.append([advantage])
            advantage_lst.reverse() 
            advantage = torch.tensor(advantage_lst, dtype=torch.float).to(device)
            # ====================#
            

            # ==== Clipped Loss ==== #
            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1,a)
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))  # a/b == exp(log(a)-log(b))

            surr1 = ratio * advantage # surrogate loss (tensor)
            surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach())
            # ====================#

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

In [19]:
env = gym.make('CartPole-v1')
model = PPO().to(device)
score = 0.0
print_interval = 20

for n_epi in range(10000):
    start_time = time.time()  # 에피소드 시작 시간 기록
    s, _ = env.reset()
    done = False
    while not done:
        for t in range(T_horizon):
            s = torch.from_numpy(s).float().to(device)
            prob = model.pi(s)
            m = Categorical(prob)
            a = m.sample().item()
            s_prime, r, done, truncated, info = env.step(a)

            model.put_data((s, a, r/100.0, s_prime, prob[a].item(), done)) 
            s = s_prime

            score += r
            if done:
                break

        model.train_net()
        
    end_time = time.time()  # 에피소드 종료 시간 기록
    episode_duration = end_time - start_time  # 소요 시간 계산

    if n_epi%print_interval==0 and n_epi!=0:
        print("# of episode: {}, avg score: {:.1f}, duration: {:.2f} sec".format(n_epi, score/print_interval, episode_duration))
        score = 0.0        

env.close()

# of episode: 20, avg score: 24.9, duration: 0.10 sec
# of episode: 40, avg score: 13.7, duration: 0.11 sec
# of episode: 60, avg score: 15.9, duration: 0.09 sec
# of episode: 80, avg score: 19.3, duration: 0.10 sec
# of episode: 100, avg score: 12.2, duration: 0.09 sec
# of episode: 120, avg score: 13.4, duration: 0.08 sec
# of episode: 140, avg score: 12.9, duration: 0.09 sec
# of episode: 160, avg score: 17.4, duration: 0.12 sec
# of episode: 180, avg score: 21.4, duration: 0.09 sec
# of episode: 200, avg score: 25.7, duration: 0.10 sec
# of episode: 220, avg score: 50.5, duration: 0.11 sec
# of episode: 240, avg score: 84.6, duration: 0.17 sec
# of episode: 260, avg score: 121.5, duration: 0.53 sec
# of episode: 280, avg score: 183.8, duration: 0.31 sec
# of episode: 300, avg score: 171.4, duration: 0.37 sec
# of episode: 320, avg score: 378.4, duration: 0.37 sec
# of episode: 340, avg score: 1114.3, duration: 0.13 sec
# of episode: 360, avg score: 446.1, duration: 0.53 sec
# of ep

# of episode: 2920, avg score: 5866.2, duration: 0.25 sec
# of episode: 2940, avg score: 512.4, duration: 0.25 sec
# of episode: 2960, avg score: 1569.0, duration: 4.19 sec
# of episode: 2980, avg score: 1155.0, duration: 0.66 sec
# of episode: 3000, avg score: 1294.2, duration: 0.46 sec
# of episode: 3020, avg score: 402.1, duration: 0.56 sec
# of episode: 3040, avg score: 277.6, duration: 0.49 sec
# of episode: 3060, avg score: 352.1, duration: 0.58 sec
# of episode: 3080, avg score: 2661.8, duration: 0.27 sec
# of episode: 3100, avg score: 722.3, duration: 1.78 sec
# of episode: 3120, avg score: 242.6, duration: 0.28 sec
# of episode: 3140, avg score: 139.9, duration: 0.35 sec
# of episode: 3160, avg score: 25867.2, duration: 0.67 sec
# of episode: 3180, avg score: 383.1, duration: 0.47 sec
# of episode: 3200, avg score: 335.7, duration: 0.50 sec
# of episode: 3220, avg score: 1080.5, duration: 0.78 sec
# of episode: 3240, avg score: 8565.4, duration: 0.65 sec
# of episode: 3260, av

# of episode: 5800, avg score: 104.6, duration: 0.27 sec
# of episode: 5820, avg score: 94.8, duration: 0.22 sec
# of episode: 5840, avg score: 96.3, duration: 0.27 sec
# of episode: 5860, avg score: 102.2, duration: 0.27 sec
# of episode: 5880, avg score: 113.7, duration: 0.22 sec
# of episode: 5900, avg score: 77.8, duration: 0.09 sec
# of episode: 5920, avg score: 102.7, duration: 0.23 sec
# of episode: 5940, avg score: 110.2, duration: 0.08 sec
# of episode: 5960, avg score: 111.6, duration: 0.27 sec
# of episode: 5980, avg score: 120.4, duration: 0.24 sec
# of episode: 6000, avg score: 114.2, duration: 0.24 sec
# of episode: 6020, avg score: 95.4, duration: 0.21 sec
# of episode: 6040, avg score: 89.0, duration: 0.13 sec
# of episode: 6060, avg score: 88.2, duration: 0.12 sec
# of episode: 6080, avg score: 94.7, duration: 0.21 sec
# of episode: 6100, avg score: 90.5, duration: 0.11 sec
# of episode: 6120, avg score: 96.8, duration: 0.24 sec
# of episode: 6140, avg score: 97.8, dur

# of episode: 8700, avg score: 71.9, duration: 0.08 sec
# of episode: 8720, avg score: 57.6, duration: 0.08 sec
# of episode: 8740, avg score: 69.6, duration: 0.27 sec
# of episode: 8760, avg score: 91.5, duration: 0.09 sec
# of episode: 8780, avg score: 63.1, duration: 0.20 sec
# of episode: 8800, avg score: 24.2, duration: 0.08 sec
# of episode: 8820, avg score: 14.2, duration: 0.08 sec
# of episode: 8840, avg score: 9.3, duration: 0.08 sec
# of episode: 8860, avg score: 9.2, duration: 0.08 sec
# of episode: 8880, avg score: 9.4, duration: 0.09 sec
# of episode: 8900, avg score: 9.3, duration: 0.08 sec
# of episode: 8920, avg score: 9.2, duration: 0.08 sec
# of episode: 8940, avg score: 9.4, duration: 0.09 sec
# of episode: 8960, avg score: 9.4, duration: 0.08 sec
# of episode: 8980, avg score: 9.2, duration: 0.09 sec
# of episode: 9000, avg score: 9.4, duration: 0.09 sec
# of episode: 9020, avg score: 9.4, duration: 0.10 sec
# of episode: 9040, avg score: 9.2, duration: 0.09 sec
# o