In [1]:

import gym
import random
import collections
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [12]:
# Hyperparameters
lr_mu = 5e-4          # learning rate of the policy (mu) optimizer
lr_q = 1e-3           # learning rate of the Q-network optimizer
gamma = 0.99          # discount factor applied to the bootstrapped target
batch_size = 32       # number of transitions sampled per training step
buffer_limit = 50000  # maximum number of transitions kept in the replay buffer
tau = 5e-3            # interpolation weight of the target-network soft update

# DDPG 의 핵심

1. 데이터를 ReplayBuffer에 집어 넣은 후 random으로 뽑아 학습을 진행함으로써 Correlation을 완화 함

2. Deterministic Policy Network

3. Q-Network가 이상적인 Q 함수를 추정해 나가는 과정에서 Target Q 와 Target Pi 를 함께 둔다. DQN처럼 Target 네트워크를 주기적으로 통째로 복사하는 대신, Target Q 와 Target Pi 는 soft update 를 통해 점진적으로 변한다.

4. Noise for exploration

# ReplayBuffer 구현

In [11]:
class ReplayBuffer():
    """Fixed-capacity FIFO store of transitions with uniform random sampling.

    Each stored transition is a tuple ``(s, a, r, s_prime, done)``. Training on
    randomly drawn transitions instead of consecutive ones reduces the
    correlation between the samples of a mini-batch.
    """

    def __init__(self, capacity=None):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions kept; once it is reached
                the oldest transition is dropped for every new one. ``None``
                (the default) uses the module-level ``buffer_limit``.
        """
        if capacity is None:
            capacity = buffer_limit
        self.buffer = collections.deque(maxlen=capacity)

    def put(self, transition):
        """Append one ``(s, a, r, s_prime, done)`` tuple to the buffer."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions uniformly at random.

        Args:
            n: Number of transitions to draw.

        Returns:
            Tuple ``(s, a, r, s_prime, done_mask)`` of float tensors whose
            first dimension is ``n``. ``a``, ``r`` and ``done_mask`` get a
            trailing dimension of size 1. ``done_mask`` is 0 where the stored
            ``done`` flag was truthy and 1 otherwise, so it can be multiplied
            into a bootstrapped target.

        Raises:
            ValueError: Propagated from ``random.sample`` when ``n`` is larger
                than the number of stored transitions.
        """
        mini_batch = random.sample(self.buffer, n)

        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        for s, a, r, s_prime, done in mini_batch:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([0.0 if done else 1.0])

        def to_float_tensor(rows):
            # Stack into a single ndarray first: building a tensor straight
            # from a Python list of ndarrays converts element by element.
            return torch.tensor(np.array(rows), dtype=torch.float)

        return (to_float_tensor(s_lst),
                to_float_tensor(a_lst),
                to_float_tensor(r_lst),
                to_float_tensor(s_prime_lst),
                to_float_tensor(done_mask_lst))

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)


# Backward-compatible alias for the original (misspelled) class name.
RelayBuffer = ReplayBuffer

# Deterministic Policy Network and Q-Net 구현

In [13]:
class MuNet(nn.Module):
    """Deterministic policy network: maps a state to a bounded action.

    Two ReLU hidden layers (128 and 64 units) are followed by a tanh output
    that is scaled by ``max_action``, so every action component lies in
    ``[-max_action, max_action]``.

    Args:
        state_dim: Size of the state vector (default 3).
        action_dim: Size of the action vector (default 1).
        max_action: Scale applied to the tanh output (default 2.0).

    The defaults reproduce the previously hard-coded 3 -> 1 network with
    outputs scaled by 2.
    """

    def __init__(self, state_dim=3, action_dim=1, max_action=2.0):
        super().__init__()
        # Plain attribute, not a parameter/buffer: it is not part of state_dict.
        self.max_action = max_action
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc_mu = nn.Linear(64, action_dim)

    def forward(self, x):
        """Return the action for state(s) ``x`` (last dimension ``state_dim``)."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # tanh squashes to [-1, 1]; scaling stretches it to the action bound.
        mu = torch.tanh(self.fc_mu(x)) * self.max_action

        return mu
    
class QNet(nn.Module):
    """Action-value network: estimates ``Q(s, a)`` for a state/action pair.

    State and action are embedded separately (64 units each, ReLU), the two
    embeddings are concatenated and passed through a 32-unit ReLU layer and a
    linear output producing a single value.

    Args:
        state_dim: Size of the state vector (default 3).
        action_dim: Size of the action vector (default 1).

    The defaults reproduce the previously hard-coded 3-dimensional state and
    1-dimensional action inputs.
    """

    def __init__(self, state_dim=3, action_dim=1):
        super().__init__()
        self.fc_s = nn.Linear(state_dim, 64)
        self.fc_a = nn.Linear(action_dim, 64)
        self.fc_q = nn.Linear(128, 32)
        self.fc_out = nn.Linear(32, 1)

    def forward(self, x, a):
        """Return ``Q(x, a)`` with a trailing dimension of size 1.

        ``x`` and ``a`` must agree in all leading dimensions; their last
        dimensions are ``state_dim`` and ``action_dim`` respectively.
        """
        h1 = F.relu(self.fc_s(x))
        h2 = F.relu(self.fc_a(a))
        # Join on the feature axis; -1 equals the former dim=1 for batched
        # 2-D inputs and additionally accepts unbatched 1-D inputs.
        cat = torch.cat([h1, h2], dim=-1)
        q = F.relu(self.fc_q(cat))
        q = self.fc_out(q)
        return q

# Noise 구현

## 목적
DDPG 는 Deterministic Policy 를 사용하기 때문에 Exploration이 발생할 가능성이 낮다. 그러므로 잡음을 더함으로써 Exploration을 유도한다.

## Ornstein-Uhlenbeck
$\epsilon_{t+1}=\epsilon_t+\alpha(\mu-\epsilon_t)\Delta t + \sigma\sqrt{\Delta t}\, n_t$

$\alpha: \;rate \;of \;mean \;reversion$ (코드의 `theta`)

$\mu: \;mean \;of \;noise$

$\sigma: \;scale \;of \;noise$

$\Delta t: \;time \;increment$

$n_t:\; Normal(0,1) \;white \;noise$

In [32]:
class OrnsteinUhlenbeckNoise:
    """Stateful exploration noise following an Ornstein-Uhlenbeck update.

    Every call advances the internal state by one step of

        x <- x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)

    and returns the new state, so successive samples depend on each other.
    """

    def __init__(self, mu):
        """Start the process at zero.

        Args:
            mu: Array the process is pulled towards; its shape is also the
                shape of every returned sample.
        """
        self.mu = mu
        self.theta = 0.1   # strength of the pull towards mu
        self.dt = 0.01     # time increment of one step
        self.sigma = 0.1   # scale of the Gaussian term
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        """Advance the process by one step and return the new value."""
        pull = self.theta * (self.mu - self.x_prev) * self.dt
        kick = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        x_next = self.x_prev + pull + kick

        # Remember the sample: the next call starts from it.
        self.x_prev = x_next
        return x_next

## Policy Network and Q-Network updating
$\nabla_\theta J(\theta) \approx \frac{1}{N}\sum_{i=1}^N \nabla_\theta Q_\phi(s_i,\pi_\theta(s_i))$

$Loss(\phi)=\frac {1}{2N} \sum_{i=1}^N (r_i+\gamma Q_{target}(s'_i,\pi_{target}(s'_i))-Q_\phi(s_i,a_i))^2$

## Target Policy and Q updating

$\theta' \leftarrow \tau \theta + (1-\tau)\theta'$

$\phi' \leftarrow \tau \phi + (1-\tau)\phi'$
    

In [39]:
def train(mu,mu_target,q,q_target,memory,q_optimizer,mu_optimizer):
    """Perform one gradient step on the Q-network and one on the policy.

    A mini-batch of ``batch_size`` transitions is sampled from ``memory``.
    The Q-network is fitted with a smooth-L1 loss to the one-step target
    ``r + gamma * Q_target(s', mu_target(s')) * done_mask``; afterwards the
    policy is updated to maximize ``Q(s, mu(s))``.

    Args:
        mu: Policy network being trained.
        mu_target: Target policy network (only evaluated here).
        q: Q-network being trained.
        q_target: Target Q-network (only evaluated here).
        memory: Object whose ``sample(n)`` returns
            ``(s, a, r, s_prime, done_mask)`` tensors.
        q_optimizer: Optimizer stepping the parameters of ``q``.
        mu_optimizer: Optimizer stepping the parameters of ``mu``.
    """
    s,a,r,s_prime,done_mask = memory.sample(batch_size)

    # The target networks are not optimized here, so the target is computed
    # without recording an autograd graph.
    with torch.no_grad():
        target = r + gamma*q_target(s_prime,mu_target(s_prime))*done_mask
    q_loss = F.smooth_l1_loss(q(s,a),target)
    q_optimizer.zero_grad()
    q_loss.backward()
    q_optimizer.step()

    # Minimizing -Q(s, mu(s)) is gradient ascent on Q with respect to mu.
    # backward() also deposits gradients on q's parameters; they are never
    # applied because q_optimizer.zero_grad() runs before q's next step.
    mu_loss = -q(s,mu(s)).mean()
    mu_optimizer.zero_grad()
    mu_loss.backward()
    mu_optimizer.step()
    
    
def soft_update(net,net_target):
    """Move each parameter of ``net_target`` a fraction ``tau`` towards ``net``.

    Every target parameter is overwritten in place with
    ``(1 - tau) * target + tau * source``; ``net`` itself is not modified.
    Parameters are paired in the order ``parameters()`` yields them.
    """
    keep = 1. - tau
    for source, target in zip(net.parameters(), net_target.parameters()):
        blended = target.data*keep + tau*source.data
        target.data.copy_(blended)
    

# Main 

In [42]:
def main():
    """Train a DDPG agent on ``Pendulum-v1`` and print the average score.

    Each episode is rolled out with the current policy plus
    Ornstein-Uhlenbeck noise, and every transition is stored in the replay
    buffer with its reward divided by 100. After an episode, once the buffer
    holds more than 2000 transitions, 10 training steps are run, each followed
    by a soft update of both target networks. Every ``print_interval``
    episodes the mean (unscaled) episode score since the last report is
    printed. The environment is closed even if training is interrupted.
    """
    env=gym.make('Pendulum-v1')
    try:
        memory=ReplayBuffer()

        q, q_target = QNet(), QNet()
        q_target.load_state_dict(q.state_dict())

        mu,mu_target=MuNet(),MuNet()
        mu_target.load_state_dict(mu.state_dict())

        score=0.
        # Number of episodes accumulated in `score`. Dividing by this instead
        # of print_interval keeps the first report correct: it covers
        # episodes 0..print_interval, i.e. one episode more than later ones.
        episodes_scored=0
        print_interval=20

        mu_optimizer=optim.Adam(mu.parameters(),lr=lr_mu)
        q_optimizer=optim.Adam(q.parameters(),lr=lr_q)
        ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(1))

        for n_epi in range(10000):
            # Newer gym versions return (observation, info) from reset();
            # older ones return the observation only.
            reset_out=env.reset()
            s=reset_out[0] if isinstance(reset_out,tuple) else reset_out
            done=False

            while not done:
                # Acting needs no gradients.
                with torch.no_grad():
                    a=mu(torch.from_numpy(s).float())
                a=a.item()+ou_noise()[0]
                # The environment returns numpy values. Newer gym versions
                # split `done` into (terminated, truncated).
                step_out=env.step([a])
                if len(step_out)==5:
                    s_prime,r,terminated,truncated,info=step_out
                    done=terminated or truncated
                else:
                    s_prime,r,done,info=step_out
                memory.put((s,a,r/100.,s_prime,done))
                score+=r
                s=s_prime
            episodes_scored+=1

            # Wait until the replay buffer has grown to a minimum size, then
            # train on randomly sampled mini-batches.
            if memory.size()>2000:
                for i in range(10):
                    train(mu,mu_target,q,q_target,memory,q_optimizer,mu_optimizer)
                    soft_update(mu,mu_target)
                    soft_update(q,q_target)

            if n_epi%print_interval==0 and n_epi!=0:
                print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/episodes_scored))
                score = 0.0
                episodes_scored = 0
    finally:
        # Release the environment even on an exception or KeyboardInterrupt.
        env.close()

In [41]:
# Start training only when this module is the entry point (__name__ is
# '__main__'), not when it is imported from elsewhere.
if __name__ == '__main__':
    main()

# of episode :20, avg score : -1408.0
# of episode :40, avg score : -1454.2
# of episode :60, avg score : -1630.4
# of episode :80, avg score : -1529.0
# of episode :100, avg score : -1531.8
# of episode :120, avg score : -1504.9
# of episode :140, avg score : -1332.6
# of episode :160, avg score : -1037.2
# of episode :180, avg score : -1452.9
# of episode :200, avg score : -1128.8
# of episode :220, avg score : -1016.0
# of episode :240, avg score : -983.2
# of episode :260, avg score : -1017.6
# of episode :280, avg score : -1028.3
# of episode :300, avg score : -1015.7
# of episode :320, avg score : -1015.4
# of episode :340, avg score : -965.4
# of episode :360, avg score : -954.4
# of episode :380, avg score : -839.4
# of episode :400, avg score : -776.3
# of episode :420, avg score : -913.3
# of episode :440, avg score : -885.4
# of episode :460, avg score : -1108.7
# of episode :480, avg score : -858.4
# of episode :500, avg score : -813.2
# of episode :520, avg score : -429.5


KeyboardInterrupt: 