In [25]:
import gym
import random
import numpy as np
from collections import deque


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [26]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Build the environment and report the observation shape and the number of
# discrete actions it exposes.
env = gym.make("MountainCar-v0")
print("state shape : ", env.observation_space.shape)
print("# of action : ", env.action_space.n)

WARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.
state shape :  (2,)
# of action :  3


- state정보는 `position`, `velocity`
- action은 `left`(0), `nothing(가만히)`(1), `right`(2)

In [27]:
# Hyperparameter settings.
# Add new entries here whenever another knob is needed.

# --- problem dimensions, read from the environment ---
N_STATES = env.observation_space.shape[0]  # 2
N_ACTIONS = env.action_space.n  # 3

# --- replay buffer / optimisation ---
MEMORY_CAPACITY = 2000
batch_size = 32
learning_rate = 0.01

# --- DQN specifics ---
EPSILON = 0.9
GAMMA = 0.9
TARGET_REPLACE_ITER = 100

# 0 when a sampled action is a plain Python int, otherwise the shape of the
# sampled action.
if isinstance(env.action_space.sample(), int):
    ENV_A_SHAPE = 0
else:
    ENV_A_SHAPE = env.action_space.sample().shape

### Make Network 

In [28]:
class Net(nn.Module):
    """Small MLP that maps a state vector to one Q-value per action.

    Layer sizes are ``N_STATES -> 16 -> 16 -> N_ACTIONS``. Right after each
    linear layer is created its weight matrix is re-drawn from a normal
    distribution with mean 0 and std 0.1; biases keep the default init.
    """

    def __init__(self):
        super().__init__()

        self.l1 = nn.Linear(N_STATES, 16)
        self.l1.weight.data.normal_(mean=0, std=0.1)  # weight initialisation

        self.l2 = nn.Linear(16, 16)
        self.l2.weight.data.normal_(mean=0, std=0.1)

        self.out = nn.Linear(16, N_ACTIONS)
        self.out.weight.data.normal_(mean=0, std=0.1)

    def forward(self, x):
        """Return the Q-values for the batch of states ``x``."""
        hidden = F.relu(self.l1(x))
        hidden = F.relu(self.l2(hidden))
        return self.out(hidden)  # q value

In [29]:
# Sanity check: make sure the network produces output.
# Also a reminder of what torch.gather does: along dim=1 it picks, for each
# row, the entry at the given index -- index 0 here (see the example output).
net = Net()
out = net(torch.Tensor([[1, 2]]))
print(out)
print(torch.max(out, dim=1))
torch.gather(out, 1, torch.LongTensor([[0]]))

tensor([[0.0640, 0.2357, 0.0432]], grad_fn=<AddmmBackward>)
(tensor([0.2357], grad_fn=<MaxBackward0>), tensor([1]))


tensor([[0.0640]], grad_fn=<GatherBackward>)

### Make Agent (DQN)

In [30]:
class DQNAgent:
    """DQN agent with a replay buffer and a periodically synced target network.

    Relies on the notebook-level names ``Net``, ``N_STATES``, ``N_ACTIONS``,
    ``MEMORY_CAPACITY``, ``EPSILON``, ``TARGET_REPLACE_ITER``, ``batch_size``,
    ``GAMMA`` and ``learning_rate``.
    """

    def __init__(self):
        # Two networks: the prediction network is trained, the target network
        # only supplies bootstrap targets and is refreshed periodically.
        self.target_network, self.prediction_network = Net(), Net()

        self.learn_step_counter = 0  # number of learn() updates, drives target sync
        self.memory_counter = 0  # total number of transitions stored so far
        # One row per transition: [state, action, reward, next_state].
        self.memory = np.zeros((MEMORY_CAPACITY, N_STATES * 2 + 2))
        self.optimizer = optim.Adam(self.prediction_network.parameters(), lr=learning_rate)
        self.loss_func = nn.MSELoss()

    def select_action(self, state):
        """Pick an action for ``state`` with an epsilon-greedy rule.

        ``EPSILON`` is the probability of acting greedily; with probability
        ``1 - EPSILON`` a uniformly random action is returned instead.

        Returns:
            int: index of the chosen action.
        """
        if np.random.rand() >= EPSILON:
            return int(np.random.choice(N_ACTIONS))

        state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
        # Action selection never needs gradients, so skip building the graph.
        with torch.no_grad():
            q_values = self.prediction_network(state)
        return q_values.argmax(dim=1).item()

    def store_transition(self, state, action, reward, next_state):
        """Write one transition into the ring buffer, overwriting the oldest."""
        transition = np.hstack([state, [action, reward], next_state])
        idx = self.memory_counter % MEMORY_CAPACITY
        self.memory[idx, :] = transition
        self.memory_counter += 1

    def learn(self):
        """Run one gradient step on a random mini-batch from the replay buffer.

        Does nothing when no transition has been stored yet.
        """
        # Only sample rows that actually hold a transition.
        filled = min(self.memory_counter, MEMORY_CAPACITY)
        if filled == 0:
            return

        # Sync the target network every TARGET_REPLACE_ITER updates. The
        # counter has to advance on every call; otherwise it stays at 0 and
        # the target network is overwritten on each update.
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_network.load_state_dict(self.prediction_network.state_dict())
        self.learn_step_counter += 1

        # Sample a mini-batch (with replacement) and split it into columns.
        sample_idx = np.random.choice(filled, batch_size)
        batch_memory = self.memory[sample_idx, :]
        b_state = torch.FloatTensor(batch_memory[:, :N_STATES])
        b_action = torch.LongTensor(batch_memory[:, N_STATES:N_STATES + 1].astype(np.int64))
        b_reward = torch.FloatTensor(batch_memory[:, N_STATES + 1:N_STATES + 2])
        b_next_state = torch.FloatTensor(batch_memory[:, N_STATES + 2:])

        # Q(s, a) of the actions that were actually taken.
        q_prediction = self.prediction_network(b_state).gather(dim=1, index=b_action)
        # Targets come from the frozen network and must not receive gradients.
        q_prime = self.target_network(b_next_state).detach()
        # NOTE(review): transitions carry no `done` flag, so the next state is
        # always bootstrapped, even when it ended the episode.
        q_target = b_reward + GAMMA * q_prime.max(dim=1, keepdim=True)[0]
        loss = self.loss_func(q_prediction, q_target)  # order: (prediction, target)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

print("Ready to Go!")

Ready to Go!


In [31]:
# Instantiate the agent: this builds both networks, the replay buffer and the optimizer.
dqn = DQNAgent()

In [35]:
# Training loop: interact with the environment for 200 episodes, store every
# transition, and start updating the network once the replay buffer has been
# filled past MEMORY_CAPACITY.
print("Collecting Expriences...")
ep_reward_ls = []  # one entry per finished episode, index == episode number
try:
    for i_episode in range(200):
        state = env.reset()
        episode_reward = 0
        done = False
        while not done:
            env.render()
            action = dqn.select_action(state)
            next_state, reward, done, info = env.step(action)

            dqn.store_transition(state, action, reward, next_state)
            episode_reward += reward

            # Learn only after the buffer holds enough transitions.
            if dqn.memory_counter > MEMORY_CAPACITY:
                dqn.learn()

            state = next_state

        # Log every episode, including the warm-up ones, so that
        # ep_reward_ls lines up with the episode index.
        print("Episode : {}, Episode reward : {}..".format(i_episode, round(episode_reward,2)) )
        ep_reward_ls.append(episode_reward)
finally:
    # Close the environment even if the loop is interrupted or raises.
    env.close()

Collecting Expriences...
Episode : 0, Episode reward : -200.0..
Episode : 1, Episode reward : -200.0..
Episode : 2, Episode reward : -200.0..
Episode : 3, Episode reward : -200.0..
Episode : 4, Episode reward : -200.0..
Episode : 5, Episode reward : -200.0..
Episode : 6, Episode reward : -200.0..
Episode : 7, Episode reward : -200.0..
Episode : 8, Episode reward : -200.0..
Episode : 9, Episode reward : -200.0..
Episode : 10, Episode reward : -200.0..
Episode : 11, Episode reward : -200.0..
Episode : 12, Episode reward : -200.0..
Episode : 13, Episode reward : -200.0..
Episode : 14, Episode reward : -200.0..
Episode : 15, Episode reward : -200.0..
Episode : 16, Episode reward : -200.0..
Episode : 17, Episode reward : -200.0..
Episode : 18, Episode reward : -200.0..
Episode : 19, Episode reward : -200.0..
Episode : 20, Episode reward : -200.0..
Episode : 21, Episode reward : -200.0..
Episode : 22, Episode reward : -200.0..
Episode : 23, Episode reward : -200.0..
Episode : 24, Episode rew

In [34]:
# Close the environment created with gym.make.
# NOTE(review): the training cell already ends with env.close(); this separate
# cell looks like a manual cleanup after an interrupted run -- confirm and remove.
env.close()