In [20]:
import numpy as np
import gym
from collections import namedtuple
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# One replay-buffer record. `next_state` is stored as None for the step that
# ends an episode (see the training loop).
Transition = namedtuple("Transition", ["state", "action", "next_state", "reward"])

ENV = "CartPole-v0"   # environment id handed to gym.make
GAMMA = 0.99          # discount factor applied to bootstrapped values
MAX_STEPS = 200       # per-episode step cap used by the training loop
NUM_EPISODES = 500    # upper bound on the number of training episodes

In [None]:
# DDQN (Double Deep Q-Network)

In [3]:
class ReplayMemory:
    """Fixed-capacity ring buffer of ``Transition`` records for experience replay."""

    def __init__(self, CAPACITY):
        self.capacity = CAPACITY  # maximum number of stored transitions
        self.memory = []          # grows up to `capacity`, then is overwritten in place
        self.index = 0            # slot the next push will write to

    def push(self, state, action, state_next, reward):
        """Store one transition, overwriting the oldest entry once the buffer is full."""
        record = Transition(state, action, state_next, reward)
        if len(self.memory) < self.capacity:
            # While filling, `index` equals len(memory), so appending writes
            # exactly the slot an indexed store would.
            self.memory.append(record)
        else:
            self.memory[self.index] = record
        self.index = (self.index + 1) % self.capacity

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions picked uniformly at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)


In [4]:
class Net(nn.Module):
    """Fully connected network: two ReLU hidden layers of width `n_mid`, linear output of width `n_out`."""

    def __init__(self, n_in, n_mid, n_out):
        super().__init__()
        self.fc1 = nn.Linear(n_in, n_mid)
        self.fc2 = nn.Linear(n_mid, n_mid)
        self.fc3 = nn.Linear(n_mid, n_out)

    def forward(self, x):
        """Map a batch of inputs (last dim `n_in`) to raw, un-activated outputs."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)



In [5]:
BATCH_SIZE = 32   # transitions drawn per learning step
CAPACITY = 10000  # maximum number of transitions kept in replay memory


class Brain:
    """Double-DQN learner.

    Owns the replay memory, a main Q-network that is trained with Adam, and a
    target Q-network that only changes when ``update_target_q_network`` copies
    the main network's weights into it.
    """

    def __init__(self, num_states, num_actions):
        """Build both networks (hidden width 32) and the optimizer.

        Args:
            num_states: size of one observation vector (network input width).
            num_actions: number of discrete actions (network output width).
        """
        self.num_actions = num_actions
        self.memory = ReplayMemory(CAPACITY)

        n_in, n_mid, n_out = num_states, 32, num_actions
        self.main_q_network = Net(n_in, n_mid, n_out)
        self.target_q_network = Net(n_in, n_mid, n_out)
        print(self.main_q_network)

        self.optimizer = optim.Adam(self.main_q_network.parameters(), lr=1e-4)

    def replay(self):
        """Run one learning step on a sampled mini-batch.

        Does nothing until at least BATCH_SIZE transitions have been stored.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        (self.batch, self.state_batch, self.action_batch,
         self.reward_batch, self.non_final_next_states) = self.make_minibatch()
        self.expected_state_action_values = self.get_expected_state_action_values()
        self.update_main_q_network()

    def decide_action(self, state, episode):
        """Epsilon-greedy action selection with epsilon = 0.5 / (episode + 1).

        Returns:
            A 1x1 LongTensor holding the chosen action index.
        """
        epsilon = 0.5 * (1 / (episode + 1))

        if epsilon <= np.random.uniform(0, 1):
            # Exploit: greedy action under the main network.
            self.main_q_network.eval()
            with torch.no_grad():
                action = self.main_q_network(state).max(1)[1].view(1, 1)
        else:
            # Explore: uniformly random action.
            action = torch.LongTensor([[random.randrange(self.num_actions)]])

        return action

    def make_minibatch(self):
        """Sample BATCH_SIZE transitions and collate them into batched tensors.

        Returns:
            (batch, state_batch, action_batch, reward_batch, non_final_next_states)
            where ``batch`` is a Transition of tuples and
            ``non_final_next_states`` concatenates every next state that is not
            None, or is None itself when all sampled transitions are terminal.
        """
        transitions = self.memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        # torch.cat raises on an empty list, so an all-terminal batch is
        # represented by None instead.
        next_states = [s for s in batch.next_state if s is not None]
        non_final_next_states = torch.cat(next_states) if next_states else None
        return batch, state_batch, action_batch, reward_batch, non_final_next_states

    def get_expected_state_action_values(self):
        """Compute the Double-DQN targets r + GAMMA * Q_target(s', argmax_a Q_main(s', a)).

        Side effect: stores Q_main(s, a) for the taken actions in
        ``self.state_action_values`` (with gradients) for the loss.

        Returns:
            1-D tensor of targets, one per sampled transition; terminal
            transitions contribute only their reward.
        """
        self.main_q_network.eval()
        self.target_q_network.eval()

        self.state_action_values = self.main_q_network(self.state_batch).gather(1, self.action_batch)

        # Boolean mask (uint8 mask indexing is deprecated in PyTorch).
        non_final_mask = torch.tensor(
            [s is not None for s in self.batch.next_state], dtype=torch.bool)
        next_state_values = torch.zeros(len(self.batch.next_state))

        if self.non_final_next_states is not None:
            with torch.no_grad():
                # Main network picks the action, target network scores it.
                best_actions = self.main_q_network(self.non_final_next_states).max(1)[1].view(-1, 1)
                next_state_values[non_final_mask] = (
                    self.target_q_network(self.non_final_next_states)
                    .gather(1, best_actions)
                    .squeeze(1)
                )

        expected_state_action_values = self.reward_batch + GAMMA * next_state_values
        return expected_state_action_values

    def update_main_q_network(self):
        """Take one optimizer step on the smooth-L1 loss between Q(s, a) and the targets."""
        self.main_q_network.train()
        loss = F.smooth_l1_loss(self.state_action_values, self.expected_state_action_values.unsqueeze(1))
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_target_q_network(self):
        """Copy the main network's weights into the target network."""
        self.target_q_network.load_state_dict(self.main_q_network.state_dict())

In [9]:
class Agent:
    """Facade used by the training loop; every call is forwarded to a ``Brain``."""

    def __init__(self, num_states, num_actions):
        self.brain = Brain(num_states, num_actions)

    def update_q_function(self):
        """Trigger one replay-based learning step on the brain."""
        self.brain.replay()

    def get_action(self, state, episode):
        """Return the brain's action choice for `state` at episode number `episode`."""
        return self.brain.decide_action(state, episode)

    def memorize(self, state, action, state_next, reward):
        """Push one transition into the brain's replay memory."""
        replay_buffer = self.brain.memory
        replay_buffer.push(state, action, state_next, reward)

    def update_target_q_function(self):
        """Ask the brain to refresh its target network."""
        self.brain.update_target_q_network()

In [12]:
class Environment:
    """Training harness that runs an ``Agent`` on the Gym environment named by ``ENV``.

    Reward shaping used by ``run``: 0 for every non-terminal step, -1 when an
    episode ends before step index 195, +1 otherwise. Training stops one
    episode after 10 consecutive +1 episodes, or after NUM_EPISODES episodes.
    """

    def __init__(self):
        self.env = gym.make(ENV)
        num_states = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.n
        self.agent = Agent(num_states, num_actions)

    @staticmethod
    def _to_state(observation):
        """Turn a NumPy observation into a float tensor with a leading batch dim of 1."""
        return torch.from_numpy(observation).type(torch.FloatTensor).unsqueeze(0)

    def _reset(self):
        """Reset the env and return only the observation.

        Newer Gym releases return ``(observation, info)`` from ``reset``;
        older ones return the observation alone. Both are accepted.
        """
        result = self.env.reset()
        return result[0] if isinstance(result, tuple) else result

    def _step(self, action):
        """Step the env and return ``(observation, done)``.

        Accepts both the 4-tuple ``(obs, reward, done, info)`` and the 5-tuple
        ``(obs, reward, terminated, truncated, info)`` step results. The
        environment's own reward is discarded; ``run`` shapes its own.
        """
        result = self.env.step(action)
        if len(result) == 5:
            observation, _, terminated, truncated, _ = result
            return observation, bool(terminated or truncated)
        observation, _, done, _ = result
        return observation, done

    def run(self):
        """Train the agent; the environment is closed when this returns or raises."""
        episode_10_list = np.zeros(10)  # step counts of the 10 most recent episodes
        complete_episodes = 0           # consecutive episodes that reached step index 195
        episode_final = False           # set once the success streak hits 10

        try:
            for episode in range(NUM_EPISODES):
                state = self._to_state(self._reset())

                for step in range(MAX_STEPS):
                    action = self.agent.get_action(state, episode)
                    observation_next, done = self._step(action.item())

                    if done:
                        # Terminal transition: no next state to bootstrap from.
                        state_next = None
                        episode_10_list = np.hstack((episode_10_list[1:], step + 1))

                        if step < 195:
                            reward = torch.FloatTensor([-1.0])
                            complete_episodes = 0
                        else:
                            reward = torch.FloatTensor([1.0])
                            complete_episodes = complete_episodes + 1
                    else:
                        reward = torch.FloatTensor([0.0])
                        state_next = self._to_state(observation_next)

                    self.agent.memorize(state, action, state_next, reward)
                    self.agent.update_q_function()
                    state = state_next

                    if done:
                        print(f'{episode} episode finished after {step+1} steps , average of 10 try {episode_10_list.mean()}')
                        # Target network is refreshed at the end of every even episode.
                        if episode % 2 == 0:
                            self.agent.update_target_q_function()
                        break

                if episode_final:
                    break

                if complete_episodes >= 10:
                    print('seikou')
                    # One more episode is run before the loop stops.
                    episode_final = True
        finally:
            # Release the environment even if training never converges or fails.
            self.env.close()

In [15]:
# cartpole_env = Environment()
# cartpole_env.run()

In [14]:
# Dueling network

In [22]:
class Net(nn.Module):
    """Dueling Q-network: shared two-layer trunk feeding an advantage head and a value head.

    NOTE: this notebook defines more than one class called ``Net``; whichever
    cell ran last is the one bound to the name.
    """

    def __init__(self, n_in, n_mid, n_out):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(n_in, n_mid)
        self.fc2 = nn.Linear(n_mid, n_mid)
        self.fc3_adv = nn.Linear(n_mid, n_out)  # per-action advantage A(s, a)
        self.fc3_v = nn.Linear(n_mid, 1)        # state value V(s)

    def forward(self, x):
        """Return Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) for a batch of inputs.

        Args:
            x: tensor whose last dimension is `n_in`, with a leading batch dimension.

        Returns:
            Tensor with one column per action (`n_out` columns).
        """
        # The trunk layers must actually be applied; feeding the raw input to
        # the heads breaks whenever n_in != n_mid.
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))

        adv = self.fc3_adv(h2)
        val = self.fc3_v(h2)

        # Broadcasting expands the (batch, 1) value and mean across actions.
        out = val + adv - adv.mean(1, keepdim=True)
        return out

In [24]:
# Prioritized experience replay

In [26]:
TD_ERROR_EPSILON = 0.0001  # added to every |TD error| so zero-error entries stay sampleable


class TDerrorMemory:
    """Ring buffer of TD errors used to draw replay indexes by priority."""

    def __init__(self, CAPACITY):
        self.capacity = CAPACITY  # maximum number of stored TD errors
        self.memory = []          # grows up to `capacity`, then is overwritten in place
        self.index = 0            # slot the next push will write to

    def push(self, td_error):
        """Store one TD error, overwriting the oldest entry once the buffer is full."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)

        self.memory[self.index] = td_error
        self.index = (self.index + 1) % self.capacity

    def __len__(self):
        """Number of TD errors currently held."""
        return len(self.memory)

    def get_prioritized_indexes(self, batch_size):
        """Draw `batch_size` indexes with probability proportional to |TD error| + TD_ERROR_EPSILON.

        Sampling is with replacement; the result is in non-decreasing order.

        Args:
            batch_size: number of indexes to draw.

        Returns:
            List of ints, each in ``range(len(self))``.

        Raises:
            ValueError: if no TD error has been stored yet.
        """
        if not self.memory:
            raise ValueError("cannot sample indexes from an empty TDerrorMemory")

        priorities = np.absolute(np.asarray(self.memory, dtype=float)) + TD_ERROR_EPSILON
        cumulative = np.cumsum(priorities)

        rand_list = np.sort(np.random.uniform(0, cumulative[-1], batch_size))

        # Entry i owns the interval (cumulative[i-1], cumulative[i]];
        # side="left" maps each draw to the entry whose interval contains it.
        indexes = np.searchsorted(cumulative, rand_list, side="left")
        # Guard against a draw landing a rounding error past the final total.
        indexes = np.minimum(indexes, len(self.memory) - 1)
        return indexes.tolist()

In [27]:
# A2C (Advantage Actor-Critic)

In [28]:
class RolloustStorage(object):
    """Fixed-length rollout buffer for multi-step advantage learning.

    Holds `num_steps` steps of rewards and actions for `num_processes`
    parallel environments, plus `num_steps + 1` observations, masks and
    returns (the extra slot carries the bootstrap state/value).
    """

    def __init__(self, num_steps, num_processes, obs_shape):
        """Allocate the buffers.

        Args:
            num_steps: rollout length.
            num_processes: number of parallel environments.
            obs_shape: observation size, either an int or a shape tuple;
                None falls back to 4, the size that used to be hard-coded.
        """
        if obs_shape is None:
            obs_dims = (4,)
        else:
            try:
                obs_dims = tuple(obs_shape)
            except TypeError:
                obs_dims = (int(obs_shape),)

        self.num_steps = num_steps
        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_dims)
        self.masks = torch.ones(num_steps + 1, num_processes, 1)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.actions = torch.zeros(num_steps, num_processes, 1).long()

        # One extra slot so returns[-1] can hold the bootstrap value.
        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.index = 0

    def insert(self, current_obs, action, reward, mask):
        """Record one step: the action and reward at `index`, the resulting obs and mask at `index + 1`."""
        self.observations[self.index + 1].copy_(current_obs)
        self.masks[self.index + 1].copy_(mask)
        self.rewards[self.index].copy_(reward)
        self.actions[self.index].copy_(action)

        # Wrap at the rollout length so the next rollout reuses the buffers.
        self.index = (self.index + 1) % self.num_steps

    def after_update(self):
        """Carry the last observation and mask over as the start of the next rollout."""
        self.observations[0].copy_(self.observations[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_values):
        """Fill `returns` with discounted returns, working backwards from the bootstrap value.

        Args:
            next_values: value estimate for the state after the last step,
                broadcastable to (num_processes, 1).
        """
        self.returns[-1] = next_values
        for ad_step in reversed(range(self.rewards.size(0))):
            # A mask of 0 cuts the bootstrap across that step.
            self.returns[ad_step] = (
                self.returns[ad_step + 1] * GAMMA * self.masks[ad_step + 1]
                + self.rewards[ad_step]
            )


# Correctly spelled alias; the original name is kept for existing callers.
RolloutStorage = RolloustStorage

In [29]:
class Net(nn.Module):
    """Actor-critic network: shared two-layer trunk with a policy head and a value head.

    NOTE: this notebook defines more than one class called ``Net``; whichever
    cell ran last is the one bound to the name.
    """

    def __init__(self, n_in, n_mid, n_out):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(n_in, n_mid)
        self.fc2 = nn.Linear(n_mid, n_mid)
        self.actor = nn.Linear(n_mid, n_out)  # one logit per action
        self.critic = nn.Linear(n_mid, 1)     # scalar state value

    def forward(self, x):
        """Return ``(critic_out, actor_out)``: state values and action logits for a batch."""
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        critic_out = self.critic(h2)
        actor_out = self.actor(h2)

        return critic_out, actor_out

    def act(self, x):
        """Sample one action per row from the softmax policy; returns a (batch, 1) index tensor."""
        value, actor_out = self(x)
        action_probs = F.softmax(actor_out, dim=1)
        action = action_probs.multinomial(num_samples=1)
        return action

    def get_values(self, x):
        """Return only the critic's state-value estimates."""
        value, actor_output = self(x)
        return value

    def evaluate_actions(self, x, actions):
        """Score given actions under the current policy.

        Args:
            x: batch of states.
            actions: (batch, 1) LongTensor of action indexes.

        Returns:
            (value, action_log_probs, entropy): state values, the log
            probability of each given action, and the mean policy entropy
            over the batch.
        """
        value, actor_out = self(x)
        log_probs = F.log_softmax(actor_out, dim=1)
        action_log_probs = log_probs.gather(1, actions)

        probs = F.softmax(actor_out, dim=1)
        entropy = -(log_probs * probs).sum(-1).mean()
        return value, action_log_probs, entropy