In [1]:
import gym
import gym_drone
import numpy as np
import random
from copy import deepcopy
from statistics import mean
from collections import deque, namedtuple

import torch
import torch.nn.functional as F
from torch.autograd import Variable
from torch import nn, optim

use_cuda = torch.cuda.is_available()
use_cuda = False
FloatTensor = torch.cuda.FloatTensor if use_cuda else torch.FloatTensor
LongTensor = torch.cuda.LongTensor if use_cuda else torch.LongTensor
ByteTensor = torch.cuda.ByteTensor if use_cuda else torch.ByteTensor
Tensor = FloatTensor

In [2]:
parameters = {
            "tau" : 0.05,
            "gamma" : 0.99,
            "epsilon_init" : 1,
            "epsilon_decay" : 0.95,
            "epsilon_minimum": 0.01,
            "buffer_size" : 2000,
            "batch_size" : 64,
            "epochs": 1,
            "loss_metric" : "mse",
            "learning_rate" : 0.00025,
            "momentum": 0.95,
            "learning_rate_decay": 0.01,
            "hidden_layer_1": 24,
            "hidden_layer_2": 24
}

env = gym.make("CartPole-v1")
env.reset()
env.render("notebook")

NoSuchDisplayException: Cannot connect to "None"

In [3]:
class ExperienceReplay:
    
    def __init__(self, env, parameters):
        self.env = env
        self.buffer_size = parameters["buffer_size"]
        self.batch_size = parameters["batch_size"]
        # A deque is a buffer which can append and pop from
        # both sides. If it is at max length, any appends
        # will discard the first value to maintain size.
        self.experience_buffer = deque(maxlen=self.buffer_size)
        # Every transition is stores as a named tuple.
        self.experience_tuple = namedtuple("Experience", 
                                           field_names=["state", 
                                                        "action", 
                                                        "reward", 
                                                        "done", 
                                                        "next_state"])
        
    def push(self, state, action, reward, done, next_state):
        transition = self.experience_tuple(state=state, 
                                           action=action, 
                                           reward=reward, 
                                           done=done, 
                                           next_state=next_state)
        self.experience_buffer.append(transition)
        
    def sample(self):
        sample_size = min(len(self.experience_buffer), self.batch_size)
        experiences = random.sample(self.experience_buffer, sample_size)
        
        states = np.asarray([experience.state for experience in experiences])
        actions = np.asarray([experience.action for experience in experiences])
        rewards = np.asarray([experience.reward for experience in experiences])
        dones = np.asarray([experience.done for experience in experiences])
        next_states = np.asarray([experience.next_state for experience in experiences])
        
        batch = self.experience_tuple(
            state=torch.from_numpy(states).type(Tensor),
            action=torch.from_numpy(actions).type(LongTensor),
            reward=torch.from_numpy(rewards).type(Tensor),
            done=torch.from_numpy(dones).type(ByteTensor),
            next_state=torch.from_numpy(next_states).type(Tensor)
        )
        
        return batch
        #return (states, actions, rewards, dones, new_states)
        
    def warm_up(self):
        for _ in range(10):
            state = self.env.reset()
            done = False
            
            while not done:
                action = self.env.action_space.sample()
                next_state, reward, done, _ = self.env.step(action)
                self.push(state, action, reward, done, next_state)
                state = next_state
        
    def __len__(self):
        return len(self.experience_buffer)

#test = ExperienceReplay(env, parameters)
#test.warm_up()
#test.sample()

In [4]:
class DQN(nn.Module):
    
    def __init__(self, env, parameters):
        self.observations_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n
        self.hidden_layer_1 = parameters["hidden_layer_1"]
        self.hidden_layer_2 = parameters["hidden_layer_2"]

        super(DQN, self).__init__()
        # Define network layers
        self.fc1 = nn.Linear(self.observations_size, self.hidden_layer_1)
        self.fc2 = nn.Linear(self.hidden_layer_1, self.hidden_layer_2)
        self.fc3 = nn.Linear(self.hidden_layer_2, self.action_size)
        
    def forward(self, x):
        # Define forward propagation
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x
    
    def get_loss(self):
        return nn.MSELoss()
        
    def get_optim(self, lr):
        return optim.Adam(self.parameters(), lr=lr)

In [5]:
def discrete_to_input_format(env, coordinates):
    # observation_space is a tuple of discrete numbers
    # with a certain amount of possible numbers.
    observation_size = sum([x.n for x in env.observation_space])
    output = [0] * observation_size
    
    start_at = 0
    for index, coordinate in enumerate(coordinates):
        output[coordinate + start_at] = 1
        start_at += env.observation_space[index].n
    return output

$Q(s,a) = r(s,a) + \gamma \max\limits_{a}Q(s',a)$ \
$θ_{target}=τ*θ_{local}+(1-τ)*θ_{target}$

In [None]:
class Agent:

    def __init__(self, env, parameters):
        self.env = env
        self.local_network = DQN(env, parameters)
        self.target_network = deepcopy(self.local_network)
        #self.target_network = DQN(env, parameters)
        self.experience_replay = ExperienceReplay(env, parameters)
        
        self.epsilon = parameters["epsilon_init"]
        self.epsilon_decay = parameters["epsilon_decay"]
        self.epsilon_minimum = parameters["epsilon_minimum"]
        self.tau = parameters["tau"]
        self.gamma = parameters["gamma"]
        self.epochs = parameters["epochs"]
        self.loss_metric = parameters["loss_metric"]
        self.learning_rate = parameters["learning_rate"]
        self.learning_rate_decay = parameters["learning_rate_decay"]
        
    def update_local_network(self):
        #states, actions, rewards, dones, next_states = self.experience_replay.sample()
        batch = self.experience_replay.sample()
        
        state_batch = Variable(batch.state)
        action_batch = Variable(batch.action)
        reward_batch = Variable(batch.reward)
        
        
        non_final = LongTensor([i for i, done in enumerate(batch.done) if not done])
        non_final_mask = (1 - batch.done).bool()
        # To prevent backprop through the target action values, set volatile=False (also sets requires_grad=False)
        with torch.no_grad():
            non_final_next_states = Variable(batch.next_state.index_select(0, non_final))

            # Compute Q(s_t, a), the estimated Q-values, using local network
            Q_state_action = self.local_network(state_batch).gather(1, action_batch.unsqueeze(1))

            # Compute V(s_{t+1}) for all next states.
            V_next_state = Variable(torch.zeros(self.experience_replay.batch_size).type(Tensor))
            _, next_state_actions = self.local_network(non_final_next_states).max(1, keepdim=True)
            V_next_state[non_final_mask] = self.target_network(non_final_next_states).gather(1, next_state_actions)
        
        return
        
        # Compute the target Q values
        target_Q_state_action = reward_batch + (self.gamma * V_next_state)
        
        # Update Q_values with "correct" Q-values calculated using the Q-learning algorithm    
        Q_local_expected = Q_local.clone()
        for row, col_id in enumerate(actions):
            Q_local_expected[row, col_id[0]] = Q_calc[row]
        
        # Train network by minimizing the difference between Q_local and modified Q_local 
        loss = self.local_network.get_loss()
        optimizer = self.local_network.get_optim(self.learning_rate)
        
        optimizer.zero_grad()
        
        loss_size = loss(Q_local, Q_local_expected)
        #print(loss_size)
        loss_size.backward()
        optimizer.step()
        
    def update_target_network(self):
        local_weights = self.local_network.state_dict()
        target_weights = self.target_network.state_dict()
        
        for layer in local_weights:
            target_weights[layer] = self.tau * local_weights[layer] + (1 - self.tau) * target_weights[layer]
        
        self.target_network.load_state_dict(target_weights)
        
    def update_epsilon(self):
        if self.epsilon >= self.epsilon_minimum:
            self.epsilon *= self.epsilon_decay
            
    def select_action(self, state):
        if self.epsilon > np.random.uniform():
            action = self.env.action_space.sample()
        else:
            #network_input = discrete_to_input_format(self.env, state)
            network_input = state
            input_tensor = torch.Tensor(network_input).squeeze(0)
            action = self.local_network(input_tensor).max(0)[1].item()
            #print(self.local_network(input_tensor))
        return action

    def step(self, state):
        action = self.select_action(state)
        next_state, reward, done, _ = self.env.step(action)
        return action, reward, done, next_state

test = Agent(env, parameters)
test.experience_replay.warm_up()
#test.epsilon = 0
#test.select_action(test.env.reset())
print("lol")
test.update_local_network()

In [7]:
def train(agent, iterations, episodes):
    
    total_reward = 0
    total_reward_list, iterations_list = [], []
    agent.experience_replay.warm_up()
    
    for episode in range(episodes):
        
        state = env.reset()
        total_reward=0
        
        if (episode != 0): 
            agent.update_epsilon()
    
        #agent.target_network.load_state_dict(agent.local_network.state_dict())
        
        for iteration in range(iterations):
            action, reward, done, new_state = agent.step(state)
            agent.experience_replay.push(state, action, reward, done, new_state)
            
            state = new_state
            
            agent.update_local_network()
            agent.update_target_network()
            total_reward += reward
            
            if done: 
                break
        
        total_reward_list.append(total_reward)
        iterations_list.append(iteration+1)
        
        if episode % 5 == 0 and episode != 0:
            print("Episode: {0:d}-{1:d} | Avg. iterations: {2:0.2f}  | Max total reward: {3:0.2f} | Avg. total reward: {4:0.2f} | Epsilon: {5:0.4f}" \
                  .format(episode-10, episode, mean(iterations_list), max(total_reward_list), mean(total_reward_list), agent.epsilon))
            total_reward_list.clear()
            iterations_list.clear()

In [8]:
parameters = {
            "tau" : 0.05,
            "gamma" : 0.99,
            "epsilon_init" : 1,
            "epsilon_decay" : 0.95,
            "epsilon_minimum": 0.01,
            "buffer_size" : 2000,
            "batch_size" : 64,
            "epochs": 1,
            "loss_metric" : "mse",
            "learning_rate" : 0.01,
            "learning_rate_decay": 0.01,
            "hidden_layer_1": 24,
            "hidden_layer_2": 24
}

env = gym.make("CartPole-v1")

dqn_agent = Agent(env, parameters)

In [9]:
#train(dqn_agent, 500, 150)