TODO
- DDQN
- Dueling architecture
- Test
- Code Cleaning
- Tqdm (print statements)
- Saving model

In [10]:
import numpy as np
from collections import namedtuple
import random

import gym
import rubiks

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import tqdm

import copy

In [11]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Replay Memory

In [12]:
# Record type for one stored step of experience; ReplayMemory.push builds one
# of these from its positional arguments.
Transition = namedtuple('Transition',('state', 'action', 'next_state', 'reward','done'))

In [13]:
class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` records.

    Transitions are appended until ``capacity`` entries are stored; from then
    on every push overwrites the oldest stored transition.
    """

    def __init__(self, capacity):
        """Create an empty memory.

        Args:
            capacity: Maximum number of transitions kept; must be at least 1.

        Raises:
            ValueError: If ``capacity`` is smaller than 1.
        """
        # Fail fast: a non-positive capacity would otherwise only surface as
        # an IndexError on the first push.
        if capacity < 1:
            raise ValueError(
                "capacity must be a positive integer, got {!r}".format(capacity)
            )
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Store one transition, overwriting the oldest one when full.

        Args:
            *args: Field values forwarded to ``Transition``.
        """
        # Build the record first so that a bad argument list cannot leave a
        # half-initialised slot behind in the buffer.
        entry = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(entry)
        else:
            self.memory[self.position] = entry
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions picked at random.

        Raises:
            ValueError: Propagated from ``random.sample`` when ``batch_size``
                exceeds the number of stored transitions.
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [14]:
class PrioritizedReplayMemory(object):
    """Placeholder for a prioritized replay memory; it cannot be built yet."""

    def __init__(self):
        # Nothing is implemented, so construction always fails.
        raise NotImplementedError()

# Hyper Parameters
TODO:
    - Tune

In [15]:
# Module-level defaults. run_dqn takes most of these as explicit arguments, so
# the values below only matter where a cell reads the global directly.
gamma = 0.9  # discount factor handed to train_dqn by the CartPole cell
max_episodes = 3000000  # NOTE(review): not read by any visible cell (run_dqn has its own parameter)
batch_size = 32  # minibatch size; read as a global by run_dqn and the CartPole cell
max_epsilon_steps = 90000  # length of the linear epsilon decay used by the CartPole cell
max_test_episodes = 100000  # NOTE(review): not read by any visible cell
max_tau = 10000  # NOTE(review): not read by any visible cell (run_dqn has its own parameter)
replay_memory_size = 32000  # ReplayMemory capacity used by the CartPole cell
learning_rate = 1e-5  # Adam learning rate used by the CartPole cell

# Epsilon Decay

In [16]:
def get_epsilon(global_steps, max_epsilon_steps=5000, final_probability=0.05):
    """Return the exploration rate for the given step count.

    The rate falls linearly from 1 at step 0 towards ``final_probability`` and
    stays at ``final_probability`` once ``global_steps`` reaches
    ``max_epsilon_steps``.
    """
    # Decay finished (or never started): use the floor value.
    if not global_steps < max_epsilon_steps:
        return final_probability

    progress = global_steps / max_epsilon_steps
    return 1 - progress * (1 - final_probability)

# Action Selection

In [17]:
def select_action(number_actions, state, network, global_steps, max_epsilon_steps):
    """Pick an action epsilon-greedily and report the epsilon that was used.

    With probability epsilon a uniformly random action index is returned;
    otherwise the action with the highest network output for ``state`` is
    returned. The result is the pair ``(action, epsilon)``.
    """
    epsilon = get_epsilon(global_steps, max_epsilon_steps)

    act_greedily = np.random.rand() > epsilon
    if not act_greedily:
        return np.random.randint(number_actions), epsilon

    # No gradients are needed for action selection.
    with torch.no_grad():
        _, best_action = network(state).max(1)
        return best_action.view(1, 1).item(), epsilon

# Network

In [18]:
class QNetwork(nn.Module):
    """Fully connected network that outputs one Q-value per action."""

    def __init__(self, input_size, num_actions=2):
        super().__init__()
        hidden_units = 64
        self.layer1 = nn.Linear(input_size, hidden_units)
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.layer3 = nn.Linear(hidden_units, num_actions)

    def forward(self, x):
        """Apply two ReLU hidden layers followed by a linear output layer."""
        hidden = F.relu(self.layer1(x))
        hidden = F.relu(self.layer2(hidden))
        return self.layer3(hidden)

# DQN Functions

In [19]:
def compute_q_val(model, state, action):
    """Return the model output for the given action of every batch row.

    The result has one column: row ``i`` holds ``model(state)[i, action[i]]``.
    """
    all_q_values = model(state)
    chosen = action.view(-1, 1)
    return all_q_values.gather(1, chosen)

In [20]:
def compute_target_dqn(model, reward, next_state, done, gamma):
    """Compute the one-step Q-learning target for a batch.

    Args:
        model: Callable mapping ``next_state`` to a 2-D tensor of action values.
        reward: Tensor of rewards, one per batch row.
        next_state: Batch of successor states fed to ``model``.
        done: Tensor of terminal flags, one per batch row (non-zero = terminal).
        gamma: Discount factor applied to the bootstrapped value.

    Returns:
        A column tensor holding ``reward + gamma * max_a model(next_state)``
        for non-terminal rows and just ``reward`` for terminal rows.
    """
    bootstrap = gamma * torch.max(model(next_state), 1)[0].view(-1, 1)
    # Mask terminal rows in place of the old cat+gather trick; zeros_like keeps
    # the result on the inputs' device instead of relying on a global `device`.
    terminal = done.view(-1, 1) != 0
    bootstrap = torch.where(terminal, torch.zeros_like(bootstrap), bootstrap)
    return reward.view(-1, 1) + bootstrap

In [21]:
def compute_target_ddqn(model, target_network, reward, next_state, done, gamma):
    """Compute the double-DQN target for a batch.

    The greedy next action is chosen with ``model`` and its value is read from
    ``target_network``.

    Args:
        model: Callable used to select the greedy action for ``next_state``.
        target_network: Callable used to evaluate the selected action.
        reward: Tensor of rewards, one per batch row.
        next_state: Batch of successor states fed to both networks.
        done: Tensor of terminal flags, one per batch row (non-zero = terminal).
        gamma: Discount factor applied to the bootstrapped value.

    Returns:
        A column tensor holding ``reward + gamma * Q_target(s', argmax_a Q(s', a))``
        for non-terminal rows and just ``reward`` for terminal rows.
    """
    greedy_actions = torch.max(model(next_state), 1)[1].view(-1, 1)
    bootstrap = gamma * torch.gather(target_network(next_state), 1, greedy_actions)
    # Mask terminal rows in place of the old cat+gather trick; zeros_like keeps
    # the result on the inputs' device instead of relying on a global `device`.
    terminal = done.view(-1, 1) != 0
    bootstrap = torch.where(terminal, torch.zeros_like(bootstrap), bootstrap)
    return reward.view(-1, 1) + bootstrap

In [22]:
def train_dqn(q1, target_network, memory, optimizer, batch_size, gamma, training_type):
    """Run one optimisation step of ``q1`` on a minibatch drawn from ``memory``.

    Args:
        q1: Online Q-network being trained.
        target_network: Network used to build targets for ``'target'`` and
            ``'ddqn'``; not used for ``'vanilla'``.
        memory: Replay memory supporting ``len()`` and ``sample(batch_size)``
            and holding ``Transition`` records whose fields are tensors.
        optimizer: Optimizer that updates the parameters of ``q1``.
        batch_size: Number of transitions per minibatch.
        gamma: Discount factor used in the target.
        training_type: ``'vanilla'`` (targets from ``q1``), ``'target'``
            (targets from ``target_network``) or ``'ddqn'`` (double DQN).

    Returns:
        The loss of this step as a float, or ``None`` when ``memory`` holds
        fewer than ``batch_size`` transitions.

    Raises:
        ValueError: If ``training_type`` is not one of the supported names.
    """
    # Validate up front: strings must be compared with ==, not `is`, and an
    # unknown name previously left `target` unbound further down.
    if training_type not in ('vanilla', 'target', 'ddqn'):
        raise ValueError(
            "training_type must be 'vanilla', 'target' or 'ddqn', got {!r}".format(training_type)
        )

    if len(memory) < batch_size:
        return None

    transitions = memory.sample(batch_size)

    # Turn a list of Transitions into one Transition of per-field tuples.
    batch = Transition(*zip(*transitions))

    state_batch = torch.cat(batch.state).view(batch_size, -1)
    action_batch = torch.cat(batch.action)
    next_state_batch = torch.cat(batch.next_state).view(batch_size, -1)
    reward_batch = torch.cat(batch.reward)
    done_batch = torch.cat(batch.done)

    q_val = compute_q_val(q1, state_batch, action_batch)

    # Targets are treated as constants, so no gradient flows through them.
    with torch.no_grad():
        if training_type == 'vanilla':
            target = compute_target_dqn(q1, reward_batch, next_state_batch, done_batch, gamma)
        elif training_type == 'target':
            target = compute_target_dqn(target_network, reward_batch, next_state_batch, done_batch, gamma)
        else:
            target = compute_target_ddqn(q1, target_network, reward_batch, next_state_batch, done_batch, gamma)

    loss = F.smooth_l1_loss(q_val, target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

# Environment

In [26]:
# Module-level environment that run_dqn reads as a global.
# NOTE(review): the argument is presumably the cube size (2 -> 2x2x2) - confirm against rubiks.RubiksEnv.
env = rubiks.RubiksEnv(2)

144

# Train

In [30]:
def run_dqn(max_episodes, max_epsilon_steps, max_tau, learning_rate, replay_memory_size, gamma, training_type, seed = None):
    """Train a Q-network on the module-level ``env`` with a difficulty curriculum.

    Each episode resets ``env`` at the current difficulty and allows at most
    ``max_tries`` steps. Once more than 1000 steps have been taken at the
    current level, episode outcomes are recorded in a sliding window of the
    last 1000 episodes. When that window is full and more than 80% of its
    episodes ended with ``done`` set, the difficulty and the per-episode step
    budget grow by one, the step counter and window are reset, and the epsilon
    decay horizon is multiplied by four.

    Reads the module-level names ``env``, ``device`` and ``batch_size``.

    Args:
        max_episodes: Number of episodes to run.
        max_epsilon_steps: Initial length of the linear epsilon decay.
        max_tau: Number of environment steps between target-network syncs.
        learning_rate: Adam learning rate.
        replay_memory_size: Capacity of the replay memory.
        gamma: Discount factor.
        training_type: ``'vanilla'``, ``'target'`` or ``'ddqn'``.
        seed: Optional seed for the environment and the numpy, ``random`` and
            torch generators.

    Returns:
        Sum of the outcomes in the current success window divided by 1000, so
        the value under-reports while the window holds fewer than 1000 episodes.
    """
    # Function-scope import: the file's import cell only pulls in namedtuple.
    from collections import deque

    window_size = 1000      # episodes in the sliding success window
    warmup_steps = 1000     # steps at a level before outcomes are recorded
    level_up_rate = 0.8     # success rate that triggers the next difficulty

    # Seed before building the network so weight initialisation, replay
    # sampling (random) and exploration (numpy) are all reproducible.
    env.seed(seed)
    np.random.seed(seed)
    if seed is not None:
        random.seed(seed)
        torch.manual_seed(seed)

    # Initialise networks
    q1 = QNetwork(6*6*env.size*env.size, env._n_actions).to(device)
    target_network = copy.deepcopy(q1)

    optimizer = optim.Adam(q1.parameters(), lr=learning_rate, amsgrad=True)

    memory = ReplayMemory(replay_memory_size)

    difficulty = 1
    max_tries = 1

    global_steps = 0
    tau = 0

    recent_outcomes = deque(maxlen=window_size)

    for _ in range(max_episodes):
        state = torch.tensor([env.reset(difficulty)], dtype=torch.float, device=device)

        done = False
        tries = 0
        while tries < max_tries and not done:
            action, _epsilon = select_action(env._n_actions, state, q1, global_steps, max_epsilon_steps)

            next_state, reward, done, _info = env.step(action)
            next_state = torch.tensor([next_state], dtype=torch.float, device=device)
            memory.push(
                state,
                torch.tensor([action], dtype=torch.int64, device=device),
                next_state,
                torch.tensor([reward], dtype=torch.float, device=device),
                torch.tensor([done], dtype=torch.uint8, device=device),
            )
            train_dqn(q1, target_network, memory, optimizer, batch_size, gamma, training_type)

            global_steps += 1
            tries += 1

            # tau was never advanced before, so the target network stayed at
            # its initial weights; count steps and sync every max_tau steps.
            tau += 1
            if tau >= max_tau:
                tau = 0
                target_network.load_state_dict(q1.state_dict())

            state = next_state

        if global_steps > warmup_steps:
            # Only judge the success rate once the window is full; appending to
            # a full deque drops the oldest outcome.
            window_was_full = len(recent_outcomes) == window_size
            recent_outcomes.append(float(done))

            if window_was_full and sum(recent_outcomes) / window_size > level_up_rate:
                global_steps = 0
                max_tries += 1
                difficulty += 1
                recent_outcomes.clear()
                max_epsilon_steps *= 4
                print('Level up!', difficulty)

    return np.sum(list(recent_outcomes)) / window_size

# Run

In [31]:
# max_episodes = 500000

# max_epsilon_steps_list = [1000,10000,100000]
# max_tau_list = [100,1000,10000]
# learning_rate_list = [1e-4,5e-4,1e-5]
# replay_memory_size_list = [32,1024,32768]
# gamma_list = [0.95, 0.925, 0.9, 0.8875]
# training_type_list = ['vanilla', 'target', 'ddqn'] 

# for i in range(100):
#     experiment = [max_episodes,random.sample(max_epsilon_steps_list,1)[0],random.sample(max_tau_list,1)[0],random.sample(learning_rate_list,1)[0],random.sample(replay_memory_size_list,1)[0],random.sample(gamma_list,1)[0],random.sample(training_type_list,1)[0],None]
#     print('Max_episodes:{}, max_epsilon_steps:{}, max_tau:{}, learning_rate:{}, replay_memory_size:{}, gamma:{}, training_type: {}, seed:{}'.format(*experiment))
#     sample_succ = run_dqn(experiment[0], experiment[1], experiment[2], experiment[3], experiment[4], experiment[5], experiment[6])
#     print(sample_succ)
#     # experiments = [[50000, 100000, 100, 1e-5, 256, 0.95, 45],
# #               [50000, 100000, 100, 1e-5, 256, 0.9, 45],
# #               [50000, 100000, 100, 1e-5, 256, 0.85, 45],
# #               [50000, 100000, 100, 1e-5, 256, 0.8, 45]
# #               ]

# # for experiment in experiments:
# #     print('Max_episodes:{}, max_epsilon_steps:{}, max_tau:{}, learning_rate:{}, replay_memory_size:{}, gamma:{}, seed:{}'.format(*experiment))
# #     run_dqn(experiment[0], experiment[1], experiment[2], experiment[3], experiment[4], experiment[5])


In [32]:
# Single training run with fixed hyperparameters and seed.
sample_succ = run_dqn(
    max_episodes=5000000,
    max_epsilon_steps=10000,
    max_tau=1000,
    learning_rate=5e-4,
    replay_memory_size=32768,
    gamma=0.925,
    training_type='vanilla',
    seed=42,
)

print(sample_succ)

Level up! 2
Level up! 3
Level up! 4
Level up! 5


KeyboardInterrupt: 

In [1]:
import gym

In [2]:
# NOTE: this rebinds the module-level `env` that run_dqn reads, so any run_dqn
# call made after this cell would act on CartPole instead of the Rubik's cube.
env = gym.make('CartPole-v0')

In [32]:
# Setup for the CartPole run below: number of episodes, a fresh step counter,
# a 4-input / 2-action Q-network with a copy as target network, an empty
# replay memory and an Adam optimizer over the online network's parameters.
epochs = 100
global_steps = 0
q1 = QNetwork(4,2).to(device)
target_network = copy.deepcopy(q1)
memory = ReplayMemory(replay_memory_size)
    
optimizer = optim.Adam(q1.parameters(),lr=learning_rate,amsgrad=True) 

In [39]:
# Start from an empty replay memory each time this cell is run.
memory = ReplayMemory(replay_memory_size)

# One epoch is one episode: act until the environment reports done.
for epoch in range(epochs):
    done = False
    state = torch.tensor([env.reset()], dtype=torch.float, device=device)
    # Debug output: the initial observation of every episode.
    print(state)
    while not done:
        # 2 is the number of actions; the returned epsilon is not used here.
        action, epsilon = select_action(2, state,q1,global_steps, max_epsilon_steps)

        next_state, reward, done, info = env.step(action)       
        next_state = torch.tensor([next_state], dtype=torch.float, device=device)
        # Every field is stored as a one-element tensor so train_dqn can torch.cat them into batches.
        memory.push(state, torch.tensor([action], dtype=torch.int64, device=device), next_state, torch.tensor([reward],dtype=torch.float,device=device), torch.tensor([done],dtype=torch.uint8,device=device))
        # 'vanilla' builds targets from q1 itself; target_network is passed but not used.
        # The returned loss is not used afterwards.
        loss = train_dqn(q1,target_network,memory,optimizer,batch_size,gamma, 'vanilla')
        
        state = next_state
        global_steps += 1

tensor([[-0.0456, -0.0171,  0.0280, -0.0107]], device='cuda:0')
tensor([[ 0.0323, -0.0277, -0.0267, -0.0106]], device='cuda:0')
tensor([[-0.0049, -0.0455, -0.0173,  0.0446]], device='cuda:0')
tensor([[ 0.0378,  0.0136, -0.0474, -0.0261]], device='cuda:0')
tensor([[-0.0293, -0.0496, -0.0261, -0.0483]], device='cuda:0')
tensor([[-0.0308, -0.0054, -0.0158, -0.0389]], device='cuda:0')
tensor([[-0.0079,  0.0408,  0.0228,  0.0409]], device='cuda:0')
tensor([[-0.0089, -0.0380,  0.0277,  0.0242]], device='cuda:0')
tensor([[0.0331, 0.0428, 0.0265, 0.0062]], device='cuda:0')
tensor([[ 0.0244, -0.0445,  0.0154,  0.0244]], device='cuda:0')
tensor([[-0.0310,  0.0078,  0.0391,  0.0462]], device='cuda:0')
tensor([[ 0.0018, -0.0078,  0.0168, -0.0013]], device='cuda:0')
tensor([[0.0419, 0.0160, 0.0416, 0.0454]], device='cuda:0')
tensor([[ 0.0251,  0.0175, -0.0178,  0.0236]], device='cuda:0')
tensor([[-0.0404,  0.0341, -0.0339, -0.0079]], device='cuda:0')
tensor([[ 0.0358, -0.0471, -0.0262,  0.0122]], d