<a href="https://colab.research.google.com/github/JoeJBenton/machine-learning/blob/master/DQN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is based on the algorithm described [here](https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf) and some implementation details are guided by this [tutorial](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html).

(Note that the current version has the algorithm implemented essentially correctly, but has an issue with the learning rate, which means that it currently doesn't perform as expected.)

In [None]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gym

# Create the CartPole-v0 environment; the same instance is used for collecting
# experience and for evaluating the policy.
env = gym.make('CartPole-v0')

In [None]:
# Define hyperparameters

# Maximum number of transitions kept in the replay memory
memory_capacity = 50000
# Number of training epochs (presumably; confirm against the training loop)
epochs = 30
# Epsilon of the epsilon-greedy policy: probability of taking a random action
exploration_rate = 0.1
# Number of transitions sampled from replay memory per training step
mini_batch_size = 100
# SGD step size. A step size of 1 makes plain SGD on the squared Bellman error
# overshoot and diverge, so use a conventional small value instead.
learning_rate = 0.01
# Discount factor applied to the bootstrapped value of the next state
gamma = 0.9

In [None]:
# Define a network that will fit the optimal action-value function Q(s,a)
# DQN takes in the state s and returns a vector of predictions for Q(s,a)

class DQN(nn.Module):
  """Fully-connected Q-network: 4 input features -> 30 -> 30 -> 2 outputs."""

  def __init__(self):
    super().__init__()
    # The attribute names become the state_dict keys, so they are kept stable.
    self.fc1 = nn.Linear(in_features=4, out_features=30)
    self.fc2 = nn.Linear(in_features=30, out_features=30)
    self.fc3 = nn.Linear(in_features=30, out_features=2)

  def forward(self, x):
    """Return the network output for `x`; the last layer has no activation."""
    first_hidden = F.relu(self.fc1(x))
    second_hidden = F.relu(self.fc2(first_hidden))
    return self.fc3(second_hidden)

# Mean-squared-error loss; calculate_loss applies it to the selected Q-values
# and their Bellman targets.
criterion = nn.MSELoss()

In [None]:
# Define the replay memory class that will store our data. It will store a
# certain number of memory instances from which we can train, and eventually be
# overwritten with new data as we train.

class ReplayMemory:
  """Fixed-capacity ring buffer of training samples.

  Entries are appended until `capacity` is reached; after that each new entry
  overwrites the slot at `position`, which cycles through the buffer.
  """

  def __init__(self, capacity):
    self.capacity = capacity
    self.memory = []
    self.position = 0

  def append(self, data):
    """Store `data` at the current write position and advance the position."""
    buffer = self.memory
    still_growing = len(buffer) < self.capacity
    if still_growing:
      # Reserve one more slot; it is filled by the assignment below.
      buffer.append(None)
    buffer[self.position] = data
    self.position = (self.position + 1) % self.capacity

  def sample(self, batch_size):
    """Return `batch_size` distinct stored entries chosen at random."""
    chosen = random.sample(self.memory, k=batch_size)
    return chosen

In [None]:
# Augment training data using current policy derived from network

def experience(runs, run_length):
  """Play `runs` episodes with an epsilon-greedy policy and record them.

  Every step appends a (state, action, reward, new_state, done) tuple to
  `training_data`. An episode stops after `run_length` steps or as soon as
  the environment reports that it is done.
  """
  for _episode in range(runs):
    state = torch.Tensor(env.reset())
    for _step in range(run_length):
      with torch.no_grad():
        q_values = policy_net(state)

      # Explore with probability epsilon, otherwise act greedily on Q(s, .)
      explore = random.random() <= exploration_rate
      if explore:
        action = env.action_space.sample()
      else:
        action = int(torch.argmax(q_values))

      observation, reward, done, _info = env.step(action)
      transition = (state, action, reward, observation, done)
      training_data.append(transition)
      state = torch.Tensor(observation)
      if done:
        break

In [None]:
def evaluate(runs, max_run_length):
  """Return the mean total reward per episode of the greedy policy.

  Plays `runs` episodes on `env`, always taking the action with the largest
  value predicted by `policy_net`. An episode stops after `max_run_length`
  steps or as soon as the environment reports that it is done.
  """
  reward_sum = 0
  for _episode in range(runs):
    state = torch.Tensor(env.reset())
    for _step in range(max_run_length):
      with torch.no_grad():
        greedy_action = int(torch.argmax(policy_net(state)))

      observation, reward, done, _info = env.step(greedy_action)
      reward_sum += reward
      state = torch.Tensor(observation)
      if done:
        break
  return reward_sum / runs

In [None]:
# Plain SGD over policy_net's parameters with the configured learning rate.
# policy_net must already exist when this line runs, and the optimizer has to
# be re-created whenever policy_net is replaced, because it keeps references
# to the parameter tensors passed in here.
optimizer = optim.SGD(policy_net.parameters(), lr=learning_rate)

def calculate_loss(Qsa, reward, action, terminal, Qs_a):
  """Return the loss between Q(s,a) of the taken actions and Bellman targets.

  Args:
    Qsa: predicted action values for each state, one row per sample.
    reward: per-sample rewards; the targets are built with this tensor's shape.
    action: per-sample index of the action taken (values convertible to int).
    terminal: per-sample flags, truthy where the transition ended the episode.
    Qs_a: action values for each successor state, one row per sample.

  Returns:
    `criterion` applied to the selected Q-values (one column per sample row)
    and the targets y.
  """
  # Pick Q(s,a) for the action actually taken in each sample.
  action_index = torch.as_tensor(action, device=Qsa.device).long().reshape(-1, 1)
  Vs = Qsa.gather(1, action_index)

  # Calculate recursion for y from the Bellman equation:
  #   y = r                             if the transition was terminal
  #   y = r + gamma * max_a' Q(s',a')   otherwise
  not_done = torch.tensor(
      [not done for done in terminal],
      dtype=torch.bool,
      device=reward.device,
  ).reshape(reward.shape)
  best_next = Qs_a.max(dim=1)[0].reshape(reward.shape)
  # torch.where (rather than multiplying by a 0/1 mask) keeps terminal targets
  # equal to the reward even if the successor values are inf or NaN.
  ys = torch.where(not_done, reward + gamma * best_next, reward)

  return criterion(Vs, ys)

def train(mini_batches):
  """Run up to `mini_batches` gradient steps on `policy_net`.

  Each step samples `mini_batch_size` transitions from `training_data`,
  evaluates the successor states with `target_net` (without tracking
  gradients), and takes one `optimizer` step on the loss returned by
  `calculate_loss`. Returns without training when the replay memory holds
  fewer than `mini_batch_size` transitions.

  Relies on the module-level `optimizer` having been built from the current
  `policy_net` parameters.
  """
  # The replay memory is not modified while training, so check it once.
  if len(training_data.memory) < mini_batch_size:
    return

  for _ in range(mini_batches):
    states, actions, rewards, next_states, terminal = zip(
        *training_data.sample(mini_batch_size))

    state = torch.stack(states)
    action = torch.unsqueeze(torch.Tensor(actions), 1)
    reward = torch.unsqueeze(torch.Tensor(rewards), 1)
    # Successor states are stored exactly as env.step returned them, so
    # convert each one to a float tensor explicitly before stacking.
    new_state = torch.stack(
        [torch.as_tensor(s, dtype=torch.float32) for s in next_states])

    Qsa = policy_net(state)
    with torch.no_grad():
      Qs_a = target_net(new_state)
    loss = calculate_loss(Qsa, reward, action, terminal, Qs_a)

    # Standard update: clear old gradients, backpropagate, apply the SGD step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [None]:
# Network that is trained and used to choose actions.
policy_net = DQN()
# Second network used to evaluate successor states during training; it starts
# out as an exact copy of policy_net.
target_net = DQN()
target_net.load_state_dict(policy_net.state_dict())
# Replay memory holding at most memory_capacity transitions.
training_data = ReplayMemory(memory_capacity)

In [None]:
# Main training loop. The number of epochs is hard-coded to 6 here; it does
# not read the `epochs` hyperparameter.
for epoch in range(6):
  print("Epoch %s..." % epoch)
  # One round per epoch: collect 20 episodes of at most 1000 steps each, then
  # call train for a single mini-batch.
  for _ in range (1):
    experience(20,1000)
    train(1)
  # Copy the updated policy weights into the target network.
  target_net.load_state_dict(policy_net.state_dict())
  # Print the mean reward over 100 greedy episodes of at most 100 steps each.
  print(evaluate(100,100))


Epoch 0...
Parameter containing:
tensor([[-0.0014,  0.1104,  0.0917, -0.0529],
        [-0.1474, -0.4785,  0.0810, -0.4862],
        [-0.1049,  0.4567,  0.2953, -0.3772],
        [-0.1990, -0.3975,  0.3357,  0.2423],
        [-0.0062,  0.2326, -0.2451, -0.3911],
        [ 0.4457, -0.0493,  0.0308, -0.1332],
        [ 0.0166,  0.1502, -0.4831, -0.3788],
        [ 0.0062,  0.0420, -0.2295,  0.0233],
        [ 0.3791, -0.2776,  0.0121, -0.4076],
        [-0.0388, -0.1102, -0.3458, -0.3129],
        [-0.4218, -0.0393, -0.4031,  0.2503],
        [-0.3093,  0.4943, -0.4191, -0.2536],
        [-0.0135,  0.1565,  0.1261,  0.4285],
        [ 0.2831, -0.4420,  0.4586, -0.3235],
        [-0.0766, -0.0742,  0.3456, -0.4317],
        [-0.2831, -0.3578,  0.3507,  0.1903],
        [ 0.4373,  0.0527, -0.4728, -0.3266],
        [ 0.0520, -0.0043, -0.4145, -0.0245],
        [-0.4776, -0.1430,  0.0891,  0.3319],
        [ 0.2335,  0.0724,  0.1256,  0.2972],
        [ 0.1992,  0.4098,  0.4172, -0.3988],
 