### Introduction

Here I'm testing a DQN on the lunar landing environment (`LunarLander-v2`). I'm using:

1. Experience replay
2. A target network.

I update the target network every $C$ timesteps.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import gym
from agent import Agent
from keras.utils import to_categorical
%matplotlib inline


# --- Environment ---
env = gym.make('LunarLander-v2')
env.seed(1)  # fixed seed, so that runs can be compared
num_states = env.observation_space.shape[0]   # first dimension of the observation space
num_actions = env.action_space.n              # size of the discrete action space

# --- Hyperparameters used by the training loop ---
lr = 0.01
gamma = 0.99
decay_factor = 0.999     # epsilon is multiplied by this after every episode
epsilon_min = 0.05       # epsilon is never decayed below this value
learning_start = 5000    # learning begins once the memory holds more than this many transitions

# --- Agent ---
agent = Agent(num_states, num_actions, lr, gamma)
agent.C = 5000               # target-network update period C, in timesteps
agent.epsilon = 1.0          # starting value of epsilon
agent.memory_size = 10**5    # presumably the replay-memory capacity -- TODO confirm in agent.py

# Train: play EPISODES episodes, storing every transition in the agent's
# memory and learning from it once the memory holds more than
# `learning_start` transitions. The per-episode total reward is collected in
# `scores`, plotted, and written to disk at the end.
EPISODES = 5000
scores = []   # total reward of each finished episode, in order
step = 0      # number of environment steps taken across all episodes
for e in range(1, EPISODES + 1):
    state = env.reset()
    reward_sum = 0
    done = False
    while not done:
        # env.render()  # uncomment to watch training
        state = np.reshape(state, [1, num_states])  # reshape for keras
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward_sum += reward
        # state[0] undoes the reshape above, so the stored state has the
        # same layout as the raw next_state
        agent.remember(state[0], action, reward, next_state, done)
        state = next_state

        # learn
        if len(agent.memory) > learning_start:
            agent.replay()  # update the behavior model
            # NOTE(review): the introduction and agent.C describe a target
            # update every C timesteps, but this call runs on every learning
            # step and `step` is not consulted -- confirm against agent.py
            # which update rule is intended.
            agent.soft_update_target_network()

        # iterate; the `while not done` condition ends the episode
        step += 1

    # decay epsilon once per episode (never below epsilon_min) and log the score
    agent.epsilon = max(agent.epsilon * decay_factor, epsilon_min)
    scores.append(reward_sum)
    if e % 50 == 0:
        # single-argument call form prints identically on Python 2 and 3
        print('(episode, score) = ' + str((e, reward_sum)))

plt.plot(scores)
np.savetxt('stats/scores_lunar_landing.txt', scores)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m
(episode, score) = (50, -168.58561383231918)
(episode, score) = (100, -116.77211795472316)
(episode, score) = (150, -359.6249641787523)
(episode, score) = (200, -173.49422601363773)
(episode, score) = (250, -150.4842843904401)
(episode, score) = (300, -245.7594000933272)
(episode, score) = (350, -262.5435740658975)
(episode, score) = (400, -176.10793701279613)
(episode, score) = (450, -185.84825662432925)
(episode, score) = (500, -160.55387919736097)
(episode, score) = (550, -142.04657338273975)
(episode, score) = (600, -202.92165043208155)
(episode, score) = (650, -139.27986224695815)
(episode, score) = (700, -346.02744225962056)
(episode, score) = (750, -492.85309113446334)
(episode, score) = (800, -321.3729580406914)
(episode, score) = (850, -262.43039354948155)


TODO: compare these scores to the regular policy gradient.


### Compare to regular policy gradients

In [None]:
# The scores for the regular policy gradient were computed separately, in the
# 'policy-gradient' folder, and are loaded here from the file saved there.
# NOTE(review): this is an absolute, machine-specific path -- the cell will
# only run where that file exists.
filename = '/home/kokeeffe/research/robocab/RL_practice/policy_gradients/stats/scores_lunar_landing.txt'
scores_regular = np.loadtxt(filename)

# Label each curve where it is drawn so the legend cannot drift out of order.
plt.plot(scores_regular, label='policy gradient')
plt.plot(scores, label='DQN')
plt.legend()

Looks good -- there is indeed smaller variance.

### Watch a smart agent

In [10]:
# Play a single rendered episode with the agent from the training cell.
# `reward_sum` holds the episode's total reward afterwards.
# NOTE(review): if agent.act still explores according to agent.epsilon, some
# of the actions shown here may be random -- confirm in agent.py.
state = env.reset()
state = np.reshape(state, [1, num_states])  # same reshape the training loop applies before agent.act
reward_sum = 0
done = False
try:
    while not done:
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward_sum += reward
        next_state = np.reshape(next_state, [1, num_states])
        state = next_state
finally:
    # close the environment even if the episode is interrupted or raises,
    # so the render window is not left open
    env.close()

### Roughwork