[Nice Blog](http://www.danielslater.net/2016/03/deep-q-learning-pong-with-tensorflow.html)

In [1]:
import tensorflow as tf
import numpy as np

NUM_STATES = 10
NUM_ACTIONS = 2
GAMMA = 0.5


def hot_one_state(index):
    array = np.zeros(NUM_STATES)
    array[index] = 1.
    return array

# we will create a set of states, 
# the agent get a reward for getting to the 5th one(4 in zero based array).

# the agent can go forward or backward by one state with wrapping
# (so if you go back from the 1st state you go to the end

states = [(x == 4) for x in range(NUM_STATES)]
# [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]

print ('-----')
print ('states : {}'.format(states))

session = tf.Session()
state = tf.placeholder("float", [None, NUM_STATES])
targets = tf.placeholder("float", [None, NUM_ACTIONS])

hidden_weights = tf.Variable(tf.constant(0., shape=[NUM_STATES, NUM_ACTIONS]))

output = tf.matmul(state, hidden_weights)

loss = tf.reduce_mean(tf.square(output - targets))
train_operation = tf.train.AdamOptimizer(0.1).minimize(loss)

session.run(tf.global_variables_initializer())

for i in range(50):
    state_batch = []
    rewards_batch = []

    # create a batch of states
    for state_index in range(NUM_STATES):
        state_batch.append(hot_one_state(state_index))

        minus_action_index = (state_index - 1) % NUM_STATES
        plus_action_index = (state_index + 1) % NUM_STATES

        minus_action_state_reward = session.run(output, feed_dict={state: [hot_one_state(minus_action_index)]})[0]
        plus_action_state_reward = session.run(output, feed_dict={state: [hot_one_state(plus_action_index)]})[0]

        # these action rewards are the results of the Q function 
        # for this state and the actions minus or plus
        action_rewards = [states[minus_action_index] + GAMMA * np.max(minus_action_state_reward),
                          states[plus_action_index] + GAMMA * np.max(plus_action_state_reward)]
        rewards_batch.append(action_rewards)
    print ('---')
    print ('State_batch : {}'.format(state_batch))
    print ('Rewards_batch : {}'.format(rewards_batch))

    session.run(train_operation, feed_dict={
        state: state_batch,
        targets: rewards_batch})

    print([states[x] + np.max(session.run(output, feed_dict={state: [hot_one_state(x)]}))
           for x in range(NUM_STATES)])

-----
states : [False, False, False, False, True, False, False, False, False, False]
---
State_batch : [array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), array([ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.]), array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.]), array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.])]
Rewards_batch : [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 1.0], [0.0, 0.0], [1.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]]
[0.0, 0.0, 0.0, 0.099999689, 1.0, 0.099999689, 0.0, 0.0, 0.0, 0.0]
---
State_batch : [array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]), array([ 0.,  1.,  0.,  0

In [None]:
Initialize replay memory D to size N
Initialize action-value function Q with random weights
for episode = 1, M do
    Initialize state s_1
    for t = 1, T do
        With probability ϵ select random action a_t
        otherwise select a_t=max_a  Q(s_t,a; θ_i)
        Execute action a_t in emulator and observe r_t and s_(t+1)
        Store transition (s_t,a_t,r_t,s_(t+1)) in D
        Sample a minibatch of transitions (s_j,a_j,r_j,s_(j+1)) from D
        Set y_j:=
            r_j for terminal s_(j+1)
            r_j+γ*max_(a^' )  Q(s_(j+1),a'; θ_i) for non-terminal s_(j+1)
        Perform a gradient step on (y_j-Q(s_j,a_j; θ_i))^2 with respect to θ
    end for
end for

# Table of Contents

1. Introduction to RL problems & OpenAI Gym
2. MDPs and Bellman Equations
3. Dynamic Programming: Model-Based RL, Policy Iteration and Value Iteration
4. Monte Carlo Model-Free Prediction & Control
5. Temporal Difference Model-Free Prediction & Control
6. Function Approximation
7. Deep Q Learning (WIP)
8. Policy Gradient Methods (WIP)
9. Learning and Planning (WIP)
10. Exploration and Exploitation (WIP)


# List of Implemented Algorithms

1. Dynamic Programming Policy Evaluation
2. Dynamic Programming Policy Iteration
3. Dynamic Programming Value Iteration
4. Monte Carlo Prediction
5. Monte Carlo Control with Epsilon-Greedy Policies
6. Monte Carlo Off-Policy Control with Importance Sampling
7. SARSA (On Policy TD Learning)
8. Q-Learning (Off Policy TD Learning)
9. Q-Learning with Linear Function Approximation
10. Deep Q-Learning for Atari Games
11. Double Deep-Q Learning for Atari Games
12. Deep Q-Learning with Prioritized Experience Replay (WIP)
13. Policy Gradient: REINFORCE with Baseline
14. Policy Gradient: Actor Critic with Baseline
15. Policy Gradient: Actor Critic with Baseline for Continuous Action Spaces
16. Deterministic Policy Gradients for Continuous Action Spaces (WIP)
17. Deep Deterministic Policy Gradients (DDPG) (WIP)
18. Asynchronous Advantage Actor Critic (A3C)


In [2]:
action = {'UP':1, 'RT':2, 'DW':3,'LF':4}

In [3]:
import gym
env = gym.make('CartPole-v0')
print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

[2017-01-20 17:10:22,096] Making new env: CartPole-v0


Discrete(2)
Box(4,)
[  4.80000000e+00   3.40282347e+38   4.18879020e-01   3.40282347e+38]
[ -4.80000000e+00  -3.40282347e+38  -4.18879020e-01  -3.40282347e+38]


In [1]:
import gym
env = gym.make('CartPole-v0')


[2017-01-20 17:09:21,138] Making new env: CartPole-v0


[-0.03394223  0.03624113 -0.03346802  0.03178627]
[-0.03321741 -0.15838529 -0.0328323   0.31372465]
[-0.03638512  0.03718861 -0.02655781  0.01087093]
[-0.03564134  0.23268118 -0.02634039 -0.29007158]
[-0.03098772  0.03794453 -0.03214182 -0.00581112]
[-0.03022883  0.23351235 -0.03225804 -0.30845939]
[-0.02555858  0.42907872 -0.03842723 -0.61113856]
[-0.01697701  0.62471611 -0.05065    -0.91567283]
[-0.00448269  0.82048508 -0.06896346 -1.22383443]
[ 0.01192702  1.01642417 -0.09344015 -1.53730361]
[ 0.0322555   1.21253835 -0.12418622 -1.85762305]
[ 0.05650627  1.40878572 -0.16133868 -2.18614407]
[ 0.08468198  1.21555351 -0.20506156 -1.94728803]
Episode finished after 13 timesteps
[ 0.02605873  0.00962829  0.03034136  0.00092943]
[ 0.0262513   0.20430226  0.03035995 -0.28202813]
[ 0.03033734  0.00876071  0.02471939  0.02007348]
[ 0.03051256 -0.18670686  0.02512085  0.32045202]
[ 0.02677842 -0.38217739  0.0315299   0.62095006]
[ 0.01913487 -0.57772515  0.0439489   0.92339416]
[ 0.00758037 -

In [2]:
env.P

AttributeError: 'CartPoleEnv' object has no attribute 'P'