# Introduction

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
import keras.optimizers 
from keras import backend as K
from agent import Agent
%matplotlib inline


#Env
env = gym.make('CartPole-v0')
env.seed(1)  #for comparison
num_states = env.observation_space.shape[0]
num_actions = env.action_space.n

#Agent
input_dim, output_dim = num_states, num_actions
lr, gamma, tau, clipnorm, verbose = 0.001, 0.99, 0.01, True, False
agent = Agent(input_dim, output_dim, lr, gamma, tau, clipnorm, verbose)

#Train
EPISODES = 2000
scores = []
for e in range(1,EPISODES+1):
    state = env.reset()
    state = np.reshape(state, [1, num_states])
    reward_sum = 0
    done = False
    step = 0
    while not done:
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward_sum += reward
        next_state = np.reshape(next_state, [1, num_states])
        agent.remember(state[0], action, reward, next_state[0], done)
        state = next_state
        step += 1
        if step % 10 == 0:
            agent.learn()
    scores.append(reward_sum)
    if e % 50 == 0:
        print '(episode, score) = ' + str((e,reward_sum))
plt.plot(scores)

[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m
(episode, score) = (50, 14.0)
(episode, score) = (100, 24.0)
(episode, score) = (150, 13.0)
(episode, score) = (200, 10.0)
(episode, score) = (250, 12.0)
(episode, score) = (300, 10.0)
(episode, score) = (350, 9.0)
(episode, score) = (400, 11.0)


### Roughwork

In [24]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import gym
from collections import deque
from keras.models import Sequential, Model
from keras.layers import Dense
from keras.optimizers import Adam 
from keras.optimizers import RMSprop
from keras import backend as K
from keras.utils import to_categorical
from keras.layers import Dense, Input, concatenate


class Critic:
    
    """ Critic for A2C  """
    
    def __init__(self,input_dim, output_dim,lr,gamma,tau, clipnorm, verbose = False):
        
        #Pars
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.lr = lr  #learning rate for optimizer
        self.gamma = gamma
        self.tau = tau
        self.verbose = verbose
        self.clipnorm = clipnorm
        
        #Make models
        self.model = self._make_network()
        self.target_model = self._make_network()                       
        self.target_model.set_weights(self.model.get_weights()) 
        
        #optimizer
        self.opt = self.optimizer()
        
        
    def learn(self,S,R,D,V1):
        V1 = self.opt([S,R,D,V1])
        return V1
    
    
    def _make_network(self):        
        S = Input(shape=(self.input_dim,))
        x = Dense(128, activation = 'relu')(S)
        out = Dense(1, activation = 'linear')(x)
        model = Model(inputs = S, outputs = out)
        model.compile(loss = 'mse', optimizer = Adam( lr = self.lr, clipnorm = self.clipnorm))
        return model
       
 
    def optimizer(self):
        
        """ 
            The loss function for the critic is
           
            L_i = \sum_{batch}  ( V_i - y_i )^2 
            
            where,
            
            y_i = r_i + (1-done) gamma* V_i(s)  for non-terminal \vec{x'}
            r_i = reward to agent i
            gamma = discount factor
            done = 1 if episode finished, 0 otherwise
        """
        
        #Placeholders (think of these as inputs)
        S_pl = self.model.input
        V_pl = self.model.output
        R_pl = K.placeholder(name='reward',shape=(None,))
        D_pl = K.placeholder(name='done', shape=(None,))
        V1_pl = K.placeholder(name='V1',shape=(None,))

        #Find yi
        V1 = K.sqrt(K.square(V1_pl))
        Y = R_pl + (1.0-D_pl)*self.gamma*V1_pl  #1D array
        #Y = np.array([ [i] for i in Y])
        
        #Find loss
        loss = K.mean(K.square(V_pl - Y))     #scalar
        
        #Define optimizer
        adam_critic = RMSprop(lr = self.lr, epsilon = 0.1, rho = 0.99)  #arbitray
        pars = self.model.trainable_weights
        updates = adam_critic.get_updates(params=pars,loss=loss)
        
        return K.function([S_pl, R_pl, D_pl,V1_pl], [], updates=updates)  

    
    
critic = Critic(input_dim, output_dim,lr,gamma,tau, clipnorm, verbose = False)
S,A,R,S1,D = agent.get_batch()
D, R = np.array([[x] for x in D]), np.array([[x] for x in R])
V1 = critic.model.predict(S1)
[out] = critic.learn(S,R,D,V1)
out

ValueError: need more than 0 values to unpack

In [13]:
D1 = np.array([[x] for x in D])
(1-D1)*V1

array([[ 0.0625844 ],
       [-0.00400997],
       [ 0.06013154],
       [-0.00398179],
       [-0.06215289],
       [-0.00379891],
       [ 0.05594378],
       [-0.00509506],
       [-0.06334426],
       [-0.00536425],
       [-0.06347042],
       [-0.00653257],
       [-0.06363468],
       [-0.00741415],
       [ 0.04320167],
       [ 0.10233249],
       [ 0.03926365],
       [-0.01085381],
       [ 0.03663721],
       [-0.01235111],
       [ 0.03460442],
       [ 0.09667682],
       [ 0.0352065 ],
       [ 0.09708382],
       [ 0.03705866],
       [-0.01417046],
       [ 0.03740181],
       [-0.01367935],
       [-0.06499278],
       [-0.12066351],
       [-0.06460891],
       [-0.01387813],
       [-0.06484658],
       [-0.12029552],
       [-0.064547  ],
       [-0.01523867],
       [ 0.01773329],
       [-0.01853967],
       [-0.0671837 ],
       [-0.0213013 ],
       [ 0.00775268],
       [ 0.05705822],
       [ 0.10912711],
       [ 0.05070865],
       [ 0.010302  ],
       [ 0

### Grab batches

In [3]:
agent = Agent(input_dim, output_dim, lr, gamma, tau, clipnorm, verbose)
state = env.reset()
state = np.reshape(state, [1, num_states])
reward_sum = 0
done = False
while not done:
    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)
    reward_sum += reward
    next_state = np.reshape(next_state, [1, num_states])
    agent.remember(state[0], action, reward, next_state[0], done)
    state = next_state 