In [5]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
import keras.backend as K
import random

class Agent:
    def __init__(self, input_dim, output_dim):
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = 32
        self.lr = 0.001
        self.gamma = 0.99
        self.tau = 0.05
        self.batchsize = 32
        self.epsilon = 0.05
        self.memory_size = 10**4
        self.memory = []

        #Make models
        self.model = self._build_model()
        self.target_model = self._build_model()
        self.target_model.set_weights(self.model.get_weights())

        #self.train = self.optimizer()
        
        
    def _build_model(self):
        model = Sequential()
        model.add( Dense(self.hidden_dim, input_dim = self.input_dim, activation = 'relu') )
        model.add( Dense(self.hidden_dim, activation = 'relu') )
        model.add( Dense(self.output_dim, activation = 'linear') )  #Q function
        return model
    
    
    def act(self,state):
        
        #Do epsilon greedy
        temp = np.random.rand()
        if temp <= self.epsilon:
            action = np.random.choice(range(self.output_dim))
            return action
        else:
            Qs = self.model.predict(state)[0]
            action = np.argmax(Qs)
            return action
        
        
    def remember(self,event):
        # (S,A,R,S1,D) = event
        # D = done
        if self.memory < self.memory_size:
            self.memory.append(event)
        else:
            self.memory[0] = event
            
            
    def learn(self):
        
        #Train behavior network
        (S,A,R,S1,D) = random.sample(self.memory, self.batchsize)
        self.train([S,A,R,S1,D])
            
        #Soft update on target network
        self.soft_update()
        
        
    def soft_update(self):
        """
        
        \theta_target = \tau(\theta_behavior) + (1-\tau)*\theta_target
        
        """
        
        pars_behavior = self.model.get_weights()
        pars_target = self.target_model.get_weights()
        
        ctr = 0
        for par_behavior, par_target in zip(pars_behavior, pars_target):
            par_target = self.tau*par_behavior + (1-self.tau)*par_target
            pars_target[ctr] = par_target
            ctr += 1
            
        self.target_model.set_weights(pars_target)
            
            
    """
    def optimizer(self):
        #Do experience replay
        
        #Loss = E_t ( Q(s_t, a_t) - y_t^2  )^2
        
        #where y_t = r + (1-D)*gamma*max*Q_target(s1_t)
        
        #So I go over the target network
                
        #Placeholders
        S_pl = K.placeholder(shape=(None,self.input_dim))
        A_pl = K.placeholder(shape = (None,))
        R_pl = K.placeholder(shape = (None,))
        S1_pl = K.placeholder(shape=(None,self.input_dim))
        D_pl = K.placeholder(shape = (None,))
        
        #Find y_i
        Q_target_next_vec = self.target_model.predict(S1_pl)     #(N_samples, dim_action)
        Q_target_next_max = max(Q_target_next_vec, axis = 1)  #(N_samples,)
        yi = R + (1-D)*self.gamma*Q_target_next_max           #(N_samples,)
        
        #Setup Q_want
        A_onehot = to_categorical(A, self.output_dim)
        y_temp = np.reshape()
        Q = self.model.predict(S)
        
        #All elements of Q, except the one corresponding to the chosen action,
        #Stay zero. How to do this efficiently?
        
        
        #Do loss
        Q_behavior_pl = self.model.output
        loss = K.mean( K.square( Q_behavior_pl - Q_want ))
        
        #Get updates
        adam = Adam(self.lr)
        pars = self.model.trainable_weights
        updates = adam.get_updates(loss = loss, params = pars)
        
        return K.function(inputs = [S_pl, A_pl, R_pl, S1_pl, D_pl], outputs = [], updates = updates)
    """

In [6]:
import gym
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

#Setup
env = gym.make('CartPole-v0')
input_dim, output_dim = env.observation_space.shape[0], env.action_space.n
agent = Agent(input_dim, output_dim)

EPISODES = 100
scores = []
for e in range(1,EPISODES+1):
    state = env.reset()
    state = np.reshape(state,[1,  input_dim])
    reward_sum = 0
    done = False
    for time in range(500):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, input_dim])
        event = (state)
        agent.remember(state[0], action, reward)
        state = next_state
        reward_sum += reward
        if done:
            break
    #agent.learn()
    scores.append(reward_sum)
    if e % 10 == 0:
        print('episode, reward = {}, {}'.format(e,reward_sum))
plt.plot(scores)

TypeError: remember() takes 2 positional arguments but 4 were given