###### Imports

In [1]:
import gym
import theano
import keras
import numpy as np
import matplotlib.pyplot as plt
import scipy
import math
import json
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import sgd
from keras.models import model_from_json
import time
import operator

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


###### Agent Class

In [2]:
class Agent(object):
    """Bounded experience-replay memory that also builds Double-DQN training batches."""

    def __init__(self, max_memory=100, discount=.9):
        """Create an empty replay memory.

        max_memory -- maximum number of experiences kept; older ones are dropped first
        discount   -- discount factor (gamma) applied to the bootstrapped next-state value
        """
        self.max_memory = max_memory
        self.memory = list()
        self.discount = discount

    def remind(self, states, game_over):
        """Store one experience, evicting the oldest one when the memory is full.

        states    -- [state_t, action_t, reward_t, state_tp1]; both states are
                     arrays of shape (1, env_dim)
        game_over -- truthy when state_tp1 ended the episode
        """
        self.memory.append([states, game_over])
        if len(self.memory) > self.max_memory:
            del self.memory[0]

    def get_batch(self, target_model, model, batch_size=10):
        """Sample experiences (with replacement) and build the regression targets.

        target_model -- network used to evaluate the bootstrapped next-state value
        model        -- online network; selects the greedy next action and provides
                        the baseline Q(s_t, .) for the actions that were not taken
        batch_size   -- upper bound on the number of sampled experiences

        Returns (inputs, targets): inputs has shape (n, env_dim) and targets has
        shape (n, num_actions), with n = min(len(memory), batch_size).

        Raises IndexError when the memory is empty.
        """
        len_memory = len(self.memory)
        num_actions = model.output_shape[-1]
        # Every stored state is a (1, env_dim) row, so the first one gives the width.
        env_dim = self.memory[0][0][0].shape[1]
        n_samples = min(len_memory, batch_size)

        inputs = np.zeros((n_samples, env_dim))
        targets = np.zeros((n_samples, num_actions))
        if n_samples == 0:
            # Nothing requested: do not call the networks on an empty batch.
            return inputs, targets

        next_states = np.zeros((n_samples, env_dim))
        actions = np.zeros(n_samples, dtype=int)
        rewards = np.zeros(n_samples)
        terminal = np.zeros(n_samples, dtype=bool)

        for row, idx in enumerate(np.random.randint(0, len_memory, size=n_samples)):
            (state_t, action_t, reward_t, state_tp1), game_over = self.memory[idx]
            inputs[row:row + 1] = state_t
            next_states[row:row + 1] = state_tp1
            actions[row] = action_t
            rewards[row] = reward_t
            terminal[row] = game_over

        # One batched forward pass per network/input pair instead of three
        # predict() calls per sampled experience.
        #
        # The baseline comes from the ONLINE network so that the outputs of the
        # actions that were not taken contribute zero error; only Q(s_t, a_t)
        # is pulled toward the bootstrapped target.
        targets[:] = model.predict(inputs)

        # Double DQN: the online network picks a' = argmax_a Q(s', a) and the
        # target network supplies the value Q_target(s', a').
        best_next = np.argmax(model.predict(next_states), axis=1)
        rows = np.arange(n_samples)
        q_sa = np.asarray(target_model.predict(next_states))[rows, best_next]

        # Terminal transitions do not bootstrap: their target is the reward alone.
        targets[rows, actions] = np.where(
            terminal, rewards, rewards + self.discount * q_sa)
        return inputs, targets

###### Environment and Parameters

In [3]:
# --- Environment ---------------------------------------------------------
env = gym.make('CartPole-v0')
num_actions = env.action_space.n                     # number of discrete actions
observation_shape = env.observation_space.shape[0]   # length of one observation vector

# --- Hyper-parameters ----------------------------------------------------
learning_rate = 0.001      # optimizer step size
epsilon = 0.1              # probability of a random action (epsilon-greedy)
hidden_size = 200          # number of hidden units
batch_size = 50            # experiences sampled per training step
max_memory = 4 * 10**9     # replay-memory capacity (large enough to never evict here)
max_time_steps = 2000      # total number of environment steps to train on
everyC = 5                 # the target network is re-synced every `everyC` steps

# --- Running counters ----------------------------------------------------
acc_reward = 0             # reward accumulated over the current episode
time_step = 0              # environment steps taken so far
C = 0                      # steps since the start of the episode (drives the target sync)

###### Adam Optimizer

In [4]:
# Optimizer instance passed to compile() for both networks.
# clipvalue=1 clips gradient values whose absolute value exceeds 1.
Adam = keras.optimizers.Adam(
    lr=learning_rate,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.0,
    clipvalue=1,
)

###### Current DNN

In [5]:
# Online Q-network: observation -> one Q-value per action.
# The hidden width comes from the `hidden_size` config constant instead of a
# duplicated literal, so changing the constant actually changes the network.
model = Sequential()
model.add(Dense(hidden_size, input_dim=observation_shape, activation='relu'))  # hidden layer, ReLU
model.add(Dense(num_actions))                                                  # linear output: Q(s, a; theta)

model.compile(optimizer=Adam, loss='mean_squared_error')                       # MSE regression on the Q targets

Instructions for updating:
keep_dims is deprecated, use keepdims instead


###### Target DNN

In [6]:
# Target Q-network: same architecture as the online network. It is only used
# for prediction; its weights are overwritten from `model` during training.
# The hidden width comes from `hidden_size` so both networks stay in sync
# with the config constant.
target_model = Sequential()
target_model.add(Dense(hidden_size, input_dim=observation_shape, activation='relu'))  # hidden layer, ReLU
target_model.add(Dense(num_actions))                                                  # linear output: Q(s, a; theta-)
target_model.compile(optimizer=Adam, loss='mean_squared_error')

###### Initializing the Experience Replay object

In [7]:
agent = Agent(max_memory=max_memory)  # experience-replay memory

win_cnt = 0       # number of episodes that reached the score threshold
t0 = time.time()  # wall-clock start of the whole training run
actual_total = 0  # seconds spent inside train_on_batch only
e = 0             # number of episodes played

print('Parameters :','epsilon :', epsilon,'C :', everyC,', learning rate :', learning_rate, 'batch size for training :', batch_size)

# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" line).
model.summary()

print('Training for ',max_time_steps,'time-steps ...')


Parameters : epsilon : 0.1 C : 5 , learning rate : 0.001 batch size for training : 50
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 200)               1000      
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 402       
Total params: 1,402
Trainable params: 1,402
Non-trainable params: 0
_________________________________________________________________
None
Training for  2000 time-steps ...


In [8]:
# Train until `max_time_steps` environment steps have been taken. The limit is
# only checked between episodes, so the last episode always runs to its end.
while time_step<max_time_steps:
    loss = 0.       # loss summed over the episode
    acc_reward = 0  # reward accumulated over the episode
    C = 0           # steps since the start of the episode
    e += 1

    input_t = env.reset()
    input_t = input_t.reshape((1, observation_shape))

    game_over = False

    while not game_over:

        input_tm1 = input_t  # state the action is chosen from

        # Epsilon-greedy action selection.
        if np.random.rand() <= epsilon:
            action = np.random.randint(0, num_actions, size=1)[0]  # explore
        else:
            q = model.predict(input_tm1)
            action = np.argmax(q[0])                               # exploit

        input_t, reward, game_over, _info = env.step(action)
        input_t = input_t.reshape((1, observation_shape))

        acc_reward += reward

        agent.remind([input_tm1, action, reward, input_t], game_over)  # store experience

        # Re-sync the target network with the online network every `everyC`
        # steps (and at the first step of each episode, since C restarts at 0).
        # Copying the weights in memory avoids a disk round-trip per sync.
        if C % everyC == 0:
            target_model.set_weights(model.get_weights())

        C += 1

        inputs, targets = agent.get_batch(target_model, model, batch_size=batch_size)

        t2 = time.time()  # time only the gradient step
        loss += model.train_on_batch(inputs, targets)
        t3 = time.time()
        actual_total += t3 - t2
        time_step += 1

        if acc_reward >= 200:  # score threshold reached: end the episode and count a win
            game_over = True
            win_cnt += 1

    print(time_step,'time steps done, ',e,'episodes done. Reward :', acc_reward, ', loss :', loss)

# Persist the trained network once, under the same file names as before.
# to_json() already returns a JSON string; it is dumped as-is to keep the
# on-disk format unchanged.
model.save_weights("model_cartpole_TARGET", overwrite=True)
with open("model_cartpole_TARGET.json", "w") as outfile:
    json.dump(model.to_json(), outfile)

t1 = time.time()  # end of training
total = t1 - t0
print('Total training time :', total,'Actual training time :', actual_total)
# Guard against a run in which no episode was played.
print('Win ratio (nb of games won/nb of games played) :', win_cnt / e if e else 0.0)

10 time steps done,  1 episodes done. Reward : 10.0 , loss : 4.714401543140411
19 time steps done,  2 episodes done. Reward : 9.0 , loss : 3.815333664417267
28 time steps done,  3 episodes done. Reward : 9.0 , loss : 4.476900339126587
39 time steps done,  4 episodes done. Reward : 11.0 , loss : 7.107419312000275
49 time steps done,  5 episodes done. Reward : 10.0 , loss : 8.374745905399323
59 time steps done,  6 episodes done. Reward : 10.0 , loss : 10.59276008605957
68 time steps done,  7 episodes done. Reward : 9.0 , loss : 12.984484612941742
76 time steps done,  8 episodes done. Reward : 8.0 , loss : 10.284712731838226
86 time steps done,  9 episodes done. Reward : 10.0 , loss : 16.13631683588028
97 time steps done,  10 episodes done. Reward : 11.0 , loss : 25.485287189483643
110 time steps done,  11 episodes done. Reward : 13.0 , loss : 23.956787943840027
119 time steps done,  12 episodes done. Reward : 9.0 , loss : 15.109153747558594
131 time steps done,  13 episodes done. Reward 

In [10]:
# Greedy evaluation of the online network: no exploration and no training.
nb_e_test = 10   # number of evaluation episodes
total_rew = 0    # reward summed over all evaluation episodes

print('Testing for ',nb_e_test,'episodes ...')

for episode in range(nb_e_test):
    input_t = env.reset().reshape((1, observation_shape))
    acc_reward = 0
    game_over = False

    while not game_over:
        # Always take the action with the highest predicted Q-value.
        greedy_q = model.predict(input_t)[0]
        action = np.argmax(greedy_q)

        next_obs, reward, env_done, _info = env.step(action)
        input_t = next_obs.reshape((1, observation_shape))
        acc_reward += reward

        # The episode ends when the environment says so or the score reaches 200.
        game_over = env_done or acc_reward >= 200

    total_rew += acc_reward

    print(episode,'episodes done. Reward :', acc_reward)

print('The average reward over the test was :',total_rew/nb_e_test)

Testing for  10 episodes ...
0 episodes done. Reward : 186.0
1 episodes done. Reward : 200.0
2 episodes done. Reward : 178.0
3 episodes done. Reward : 165.0
4 episodes done. Reward : 162.0
5 episodes done. Reward : 196.0
6 episodes done. Reward : 200.0
7 episodes done. Reward : 200.0
8 episodes done. Reward : 164.0
9 episodes done. Reward : 200.0
The average reward over the test was : 185.1
