In [None]:
import gym
import numpy as np
from IPython.display import clear_output, display
from collections import deque
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import random
import copy
import matplotlib.pyplot as plt
import os

from collections import deque
from timeit import default_timer as timer

import os
from matplotlib.ticker import FormatStrFormatter


In [None]:
# Build the CartPole-v1 environment used throughout the notebook.
env = gym.make('CartPole-v1')

# state_size: length of one observation vector (for CartPole: cart position,
#             cart velocity, pole angle, pole angular velocity).
# action_size: number of discrete actions (per the Gym CartPole documentation
#              these are 0 = push the cart left and 1 = push the cart right).
state_size, action_size = env.observation_space.shape[0], env.action_space.n

print('state_vector_size:', state_size)
print('action_vector_size:', action_size)

In [None]:
## a method to create a Deep-Q network for the agent. 
def q_network(numInput, numHidden, numHiddenLayers, numOutput, optimiserFunction=None,
              alpha=0.00025, lossFunction="mse", hiddenActivation="relu", outputActivation="linear"):
    """Build and compile a fully connected Keras model that maps a state to Q-values.

    Layout: one ``Dense(numHidden)`` layer fed by the input, then
    ``numHiddenLayers`` further ``Dense(numHidden)`` layers, then a
    ``Dense(numOutput)`` output layer. The model therefore contains
    ``numHiddenLayers + 1`` hidden layers in total.

    Parameters
    ----------
    numInput : int
        Size of the input (state) vector.
    numHidden : int
        Number of units in every hidden layer.
    numHiddenLayers : int
        Number of hidden layers added *after* the first one.
    numOutput : int
        Number of output units (one Q-value per action).
    optimiserFunction : object, optional
        Ignored. Kept only so existing positional/keyword callers keep working.
        The previous default, ``tf.train.AdamOptimizer``, was evaluated when the
        ``def`` statement ran and does not exist in TensorFlow 2.x, which made
        the whole cell fail before any network could be built.
    alpha : float
        Learning rate handed to the Keras ``Adam`` optimiser.
    lossFunction : str
        Loss passed to ``compile``.
    hiddenActivation, outputActivation : str
        Activations for the hidden layers and the output layer.

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    network = Sequential()

    # First hidden layer; it also declares the input dimension.
    network.add(Dense(numHidden, input_dim=numInput, activation=hiddenActivation))

    # Additional hidden layers.
    for _ in range(numHiddenLayers):
        network.add(Dense(numHidden, activation=hiddenActivation))

    # Output layer: one value per action.
    network.add(Dense(numOutput, activation=outputActivation))

    # NOTE(review): recent Keras releases spell this argument `learning_rate`;
    # `lr` is kept to match the Keras version this notebook was written for.
    network.compile(loss=lossFunction, optimizer=Adam(lr=alpha))

    return network

In [None]:
# The full agent class. Implements dumb/random experience replay. Also take note of how large tau is. 
class DQN_Agent():
    """Deep Q-learning agent with an online network, a target network and a replay buffer.

    The agent picks actions epsilon-greedily from ``q_network``, stores
    transitions in ``replay_buffer`` (a plain, unbounded list), learns from
    uniformly sampled transitions, and blends the online weights into
    ``q_target_network`` after each learning call.

    Parameters
    ----------
    environment
        Environment the agent acts in; its ``action_space.sample()`` supplies
        random exploratory actions.
    numObservations, numActions : int
        Input and output sizes of both networks.
    numHidden, numHiddenLayer : int
        Forwarded to ``q_network`` as the hidden width and layer count.
    modelFileName : str
        Weights file read when ``load`` is true.
    gamma : float
        Discount factor applied to the bootstrapped target.
    alpha : float
        Stored on the instance. NOTE(review): it is not forwarded to
        ``q_network``, so the networks are compiled with that function's own
        default learning rate -- confirm which value is intended.
    epsilon, epsilonDecay, epsilon_min : float
        Exploration rate, its per-call multiplicative decay and its floor.
    tau : float
        Blend factor used for the target update after each learning call.
    load : bool
        When true, the online network's weights are loaded from ``modelFileName``.
    save : bool
        Accepted for compatibility; not used by the class itself.
    """

    def __init__(self, environment, numObservations, numActions, numHidden, numHiddenLayer, modelFileName,
                 gamma=0.99, alpha=0.00005, epsilon=1, epsilonDecay=0.995, epsilon_min=0, tau=0.5,
                 load=False, save=True):
        # Keep the environment so exploration samples from the environment this
        # agent was constructed with rather than from a notebook-level global.
        self.environment = environment

        self.numObservations = numObservations
        self.numActions = numActions
        self.numHidden = numHidden
        self.numHiddenLayer = numHiddenLayer

        self.gamma = gamma
        self.alpha = alpha
        self.tau = tau

        self.epsilon = epsilon
        self.epsilonDecay = epsilonDecay
        self.epsilon_min = epsilon_min

        # Replay buffer of (action, state, next_state, reward, done) tuples.
        self.replay_buffer = []

        # Online network (trained) and target network (provides bootstrap targets).
        self.q_network = q_network(self.numObservations, self.numHidden, self.numHiddenLayer, self.numActions)
        self.q_target_network = q_network(self.numObservations, self.numHidden, self.numHiddenLayer, self.numActions)

        if load:
            self.q_network.load_weights(modelFileName)

        # tau = 1 copies the online weights into the target network outright.
        self.soft_target_weight_update(1)

    def updateEpsilon(self):
        """Decay epsilon by ``epsilonDecay`` and clamp it at ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilonDecay

        if self.epsilon < self.epsilon_min:
            self.epsilon = self.epsilon_min

    def select_epsilon_greedy_action(self, state):
        """Return a random action with probability epsilon, else the greedy one.

        ``state`` is passed straight to ``q_network.predict``; the greedy action
        is the argmax over the first row of the prediction.
        """
        if np.random.uniform(0, 1) < self.epsilon:
            return self.environment.action_space.sample()  # random action

        q_values = self.q_network.predict(state)
        return np.argmax(q_values[0])                      # greedy action

    def soft_target_weight_update(self, tau):
        """Set target weights to ``tau * online + (1 - tau) * target``, layer by layer."""
        online_weights = self.q_network.get_weights()
        target_weights = self.q_target_network.get_weights()

        # The per-layer arrays have different shapes, so they are blended one at
        # a time. Wrapping the whole list in a single NumPy array would create a
        # ragged array, which NumPy >= 1.24 rejects with a ValueError.
        blended = [
            (online * tau) + (target * (1 - tau))
            for online, target in zip(online_weights, target_weights)
        ]
        self.q_target_network.set_weights(blended)

    def learn_from_transition(self, action, state, next_state, reward, done):
        """Fit the online network towards the one-step target for one transition.

        Only the entry for ``action`` is changed in the target vector: it becomes
        ``reward`` for a terminal transition, otherwise
        ``reward + gamma * max(target_network(next_state))``.
        """
        # Start from the current predictions so the other actions contribute
        # zero error to the fit.
        target_q_values = self.q_network.predict(state)

        if done:
            target_q_values[0][action] = reward
        else:
            next_q_values = self.q_target_network.predict(next_state)
            target_q_values[0][action] = reward + (self.gamma * np.max(next_q_values))

        self.q_network.fit(state, target_q_values, epochs=4, batch_size=32, verbose=0)

    def learn_from_m_random_transitions_in_replay_buffer(self, number_of_trans = 1):
        """Learn from ``number_of_trans`` uniformly sampled transitions (with replacement).

        Afterwards the target network is updated with ``self.tau``. If the replay
        buffer is empty there is nothing to sample, so the call returns without
        learning or touching the target network.
        """
        assert number_of_trans >= 1

        if not self.replay_buffer:
            return

        for _ in range(number_of_trans):
            action, state, next_state, reward, done_bool = random.choice(self.replay_buffer)
            self.learn_from_transition(action, state, next_state, reward, done_bool)

        self.soft_target_weight_update(self.tau)  # Always do a target weight update afterwards.

    def add_to_replay_buffer(self, action, current_state, next_state, reward, done):
        """Append one (action, state, next_state, reward, done) tuple to the buffer."""
        transition_tuple = (action, current_state, next_state, reward, done)
        self.replay_buffer.append(transition_tuple)

## Training. This cell contains the agent/environment loop. 

In [None]:
# Size of the state/action space, respectively
numObservation = env.observation_space.shape[0]
numActions = env.action_space.n

numHidden = 24                  # units in each dense hidden layer
numHiddenLayer = 2              # hidden-layer count handed to the agent

modelFileName = "Polecart_weights.h5"

agent = DQN_Agent(env, numObservation, numActions, numHidden, numHiddenLayer, modelFileName,
                  save=True, load=False)

starting_state = env.reset()  # Getting things ready

num_episodes = 2000
max_steps = 10**6     # Effectively infinite for our purposes, as it should be.

TRANSITIONS_PER_UPDATE = 50   # replay samples learned from after every episode
MEAN_REWARD_WINDOW = 20       # number of episodes in the moving-average plot

# Number of environment steps taken in each episode.
# A dense (num_episodes, max_steps) reward array was allocated here before; at
# 2000 x 10**6 float64 values that is roughly 16 TB and it was never read, so
# it has been dropped.
steps_array = np.zeros(num_episodes)

rewardList = []     # total reward per episode
meanRewards = []    # moving average of rewardList over MEAN_REWARD_WINDOW episodes
episodeList = []    # epsilon in effect during each episode (name kept for compatibility)

for episode in range(num_episodes):
    step = 0          # environment steps taken so far in this episode
    reward_sum = 0

    # Reset the environment and choose the first action of the episode.
    current_state = env.reset().reshape(1, numObservation)
    chosen_action = agent.select_epsilon_greedy_action(current_state)

    # The main agent/environment loop for a given episode ----------------------
    done = False
    while not done and step < max_steps:

        next_state, reward, done, info = env.step(chosen_action)
        next_state = next_state.reshape(1, numObservation)  # the networks expect a batch axis

        agent.add_to_replay_buffer(chosen_action, current_state, next_state, reward, done)

        # Setting things up for the next iteration of the while loop
        current_state = next_state
        chosen_action = agent.select_epsilon_greedy_action(current_state)

        step += 1
        reward_sum += reward

        # If we are on the last episode, render the agent playing
        if episode == num_episodes - 1:
            env.render()

    steps_array[episode] = step

    # Learn from randomly sampled transitions after every episode.
    agent.learn_from_m_random_transitions_in_replay_buffer(TRANSITIONS_PER_UPDATE)
    rewardList.append(reward_sum)
    episodeList.append(agent.epsilon)

    agent.updateEpsilon()

    clear_output(wait=True)

    # Down here is just plotting code. It can be safely removed without affecting anything else.
    if len(rewardList) >= MEAN_REWARD_WINDOW:
        meanRewards.append(np.mean(rewardList[-MEAN_REWARD_WINDOW:]))

    fig = plt.figure(figsize=(20, 10))

    # Left panel: per-episode reward (green) with epsilon (blue) on a twin axis.
    ax_reward = plt.subplot(2, 2, 1)
    ax_epsilon = ax_reward.twinx()
    ax_reward.plot(range(episode + 1), rewardList, 'g-')
    ax_epsilon.plot(range(episode + 1), episodeList, 'b-')
    ax_reward.set_xlabel('episode')
    ax_reward.set_ylabel('episode reward')
    ax_epsilon.set_ylabel('epsilon')

    # Right panel: moving average of the episode reward.
    ax_mean = plt.subplot(2, 2, 2)
    ax_mean.plot(range(len(meanRewards)), meanRewards)
    ax_mean.set_xlabel('episodes since the window filled')
    ax_mean.set_ylabel('mean reward')

    plt.show()
    plt.close(fig)  # release the figure; one is created per episode


# Saving the trained model
agent.q_network.save_weights(modelFileName)
env.close()

# More plotting.
plt.plot(range(len(rewardList)), rewardList)
plt.xlabel('episode')
plt.ylabel('episode reward')
plt.show()