# In [None]:
import numpy as np
import gym
import matplotlib.pyplot as plt

# STUDENT NUMBERS
# 1886648
# 1851234 
# 1669326

# EpsilonGreedyQPolicy class that follows the design methodology of the Keras-RL 2 library.
class EpsilonGreedyQPolicy():
    """Epsilon-greedy action selection over a vector of Q-Values.

    With probability ``epsilon`` a uniformly random action index is returned,
    otherwise the index of the largest Q-Value is returned.
    """

    def __init__(self, epsilon = 0.1):
        """Store the exploration probability (0.1 when not supplied)."""
        self.epsilon = epsilon

    def selectAction(self, qValues):
        """Return an action index for the given Q-Values of a single state.

        qValues is expected to expose ``shape`` (e.g. a NumPy array); its
        first dimension is taken as the number of available actions.
        """
        actionCount = qValues.shape[0]

        # One uniform draw decides between exploring and exploiting.
        exploring = np.random.uniform() < self.epsilon
        if not exploring:
            return np.argmax(qValues)

        return np.random.randint(0, actionCount)

# Agent class implementing the SARSA Lambda algorithm. The class' design methodology follows that of the Keras-RL 2 library. Currently only setup for discrete state and action spaces.
class SARSALambaAgent():
    """Tabular SARSA(lambda) agent using accumulating eligibility traces.

    Only discrete observation and action spaces are supported. The class name
    keeps its original spelling so existing callers continue to work.
    """

    def __init__(self, env, learningRate = 0.5, gamma = 1.0, lambdaValue = 0.0, policy = None, terminalStates = None):
        """Store the environment and the SARSA(lambda) hyper-parameters.

        env            -- Gym environment to learn on.
        learningRate   -- step size applied to every Q-Value update.
        gamma          -- discount factor.
        lambdaValue    -- eligibility trace decay parameter.
        policy         -- behaviour policy exposing selectAction(qValues); a
                          fresh EpsilonGreedyQPolicy() is created when omitted.
        terminalStates -- iterable of state indices whose Q-Values are zeroed
                          on reset, or None.
        """
        self.env = env
        self.learningRate = learningRate
        self.gamma = gamma
        self.lambdaValue = lambdaValue
        # The default is built per agent so agents never share one policy object.
        self.policy = EpsilonGreedyQPolicy() if policy is None else policy
        # Epsilon of 0.0 makes the evaluation policy purely greedy.
        self.testPolicy = EpsilonGreedyQPolicy(0.0)
        self.terminalStates = terminalStates
        self.nb_actions = None
        self.nb_states = None
        self.currentState = None
        self.currentAction = None
        self.qValues = None
        self.eligibilityTrace = None
        self.qValuesHistory = None

        # The table sizes are only known for discrete spaces; otherwise they stay None.
        if isinstance(self.env.action_space, gym.spaces.discrete.Discrete):
            self.nb_actions = self.env.action_space.n

        if isinstance(self.env.observation_space, gym.spaces.discrete.Discrete):
            self.nb_states = self.env.observation_space.n

    def _resetEnv(self):
        """Reset the environment and return only the initial state."""
        result = self.env.reset()
        # Newer Gym releases return (observation, info) instead of the bare observation.
        if isinstance(result, tuple):
            return result[0]
        return result

    def _stepEnv(self, action):
        """Step the environment and return (nextState, reward, done)."""
        result = self.env.step(action)
        # Newer Gym releases split `done` into `terminated` and `truncated`.
        if len(result) == 5:
            nextState, reward, terminated, truncated, _ = result
            return nextState, reward, bool(terminated or truncated)
        nextState, reward, done, _ = result
        return nextState, reward, bool(done)

    def reset(self):
        """Re-initialise the Q-Values and clear all learning state.

        Raises TypeError when the environment's spaces are not discrete.
        """
        if self.nb_states is None or self.nb_actions is None:
            raise TypeError("SARSALambaAgent requires discrete observation and action spaces.")

        # Q-Values set to random values.
        self.qValues = np.random.rand(self.nb_states, self.nb_actions)

        # Sets Q-Values belonging to terminal states to 0.
        if self.terminalStates is not None:
            for state in self.terminalStates:
                self.qValues[state] = np.zeros(self.nb_actions)

        # Starts the collection of the history of the Q-Values.
        self.qValuesHistory = np.copy(self.qValues)[np.newaxis]

        self.eligibilityTrace = None
        self.currentState = None
        self.currentAction = None

    def resetEpisode(self):
        """Start a new episode: reset the env, pick the first action, zero the traces."""
        self.currentState = self._resetEnv()
        self.currentAction = self.policy.selectAction(self.qValues[self.currentState])
        self.eligibilityTrace = np.zeros_like(self.qValues)

    def estimateQValues(self, nb_episodes):
        """Run SARSA(lambda) for nb_episodes episodes.

        The Q-Values are re-initialised first. A snapshot of the Q-Values is
        recorded before training and after every episode; it is available via
        getQValuesHistory() with shape (nb_episodes + 1, nb_states, nb_actions).
        """
        self.reset()

        traceDecay = self.gamma * self.lambdaValue
        # Snapshots are gathered in a list and stacked once, avoiding a full
        # copy of the growing history array after every episode.
        snapshots = [np.copy(self.qValues)]

        try:
            for _ in np.arange(nb_episodes):
                self.resetEpisode()
                done = False

                while not done:
                    nextState, reward, done = self._stepEnv(self.currentAction)
                    nextAction = self.policy.selectAction(self.qValues[nextState])
                    error = reward + self.gamma * self.qValues[nextState][nextAction] - self.qValues[self.currentState][self.currentAction]
                    self.eligibilityTrace[self.currentState][self.currentAction] += 1
                    self.qValues += self.learningRate * error * self.eligibilityTrace
                    # Every trace decays by gamma * lambda. Overwriting the whole
                    # table with a single scalar would give all state-action
                    # pairs, visited or not, the same eligibility.
                    self.eligibilityTrace *= traceDecay
                    self.currentState = nextState
                    self.currentAction = nextAction

                # Stores the current episode's Q-Values.
                snapshots.append(np.copy(self.qValues))
        finally:
            # Keep whatever was recorded even if training is interrupted.
            self.qValuesHistory = np.stack(snapshots)

    def evaluateQValues(self):
        """Play one greedy episode with the current Q-Values and print the total reward."""
        self.resetEpisode()
        # resetEpisode chooses the first action with the exploratory training
        # policy; re-select it greedily so the whole episode is evaluated greedily.
        self.currentAction = self.testPolicy.selectAction(self.qValues[self.currentState])
        done = False
        tReward = 0.0

        while not done:
            nextState, reward, done = self._stepEnv(self.currentAction)
            self.currentAction = self.testPolicy.selectAction(self.qValues[nextState])
            self.currentState = nextState
            tReward += reward

        print(tReward)

    def getQValuesHistory(self):
        """Return a copy of the recorded Q-Value history, or None if nothing was recorded."""
        if self.qValuesHistory is None:
            return None
        return np.copy(self.qValuesHistory)

# In [None]:
# Gym environment shared by every agent.
ENV_NAME = 'CliffWalking-v0'
env = gym.make(ENV_NAME)

# One SARSA Lambda Agent per lambda value. All agents use learning rate 0.5,
# gamma 1.0, an epsilon-greedy policy with epsilon 0.1, and state 47 registered
# as the terminal state.
lambdaValues = (0.0, 0.3, 0.5, 0.7, 0.9)
sarsaLambdaAgents = [
    SARSALambaAgent(env, 0.5, 1.0, lambdaValue, EpsilonGreedyQPolicy(0.1), np.array([47]))
    for lambdaValue in lambdaValues
]
(
    sarsaLambdaAgentOne,
    sarsaLambdaAgentTwo,
    sarsaLambdaAgentThree,
    sarsaLambdaAgentFour,
    sarsaLambdaAgentFive,
) = sarsaLambdaAgents

# Each Agent estimates its Q-Values over 500 episodes, one Agent after another.
for sarsaLambdaAgent in sarsaLambdaAgents:
    sarsaLambdaAgent.estimateQValues(500)

# For each Agent: the best Q-Value of every state in every recorded episode.
(
    qValueHistoryOne,
    qValueHistoryTwo,
    qValueHistoryThree,
    qValueHistoryFour,
    qValueHistoryFive,
) = [
    np.amax(sarsaLambdaAgent.getQValuesHistory(), axis = 2)
    for sarsaLambdaAgent in sarsaLambdaAgents
]


# In [None]:
# Creation of the figure with the size based on the common monitor dpi to have a 1920x1080 pixel dimension.
my_dpi = 72
trajectoryFig = plt.figure(figsize=(1920/my_dpi, 1080/my_dpi), dpi=my_dpi, frameon = True)
trajectoryGrid = trajectoryFig.add_gridspec(1, 5)
trajectoryPlot = trajectoryGrid.subplots()
trajectoryFig.suptitle("Q-Values Heat Map")

# Panel title paired with the per-episode best-Q-Value history it displays.
heatMapSeries = (
    ("Lambda: 0.0", qValueHistoryOne),
    ("Lambda: 0.3", qValueHistoryTwo),
    ("Lambda: 0.5", qValueHistoryThree),
    ("Lambda: 0.7", qValueHistoryFour),
    ("Lambda: 0.9", qValueHistoryFive),
)

# The scaling bounds of each agent span ALL of its episodes, so they are
# loop-invariant and computed once here instead of once per frame.
heatMapScales = []
for _, history in heatMapSeries:
    smallestValue = np.amin(history)
    valueRange = np.amax(history) - smallestValue
    # A constant history would otherwise divide by zero.
    if valueRange == 0:
        valueRange = 1.0
    heatMapScales.append((smallestValue, valueRange))

# The saving of the Q-Value heat maps where the Q-Values are scaled|normalized across episodes for each agent and the colour map set to Greens.
for i in range(len(qValueHistoryOne)):
    for axis, (title, history), (smallestValue, valueRange) in zip(trajectoryPlot, heatMapSeries, heatMapScales):
        # Drop the previous frame's image. Otherwise images pile up on the
        # axes and, with alpha below 1, older frames show through newer ones.
        axis.clear()
        axis.set_title(title)
        # Min-max scaling to [0, 1]; equals the earlier abs-based formula
        # whenever the minimum is <= 0 <= the maximum.
        scaledValues = (history[i].reshape(4, 12) - smallestValue) / valueRange
        # vmin/vmax pin the colour scale. Without them imshow stretches the
        # colour map over each frame's own range, undoing the cross-episode
        # normalisation.
        axis.imshow(scaledValues, alpha=0.8, cmap='Greens', vmin=0.0, vmax=1.0)

    trajectoryFig.savefig(str(i) + '.png', facecolor = 'white', transparent = False)