In [2]:
import sys, os
os.chdir(sys.path[0]+'/search')

from pacman import Directions
from game import Agent
import random
import game
import util

class QLearnAgent(Agent):

    # Constructor, called when we start running the
    def __init__(self, alpha=0.3, epsilon=0.2, gamma=0.8, numTraining = 9990):
        # alpha       - learning rate
        # epsilon     - exploration rate
        # gamma       - discount factor
        # numTraining - number of training episodes
        #
        # These values are either passed from the command line or are
        # set to the default values above. We need to create and set
        # variables for them
        self.alpha = float(alpha)
        self.epsilon = float(epsilon)
        self.gamma = float(gamma)
        self.numTraining = int(numTraining)
        # Count the number of games we have played
        self.episodesSoFar = 0
        # dictionary of Q-values
        self.Q_values = dict()
        # placeholder of the previous state
        self.prev_state = None
        # placeholder of the previous action
        self.prev_action = None
        # placeholder of the previous score
        self.prev_score = None


    # Accessor functions for the variable episodesSoFars controlling learning
    def incrementEpisodesSoFar(self):
        self.episodesSoFar += 1

    def getEpisodesSoFar(self):
        return self.episodesSoFar

    def getNumTraining(self):
        return self.numTraining

    # Accessor functions for parameters
    def setEpsilon(self, value):
        self.epsilon = value

    def getAlpha(self):
        return self.alpha

    def setAlpha(self, value):
        self.alpha = value

    def getGamma(self):
        return self.gamma

    def getMaxAttempts(self):
        return self.maxAttempts


    # getAction
    #
    # The main method required by the game. Called every time that
    # Pacman is expected to move
    def getAction(self, state, debug_mode=False):

        """
        Data about current state
        """
        legal = state.getLegalPacmanActions()
        if Directions.STOP in legal:
            legal.remove(Directions.STOP)
        pacman_position = state.getPacmanPosition()
        ghost_positions = state.getGhostPositions()
        food_locations = state.getFood()
        # construct s'
        curr_state = (str(legal), str(pacman_position), str(ghost_positions), str(food_locations))
        if debug_mode:
            print("Legal moves: " + curr_state[0])
            print("Pacman position: " + curr_state[1])
            print("Ghost positions: " + curr_state[2])
            print("Food locations: ")
            print(curr_state[3])
            print("Score: " + str(state.getScore()) + "\n")

        # initialize Q-value
        if state not in self.Q_values:
            self.initialize_Q_values(state, legal)

        # update Q-value
        if self.prev_state != None:
            self.update_Q_value(state)

        # update placeholders
        self.update_placeholders(state, legal)

        return self.prev_action


    """
    training episodes: initialize Q-values
    """
    def initialize_Q_values(self, state, legal):
        self.Q_values[state] = dict()
        for action in legal:
            if action not in self.Q_values[state]:
                self.Q_values[state][action] = 0.0


    """
    training episodes: update Q-value
    """
    def update_Q_value(self, state, final_step=False):
        # calculate R(s)
        reward = state.getScore() - self.prev_score
        # calculate max(Q(s', a'))
        max_Q_value = 0.0
        if not final_step:
            max_Q_value = max(list(self.Q_values[state].values()))
        # update Q(s, a)
        self.Q_values[self.prev_state][self.prev_action] += (self.alpha * (reward + self.gamma * max_Q_value - self.Q_values[self.prev_state][self.prev_action]))


    """
    update placeholders
    """
    def update_placeholders(self, state, legal):
        # register s' as s
        self.prev_state = state
        # register a' as a
        self.prev_action = self.epsilon_greedy(state, legal)
        # register as previous score
        self.prev_score = state.getScore()


    """
    action selection: epsilon-greedy
    """
    def epsilon_greedy(self, state, legal):
        # generate a random probability
        probability = random.random()
        # if probability is less than exploration rate: random action
        if probability < self.epsilon:
            random_action = random.choice(legal)
            return random_action
        # if probability is greater than exploration rate: max Q-value action
        max_Q_action = None
        for action in legal:
            if max_Q_action == None:
                max_Q_action = action
            if self.Q_values[state][action] > self.Q_values[state][max_Q_action]:
                max_Q_action = action
        return max_Q_action


    """
    Reset placeholder variables
    """
    def reset_placeholders(self):
        self.prev_state = None
        self.prev_action = None
        self.prev_score = None


    # Handle the end of episodes
    #
    # This is called by the game after a win or a loss.
    def final(self, state):

        # update Q-value
        if self.prev_state != None:
            self.update_Q_value(state, final_step=True)

        # reset placeholder variables
        self.reset_placeholders()

        # Keep track of the number of games played, and set learning
        # parameters to zero when we are done with the pre-set number
        # of training episodes
        self.incrementEpisodesSoFar()
        if self.getEpisodesSoFar() == self.getNumTraining():
            msg = "Training Done (turning off epsilon and alpha)"
            print("%s\n%s" % (msg,"-" * len(msg)))
            self.setAlpha(0)
            self.setEpsilon(0)

In [3]:
from pacman import runGames, loadAgent
from pacman import Directions
import pacmanAgents
from util import Queue
import textDisplay
import game
import layout
import random
import graphicsDisplay

argumentos ={}
pacman = QLearnAgent()
ghostType = loadAgent('RandomGhost', True)
textDisplay.SLEEP_TIME = 0
argumentos['layout'] = layout.getLayout('smallClassic')
argumentos['pacman'] = pacman
argumentos['ghosts'] = [ghostType( i+1 ) for i in range( 2 )]
argumentos['display'] = graphicsDisplay.PacmanGraphics(1, frameTime = 0)
argumentos['numGames'] = 10000
argumentos['numTraining'] = 9990
argumentos['record'] = False
argumentos['catchExceptions'] = False
argumentos['timeout'] = 1

res = runGames(**argumentos)

Training Done (turning off epsilon and alpha)
---------------------------------------------


In [4]:
for a in res:
    print(a.state.getScore())

-438.0
-400.0
-387.0
-411.0
-384.0
-351.0
-357.0
-274.0
-376.0
-416.0
