<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [12]:
import random
import numpy as np

class Corridor():
    """One-dimensional corridor game made of five tiles.

    Tile contents by index: 0 hole, 1 empty, 2 empty, 3 beer, 4 empty.
    The player moves 'left' or 'right' one tile at a time. Landing on the
    hole (reward -100) or on the beer (reward 100) ends the game.
    """

    def __init__(self):
        """Create the board; the player is not placed until start() is called."""
        self.states = [0, 1, 2, 3, 4] # tiles in the corridor
        self.rewards = [-100, 0, 0, 100, 0] # hole, empty, empty, beer, empty
        self.actions = ['left', 'right'] # actions the player can take in the game
        self.stop_locations = [0, 3] # if the player ends on this tile, the game is over
        self.player_loc = None # player location; stays None until start() is called
        self.dimensions = (len(self.states), len(self.actions)) # states x actions, use as Q-table's dimensions


    def start(self, location):
        """Place the player on a tile and report the outcome of spawning there.

        Parameters
        ----------
        location : int
            Tile to spawn on; must be one of ``self.states``.

        Returns
        -------
        (reward, done) : tuple
            Reward of the spawn tile and whether that tile ends the game.

        Raises
        ------
        IndexError
            If ``location`` is not a tile of the corridor. Negative values
            are rejected as well: they would otherwise wrap around through
            Python's negative indexing and put the player on a tile that is
            not part of ``self.states``.
        """
        if location not in self.states:
            raise IndexError('location {!r} is not a tile of the corridor'.format(location))

        self.player_loc = location
        reward = self.rewards[self.player_loc]
        done = self.player_loc in self.stop_locations

        return reward, done


    def step(self, action):
        """Move the player one tile and report what happened.

        Parameters
        ----------
        action : str
            'left' or 'right'. Any other value leaves the player in place.

        Returns
        -------
        (observation, reward, done) : tuple
            New player location, reward of that tile and whether the game
            has finished.
        """
        old_loc = self.player_loc # keep track of player origin
        # move player according to action
        if action == 'left':
            self.player_loc = self.player_loc - 1
        elif action == 'right':
            self.player_loc = self.player_loc + 1

        # if player tries to move out of bounds, keep him in place
        if self.player_loc not in self.states:
            self.player_loc = old_loc

        # return observation of new location, reward and whether game has finished
        observation = self.player_loc
        reward = self.rewards[self.player_loc]
        done = self.player_loc in self.stop_locations

        return observation, reward, done


    def __repr__(self):
        """Return the board as a string, e.g. 'HPEBE' with the player on tile 1.

        H = hole, E = empty, B = beer, P = player. Before start() has been
        called there is no player yet, so the bare board 'HEEBE' is returned
        instead of failing on the None location.
        """
        corridor = list('HEEBE')
        if self.player_loc is not None:
            corridor[self.player_loc] = 'P'
        return ''.join(corridor)


class Player():
    """Tabular Q-learning agent for games with discrete states and actions.

    The game passed to learn() and test() must expose ``states``,
    ``actions``, ``start(location) -> (reward, done)`` and
    ``step(action_name) -> (observation, reward, done)``.
    """

    def __init__(self, Q_dim):
        """Create a player whose Q-table has shape ``Q_dim`` (states x actions)."""
        # Q-table required for learning
        self.Q_table = np.zeros(Q_dim)


    def learn(self, game, episodes, discount_rate=0.99, e=0.2, max_steps=1000):
        """Learn a game by playing it ``episodes`` times with an epsilon-greedy policy.

        Parameters
        ----------
        game : object
            Game to play (see the class docstring for the required interface).
        episodes : int
            Number of games to play.
        discount_rate : float
            Weight given to the best Q-value of the next state.
        e : float
            Probability of taking a random action instead of the greedy one.
        max_steps : int or None
            Maximum number of moves per game; None disables the limit.
            Without a limit a greedy policy that bounces between two tiles
            (possible while the Q-table is still incomplete) never finishes.

        Returns
        -------
        list
            Total reward collected in each game.
        """
        scores = [] # keep track of scores obtained per game
        n_states = len(game.states)
        n_actions = len(game.actions)

        for _ in range(episodes):
            # spawn player at a random location
            location = random.randint(0, n_states - 1)
            score, done = game.start(location)
            steps = 0
            # truthiness test instead of `is not True`, so that flags such as
            # numpy.bool_ also terminate the loop
            while not done and (max_steps is None or steps < max_steps):
                # take a random action or a greedy action
                if random.uniform(0, 1) < e:
                    # take random action
                    action = random.randint(0, n_actions - 1)
                else:
                    # take best action according to what we've learned so far
                    action = int(np.argmax(self.Q_table[location, :]))
                # check what happened after the action
                new_location, reward, done = game.step(game.actions[action])
                # update Q-table: the entry is overwritten with the new estimate
                self.Q_table[location, action] = reward + discount_rate * np.max(self.Q_table[new_location, :])
                # update player location and score
                score += reward
                location = new_location
                steps += 1

            scores.append(score) # save score obtained during last game

        return scores


    def test(self, game, episodes, verbose=False, max_steps=1000):
        """Play a game ``episodes`` times, always taking the greedy action.

        Parameters
        ----------
        game : object
            Game to play (see the class docstring for the required interface).
        episodes : int
            Number of games to play.
        verbose : bool
            If True, print the start location, the board before every move
            and the outcome of each game.
        max_steps : int or None
            Maximum number of moves per game; None disables the limit.
            Guards against a Q-table whose greedy policy never reaches a
            game-ending tile.

        Returns
        -------
        list
            Total reward collected in each game.
        """
        scores = [] # keep track of scores obtained per game

        for _ in range(episodes):
            # spawn player at a random location
            location = random.randint(0, len(game.states) - 1)
            score, done = game.start(location)
            if verbose:
                print('Start location: ', location)
            steps = 0
            # if player didn't spawn on game ending tile...
            while not done and (max_steps is None or steps < max_steps):
                # use Q-table to decide next action
                action = int(np.argmax(self.Q_table[location, :]))
                if verbose:
                    print(game)
                new_location, reward, done = game.step(game.actions[action])
                # update player location and score
                score += reward
                location = new_location
                steps += 1

            if verbose:
                print(game)
                if not done: # game was cut short, it is neither won nor lost
                    print('Step limit reached')
                elif location == 0: # player fell in hole (tile 0 is treated as the losing tile)
                    print('Game over')
                else:
                    print('Game won!')
            scores.append(score) # save score obtained during last game

        return scores

In [17]:
SEED = 42 # fixed seed so that a fresh kernel run reproduces the same Q-table and scores
random.seed(SEED) # Player draws spawn tiles and exploratory moves from the `random` module

game = Corridor() # initialize the game
Bart = Player(game.dimensions) # initialize the player
# play 1000 games to learn; with e=0.5 about half of the moves are random, the others greedy
learn_scores = Bart.learn(game, 1000, discount_rate=0.4, e=0.5)
print(Bart.Q_table) # Q-table the player learned
test_scores = Bart.test(game, 5, verbose=True) # test if player does a good job
print(np.mean(learn_scores), np.mean(test_scores), test_scores) # mean learning score, mean test score, test scores

[[   0.    0.]
 [-100.   40.]
 [  16.  100.]
 [   0.    0.]
 [ 100.   40.]]
Start location:  3
HEEPE
Game won!
Start location:  1
HPEBE
HEPBE
HEEPE
Game won!
Start location:  3
HEEPE
Game won!
Start location:  3
HEEPE
Game won!
Start location:  3
HEEPE
Game won!
43.0 100.0 [100, 100, 100, 100, 100]
