In [None]:
# Pygame installation
!pip install pygame

In [None]:
import pygame as pg
from pygame import image as img

In [None]:
# Gym Imports
import gym
from gym import Env
from gym.spaces import Discrete, MultiDiscrete # different types of spaces

# Helpers
import numpy as np
import random
import os

# Stable baselines
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Time module to make program halt for presentation purposes
import time

# for checking duplicates in our actions
from collections import Counter

In [None]:
# Function definitions for use
def load_image(file):
    """Load an image and prepare it for blitting with per-pixel transparency.

    Args:
        file: Path (or file-like object) handed to ``pg.image.load``.

    Returns:
        The loaded surface after ``convert_alpha()``, which keeps the
        transparency stored in .png files.

    Raises:
        SystemExit: If the image cannot be loaded; the message names the file
            and carries the underlying error text.
    """
    try:
        surface = pg.image.load(file)
    except (pg.error, FileNotFoundError) as err:
        # pygame 2 reports a missing file as FileNotFoundError instead of
        # pg.error, so both are turned into the same readable exit message.
        raise SystemExit('Could not load image "%s" %s' % (file, err)) from err
    # NOTE(review): convert_alpha() presumably needs a display mode to be set
    # already - every cell that calls load_image() calls set_mode() first.
    return surface.convert_alpha()

In [None]:
# Start pygame and open a display before the sprite classes below are defined:
# their class bodies call load_image(), which ends in convert_alpha().
# `win` is also the module-level name that PathEnv.render() draws to.
pg.init()
win = pg.display.set_mode((0,0))
pg.display.set_caption("Custom Environment - Guess Path")

class Robot(pg.sprite.Sprite):
    """Sprite for the agent; the image shown reflects the last movement direction."""
    # Direction sprites in action order: 0 right, 1 left, 2 up, 3 down.
    images = [
        load_image("assets/sprites/robo_R.png"),
        load_image("assets/sprites/robo_L.png"),
        load_image("assets/sprites/robo_U.png"),
        load_image("assets/sprites/robo_D.png"),
    ]

    def __init__(self, pos):
        """Join the sprite groups in ``containers`` and centre the robot on ``pos``."""
        super().__init__(self.containers)
        self.image = self.images[0]  # start out facing right
        self.setPosition(pos)

    def update(self, action: int):
        """Show the sprite matching ``action``; a negative action changes nothing."""
        if action >= 0:
            self.image = self.images[action]

    def setPosition(self, newPos):
        """Give the robot a fresh rect whose centre sits on ``newPos``."""
        frame = self.image.get_rect()
        # rect.x / rect.y address the top-left corner, so shift by half the size
        frame.x = newPos[0] - (frame.width / 2.0)
        frame.y = newPos[1] - (frame.height / 2.0)
        self.rect = frame

class Space(pg.sprite.Sprite):
    """Sprite for one board tile: neutral, positive (push forward) or negative (push back)."""
    # Tile sprites indexed by the value given to setType():
    # 0 normal, 1 plus, 2 minus, 3 start, 4 goal.
    images = [
        load_image("assets/sprites/normal.png"),
        load_image("assets/sprites/plus.png"),
        load_image("assets/sprites/minus.png"),
        load_image("assets/sprites/start.png"),
        load_image("assets/sprites/goal.png"),
    ]
    spaceType = 0

    def __init__(self, pos, jumpDistance):
        """Create a tile centred on ``pos`` whose kind follows the sign of ``jumpDistance``."""
        super().__init__(self.containers)

        self.jumpDistance = jumpDistance

        # zero -> neutral (0), positive -> plus (1), anything else -> minus (2)
        if jumpDistance == 0:
            kind = 0
        elif jumpDistance > 0:
            kind = 1
        else:
            kind = 2
        self.image = self.images[kind]
        self.spaceType = kind

        # rect.x / rect.y address the top-left corner, so shift by half the size
        frame = self.image.get_rect()
        frame.x = pos[0] - (frame.width / 2.0)
        frame.y = pos[1] - (frame.height / 2.0)
        self.rect = frame

    def setType(self, spaceType):
        """Switch the tile to ``spaceType`` (an index into ``images``) and reset its push."""
        self.image = self.images[spaceType]
        # The start tile (3) keeps its own picture but is reported as a neutral tile
        self.spaceType = 0 if spaceType == 3 else spaceType
        # Only plus (1) and minus (2) tiles push the robot; every other type pushes by 0
        self.jumpDistance = {1: 2, 2: -2}.get(spaceType, 0)

    def getType(self):
        """Return the stored space type."""
        return self.spaceType

# Initialize Game Groups
# NOTE(review): the name `all` shadows Python's built-in all() for every later
# cell of this notebook - consider renaming it (e.g. all_sprites) everywhere.
all = pg.sprite.RenderUpdates()
    
# Every Space / Robot passes `containers` to Sprite.__init__, so each new sprite joins this group
Space.containers = all
Robot.containers = all

# Board does not require rendering
        
class Board():
    """Grid of Space tiles plus the robot's logical position on that grid.

    The grid has ``NUM_ROWS`` rows and ``maxCols`` columns. Positions are kept
    as ``[row, col]`` lists: the start is the first column of the middle row,
    the goal is the last column of that same row.
    """

    NUM_ROWS = 5
    TILE_SIZE = 64     # pixel pitch between neighbouring tiles
    ROBOT_OFFSET = 96  # pixel offset added when placing the robot sprite

    # Columns that hold exactly one plus tile, the row that tile sits in for
    # the fixed layout, and the columns that are filled with minus tiles.
    PLUS_COLUMNS = (3, 7, 11)
    PRESET_PLUS_ROWS = (4, 4, 2)
    MINUS_COLUMNS = (4, 8, 12)

    def __init__(self, maxCols, pos):
        """Build the grid of neutral tiles.

        Args:
            maxCols: Number of columns; values below 1 are raised to 1.
            pos: ``(x, y)`` pixel position on which the top-left tile is centred.
        """
        self.spaces = [[] for _ in range(self.NUM_ROWS)]

        # A board needs at least one column
        self.maxCols = max(maxCols, 1)

        # Thanks to Jack Malone for help with this loop
        for rowIndex, row in enumerate(self.spaces):
            for col in range(self.maxCols):
                tilePos = ((self.TILE_SIZE * col) + pos[0],
                           (self.TILE_SIZE * rowIndex) + pos[1])
                row.append(Space(tilePos, 0))

        self.middle = (len(self.spaces) // 2)
        self.playerPos = [self.middle, 0]
        self.goal = [self.middle, self.maxCols - 1]

    def setStartEnd(self):
        """Mark the robot's current tile as the start (type 3) and the goal tile as the goal (type 4)."""
        self.spaces[self.playerPos[0]][self.playerPos[1]].setType(3)
        self.spaces[self.goal[0]][self.goal[1]].setType(4)

    def update(self, robot: "Robot", action: int):
        """Move the robot one step, apply the push of the tile it lands on, and reposition its sprite.

        Args:
            robot: Object whose ``setPosition`` is called with the new pixel pair.
            action: 0 right, 1 left, 2 up, 3 down. A negative action is a no-op.
                Any other value moves nothing, but the current tile's push
                still applies.

        Returns:
            The extra reward for this step: 2 for a completed forward push,
            -2 for a completed backward push, otherwise 0 (including pushes
            that were cut short by the edge of the board). Always a number,
            never None.
        """
        additionalReward = 0  # calculate reward for robot here

        if action < 0:
            return additionalReward

        lastRow = len(self.spaces) - 1
        lastCol = self.maxCols - 1

        if action == 0:    # right
            if self.playerPos[1] < lastCol:
                self.playerPos[1] += 1
        elif action == 1:  # left
            if self.playerPos[1] > 0:
                self.playerPos[1] -= 1
        elif action == 2:  # up
            if self.playerPos[0] > 0:
                self.playerPos[0] -= 1
        elif action == 3:  # down
            if self.playerPos[0] < lastRow:
                self.playerPos[0] += 1

        # The robot may have landed on a push tile, which moves it along its row
        pushValue = self.spaces[self.playerPos[0]][self.playerPos[1]].jumpDistance

        if pushValue != 0:
            additionalReward = 2 if pushValue > 0 else -2
            self.playerPos[1] += pushValue

            # Only a push that stays on the board earns (or costs) reward;
            # one that overshoots is clamped to the edge and is worth nothing.
            if self.playerPos[1] > lastCol:
                self.playerPos[1] = lastCol
                additionalReward = 0
            elif self.playerPos[1] < 0:
                self.playerPos[1] = 0
                additionalReward = 0

        # The pixel pair is ordered (row-based, column-based); PathEnv.render()
        # swaps rect.x and rect.y when blitting, so keep this ordering in sync.
        robot.setPosition(((self.playerPos[0] * self.TILE_SIZE) + self.ROBOT_OFFSET,
                           (self.playerPos[1] * self.TILE_SIZE) + self.ROBOT_OFFSET))

        return additionalReward

    def initializeSpaces(self):
        """Reset every tile to neutral, then lay out the fixed plus / minus tiles.

        Used when the non-deterministic elements are disabled. Columns that do
        not exist on a narrow board are skipped instead of raising IndexError.
        """
        for row in self.spaces:
            for space in row:
                space.setType(0)

        # One plus (blue) tile in each plus column ...
        for plusRow, plusCol in zip(self.PRESET_PLUS_ROWS, self.PLUS_COLUMNS):
            if plusCol < self.maxCols:
                self.spaces[plusRow][plusCol].setType(1)

        # ... and a full column of minus (red) tiles directly after each of them
        for minusCol in self.MINUS_COLUMNS:
            if minusCol < self.maxCols:
                for row in self.spaces:
                    row[minusCol].setType(2)

    def randomizeSpaces(self):
        """Move the plus tile of each plus column to a randomly chosen row.

        The AI can easily learn where to go if the plus tiles never move, so
        when randomisation is enabled each one gets a new row on every run.
        """
        lastRow = len(self.spaces) - 1
        # Draw all rows first so the random stream is consumed in a fixed order
        newRows = [random.randint(0, lastRow) for _ in self.PLUS_COLUMNS]

        for plusCol, newRow in zip(self.PLUS_COLUMNS, newRows):
            if plusCol >= self.maxCols:
                continue  # column does not exist on this board
            # Clear whatever the column held before placing the new plus tile
            for row in self.spaces:
                row[plusCol].setType(0)
            self.spaces[newRow][plusCol].setType(1)

    def winCheck(self):
        """Return True when the robot stands on the goal tile."""
        return (self.playerPos[0] == self.goal[0]
                and self.playerPos[1] == self.goal[1])

# Shut pygame down again now that the sprites are loaded; the cells that
# render open their own display with set_mode() before drawing.
pg.quit()

In [None]:
class PathEnv(Env):
    """Gym environment: steer the robot from the start tile to the goal tile.

    Actions (``Discrete(4)``): 0 right, 1 left, 2 up, 3 down.

    Observation, as built by determineState(): ``[cell, up, left, down, right]``
    where ``cell = col + row * maxCols`` and each neighbour entry is the
    getType() value of the adjacent tile: 0 for a neutral or start tile and
    also for "no tile, off the board", 1 for a plus tile, 2 for a minus tile,
    4 for the goal.

    Reward per step: ``STEP_REWARD`` plus the push reward from Board.update(),
    plus ``GOAL_REWARD`` when the goal is reached. An episode ends at the goal
    or after ``EPISODE_LENGTH`` steps.
    """

    # Everything in this class body is created once, when the cell runs, and is
    # shared by ALL PathEnv instances - including the Board and its tiles.
    win = pg.display.set_mode((1600,800))
    pg.display.set_caption("Custom Environment - Guess Path RL")
    bg = load_image('assets/sprites/background.png')
    board = Board(16, (100,100))
    shouldRandomize = False
    environmentPaused = False

    EPISODE_LENGTH = 60  # steps available per episode
    STEP_REWARD = -1     # cost of every step taken
    GOAL_REWARD = 70     # bonus for reaching the goal

    def __init__(self, randomizeBoard, waitTime):
        """Lay out the board and place the robot on the board's current position.

        Args:
            randomizeBoard: When true, the plus tiles are moved to random rows
                now and on every reset() that is not paused.
            waitTime: Seconds render() sleeps after drawing a frame.
        """
        # Actions: 0 - Right, 1 - Left, 2 - Up, 3 - Down
        self.action_space = Discrete(4)
        self.timeToWait = waitTime

        self.shouldRandomize = randomizeBoard

        # Board cell index, then the tile type up, left, down and right of the
        # robot (see the class docstring for the meaning of each value).
        cellCount = len(self.board.spaces) * self.board.maxCols
        self.observation_space = MultiDiscrete([cellCount, 5, 5, 5, 5])

        self.alloted_length = self.EPISODE_LENGTH

        self.board.initializeSpaces()

        if(randomizeBoard):
            self.board.randomizeSpaces()

        self.board.setStartEnd()

        self.robo = Robot(self._robotPixelPos())

        # Work out the starting state only now that the board is laid out
        self.state = self.determineState()

    def _robotPixelPos(self):
        """Return the pixel pair passed to Robot.setPosition() for the current board position."""
        row, col = self.board.playerPos
        # Same (row-based, column-based) ordering that Board.update() uses;
        # render() swaps rect.x / rect.y when blitting to compensate.
        return ((row * 64) + 96, (col * 64) + 96)

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``."""
        # Update Environment elements
        self.robo.update(action)

        additionalReward = self.board.update(self.robo, action)
        if additionalReward is None:
            additionalReward = 0  # a board update that moved nothing reports no reward

        # With elements updated, determine state, reward etc
        self.state = self.determineState()

        # One step of the episode budget has been used
        self.alloted_length -= 1

        reward = self.STEP_REWARD + additionalReward

        # The episode is over once the step budget is spent ...
        done = self.alloted_length <= 0

        # ... or as soon as the goal is found, which also earns a large reward
        if self.board.winCheck():
            reward += self.GOAL_REWARD
            done = True

        # Needed during the return
        info = {}

        # Return step information
        return self.state, reward, done, info

    def render(self, mode="human"):
        """Draw the background, all tiles and the robot, then sleep ``timeToWait`` seconds.

        Draws to the current pygame display surface, so a window must have
        been opened with ``pg.display.set_mode`` beforehand.

        Args:
            mode: Accepted for compatibility with the Gym render signature; not used.

        Raises:
            RuntimeError: If no pygame display surface is available.
        """
        surface = pg.display.get_surface()
        if surface is None:
            raise RuntimeError(
                "PathEnv.render() needs an open pygame window; "
                "call pg.display.set_mode(...) first")

        surface.blit(self.bg, (0,0))
        for row in self.board.spaces:
            for space in row:  # position all spaces to the correct top left position
                surface.blit(space.image, (space.rect.x, space.rect.y))

        # The robot's rect stores (row-based, column-based) pixels, so x and y are swapped here
        surface.blit(self.robo.image, (self.robo.rect.y, self.robo.rect.x))
        pg.display.update()
        time.sleep(self.timeToWait)

    def pauseEnv(self, paused):
        """While paused, reset() leaves the tile layout untouched."""
        self.environmentPaused = paused

    def reset(self):
        """Start a new episode and return its first state.

        Unless the environment is paused, the tiles are laid out again (and
        re-randomised when enabled). The robot always returns to the start.
        """
        if not self.environmentPaused:
            # Reset game elements
            self.board.initializeSpaces()

            if(self.shouldRandomize):
                self.board.randomizeSpaces()

        self.board.playerPos = [self.board.middle,0] # reset player pos on board
        self.robo.setPosition(self._robotPixelPos()) # place robo onto the correct spot

        self.board.setStartEnd()

        # Reset starting state
        self.state = self.determineState()
        # Reset alloted time to interact
        self.alloted_length = self.EPISODE_LENGTH

        return self.state

    def calculateReward(self):
        """Placeholder; the reward is currently computed inside step()."""
        pass

    def determineState(self):
        """Return ``[cell, up, left, down, right]`` for the robot's current position."""
        pos = self.board.playerPos  # [row, col]

        # Row-major cell index: col + row * maxCols
        # [0, 0] gives 0
        # [0, 1] gives 1
        # [1, 0] gives maxCols
        currentSpace = pos[1] + (pos[0] * self.board.maxCols)

        # Tile type in each direction; a direction that leaves the board stays 0
        up = 0
        left = 0
        down = 0
        right = 0

        if pos[0] > 0:
            up = self.board.spaces[pos[0] - 1][pos[1]].getType()
        if pos[1] > 0:
            left = self.board.spaces[pos[0]][pos[1] - 1].getType()
        if pos[0] < len(self.board.spaces) - 1:
            down = self.board.spaces[pos[0] + 1][pos[1]].getType()
        if pos[1] < self.board.maxCols - 1:
            right = self.board.spaces[pos[0]][pos[1] + 1].getType()

        return [currentSpace, up, left, down, right]

# Close the window that the class body above opened
pg.quit()

# Test Environment

The cells below are not code cells. To run them, change each one to a Code cell (select the cell, then Cell -> Cell Type -> Code, or press Y in command mode).

env = PathEnv(False, 0.25)

#-# Run this cell to test that the Environment works properly
#-# This will randomly pick from a Discrete action step, no model is used here.

pg.init()
win = pg.display.set_mode((1160,460))
pg.display.set_caption("Custom Environment - Guess Path RL")

episodes = 5
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    #env.board.randomizeSpaces()
    
    while not done:
        pg.event.get()
        env.render()
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score += reward         
    print('Episode:{} Score:{}'.format(episode, score))
env.close()
pg.quit()

pg.quit() # optional pygame quit in case of error

# Train Model

In [None]:
# Root folder for the TensorBoard logs (passed to PPO as tensorboard_log below)
log_path = os.path.join('training', 'logs')
# NOTE(review): training_log_path is not read by any training cell in this notebook - confirm it is still needed
training_log_path = os.path.join(log_path, 'GuessPath_PPO_Random_3')

In [None]:
# Fixed board layout (randomizeBoard=False) and no delay after rendering (waitTime=0.0)
env = PathEnv(False, 0.0)
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

In [None]:
# NOTE(review): this only rebinds the name `env`; `model` above was built with the
# previous PathEnv instance - confirm whether the model should be pointed at this one.
env = PathEnv(True, 0.25) # change to code cell if you wish to train an existing model further

In [None]:
# NOTE(review): trains for 240000 steps, while the save path below ends in 320000 - confirm which is intended
model.learn(total_timesteps=240000)

# Save Model

In [None]:
guesspath_path = os.path.join('training', 'saved_models', 'GuessPath_PPO_Random_320000')

In [None]:
model.save(guesspath_path)

In [None]:
del(model) # drops the in-memory model after saving; the "Load Model" cell below restores it from disk

# Load Model

In [None]:
# change path end on guesspath_path to change loaded model
guesspath_path = os.path.join('training', 'saved_models', 'GuessPath_PPO_Random_320000')
env = PathEnv(True, 0.25) # set environment if not set already
# Restore the saved model and attach `env` to it
model = PPO.load(guesspath_path, env)

# Watch Model
### Once a model is loaded, run it in the environment and watch it play.

In [None]:
# Open the window the environment renders into
pg.display.init()
win = pg.display.set_mode((1124,460))
pg.display.set_caption("Custom Environment - Guess Path RL")

episodes = 10
try:
    for episode in range(1, episodes+1):
        obs = env.reset()
        done = False
        score = 0

        while not done:
            pg.event.get() # drain the event queue on every frame
            env.render()
            action, _ = model.predict(obs) # predict returns two, but we only require action
            obs, reward, done, info = env.step(action)
            score += reward

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Release the environment and the window even if an episode raised
    env.close()
    pg.display.quit()

In [None]:
pg.display.quit() # run in case of error / crash

### Dynamically Reinforce Learning
- If our Model keeps picking the same kinds of actions, it may be in a state it does not expect.
- To fix this, we can attempt to re-train the Model for a much shorter time.
- This isn't the greatest fix, as a board state that differs greatly from run to run will cause constant re-training.

In [None]:
actionCounter = 4 # number of actions to be counted before attempting a repeat check
repeatThreshold = 2 # number of times an action has to be repeated before the model is considered to be stuck

# The two pairs of opposite actions: right / left and up / down
_OPPOSITE_ACTIONS = ({0, 1}, {2, 3})

def checkRepeats(actions):
    """Tell whether the recent actions look like the model is bouncing back and forth.

    Args:
        actions: Recent actions, oldest first. Each entry may be a plain int
            or a single-value numpy array as returned by ``model.predict``.

    Returns:
        True when exactly two distinct actions were each chosen more than
        once, those two are opposites (0/1 or 2/3), and at least one of them
        was chosen ``repeatThreshold`` times or more. False otherwise.
    """
    # numpy arrays are not hashable, so count plain ints instead
    counts = Counter(int(action) for action in actions)
    duplicates = {action: count for action, count in counts.items() if count > 1}

    # Stuck means bouncing between exactly two repeated actions ...
    if len(duplicates) != 2:
        return False

    # ... that undo each other: right/left or up/down
    if set(duplicates) not in _OPPOSITE_ACTIONS:
        return False

    # Finally, at least one of the two has to be repeated often enough
    return any(count >= repeatThreshold for count in duplicates.values())

def retrain(env, model, total_timesteps=10000):
    """Briefly re-train ``model`` while ``env`` keeps its current board layout.

    Args:
        env: Environment to pause during training and to reset afterwards.
        model: Model whose ``learn`` method is called.
        total_timesteps: Training length; much shorter than the initial training.

    Returns:
        The state returned by ``env.reset()`` after training.

    NOTE(review): ``model.learn`` is called without an environment argument,
    so it presumably trains on whichever environment is attached to ``model`` -
    confirm that this is the same instance as ``env``.
    """
    env.pauseEnv(True) # stop the board from being randomized or changed from its current state
    try:
        model.learn(total_timesteps=total_timesteps) # re-train on a much smaller scale
        return env.reset() # reset while still paused, so the layout is kept
    finally:
        env.pauseEnv(False) # always allow the board to change again, even if training failed

# Dynamically Run Model
- With the above functions declared, we can now let the model play in our environment.
- If the loop detects the Model is picking duplicate actions and is stuck, it will re-train the model briefly.

In [None]:
# NOTE(review): running this creates a PathEnv instance different from the one given to
# PPO.load() above; retrain() pauses this instance - confirm the model trains on the same one.
env = PathEnv(True, 0.25) # set environment if not set already

In [None]:
# Open the window the environment renders into
pg.display.init()
win = pg.display.set_mode((1124,460))
pg.display.set_caption("Custom Environment - Guess Path RL")
actions = []

episodes = 10
try:
    for episode in range(1, episodes+1):
        actions.clear()
        obs = env.reset()
        done = False
        score = 0

        while not done:
            pg.event.get() # drain the event queue on every frame
            env.render()
            action, _ = model.predict(obs) # predict returns two, but we only require action
            actions.append(int(action)) # store a plain int so the actions can be counted
            obs, reward, done, info = env.step(action)
            score += reward

            if len(actions) >= actionCounter: # check the last few actions for repeated actions
                if checkRepeats(actions):
                    retrain(env, model)
                    # retrain() reset the environment, so continue from its fresh state
                    obs = env.state
                actions.clear()

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Release the environment and the window even if an episode raised
    env.close()
    pg.display.quit()

In [None]:
pg.display.quit()
env.pauseEnv(False) # reset in case of crash

### Viewing Logs with Tensorboard
#### Note: Avoid running this within Jupyter, as it will freeze the notebook while it runs.
#### It can still be run within the Notebook purely for demonstration purposes, so it is included here.

In [None]:
import os # uncomment this if you wish to only view logs

In [None]:
check = 'PPO_2_NoRandom320000'

log_path = os.path.join('training', 'logs')
training_log_path = os.path.join(log_path, check)

print(training_log_path)

In [None]:
!tensorboard --logdir={training_log_path}
# if you are running this from jupyter, go to http://localhost:6006 to access