In [None]:
# Pygame installation
!pip install pygame

In [1]:
import pygame as pg
from pygame import image as img

pygame 2.1.0 (SDL 2.0.16, Python 3.9.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Gym Imports
import gym
from gym import Env
from gym.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete # different types of spaces

# Helpers
import numpy as np
import random
import os

# Stable baselines
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Time module to make program halt for presentation purposes
import time

In [3]:
# Function definitions for use
def load_image(file):
    """loads an image, prepares it for play"""
    try:
        surface = pg.image.load(file)
    except pg.error:
        raise SystemExit('Could not load image "%s" %s' % (file, pg.get_error()))
    return surface.convert_alpha() # convert_alpha allows for transparency from .pngs

In [4]:
pg.init()
win = pg.display.set_mode((0,0))
pg.display.set_caption("Custom Environment - Guess Path")

class Robot(pg.sprite.Sprite):
    """Visual for AI space and movement direction"""
    images = [load_image("assets/sprites/robo_R.png"), 
              load_image("assets/sprites/robo_L.png"), 
              load_image("assets/sprites/robo_U.png"),
              load_image("assets/sprites/robo_D.png")]

    def __init__(self, pos):
        pg.sprite.Sprite.__init__(self, self.containers)
        self.image = self.images[0]
        self.rect = self.image.get_rect()
        self.rect.x = pos[0] - (self.rect.width / 2.0)
        self.rect.y = pos[1] - (self.rect.height / 2.0)

    def update(self, action: int):
        if action >= 0:
            self.image = self.images[action]
    
    def setPosition(self, newPos):
        self.rect = self.image.get_rect()
        self.rect.x = newPos[0] - (self.rect.width / 2.0)
        self.rect.y = newPos[1] - (self.rect.height / 2.0)

class Space(pg.sprite.Sprite):
    """Visual for space type. Normal, positive or negative"""
    images = [load_image("assets/sprites/normal.png"), 
              load_image("assets/sprites/plus.png"), 
              load_image("assets/sprites/minus.png"),
              load_image("assets/sprites/start.png"), 
              load_image("assets/sprites/goal.png")]
    spaceType = 0

    def __init__(self, pos, jumpDistance):
        pg.sprite.Sprite.__init__(self, self.containers)
        
        self.jumpDistance = jumpDistance
        
        if(jumpDistance == 0):  # neutral movement - does not push Robot
            self.image = self.images[0]
        elif(jumpDistance > 0): # positive movement - pushes robot forward
            self.image = self.images[1]
        else:                    # negative movement - pushes robot backward
            self.image = self.images[2]

        self.rect = self.image.get_rect()
        self.rect.x = pos[0] - (self.rect.width / 2.0)
        self.rect.y = pos[1] - (self.rect.height / 2.0)
        
    def setType(self, spaceType):
            self.image = self.images[spaceType]
            self.spaceType = spaceType
            
            if spaceType == 3:
                self.spaceType = 0 # if the current space type is the start, set it to a nothing type
            
            if spaceType == 1:
                self.jumpDistance = 2
            elif spaceType == 2:
                self.jumpDistance = -2

# Initialize Game Groups
all = pg.sprite.RenderUpdates()
    
Space.containers = all
Robot.containers = all

# Board does not require rendering
        
class Board():
    """Board that sets up and displays all spaces"""
    def __init__(self, maxCols, pos):
        self.spaces = [[], [], [], [], []]
        
        self.maxCols = maxCols
        
        if self.maxCols < 1:
            self.maxCols = 1 # cap minimum value to max rows in case of emergency
        
        self.middle = (len(self.spaces) // 2)
        self.playerPos = [0,self.middle]
        self.goal = [self.middle, self.maxCols - 1]
        
        # Thanks to Jack Malone for help with this loop
        for index, value in enumerate(self.spaces):
                    for y in range(self.maxCols):
                            value.append(Space(((64 * y) + pos[0], (64 * index) + pos[1]), 0))
                        
    def setStartEnd(self):
        self.spaces[self.playerPos[1]][self.playerPos[0]].setType(3)
        self.spaces[self.goal[0]][self.goal[1]].setType(4)
                        
    def update(self, robot: Robot, action: int):
        additionalReward = 0 # calculate reward for robot here
        
        if action >= 0:
            if action == 0: # right
                  if self.playerPos[0] < self.maxCols - 1:
                        self.playerPos[0] += 1
            elif action == 1: # left
                 if self.playerPos[0] > 0:
                        self.playerPos[0] -= 1
            elif action == 2: # up
                if self.playerPos[1] > 0:
                        self.playerPos[1] -= 1
            elif action == 3: # down
                if self.playerPos[1] < len(self.spaces) - 1:
                        self.playerPos[1] += 1
            
            # first check to see if the position has to be updated
            # as the robot may move onto a push space
            
            pushValue = self.spaces[self.playerPos[1]][self.playerPos[0]].jumpDistance
            
            if pushValue > 0:
                additionalReward = 2
            elif pushValue < 0:
                additionalReward = -2
            
            if pushValue != 0: # only do push calculations if the robot has to be pushed
                self.playerPos[0] += pushValue

                # now check to see if the player has jumped outside the bounds of the board
                # only generate reward for a successful jump
                if self.playerPos[0] > self.maxCols - 1:
                    self.playerPos[0] = self.maxCols - 1
                    additionalReward = 0
                elif self.playerPos[0] < 0:
                    self.playerPos[0] = 0
                    additionalReward = 0
                
            # now that the robot has moved, update it's position
            robot.setPosition(((self.playerPos[0] * 64) + 96, (self.playerPos[1] * 64) + 96))
            
            return additionalReward
            
    def initializeSpaces(self):
        self.spaces[self.middle][3].setType(1)
        self.spaces[0][4].setType(2)
        self.spaces[1][4].setType(2)
        self.spaces[2][4].setType(2)
        self.spaces[3][4].setType(2)
        self.spaces[4][4].setType(2)
        
        self.spaces[0][7].setType(1)
        self.spaces[0][8].setType(2)
        self.spaces[1][8].setType(2)
        self.spaces[2][8].setType(2)
        self.spaces[3][8].setType(2)
        self.spaces[4][8].setType(2)
        
        self.spaces[3][11].setType(1)
        self.spaces[0][12].setType(2)
        self.spaces[1][12].setType(2)
        self.spaces[2][12].setType(2)
        self.spaces[3][12].setType(2)
        self.spaces[4][12].setType(2)
        
    def winCheck(self):
        if(self.playerPos[1] == self.goal[0] 
           and self.playerPos[0] == self.goal[1]):
            return True
        return False

pg.quit()



In [5]:
class PathEnv(Env):
    win = pg.display.set_mode((1600,800))
    pg.display.set_caption("Custom Environment - Guess Path RL")
    bg = load_image('assets/sprites/background.png')
    board = Board(16, (100,100))
    board.setStartEnd()
    board.initializeSpaces()
    robo = Robot(((board.playerPos[0] * 64) + 96, 
              (board.playerPos[1] * 64) + 96))
    action = -1
    clock = pg.time.Clock()
    
    def __init__(self):
        # Actions: 0 - Left, 1 - Up, 2 - Right, 3 - Down
        self.action_space = Discrete(4)
        
        # Create observation space
        self.observation_space = Discrete(80)
        
        # Determine starting state upon initialization
        self.state = self.determineState()
        # 
        self.alloted_length = 600

    def step(self, action):
        
        self.clock.tick(30)
        
        # Update Environment elements
        self.robo.update(action)
        
        additionalReward = self.board.update(self.robo, action)
        
        # With elements updated, determine state, reward etc
        self.state = self.determineState()
        
        # Reduce alloted length to use environment by time moved via pygame clock
        self.alloted_length -= self.clock.get_time()

        # TODO: Calculate reward
        reward = -1 + additionalReward
        
        # Check if environment is done
        if self.alloted_length <= 0: 
            done = True
        else:
            done = False
            
        if self.board.winCheck():
            reward += 30 # when finding the goal, end the environment and give a good reward
            done = True
        
        # Needed during the return
        info = {}
  
        # Return step information
        return self.state, reward, done, info

    def render(self):
        win.blit(self.bg, (0,0))
        for r in self.board.spaces:
                for c in r: # position all spaces to the correct top left position
                    win.blit(c.image, (c.rect.x, c.rect.y))

        win.blit(self.robo.image, (self.robo.rect.x, self.robo.rect.y))
        pg.display.update()
    
    def reset(self):
        # Reset game elements
        
        self.board.playerPos = [0,self.board.middle] # reset player pos on board
        
        self.robo.setPosition(((self.board.playerPos[0] * 64) + 96, 
              (self.board.playerPos[1] * 64) + 96)) # place robo onto the correct spot
        
        # Reset starting state
        self.state = self.determineState()
        # Reset alloted time to interact
        self.alloted_length = 600
        return self.state
    
    def calculateReward(self):
        # calculate reward based for AI
        pass
    
    def determineState(self):
        # the current state is where the player currently is on the board
        
        # since we calculate the current position by multiplying the two
        # x and y positions on our array together,
        # we want to make sure we do not multiply by 0,
        # as this will always set the state to 0.
        
        if self.board.playerPos[0] == 0:    # if the x position is 0,
            state = self.board.playerPos[1] # set the state to be just the y position
        elif self.board.playerPos[1] == 0:    # otherwise if the y position is 0
            state = self.board.playerPos[0] # then set the state to be just the x position
        else:# if both positions are not 0, then the state is both positions multiplied together   
            state = self.board.playerPos[0] * self.board.playerPos[1]
        
        return state
    
pg.quit()

In [6]:
env = PathEnv()

In [7]:
env.observation_space.sample()

70

In [None]:
# Run this cell to test that the Environment works properly
# This will randomly pick from a Discrete action step, no model is used here.

pg.init()
win = pg.display.set_mode((1600,800))
pg.display.set_caption("Custom Environment - Guess Path RL")

episodes = 10
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        pg.event.get()
        env.render()
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        score += reward         
    print('Episode:{} Score:{}'.format(episode, score))
env.close()
pg.quit()

In [None]:
pg.quit() # optional pygame quit in case of error

# Train Model

In [8]:
log_path = os.path.join('training', 'logs')
training_log_path = os.path.join(log_path, 'GuessPathPPO_2')

In [None]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

In [34]:
model.learn(total_timesteps=20000)

Logging to training\logs\PPO_6
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 18.9     |
|    ep_rew_mean     | -14.2    |
| time/              |          |
|    fps             | 30       |
|    iterations      | 1        |
|    time_elapsed    | 67       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 19          |
|    ep_rew_mean          | -13.8       |
| time/                   |             |
|    fps                  | 30          |
|    iterations           | 2           |
|    time_elapsed         | 136         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.003928896 |
|    clip_fraction        | 0.0482      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.352      |
|    explained_variance   | 0.373       |

<stable_baselines3.ppo.ppo.PPO at 0x1f33f27e7f0>

# Save Model

In [35]:
guesspath_path = os.path.join('training', 'saved_models', 'GuessModel_PPO_6')

In [36]:
model.save(guesspath_path)

In [37]:
del(model) # uncomment to delete the model after saving it

In [38]:
model = PPO.load(guesspath_path, env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


# Watch Model
### With a loaded model, it will be used with the environment.

In [31]:
env = PathEnv() # set environment if not set already

In [40]:
pg.display.init()
win = pg.display.set_mode((1600,800))
pg.display.set_caption("Custom Environment - Guess Path RL")

episodes = 20
for episode in range(1, episodes+1):
    obs = env.reset()
    done = False
    score = 0
    
    while not done:
        pg.event.get()
        env.render()
        action, _ = model.predict(obs) # predict returns two, but we only require action
        obs, reward, done, info = env.step(action)
        score += reward         
    print('Episode:{} Score:{}'.format(episode, score))
env.close()
pg.display.quit()

Episode:1 Score:-1
Episode:2 Score:21
Episode:3 Score:21
Episode:4 Score:21
Episode:5 Score:21
Episode:6 Score:20
Episode:7 Score:21
Episode:8 Score:21
Episode:9 Score:-8
Episode:10 Score:21
Episode:11 Score:21
Episode:12 Score:21
Episode:13 Score:21
Episode:14 Score:21
Episode:15 Score:21
Episode:16 Score:15
Episode:17 Score:21
Episode:18 Score:21
Episode:19 Score:20
Episode:20 Score:21
