In [None]:
# Pygame installation
!pip install pygame

In [1]:
import pygame as pg
from pygame import image as img

pygame 2.1.0 (SDL 2.0.16, Python 3.9.7)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Gym Imports
import gym
from gym import Env
from gym.spaces import Discrete, Dict, MultiDiscrete # different types of spaces

# Helpers
import numpy as np
import random
import os

# Stable baselines
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Time module to make program halt for presentation purposes
import time

In [3]:
# Function definitions for use
def load_image(file):
    """Load an image file and return it as a surface with per-pixel alpha.

    Args:
        file: Path of the image to load.

    Returns:
        The loaded surface after ``convert_alpha()``, so transparency from
        .png files is kept.

    Raises:
        SystemExit: If the image cannot be loaded, either because pygame
            rejects it or because the file does not exist.
    """
    try:
        surface = pg.image.load(file)
    except (pg.error, FileNotFoundError) as exc:
        # A missing path surfaces as FileNotFoundError, which is not a
        # pg.error, so both are handled to keep a single failure mode.
        raise SystemExit('Could not load image "%s" %s' % (file, exc)) from exc
    return surface.convert_alpha() # convert_alpha allows for transparency from .pngs

In [65]:
# Start pygame and open a window before the sprite classes below are defined.
# NOTE(review): presumably needed because load_image() finishes with
# convert_alpha(), which is called while the class bodies execute - confirm.
pg.init()
win = pg.display.set_mode((0,0))
pg.display.set_caption("Custom Environment - Guess Path")

class Robot(pg.sprite.Sprite):
    """Sprite that shows where the agent is and which way it last moved."""
    # One facing sprite per action id, in order: right, left, up, down.
    images = [load_image(path) for path in ("assets/sprites/robo_R.png",
                                            "assets/sprites/robo_L.png",
                                            "assets/sprites/robo_U.png",
                                            "assets/sprites/robo_D.png")]

    def __init__(self, pos):
        """Register the sprite with its groups and centre it on ``pos``."""
        super().__init__(self.containers)
        self.image = self.images[0]
        self.setPosition(pos)

    def update(self, action: int):
        """Switch to the facing sprite for ``action``; negative values are ignored."""
        if action < 0:
            return
        self.image = self.images[action]

    def setPosition(self, newPos):
        """Rebuild ``self.rect`` so the current image is centred on ``newPos``."""
        bounds = self.image.get_rect()
        # rect.x / rect.y are the top-left corner, so shift by half the size
        bounds.x = newPos[0] - (bounds.width / 2.0)
        bounds.y = newPos[1] - (bounds.height / 2.0)
        self.rect = bounds

class Space(pg.sprite.Sprite):
    """Sprite for one board tile.

    Type codes index ``images``: 0 normal, 1 plus, 2 minus, 3 start, 4 goal.
    ``jumpDistance`` is the number of columns a robot landing on the tile is
    pushed (positive = forward, negative = backward, 0 = no push).
    """
    images = [load_image("assets/sprites/normal.png"),
              load_image("assets/sprites/plus.png"),
              load_image("assets/sprites/minus.png"),
              load_image("assets/sprites/start.png"),
              load_image("assets/sprites/goal.png")]
    spaceType = 0

    def __init__(self, pos, jumpDistance):
        """Create a tile centred on ``pos`` whose look follows the sign of ``jumpDistance``."""
        pg.sprite.Sprite.__init__(self, self.containers)

        self.jumpDistance = jumpDistance

        if jumpDistance == 0:    # neutral movement - does not push Robot
            self.spaceType = 0
        elif jumpDistance > 0:   # positive movement - pushes robot forward
            self.spaceType = 1
        else:                    # negative movement - pushes robot backward
            self.spaceType = 2
        self.image = self.images[self.spaceType]

        self.rect = self.image.get_rect()
        self.rect.x = pos[0] - (self.rect.width / 2.0)
        self.rect.y = pos[1] - (self.rect.height / 2.0)

    def setType(self, spaceType):
        """Change the tile to ``spaceType`` and keep image, type and push in sync.

        Plus tiles push by +2 and minus tiles by -2. Every other type clears
        the push, so a tile that used to be plus/minus does not keep moving
        the robot after it has been redrawn as normal, start or goal.
        """
        self.image = self.images[spaceType]

        if spaceType == 1:
            self.jumpDistance = 2
        elif spaceType == 2:
            self.jumpDistance = -2
        else:
            self.jumpDistance = 0

        # the start tile is drawn with its own image but reported as a normal tile
        self.spaceType = 0 if spaceType == 3 else spaceType

    def getType(self):
        """Return the tile's current type code (the start tile reports 0)."""
        return self.spaceType

# Initialize Game Groups
# Named all_sprites rather than `all` so the builtin all() stays usable in
# every later cell of the kernel session.
all_sprites = pg.sprite.RenderUpdates()

# Every Space / Robot instance adds itself to this group on construction.
Space.containers = all_sprites
Robot.containers = all_sprites

# Board does not require rendering
        
class Board():
    """Grid of Space tiles plus the grid positions of the player and the goal.

    Positions are stored as ``[row, col]`` lists. ``spaces[row][col]`` is the
    tile at that position. The board always has ``ROWS`` rows and ``maxCols``
    columns (at least 1).
    """
    ROWS = 5           # number of tile rows
    TILE_SIZE = 64     # pixel distance between neighbouring tiles
    ROBOT_OFFSET = 96  # pixel offset added when converting a grid index for the robot

    def __init__(self, maxCols, pos):
        """Build the grid of neutral tiles.

        Args:
            maxCols: Number of columns; values below 1 are raised to 1.
            pos: (x, y) pixel position of the centre of the top-left tile.
        """
        self.maxCols = max(maxCols, 1)

        # Thanks to Jack Malone for help with this loop
        self.spaces = [
            [Space(((self.TILE_SIZE * col) + pos[0], (self.TILE_SIZE * row) + pos[1]), 0)
             for col in range(self.maxCols)]
            for row in range(self.ROWS)
        ]

        self.middle = (len(self.spaces) // 2)
        self.playerPos = [self.middle, 0]
        self.goal = [self.middle, self.maxCols - 1]

    def setStartEnd(self):
        """Mark the player's current tile as the start and the goal tile as the goal."""
        self.spaces[self.playerPos[0]][self.playerPos[1]].setType(3)
        self.spaces[self.goal[0]][self.goal[1]].setType(4)

    def update(self, robot: "Robot", action: int):
        """Move the player one step, apply any tile push, and reposition the robot.

        Args:
            robot: Object whose ``setPosition`` is called with the new pixel pair.
            action: 0 right, 1 left, 2 up, 3 down. A negative value means
                "no action": nothing moves and 0 is returned.

        Returns:
            Extra reward for the move: +2 / -2 when a push tile moved the
            player and the push stayed on the board, otherwise 0.
        """
        if action < 0:
            return 0

        row, col = self.playerPos
        if action == 0:      # right
            if col < self.maxCols - 1:
                col += 1
        elif action == 1:    # left
            if col > 0:
                col -= 1
        elif action == 2:    # up
            if row > 0:
                row -= 1
        elif action == 3:    # down
            if row < len(self.spaces) - 1:
                row += 1

        # the tile the player now stands on may push it along its row
        additionalReward = 0
        pushValue = self.spaces[row][col].jumpDistance
        if pushValue != 0:
            target = col + pushValue
            col = max(0, min(target, self.maxCols - 1))
            # only a push that was not cut short by the board edge is rewarded
            if col == target:
                additionalReward = 2 if pushValue > 0 else -2

        # mutate in place so existing references to playerPos stay valid
        self.playerPos[:] = [row, col]

        # The pixel pair is passed row-first; PathEnv.render swaps it back
        # when blitting, so the two must be changed together.
        robot.setPosition(((row * self.TILE_SIZE) + self.ROBOT_OFFSET,
                           (col * self.TILE_SIZE) + self.ROBOT_OFFSET))

        return additionalReward

    def _setTypeIfOnBoard(self, row, col, spaceType):
        """Set the type of ``spaces[row][col]``, ignoring positions off the board."""
        if 0 <= row < len(self.spaces) and 0 <= col < len(self.spaces[row]):
            self.spaces[row][col].setType(spaceType)

    def initializeSpaces(self):
        """Apply the preset layout: three plus tiles, each followed by a full column of minus tiles.

        Positions that do not exist on a narrower board are skipped.
        """
        plusTiles = ((0, 3), (4, 7), (2, 11))
        minusColumns = (4, 8, 12)

        for row, col in plusTiles:
            self._setTypeIfOnBoard(row, col, 1)
        for col in minusColumns:
            for row in range(len(self.spaces)):
                self._setTypeIfOnBoard(row, col, 2)

    def randomizeSpaces(self):
        """Give every tile a random type (0-2), then re-place the start and goal."""
        # For non-deterministic elements, every space is randomized first and
        # the player and goal spaces are placed afterwards so both always exist.
        for rowSpaces in self.spaces:
            for space in rowSpaces:
                space.setType(random.randint(0, 2))

        self.playerPos = [self.middle, 0]
        self.goal = [self.middle, self.maxCols - 1]

        self.setStartEnd()

    def winCheck(self):
        """Return True when the player stands on the goal tile."""
        return (self.playerPos[0] == self.goal[0]
                and self.playerPos[1] == self.goal[1])

# Shut pygame down again now that the classes above are defined; the run cells
# further down start pygame themselves (pg.init() / pg.display.init()) before rendering.
pg.quit()

In [66]:
class PathEnv(Env):
    """Gym environment in which a robot crosses a tile board to reach the goal.

    Actions (``Discrete(4)``): 0 right, 1 left, 2 up, 3 down.

    Observation (``MultiDiscrete``): ``[cell, up, left, down, right]`` where
    ``cell`` is ``row * maxCols + col`` and each neighbour is encoded as
    0 off board, 1 neutral, 2 positive push, 3 negative push, 4 goal.

    Reward: -1 per step, plus the value returned by ``Board.update``, plus 70
    when the goal is reached. An episode ends at the goal or after 60 steps.
    """
    # NOTE: everything below runs once, when this class statement executes,
    # and is shared by every PathEnv instance (including the board).
    win = pg.display.set_mode((1600,800))
    pg.display.set_caption("Custom Environment - Guess Path RL")
    bg = load_image('assets/sprites/background.png')
    board = Board(16, (100,100))
    shouldRandomize = False

    EPISODE_LENGTH = 60   # steps allowed per episode
    GOAL_REWARD = 70      # bonus for reaching the goal tile
    STEP_PENALTY = -1     # cost of every step

    # Space.getType() code -> observation code (0 is reserved for "off board")
    _NEIGHBOUR_CODES = {0: 1, 1: 2, 2: 3, 3: 1, 4: 4}

    def __init__(self, randomizeBoard, waitTime):
        """Set up spaces, board layout and the robot sprite.

        Args:
            randomizeBoard: True to randomize the board now and on every
                reset, False to use the preset layout.
            waitTime: Seconds ``render`` sleeps after drawing a frame.
        """
        # Actions: 0 - Right, 1 - Left, 2 - Up, 3 - Down
        self.action_space = Discrete(4)
        self.timeToWait = waitTime

        self.shouldRandomize = randomizeBoard

        # Observation: cell index, then the tile code directly up, left,
        # down and right of the robot (see the class docstring for the codes).
        cellCount = len(self.board.spaces) * self.board.maxCols
        self.observation_space = MultiDiscrete([cellCount, 5, 5, 5, 5])

        self.alloted_length = self.EPISODE_LENGTH

        # the board is shared between instances, so start from a known position
        self.board.playerPos = [self.board.middle, 0]
        if randomizeBoard:
            self.board.randomizeSpaces()
        else:
            self.board.initializeSpaces()

        self.board.setStartEnd()

        self.robo = Robot(self._robotPixelPos())

        # computed last so the initial state reflects the finished board
        self.state = self.determineState()

    def _robotPixelPos(self):
        """Return the pixel pair for the robot, row-first as Board.update passes it."""
        row, col = self.board.playerPos
        return ((row * 64) + 96, (col * 64) + 96)

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``."""
        action = int(action)  # accept numpy scalars / 0-d arrays from model.predict

        # Update Environment elements
        self.robo.update(action)
        additionalReward = self.board.update(self.robo, action)
        if additionalReward is None:  # board made no move for this action
            additionalReward = 0

        # With elements updated, determine state, reward etc
        self.state = self.determineState()

        # one step of the episode budget is used up
        self.alloted_length -= 1

        reward = self.STEP_PENALTY + additionalReward
        done = self.alloted_length <= 0

        if self.board.winCheck():
            # when finding the goal, end the episode and give a good reward
            reward += self.GOAL_REWARD
            done = True

        info = {}
        return self.state, reward, done, info

    def render(self, mode="human"):
        """Draw background, tiles and robot on the current display, then sleep.

        Raises:
            RuntimeError: If no display mode has been set yet.
        """
        surface = pg.display.get_surface()
        if surface is None:
            raise RuntimeError("call pg.display.set_mode() before PathEnv.render()")

        surface.blit(self.bg, (0,0))
        for rowSpaces in self.board.spaces:
            for space in rowSpaces: # position all spaces to the correct top left position
                surface.blit(space.image, (space.rect.x, space.rect.y))

        # the robot rect is stored row-first (see _robotPixelPos), so swap back here
        surface.blit(self.robo.image, (self.robo.rect.y, self.robo.rect.x))
        pg.display.update()
        time.sleep(self.timeToWait)

    def reset(self):
        """Restore the board and robot to the start of an episode and return the state."""
        self.board.playerPos = [self.board.middle, 0] # reset player pos on board

        if self.shouldRandomize:
            self.board.randomizeSpaces()
        else:
            self.board.initializeSpaces() # re-apply the preset layout

        self.board.setStartEnd()

        self.robo.setPosition(self._robotPixelPos()) # place robo onto the correct spot

        self.state = self.determineState()
        self.alloted_length = self.EPISODE_LENGTH
        return self.state

    def calculateReward(self):
        """Placeholder; the reward is currently computed inside ``step``."""
        pass

    def _neighbourCode(self, row, col):
        """Return the observation code of the tile at (row, col), 0 if off the board."""
        if not (0 <= row < len(self.board.spaces) and 0 <= col < self.board.maxCols):
            return 0
        return self._NEIGHBOUR_CODES.get(self.board.spaces[row][col].getType(), 1)

    def determineState(self):
        """Return the observation ``[cell, up, left, down, right]`` as an int64 array."""
        row, col = self.board.playerPos

        # row-major index: unique per cell and always below rows * maxCols
        currentSpace = row * self.board.maxCols + col

        return np.array([currentSpace,
                         self._neighbourCode(row - 1, col),   # up
                         self._neighbourCode(row, col - 1),   # left
                         self._neighbourCode(row + 1, col),   # down
                         self._neighbourCode(row, col + 1)],  # right
                        dtype=np.int64)
    
# Close the window that the PathEnv class body opened when it was defined;
# the run cells below open their own window before rendering.
pg.quit()

# Test Environment

In [67]:
# Preset board, 0.25 s pause after each rendered frame.
env = PathEnv(randomizeBoard=False, waitTime=0.25)

In [68]:
# Run this cell to test that the Environment works properly
# This will randomly pick from a Discrete action step, no model is used here.

pg.init()
win = pg.display.set_mode((1124,460))
pg.display.set_caption("Custom Environment - Guess Path RL")

episodes = 3
try:
    for episode in range(1, episodes+1):
        obs = env.reset()
        done = False
        score = 0

        while not done:
            pg.event.get()  # drain the event queue so the window stays responsive
            env.render()
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # always release the window, even if an episode raised
    env.close()
    pg.quit()

Episode:1 Score:-62
Episode:2 Score:-62
Episode:3 Score:-64


In [None]:
pg.quit() # manual cleanup: only needed if a cell above raised before reaching its own pg.quit()

# Train Model

In [7]:
# Directory handed to PPO as its tensorboard log location.
log_path = os.path.join('training', 'logs')
# Run-specific sub-directory; not referenced by any other visible cell.
training_log_path = os.path.join(log_path, 'GuessPathPPO_1')

In [8]:
# Create a PPO agent with the 'MlpPolicy' policy on the custom environment;
# verbose=1 prints progress and tensorboard_log receives the training logs.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [22]:
# Preset board with no render delay.
# NOTE(review): this only rebinds the name `env`; a model built earlier keeps
# the environment object it was constructed with - confirm the intended cell order.
env = PathEnv(randomizeBoard=False, waitTime=0.0)

In [72]:
# Train the agent for 240,000 environment steps.
model.learn(total_timesteps=240000)

Logging to training\logs\PPO_5
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 15       |
|    ep_rew_mean     | 61       |
| time/              |          |
|    fps             | 1954     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 14.2       |
|    ep_rew_mean          | 61         |
| time/                   |            |
|    fps                  | 1386       |
|    iterations           | 2          |
|    time_elapsed         | 2          |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.04272328 |
|    clip_fraction        | 0.154      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.112     |
|    explained_variance   | 0.724      |
|    learning_

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 13.9      |
|    ep_rew_mean          | 61.4      |
| time/                   |           |
|    fps                  | 1102      |
|    iterations           | 11        |
|    time_elapsed         | 20        |
|    total_timesteps      | 22528     |
| train/                  |           |
|    approx_kl            | 0.0465996 |
|    clip_fraction        | 0.0373    |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.128    |
|    explained_variance   | 0.975     |
|    learning_rate        | 0.0003    |
|    loss                 | 0.0959    |
|    n_updates            | 2480      |
|    policy_gradient_loss | -0.0138   |
|    value_loss           | 0.379     |
---------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 13.1        |
|    ep_rew_mean          | 62.9  

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 13           |
|    ep_rew_mean          | 63           |
| time/                   |              |
|    fps                  | 1076         |
|    iterations           | 21           |
|    time_elapsed         | 39           |
|    total_timesteps      | 43008        |
| train/                  |              |
|    approx_kl            | 0.0006628508 |
|    clip_fraction        | 0.0083       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.0795      |
|    explained_variance   | 0.993        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.041        |
|    n_updates            | 2580         |
|    policy_gradient_loss | -0.000174    |
|    value_loss           | 0.105        |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 13            |
|    ep_rew_mean          | 63            |
| time/                   |               |
|    fps                  | 1066          |
|    iterations           | 31            |
|    time_elapsed         | 59            |
|    total_timesteps      | 63488         |
| train/                  |               |
|    approx_kl            | 0.00019437767 |
|    clip_fraction        | 0.00815       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.0825       |
|    explained_variance   | 0.994         |
|    learning_rate        | 0.0003        |
|    loss                 | 0.0713        |
|    n_updates            | 2680          |
|    policy_gradient_loss | 6.22e-05      |
|    value_loss           | 0.104         |
-------------------------------------------
------------------------------------------
| rollout/                |      

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 13           |
|    ep_rew_mean          | 63           |
| time/                   |              |
|    fps                  | 1061         |
|    iterations           | 41           |
|    time_elapsed         | 79           |
|    total_timesteps      | 83968        |
| train/                  |              |
|    approx_kl            | 0.0016728519 |
|    clip_fraction        | 0.00347      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.022       |
|    explained_variance   | 0.993        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.074        |
|    n_updates            | 2780         |
|    policy_gradient_loss | -0.000361    |
|    value_loss           | 0.109        |
------------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_m

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 13            |
|    ep_rew_mean          | 63            |
| time/                   |               |
|    fps                  | 1056          |
|    iterations           | 51            |
|    time_elapsed         | 98            |
|    total_timesteps      | 104448        |
| train/                  |               |
|    approx_kl            | 6.7752844e-05 |
|    clip_fraction        | 0.000439      |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.00545      |
|    explained_variance   | 0.994         |
|    learning_rate        | 0.0003        |
|    loss                 | 0.0169        |
|    n_updates            | 2880          |
|    policy_gradient_loss | -6.5e-06      |
|    value_loss           | 0.102         |
-------------------------------------------
-------------------------------------------
| rollout/                |     

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 13            |
|    ep_rew_mean          | 63            |
| time/                   |               |
|    fps                  | 1053          |
|    iterations           | 61            |
|    time_elapsed         | 118           |
|    total_timesteps      | 124928        |
| train/                  |               |
|    approx_kl            | 0.00040077372 |
|    clip_fraction        | 0.00439       |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.0365       |
|    explained_variance   | 0.994         |
|    learning_rate        | 0.0003        |
|    loss                 | 0.0355        |
|    n_updates            | 2980          |
|    policy_gradient_loss | 5.61e-06      |
|    value_loss           | 0.101         |
-------------------------------------------
-------------------------------------------
| rollout/                |     

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 13          |
|    ep_rew_mean          | 63          |
| time/                   |             |
|    fps                  | 1052        |
|    iterations           | 70          |
|    time_elapsed         | 136         |
|    total_timesteps      | 143360      |
| train/                  |             |
|    approx_kl            | 0.004017869 |
|    clip_fraction        | 0.00264     |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.0132     |
|    explained_variance   | 0.982       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.0772      |
|    n_updates            | 3070        |
|    policy_gradient_loss | -0.00191    |
|    value_loss           | 0.168       |
-----------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 13

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 13        |
|    ep_rew_mean          | 63        |
| time/                   |           |
|    fps                  | 1052      |
|    iterations           | 79        |
|    time_elapsed         | 153       |
|    total_timesteps      | 161792    |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.00283  |
|    explained_variance   | 0.994     |
|    learning_rate        | 0.0003    |
|    loss                 | 0.0351    |
|    n_updates            | 3160      |
|    policy_gradient_loss | -2.01e-07 |
|    value_loss           | 0.101     |
---------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 13           |
|    ep_rew_mean          | 63 

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 13            |
|    ep_rew_mean          | 63            |
| time/                   |               |
|    fps                  | 1051          |
|    iterations           | 89            |
|    time_elapsed         | 173           |
|    total_timesteps      | 182272        |
| train/                  |               |
|    approx_kl            | 0.00096451375 |
|    clip_fraction        | 0.000439      |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.00303      |
|    explained_variance   | 0.994         |
|    learning_rate        | 0.0003        |
|    loss                 | 0.0508        |
|    n_updates            | 3260          |
|    policy_gradient_loss | 8.74e-05      |
|    value_loss           | 0.101         |
-------------------------------------------
------------------------------------------
| rollout/                |      

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 13            |
|    ep_rew_mean          | 63            |
| time/                   |               |
|    fps                  | 1052          |
|    iterations           | 99            |
|    time_elapsed         | 192           |
|    total_timesteps      | 202752        |
| train/                  |               |
|    approx_kl            | 0.00011453236 |
|    clip_fraction        | 0.000439      |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.00341      |
|    explained_variance   | 0.994         |
|    learning_rate        | 0.0003        |
|    loss                 | 0.0357        |
|    n_updates            | 3360          |
|    policy_gradient_loss | -1.8e-06      |
|    value_loss           | 0.102         |
-------------------------------------------
-------------------------------------------
| rollout/                |     

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 13           |
|    ep_rew_mean          | 63           |
| time/                   |              |
|    fps                  | 1054         |
|    iterations           | 108          |
|    time_elapsed         | 209          |
|    total_timesteps      | 221184       |
| train/                  |              |
|    approx_kl            | 0.0007321051 |
|    clip_fraction        | 0.00171      |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.0075      |
|    explained_variance   | 0.994        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.046        |
|    n_updates            | 3450         |
|    policy_gradient_loss | 2.54e-05     |
|    value_loss           | 0.102        |
------------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_l

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 13            |
|    ep_rew_mean          | 63            |
| time/                   |               |
|    fps                  | 1055          |
|    iterations           | 118           |
|    time_elapsed         | 228           |
|    total_timesteps      | 241664        |
| train/                  |               |
|    approx_kl            | 5.3667463e-08 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.00238      |
|    explained_variance   | 0.994         |
|    learning_rate        | 0.0003        |
|    loss                 | 0.055         |
|    n_updates            | 3550          |
|    policy_gradient_loss | -7.2e-07      |
|    value_loss           | 0.1           |
-------------------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x1bd94ad11f0>

# Save Model

In [73]:
# Location the trained model is saved to and later loaded from.
guesspath_path = os.path.join('training', 'saved_models', 'GuessModel_PPO_PRESET_5')

In [74]:
# Write the trained model to guesspath_path.
model.save(guesspath_path)

In [75]:
del(model) # drop the in-memory model so the next cell has to load the saved copy (skip this cell to keep it)

In [76]:
# Load the saved model from guesspath_path and attach it to the current env.
model = PPO.load(guesspath_path, env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


# Watch Model
### Run the loaded model in the environment and watch it play.

In [77]:
# set environment if not set already: preset board, 0.2 s pause per rendered frame
env = PathEnv(randomizeBoard=False, waitTime=0.2)

In [78]:
pg.display.init()
win = pg.display.set_mode((1124,460))
pg.display.set_caption("Custom Environment - Guess Path RL")

episodes = 15
try:
    for episode in range(1, episodes+1):
        obs = env.reset()
        done = False
        score = 0

        while not done:
            pg.event.get()  # drain the event queue so the window stays responsive
            env.render()
            action, _ = model.predict(obs) # predict returns two, but we only require action
            obs, reward, done, info = env.step(action)
            score += reward

        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # always release the window, even if an episode raised
    env.close()
    pg.display.quit()

Episode:1 Score:-60
Episode:2 Score:-60
Episode:3 Score:-60
Episode:4 Score:-60
Episode:5 Score:-60
Episode:6 Score:-60
Episode:7 Score:-60
Episode:8 Score:-60
Episode:9 Score:-60
Episode:10 Score:-60
Episode:11 Score:-60
Episode:12 Score:-60
Episode:13 Score:-60
Episode:14 Score:-60
Episode:15 Score:-60
