Import Dependancies 

In [1]:
from vizdoom import * #Import all of vizdoom
import numpy as np #Numpy for identity matrix
import time #To make the program sleep (wait), so we can actually see what's happening
from stable_baselines3.common import env_checker #Import the env_checker class from stable_baselines3 to check the environment
from stable_baselines3 import PPO #Import the PPO class for training
from stable_baselines3.common.evaluation import evaluate_policy #Import the evaluate_policy function to evaluate the model
import os #To save the model to the correct path
from vizdoom_with_ai_gym_env_test import VizDoomGym_Simple, Deadly_Corridor_VZG, TrainAndLogCallback #Import the environment class and TrainAndLogCallback 
from pathfinder import doomfinder, create_new_checkpoint_directory


In [None]:
CHECKPOINT_DIR = create_new_checkpoint_directory('best_model_PPO_test_basic') #Directory to save the model
LOG_DIR = './logs/log_PPO_test_basic' #Directory to save the logs

In [None]:
callback = TrainAndLogCallback(check_freq=20000, save_path=CHECKPOINT_DIR) #After every 20000 steps of training model, we save the model

In [None]:
env = VizDoomGym_Simple(config_path=doomfinder("basic.cfg"), render=False) #Create the environment
print(env.get_state().game_variables) #Print the game variables
env_checker.check_env(env) #Check the environment to see if its valid

In [None]:
#env.render(render_in_greyscale=True) #Render the environment in greyscale, crashes the whole thing now and IDK why, not particularly important to fix ATM

Use PPO algorithm for training

In [None]:
#Env already created in previous cell
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=0.0001, n_steps=2048) #Create the model

#CnnPolicy is a convolutional neural network policy, which is used for images
#env is the environment
#verbose is the verbosity level
#tensorboard_log is the directory to save the logs
#learning_rate is the learning rate of the model
#n_steps is the number of steps to train the model

In [None]:
#Train the model
model.learn(total_timesteps=100000, callback=callback) #Train the model for 100000 steps

Test models

In [None]:
#model = PPO.load('./Training/checkpoints/best_model_PPO_test_basic_2/best_model_50000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
model = PPO.load('./Training/checkpoints/best_model_PPO_test_basic_3/best_model_100000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)

In [None]:
env = VizDoomGym_Simple(config_path=doomfinder('basic.cfg'), render=True) #Reload env with rendering enabled

In [None]:
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=100, render=True) #Evaluate the model for 100 episodes

In [None]:
#Test for 5 episodes but sleep so that we can see whats going on

for episode in range(5):
    obs, _ = env.reset()  #Reset the environment and get only the observation
    done = False  #Set done to false
    total_reward = 0  #Set total reward to 0
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Get the action
        obs, reward, done, truncated, info = env.step(action)  #Take the action
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  # Print the episode and total reward
    time.sleep(2)  #Sleep for 2 seconds


Weirdly the model performs much worse when trained for 1M steps than when trained for 100k steps

In [None]:
model = PPO.load('./Training/checkpoints/best_model_PPO_test_basic_1/best_model_100000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)

In [None]:
#Test for 5 episodes but sleep so that we can see whats going on

for episode in range(5):
    obs, _ = env.reset()  #Reset the environment and get only the observation
    done = False  #Set done to false
    total_reward = 0  #Set total reward to 0
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Get the action
        obs, reward, done, truncated, info = env.step(action)  #Take the action
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  # Print the episode and total reward
    time.sleep(2)  #Sleep for 2 seconds

Trying a new level (Defend The Center)

In [None]:
env = VizDoomGym_Simple(config_path=doomfinder('defend_the_center.cfg'), render=True) #Reload env with new map
print(env.get_state().game_variables) #Print the game variables
env_checker.check_env(env) #Check the environment to see if its valid

In [None]:
#Try with old model (boooo its not good at this map)

model = PPO.load('./Training/checkpoints/best_model_PPO_test_basic_4/best_model_50000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)

for episode in range(5):
    obs, _ = env.reset()  #Reset the environment and get only the observation
    done = False  #Set done to false
    total_reward = 0  #Set total reward to 0
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Get the action
        obs, reward, done, truncated, info = env.step(action)  #Take the action
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  # Print the episode and total reward
    time.sleep(2)  #Sleep for 2 seconds

Train a new model

In [None]:
CHECKPOINT_DIR = create_new_checkpoint_directory('best_model_PPO_test_defend_the_center') #Directory to save the model
LOG_DIR = './logs/log_PPO_test_defend_the_center' #Directory to save the logs
callback = TrainAndLogCallback(check_freq=25000, save_path=CHECKPOINT_DIR) #After every 25000 steps of training model, we save the model

In [None]:
env = VizDoomGym_Simple(config_path=doomfinder('defend_the_center.cfg'), render=False) #Reload env with new map
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=0.0001, n_steps=8192) #Create the model with more n_steps, more n_steps for more complex things
model.learn(total_timesteps=300000, callback=callback) #Train the model for 300000 steps

Testing the model

In [8]:
model = PPO.load('./Training/checkpoints/best_model_PPO_test_defend_the_center_14/best_model_500000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
env = VizDoomGym_Simple(config_path=doomfinder('defend_the_center.cfg'), render=True) #Reload env with rendering enabled

In [None]:
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=10, render=True) #Evaluate the model for 10 episodes
print(mean_reward) #Print the mean reward

In [9]:
#Try with new model

for episode in range(5):
    obs, _ = env.reset()  #Reset the environment and get only the observation
    done = False  #Set done to false
    total_reward = 0  #Set total reward to 0
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Get the action
        obs, reward, done, truncated, info = env.step(action)  #Take the action
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  #Print the episode and total reward
    time.sleep(2)  #Sleep for 2 seconds

Episode: 0, Total Reward: 2.1399999999999615
Episode: 1, Total Reward: -15.180000000000007
Episode: 2, Total Reward: -5.320000000000005
Episode: 3, Total Reward: -4.660000000000046
Episode: 4, Total Reward: 8.39


Just to see if it works, lets try and do the previous model test but now let it move in all directions

In [11]:
from gymnasium import Env #Import OpenAI Gym's Env class
from gymnasium.spaces import Discrete, Box #Import OpenAI Gym's Discrete and Box spaces
import cv2 #OpenCV for image processing, used for modifying the DOOM environment to make it run faster 

class VizDoomGym_Simple_All_Dir(Env): #Copy of last environment class, but now we can move in all directions
    def __init__(self, config_path, render=False): #Constructor
        
        #Configs this is used for: basic.cfg, defend_the_center.cfg

        super(VizDoomGym_Simple_All_Dir, self).__init__() #Inherit from Env class

        #Args: 
            #config_path (str): The path to the configuration file
            #render (bool): Whether to render the environment or not, false by default

        #Setup game
        self.game = vizdoom.DoomGame() #Create a DoomGame object
        self.game.load_config(config_path) #Load the configuration file from file path, ex: doomfinder("basic.cfg")

        #Set window visibility
        if render == False:
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)

        self.game.init() #Start the game

        #Setup action and observation space
        self.observation_space = Box(low=0, high=255, shape=(100, 160, 1), dtype=np.uint8) #Observation space, 100x160x1 image
        self.action_space = Discrete(7) #Action space, 7 actions

        #Game variables
        self.ammo = self.game.get_state().game_variables[0]  #Get the ammo count, initialize to the current ammo
        self.health = 100 #Initialize health to 100 (assuming we start at full health)


    def step(self, action, limit = 1000): #Take a step in the environment 
        #Args:
            #action (int): The action to take
            #limit (int): Unimplemented "limit" for the episode, most likely will be a time limit
        #Returns:
            #observation (np.array): The screen buffer of the environment
            #reward (float): The reward for the action taken
            #terminated (bool) Whether the episode is finished or not (by reaching the goal)
            #truncated (bool): Whether the episode has reached some terminal state without reaching the goal (ie: running out of time)
            #info (dict): Additional information about the environment

        #Specify actions and take a step
        actions = np.identity(7) #Create an identity matrix with 7 rows (7 actions), TURN_LEFT, TURN_RIGHT, MOVE_FOWARD, MOVE_BACKWARDS, ATTACK, MOVE_LEFT, MOVE_RIGHT,  these are the actions we can take in the environment
        movement_reward = self.game.make_action(actions[action], 4) #Reward for taking a random action, second parameter is frame skip (skip 4 frames before taking the next action), the reason we do this is because it saves us time while being easy to see what is happening 
        reward = movement_reward #Initialize reward to movement reward
        truncated = False #Not implemented yet, so set to False. The idea is that if step passes some sort of limit, like a time limit, then the episode is truncated.
        info = {} #Initialize info to an empty dictionary
        Basic = False

        if self.game.get_state(): #If the game is not finished
            observation = self.game.get_state().screen_buffer #Get the screen buffer
            observation = self.greyscale(observation) #Convert the image to greyscale

            #Get game variables
            game_variables = self.game.get_state().game_variables
            if len(game_variables) == 2:
                ammo, health = game_variables
            else:
                ammo = game_variables[0]
                health = 100 #Assume health is 100 if it's not provided (effectively ignoring it)
                Basic = True
            
            #Calculate reward deltas
            if(Basic == False): #If its the basic config we just ignore all deltas entirely, I know this is janky but its just a testing enviorment so whatever
                ammo_delta = ammo - self.ammo #Current ammo - old ammo = ammo used
                ammo = self.ammo 
                health_delta = health - self.health  #Current health - old health = damage taken
                health = self.health
            
                #reward = movement_reward*2 + ammo_delta*0.0384615385 + health_delta*0.01 #Calculate the reward, the idea is the max score is 2, if we lose all heath our score is subtracted by 1, if we lose all ammo our score is subtracted by 1
                reward = movement_reward*2 + ammo_delta*0.01 + health_delta*0.00 #Calculate the reward, the idea is the max score is 2, if we lose all heath our score is subtracted by 1, if we lose all ammo our score is subtracted by 1

            info = {"ammo": ammo}
        else:
            observation = np.zeros(self.observation_space.shape) #Return a blank screen

        terminated = self.game.is_episode_finished() #Check if the episode is finished

        return observation, movement_reward, terminated, truncated, info

    def render(self, render_in_greyscale=False): #Render the environment for a frame
        #Args:
            #render_in_greyscale (bool): Whether to render the environment in greyscale or not
        
        if self.game.get_state() and render_in_greyscale:  #Only render if there's a valid game state
            observation = self.game.get_state().screen_buffer
            greyscale_obs = self.greyscale(observation)  #Convert to greyscale
            #Render using OpenCV to visualize
            cv2.imshow("VizDoom Environment", greyscale_obs.squeeze())  #Remove extra dimension and display
            cv2.waitKey(1)  #Wait 1ms between frames to allow for rendering
        elif self.game.get_state():  #Only render if there's a valid game state
            observation = self.game.get_state().screen_buffer
            #Render using OpenCV to visualize
            cv2.imshow("VizDoom Environment", observation.squeeze())  #Remove extra dimension and display
            cv2.waitKey(1)  #Wait 1ms between frames to allow for rendering
        else:
            print("No game state to render.")

            
    def reset(self, seed=None): #Reset the environment when we start a new game
        #Args:
            #seed (int): The seed for the random number generator
        #Returns:
            #(observation, info) (tuple)
                #observation (np.array): The screen buffer of the environment
                #info (dict): Additional information about the environment
            
        super().reset(seed=seed) #Implement seeding
        
        self.game.new_episode() #Start a new episode
        state = self.game.get_state().screen_buffer #Get the screen buffer
        observation = self.greyscale(state) #Convert the image to greyscale
        
        #Gather any additional environment-specific info (like ammo, etc.)
        if self.game.get_state():
            ammo = self.game.get_state().game_variables[0]  #Get the ammo count
            info = {"ammo": ammo}
        else:
            info = {} #No gamestate means no info can be gathered
        
        return (observation, info) #Tuple of observation and info

    def greyscale(self, observation=None): #Convert the enivornment to greyscale and resize it
        #Args:
            #observation (np.array): The image of the environment (the current game frame)
        #Returns:
            #grey_return (np.array): The resized greyscale image of the environment
        
        if observation is None and self.game.get_state(): #If no observation is passed
            observation = self.game.get_state().screen_buffer #Get the screen buffer 

        grey = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY) #Convert the image to greyscale
        resize = cv2.resize(grey, (160, 100), interpolation=cv2.INTER_CUBIC) #Resize the image to 160x100
        state = np.reshape(resize, (100, 160, 1)) #Reshape the image to 100x160x1
        
        return state
    
    def get_state(self): 
        #Returns:
            #state (np.array): The current state of the environment
        return self.game.get_state()

    def close(self): #Close the environment
        self.game.close()


In [None]:
CHECKPOINT_DIR = create_new_checkpoint_directory('best_model_PPO_test_defend_the_center') #Directory to save the model
LOG_DIR = './logs/log_PPO_test_defend_the_center' #Directory to save the logs
callback = TrainAndLogCallback(check_freq=25000, save_path=CHECKPOINT_DIR) #After every 25000 steps of training model, we save the model

In [None]:
env = VizDoomGym_Simple_All_Dir(config_path=doomfinder('defend_the_center_all_directions.cfg'), render=False) #Reload env with new map
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=0.0001, n_steps=8192) #Create the model with more n_steps, more n_steps for more complex things
model.learn(total_timesteps=1000000, callback=callback) #Train the model for 1000000 steps

In [40]:
model = PPO.load('./Training/checkpoints/best_model_PPO_test_defend_the_center_16/best_model_1000000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
env = VizDoomGym_Simple_All_Dir(config_path=doomfinder('defend_the_center_all_directions.cfg'), render=True) #Reload env with rendering enabled

In [41]:
#Try with new model

for episode in range(5):
    obs, _ = env.reset()  #Reset the environment and get only the observation
    done = False  #Set done to false
    total_reward = 0  #Set total reward to 0
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Get the action
        obs, reward, done, truncated, info = env.step(action)  #Take the action
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  #Print the episode and total reward
    time.sleep(2)  #Sleep for 2 seconds

Episode: 0, Total Reward: 26.0
Episode: 1, Total Reward: 8.0
Episode: 2, Total Reward: 21.0
Episode: 3, Total Reward: 9.0
Episode: 4, Total Reward: 23.0


New test, start with the model we trained in a less complicated env, run the tests again starting at that end point, with the reward shaping

In [None]:
CHECKPOINT_DIR = create_new_checkpoint_directory('best_model_PPO_test_defend_the_center') #Directory to save the model
LOG_DIR = './logs/log_PPO_test_defend_the_center' #Directory to save the logs

In [None]:
model = PPO.load('./Training/checkpoints/best_model_PPO_test_defend_the_center_2/best_model_100000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
env = (VizDoomGym_Simple(config_path=doomfinder('defend_the_center.cfg'), render=False)) #Reload env with new map 
model.set_env(env) #Set the environment for the model
callback = TrainAndLogCallback(check_freq=25000, save_path=CHECKPOINT_DIR) #After every 25000 steps of training model, we save the model
model.learn(total_timesteps=1000000, callback=callback) #Train the model for 1000000 steps

Same thing but with all dir

In [39]:
model = PPO.load('./Training/checkpoints/best_model_PPO_test_defend_the_center_16/best_model_1000000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
env = (VizDoomGym_Simple_All_Dir(config_path=doomfinder('defend_the_center_all_directions.cfg'), render=False)) #Reload env with new map 
model.set_env(env) #Set the environment for the model
callback = TrainAndLogCallback(check_freq=25000, save_path=CHECKPOINT_DIR) #After every 25000 steps of training model, we save the model
model.learn(total_timesteps=1000000, callback=callback) #Train the model for 1000000 steps

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


NameError: name 'CHECKPOINT_DIR' is not defined

Testing with more complicated config (needs custom env, and moving foward each config more complicated than this will probably also need their own env)

In [None]:
env = Deadly_Corridor_VZG(config_path=doomfinder('deadly_corridor_s1.cfg'), render=False) #Reload env with new map, easy version
env_checker.check_env(env) #Check the environment to see if its valid

In [None]:
CHECKPOINT_DIR = create_new_checkpoint_directory('best_model_PPO_test_deadly_corridor') #Directory to save the model
LOG_DIR = './logs/log_PPO_test_deadly_corridor' #Directory to save the logs
callback = TrainAndLogCallback(check_freq=25000, save_path=CHECKPOINT_DIR) #After every 25000 steps of training model, we save the model

In [None]:
model = PPO('CnnPolicy', env, verbose=1, tensorboard_log=LOG_DIR, learning_rate=0.00001, n_steps=8192, clip_range=.1, gamma=.95, gae_lambda=.9) #Create the model with more complicated hyperparameters because of the more complicated enviornment
#Changes: Learning rate 0.0001 -> 0.00001, n_steps 4096 -> 8192, added clip range, gamma andd gae_lambda
model.learn(total_timesteps=400000, callback=callback) #Train the model for 400000 steps

#Load the model that was created training on the easy difficulty

model = PPO.load('./Training/checkpoints/best_model_PPO_test_deadly_corridor_1/best_model_400000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
env = Deadly_Corridor_VZG(config_path=doomfinder('deadly_corridor_s2.cfg'), render=False) #Reload env with harder version of map
model.set_env(env) #Set the env to the model
model.learn(total_timesteps=50000, callback=callback) #Train the model for 50000 steps on this higher difficulty



In [None]:
#Repeat the process up to the hardest difficulty

model = PPO.load('./Training/checkpoints/best_model_PPO_test_deadly_corridor_1/best_model_450000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
env = Deadly_Corridor_VZG(config_path=doomfinder('deadly_corridor_s3.cfg'), render=False) #Reload env with harder version of map
model.set_env(env) #Set the env to the model
model.learn(total_timesteps=50000, callback=callback) #Train the model for 50000 steps on this higher difficulty


model = PPO.load('./Training/checkpoints/best_model_PPO_test_deadly_corridor_1/best_model_500000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
env = Deadly_Corridor_VZG(config_path=doomfinder('deadly_corridor_s4.cfg'), render=False) #Reload env with harder version of map
model.set_env(env) #Set the env to the model
model.learn(total_timesteps=50000, callback=callback) #Train the model for 50000 steps on this higher difficulty

model = PPO.load('./Training/checkpoints/best_model_PPO_test_deadly_corridor_1/best_model_550000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
env = Deadly_Corridor_VZG(config_path=doomfinder('deadly_corridor_s5.cfg'), render=False) #Reload env with harder version of map
model.set_env(env) #Set the env to the model
model.learn(total_timesteps=50000, callback=callback) #Train the model for 50000 steps on this higher difficulty

#This should allow the model some time to adjust to the higher difficulties, and should allow it to learn the map better

Test the model

In [37]:
model = PPO.load('./Training/checkpoints/best_model_PPO_test_deadly_corridor_1/best_model_625000.zip') #Load the model (hardcoded to load a specific model but adjust as needed)
env = Deadly_Corridor_VZG(config_path=doomfinder('deadly_corridor_s5.cfg'), render=True) #Reload env with rendering enabled

In [38]:
#Try with new model

for episode in range(5):
    obs, _ = env.reset()  #Reset the environment and get only the observation
    done = False  #Set done to false
    total_reward = 0  #Set total reward to 0
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Get the action
        obs, reward, done, truncated, info = env.step(action)  #Take the action
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  #Print the episode and total reward
    time.sleep(2)  #Sleep for 2 seconds

Episode: 0, Total Reward: -472.9422607421875
Episode: 1, Total Reward: -472.9422607421875
Episode: 2, Total Reward: -319.30694580078125
Episode: 3, Total Reward: -472.9422607421875
Episode: 4, Total Reward: -472.9422607421875
