Import Dependencies

In [1]:
from vizdoom import * #Import all of vizdoom
import numpy as np #Numpy for identity matrix
import time #To make the program sleep (wait), so we can actually see what's happening
from stable_baselines3.common import env_checker #Import the env_checker class from stable_baselines3 to check the environment
from stable_baselines3 import PPO #Import the PPO class for training
from stable_baselines3.common.evaluation import evaluate_policy #Import the evaluate_policy function to evaluate the model
import os #To save the model to the correct path
from vizdoom_with_ai_gym_env_test import VizDoomGym_Simple, Deadly_Corridor_VZG, TrainAndLogCallback #Import the environment class and TrainAndLogCallback 
from pathfinder import doomfinder, create_new_checkpoint_directory


In [2]:
#Where this run's model checkpoints are saved (the helper presumably creates a fresh directory per run - confirm in pathfinder)
CHECKPOINT_DIR = create_new_checkpoint_directory(
    'best_model_PPO_test_basic'
)
#Where the training logs are written (passed to PPO as tensorboard_log)
LOG_DIR = './logs/log_PPO_test_basic'

In [3]:
#Checkpointing callback: with check_freq=20000 the model is saved into CHECKPOINT_DIR after every 20000 training steps
callback = TrainAndLogCallback(
    save_path=CHECKPOINT_DIR,
    check_freq=20000,
)

In [None]:
#Create the basic-scenario environment without rendering (used for training)
env = VizDoomGym_Simple(
    config_path=doomfinder("basic.cfg"),
    render=False,
)
#Print the game variables of the current state
print(env.get_state().game_variables)
#Run stable_baselines3's environment checker to see if the env is valid
env_checker.check_env(env)

In [18]:
#FIXME: rendering the environment in greyscale currently crashes the whole thing (cause unknown); left disabled because it is not particularly important to fix at the moment
#env.render(render_in_greyscale=True)

Use PPO algorithm for training

In [None]:
#Build the PPO agent on the env that was created in an earlier cell.
#  'CnnPolicy'     - convolutional neural network policy, which is used for image observations
#  env             - the environment the agent interacts with
#  verbose         - the verbosity level
#  tensorboard_log - the directory to save the logs
#  learning_rate   - the learning rate of the model
#  n_steps         - the number of environment steps collected per rollout before each policy update
#                    (the total training length is set separately via total_timesteps in model.learn)
model = PPO(
    'CnnPolicy',
    env,
    verbose=1,
    tensorboard_log=LOG_DIR,
    learning_rate=0.0001,
    n_steps=2048,
)

In [None]:
#Run training for 100000 timesteps; `callback` is invoked by the learner along the way
model.learn(
    total_timesteps=100000,
    callback=callback,
)

Test models

In [3]:
#Load a saved checkpoint (the path is hardcoded to one specific run and step - adjust as needed)
model = PPO.load(
    './Training/checkpoints/best_model_PPO_test_basic_3/best_model_50000.zip'
)

In [4]:
#Recreate the basic-scenario env, this time with rendering enabled so the game can be watched
env = VizDoomGym_Simple(
    config_path=doomfinder('basic.cfg'),
    render=True,
)

In [None]:
#Evaluate the loaded model over 100 episodes; only the mean reward is kept, the second return value is discarded
mean_reward, _ = evaluate_policy(
    model,
    env,
    n_eval_episodes=100,
    render=True,
)

In [None]:
#Watch the agent play 5 episodes, sleeping between steps so that we can see what's going on

for episode in range(5):
    obs, _ = env.reset()  #reset() returns a pair; only the observation is needed
    done = False  #Becomes True once the episode has ended
    total_reward = 0  #Sum of the rewards collected in this episode
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Ask the model for an action (second return value unused)
        obs, reward, terminated, truncated, info = env.step(action)  #Take the action
        #An episode is over when it is terminated OR truncated; checking only the
        #first flag would spin forever if the env ever truncates without terminating
        done = terminated or truncated
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds so playback is watchable
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  #Print the episode and total reward
    time.sleep(2)  #Pause for 2 seconds between episodes


Weirdly the model performs much worse when trained for 1M steps than when trained for 100k steps

In [None]:
#Load a different saved checkpoint (the path is hardcoded to one specific run and step - adjust as needed)
model = PPO.load(
    './Training/checkpoints/best_model_PPO_test_basic_2/best_model_100000.zip'
)

In [None]:
#Watch the agent play 5 episodes, sleeping between steps so that we can see what's going on

for episode in range(5):
    obs, _ = env.reset()  #reset() returns a pair; only the observation is needed
    done = False  #Becomes True once the episode has ended
    total_reward = 0  #Sum of the rewards collected in this episode
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Ask the model for an action (second return value unused)
        obs, reward, terminated, truncated, info = env.step(action)  #Take the action
        #An episode is over when it is terminated OR truncated; checking only the
        #first flag would spin forever if the env ever truncates without terminating
        done = terminated or truncated
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds so playback is watchable
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  #Print the episode and total reward
    time.sleep(2)  #Pause for 2 seconds between episodes

Trying a new level (Defend The Center)

In [None]:
#Switch to the Defend The Center map, with rendering enabled
env = VizDoomGym_Simple(
    config_path=doomfinder('defend_the_center.cfg'),
    render=True,
)
#Print the game variables of the current state
print(env.get_state().game_variables)
#Run stable_baselines3's environment checker to see if the env is valid
env_checker.check_env(env)

In [None]:
#Try the model trained on the basic map against this map (boooo its not good at this map)

#Load the model (hardcoded to load a specific model but adjust as needed)
model = PPO.load('./Training/checkpoints/best_model_PPO_test_basic_4/best_model_50000.zip')

for episode in range(5):
    obs, _ = env.reset()  #reset() returns a pair; only the observation is needed
    done = False  #Becomes True once the episode has ended
    total_reward = 0  #Sum of the rewards collected in this episode
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Ask the model for an action (second return value unused)
        obs, reward, terminated, truncated, info = env.step(action)  #Take the action
        #An episode is over when it is terminated OR truncated; checking only the
        #first flag would spin forever if the env ever truncates without terminating
        done = terminated or truncated
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds so playback is watchable
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  #Print the episode and total reward
    time.sleep(2)  #Pause for 2 seconds between episodes

Train a new model

In [2]:
#Where this run's model checkpoints are saved (the helper presumably creates a fresh directory per run - confirm in pathfinder)
CHECKPOINT_DIR = create_new_checkpoint_directory(
    'best_model_PPO_test_defend_the_center'
)
#Where the training logs are written (passed to PPO as tensorboard_log)
LOG_DIR = './logs/log_PPO_test_defend_the_center'
#Checkpointing callback: with check_freq=25000 the model is saved into CHECKPOINT_DIR after every 25000 training steps
callback = TrainAndLogCallback(
    save_path=CHECKPOINT_DIR,
    check_freq=25000,
)

In [None]:
#Recreate the Defend The Center env without rendering, for training
env = VizDoomGym_Simple(
    config_path=doomfinder('defend_the_center.cfg'),
    render=False,
)
#Create the model with a larger n_steps (8192), since this map is more complex
model = PPO(
    'CnnPolicy',
    env,
    verbose=1,
    tensorboard_log=LOG_DIR,
    learning_rate=0.0001,
    n_steps=8192,
)
#Train the model for 100000 timesteps; `callback` is invoked by the learner along the way
model.learn(
    total_timesteps=100000,
    callback=callback,
)

Testing the model

In [4]:
#Load a saved checkpoint (the path is hardcoded to one specific run and step - adjust as needed)
model = PPO.load(
    './Training/checkpoints/best_model_PPO_test_defend_the_center_3/best_model_200000.zip'
)
#Recreate the Defend The Center env with rendering enabled so the game can be watched
env = VizDoomGym_Simple(
    config_path=doomfinder('defend_the_center.cfg'),
    render=True,
)

In [None]:
#Evaluate the loaded model over 10 episodes; only the mean reward is kept, the second return value is discarded
mean_reward, _ = evaluate_policy(
    model,
    env,
    n_eval_episodes=10,
    render=True,
)

In [5]:
#Try with new model: watch it play 5 episodes, sleeping between steps so that we can see what's going on

for episode in range(5):
    obs, _ = env.reset()  #reset() returns a pair; only the observation is needed
    done = False  #Becomes True once the episode has ended
    total_reward = 0  #Sum of the rewards collected in this episode
    while not done:  #While the game isn't done
        action, _ = model.predict(obs)  #Ask the model for an action (second return value unused)
        obs, reward, terminated, truncated, info = env.step(action)  #Take the action
        #An episode is over when it is terminated OR truncated; checking only the
        #first flag would spin forever if the env ever truncates without terminating
        done = terminated or truncated
        total_reward += reward  #Add the reward to the total reward
        time.sleep(0.05)  #Sleep for 0.05 seconds so playback is watchable
    print('Episode: {}, Total Reward: {}'.format(episode, total_reward))  #Print the episode and total reward
    time.sleep(2)  #Pause for 2 seconds between episodes

Episode: 0, Total Reward: -306.4
Episode: 1, Total Reward: -278.20000000000005
Episode: 2, Total Reward: -575.2
Episode: 3, Total Reward: -382.70000000000016
Episode: 4, Total Reward: -508.8


Testing with a more complicated config (this needs a custom env; moving forward, each config more complicated than this one will probably also need its own env)

In [None]:
#Where this run's model checkpoints are saved (the helper presumably creates a fresh directory per run - confirm in pathfinder)
CHECKPOINT_DIR = create_new_checkpoint_directory(
    'best_model_PPO_test_deadly_corridor'
)
#Where the training logs are written (passed to PPO as tensorboard_log)
LOG_DIR = './logs/log_PPO_test_deadly_corridor'
#Checkpointing callback: with check_freq=25000 the model is saved into CHECKPOINT_DIR after every 25000 training steps
callback = TrainAndLogCallback(
    save_path=CHECKPOINT_DIR,
    check_freq=25000,
)

In [None]:
#Create the Deadly Corridor env (custom env class) without rendering, for training
env = Deadly_Corridor_VZG(
    config_path=doomfinder('deadly_corridor_s5.cfg'),
    render=False,
)
#Create the model; this run uses n_steps=4096
model = PPO(
    'CnnPolicy',
    env,
    verbose=1,
    tensorboard_log=LOG_DIR,
    learning_rate=0.0001,
    n_steps=4096,
)
#Train the model for 100000 timesteps; `callback` is invoked by the learner along the way
model.learn(
    total_timesteps=100000,
    callback=callback,
)