# Import

In [1]:
from vizdoom import *
import random
import time
import numpy as np
import gym
from gym import Env
from gym.spaces import Discrete, Box
import cv2
# Import callback class from sb3
from stable_baselines3.common.callbacks import BaseCallback
import os
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common import policies
# import ppo for training
from stable_baselines3 import PPO
from stable_baselines3 import DQN

# Basic Environment

# Testing the basic game

In [2]:
game = DoomGame()
game.load_config('VizDoom/scenarios/basic.cfg')
game.init()

In [3]:
# This is the set of actions we can take in the environment
actions = np.identity(3, dtype=np.uint8)

In [4]:
state = game.get_state()
state.game_variables

array([50.])

In [5]:
episodes = 10 
for episode in range(episodes): 
    # Create a new episode or game 
    game.new_episode()
    # Check the game isn't done 
    while not game.is_episode_finished(): 
        # Get the game state 
        state = game.get_state()
        # Get the game image 
        img = state.screen_buffer
        # Get the game variables - ammo
        info = state.game_variables
        # Take an action
        reward = game.make_action(random.choice(actions),4)
        # Print rewward 
        print('reward:', reward) 
        time.sleep(0.02)
    print('Result:', game.get_total_reward())
    time.sleep(2)


reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: 97.0
reward: -1.0
Result: 51.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: 99.0
Result: -30.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -4.0
reward: -9.0
reward: -4.0
reward: -4.

In [5]:
game.close()

# Set-up openAI framework

In [6]:
import torch

In [7]:
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    print('CUDA is available on this device!')
else:
    device = torch.device("cpu")           # a CPU device object
    print('CUDA is not available on this device :(')

CUDA is available on this device!


In [8]:
# Create DOOM OpenAI Gym SIMPLE Environment
class VizDoomGym(Env): 
    def __init__(self, render=False): 
        super().__init__()
        self.game = DoomGame()
         
        #load basic configuration for simple environment
        self.game.load_config('VizDoom/scenarios/basic.cfg')
        
        #Set visibility of game
        if render == True: 
            self.game.set_window_visible(True)
        else:
            self.game.set_window_visible(False)
        
        # Start the game 
        self.game.init()
        
        # Create observation space
        self.observation_space = Box(low=0, high=255, shape=(100,160,1), dtype=np.uint8) 
        
        # Create action space
        self.action_space = Discrete(3)
        
    # Create Step function
    def step(self, action):
        actions = np.identity(3)
        reward = self.game.make_action(actions[action], 4) 
        
        # Get information from the game
        if self.game.get_state(): 
            state = self.game.get_state().screen_buffer
            state = self.grayscale(state)
            ammo = self.game.get_state().game_variables[0]
            info = ammo
        else: 
            state = np.zeros(self.observation_space.shape)
            info = 0 
        
        info = {"info":info}
        done = self.game.is_episode_finished()
        
        return state, reward, done, info 
    
    # Render
    def render(): 
        pass
    
    # Reset game
    def reset(self): 
        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        return self.grayscale(state)
    
    # Grayscale the game frame and resize it 
    def grayscale(self, observation):
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160,1))
        return state
    
    # Call to close down the game
    def close(self): 
        self.game.close()

In [10]:
env = VizDoomGym(render=True)

In [11]:
state = env.reset()

In [None]:
env.close()

# Create Callback

In [9]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

# Hyper parameter Tuning

In [18]:
import optuna

In [40]:
def optimise_ppo(trial):
    """ Learning hyperparamters we want to optimise"""
    return {
        'n_steps': int(trial.suggest_loguniform('n_steps', 32, 2048)),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
    }

def optimise_agent(trial):
    model_params = optimise_ppo(trial)
    env = VizDoomGym(render=False)
    model = PPO('CnnPolicy', env, verbose =1, **model_params)
    model.learn(25000)
    
    
    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)

    return -1 * last_reward
    
    

In [41]:
study = optuna.create_study()

[32m[I 2023-04-06 17:37:50,513][0m A new study created in memory with name: no-name-6299b18d-eb55-49cb-8e8e-2b82195977b9[0m


In [42]:
study.optimize(optimise_agent, n_trials = 20)

  'n_steps': int(trial.suggest_loguniform('n_steps', 32, 2048)),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1),
  'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=63 and n_envs=1)


---------------------------
| time/              |    |
|    fps             | 39 |
|    iterations      | 1  |
|    time_elapsed    | 1  |
|    total_timesteps | 63 |
---------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 21.6          |
|    ep_rew_mean          | -18           |
| time/                   |               |
|    fps                  | 31            |
|    iterations           | 2             |
|    time_elapsed         | 4             |
|    total_timesteps      | 126           |
| train/                  |               |
|    approx_kl            | 3.1291493e-05 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -1.1          |
|    explained_variance   | -0.00108      |
|    learning_rate        | 2.09e-05      |
|    loss                 | 1.97e+03      |
|    n_updates            | 10            |
|    policy_

[33m[W 2023-04-06 17:38:09,150][0m Trial 0 failed with parameters: {'n_steps': 63.52288032807318, 'learning_rate': 2.0913273644322846e-05, 'ent_coef': 5.6402618165441294e-08} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "C:\Users\Ga401\anaconda3\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\Ga401\AppData\Local\Temp\ipykernel_30840\3716317451.py", line 13, in optimise_agent
    model.learn(25000)
  File "C:\Users\Ga401\anaconda3\lib\site-packages\stable_baselines3\ppo\ppo.py", line 307, in learn
    return super().learn(
  File "C:\Users\Ga401\anaconda3\lib\site-packages\stable_baselines3\common\on_policy_algorithm.py", line 248, in learn
    continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer, n_rollout_steps=self.n_steps)
  File "C:\Users\Ga401\anaconda3\lib\site-packages\stable_baselines3\common\on_policy_algorithm.py", line 

KeyboardInterrupt: 

In [32]:
print(study.best_params)

{'n_steps': 1247.3982614438928, 'learning_rate': 0.00020658149856258678, 'ent_coef': 0.01390447360746765}


# Training Agent

-PPO sensitive to learning rate and batch size  
-DQN sensitive to learning rate, Exploration rate, Batch size, Replay Buffer,Discount Factor

In [10]:
env = VizDoomGym(render=False)

In [11]:
CHECKPOINT_DIR = './model/basic_model/PPO_model_basic'
LOG_DIR = './model_log/log_basic'

callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [12]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1,
            learning_rate=0.00015, n_steps=2048, batch_size = 64, 
            ent_coef=0.0001)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [None]:
model.learn(total_timesteps=100000, callback=callback)

Logging to ./model_log/log_basic\PPO_8
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 27.3     |
|    ep_rew_mean     | -48.5    |
| time/              |          |
|    fps             | 32       |
|    iterations      | 1        |
|    time_elapsed    | 62       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 22.6        |
|    ep_rew_mean          | -29.3       |
| time/                   |             |
|    fps                  | 31          |
|    iterations           | 2           |
|    time_elapsed         | 128         |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009979952 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.000

-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 11.1        |
|    ep_rew_mean          | 46.3        |
| time/                   |             |
|    fps                  | 30          |
|    iterations           | 11          |
|    time_elapsed         | 740         |
|    total_timesteps      | 22528       |
| train/                  |             |
|    approx_kl            | 0.052591946 |
|    clip_fraction        | 0.343       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.859      |
|    explained_variance   | 0.64        |
|    learning_rate        | 0.00015     |
|    loss                 | 1.71e+03    |
|    n_updates            | 100         |
|    policy_gradient_loss | 0.00806     |
|    value_loss           | 4.09e+03    |
-----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 12.2  

# Evaluate

In [78]:
# Import eval policy to test agent
from stable_baselines3.common.evaluation import evaluate_policy

In [86]:
# Reload model from disc
model = PPO.load('./model/PPO2_model_basic/model_80000')

In [13]:
import gc
torch.cuda.empty_cache()
gc.collect()

1564

In [81]:
# Create rendered environment
env = VizDoomGym(render=False)

In [82]:
# Evaluate mean reward for 10 games
mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=100)



In [83]:
mean_reward

84.57

In [87]:
# Create rendered environment
env = VizDoomGym(render=True)

In [88]:
for episode in range(10): 
    obs = env.reset()
    done = False
    result_reward = 0
    while not done: 
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # time.sleep(0.20)
        result_reward += reward
    print('Episode {}: Total Reward is {}'.format(episode, result_reward))
    time.sleep(1)

Total Reward for episode 95.0 is 0
Total Reward for episode 83.0 is 1
Total Reward for episode 95.0 is 2
Total Reward for episode 75.0 is 3
Total Reward for episode 75.0 is 4
Total Reward for episode 95.0 is 5
Total Reward for episode 71.0 is 6
Total Reward for episode 95.0 is 7
Total Reward for episode 67.0 is 8
Total Reward for episode 87.0 is 9
