# Import

In [2]:
from vizdoom import *
import random
import time
import numpy as np
import gym
from gym import Env
from gym.spaces import Discrete, Box
import cv2
# Import callback class from sb3
from stable_baselines3.common.callbacks import BaseCallback
import os
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common import policies
# import ppo for training
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env

# Loading Basic Model

In [None]:
# Reload model from disc
model = PPO.load('./model/basic_model/PPO_model_basic/model_10000')

In [13]:
doom = DoomGame()
doom.load_config('VizDoom/scenarios/defend_the_center.cfg')
doom.init()

In [14]:
# This is the set of actions we can take in the environment
actions = np.identity(3, dtype=np.uint8)

In [15]:
# This is the set of actions we can take in the environment
actions = np.identity(3, dtype=np.uint8)
state = doom.get_state()
state.game_variables

array([ 26., 100.])

In [53]:
# Loop through episodes 
episodes = 10 
for episode in range(episodes): 
    # Create a new episode or game 
    doom.new_episode()
    # Check the game isn't done 
    while not doom.is_episode_finished(): 
        # Get the game state 
        state = doom.get_state()
        # Get the game image 
        img = state.screen_buffer
        # Get the game variables - ammo
        info = state.game_variables
        # Take an action
        reward = doom.make_action(random.choice(actions),4)
        # Print rewward 
        # print('reward:', reward) 
        time.sleep(0.02)
    print('Result:', doom.get_total_reward())
    time.sleep(2)

Result: 0.0
Result: 0.0
Result: 1.0
Result: -1.0
Result: 1.0
Result: -1.0
Result: 0.0
Result: 0.0
Result: -1.0
Result: -1.0


In [51]:
# Reload model from disc
model = PPO.load('./model/PPO2_model_basic/model_100000')

In [54]:
env = VizDoom(render=True)

In [55]:
for episode in range(10): 
    obs = env.reset()
    done = False
    total_reward = 0
    while not done: 
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # time.sleep(0.20)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(total_reward, episode))
    time.sleep(2)

Total Reward for episode 4.0 is 0
Total Reward for episode 3.0 is 1
Total Reward for episode 3.0 is 2
Total Reward for episode 1.0 is 3
Total Reward for episode 0.0 is 4
Total Reward for episode 3.0 is 5
Total Reward for episode 0.0 is 6
Total Reward for episode 2.0 is 7
Total Reward for episode 7.0 is 8
Total Reward for episode 2.0 is 9


In [8]:
doom.close()

In [28]:
state = doom.get_state()

In [29]:
state.game_variables

In [28]:
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env

In [67]:
env = make_vec_env(lambda: VizDoom(False), n_envs=4)

In [66]:
env.close()

# Set-up openAI framework


In [57]:
# Create Vizdoom OpenAI Gym Environment
class VizDoom(Env): 
    # Function that is called when we start the env
    def __init__(self, render=False): 
        # Inherit from Env
        super().__init__()
        # Setup the game 
        self.game = DoomGame()
        self.game.load_config('VizDoom/scenarios/defend_the_center.cfg')
        
        # Render frame logic
        if render == False: 
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)
        
        # Start the game 
        self.game.init()
        
        # Create the action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(100,160,1), dtype=np.uint8) 
        self.action_space = Discrete(3)
        
    # This is how we take a step in the environment
    def step(self, action):
        # Specify action and take step 
        actions = np.identity(3)
        reward = self.game.make_action(actions[action], 4) 
        
        # Get all the other stuff we need to retun 
        if self.game.get_state(): 
            state = self.game.get_state().screen_buffer
            state = self.grayscale(state)
            ammo = self.game.get_state().game_variables[0]
            info = ammo
        else: 
            state = np.zeros(self.observation_space.shape)
            info = 0 
        
        info = {"info":info}
        done = self.game.is_episode_finished()
        
        return state, reward, done, info 
    
    # Define how to render the game or environment 
    def render(): 
        self.game.render(mode = 'human')
    
    # What happens when we start a new game 
    def reset(self): 
        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        return self.grayscale(state)
    
    # Grayscale the game frame and resize it 
    def grayscale(self, observation):
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160,1))
        return state
    
    # Call to close down the game
    def close(self): 
        self.game.close()

In [46]:
env = VizDoom(render = False)

In [45]:
env.close()

In [48]:
from stable_baselines3.common import env_checker

In [49]:
env_checker.check_env(env)

# HyperParameter Tuning

In [168]:
import optuna

In [192]:
def optimise_ppo(trial):
    """ Learning hyperparamters we want to optimise"""
    return {
        'n_steps': int(trial.suggest_loguniform('n_steps', 640, 8960)),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-7, 1e-1),
    }

def optimise_agent(trial):
    model_params = optimise_ppo(trial)
    env = VizDoom(render=False)
    model = PPO('CnnPolicy', env, verbose =1, **model_params)
    model.learn(30000)
    
    
    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)

    return -1 * last_reward

In [193]:
study = optuna.create_study()

[32m[I 2023-04-08 00:12:47,783][0m A new study created in memory with name: no-name-9304f84e-3b40-4752-8524-f184ae680763[0m


In [None]:
study.optimize(optimise_agent, n_trials = 10)

  'n_steps': int(trial.suggest_loguniform('n_steps', 640, 8960)),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'ent_coef': trial.suggest_loguniform('ent_coef', 1e-7, 1e-1),


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8644 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 182      |
|    ep_rew_mean     | 67.8     |
| time/              |          |
|    fps             | 33       |
|    iterations      | 1        |
|    time_elapsed    | 255      |
|    total_timesteps | 8644     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 187        |
|    ep_rew_mean          | 98.6       |
| time/                   |            |
|    fps                  | 32         |
|    iterations           | 2          |
|    time_elapsed         | 536        |
|    total_timesteps      | 17288      |
| train/                  |            |
|    approx_kl            | 0.01015413 |
|    clip_fraction        | 0.146      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.94      |
|    explained_variance   | 1.07e-05   |
|    learning_rate        | 3.93e-05   |
|   

[32m[I 2023-04-08 00:31:39,065][0m Trial 0 finished with value: 23.875656127929688 and parameters: {'n_steps': 8644.196732534117, 'learning_rate': 3.9314477322718775e-05, 'ent_coef': 0.0052528441125594405}. Best is trial 0 with value: 23.875656127929688.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2411 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 218      |
|    ep_rew_mean     | 78.9     |
| time/              |          |
|    fps             | 32       |
|    iterations      | 1        |
|    time_elapsed    | 74       |
|    total_timesteps | 2411     |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 120       |
|    ep_rew_mean          | 110       |
| time/                   |           |
|    fps                  | 31        |
|    iterations           | 2         |
|    time_elapsed         | 154       |
|    total_timesteps      | 4822      |
| train/                  |           |
|    approx_kl            | 38.796406 |
|    clip_fraction        | 0.987     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.16     |
|    explained_variance   | 1.3e-05   |
|    learning_rate        | 0.0116    |
|    loss           

In [172]:
print(study.best_params)

{'n_steps': 8168.417112954666, 'learning_rate': 0.010338653002033907, 'ent_coef': 0.09275994092754133}


In [58]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [80]:
# Reload model from disc
model = PPO.load('./model/PPO2_model_basic/model_100000')

In [81]:
CHECKPOINT_DIR = './model/PPO1_model_defend'
LOG_DIR = './model_logs/log_defend'

In [82]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [83]:
model.set_env(VizDoom(render = False))

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [84]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, learning_rate=0.00001, n_steps=8192, clip_range=.1, gamma=.95, ent_coef = 0.09)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [85]:
model.learn(total_timesteps=20000, callback=callback)

Logging to ./model_logs/log_defend\PPO_5


KeyboardInterrupt: 

In [93]:
CHECKPOINT_DIR = './model/PPO2_model_defend'
LOG_DIR = './model_logs/log_defend'

In [97]:
# Reload model from disc
Basic_model = PPO.load('./model/PPO2_model_basic/model_80000')

In [None]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, learning_rate=0.0001, n_steps=8192, ent_coef = 0.09)

In [None]:
model.learn(total_timesteps=20000, callback=callback)

In [86]:
CHECKPOINT_DIR = './model/PPO3_model_defend'
LOG_DIR = './model_logs/log_defend'

In [87]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, learning_rate=0.00001, n_steps=8192, clip_range=.1, gamma=.95, ent_coef = 0.09)

Using cuda device
Wrapping the env in a VecTransposeImage.


In [88]:
model.learn(total_timesteps=20000, callback=callback)

Logging to ./model_logs/log_defend\PPO_6
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 79.2     |
|    ep_rew_mean     | 0.21     |
| time/              |          |
|    fps             | 38       |
|    iterations      | 1        |
|    time_elapsed    | 854      |
|    total_timesteps | 32768    |
---------------------------------


<stable_baselines3.ppo.ppo.PPO at 0x16bc777c400>

# Evaluate

In [89]:
# Import eval policy to test agent
from stable_baselines3.common.evaluation import evaluate_policy

In [90]:
# Reload model from disc
model = PPO.load('./model/PPO1_model_defend/model_10000')

In [91]:
# Create rendered environment
env = VizDoomGym(render=True)

In [92]:
for episode in range(10): 
    obs = env.reset()
    done = False
    result_reward = 0
    while not done: 
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # time.sleep(0.20)
        result_reward += reward
    print('Episode {}: Total Reward is {}'.format(episode, result_reward))
    time.sleep(1)

Total Reward for episode 0.0 is 0
Total Reward for episode 0.0 is 1
Total Reward for episode 3.0 is 2


KeyboardInterrupt: 

In [4]:
import torch

In [5]:
import gc
torch.cuda.empty_cache()
gc.collect()

540