### Setup VizDoom For Python

#### Importing Dependencies

In [1]:
# import vizdoom library
from vizdoom import DoomGame
# import random
import random
# import time 
import time
# import numpy
import numpy as np 
# import pytorch
import torch

#### VizDoom Environment

In [2]:
game = DoomGame()

In [3]:
game.load_config('GitHub/ViZDoom-master/scenarios/basic.cfg')
game.init()

In [4]:
game.close()

In [5]:
actions = np.identity(3,dtype=np.uint8)

#### Convert Vizdoom Environment to Gym

In [6]:
# import gym env
from gym import Env
# import Discrete, Box
from gym.spaces import Discrete, Box
# import opencv
import cv2

In [7]:
# Create Vizdoom OpenAI Gym Environment
class VizDoomGym(Env): 
    # Function that is called when we start the env
    def __init__(self, render=False): 
        # Inherit from Env
        super().__init__()
        # Setup the game 
        self.game = DoomGame()
        self.game.load_config('GitHub/ViZDoom-master/scenarios/basic.cfg')
        
        # Render frame logic
        if render == False: 
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)
        
        # Start the game 
        self.game.init()
        
        # Create the action space and observation space
        self.observation_space = Box(low=0, high=255, shape=(100,160,1),dtype=np.uint8) 
        self.action_space = Discrete(3)
        
    # This is how we take a step in the environment
    def step(self, action):
        # Specify action and take step 
        actions = np.identity(3)
        reward = self.game.make_action(actions[action], 4) 
        
        # Get all the other stuff we need to retun 
        if self.game.get_state(): 
            state = self.game.get_state().screen_buffer
            state = self.grayscale(state)
            ammo = self.game.get_state().game_variables[0]
            info = ammo
        else: 
            state = np.zeros(self.observation_space.shape)
            info = 0 
        
        info = {"info":info}
        done = self.game.is_episode_finished()
        
        return state, reward, done, info 
    
    # Define how to render the game or environment 
    def render(): 
        pass
    
    # What happens when we start a new game 
    def reset(self): 
        self.game.new_episode()
        state = self.game.get_state().screen_buffer
        return self.grayscale(state)
    
    # Grayscale the game frame and resize it 
    def grayscale(self, observation):
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160,1))
        return state
    
    # Call to close down the game
    def close(self): 
        self.game.close()

In [8]:
env = VizDoomGym(render=True)

In [9]:
state = env.reset()

In [10]:
env.close()

#### Callbacks

In [12]:
# Import os for file nav
import os 
# Import callback class from sb3
from stable_baselines3.common.callbacks import BaseCallback

In [13]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [14]:
CHECKPOINT_DIR = './train/train_defend'
LOG_DIR = './logs/log_defend'

In [15]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

#### Train Model

In [16]:
# import ppo for training
from stable_baselines3 import PPO

In [17]:
# Non rendered environment
env = VizDoomGym()

In [18]:
model = PPO('MlpPolicy', env, tensorboard_log=LOG_DIR, verbose=1, learning_rate=0.0001, n_steps=4096)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [19]:
model.learn(total_timesteps=100000, callback=callback)

Logging to ./logs/log_defend\PPO_10
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 26.7     |
|    ep_rew_mean     | -50.9    |
| time/              |          |
|    fps             | 23       |
|    iterations      | 1        |
|    time_elapsed    | 173      |
|    total_timesteps | 4096     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 30.5        |
|    ep_rew_mean          | -69.4       |
| time/                   |             |
|    fps                  | 21          |
|    iterations           | 2           |
|    time_elapsed         | 387         |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.012464103 |
|    clip_fraction        | 0.0365      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -0.00042

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 25.8       |
|    ep_rew_mean          | -55.8      |
| time/                   |            |
|    fps                  | 21         |
|    iterations           | 11         |
|    time_elapsed         | 2062       |
|    total_timesteps      | 45056      |
| train/                  |            |
|    approx_kl            | 0.00941414 |
|    clip_fraction        | 0.0626     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.895     |
|    explained_variance   | 0.0817     |
|    learning_rate        | 0.0001     |
|    loss                 | 2.12e+03   |
|    n_updates            | 100        |
|    policy_gradient_loss | -0.00453   |
|    value_loss           | 4.29e+03   |
----------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 22.4         |
|    ep_re

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 20.6         |
|    ep_rew_mean          | -18.9        |
| time/                   |              |
|    fps                  | 4            |
|    iterations           | 21           |
|    time_elapsed         | 21492        |
|    total_timesteps      | 86016        |
| train/                  |              |
|    approx_kl            | 0.0040369416 |
|    clip_fraction        | 0.0295       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.789       |
|    explained_variance   | 0.411        |
|    learning_rate        | 0.0001       |
|    loss                 | 1.41e+03     |
|    n_updates            | 200          |
|    policy_gradient_loss | -0.000427    |
|    value_loss           | 3.24e+03     |
------------------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len

<stable_baselines3.ppo.ppo.PPO at 0x1cdc4fc16c0>

### Test Model

In [20]:
# Create rendered environment
env = VizDoomGym(render=True)

for episode in range(100): 
    obs = env.reset()
    done = False
    total_reward = 0
    while not done: 
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # time.sleep(0.20)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(total_reward, episode))
    time.sleep(2)

In [22]:
env.close()