### Importing the libraries

In [16]:
from typing import Callable, Dict, List, Optional, Tuple, Type, Union

import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch as th

from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.callbacks import BaseCallback

In [66]:
#!cd github & git clone https://github.com/mwydmuch/ViZDoom.git

### Importing  OpenAI and Doom Dependencies

In [3]:
from vizdoom import *
import random
import time
import gym
from gym import Env
from gym.spaces import Discrete, Box
import cv2

### Game Setup

In [4]:
#Creating a game instance
game = DoomGame()
#Load our desired Doom configurations 
game.load_config('github/ViZDoom/scenarios/basic.cfg')
#starting up the game
#game.init()

True

In [16]:
game.close()

### Wrapping and defining our environment 

In [11]:
#Define our vizdoom environment class
class VizDoomGym(Env):
    #Initialize our environment
    def __init__(self, render=False):
        #inherit from env base class
        super().__init__()
        self.game = DoomGame()
        #This allows us to load up our configurations which defines our maps,rewards,buttons etc...
        self.game.load_config('github/ViZDoom/scenarios/basic.cfg')
        
        
        #Determine if to render game window
        if render == False:
            self.game.set_window_visible(False)
        else:
            self.game.set_window_visible(True)
        
        #start up the game
        self.game.init()
        
        #create our observation space.
        #We want the same of the observation space to match the game frame exactly- 
        #This is what is used to establish the parameters for the underlying models.
        self.observation_space = Box(low=0, high=255, shape=(100,160,1), dtype=np.uint8)
        #define our action space
        self.action_space = Discrete(3)
    #take step in environment
    def step(self, action):
        #Define the action to take
        actions = np.identity(3, dtype=np.uint8)
        #this actions will be a matrix defining if the agent go left,
        #right or shoot and also our frame skip parameter
        reward = self.game.make_action(actions[action],4)        
        
        #return numpy zeroes array if nothing is returned
        if self.game.get_state():
            state = self.game.get_state().screen_buffer
            #gray scaling the captured image
            state = self.grayscale(state)
            ammo = self.game.get_state().game_variables[0]
            info = ammo
        else:
            state = np.zeros(self.observation_space.shape)
            info = 0      
        
        
        
        info = {"info":info}
        done = self.game.is_episode_finished()
        
        return state, reward, done, info
    #render game 
    def render():
        pass    
    def reset(self):
        self.game.new_episode()        
        state = self.game.get_state().screen_buffer
        return self.grayscale(state)
    
    #grayscale and resize the image
    def grayscale(self, observation):
        #take the observation, grab the color channel and move it to the end
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160,1))
        return state
    #close the game
    def close(self):
        self.game.close()

### Verify our environment

In [6]:
#import environment checker
from stable_baselines3.common import env_checker

In [12]:
env = VizDoomGym(render=True)

In [13]:
env_checker.check_env(env)

In [14]:
env.close()

### Defining our Custom Neural Network

In [17]:
class CustomNetwork(nn.Module):
  
    def __init__(
        self,
        feature_dim: int,
        last_layer_dim_pi: int = 64,
        last_layer_dim_vf: int = 64,
    ):
        super(CustomNetwork, self).__init__()

        # IMPORTANT:
        # Save output dimensions, used to create the distributions
        self.latent_dim_pi = last_layer_dim_pi
        self.latent_dim_vf = last_layer_dim_vf

        # Policy network
        self.policy_net = nn.Sequential(
            nn.Linear(feature_dim, last_layer_dim_pi), nn.ReLU()
        )
        # Value network
        self.value_net = nn.Sequential(
            nn.Linear(feature_dim, last_layer_dim_vf), nn.ReLU()
        )

    def forward(self, features: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
        
        return self.policy_net(features), self.value_net(features)

    def forward_actor(self, features: th.Tensor) -> th.Tensor:
        return self.policy_net(features)

    def forward_critic(self, features: th.Tensor) -> th.Tensor:
        return self.value_net(features)


### Defining our Custom Actor-Critic Policy

In [18]:
class CustomActorCriticPolicy(ActorCriticPolicy):
    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        lr_schedule: Callable[[float], float],
        net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
        activation_fn: Type[nn.Module] = nn.Tanh,
        *args,
        **kwargs,
    ):

        super(CustomActorCriticPolicy, self).__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch,
            activation_fn,
            # Pass remaining arguments to base class
            *args,
            **kwargs,
        )
        # Disable orthogonal initialization
        self.ortho_init = False

    def _build_mlp_extractor(self) -> None:
        self.mlp_extractor = CustomNetwork(self.features_dim)


### Setting up Callback

In [50]:
class TrainingAndLoggingCallback(BaseCallback):
    
    #We pass in how frequently we want to save our model, were we are-
    #going to be saving it and logs
    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainingAndLoggingCallback, self). __init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        
    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)
            
            
    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
            
        return True

In [42]:
#create nessesary folders
CHECKPOINT_DIR = './train/train_basic'
LOG_DIR = './train/log_basic'

In [43]:
#create an instance of our train and logging callbacks
#after the stated steps of training our model, we will save a version of it
callback = TrainingAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

### Train our Model

In [46]:
#create our environment without rendering it
env = VizDoomGym()

In [51]:
#Create our model using our custom actor critic policy and NN
model = PPO(CustomActorCriticPolicy, env, tensorboard_log=LOG_DIR, verbose=1, learning_rate=0.001)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [52]:
#Train our model
model.learn(total_timesteps=40000,callback=callback)

Logging to ./train/log_basic\PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 38.7     |
|    ep_rew_mean     | -125     |
| time/              |          |
|    fps             | 234      |
|    iterations      | 1        |
|    time_elapsed    | 8        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 38          |
|    ep_rew_mean          | -125        |
| time/                   |             |
|    fps                  | 165         |
|    iterations           | 2           |
|    time_elapsed         | 24          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.007983889 |
|    clip_fraction        | 0.0736      |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | -0.000134

------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 25.4         |
|    ep_rew_mean          | -49.7        |
| time/                   |              |
|    fps                  | 139          |
|    iterations           | 11           |
|    time_elapsed         | 161          |
|    total_timesteps      | 22528        |
| train/                  |              |
|    approx_kl            | 0.0036789463 |
|    clip_fraction        | 0.0352       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.913       |
|    explained_variance   | 0.353        |
|    learning_rate        | 0.0003       |
|    loss                 | 2.4e+03      |
|    n_updates            | 100          |
|    policy_gradient_loss | -0.00194     |
|    value_loss           | 5.34e+03     |
------------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mea

<stable_baselines3.ppo.ppo.PPO at 0x1d69756d3a0>

### Test and evaluate our model

In [34]:
#Import evaluate policy to test our agent
from stable_baselines3.common.evaluation import evaluate_policy

In [35]:
#Load model
model = PPO.load('./train/train_basic/best_model_100000')

In [36]:
#create our environment with rendering to test model
env = VizDoomGym(render=True)

In [37]:
#loop through each game
for episode in range(5):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        time.sleep(0.20)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(total_reward, episode))
    time.sleep(2)

Total Reward for episode 95.0 is 0
Total Reward for episode 9.0 is 1
Total Reward for episode 95.0 is 2
Total Reward for episode -395.0 is 3
Total Reward for episode 95.0 is 4


In [38]:
env.close()