# Import

In [1]:
from vizdoom import *
import random
import time
import numpy as np
import gym
from gym import Env
from gym.spaces import Discrete, Box
import cv2
# Import callback class from sb3
from stable_baselines3.common.callbacks import BaseCallback
import os
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common import policies
# import ppo for training
from stable_baselines3 import PPO
import torch
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env

# Complex Environment

In [2]:
doom = DoomGame()
doom.load_config('VizDoom/scenarios/deadly_corridor_s1.cfg')
doom.init()

In [3]:
# This is the set of actions we can take in the environment
actions = np.identity(7, dtype=np.uint8)

In [4]:
# This is the set of actions we can take in the environment
actions = np.identity(7, dtype=np.uint8)
state = doom.get_state()
state.game_variables

array([100.,  -1.,   0.,   0.,   0.,   0.,   0.])

In [16]:
# Loop through episodes 
episodes = 10 
for episode in range(episodes): 
    # Create a new episode or game 
    doom.new_episode()
    # Check the game isn't done 
    while not doom.is_episode_finished(): 
        # Get the game state 
        state = doom.get_state()
        # Get the game image 
        img = state.screen_buffer
        # Get the game variables - ammo
        info = state.game_variables
        # Take an action
        reward = doom.make_action(random.choice(actions),4)
        # Print rewward 
        # print('reward:', reward) 
        time.sleep(0.02)
    print('Result:', doom.get_total_reward())
    time.sleep(2)

Result: 72.08050537109375
Result: 368.4128875732422
Result: -111.71275329589844
Result: -24.210433959960938
Result: -56.12446594238281
Result: -57.519012451171875
Result: -115.98973083496094
Result: -63.54986572265625
Result: -88.73764038085938
Result: -51.16035461425781


In [5]:
doom.close()

In [28]:
state = doom.get_state()

In [29]:
state.game_variables

# Set-up openAI framework


In [6]:
# Create DOOM OpenAI Gym SIMPLE Environment
class VizDoom(Env): 
    def __init__(self, render=False): 
        super().__init__()
        self.doom = DoomGame()
        
        #load deadly_corridor configuration (complex environment)
        self.doom.load_config('VizDoom/scenarios/deadly_corridor_s1.cfg')
        
        # Define action & observation space
        self.action_space = Discrete(7)
        self.observation_space = Box(low=0, high=255, shape=(100,160,1), dtype=np.uint8) 
        
        #Set visibility of game
        if render == True: 
            self.doom.set_window_visible(True)
        else:
            self.doom.set_window_visible(False)
        
        # Start the game 
        self.doom.init()
        
        #game variable
        self.selected_weapon_ammo = 52
        self.hitcount = 0
        self.killcount = 0
        self.hits_taken = 0
        self.itemcount = 0
        self.damage_taken = 0
        
    # Create Step function
    def step(self, action):
        actions = np.identity(7)
        moving_reward = self.doom.make_action(actions[action], 4) 
        
        reward = 0
        
        # Get information from the game
        if self.doom.get_state(): 
            state = self.doom.get_state().screen_buffer
            state = self.grayscale(state)
    
            
            #Reward shaping
            gameVariables = self.doom.get_state().game_variables
            health, selected_weapon_ammo, hitcount, killcount, hits_taken, itemcount, damage_taken = gameVariables
            
            hitcount_result = hitcount - self.hitcount
            self.hitcount = hitcount
            
            killcount_result = killcount - self.killcount
            self.killcount = killcount
            
            
            hits_taken_result = - hits_taken + self.hits_taken
            self.hits_taken = hits_taken
            
            itemcount_result = itemcount - self.itemcount
            self.itemcount = itemcount
            
            selected_weapon_ammo_result = selected_weapon_ammo - self.selected_weapon_ammo
            self.selected_weapon_ammo = selected_weapon_ammo
            
            damage_taken_result = -damage_taken + self.damage_taken
            self.damage_taken = damage_taken
            
            reward = damage_taken_result*30 + hitcount_result*250 + selected_weapon_ammo_result*7 + killcount_result*30 + itemcount_result*3 + moving_reward
            
            info = selected_weapon_ammo
        else: 
            state = np.zeros(self.observation_space.shape)
            info = 0 
        
        info = {"info":info}
        done = self.doom.is_episode_finished()
        
        return state, reward, done, info 
    
    def render(self, mode='human'):
        if mode == 'rgb_array':
            return np.zeros((64, 64, 3), dtype=np.uint8)  # return an empty image
        elif mode == 'ansi':
            return ''  # return an empty string
        else:
            super(MyEnvironment, self).render(mode=mode)
    
    # Reset game
    def reset(self): 
        self.doom.new_episode()
        state = self.doom.get_state().screen_buffer
        return self.grayscale(state)
    
    # Grayscale the game frame and resize it 
    def grayscale(self, observation):
        gray = cv2.cvtColor(np.moveaxis(observation, 0, -1), cv2.COLOR_BGR2GRAY)
        resize = cv2.resize(gray, (160,100), interpolation=cv2.INTER_CUBIC)
        state = np.reshape(resize, (100,160,1))
        return state
    
    # Call to close down the game
    def close(self): 
        self.doom.close()

In [7]:
env = VizDoom(render = False)

In [163]:
env.close()

In [8]:
from stable_baselines3.common import env_checker

In [9]:
env_checker.check_env(env)

# HyperParameter Tuning

In [168]:
import optuna

In [192]:
def optimise_ppo(trial):
    """ Learning hyperparamters we want to optimise"""
    return {
        'n_steps': int(trial.suggest_loguniform('n_steps', 640, 8960)),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-7, 1e-1),
    }

def optimise_agent(trial):
    model_params = optimise_ppo(trial)
    env = VizDoom(render=False)
    model = PPO('CnnPolicy', env, verbose =1, **model_params)
    model.learn(30000)
    
    
    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    while n_episodes < 4:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        reward_sum += reward

        if done:
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)

    return -1 * last_reward

In [193]:
study = optuna.create_study()

[32m[I 2023-04-08 00:12:47,783][0m A new study created in memory with name: no-name-9304f84e-3b40-4752-8524-f184ae680763[0m


In [194]:
study.optimize(optimise_agent, n_trials = 10)

  'n_steps': int(trial.suggest_loguniform('n_steps', 640, 8960)),
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-1),
  'ent_coef': trial.suggest_loguniform('ent_coef', 1e-7, 1e-1),


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=8644 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 182      |
|    ep_rew_mean     | 67.8     |
| time/              |          |
|    fps             | 33       |
|    iterations      | 1        |
|    time_elapsed    | 255      |
|    total_timesteps | 8644     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 187        |
|    ep_rew_mean          | 98.6       |
| time/                   |            |
|    fps                  | 32         |
|    iterations           | 2          |
|    time_elapsed         | 536        |
|    total_timesteps      | 17288      |
| train/                  |            |
|    approx_kl            | 0.01015413 |
|    clip_fraction        | 0.146      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.94      |
|    explained_variance   | 1.07e-05   |
|    learning_rate        | 3.93e-05   |
|   

[32m[I 2023-04-08 00:31:39,065][0m Trial 0 finished with value: 23.875656127929688 and parameters: {'n_steps': 8644.196732534117, 'learning_rate': 3.9314477322718775e-05, 'ent_coef': 0.0052528441125594405}. Best is trial 0 with value: 23.875656127929688.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2411 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 218      |
|    ep_rew_mean     | 78.9     |
| time/              |          |
|    fps             | 32       |
|    iterations      | 1        |
|    time_elapsed    | 74       |
|    total_timesteps | 2411     |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 120       |
|    ep_rew_mean          | 110       |
| time/                   |           |
|    fps                  | 31        |
|    iterations           | 2         |
|    time_elapsed         | 154       |
|    total_timesteps      | 4822      |
| train/                  |           |
|    approx_kl            | 38.796406 |
|    clip_fraction        | 0.987     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.16     |
|    explained_variance   | 1.3e-05   |
|    learning_rate        | 0.0116    |
|    loss           

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 108       |
|    ep_rew_mean          | -14       |
| time/                   |           |
|    fps                  | 30        |
|    iterations           | 12        |
|    time_elapsed         | 961       |
|    total_timesteps      | 28932     |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | -9.75e-07 |
|    explained_variance   | 0.902     |
|    learning_rate        | 0.0116    |
|    loss                 | 476       |
|    n_updates            | 110       |
|    policy_gradient_loss | 2.18e-08  |
|    value_loss           | 4.1e+03   |
---------------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 105           |
|    ep_rew_mean          | 

[32m[I 2023-04-08 00:49:20,128][0m Trial 1 finished with value: 160.99990463256836 and parameters: {'n_steps': 2411.013829289084, 'learning_rate': 0.011640321797837848, 'ent_coef': 9.091527312452334e-06}. Best is trial 0 with value: 23.875656127929688.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=2320 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 193      |
|    ep_rew_mean     | 75       |
| time/              |          |
|    fps             | 33       |
|    iterations      | 1        |
|    time_elapsed    | 68       |
|    total_timesteps | 2320     |
---------------------------------
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 139      |
|    ep_rew_mean          | 11.1     |
| time/                   |          |
|    fps                  | 32       |
|    iterations           | 2        |
|    time_elapsed         | 144      |
|    total_timesteps      | 4640     |
| train/                  |          |
|    approx_kl            | 57.86133 |
|    clip_fraction        | 0.992    |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.0699  |
|    explained_variance   | 9.2e-05  |
|    learning_rate        | 0.00932  |
|    loss                 | 5.56e+03

---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 113       |
|    ep_rew_mean          | -11.9     |
| time/                   |           |
|    fps                  | 30        |
|    iterations           | 12        |
|    time_elapsed         | 912       |
|    total_timesteps      | 27840     |
| train/                  |           |
|    approx_kl            | 1.2385926 |
|    clip_fraction        | 0.015     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.00107  |
|    explained_variance   | 0.888     |
|    learning_rate        | 0.00932   |
|    loss                 | 480       |
|    n_updates            | 110       |
|    policy_gradient_loss | 0.0531    |
|    value_loss           | 5.25e+03  |
---------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 112        |
|    ep_rew_mean          | -13.4    

[32m[I 2023-04-08 01:06:13,391][0m Trial 2 finished with value: 209.98968887329102 and parameters: {'n_steps': 2320.5271869658977, 'learning_rate': 0.009322342019146265, 'ent_coef': 0.0021151216643668404}. Best is trial 0 with value: 23.875656127929688.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=7414 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 213      |
|    ep_rew_mean     | 98.4     |
| time/              |          |
|    fps             | 30       |
|    iterations      | 1        |
|    time_elapsed    | 242      |
|    total_timesteps | 7414     |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 204       |
|    ep_rew_mean          | 148       |
| time/                   |           |
|    fps                  | 29        |
|    iterations           | 2         |
|    time_elapsed         | 503       |
|    total_timesteps      | 14828     |
| train/                  |           |
|    approx_kl            | 0.0160996 |
|    clip_fraction        | 0.201     |
|    clip_range           | 0.2       |
|    entropy_loss         | -1.93     |
|    explained_variance   | 7.63e-06  |
|    learning_rate        | 0.000101  |
|    loss           

[32m[I 2023-04-08 01:27:26,394][0m Trial 3 finished with value: 152.58916473388672 and parameters: {'n_steps': 7414.178037424672, 'learning_rate': 0.00010149827495487078, 'ent_coef': 0.004437401206035003}. Best is trial 0 with value: 23.875656127929688.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4651 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 220      |
|    ep_rew_mean     | 80.7     |
| time/              |          |
|    fps             | 32       |
|    iterations      | 1        |
|    time_elapsed    | 141      |
|    total_timesteps | 4651     |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 147       |
|    ep_rew_mean          | 13.6      |
| time/                   |           |
|    fps                  | 31        |
|    iterations           | 2         |
|    time_elapsed         | 294       |
|    total_timesteps      | 9302      |
| train/                  |           |
|    approx_kl            | 26.933367 |
|    clip_fraction        | 0.998     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.03     |
|    explained_variance   | 1.35e-05  |
|    learning_rate        | 0.00923   |
|    loss           

[32m[I 2023-04-08 01:45:36,853][0m Trial 4 finished with value: 165.99876022338867 and parameters: {'n_steps': 4651.407759294581, 'learning_rate': 0.009225296965775249, 'ent_coef': 0.0003160654432648936}. Best is trial 0 with value: 23.875656127929688.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=4235 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 174      |
|    ep_rew_mean     | 61       |
| time/              |          |
|    fps             | 32       |
|    iterations      | 1        |
|    time_elapsed    | 129      |
|    total_timesteps | 4235     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 180         |
|    ep_rew_mean          | 54.4        |
| time/                   |             |
|    fps                  | 30          |
|    iterations           | 2           |
|    time_elapsed         | 274         |
|    total_timesteps      | 8470        |
| train/                  |             |
|    approx_kl            | 0.012253544 |
|    clip_fraction        | 0.121       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.94       |
|    explained_variance   | 1.22e-05    |
|    learning_rate        | 3.

[32m[I 2023-04-08 02:05:15,597][0m Trial 5 finished with value: -5.399448394775391 and parameters: {'n_steps': 4235.429057133876, 'learning_rate': 3.294570236858586e-05, 'ent_coef': 1.998342701539892e-06}. Best is trial 5 with value: -5.399448394775391.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=6556 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 259      |
|    ep_rew_mean     | 83.5     |
| time/              |          |
|    fps             | 32       |
|    iterations      | 1        |
|    time_elapsed    | 201      |
|    total_timesteps | 6556     |
---------------------------------
---------------------------------------
| rollout/                |           |
|    ep_len_mean          | 155       |
|    ep_rew_mean          | 11.7      |
| time/                   |           |
|    fps                  | 31        |
|    iterations           | 2         |
|    time_elapsed         | 420       |
|    total_timesteps      | 13112     |
| train/                  |           |
|    approx_kl            | 60.817234 |
|    clip_fraction        | 0.997     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.0467   |
|    explained_variance   | 6.28e-05  |
|    learning_rate        | 0.00385   |
|    loss           

[32m[I 2023-04-08 02:24:03,732][0m Trial 6 finished with value: 262.9928894042969 and parameters: {'n_steps': 6556.8725197734, 'learning_rate': 0.003853436331065002, 'ent_coef': 6.973990921530112e-05}. Best is trial 5 with value: -5.399448394775391.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=5593 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 171      |
|    ep_rew_mean     | 52.9     |
| time/              |          |
|    fps             | 34       |
|    iterations      | 1        |
|    time_elapsed    | 162      |
|    total_timesteps | 5593     |
---------------------------------
--------------------------------------
| rollout/                |          |
|    ep_len_mean          | 126      |
|    ep_rew_mean          | 17.1     |
| time/                   |          |
|    fps                  | 28       |
|    iterations           | 2        |
|    time_elapsed         | 393      |
|    total_timesteps      | 11186    |
| train/                  |          |
|    approx_kl            | 86.5476  |
|    clip_fraction        | 0.992    |
|    clip_range           | 0.2      |
|    entropy_loss         | -0.0567  |
|    explained_variance   | 7.55e-05 |
|    learning_rate        | 0.0148   |
|    loss                 | 3.47e+03

[32m[I 2023-04-08 12:19:28,441][0m Trial 7 finished with value: 160.98826217651367 and parameters: {'n_steps': 5593.264006958653, 'learning_rate': 0.014833096573661157, 'ent_coef': 8.560725705999883e-07}. Best is trial 5 with value: -5.399448394775391.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=1850 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 229      |
|    ep_rew_mean     | 31.5     |
| time/              |          |
|    fps             | 29       |
|    iterations      | 1        |
|    time_elapsed    | 61       |
|    total_timesteps | 1850     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 261         |
|    ep_rew_mean          | 142         |
| time/                   |             |
|    fps                  | 29          |
|    iterations           | 2           |
|    time_elapsed         | 126         |
|    total_timesteps      | 3700        |
| train/                  |             |
|    approx_kl            | 0.053512257 |
|    clip_fraction        | 0.311       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.92       |
|    explained_variance   | 3.26e-05    |
|    learning_rate        | 0.

-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 107           |
|    ep_rew_mean          | -18.2         |
| time/                   |               |
|    fps                  | 30            |
|    iterations           | 11            |
|    time_elapsed         | 674           |
|    total_timesteps      | 20350         |
| train/                  |               |
|    approx_kl            | 3.2023374e-05 |
|    clip_fraction        | 0.000862      |
|    clip_range           | 0.2           |
|    entropy_loss         | -0.00168      |
|    explained_variance   | 0.868         |
|    learning_rate        | 0.000308      |
|    loss                 | 336           |
|    n_updates            | 100           |
|    policy_gradient_loss | 4.23e-05      |
|    value_loss           | 4.74e+03      |
-------------------------------------------
-------------------------------------------
| rollout/                |     

[32m[I 2023-04-08 12:37:06,553][0m Trial 8 finished with value: 60.99372100830078 and parameters: {'n_steps': 1850.022432509417, 'learning_rate': 0.0003082849920142514, 'ent_coef': 0.04786984472324129}. Best is trial 5 with value: -5.399448394775391.[0m


Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


We recommend using a `batch_size` that is a factor of `n_steps * n_envs`.
Info: (n_steps=3394 and n_envs=1)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 206      |
|    ep_rew_mean     | 83.3     |
| time/              |          |
|    fps             | 32       |
|    iterations      | 1        |
|    time_elapsed    | 104      |
|    total_timesteps | 3394     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 127        |
|    ep_rew_mean          | 204        |
| time/                   |            |
|    fps                  | 30         |
|    iterations           | 2          |
|    time_elapsed         | 223        |
|    total_timesteps      | 6788       |
| train/                  |            |
|    approx_kl            | 0.47561276 |
|    clip_fraction        | 0.541      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.84      |
|    explained_variance   | 7.99e-06   |
|    learning_rate        | 0.000489   |
|   

[32m[I 2023-04-08 12:55:03,816][0m Trial 9 finished with value: -1402.1233978271484 and parameters: {'n_steps': 3394.994496832663, 'learning_rate': 0.0004891682519177643, 'ent_coef': 1.705155573551758e-07}. Best is trial 9 with value: -1402.1233978271484.[0m


In [195]:
print(study.best_params)

{'n_steps': 3394.994496832663, 'learning_rate': 0.0004891682519177643, 'ent_coef': 1.705155573551758e-07}


# Train Model

In [10]:
class TrainAndLoggingCallback(BaseCallback):

    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainAndLoggingCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'model_{}'.format(self.n_calls))
            self.model.save(model_path)

        return True

In [11]:
env = VizDoom(False)

In [12]:
CHECKPOINT_DIR = './model/deadly_model/PPO3_model_deadly/'
LOG_DIR = './model_log/log_deadly'

In [13]:
callback = TrainAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [14]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, learning_rate=0.00001, n_steps=8192, ent_coef = 0.00001, gamma = 0.95, clip_range=.1, gae_lambda=.9)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [None]:
model.learn(total_timesteps=500000, callback=callback)

Logging to ./model_log/log_deadly\PPO_6
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 254      |
|    ep_rew_mean     | 71       |
| time/              |          |
|    fps             | 33       |
|    iterations      | 1        |
|    time_elapsed    | 246      |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 241         |
|    ep_rew_mean          | 118         |
| time/                   |             |
|    fps                  | 30          |
|    iterations           | 2           |
|    time_elapsed         | 539         |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.002248768 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.1         |
|    entropy_loss         | -1.94       |
|    explained_variance   | -4.4

In [270]:
# Reload model from disc
model = PPO.load('./model/PPO2_model_corridor/model_130000')

In [271]:
# Access the policy network used by the model
policy = model.policy

# Modify the learning rate of the optimizer
policy.optimizer.lr = 0.0001

# Modify the ent_coef parameter
policy.ent_coef = 0.1

In [272]:
# Access the policy network used by the model
policy = model.policy

# Access the learning rate of the optimizer
lr = policy.optimizer.lr
ent_coef = policy.ent_coef
# Print the learning rate
print("Current learning rate: {}".format(lr))
print("Current ent_coef: {}".format(ent_coef))

Current learning rate: 0.0001
Current ent_coef: 0.1


In [273]:
policy = model.policy

# Get the number of output nodes in the model
n_output_nodes = policy.action_net.out_features

# Print the number of output nodes in the model
print("Number of output nodes in the model: {}".format(n_output_nodes))

Number of output nodes in the model: 7


In [274]:
pretrained_model = PPO.load('./model/PPO2_model_corridor/model_130000')

In [275]:
model = PPO('CnnPolicy', env, tensorboard_log=LOG_DIR, verbose=1, learning_rate=0.0001, n_steps=8192, ent_coef = 0.1)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


In [276]:
model.set_parameters(pretrained_model.get_parameters())

In [277]:
# Access the policy network used by the model
policy = model.policy

In [279]:
model.learn(total_timesteps=100000, callback=callback)

Logging to ./logs/log_corridor\PPO_5
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 146      |
|    ep_rew_mean     | 206      |
| time/              |          |
|    fps             | 13       |
|    iterations      | 1        |
|    time_elapsed    | 604      |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 165         |
|    ep_rew_mean          | 230         |
| time/                   |             |
|    fps                  | 13          |
|    iterations           | 2           |
|    time_elapsed         | 1237        |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.025899043 |
|    clip_fraction        | 0.282       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.83       |
|    explained_variance   | 0.453  

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 275        |
|    ep_rew_mean          | 845        |
| time/                   |            |
|    fps                  | 13         |
|    iterations           | 11         |
|    time_elapsed         | 6703       |
|    total_timesteps      | 90112      |
| train/                  |            |
|    approx_kl            | 0.06696437 |
|    clip_fraction        | 0.454      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.69      |
|    explained_variance   | 0.233      |
|    learning_rate        | 0.0001     |
|    loss                 | 1.45e+04   |
|    n_updates            | 100        |
|    policy_gradient_loss | 0.0146     |
|    value_loss           | 1.4e+04    |
----------------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 294         |
|    ep_rew_m

<stable_baselines3.ppo.ppo.PPO at 0x15756682c20>

# Evaluate

In [181]:
# Import eval policy to test agent
from stable_baselines3.common.evaluation import evaluate_policy

In [27]:
# Reload model from disc
model = PPO.load('./model/deadly_model/PPO_model_deadly/model_500000')

In [34]:
# Create rendered environment
env = VizDoom(render=True)

In [36]:
for episode in range(10): 
    obs = env.reset()
    done = False
    result_reward = 0
    while not done: 
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # time.sleep(0.20)
        result_reward += reward
    print('Episode {}: Total Reward is {}'.format(episode, result_reward))
    time.sleep(1)

Total Reward for episode 1498.1846618652344 is 0
Total Reward for episode 481.5829162597656 is 1
Total Reward for episode 1785.810043334961 is 2
Total Reward for episode 724.0308685302734 is 3
Total Reward for episode 579.0882873535156 is 4
Total Reward for episode 1219.1358337402344 is 5
Total Reward for episode 1475.3241271972656 is 6
Total Reward for episode 775.8914489746094 is 7
Total Reward for episode 1239.0442199707031 is 8
Total Reward for episode 559.2709045410156 is 9


In [202]:
import gc
torch.cuda.empty_cache()
gc.collect()

16407