### Importing the libraries

In [1]:
from typing import Callable, Dict, List, Optional, Tuple, Type, Union, Any

import os
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import torch as th

from stable_baselines3 import PPO
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
from stable_baselines3.common.policies import ActorCriticPolicy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import VecTransposeImage
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import EvalCallback

In [111]:
#!cd github & git clone https://github.com/mwydmuch/ViZDoom.git

For this project we will be making use of openai gym to train our agents in a Unity game environment

### Import Unity gym dependencies

In [2]:
from gym_unity.envs import UnityToGymWrapper
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
from mlagents_envs.registry import default_registry
from mlagents_envs.registry import default_registry
import os
import random
import time
import gym
from gym import Env
from gym.spaces import Discrete, Box
import cv2

### Game Setup

We pass in the file path that contains the game executable file

In [3]:
env_path = "C:/Users/Jolomi/Downloads/CCT 4th Year/Capstone/Unity_ML/UL_ML/TestBuild_ML2/UL_ML.exe"

We will be training our agent in a Unity engine environment using customized algorithms, to do this we need to "wrap" the game with openai gym wrapper

This enables us to collect observations from game along with rewards and pass it to our agent. This in turn enables our agent to perform actions in the game world which will be how our agent learns. 

### Wrapping and defining our environment 

In [4]:
class UnityGymCrawler(Env):
    def __init__(self, render=False):
        super(UnityGymCrawler, self).__init__()
        
        #Used for modifying the unity environment
        channel = EngineConfigurationChannel()
        
        #Our Unity script is written in C# while most RL algorithms are in python 
        #We will use the UnityEnvironment wrapper to be able to communicate with the
        #unity environment using python.       
        #We will also decide if we want to render game environment while training. ,side_channels=[channel]
        if render == False:
            env = UnityEnvironment(env_path, worker_id = 5, no_graphics=True)
        else:
            env = UnityEnvironment(env_path, worker_id = 0,side_channels=[channel])       
        
        
        #We change the time scale of the game using a unity environment side channel
        #This enables us to speed up the learning process but the physics in the game may perform unpredictably
        channel.set_configuration_parameters(time_scale = 2.0)
        #Wrapping the python unity environment so that we can use it in openai gym
        env = UnityToGymWrapper(env, allow_multiple_obs=True)
        
        
        
        #We define the action space and size as well as the observation space
        #this allows our RL algorithms to effectivly communicate with the environment
        self.env = env
        self.action_space = self.env.action_space
        self.action_size = self.env.action_size
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,32), dtype=np.uint8)       

    #reset the environment 
    def reset(self):       
        return self.env.reset()
    #This moves the game foward and returns the game state s, reward r
    #done d(to indicate if the game is done) and information info from the game such as player lives
    def step(self, action):
        s, r, d, info = self.env.step(action)
        return s, float(r), d, info
    #Closes the environment
    def close(self):
        #self.env.close()
        pass
    #render the environment
    def render(self, mode="human"):
        #self.env.render()
        pass

In [17]:
class UnityGymWorm(Env):
    def __init__(self, render=False):
        super(UnityGymWorm, self).__init__()
        
        #Used for modifying the unity environment
        channel = EngineConfigurationChannel()
        
        #Our Unity script is written in C# while most RL algorithms are in python 
        #We will use the UnityEnvironment wrapper to be able to communicate with the
        #unity environment using python.       
        #We will also decide if we want to render game environment while training. ,side_channels=[channel]
        if render == False:
            env = UnityEnvironment(env_path, worker_id = 0,side_channels=[channel], no_graphics=True)
        else:
            env = UnityEnvironment(env_path, worker_id = 0,side_channels=[channel])       
        
        
        #We change the time scale of the game using a unity environment side channel
        #This enables us to speed up the learning process but the physics in the game may perform unpredictably
        channel.set_configuration_parameters(time_scale = 2.0)
        #Wrapping the python unity environment so that we can use it in openai gym
        env = UnityToGymWrapper(env, allow_multiple_obs=True)
        
        
        
        #We define the action space and size as well as the observation space
        #this allows our RL algorithms to effectivly communicate with the environment
        self.env = env
        self.action_space = self.env.action_space
        self.action_size = self.env.action_size
        self.observation_space = gym.spaces.Box(low=0, high=1, shape=(1,64), dtype=np.uint8)       

    #reset the environment 
    def reset(self):       
        return self.env.reset()
    #This moves the game foward and returns the game state s, reward r
    #done d(to indicate if the game is done) and information info from the game such as player lives
    def step(self, action):
        s, r, d, info = self.env.step(action)
        return s, float(r), d, info
    #Closes the environment
    def close(self):
        #self.env.close()
        pass
    #render the environment
    def render(self, mode="human"):
        #self.env.render()
        pass

In [88]:
env.close()

### Defining our Custom Neural Network

Creating a neural network for our Actor and Critic network, this will be the brains of our agent enabling it to learn and take action

In [8]:
class CustomNetwork(nn.Module):
  
    def __init__(
        self,
        feature_dim: int,
        last_layer_dim_pi: int = 32,
        last_layer_dim_vf: int = 32,        
    ):
        super(CustomNetwork, self).__init__()

        # IMPORTANT:
        # Save output dimensions, used to create the distributions
        self.latent_dim_pi = last_layer_dim_pi        
        self.latent_dim_vf = last_layer_dim_vf      

        # Policy network
        self.policy_net = nn.Sequential(
            nn.Linear(feature_dim, last_layer_dim_pi), nn.ReLU()            
        )
        # Value network
        self.value_net = nn.Sequential(
            nn.Linear(feature_dim, last_layer_dim_vf), nn.ReLU()            
        )

    def forward(self, features: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
        
        return self.policy_net(features), self.value_net(features)

    def forward_actor(self, features: th.Tensor) -> th.Tensor:
        return self.policy_net(features)

    def forward_critic(self, features: th.Tensor) -> th.Tensor:
        return self.value_net(features)


### Defining our Custom Actor-Critic Policy

We create policies for the actor-critic network, here we can specify the number of neurons and size of our the hidden layers
in our networks using net_arch and how many are shared amoung the neural networks

In [9]:
class CustomActorCriticPolicy(ActorCriticPolicy):
    def __init__(
        self,
        observation_space: gym.spaces.Space,
        action_space: gym.spaces.Space,
        lr_schedule: Callable[[float], float],
        net_arch: Optional[List[Union[int, Dict[str, List[int]]]]] = None,
        #net_arch = [dict(pi=[32, 32, 32], vf=[32, 32, 32])],
        activation_fn: Type[nn.Module] = th.nn.ReLU,
        #activation_fn: Type[nn.Module] = nn.Tanh,
        *args,
        **kwargs,
    ):

        super(CustomActorCriticPolicy, self).__init__(
            observation_space,
            action_space,
            lr_schedule,
            net_arch,
            activation_fn,
            # Pass remaining arguments to base class
            *args,
            **kwargs,
        )
        # Disable orthogonal initialization
        self.ortho_init = False

    def _build_mlp_extractor(self) -> None:
        self.mlp_extractor = CustomNetwork(self.features_dim)


### Setting up Callback

 We pass in how frequently we want to save our model, were we are going to be saving it and logs

In [10]:
class TrainingAndLoggingCallback(BaseCallback):
    
   
    def __init__(self, check_freq, save_path, verbose=1):
        super(TrainingAndLoggingCallback, self). __init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path
        
    def _init_callback(self):
        if self.save_path is not None:
            os.makedirs(self.save_path, exist_ok=True)
            
            
    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            model_path = os.path.join(self.save_path, 'best_model_{}'.format(self.n_calls))
            self.model.save(model_path)
            
        return True

Create folders for saving our models and logs

In [11]:
CHECKPOINT_DIR = './train/train_basic6'
LOG_DIR = './train/log_basic6'

Create an instance of our train and logging callbacks. After the specified training steps, we will save the best version of our agent

In [12]:
callback = TrainingAndLoggingCallback(check_freq=10000, save_path=CHECKPOINT_DIR)

In [16]:
env.close()

### Train our Model

Create the video game environment without rendering it to save compute power

In [18]:
env = UnityGymWorm()

[INFO] Connected to Unity environment with package version 1.5.0-preview and communication version 1.2.0
[INFO] Connected new brain: WormStatic?team=0


In [157]:
#env = UnityGymCrawler()

[INFO] Connected to Unity environment with package version 1.5.0-preview and communication version 1.2.0
[INFO] Connected new brain: WormStatic?team=0




We now create our model using our custom neural network and policies

In [72]:
model = PPO(CustomActorCriticPolicy, env, tensorboard_log=LOG_DIR, verbose=1) , policy_kwargs=policy_kwargs

In [152]:
#model = PPO("MlpPolicy", env, tensorboard_log=LOG_DIR, verbose=1, learning_rate = 0.001)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


Train our model using a specified number of timesteps

In [153]:
model.learn(total_timesteps=60000,callback=callback)

Logging to ./train/log_basic6\PPO_7


KeyboardInterrupt: 

The model appears to be very unstable and unable to learn. We can imporve our model by altering between various hyperparameters like the learning rate, gamma and n_steps, we can also manually alter the number of neurons and hidden layers in our networks to find the best combination. 

However, doing all of this manually is going to be very time consuming and tedious. So, we will automate our hyperparameter search using Optuna.

## Optuna for hyperparameter tuning

### Dependencies for optuna

In [19]:
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler

### Optuna variables

We specify variables used by optuna, including the number of trails to be performed to find the ideal hyperparameters, policy etc

In [20]:
N_TRIALS = 100
N_STARTUP_TRIALS = 10
N_EVALUATIONS = 2
N_TIMESTEPS = int(2e4)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3
DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": env,
}

### Optuna Sampler

In our sampler we specify the range of values for the various hyperparameters that Optuna is going to search in order to find an ideal combination that yields a stable agent 

We also specify various configurations of neural networks for Optuna to search, this includes different number of neurons and hidden layers

In [21]:
def sample_PPO_params(trial: optuna.Trial) -> Dict[str, Any]:
    
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    gae_lambda = 1.0 - trial.suggest_float("gae_lambda", 0.001, 0.2, log=True)
    n_steps = 2 ** trial.suggest_int("exponent_n_steps", 3, 10)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)
    ent_coef = trial.suggest_float("ent_coef", 0.00000001, 0.1, log=True)
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])
    net_arch = trial.suggest_categorical("net_arch", ["tiny", "small"])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # Display true values
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("gae_lambda_", gae_lambda)
    trial.set_user_attr("n_steps", n_steps)

    net_arch = [
        {"pi": [32], "vf": [32]} if net_arch == "tiny"
        else {"pi": [32, 32], "vf": [32, 32]}   
    ]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "max_grad_norm": max_grad_norm,
        "policy_kwargs": {
            "net_arch": net_arch,
            "activation_fn": activation_fn,
            "ortho_init": ortho_init,
        },
    }



### Callback for evaluating trial

Callback to be used in evaluating and testing performance

In [22]:
class TrialEvalCallback(EvalCallback):
    
    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):

        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True

### Model Tuning

Using how specified hyperparameter and neural network architecture ranges, Optuna will not run a number of trials to find the best combination that produces the best possible agent.

After a stabel agent is found and Optuna determines that a better model is no longer possible, the trail is stopped and the  hyperparameters and network architecture values of the best model is displayed.

In [23]:
def objective(trial: optuna.Trial) -> float:

    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters
    kwargs.update(sample_PPO_params(trial))
    # Create our model
    model = PPO(**kwargs)
    # Wrapping env used for evaluation
    eval_env = Monitor(env)
    eval_env = DummyVecEnv([lambda: eval_env])
    #eval_env = VecTransposeImage(eval_env)    
    # Create the callback that will periodically evaluate
    # and report the performance
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))

[32m[I 2022-04-21 22:37:44,008][0m A new study created in memory with name: no-name-eb22b4b9-1357-461e-879d-a915358c053d[0m
[32m[I 2022-04-21 22:52:38,616][0m Trial 0 finished with value: 7.666666666666667e-06 and parameters: {'gamma': 0.0001536317694767627, 'max_grad_norm': 0.5775432638324483, 'gae_lambda': 0.023255323482526007, 'exponent_n_steps': 9, 'lr': 0.012086094807391527, 'ent_coef': 0.0010235593565421405, 'ortho_init': True, 'net_arch': 'tiny', 'activation_fn': 'relu'}. Best is trial 0 with value: 7.666666666666667e-06.[0m


Number of finished trials:  1
Best trial:
  Value:  7.666666666666667e-06
  Params: 
    gamma: 0.0001536317694767627
    max_grad_norm: 0.5775432638324483
    gae_lambda: 0.023255323482526007
    exponent_n_steps: 9
    lr: 0.012086094807391527
    ent_coef: 0.0010235593565421405
    ortho_init: True
    net_arch: tiny
    activation_fn: relu
  User attrs:
    gamma_: 0.9998463682305232
    gae_lambda_: 0.976744676517474
    n_steps: 512


A stable model was found after only two trails

### Create and train our tuned model

Having completed our hyperparameter search, we plug in the hyperparameters generated from the search into our model along with the network archtecture values.

We supply the searched network architecture, orth_init and activation function using policy kwargs

In [16]:
policy_kwargs2 = dict(
    ortho_init = True,
    activation_fn = th.nn.ReLU,
    net_arch = [dict(pi=[32, 32], vf=[32, 32])],
)

Create our model using searched hyperparameters and policy kwargs

In [17]:
model = PPO("CnnPolicy", env, tensorboard_log=LOG_DIR, verbose=1,
           gamma = 0.9171858127315068, max_grad_norm = 1.5061594927147473,
           gae_lambda = 0.9973635408305496, learning_rate = 0.000459048881080786,
           ent_coef = 2.4386279564585447e-08, policy_kwargs=policy_kwargs2, n_steps = 1024)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.


Train our agent using a specified number of timesteps

In [21]:
model.learn(total_timesteps=20000,callback=callback)

Logging to ./train/log_basic\PPO_40
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 12.9     |
|    ep_rew_mean     | 38.6     |
| time/              |          |
|    fps             | 98       |
|    iterations      | 1        |
|    time_elapsed    | 10       |
|    total_timesteps | 1024     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 6.93       |
|    ep_rew_mean          | 73         |
| time/                   |            |
|    fps                  | 79         |
|    iterations           | 2          |
|    time_elapsed         | 25         |
|    total_timesteps      | 2048       |
| train/                  |            |
|    approx_kl            | 0.07594001 |
|    clip_fraction        | 0.382      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.662     |
|    explained_variance   | 0.584      |
|    lear

----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 4.01       |
|    ep_rew_mean          | 86.7       |
| time/                   |            |
|    fps                  | 61         |
|    iterations           | 11         |
|    time_elapsed         | 183        |
|    total_timesteps      | 11264      |
| train/                  |            |
|    approx_kl            | 0.09675221 |
|    clip_fraction        | 0.0638     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.046     |
|    explained_variance   | 0.788      |
|    learning_rate        | 0.000459   |
|    loss                 | 32.9       |
|    n_updates            | 690        |
|    policy_gradient_loss | 0.0228     |
|    value_loss           | 74.6       |
----------------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 3.36       |
|    ep_rew_mean

<stable_baselines3.ppo.ppo.PPO at 0x2ca14cfcf40>

The training is now stable and our agent has learned to maximize its reward as can be seen from the high ep_rew_mean which is the average reward the agent got per episode 

### Test and evaluate our model

We will evaluate our model in the video game environment to determine its performance

In [None]:
#Import evaluate policy to test our agent
from stable_baselines3.common.evaluation import evaluate_policy

We load our best trained model to use for evaluation

In [22]:
model = PPO.load('./train/train_basic2/best_model_80000')

Create and render our environment to see the agents performance

In [23]:
env = VizDoomGym(render=True)

In [24]:
#loop through each game
for episode in range(10):
    obs = env.reset()
    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        time.sleep(0.20)
        total_reward += reward
    print('Total Reward for episode {} is {}'.format(total_reward, episode))
    time.sleep(2)

Total Reward for episode 71.0 is 0
Total Reward for episode 95.0 is 1
Total Reward for episode 95.0 is 2
Total Reward for episode 95.0 is 3
Total Reward for episode 95.0 is 4
Total Reward for episode 95.0 is 5
Total Reward for episode 95.0 is 6
Total Reward for episode 83.0 is 7
Total Reward for episode 95.0 is 8
Total Reward for episode 79.0 is 9


The agent is fully trained and performs very well as seen from the high rewards above

In [25]:
env.close()