# Training Reward Predictor for Doom 

In [1]:
from utils.env import make_doom_env
from agents.doom_ppo_agent import DoomPpoAgent
from reward_predictors.doom_reward_predictor import DoomRewardPredictor
import gym
from datetime import datetime
from utils.memory import Memory
from torchsummary import summary

# Build the vectorized Doom environment: one entry from make_doom_env per
# sub-environment, all using the basic scenario config with rendering off.
num_envs = 1
envs = gym.vector.SyncVectorEnv(
    [
        make_doom_env(level_config_path='vizdoom/scenarios/basic.cfg', render=False)
        for _ in range(num_envs)
    ]
)

# Instantiate the PPO agent from a saved checkpoint, then the reward predictor
# sized from the single-environment observation shape.
agent = DoomPpoAgent(
    envs.single_observation_space,
    envs.single_action_space,
    models_path='./models/training_run_2023_07_07_02_24_27/checkpoint_step_292864',
)
# NOTE(review): the positional `1` is presumably the action dimension -- confirm
# against DoomRewardPredictor's signature.
reward_predictor = DoomRewardPredictor(
    envs.single_observation_space.shape,
    1,
    hidden_size=512,
)

# Run bookkeeping: step counter and wall-clock start of this session.
global_step = 0
start_datetime = datetime.now()
start_time = start_datetime.timestamp()

# Rollout / optimisation sizes. batch_size is the number of transitions
# gathered per update (num_envs * num_steps); the remaining values are derived
# from it by integer division.
num_steps = 256
num_mini_batches = 32
num_training_epochs = 10
batch_size = num_envs * num_steps
mini_batch_size = batch_size // num_mini_batches
total_timesteps = 10000000
num_updates = total_timesteps // batch_size
checkpoint_frequency = 3000

# Rollout storage on the agent's device, dimensioned by steps, environments,
# and the single-environment observation/action shapes.
memory = Memory(
    agent.device,
    num_steps,
    num_envs,
    envs.single_observation_space.shape,
    envs.single_action_space.shape,
)

Loading models...
Successfully loaded models!
Updating networks with weights from loaded models...
Successfully updated networks!


In [2]:
import time
import torch
import numpy as np

# Initial rollout state: the first observation from the environments and an
# all-zero done flag, both moved to the agent's device.
observation = torch.Tensor(envs.reset()).to(agent.device)
done = torch.zeros(num_envs).to(agent.device)

for update in range(1, num_updates + 1):

    # Collect num_steps transitions with the loaded agent.
    for step in range(num_steps):
        global_step += num_envs

        # Ask the agent for its next action and its value estimate; gradients
        # are disabled for this query.
        with torch.no_grad():
            action, log_prob, _, value = agent.get_optimal_action_and_value(observation)
            value = value.flatten()

        observation_, reward, done_, info = envs.step(action.cpu().numpy())

        # Flat float32 reward tensor on the agent's device
        reward_tensor = torch.tensor(np.array(reward, dtype=np.float32)).to(agent.device).view(-1)

        # Record the transition. `observation` and `done` are the values held
        # before this step was taken.
        memory.remember(
            step=step,
            observation=observation,
            action=action,
            value=value,
            log_prob=log_prob,
            reward=reward_tensor,
            done=done,
        )

        # Carry the new observation and done status over to the next step
        observation = torch.Tensor(observation_).to(agent.device)
        done = torch.Tensor(done_).to(agent.device)

        # Report the first finished episode found in this step's info, if any
        for item in info:
            if "episode" not in item:
                continue
            print(f"global_step={global_step}, episodic_return={item['episode']['r']}")
            break

    # After each rollout, train the reward predictor on the stored
    # observations, actions and rewards.
    reward_predictor.train(memory.observations, memory.actions, memory.rewards, batch_size)

    

global_step=2, episodic_return=95.0
global_step=4, episodic_return=95.0
global_step=6, episodic_return=95.0
global_step=8, episodic_return=95.0
global_step=10, episodic_return=95.0
global_step=19, episodic_return=67.0
global_step=21, episodic_return=95.0
global_step=23, episodic_return=95.0
global_step=31, episodic_return=71.0
global_step=37, episodic_return=72.0
global_step=39, episodic_return=95.0
global_step=41, episodic_return=95.0
global_step=43, episodic_return=95.0
global_step=51, episodic_return=71.0
global_step=54, episodic_return=91.0
global_step=56, episodic_return=95.0
global_step=58, episodic_return=95.0
global_step=60, episodic_return=95.0
global_step=62, episodic_return=95.0
global_step=67, episodic_return=83.0
global_step=74, episodic_return=75.0
global_step=76, episodic_return=95.0
global_step=78, episodic_return=95.0
global_step=86, episodic_return=71.0
global_step=88, episodic_return=95.0
global_step=96, episodic_return=71.0
global_step=98, episodic_return=95.0
globa

KeyboardInterrupt: 

# Testing Reward Predictor