In [None]:
import gym
import argparse
import importlib
import time
import random
import numpy as np

import tensorflow as tf
import torch

import os
from os import listdir, makedirs
from os.path import isfile, join

from environments import JellyBeanEnv, MujocoEnv

In [2]:
def evaluate_agent(agent, env, n_episodes_to_evaluate):
  '''Roll out the agent in evaluation mode and report its average return.

  Args:
    agent: object exposing act(observation, mode=...); it is always called
      with mode='eval' here.
    env: environment exposing reset() and step(action), where step returns
      a 4-tuple (next_observation, reward, done, info).
    n_episodes_to_evaluate: number of complete episodes to run.

  Returns:
    The mean, taken over episodes, of the per-episode summed rewards.
  '''
  episode_returns = []
  for _ in range(n_episodes_to_evaluate):
    obs = env.reset()
    episode_return = 0
    finished = False
    # Play one full episode, feeding each new observation back to the agent.
    while not finished:
      obs, step_reward, finished, _ = env.step(agent.act(obs, mode='eval'))
      episode_return += step_reward
    episode_returns.append(episode_return)
  return np.mean(np.array(episode_returns))

In [3]:
def get_environment(env_type):
  '''Build the wrapped gym environment selected by `env_type`.

  The match is by substring: a value containing 'jellybean' yields a
  JellyBeanEnv around 'JBW-COMP579-obj-v1'; otherwise a value containing
  'mujoco' yields a MujocoEnv around 'Hopper-v2'.

  Raises:
    Exception: if `env_type` contains neither keyword.
  '''
  if 'jellybean' in env_type:
    return JellyBeanEnv(gym.make('JBW-COMP579-obj-v1'))
  if 'mujoco' in env_type:
    return MujocoEnv(gym.make('Hopper-v2'))
  raise Exception("ERROR: Please define your env_type to be either 'jellybean' or 'mujoco'!")

def train_agent(agent,
                env,
                env_eval,
                total_timesteps,
                evaluation_freq,
                n_episodes_to_evaluate,
                seed=0):
  '''Trains the agent for a fixed number of environment steps.

  Every `evaluation_freq` training steps the agent is evaluated on
  `env_eval` with evaluate_agent, the result is printed, and it is appended
  to the returned learning curve.

  Args:
    agent: object exposing act(obs, mode='train') and
      update(curr_obs, action, reward, next_obs, done, timestep).
    env: training environment (seed / reset / step, with a 4-tuple step).
    env_eval: separate environment instance used only for evaluation.
    total_timesteps: exact number of training steps to perform.
    evaluation_freq: evaluate after every this-many training steps.
    n_episodes_to_evaluate: episodes per evaluation.
    seed: seed applied to random, numpy, torch, tensorflow and both
      environments. Defaults to 0, the value that used to be hard-coded.

  Returns:
    List with one mean evaluation return per evaluation performed.
  '''
  # Seed every source of randomness that is in play.
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # NOTE(review): set_random_seed is the TF1 spelling (TF2 exposes
  # tf.random.set_seed) - confirm against the installed TensorFlow version.
  tf.random.set_random_seed(seed)
  env.seed(seed)
  env_eval.seed(seed)

  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

  timestep = 0
  array_of_mean_acc_rewards = []

  while timestep < total_timesteps:

    done = False
    curr_obs = env.reset()
    # The budget is checked inside the episode as well, so training stops at
    # exactly `total_timesteps` instead of running on until the current
    # episode happens to terminate.
    while not done and timestep < total_timesteps:
      action = agent.act(curr_obs, mode='train')
      next_obs, reward, done, _ = env.step(action)
      # `timestep` passed here is the 0-based index of the step just taken.
      agent.update(curr_obs, action, reward, next_obs, done, timestep)
      curr_obs = next_obs

      timestep += 1
      if timestep % evaluation_freq == 0:
        mean_acc_rewards = evaluate_agent(agent, env_eval, n_episodes_to_evaluate)
        print('timestep: {ts}, acc_reward: {acr:.2f}'.format(ts=timestep, acr=mean_acc_rewards))
        array_of_mean_acc_rewards.append(mean_acc_rewards)

  return array_of_mean_acc_rewards

In [4]:
# Locate the submission folder and check that it holds the two required files.
path = './GROUP_030/'
files = [f for f in listdir(path) if isfile(join(path, f))]
missing_files = [name for name in ('agent.py', 'env_info.txt') if name not in files]
if missing_files:
  # Raise instead of print + exit(): inside a notebook exit() shuts the kernel
  # down, and the exception also reports which file is actually missing.
  raise FileNotFoundError(
      "Your GROUP folder does not contain agent.py or env_info.txt! "
      "Missing: {}".format(', '.join(missing_files)))

# The first line of env_info.txt names the environment family.
with open(join(path, 'env_info.txt')) as f:
  lines = f.readlines()
if not lines:
  raise ValueError("env_info.txt is empty; its first line must name the environment type.")
# Strip the trailing newline so env_type is a clean token.
env_type = lines[0].strip().lower()

# Separate instances for training and evaluation.
env = get_environment(env_type)
env_eval = get_environment(env_type)

# Collect the spaces the agent constructor receives. The branch order mirrors
# get_environment, which has already raised if neither keyword is present.
if 'jellybean' in env_type:
  env_specs = {
      'scent_space': env.scent_space,
      'vision_space': env.vision_space,
      'feature_space': env.feature_space,
      'action_space': env.action_space,
  }
elif 'mujoco' in env_type:
  env_specs = {
      'observation_space': env.observation_space,
      'action_space': env.action_space,
  }

# Import the submitted agent implementation and instantiate it.
agent_module = importlib.import_module('GROUP_030.agent')
agent = agent_module.Agent(env_specs)

# Note these can be environment specific and you are free to experiment with what works best for you
total_timesteps = 2000000
evaluation_freq = 1000
n_episodes_to_evaluate = 20

  f"The environment {path} is out of date. You should consider "


In [6]:
# Inspect the action space exposed by the wrapped training environment.
env.action_space

Box(-1.0, 1.0, (3,), float32)

In [7]:
# Check the action space stored in the specs dict that was passed to the agent.
env_specs['action_space']

Box(-1.0, 1.0, (3,), float32)

In [8]:
# Run training; train_agent prints the mean evaluation return every
# `evaluation_freq` timesteps and returns those means as a list.
learning_curve = train_agent(agent, env, env_eval, total_timesteps, evaluation_freq, n_episodes_to_evaluate)


timestep: 1000, acc_reward: 14.15
timestep: 2000, acc_reward: 21.45
timestep: 3000, acc_reward: 19.13
timestep: 4000, acc_reward: 21.15
timestep: 5000, acc_reward: 16.89
timestep: 6000, acc_reward: 12.68
timestep: 7000, acc_reward: 14.86
timestep: 8000, acc_reward: 15.18
timestep: 9000, acc_reward: 21.32
timestep: 10000, acc_reward: 23.30
timestep: 11000, acc_reward: 18.01
timestep: 12000, acc_reward: 17.45
timestep: 13000, acc_reward: 15.71
timestep: 14000, acc_reward: 19.29
timestep: 15000, acc_reward: 17.50
timestep: 16000, acc_reward: 18.46
timestep: 17000, acc_reward: 19.16
timestep: 18000, acc_reward: 14.37
timestep: 19000, acc_reward: 18.62
timestep: 20000, acc_reward: 17.12
timestep: 21000, acc_reward: 25.31
timestep: 22000, acc_reward: 17.95
timestep: 23000, acc_reward: 19.89
timestep: 24000, acc_reward: 25.41
timestep: 25000, acc_reward: 17.43
timestep: 26000, acc_reward: 16.49
timestep: 27000, acc_reward: 28.20
timestep: 28000, acc_reward: 11.80
timestep: 29000, acc_reward: