In [None]:
#Setup

import gym
import sys
sys.path.append('./src')
import torch
# Prefer the GPU when one is usable, but fall back to the CPU so the notebook
# still runs (Restart & Run All) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Environment selection: exactly one env_id assignment is active; the
# commented-out lines are alternative environment ids.
#env_id = "ALE/Pong-v5"
#env_id = 'CartPole-v1'
env_id = 'LunarLander-v2'
# File name handed to FeedbackManager further down in this cell.
fbfile = "lunarlandernormal.dat"
# Create the base env (not wrapped in EnvWithRewardModel); later cells pass
# this same object to EnvPlayer and harvestClips.
env = gym.make(env_id)

# Reset once right after creation; the return value is discarded.
env.reset()


from feedback import FeedbackManager

# Feedback manager constructed with the file name in `fbfile`.
# NOTE(review): show_picker=True presumably enables an interactive picker UI —
# confirm against feedback.py.
fb = FeedbackManager(fbfile, show_picker=True)

from rewardmodel import MlpRewardModel

# Reward model built from the env's observation shape and the selected torch
# device. NOTE(review): n_hidden=20 is presumably the hidden-layer width —
# confirm against rewardmodel.py.
rewardmodel = MlpRewardModel(env.observation_space.shape, device, n_hidden=20)

from envwithrewardmodel import EnvWithRewardModel
def makeenvfun():
    """Build a zero-argument environment factory.

    Returns:
        A callable that, each time it is invoked, creates a new gym
        environment for the notebook-level ``env_id`` and wraps it in
        ``EnvWithRewardModel`` together with the notebook-level
        ``rewardmodel``. Both globals are looked up when the factory is
        called, not when it is built.
    """
    return lambda: EnvWithRewardModel(gym.make(env_id), rewardmodel)

from stable_baselines3 import A2C
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import DummyVecEnv

# Four independent factories, each producing its own reward-model-wrapped env.
env_factories = [makeenvfun() for _ in range(4)]
vec_env = DummyVecEnv(env_factories)

# A2C agent with an MLP policy, trained on the wrapped vectorized env.
model = A2C("MlpPolicy", vec_env, verbose=1)

In [None]:
# Train reward model: feed it random batches drawn from the feedback manager.
# NOTE(review): if MlpRewardModel subclasses torch.nn.Module, `train` would
# shadow nn.Module.train(mode) — confirm it is a custom training step.

reward_updates = 10000
feedback_batch_size = 20
for _ in range(reward_updates):
    rewardmodel.train(fb.randomBatch(feedback_batch_size))

In [None]:
# Train agent: alternate short A2C learning rounds with policy evaluation and
# stop early once the mean evaluation reward exceeds the target.
# NOTE(review): evaluation runs on model.get_env(), i.e. the envs wrapped in
# EnvWithRewardModel — presumably the reported reward is the learned one rather
# than the environment's own; confirm this is intended.

reward_target = 250
steps_per_round = 1000
for round_no in range(1, 11):
    model.learn(total_timesteps=steps_per_round, reset_num_timesteps=False)
    steps_so_far = round_no * steps_per_round

    mean_reward, std_reward = evaluate_policy(
        model,
        model.get_env(),
        n_eval_episodes=10,
    )
    print(f"{steps_so_far}: mean {mean_reward}, std {std_reward}")
    if mean_reward > reward_target:
        print(f"took {steps_so_far} timesteps to reach mean reward {reward_target}")
        break

In [None]:
# Env player: build an EnvPlayer from the base env, the A2C model and the
# reward model.
# Author's caveat: this does not play nicely with clip harvesting. Both are
# handed the same `env` object (NOTE(review): presumably they interfere through
# that shared env — confirm).

from envplayer import EnvPlayer

player = EnvPlayer(env, model, rewardmodel)

In [None]:
# Clip harvesting: repeatedly collect clips with the current policy on the base
# env and queue the first two clips, with their matching observations, on the
# feedback manager.

from harvestclip import harvestClips

harvest_rounds = 20
clip_timesteps = 300
for _ in range(harvest_rounds):
    clips, obs = harvestClips(env, model, n_timesteps=clip_timesteps)
    first_clip, second_clip = clips[0], clips[1]
    first_obs, second_obs = obs[0], obs[1]
    fb.queueClips(first_clip, second_clip, first_obs, second_obs)


In [None]:
# Save the feedback manager's state.
# NOTE(review): presumably written to the `fbfile` given at construction —
# confirm in FeedbackManager.save.
fb.save()

In [None]:
# NOTE(review): presumably displays the comparisons recorded so far — confirm
# in FeedbackManager.viewComparisons.
fb.viewComparisons()