In [None]:
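# References:
#   Stable-Baselines3 guide, RL resources:
#     https://stable-baselines3.readthedocs.io/en/master/guide/rl.html
#   OpenAI Spinning Up, "A Taxonomy of RL Algorithms":
#     https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html#a-taxonomy-of-rl-algorithms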

# 1. Import dependencies

In [None]:
import torch

# Report whether PyTorch can use a CUDA device.
# `torch.cuda.is_available()` is the public API for this check; the private
# `torch._C._cuda_getDeviceCount` binding is not defined in builds compiled
# without CUDA, where calling it raises AttributeError instead of yielding False.
cuda_available = torch.cuda.is_available()
print(cuda_available)
import imageio

In [None]:
import os
import gym 
from stable_baselines3 import DQN
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# 2. Load and Test Environment

In [None]:
# environment_name = "Pong-v0"

# Reward specification passed to WindyGridworld through its `rewards=` argument.
# NOTE(review): the meaning of each triple is defined in GridEnv.py, which is
# not visible from this notebook — confirm there before editing the values.
rewards = [
    [1, 0, 5],
    [1, 10, 4],
    [1, 4, 6],
]
# Alternative layout kept for reference:
# rewards = [[1, 0, 8], [1, 10, 3]]

In [None]:
# env = gym.make(environment_name)  # stock Gym alternative, currently disabled
from GridEnv import WindyGridworld

# Build the project-local grid world that the random-agent loop in the next
# cell steps through.
# NOTE(review): argument semantics live in GridEnv.py, not visible here.
env = WindyGridworld(
    height=20,
    width=11,
    rewards=rewards,
    wind=True,
    allowed_actions=['L', 'R', 'C'],
    reward_terminates_episode=False,
)

In [None]:
# Smoke-test the environment with a random agent: play a handful of episodes,
# rendering every step and printing the summed reward of each episode.
episodes = 10
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0

    while True:
        env.render()
        action = env.action_space.sample()  # random action from the action space
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break
    print(f'Episode:{episode} Score:{score}')
env.close()

# 3. Train an RL Model

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
# Log directory later passed to the model as `tensorboard_log`.
# The same value is assigned again a few cells below, next to `save_path`.
log_path = os.path.join('Training', 'Logs')

In [None]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [None]:
# Output locations, both rooted in the same training folder:
#   save_path - passed to EvalCallback as `best_model_save_path`
#   log_path  - passed to the model as `tensorboard_log`
training_root = 'Training'
save_path = os.path.join(training_root, 'Saved Models')
log_path = os.path.join(training_root, 'Logs')

In [None]:
from GridEnv import WindyGridworld

# Fresh environment instance for training (the earlier one was closed after
# the random-agent loop).
env = WindyGridworld(
    height=20,
    width=11,
    rewards=rewards,
    wind=True,
    allowed_actions=['L', 'R', 'C'],
    reward_terminates_episode=False,
)

# Show the reward configuration as stored on the environment.
print(env.rewards)


In [None]:
# Evaluate on a dedicated environment instance. EvalCallback resets and steps
# the environment it is given, so handing it the very object the model trains
# on would disturb the training rollout every time an evaluation runs.
# NOTE(review): these arguments mirror the training `env` built in the cell
# above — keep the two in sync.
eval_env = WindyGridworld(
    height=20,
    width=11,
    rewards=rewards,
    wind=True,
    allowed_actions=['L', 'R', 'C'],
    reward_terminates_episode=False,
)

# Stops training once a new best mean evaluation reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=4, verbose=1)

# Periodically evaluates the agent (every `eval_freq` callback calls), saves the
# best model under `save_path`, and triggers `stop_callback` on each new best.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)

In [None]:
# Number of dimensions of a single observation (displayed as the cell output).
# NOTE(review): presumably a sanity check that observations are image-like
# before using 'CnnPolicy' — confirm.
obs_shape = env.observation_space.shape
len(obs_shape)

In [None]:
# DQN agent using the CNN policy on the grid-world env; training metrics are
# written to `log_path` for TensorBoard.
model_plenty = DQN(
    policy='CnnPolicy',
    env=env,
    buffer_size=10000,
    tensorboard_log=log_path,
    verbose=1,
)

In [None]:
# Train for up to 3M timesteps; `eval_callback` can end the run earlier when
# its reward-threshold callback fires.
model_plenty.learn(callback=eval_callback, total_timesteps=3_000_000)

In [None]:
# Persist the trained agent.
# Note: this rebinds `save_path`, which earlier held the directory given to
# EvalCallback as `best_model_save_path`; re-running that cell after this one
# would pick up the new value.
model_name = 'DQN_model_GridWorld_Plenty_3M'
save_path = os.path.join('Training', 'Saved Models', model_name)
#load_path = os.path.join('Training', 'Saved Models', 'DQN_model_GridWorld_Plenty')
model_plenty.save(save_path)


In [None]:
# Run 30 rendered evaluation episodes with the trained agent; the value
# returned by evaluate_policy is shown as the cell output.
evaluate_policy(
    model=model_plenty,
    env=env,
    n_eval_episodes=30,
    render=True,
)

In [None]:
# Close the environment after evaluation.
# NOTE(review): the GIF-recording cell below renders through `model_plenty.env`,
# which was built from this same `env` object — confirm rendering still works
# after close().
env.close()

In [None]:
# Record a rollout of the trained agent and save it as an animated GIF.
import imageio
import numpy as np
import cv2

N_FRAMES = 350               # number of frames captured for the GIF
PREVIEW_FRAME = 200          # index of the single frame previewed with OpenCV
GIF_PATH = 'lander_a2c.gif'  # output file (name kept as-is from the original cell)

# Step the environment held by the model.
vec_env = model_plenty.env

images = []
obs = vec_env.reset()
for frame_idx in range(N_FRAMES):
    # Capture the frame for the current state, then let the agent act on it.
    images.append(vec_env.render(mode='rgb_array'))
    action, _ = model_plenty.predict(obs)
    obs, _, _, _ = vec_env.step(action)

# Preview one frame. The index is clamped so that lowering N_FRAMES cannot
# raise IndexError.
if images:
    preview = images[min(PREVIEW_FRAME, len(images) - 1)]
    # NOTE(review): cv2.imshow treats 3-channel arrays as BGR; if the env
    # returns RGB frames the preview colours will appear swapped.
    cv2.imshow('200', preview)
    # imshow only paints the window once the HighGUI event loop runs;
    # waitKey(1) pumps it without blocking the notebook.
    cv2.waitKey(1)

imageio.mimsave(GIF_PATH, images)

In [None]:
# Final cleanup once the GIF has been written.
env.close()