<a href="https://colab.research.google.com/github/NC25/gym_fishing/blob/master/fishing-v0/PPO2_fishing_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Dependencies

In [None]:
%tensorflow_version 1.x
!apt-get install ffmpeg freeglut3-dev xvfb  # For visualization
!pip install stable-baselines[mpi]==2.10.0

# Packaging

In [None]:
!git clone https://github.com/boettiger-lab/gym_fishing.git

In [None]:
!python gym_fishing/setup.py sdist bdist_wheel 

In [None]:
!pip install -e ./gym_fishing/

In [None]:
!ls

In [6]:
!cd gym_fishing

In [7]:
import gym_fishing

# Remove Warnings

In [8]:

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)
import tensorflow as tf
tf.get_logger().setLevel('INFO')
tf.autograph.set_verbosity(0)
import logging
tf.get_logger().setLevel(logging.ERROR)

# Imports

In [9]:
import pandas as pd

In [10]:
import gym

In [11]:
import numpy as np

In [12]:
from stable_baselines import PPO2

from stable_baselines.common.policies import MlpPolicy

In [13]:
env = gym.make('fishing-v0')
model = PPO2(MlpPolicy, env , verbose=2)


Wrapping the env in a DummyVecEnv.


In [15]:
obs = env.reset()
for i in range(100):
  action, _states = model.predict(obs)
  obs, rewards, dones, info = env.step(action)
  print("harvest: " + str(action) + "\n" + "obs: " + str(obs) + "\n" + str(rewards) + "\n" + str(dones) + "\n" + str(info))
  env.render()
  
env.close()

harvest: 2
obs: [0.74394534]
0.010000000000000002
False
{}
harvest: 1
obs: [0.74460623]
0.012000000000000002
False
{}
harvest: 1
obs: [0.74283492]
0.014400000000000001
False
{}
harvest: 1
obs: [0.74797786]
0.01728
False
{}
harvest: 1
obs: [0.7535071]
0.020736
False
{}
harvest: 0
obs: [0.74216733]
0.020736
False
{}
harvest: 1
obs: [0.74917526]
0.0248832
False
{}
harvest: 1
obs: [0.73973179]
0.02985984
False
{}
harvest: 0
obs: [0.72438768]
0.02985984
False
{}
harvest: 0
obs: [0.71583256]
0.02985984
False
{}
harvest: 2
obs: [0.72745732]
0.023887872
False
{}
harvest: 2
obs: [0.73938066]
0.019110297600000003
False
{}
harvest: 1
obs: [0.73946722]
0.022932357120000003
False
{}
harvest: 1
obs: [0.73208245]
0.027518828544000003
False
{}
harvest: 2
obs: [0.73845657]
0.022015062835200005
False
{}
harvest: 0
obs: [0.7504917]
0.022015062835200005
False
{}
harvest: 0
obs: [0.74896835]
0.022015062835200005
False
{}
harvest: 2
obs: [0.75895795]
0.017612050268160005
False
{}
harvest: 1
obs: [0.7414027]

In [16]:
def evaluate(model, num_episodes=100): #episodes are the # of attempts the agent has to learn the env
  #returns mean reward
  env = model.get_env()
  all_episode_rewards = []
  total_action_space = []
  for episode in range(num_episodes):
    episode_rewards = []
    action_space = []
    done = False
    obs = env.reset()
    while not done:
      action, _states = model.predict(obs)
      obs, reward, done, info = env.step(action)
      episode_rewards.append(reward)
      action_space.append(action)
    all_episode_rewards.append(sum(episode_rewards))
    total_action_space.append(action_space)

  current_action_space = total_action_space.append(action_space)
  print("Actions taken: " + str(current_action_space))

  mean_episode_reward = np.mean(all_episode_rewards)
  print("Mean episode reward: " + str(mean_episode_reward) + " Number of episodes: " + str(num_episodes))

  return mean_episode_reward
  return current_action_space

#save helper






In [None]:
before_train_agent = evaluate(model)

In [17]:
#from stable_baselines3.common.evaluation import evaluate_policy 1.679

In [19]:
 trained_model = model.learn(total_timesteps=10000)

--------------------------------------
| approxkl           | 0.00013798967 |
| clipfrac           | 0.0           |
| explained_variance | 0.098         |
| fps                | 230           |
| n_updates          | 1             |
| policy_entropy     | 1.0984386     |
| policy_loss        | -0.0028428696 |
| serial_timesteps   | 128           |
| time_elapsed       | 3.19e-05      |
| total_timesteps    | 128           |
| value_loss         | 0.048134048   |
--------------------------------------
--------------------------------------
| approxkl           | 0.00017571829 |
| clipfrac           | 0.0           |
| explained_variance | -0.0142       |
| fps                | 935           |
| n_updates          | 2             |
| policy_entropy     | 1.0971485     |
| policy_loss        | -0.0014365921 |
| serial_timesteps   | 256           |
| time_elapsed       | 0.556         |
| total_timesteps    | 256           |
| value_loss         | 0.0016803794  |
-------------------------

In [19]:
evaluate(trained_model, num_episodes=100) 

Actions taken: None
Mean episode reward: 3.325334 Number of episodes: 100


3.325334

In [None]:
for i in range(100): #timesteps are the number of iterations the agent goes through
  action, _state = trained_model.predict(obs)
  obs, rewards, dones, info = env.step(action)
  print(action)
  print(evaluate(trained_model, num_episodes=100))
  env.render()

env.close()
  

# Saving and Loading

In [74]:
import os

# save directory
save_dir = '/tmp/gym/'
#creates recursive directory: fills out any missing directories
os.makedirs(save_dir, exist_ok=True) #error is not raised if target directory already exists

trained_model.save(save_dir +'PPO2_fishing-v0')     #save model

#sample observation and action
obs = trained_model.env.observation_space.sample()
action = trained_model.env.action_space.sample()

print("action + state: " + str(trained_model.predict(obs, deterministic=False)))






action + state: (0, None)


In [29]:
#del model

#load model 
loaded_model = PPO2.load(save_dir + 'PPO2_fishing')
print(loaded_model.predict(obs, deterministic=False))


Loading a model without an environment, this model cannot be trained until it has a valid environment.
(0, None)


# Loading and Tuning Previous Models

Let's load a DQN

In [59]:
from stable_baselines import DQN
from stable_baselines import A2C

In [60]:
from stable_baselines.common.vec_env import DummyVecEnv

In [62]:
save_dir = '/tmp/gym/'
os.makedirs(save_dir, exist_ok=True)

model = A2C('MlpPolicy', 'fishing-v0', verbose=2, gamma=.9).learn(2000)
model.save(save_dir + '/DQN_fishing')

#delete model
del model
loaded_model = A2C.load(save_dir + '/DQN_fishing', verbose=2)






Creating environment from the given name, wrapped in a DummyVecEnv.
---------------------------------
| explained_variance | 0.0154   |
| fps                | 24       |
| nupdates           | 1        |
| policy_entropy     | 1.1      |
| total_timesteps    | 5        |
| value_loss         | 0.00167  |
---------------------------------
---------------------------------
| explained_variance | -0.164   |
| fps                | 644      |
| nupdates           | 100      |
| policy_entropy     | 1.1      |
| total_timesteps    | 500      |
| value_loss         | 2.69e-05 |
---------------------------------
----------------------------------
| explained_variance | -1.07e+14 |
| fps                | 737       |
| nupdates           | 200       |
| policy_entropy     | 1.1       |
| total_timesteps    | 1000      |
| value_loss         | 0.000214  |
----------------------------------
---------------------------------
| explained_variance | -0.384   |
| fps                | 778      |
| nupd

In [63]:
#showed loaded hyperparameters
print("loaded: ", "gamma = ", loaded_model.gamma)

loaded:  gamma =  0.9


In [64]:
#Vectorize env - Allow us to train multiple envs into 1 env

loaded_model.set_env(DummyVecEnv([lambda: gym.make('fishing-v0')]))


# Training

In [65]:
loaded_model.learn(8000)

---------------------------------
| explained_variance | 0.00996  |
| fps                | 24       |
| nupdates           | 1        |
| policy_entropy     | 1.1      |
| total_timesteps    | 5        |
| value_loss         | 0.00142  |
---------------------------------
---------------------------------
| explained_variance | 0.195    |
| fps                | 614      |
| nupdates           | 100      |
| policy_entropy     | 1.1      |
| total_timesteps    | 500      |
| value_loss         | 1.21e-06 |
---------------------------------
----------------------------------
| explained_variance | -1.18e+09 |
| fps                | 694       |
| nupdates           | 200       |
| policy_entropy     | 1.1       |
| total_timesteps    | 1000      |
| value_loss         | 9.75e-06  |
----------------------------------
---------------------------------
| explained_variance | 0.000458 |
| fps                | 746      |
| nupdates           | 300      |
| policy_entropy     | 1.1      |
| tota

<stable_baselines.a2c.a2c.A2C at 0x7face2961c50>

# Gym and VecEnv Wrapping

We can modify past environments

In [42]:
from stable_baselines.bench.monitor import Monitor

In [72]:
env = Monitor(gym.make('fishing-v0'), filename=None, allow_early_resets=True)
env = DummyVecEnv([lambda: env])

#env.get_episode_rewards()

loaded_model = DQN('MlpPolicy', env, verbose=2).learn(int(10000))



--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 100      |
| mean 100 episode reward | 1.1      |
| steps                   | 3376     |
--------------------------------------
--------------------------------------
| % time spent exploring  | 2        |
| episodes                | 200      |
| mean 100 episode reward | 1.1      |
| steps                   | 5063     |
--------------------------------------


# Probability Distribution

In [None]:
from stable_baselines.common.distributions import MultiCategoricalProbabilityDistribution

In [None]:
MultiCategoricalProbabilityDistribution