# Testing a custom Gymnasium environment


In [5]:
import gymnasium as gym
from gymnasium.envs.registration import register
import numpy as np
import random
import stable_baselines3 as sb

In [8]:
# Record the installed Gymnasium and Stable-Baselines3 versions for this run.
for library in (gym, sb):
    print(library.__version__)

0.28.1
2.0.0a5


In [2]:
# Build the custom environment from its registered id, with on-screen rendering enabled.
env = gym.make(
    "gym_basic:basic-v2",
    render_mode="human",
)

In [3]:
# Run Stable-Baselines3's environment checker on the custom environment
# before it is used by the training cells below.
from stable_baselines3.common.env_checker import check_env
check_env(env)

In [4]:
# Tabular Q-values: one row per discrete state, one column per discrete action,
# all initialised to zero.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [5]:
# --- Training budget ---
num_episodes = 1000
# Upper bound only: episodes in this environment never run past a single step.
max_steps_per_episode = 10

# --- Q-learning update ---
learning_rate = 0.1
discount_rate = 0.99

# --- Epsilon-greedy exploration schedule ---
max_exploration_rate = 1
min_exploration_rate = 0.01
# Start fully exploratory; the rate is decayed after every episode.
exploration_rate = max_exploration_rate
# A smaller decay rate keeps exploration high for longer, so learning is slower.
exploration_decay_rate = 0.01

In [6]:
rewards_all_episodes = []

# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads the hyperparameters, `env` and `q_table` defined in the cells above and
# updates `q_table` in place; per-episode returns go to `rewards_all_episodes`.
for episode in range(num_episodes):
    # Gymnasium's reset() returns (observation, info); only the observation is used.
    state = env.reset()[0]
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off: exploit the greedy action with
        # probability (1 - exploration_rate), otherwise sample a random action.
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, terminated, truncated, info = env.step(action)

        # Update Q-table for Q(s,a).
        # A terminal transition has no future return, so do not bootstrap from
        # the successor state when the episode terminated. (A truncated episode
        # still bootstraps, because the task itself has not ended.)
        future_value = 0.0 if terminated else np.max(q_table[new_state, :])
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + \
            learning_rate * (reward + discount_rate * future_value)

        state = new_state
        rewards_current_episode += reward

        # `or` instead of `==`/`|`: `|` binds tighter than `==`, so the old
        # expression `terminated == True | truncated == True` ignored `truncated`.
        if terminated or truncated:
            break

    # Exploration rate decay (exponential in the episode index)
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

# Calculate and print the average reward per bucket of 100 episodes.
# np.split requires num_episodes to be an exact multiple of the bucket size.
episodes_per_bucket = 100
rewards_per_thousand_episodes = np.split(
    np.array(rewards_all_episodes), num_episodes // episodes_per_bucket
)
count = episodes_per_bucket
# NOTE: the heading says "thousand" although each bucket holds 100 episodes;
# the text is left unchanged so it matches the recorded output.
print("********** Average  reward per thousand episodes **********\n")

for r in rewards_per_thousand_episodes:
    print(count, ": ", str(r.mean()))
    count += episodes_per_bucket

# Print updated Q-table
print("\n\n********** Q-table **********\n")
print(q_table)

********** Average  reward per thousand episodes **********

100 :  -0.04
200 :  0.7000000000000004
300 :  0.8400000000000005
400 :  0.8600000000000005
500 :  1.0000000000000007
600 :  0.9600000000000006
700 :  1.0000000000000007
800 :  0.9800000000000006
900 :  0.9800000000000006
1000 :  0.9800000000000006


********** Q-table **********

[[-0.79410887 -0.9282102   1.         -0.9282102  -0.90152291]
 [ 0.          0.          0.          0.          0.        ]]


In [7]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Train a PPO agent with an MLP policy on the same custom environment.
model = PPO("MlpPolicy", env, verbose=1)

model.learn(total_timesteps=10000)

# Roll out the trained policy for 10 steps, printing each chosen action.
obs = env.reset()[0]
for i in range(10):
    action, _states = model.predict(obs)
    print(action)
    obs, rewards, terminated, truncated, info = env.step(action)
    env.render()
    # Gymnasium requires reset() once an episode has ended; stepping a finished
    # environment is undefined behaviour. Start a new episode before continuing.
    if terminated or truncated:
        obs = env.reset()[0]

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -0.68    |
| time/              |          |
|    fps             | 176      |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 2048     |
---------------------------------
----------------------------------------
| rollout/                |            |
|    ep_len_mean          | 1          |
|    ep_rew_mean          | -0.12      |
| time/                   |            |
|    fps                  | 178        |
|    iterations           | 2          |
|    time_elapsed         | 22         |
|    total_timesteps      | 4096       |
| train/                  |            |
|    approx_kl            | 0.11111563 |
|    clip_fraction        | 0.848      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.5