# Infrastructure

Purpose of this project: set up a custom gym environment from scratch, in several versions. Then train a model on it with simple Q-learning, and see how to make the environment compatible with Stable Baselines.

In [22]:
import numpy as np
import gym
import random

Creating environments.

In [23]:
class BasicEnv(gym.Env):
    """Minimal one-step environment with a deterministic reward.

    The agent starts in state 0, chooses one of five discrete actions, and the
    episode ends immediately in state 1. Action 2 yields a reward of +1; every
    other action yields -1.
    """

    metadata = {'render.modes': ['human']}

    def __init__(self):
        # Five actions (0-4); only action 2 is rewarded, see step().
        self.action_space = gym.spaces.Discrete(5)
        # Two states: 0 is returned by reset(), 1 is returned by step().
        self.observation_space = gym.spaces.Discrete(2)

    def step(self, action):
        """Apply ``action`` and finish the episode.

        Returns:
            A ``(state, reward, done, info)`` tuple where ``state`` is always 1,
            ``reward`` is 1 for action 2 and -1 otherwise, ``done`` is always
            True and ``info`` is an empty dict.
        """
        # After any action the environment is in state 1.
        state = 1

        reward = 1 if action == 2 else -1

        # Regardless of the action, the game is over after a single step.
        done = True

        info = {}

        return state, reward, done, info

    def reset(self):
        """Start a new episode and return the initial state (always 0)."""
        return 0

    def render(self, mode='human'):
        """No-op: there is nothing to draw for this environment."""
        pass

    def close(self):
        """No-op: the environment holds no resources."""
        pass

In [25]:
# Same as BasicEnv, with one difference: the reward for each action is a normally distributed random variable.
# The purpose is to check that external libraries (here numpy) can be used inside the environment.

class BasicEnv2(gym.Env):
    """One-step environment with a random, action-dependent reward.

    Works like BasicEnv (start in state 0, one action out of five, episode
    ends in state 1), except that the reward is sampled from a normal
    distribution whose mean and standard deviation both equal the action.
    """
    metadata = {'render.modes': ['human']}

    def __init__(self):
        # Two states (0 from reset(), 1 from step()) and five actions (0-4).
        self.observation_space = gym.spaces.Discrete(2)
        self.action_space = gym.spaces.Discrete(5)

    def step(self, action):
        """Sample the reward for ``action`` and end the episode.

        Returns ``(1, reward, True, {})``; the reward comes from numpy's
        global random generator.
        """
        sampled_reward = np.random.normal(loc=action, scale=action)

        # The episode always finishes in state 1 after this single step.
        next_state, episode_over, extras = 1, True, {}
        return next_state, sampled_reward, episode_over, extras

    def reset(self):
        """Begin a new episode; the initial state is always 0."""
        return 0

    def render(self, mode='human'):
        """Nothing to display."""

    def close(self):
        """Nothing to release."""

In [26]:
# method 1 - build from gym package
# NOTE(review): the "gym_basic:" prefix presumably requires the gym_basic package
# (which registers the basic-v2 id) to be installed - confirm before running.
env = gym.make("gym_basic:basic-v2")

In [27]:
# method 2 - use local test class
# Instantiates the BasicEnv2 class defined in this notebook directly, so no
# registered environment id is involved.
env = BasicEnv2()

# Q-Learning

Source: https://deeplizard.com/learn/video/HGeI30uATws

I copied the code and tested it with the custom environment instead of the built-in Frozen Lake environment. 

In [46]:
# Use the deterministic BasicEnv (reward +1 for action 2, -1 otherwise) for the Q-learning cells below.
env = BasicEnv()

In [47]:
# Size the Q-table from the environment's spaces:
# one row per state, one column per action, all values starting at zero.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

q_table = np.zeros(shape=(state_space_size, action_space_size))

print(q_table)

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [48]:
num_episodes = 1000
max_steps_per_episode = 10 # upper bound only: BasicEnv.step always returns done=True, so an episode lasts one step

# Step size of the Q-update and discount applied to the best Q-value of the next state
learning_rate = 0.1
discount_rate = 0.99

# Epsilon-greedy schedule: the exploration rate starts at 1 and is decayed
# exponentially towards min_exploration_rate after every episode
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01

exploration_decay_rate = 0.01 # per-episode decay constant; a smaller value keeps the agent exploring (acting randomly) for longer

In [49]:
rewards_all_episodes = []

# Start from a clean slate so that re-running this cell retrains from scratch
# instead of continuing from the previous run's Q-table and decayed exploration rate.
q_table = np.zeros_like(q_table)
exploration_rate = max_exploration_rate

# Q-Learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off: exploit the best known action
        # with probability (1 - exploration_rate), otherwise act randomly.
        if random.uniform(0, 1) > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Update Q-table for Q(s,a): blend the old estimate with the
        # observed reward plus the discounted best value of the next state.
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exploration rate decay (exponential in the episode index)
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

# Calculate and print the average reward for each consecutive block of
# `report_every` episodes. A trailing partial block is dropped, so this also
# works when num_episodes is not a multiple of report_every.
report_every = 100
episode_rewards = np.array(rewards_all_episodes)
num_blocks = len(episode_rewards) // report_every
block_means = episode_rewards[:num_blocks * report_every].reshape(num_blocks, report_every).mean(axis=1)

print(f"********** Average reward per {report_every} episodes **********\n")

for block_index, mean_reward in enumerate(block_means, start=1):
    print(block_index * report_every, ": ", str(mean_reward))

# Print updated Q-table
print("\n\n********** Q-table **********\n")
print(q_table)
        

********** Average  reward per thousand episodes **********

100 :  -0.15999999999999998
200 :  0.7200000000000004
300 :  0.8800000000000006
400 :  0.9600000000000006
500 :  0.9200000000000006
600 :  0.9600000000000006
700 :  1.0000000000000007
800 :  0.9200000000000006
900 :  0.9800000000000006
1000 :  0.9800000000000006


********** Q-table **********

[[-0.91137062 -0.94185026  1.         -0.92023356 -0.84990536]
 [ 0.          0.          0.          0.          0.        ]]


# Verify Environment with Stable Baselines

#modified by southglory, April 9th, 2021.
----------------------------------------------------------

If you get an error when using "stable_baselines",
    please uninstall it and use "stable_baselines3" (the latest version) instead.
    
    https://github.com/DLR-RM/stable-baselines3
    
    " pip install stable-baselines3"
    and import stable_baselines3.
    
There were some changes from SB2(stable_baselines) to SB3(stable_baselines3).

Out of gratitude for this helpful tutorial, I updated this part from SB2 to SB3.

In [50]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

In [54]:
env = gym.make('gym_basic:basic-v0')

# SB2's PPO2 required a vectorized environment. In SB3 the env is wrapped
# automatically when it is passed to the constructor, so this is optional:
# env = DummyVecEnv([lambda: env])

model = PPO("MlpPolicy", env, verbose=0)
model.learn(total_timesteps=10000)

# Roll out the trained policy for 10 steps and print the chosen actions.
obs = env.reset()
for i in range(10):
    action, _states = model.predict(obs)
    print(action)
    obs, rewards, dones, info = env.step(action)
    env.render()
    # A finished episode must be reset before step() is called again.
    if dones:
        obs = env.reset()

2
2
2
2
2
2
2
2
2
2


In [55]:
from stable_baselines3.common.env_checker import check_env

In [56]:
# Check the local BasicEnv class; alternatively use gym.make('gym_basic:basic-v0') as in the PPO cell.
env = BasicEnv()

In [45]:
# Run Stable Baselines3's environment checker on env.
# NOTE(review): per the SB3 docs it reports gym-API problems via warnings/exceptions
# and returns nothing, so no output here presumably means the checks passed - confirm.
check_env(env)