In [1]:
import gym
import random
import tensorflow.compat.v1 as tf
import numpy as np
import time
from gym.envs.registration import register
from IPython.display import clear_output
from collections import deque

# Switch TensorFlow to its 1.x-style behavior; the agent code in this notebook
# uses placeholders and sessions from the tensorflow.compat.v1 API.
tf.disable_v2_behavior()
# Show the installed Gym version next to the results.
print("Gym:", gym.__version__)

Instructions for updating:
non-resource variables are not supported in the long term
Gym: 0.18.0


In [2]:
# Register a non-slippery variant of the 4x4 FrozenLake map under its own id.
# Gym raises gym.error.Error when an id is registered a second time, which
# happens whenever this cell is re-run in the same kernel. Only that error is
# ignored so that any other failure (or a KeyboardInterrupt) still surfaces.
try:
    register(
        id='FrozenLakeNoSlip-v0',
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name': '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=0.78,  # optimum = .8196
    )
except gym.error.Error:
    pass

# Environment used by the rest of the notebook.
# Other ids that were tried here: "FrozenLake-v0", "CartPole-v0".
env_name = "FrozenLakeNoSlip-v0"
env = gym.make(env_name)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Observation space: Discrete(16)
Action space: Discrete(4)


In [3]:
class Agent():
    """Baseline agent that ignores the state and picks actions at random.

    For a discrete action space an integer in ``[0, action_size)`` is drawn.
    Otherwise the space is expected to expose ``low``, ``high`` and ``shape``
    and a uniform sample of that shape is drawn between the bounds.
    """

    def __init__(self, env):
        """Read the action-space description from ``env``.

        Args:
            env: Object exposing an ``action_space`` attribute.
        """
        # isinstance instead of an exact type comparison, so subclasses of
        # Discrete are treated as discrete too rather than falling through to
        # the continuous branch (where they would fail on ``.low``).
        self.is_discrete = isinstance(env.action_space, gym.spaces.discrete.Discrete)

        if self.is_discrete:
            self.action_size = env.action_space.n
            # Prints the space object itself (e.g. "Discrete(4)").
            print("Action size:", env.action_space)
        else:
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
            self.action_shape = env.action_space.shape
            print("Action range:", self.action_low, self.action_high)

    def get_action(self, state):
        """Return a random action; ``state`` is accepted but not used.

        Returns:
            An int for discrete spaces, otherwise a NumPy array of shape
            ``action_shape`` sampled uniformly between the space bounds.
        """
        if self.is_discrete:
            action = random.choice(range(self.action_size))
        else:
            action = np.random.uniform(self.action_low,
                                       self.action_high,
                                       self.action_shape)

        return action


In [4]:
class QNAgent(Agent):
    """Epsilon-greedy Q-learning agent backed by a single dense TF1 layer.

    The integer state is one-hot encoded and fed through one dense layer named
    ``"q_table"`` with one output per action; those outputs are used as the
    Q-values of the state. Training minimises the squared difference between
    the Q-value of the taken action and a one-step target using Adam.
    """

    def __init__(self, env, discount_rate = 0.97, learning_rate = 0.01, eps_decay = 0.99):
        """Build the graph and open a session.

        Args:
            env: Environment with a discrete observation space (``.n`` is read).
            discount_rate: Factor applied to the best next-state Q-value.
            learning_rate: Learning rate passed to the Adam optimizer.
            eps_decay: Factor ``eps`` is multiplied by after every finished
                episode (defaults to the previously hard-coded 0.99).
        """
        super().__init__(env)
        self.state_size = env.observation_space.n
        print("State size:", self.state_size)

        # Start fully exploratory; eps shrinks as episodes finish (see train).
        self.eps = 1.0
        self.eps_decay = eps_decay
        self.discount_rate = discount_rate
        self.learning_rate = learning_rate
        self.build_model()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def build_model(self):
        """Create the placeholders, Q-value layer, loss and optimizer.

        Resets the default graph first, so anything previously built in it is
        discarded.
        """
        tf.reset_default_graph()
        # Each placeholder holds exactly one transition (shape [1]).
        self.state_in = tf.placeholder(tf.float32, shape=[1])
        self.action_in = tf.placeholder(tf.float32, shape=[1])
        self.target_in = tf.placeholder(tf.float32, shape=[1])

        # Indices arrive as floats and are cast to int before one-hot encoding.
        self.state = tf.one_hot(tf.cast(self.state_in, tf.int32), depth=self.state_size)
        self.action = tf.one_hot(tf.cast(self.action_in, tf.int32), depth=self.action_size)

        self.q_state = tf.layers.dense(self.state, units=self.action_size, name="q_table")
        # Mask with the one-hot action to pick out the Q-value of that action.
        self.q_action = tf.reduce_sum(tf.multiply(self.q_state, self.action), axis=1)

        self.loss = tf.reduce_sum(tf.square(self.target_in - self.q_action))
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def get_action(self, state):
        """Return a random action with probability ``eps``, else the greedy one."""
        q_state = self.sess.run(self.q_state, feed_dict={self.state_in: [state]})
        action_greedy = np.argmax(q_state)
        action_random = super().get_action(state)
        return action_random if random.random() < self.eps else action_greedy

    def train(self, experience):
        """Run one optimizer step on a single transition.

        Args:
            experience: Tuple ``(state, action, next_state, reward, done)``.
        """
        state, action, next_state, reward, done = experience

        q_next = self.sess.run(self.q_state, feed_dict={self.state_in: [next_state]})
        if done:
            # No future value once the episode has ended.
            q_next[:] = 0.0
        q_target = reward + self.discount_rate * np.max(q_next)

        feed = {
            self.state_in: [state],
            self.action_in: [action],
            self.target_in: [q_target],
        }
        self.sess.run(self.optimizer, feed_dict=feed)

        if done:
            self.eps = self.eps * self.eps_decay

    def __del__(self):
        """Close the session if one was opened.

        ``sess`` is missing when ``__init__`` raised before creating it; the
        lookup is guarded so garbage collection does not raise a second error.
        """
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()

# Instantiate the Q-network agent for `env` with the default hyperparameters.
agent = QNAgent(env)

Action size: Discrete(4)
State size: 16


In [6]:
NUM_EPISODES = 100   # number of training episodes to run
STEP_DELAY_S = 0.01  # pause after each rendered step so the output is visible

# The kernel variable of the "q_table" layer does not change identity between
# steps, so it is looked up once here instead of on every environment step.
with tf.variable_scope("q_table", reuse=True):
    q_table_kernel = tf.get_variable("kernel")

total_reward = 0
streak = 0
for ep in range(NUM_EPISODES):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, info = env.step(action)
        agent.train((state, action, next_state, reward, done))
        state = next_state
        total_reward += reward

        # Live view: latest transition, running stats, board and current weights.
        print("s:", state, "a:", action)
        print("Episode: {}, Total reward: {}, eps: {}, Continuous Wins: {}".format(ep,total_reward,agent.eps, streak))
        env.render()
        weights = agent.sess.run(q_table_kernel)
        print(weights)
        time.sleep(STEP_DELAY_S)
        clear_output(wait=True)
    # The while loop only exits once the episode is done; a positive final
    # reward extends the streak, anything else resets it.
    if reward > 0:
        streak += 1
    else:
        streak = 0

s: 7 a: 0
Episode: 99, Total reward: 6.0, eps: 0.13397967485796175, Continuous Wins: 1
  (Left)
SFFF
FHF[41mH[0m
FFFH
HFFG
[[-0.13440786 -0.0024018  -0.01501486 -0.03446881]
 [-0.10654026 -0.00136019  0.03135497 -0.01993641]
 [-0.15115955 -0.0636254  -0.04219705 -0.01839138]
 [-0.13989253 -0.16152656  0.07017835 -0.00867659]
 [-0.10890934 -0.15868011  0.00094086 -0.09142596]
 [ 0.12076759 -0.2038557  -0.2234396  -0.08792934]
 [-0.09634129 -0.13689817 -0.0203898  -0.06867222]
 [ 0.35532302 -0.3503401   0.46532357  0.49835777]
 [-0.19643626 -0.12240592  0.08424863  0.08536167]
 [-0.2956907   0.07686745 -0.00840253 -0.08025189]
 [-0.06901645  0.14842707  0.11196747  0.0401201 ]
 [-0.25290307  0.25220704 -0.0162102   0.29980534]
 [-0.00431812  0.12695402  0.19372267  0.10146993]
 [-0.19273552  0.03040382  0.05113023 -0.04552185]
 [-0.03794516  0.24816062  0.19646192 -0.18101904]
 [ 0.04382443 -0.36669105  0.44022697 -0.33669987]]
