In [1]:
# Import libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Define the environment
class FrozenLakeEnv:
    """Deterministic square grid world with a start cell and a goal cell.

    States are numbered row by row from 0 to ``size * size - 1``. Actions are
    indexed in the order of ``self.actions`` (0=left, 1=up, 2=right, 3=down).
    A move into a wall leaves the agent in place, and every transition has a
    reward of -1.

    ``P[s, a, s2]`` is the probability of reaching ``s2`` from ``s`` with
    action ``a``; ``R[s, a, s2]`` is the reward for that transition.
    """

    def __init__(self, size=4):
        """Build the transition and reward tables for a ``size`` x ``size`` grid.

        Args:
            size: Number of cells along each side of the grid.
        """
        self.size = size
        self.nS = size * size  # Number of States
        self.nA = 4  # Number of Actions
        self.P = np.zeros((self.nS, self.nA, self.nS))
        self.R = np.zeros((self.nS, self.nA, self.nS))

        # Game board: one all-zero entry per cell, sized to match ``size``.
        self.board = np.zeros((size, size), dtype=int)

        # Start in the first cell; the goal is the last cell, whatever the
        # grid size is.
        self.start_state = 0
        self.goal_state = self.nS - 1
        # Initialised here so step() is usable even before the first reset().
        self.current_state = self.start_state

        # Action names, indexed by action id.
        self.actions = ['left', 'up', 'right', 'down']

        # Each (state, action) pair leads to exactly one successor state.
        for state in range(self.nS):
            for action in range(self.nA):
                target = self._next_state(state, action)
                self.P[state, action, target] = 1
                self.R[state, action, target] = -1

    def _next_state(self, state, action):
        """Return the cell reached from ``state`` by ``action`` (walls block)."""
        row, col = divmod(state, self.size)
        last = self.size - 1
        if action == 0:  # Move left
            return state - 1 if col > 0 else state
        if action == 1:  # Move up
            return state - self.size if row > 0 else state
        if action == 2:  # Move right
            return state + 1 if col < last else state
        # Move down
        return state + self.size if row < last else state

    def reset(self):
        """Put the agent back on the start cell and return that state."""
        self.current_state = self.start_state
        return self.current_state

    def step(self, action):
        """Apply ``action`` from the current state.

        Args:
            action: Action index in ``range(self.nA)``.

        Returns:
            A ``(next_state, reward, done, info)`` tuple, where ``done`` is
            true once the goal state has been reached and ``info`` is an
            empty dict.
        """
        next_state = int(
            np.random.choice(self.nS, p=self.P[self.current_state, action])
        )
        reward = self.R[self.current_state, action, next_state]
        self.current_state = next_state
        done = next_state == self.goal_state
        return next_state, reward, done, {}

# Define the agent
class QAgent:
    """Epsilon-greedy Q-learning agent backed by a small Keras network.

    The network receives the state index as a single scalar feature and
    outputs one Q-value per action.
    """

    def __init__(self, env):
        """Create the agent and compile its Q-network.

        Args:
            env: Environment exposing ``nS`` (state count) and ``nA``
                (action count).
        """
        self.env = env
        self.nS = env.nS
        self.nA = env.nA
        self.learning_rate = 0.1  # Passed to the Adam optimizer below.
        self.discount_factor = 0.9
        self.epsilon = 0.9  # Probability of taking a random action.

        self.model = tf.keras.Sequential([
            layers.Dense(16, input_shape=(1,), activation='relu'),
            layers.Dense(self.nA, activation='linear')
        ])
        self.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
                           loss='mean_squared_error')

    @staticmethod
    def _encode(state):
        """Return ``state`` as a float batch of shape (1, 1) for the model."""
        return np.array([[state]], dtype=np.float32)

    def _q_values(self, state):
        """Return the model's Q-values for ``state`` as a writable array."""
        output = self.model(self._encode(state))
        # Eager tensors expose .numpy(); anything else is converted directly.
        if hasattr(output, 'numpy'):
            output = output.numpy()
        # np.array copies, so the caller may modify the result in place.
        return np.array(output, dtype=np.float32)

    def get_action(self, state, epsilon=None):
        """Pick an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: Current state index.
            epsilon: Exploration probability; defaults to ``self.epsilon``.

        Returns:
            The chosen action index as an int.
        """
        if epsilon is None:
            epsilon = self.epsilon
        if np.random.rand() < epsilon:
            return int(np.random.choice(self.nA))
        return int(np.argmax(self._q_values(state)))

    def train(self, state, action, next_state, reward, done):
        """Fit the model on the one-step Q-learning target for a transition.

        Only the entry for ``action`` is changed in the target; the other
        actions keep the model's current predictions.

        Args:
            state: State index the action was taken from.
            action: Action index that was taken.
            next_state: State index that was reached.
            reward: Reward received for the transition.
            done: Whether ``next_state`` ended the episode.
        """
        target = self._q_values(state)
        if done:
            # No future return after the final transition.
            target[0, action] = reward
        else:
            best_next = np.max(self._q_values(next_state))
            target[0, action] = reward + self.discount_factor * best_next
        self.model.fit(self._encode(state), target, verbose=0)

# Define the game
env = FrozenLakeEnv()
agent = QAgent(env)

# Train the agent
num_episodes = 100
# Upper bound on steps per training episode. Exploration decays every
# episode, so a greedy policy that keeps walking into a wall could otherwise
# keep a single episode running almost indefinitely.
max_training_steps = 1000
j = 0
for i in range(num_episodes):
    # At this point j still holds the step count of the previous episode.
    print('EPISODE {} epsilon={} j={}'.format(i, agent.epsilon, j))
    env.reset()
    state = env.start_state
    done = False
    j = 0
    while not done and j < max_training_steps:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.train(state, action, next_state, reward, done)
        state = next_state
        j += 1
    # Explore a little less after every episode.
    agent.epsilon *= 0.95

# Test the agent
env.reset()
state = env.start_state
done = False
agent.epsilon = 0.0  # Act greedily during evaluation.
max_test_steps = 100
j = 0
while not done and j < max_test_steps:
    print(j)
    action = agent.get_action(state)
    next_state, reward, done, _ = env.step(action)
    print(f'State: {state}, Action: {action}, Reward: {reward}, Done: {done}')
    state = next_state
    j += 1


2023-01-24 01:36:36.652484: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-24 01:36:36.665551: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-24 01:36:36.666283: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-24 01:36:36.667537: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

EPISODE 0 epsilon=0.9 j=


2023-01-24 01:36:37.768898: I tensorflow/stream_executor/cuda/cuda_blas.cc:1786] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


EPISODE 1 epsilon=0.855 j=
EPISODE 2 epsilon=0.8122499999999999 j=
EPISODE 3 epsilon=0.7716374999999999 j=
EPISODE 4 epsilon=0.7330556249999999 j=
EPISODE 5 epsilon=0.6964028437499998 j=
EPISODE 6 epsilon=0.6615827015624998 j=
EPISODE 7 epsilon=0.6285035664843748 j=
EPISODE 8 epsilon=0.597078388160156 j=
EPISODE 9 epsilon=0.5672244687521482 j=
EPISODE 10 epsilon=0.5388632453145408 j=
EPISODE 11 epsilon=0.5119200830488138 j=
EPISODE 12 epsilon=0.486324078896373 j=
EPISODE 13 epsilon=0.4620078749515544 j=
EPISODE 14 epsilon=0.43890748120397666 j=
EPISODE 15 epsilon=0.4169621071437778 j=
EPISODE 16 epsilon=0.3961140017865889 j=
EPISODE 17 epsilon=0.37630830169725943 j=
EPISODE 18 epsilon=0.3574928866123964 j=
EPISODE 19 epsilon=0.33961824228177656 j=
EPISODE 20 epsilon=0.3226373301676877 j=
EPISODE 21 epsilon=0.3065054636593033 j=
EPISODE 22 epsilon=0.29118019047633814 j=
EPISODE 23 epsilon=0.2766211809525212 j=
EPISODE 24 epsilon=0.26279012190489515 j=
EPISODE 25 epsilon=0.24965061580965