In [1]:
# Import libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

# Define the environment
class FrozenLakeEnv:
    def __init__(self, size=4):
        self.size = size
        self.nS = size*size
        self.nA = 4
        self.P = np.zeros((self.nS, self.nA, self.nS))
        self.R = np.zeros((self.nS, self.nA, self.nS))
        
        # Define the game board
        self.board = np.array([
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0],
            [0, 0, 0, 0]
        ])
        
        # Define the starting and goal states
        self.start_state = 0
        self.goal_state = 15
        
        # Define the possible actions
        self.actions = ['left', 'up', 'right', 'down']
        
        # Define the transition probabilities and rewards
        for i in range(self.nS):
            for j in range(self.nA):
                for k in range(self.nS):
                    if j == 0:  # Move left
                        if i % self.size == 0:  # Wall on left
                            self.P[i, j, i] = 1
                            self.R[i, j, i] = -1
                        else:
                            self.P[i, j, i-1] = 1
                            self.R[i, j, i-1] = -1
                    elif j == 1:  # Move up
                        if i < self.size:  # Wall on top
                            self.P[i, j, i] = 1
                            self.R[i, j, i] = -1
                        else:
                            self.P[i, j, i-self.size] = 1
                            self.R[i, j, i-self.size] = -1
                    elif j == 2:  # Move right
                        if (i+1) % self.size == 0:  # Wall on right
                            self.P[i, j, i] = 1
                            self.R[i, j, i] = -1
                        else:
                            self.P[i, j, i+1] = 1
                            self.R[i, j, i+1] = -1
                    elif j == 3:  # Move down
                        if i >= self.nS-self.size:  # Wall on bottom
                            self.P[i, j, i] = 1
                            self.R[i, j, i] = -1
                        else:
                            self.P[i, j, i+self.size] = 1
                            self.R[i, j, i+self.size] = -1
                        
    def reset(self):
        self.current_state = self.start_state
        
    def step(self, action):
        next_state = np.random.choice(self.nS, p=self.P[self.current_state, action])
        reward = self.R[self.current_state, action, next_state]
        self.current_state = next_state
        done = next_state == self.goal_state
        return next_state, reward, done, {}

# Define the agent
class QAgent:
    def __init__(self, env):
        self.env = env
        self.nS = env.nS
        self.nA = env.nA
        self.learning_rate = 0.1
        self.discount_factor = 0.9
        self.epsilon = 0.1
        
        self.model = tf.keras.Sequential([
            layers.Dense(16, input_shape=(1,), activation='relu'),
            layers.Dense(self.nA, activation='linear')
        ])
        
    def get_action(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.epsilon
        if np.random.rand() < epsilon:
            return np.random.choice(self.nA)
        else:
            q_values = self.model(np.array([state]))
            return np.argmax(q_values)
        
    def train(self, state, action, next_state, reward, done):
        q_values = self.model(np.array([state]))
        if done:
            q_values[0, action] = reward
        else:
            next_q_values = self.model(np.array([next_state]))
            q_values[0, action] = reward + self.discount_factor * np.max(next_q_values)
        self.model.train_on_batch(np.array([state]), q_values)

# Define the game
env = FrozenLakeEnv()
agent = QAgent(env)

# Train the agent
num_episodes = 100
for i in range(num_episodes):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.train(state, action, next_state, reward, done)
        state = next_state

# Test the agent
state = env.reset()
done = False
while not done:
    action = agent.get_action(state, epsilon=0)
    next_state, reward, done, _ = env.step(action)
    print(f'State: {state}, Action: {action}, Reward: {reward}, Done: {done}')
    state = next_state


2023-01-23 22:55:11.808181: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-23 22:55:11.819833: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-23 22:55:11.820192: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-23 22:55:11.821001: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type NoneType).