In [1]:
import random
import numpy as np
import tensorflow as tf

# Define the environment
class Minesweeper:
    def __init__(self):
        self.board = np.zeros((4, 4))
        self.mines = [(np.random.randint(0, 4), np.random.randint(0, 4)) for _ in range(4)]
        for mine in self.mines:
            self.board[mine[0]][mine[1]] = -1
        self.state = (self.board, (0, 0))
        self.end_states = self.mines + [(3, 3)]

    def step(self, action):
        x, y = self.state[1]
        if action == 0:
            x -= 1
        elif action == 1:
            x += 1
        elif action == 2:
            y -= 1
        elif action == 3:
            y += 1
        x = max(min(x, 3), 0)
        y = max(min(y, 3), 0)
        if (x, y) in self.end_states:
            return (self.state[0], (x, y)), -100, True
        self.state = (self.board, (x, y))
        return self.state, -1, False

# Define the Q-Learning algorithm
class QLearning:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.q_table = tf.Variable(tf.random.uniform(shape=(state_size[0], state_size[1], action_size), minval=-1, maxval=1))

    def get_action(self, state, epsilon):
        if np.random.random() < epsilon:
            return np.random.randint(self.action_size)
        else:
            return tf.argmax(self.q_table[state[0], state[1]]).numpy()

    def update(self, state, action, reward, next_state, alpha, gamma):
        q_next = tf.reduce_max(self.q_table[next_state[0], next_state[1]])
        q_val = self.q_table[state[0], state[1], action]
        q_update = q_val + alpha * (reward + gamma * q_next - q_val)
        self.q_table = tf.tensor_scatter_nd_update(self.q_table, [[state[0], state[1], action]], [q_update])

# Define the training loop
def train(agent, env, episodes, alpha, gamma, epsilon):
    for episode in range(episodes):
        state = (0, 0)
        total_reward = 0
        while True:
            action = agent.get_action(state, epsilon)
            next_state, reward, done = env.step(env.actions[action])
            agent.update(state, action, reward, next_state, alpha, gamma)
            state = next_state
            total_reward += reward
            if done:
                break
        print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Define the main function
def main():
    env = Minesweeper()
    agent = QLearning(state_size=(4, 4, 4), action_size=4)
    train(agent, env, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1)

if __name__ == "__main__":
    main()

2023-01-23 19:23:06.524202: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-23 19:23:06.567876: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-23 19:23:06.568344: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-23 19:23:06.569455: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

AttributeError: 'Minesweeper' object has no attribute 'actions'