In [1]:
import random
import numpy as np
import tensorflow as tf

# Define the environment
class Environment:
    def __init__(self):
        self.board_size = (4, 4)
        self.mines = set()
        self.state = (0, 0)
        self.end_state = (3, 3)
        self.actions = ["up", "down", "left", "right"]
        self._place_mines()

    def _place_mines(self):
        # randomly place mines on the board
        while len(self.mines) < 4:
            x, y = random.randint(0, 3), random.randint(0, 3)
            if (x, y) == self.end_state:
                continue
            self.mines.add((x, y))
            
    def step(self, action):
        if action == "up":
            self.state = (max(self.state[0]-1,0), self.state[1])
        elif action == "down":
            self.state = (min(self.state[0]+1,3), self.state[1])
        elif action == "left":
            self.state = (self.state[0], max(self.state[1]-1,0))
        elif action == "right":
            self.state = (self.state[0], min(self.state[1]+1,3))
        done = self.state in self.mines or self.state == self.end_state
        if done:
            return self.state, -100, done
        else:
            return self.state, -1, done

# Define the Q-Learning algorithm
class QLearning:
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.q_table = tf.Variable(tf.random.uniform(shape=(state_size[0], state_size[1], action_size), minval=-1, maxval=1))

    def get_action(self, state, epsilon):
        if np.random.random() < epsilon:
            return np.random.randint(self.action_size)
        else:
            return tf.argmax(self.q_table[state[0], state[1]]).numpy()

    def update(self, state, action, reward, next_state, alpha, gamma):
        q_next = tf.reduce_max(self.q_table[next_state[0], next_state[1]])
        q_val = self.q_table[state[0], state[1], action]
        q_update = q_val + alpha * (reward + gamma * q_next - q_val)
        self.q_table = tf.tensor_scatter_nd_update(self.q_table, [[state[0], state[1], action]], [q_update])

# Define the training loop
def train(agent, env, episodes, alpha, gamma, epsilon):
    for episode in range(episodes):
        state = (0, 0)
        total_reward = 0
        while True:
            action = agent.get_action(state, epsilon)
            next_state, reward, done = env.step(env.actions[action])
            agent.update(state, action, reward, next_state, alpha, gamma)
            state = next_state
            total_reward += reward
            if done:
                break
        print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Define the main function
def main():
    env = Minesweeper()
    agent = QLearning(state_size=(4, 4, 4), action_size=4)
    train(agent, env, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1)

if __name__ == "__main__":
    main()

NameError: name 'Minesweeper' is not defined