# In [None]:
import numpy as np
import random

# Define the environment
class Environment:
    """Small grid world with wall cells and a single goal cell.

    Positions are ``(row, col)`` tuples indexing ``board``. A board value of
    1 marks a wall; 0 marks a free cell. Each entry of ``actions`` is a
    ``(row_delta, col_delta)`` offset added to the current position.
    """

    def __init__(self):
        self.start_state = (0, 0)
        self.state = self.start_state
        self.board = np.array([[0, 0, 1, 0],
                               [0, 0, 1, 0],
                               [0, 0, 1, 0],
                               [0, 0, 0, 0]])
        self.end_state = (3, 3)
        self.actions = [(0, 1), (1, 0), (0, -1), (-1, 0)]

    def reset(self):
        """Move the agent back to the start cell and return that cell."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply one ``(row_delta, col_delta)`` move from the current position.

        Returns:
            A ``(state, reward, done)`` tuple:

            * move leaves the board  -> current state, -1, False
            * move hits a wall cell  -> current state, -10, False
            * move reaches the goal  -> goal state, 10, True
            * any other move         -> new state, -1, False
        """
        row = self.state[0] + action[0]
        col = self.state[1] + action[1]
        n_rows, n_cols = self.board.shape

        # Blocked moves leave the agent where it is.
        if not (0 <= row < n_rows and 0 <= col < n_cols):
            return self.state, -1, False
        if self.board[row, col] == 1:
            return self.state, -10, False

        next_state = (row, col)
        if next_state == self.end_state:
            # The episode is over: report the goal cell to the caller, but
            # put the agent back on the start cell so the next episode
            # begins from a known position even if reset() is not called.
            self.state = self.start_state
            return next_state, 10, True

        # Record the move; without this every step would be computed from
        # the start cell and the goal could never be reached.
        self.state = next_state
        return next_state, -1, False

# Define the Q-Learning algorithm
class QLearning:
    """Tabular Q-learning agent over a rectangular grid of states.

    ``q_table`` maps every ``(row, col)`` state to a list holding one value
    per action index.
    """

    def __init__(self, state_size, action_size):
        """Create a zero-initialised Q-table.

        Args:
            state_size: ``(rows, cols)`` of the state grid.
            action_size: Number of discrete actions available in each state.
        """
        self.action_size = action_size
        self.q_table = {
            (row, col): [0.0] * action_size
            for row in range(state_size[0])
            for col in range(state_size[1])
        }

    def get_action(self, state, epsilon):
        """Pick an action index with an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the index of the highest Q-value for ``state``
        (the first one on ties).
        """
        if np.random.random() < epsilon:
            # Draw from the configured number of actions rather than a fixed
            # list, so the agent works for any action_size.
            return random.randrange(self.action_size)
        return int(np.argmax(self.q_table[state]))

    def update(self, state, action, reward, next_state, alpha, gamma):
        """Apply one Q-learning update for an observed transition.

        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max Q(s', .) - Q(s, a))
        """
        best_next = max(self.q_table[next_state])
        current = self.q_table[state][action]
        td_error = reward + gamma * best_next - current
        self.q_table[state][action] = current + alpha * td_error

# Define the training loop
def train(agent, env, episodes, alpha, gamma, epsilon, max_steps=None):
    """Run epsilon-greedy Q-learning episodes and print each episode's return.

    Args:
        agent: Object providing ``get_action(state, epsilon)`` and
            ``update(state, action, reward, next_state, alpha, gamma)``.
        env: Object providing a ``state`` attribute, an ``actions`` list and
            ``step(action)`` returning ``(next_state, reward, done)``.
        episodes: Number of episodes to run.
        alpha: Learning rate forwarded to ``agent.update``.
        gamma: Discount factor forwarded to ``agent.update``.
        epsilon: Exploration probability forwarded to ``agent.get_action``.
        max_steps: Optional cap on steps per episode; ``None`` (the default)
            runs until the environment reports ``done``.
    """
    start_state = (0, 0)
    for episode in range(episodes):
        state = start_state
        # The environment moves relative to env.state, so it has to start
        # each episode on the same cell the agent believes it is on.
        env.state = state
        total_reward = 0
        steps = 0
        while max_steps is None or steps < max_steps:
            action = agent.get_action(state, epsilon)
            next_state, reward, done = env.step(env.actions[action])
            agent.update(state, action, reward, next_state, alpha, gamma)
            total_reward += reward
            steps += 1
            if done:
                break
            state = next_state
            # Keep the environment's position in lockstep with the state
            # used for the next Q-update.
            env.state = state
        print(f"Episode {episode + 1}: Total Reward = {total_reward}")

# Define the main function
def main():
    """Build the grid world and a matching agent, then run training."""
    grid_world = Environment()
    learner = QLearning(state_size=(4, 4), action_size=4)
    hyperparams = {
        "episodes": 1000,
        "alpha": 0.1,
        "gamma": 0.9,
        "epsilon": 0.1,
    }
    train(learner, grid_world, **hyperparams)


# Start training only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
