In [None]:
import numpy as np
import random

class GridWorld:
    """Square grid environment with a fixed start cell, goal cell and obstacles.

    Cell values stored in ``grid``: ``0`` is a free cell, ``1`` marks the goal
    and ``-1`` marks an obstacle. The agent starts in the top-left corner
    ``(0, 0)`` and the goal is the bottom-right corner ``(size - 1, size - 1)``.
    Positions are ``(row, column)`` tuples.
    """

    def __init__(self, size, obstacles=None, random_obstacles=False):
        """Build the grid and place the goal and obstacles.

        Args:
            size: Number of rows (and columns) of the square grid.
            obstacles: Optional iterable of ``(row, column)`` cells to block.
                ``None`` (the default) means no explicit obstacles.
            random_obstacles: When true, additionally make ``size // 2``
                random placement attempts, never on the start or goal cell.
                Attempts may coincide, so fewer distinct cells can result.
        """
        self.size = size
        self.grid = np.zeros((size, size))  # Every cell starts free
        self.agent_position = (0, 0)  # Agent starts at top-left corner
        self.goal_position = (size - 1, size - 1)  # Goal is at bottom-right corner
        self.grid[self.goal_position] = 1  # Mark the goal cell

        # A None default avoids sharing one list object between all calls.
        if obstacles is not None:
            for obstacle in obstacles:
                # tuple() makes a [row, col] list address a single cell; a
                # bare list would trigger fancy indexing and fill whole rows.
                self.grid[tuple(obstacle)] = -1

        if random_obstacles:
            num_obstacles = size // 2
            for _ in range(num_obstacles):
                obstacle_position = (random.randint(0, size - 1), random.randint(0, size - 1))
                # Redraw until the cell is neither the start nor the goal.
                while obstacle_position in (self.agent_position, self.goal_position):
                    obstacle_position = (random.randint(0, size - 1), random.randint(0, size - 1))
                self.grid[obstacle_position] = -1

        self.actions = [(0, 1), (0, -1), (1, 0), (-1, 0)]  # Right, Left, Down, Up

    def reset(self):
        """Move the agent back to the top-left corner and return that state."""
        self.agent_position = (0, 0)
        return self.agent_position

    def step(self, action):
        """Apply one move and report the outcome.

        Args:
            action: ``(row_delta, column_delta)`` pair, e.g. an entry of
                ``self.actions``.

        Returns:
            ``(state, reward, done)``: the agent position after the move, a
            reward of ``10`` on the goal cell and ``-1`` otherwise, and whether
            the agent is on the goal cell. A move that would leave the grid or
            enter an obstacle keeps the agent in place (still costing ``-1``).
        """
        row = self.agent_position[0] + action[0]
        col = self.agent_position[1] + action[1]

        # Bounds are tested first so the grid is never indexed out of range
        # (negative indices would otherwise wrap around silently).
        in_bounds = 0 <= row < self.size and 0 <= col < self.size
        if in_bounds and self.grid[row, col] != -1:
            self.agent_position = (row, col)

        done = self.agent_position == self.goal_position
        reward = 10 if done else -1
        return self.agent_position, reward, done

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy action choice.

    ``q_table`` maps a state to a NumPy array holding one value per action.
    Rows are created, filled with zeros, the first time a state is looked up
    greedily or takes part in an update.
    """

    def __init__(self, num_actions, alpha=0.1, gamma=0.9, epsilon=0.1):
        """Store the hyper-parameters and start with an empty Q-table.

        Args:
            num_actions: Number of discrete actions (length of each Q row).
            alpha: Learning rate.
            gamma: Discount factor.
            epsilon: Probability of picking a random action.
        """
        self.q_table = {}
        self.num_actions = num_actions
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon

    def _row(self, state):
        """Return the Q row for ``state``, creating a zero row if missing."""
        try:
            return self.q_table[state]
        except KeyError:
            fresh = self.q_table[state] = np.zeros(self.num_actions)
            return fresh

    def get_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy choice.

        With probability ``epsilon`` a uniformly random index is returned and
        the Q-table is left untouched; otherwise the index of the largest
        Q-value of ``state`` is returned.
        """
        exploring = np.random.rand() < self.epsilon
        if exploring:
            return np.random.randint(self.num_actions)
        return np.argmax(self._row(state))

    def update_q_value(self, state, action, reward, next_state):
        """Apply one Q-learning update for ``(state, action, reward, next_state)``.

        The value of ``action`` in ``state`` moves by ``alpha`` times the
        difference between ``reward + gamma * max(Q[next_state])`` and its
        current value.
        """
        current = self._row(state)
        successor = self._row(next_state)
        target = reward + self.gamma * np.max(successor)
        current[action] += self.alpha * (target - current[action])

# Function to visualize the grid
def visualize_grid(grid, agent_position):
    """Print the grid as text, one row per line.

    Each cell is printed as a single character followed by a space:
    ``A`` for the agent, ``X`` for an obstacle (cell value ``-1``), ``G`` for
    the goal (cell value ``1``) and ``_`` for anything else. The agent symbol
    takes precedence over whatever the cell contains. The number of rows of
    ``grid`` is used for both dimensions.
    """
    side = grid.shape[0]

    def symbol_at(row, col):
        # Precedence: agent, then obstacle, then goal, then empty.
        if (row, col) == agent_position:
            return "A"
        cell = grid[row, col]
        if cell == -1:
            return "X"
        if cell == 1:
            return "G"
        return "_"

    for row in range(side):
        for col in range(side):
            print(symbol_at(row, col), end=" ")
        print()

# Initialize grid world environment and Q-learning agent
grid_world = GridWorld(size=5, obstacles=[(1, 2), (2, 2)])
agent = QLearningAgent(num_actions=4)

# Train the agent
num_episodes = 1000
for episode in range(num_episodes):
    # Reset first and read the start state from the environment, so the
    # agent's notion of "state" can never drift from the environment's.
    grid_world.reset()
    state = grid_world.agent_position
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done = grid_world.step(grid_world.actions[action])
        agent.update_q_value(state, action, reward, next_state)
        state = next_state

# Test the trained agent
grid_world.reset()
state = grid_world.agent_position
done = False
total_reward = 0
path = [state]  # Store the path

# The Q-table is not updated during testing, so a greedy rollout is fully
# deterministic: if the goal is not reached within one step per cell, a state
# has repeated and the rollout would never terminate.
max_test_steps = grid_world.size ** 2

# Evaluate the learned policy greedily; with the training epsilon still
# active, the reported path would contain random exploratory moves.
training_epsilon = agent.epsilon
agent.epsilon = 0.0
try:
    for _ in range(max_test_steps):
        visualize_grid(grid_world.grid, state)
        # With epsilon at 0 get_action always takes the greedy branch, which
        # also guarantees a Q-table row exists for `state`.
        action = agent.get_action(state)
        next_state, reward, done = grid_world.step(grid_world.actions[action])
        total_reward += reward
        print("Agent's next move:", next_state)
        print("Q-values for current state:")
        # Printed before `state` advances, so the values shown belong to the
        # state the action was actually chosen from.
        print(agent.q_table[state])
        state = next_state
        path.append(state)  # Append next state to path
        if done:
            break
finally:
    agent.epsilon = training_epsilon  # Leave the agent as it was for further training

if done:
    print("Reached the goal!")
else:
    print("Stopped after", max_test_steps, "steps without reaching the goal.")
print("Total reward earned:", total_reward)
print("Path taken by the agent:", path)


A _ _ _ _ 
_ _ X _ _ 
_ _ X _ _ 
_ _ _ _ _ 
_ _ _ _ G 
Agent's next move: (1, 0)
Q-values for current state:
[ 0.27193333 -0.63150684  0.62882    -1.42606648]
_ _ _ _ _ 
A _ X _ _ 
_ _ X _ _ 
_ _ _ _ _ 
_ _ _ _ G 
Agent's next move: (2, 0)
Q-values for current state:
[ 1.8098      0.45183911  1.4210627  -0.50920787]
_ _ _ _ _ 
_ _ X _ _ 
A _ X _ _ 
_ _ _ _ _ 
_ _ _ _ G 
Agent's next move: (2, 1)
Q-values for current state:
[1.45511106 0.37326079 3.122      0.2865396 ]
_ _ _ _ _ 
_ _ X _ _ 
_ A X _ _ 
_ _ _ _ _ 
_ _ _ _ G 
Agent's next move: (3, 1)
Q-values for current state:
[3.19327455 1.37417815 4.58       1.70478654]
_ _ _ _ _ 
_ _ X _ _ 
_ _ X _ _ 
_ A _ _ _ 
_ _ _ _ G 
Agent's next move: (4, 1)
Q-values for current state:
[6.2        1.87706048 4.33508352 2.62565441]
_ _ _ _ _ 
_ _ X _ _ 
_ _ X _ _ 
_ _ _ _ _ 
_ A _ _ G 
Agent's next move: (4, 2)
Q-values for current state:
[8.         3.69550156 5.62550638 3.43592027]
_ _ _ _ _ 
_ _ X _ _ 
_ _ X _ _ 
_ _ _ _ _ 
_ _ A _ G 
Agent's