In [6]:
import gym
from gym import spaces
import numpy as np
import random

# Define the custom environment
class GridWorldEnv(gym.Env):
    def __init__(self):
        super(GridWorldEnv, self).__init__()
        
        # Define a 5x5 grid
        self.grid_size = 5
        
        # Define the observation space (the grid itself)
        self.observation_space = spaces.Box(low=0, high=4, shape=(2,), dtype=np.int32)
        
        # Define the action space (up, down, left, right)
        self.action_space = spaces.Discrete(4)
        
        # Initialize the agent's position
        self.state = np.zeros((self.grid_size, self.grid_size))
        
        # Initialize goal and obstacle positions
        self.goal_position = [self.grid_size - 1, self.grid_size - 1]
        self.obstacle_positions = [[1, 1], [2, 2], [3, 3]]
        
        self.reset()

    def reset(self):
        # Reset the agent to the top-left corner of the grid
        self.agent_position = [0, 0]
        
        # Reset the state
        self.state = np.zeros((self.grid_size, self.grid_size))
        
        # Place the obstacles
        for pos in self.obstacle_positions:
            self.state[pos[0], pos[1]] = -1
        
        # Place the goal
        self.state[self.goal_position[0], self.goal_position[1]] = 1
        
        return np.array(self.agent_position)

    def step(self, action):
        # Define the actions
        if action == 0:  # Up
            self.agent_position[0] = max(self.agent_position[0] - 1, 0)
        elif action == 1:  # Down
            self.agent_position[0] = min(self.agent_position[0] + 1, self.grid_size - 1)
        elif action == 2:  # Left
            self.agent_position[1] = max(self.agent_position[1] - 1, 0)
        elif action == 3:  # Right
            self.agent_position[1] = min(self.agent_position[1] + 1, self.grid_size - 1)
        
        # Check if agent has reached the goal
        if self.agent_position == self.goal_position:
            reward = 1
            done = True
        elif self.agent_position in self.obstacle_positions:
            reward = -1
            done = True
        else:
            reward = -0.1  # Small penalty for each step taken
            done = False
        
        return np.array(self.agent_position), reward, done, {}

    def render(self, mode='human'):
        # Print the grid with the agent's current position
        grid = np.copy(self.state)
        grid[self.agent_position[0], self.agent_position[1]] = 2
        print(grid)

# Instantiate the environment
env = GridWorldEnv()
env.reset()
env.render()


[[ 2.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  1.]]


In [7]:
class QLearningAgent:
    def __init__(self, env, learning_rate=0.1, discount_factor=0.99, exploration_rate=1.0, exploration_decay=0.995, exploration_min=0.01):
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.exploration_decay = exploration_decay
        self.exploration_min = exploration_min
        
        # Initialize the Q-table with zeros
        self.q_table = np.zeros((self.env.grid_size, self.env.grid_size, self.env.action_space.n))

    def choose_action(self, state):
        if np.random.rand() < self.exploration_rate:
            return self.env.action_space.sample()  # Explore
        else:
            return np.argmax(self.q_table[state[0], state[1], :])  # Exploit

    def update_q_table(self, state, action, reward, next_state):
        best_next_action = np.argmax(self.q_table[next_state[0], next_state[1], :])
        td_target = reward + self.discount_factor * self.q_table[next_state[0], next_state[1], best_next_action]
        td_error = td_target - self.q_table[state[0], state[1], action]
        self.q_table[state[0], state[1], action] += self.learning_rate * td_error
        # Decay exploration rate
        self.exploration_rate = max(self.exploration_min, self.exploration_rate * self.exploration_decay)

    def train(self, num_episodes=1000):
        for episode in range(num_episodes):
            state = self.env.reset()
            done = False

            while not done:
                action = self.choose_action(state)
                next_state, reward, done, _ = self.env.step(action)
                self.update_q_table(state, action, reward, next_state)
                state = next_state

                if done:
                    break

# Instantiate the Q-learning agent
agent = QLearningAgent(env)

# Train the agent
agent.train(num_episodes=500)


In [8]:
# Set exploration rate to minimum for evaluation
agent.exploration_rate = 0.01

# Run a single episode to evaluate the agent
state = env.reset()
done = False

while not done:
    env.render()
    action = agent.choose_action(state)
    state, reward, done, _ = env.step(action)


[[ 2.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  1.]]
[[ 0.  2.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  1.]]
[[ 0.  0.  2.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  1.]]
[[ 0.  0.  0.  2.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  1.]]
[[ 0.  0.  0.  0.  0.]
 [ 0. -1.  0.  2.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  1.]]
[[ 0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  2.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  1.]]
[[ 0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  2.]
 [ 0.  0.  0. -1.  0.]
 [ 0.  0.  0.  0.  1.]]
[[ 0.  0.  0.  0.  0.]
 [ 0. -1.  0.  0.  0.]
 [ 0.  0. -1.  0.  0.]
 [ 0.  0.  0. -1.  2.]
 [ 0.  0.  0.  0.  1.]]


In [9]:
# Assuming the environment and agent classes are already defined as in the previous example

# Instantiate the environment and agent
env = GridWorldEnv()
agent = QLearningAgent(env)

num_episodes = 500
max_steps_per_episode = 100

for episode in range(num_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    for step in range(max_steps_per_episode):
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        
        # Print the reward received for this action
        print(f"Episode: {episode + 1}, Step: {step + 1}, State: {state}, Action: {action}, Reward: {reward}")
        
        # Update Q-table
        agent.update_q_table(state, action, reward, next_state)
        state = next_state
        total_reward += reward
        if done:
            break

    print(f"Episode {episode + 1} finished with total reward: {total_reward}")



Episode: 1, Step: 1, State: [0 0], Action: 2, Reward: -0.1
Episode: 1, Step: 2, State: [0 0], Action: 0, Reward: -0.1
Episode: 1, Step: 3, State: [0 0], Action: 1, Reward: -0.1
Episode: 1, Step: 4, State: [1 0], Action: 1, Reward: -0.1
Episode: 1, Step: 5, State: [2 0], Action: 1, Reward: -0.1
Episode: 1, Step: 6, State: [3 0], Action: 3, Reward: -0.1
Episode: 1, Step: 7, State: [3 1], Action: 2, Reward: -0.1
Episode: 1, Step: 8, State: [3 0], Action: 2, Reward: -0.1
Episode: 1, Step: 9, State: [3 0], Action: 0, Reward: -0.1
Episode: 1, Step: 10, State: [2 0], Action: 3, Reward: -0.1
Episode: 1, Step: 11, State: [2 1], Action: 2, Reward: -0.1
Episode: 1, Step: 12, State: [2 0], Action: 1, Reward: -0.1
Episode: 1, Step: 13, State: [3 0], Action: 2, Reward: -0.1
Episode: 1, Step: 14, State: [3 0], Action: 2, Reward: -0.1
Episode: 1, Step: 15, State: [3 0], Action: 2, Reward: -0.1
Episode: 1, Step: 16, State: [3 0], Action: 1, Reward: -0.1
Episode: 1, Step: 17, State: [4 0], Action: 0, Re

Episode: 116, Step: 4, State: [3 0], Action: 3, Reward: -0.1
Episode: 116, Step: 5, State: [3 1], Action: 1, Reward: -0.1
Episode: 116, Step: 6, State: [4 1], Action: 3, Reward: -0.1
Episode: 116, Step: 7, State: [4 2], Action: 3, Reward: -0.1
Episode: 116, Step: 8, State: [4 3], Action: 3, Reward: 1
Episode 116 finished with total reward: 0.30000000000000004
Episode: 117, Step: 1, State: [0 0], Action: 1, Reward: -0.1
Episode: 117, Step: 2, State: [1 0], Action: 1, Reward: -0.1
Episode: 117, Step: 3, State: [2 0], Action: 1, Reward: -0.1
Episode: 117, Step: 4, State: [3 0], Action: 3, Reward: -0.1
Episode: 117, Step: 5, State: [3 1], Action: 1, Reward: -0.1
Episode: 117, Step: 6, State: [4 1], Action: 3, Reward: -0.1
Episode: 117, Step: 7, State: [4 2], Action: 3, Reward: -0.1
Episode: 117, Step: 8, State: [4 3], Action: 3, Reward: 1
Episode 117 finished with total reward: 0.30000000000000004
Episode: 118, Step: 1, State: [0 0], Action: 1, Reward: -0.1
Episode: 118, Step: 2, State: [1

Episode: 277, Step: 1, State: [0 0], Action: 1, Reward: -0.1
Episode: 277, Step: 2, State: [1 0], Action: 1, Reward: -0.1
Episode: 277, Step: 3, State: [2 0], Action: 1, Reward: -0.1
Episode: 277, Step: 4, State: [3 0], Action: 3, Reward: -0.1
Episode: 277, Step: 5, State: [3 1], Action: 1, Reward: -0.1
Episode: 277, Step: 6, State: [4 1], Action: 3, Reward: -0.1
Episode: 277, Step: 7, State: [4 2], Action: 3, Reward: -0.1
Episode: 277, Step: 8, State: [4 3], Action: 3, Reward: 1
Episode 277 finished with total reward: 0.30000000000000004
Episode: 278, Step: 1, State: [0 0], Action: 1, Reward: -0.1
Episode: 278, Step: 2, State: [1 0], Action: 1, Reward: -0.1
Episode: 278, Step: 3, State: [2 0], Action: 1, Reward: -0.1
Episode: 278, Step: 4, State: [3 0], Action: 3, Reward: -0.1
Episode: 278, Step: 5, State: [3 1], Action: 1, Reward: -0.1
Episode: 278, Step: 6, State: [4 1], Action: 3, Reward: -0.1
Episode: 278, Step: 7, State: [4 2], Action: 3, Reward: -0.1
Episode: 278, Step: 8, State

Episode: 441, Step: 2, State: [1 0], Action: 1, Reward: -0.1
Episode: 441, Step: 3, State: [2 0], Action: 1, Reward: -0.1
Episode: 441, Step: 4, State: [3 0], Action: 3, Reward: -0.1
Episode: 441, Step: 5, State: [3 1], Action: 1, Reward: -0.1
Episode: 441, Step: 6, State: [4 1], Action: 3, Reward: -0.1
Episode: 441, Step: 7, State: [4 2], Action: 3, Reward: -0.1
Episode: 441, Step: 8, State: [4 3], Action: 3, Reward: 1
Episode 441 finished with total reward: 0.30000000000000004
Episode: 442, Step: 1, State: [0 0], Action: 1, Reward: -0.1
Episode: 442, Step: 2, State: [1 0], Action: 1, Reward: -0.1
Episode: 442, Step: 3, State: [2 0], Action: 1, Reward: -0.1
Episode: 442, Step: 4, State: [3 0], Action: 3, Reward: -0.1
Episode: 442, Step: 5, State: [3 1], Action: 1, Reward: -0.1
Episode: 442, Step: 6, State: [4 1], Action: 3, Reward: -0.1
Episode: 442, Step: 7, State: [4 2], Action: 3, Reward: -0.1
Episode: 442, Step: 8, State: [4 3], Action: 3, Reward: 1
Episode 442 finished with total