ML Practical 6

Name - Mansi Mohan Baviskar

Roll No. - 42505

In [1]:
import numpy as np
import random

# Define the Maze
class Maze:
    """Grid-world maze environment holding a single agent.

    Cell values in ``grid``: 0 is a free cell and 1 is a wall. The default
    layout also marks the goal cell with 2, but only ``goal_state`` is
    consulted when deciding whether an episode has finished.

    States are ``(row, col)`` tuples. Actions are integers:
    0 = up, 1 = down, 2 = left, 3 = right.
    """

    # (row delta, col delta) applied for each action id.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    def __init__(self, grid=None, start_state=(0, 0), goal_state=None):
        """Create the maze.

        Args:
            grid: Optional 2-D array-like of cell values. When omitted, the
                built-in 5x5 layout is used.
            start_state: ``(row, col)`` the agent is placed on by ``reset``.
            goal_state: ``(row, col)`` that ends an episode. Defaults to the
                bottom-right cell of the grid.

        Raises:
            ValueError: If ``grid`` is not two-dimensional.
        """
        if grid is None:
            grid = [[0, 0, 0, 0, 0],
                    [0, 1, 1, 1, 0],
                    [0, 1, 0, 0, 0],
                    [0, 1, 1, 1, 0],
                    [0, 0, 0, 0, 2]]
        self.grid = np.array(grid)
        if self.grid.ndim != 2:
            raise ValueError("grid must be two-dimensional")

        n_rows, n_cols = self.grid.shape
        self.start_state = tuple(start_state)  # Starting position
        if goal_state is None:
            goal_state = (n_rows - 1, n_cols - 1)
        self.goal_state = tuple(goal_state)    # Goal position
        self.state = self.start_state

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = self.start_state
        return self.state

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done)``.

        Moves off the edge of the grid are clamped to the border. Moving
        into a wall leaves the agent where it is and costs -1. Reaching the
        goal yields +10 and ``done=True``; any other step costs -0.1.
        An unrecognised action id leaves the position unchanged.
        """
        n_rows, n_cols = self.grid.shape
        d_row, d_col = self._MOVES.get(action, (0, 0))
        x, y = self.state

        # Clamp against the actual grid size rather than a fixed constant.
        x = min(max(x + d_row, 0), n_rows - 1)
        y = min(max(y + d_col, 0), n_cols - 1)

        if self.grid[x, y] == 1:  # If hitting a wall
            return self.state, -1, False  # Return current state, penalty, and not done

        self.state = (x, y)

        if self.state == self.goal_state:  # Reached the goal
            return self.state, 10, True  # Return goal state, reward, and done
        return self.state, -0.1, False  # Penalty for each step taken

    def render(self):
        """Print the grid with the agent's current cell marked as 3."""
        maze_copy = self.grid.copy()  # copy so the stored layout is not modified
        x, y = self.state
        maze_copy[x, y] = 3  # Mark the agent's position
        print(maze_copy)


In [2]:
class QLearningAgent:
    """Tabular Q-learning agent using epsilon-greedy exploration.

    The agent keeps one Q-value per (row, col, action) and trains against an
    environment object exposing ``reset() -> state`` and
    ``step(action) -> (next_state, reward, done)``, where states are
    ``(row, col)`` pairs and actions are the integers 0-3.
    """

    def __init__(self, maze):
        """Set up the Q-table and hyper-parameters for ``maze``.

        The Q-table is sized from ``maze.grid`` when that attribute exists;
        otherwise a 5x5 grid is assumed.
        """
        self.maze = maze
        grid = getattr(maze, "grid", None)
        n_rows, n_cols = (5, 5) if grid is None else np.shape(grid)
        self.q_table = np.zeros((n_rows, n_cols, 4))  # one row of 4 action values per cell
        self.learning_rate = 0.1
        self.discount_factor = 0.9
        self.epsilon = 1.0  # Exploration rate
        self.epsilon_decay = 0.99
        self.min_epsilon = 0.1
        self.num_episodes = 1000

    def choose_action(self, state):
        """Return an action id for ``state`` using the epsilon-greedy rule."""
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, 3)  # Explore: choose random action
        # Exploit: choose best action. Cast so both branches return a plain int.
        return int(np.argmax(self.q_table[state[0], state[1]]))

    def learn(self):
        """Run ``num_episodes`` training episodes, updating the Q-table in place."""
        for _ in range(self.num_episodes):
            state = self.maze.reset()
            done = False

            while not done:
                action = self.choose_action(state)
                next_state, reward, done = self.maze.step(action)

                # Q-learning update. A terminal transition has no successor
                # value, so its target is just the immediate reward.
                if done:
                    td_target = reward
                else:
                    best_next_value = np.max(self.q_table[next_state[0], next_state[1]])
                    td_target = reward + self.discount_factor * best_next_value
                td_delta = td_target - self.q_table[state[0], state[1], action]
                self.q_table[state[0], state[1], action] += self.learning_rate * td_delta

                state = next_state

            # Decay epsilon, but never let the decay step cross below the floor.
            if self.epsilon > self.min_epsilon:
                self.epsilon = max(self.min_epsilon, self.epsilon * self.epsilon_decay)

    def print_q_table(self):
        """Print the full Q-table."""
        print(self.q_table)


In [3]:
if __name__ == "__main__":
    # Build the maze environment, train a Q-learning agent on it, then
    # print the learned Q-table.
    maze = Maze()
    agent = QLearningAgent(maze)
    agent.learn()  # runs agent.num_episodes training episodes
    agent.print_q_table()


[[[ 3.52245392e+00  4.26126590e+00  3.59136049e+00  8.16229267e-01]
  [-2.37426713e-01 -1.03542103e+00  7.35023142e-01  1.61444984e+00]
  [-8.78206981e-02 -7.22994850e-01 -1.43662834e-01  2.78449792e+00]
  [ 5.69047507e-02 -7.31362211e-01  5.58120291e-02  4.20725462e+00]
  [ 5.23715267e-01  5.70416737e+00  6.82754864e-01  1.41464406e+00]]

 [[ 3.26639895e+00  4.84585100e+00  4.09408210e+00  3.16000727e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [ 1.02001126e+00  7.30873569e+00  7.77316089e-01  2.35788691e+00]]

 [[ 3.93498735e+00  5.49539000e+00  4.73585078e+00  3.82113893e+00]
  [ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00]
  [-4.86843102e-01 -4.84434437e-01 -3.72241337e-01  6.24460058e-01]
  [ 9.67697747e-02  5.26060649e-02  7.52709672e-03  3.29947427e+00]
  [ 1.26867088e+00  8.68263261e+00  1.114212