In [17]:
import numpy as np
import time

class GridAgent:
    """Tabular Q-learning agent that walks a square grid toward a fixed target.

    The grid has ``grid_size`` rows and columns. The target is the bottom-right
    cell ``(grid_size - 1, grid_size - 1)``. Each step yields a reward of ``1``
    when the agent lands on the target and ``-1`` otherwise.

    Moves are encoded as integers: 0 = up, 1 = down, 2 = left, 3 = right.

    Args:
        grid_size: Number of rows/columns of the grid (must be >= 1).
        learning_rate: Step size applied to each value-table update.
        discount: Weight given to the best value of the next cell.
        explore_decay: Factor ``explore_prob`` is multiplied by after every
            training episode.

    Raises:
        ValueError: If ``grid_size`` is smaller than 1.
    """

    # Number of discrete moves (up, down, left, right).
    NUM_ACTIONS = 4

    def __init__(self, grid_size=10, learning_rate=0.1, discount=0.9,
                 explore_decay=0.99):
        if grid_size < 1:
            raise ValueError(f"grid_size must be at least 1, got {grid_size}")
        self.grid_size = grid_size
        self.learning_rate = learning_rate
        self.discount = discount
        self.explore_decay = explore_decay
        # Action-value table indexed as [row, col, move].
        self.value_table = np.zeros((grid_size, grid_size, self.NUM_ACTIONS))
        self.explore_prob = 1.0  # Initial exploration probability
        self.target = (grid_size - 1, grid_size - 1)  # Target position
        # Give the agent a valid position immediately so select_move() and
        # take_action() work even before initialize_position() is called.
        self.position = self.initialize_position()

    def initialize_position(self):
        """Place the agent on its start cell and return that cell.

        The start cell is ``(1, 1)``; on a 1x1 grid it is clamped to
        ``(0, 0)`` so the position always lies inside the grid.
        """
        start = min(1, self.grid_size - 1)
        self.position = (start, start)
        return self.position

    def take_action(self, move):
        """Apply ``move`` to the current position.

        Moves that would leave the grid keep the agent on the border cell.
        A ``move`` outside 0..3 leaves the position unchanged (it is still
        rewarded like any other step).

        Returns:
            A tuple ``(position, reward, is_done)`` where ``reward`` is ``1``
            on the target and ``-1`` elsewhere, and ``is_done`` tells whether
            the target has been reached.
        """
        last = self.grid_size - 1
        row, col = self.position
        if move == 0:  # Move up
            row = max(0, row - 1)
        elif move == 1:  # Move down
            row = min(last, row + 1)
        elif move == 2:  # Move left
            col = max(0, col - 1)
        elif move == 3:  # Move right
            col = min(last, col + 1)

        self.position = (row, col)
        is_done = self.position == self.target
        reward = 1 if is_done else -1
        return self.position, reward, is_done

    def select_move(self):
        """Pick a move: random with probability ``explore_prob``, else greedy.

        The greedy choice is the arg-max of the value table at the current
        position; ties resolve to the lowest move index.
        """
        if np.random.rand() < self.explore_prob:
            return int(np.random.randint(self.NUM_ACTIONS))  # Explore randomly
        return int(np.argmax(self.value_table[self.position]))  # Exploit

    def learn(self, num_episodes=50000):
        """Train the agent for ``num_episodes`` episodes.

        Every episode restarts from the start cell and runs until the target
        is reached. After each step the value of the (cell, move) pair just
        taken is nudged toward ``reward + discount * best_next_value``. The
        exploration probability is multiplied by ``explore_decay`` at the end
        of every episode.
        """
        for _ in range(num_episodes):
            current_pos = self.initialize_position()
            finished = False
            while not finished:
                move = self.select_move()
                next_pos, reward, finished = self.take_action(move)
                # Nothing follows the terminal cell, so its future value is 0
                # rather than whatever happens to be stored in its table row.
                if finished:
                    best_future_val = 0.0
                else:
                    best_future_val = np.max(self.value_table[next_pos])
                # Basic indexing with a (row, col) tuple yields a view, so the
                # in-place update below writes straight into the table.
                cell_values = self.value_table[current_pos]
                cell_values[move] += self.learning_rate * (
                    reward + self.discount * best_future_val - cell_values[move]
                )
                current_pos = next_pos
            # Reduce exploration probability gradually
            self.explore_prob *= self.explore_decay

    def showcase(self, delay=2):
        """Print the greedy walk from the start cell to the target.

        Args:
            delay: Seconds to pause after every step; ``0`` disables pausing.

        Raises:
            RuntimeError: If the greedy policy comes back to a cell it has
                already visited. The value table is not modified here, so a
                repeated cell means the walk would cycle forever.
        """
        current_pos = self.initialize_position()
        visited = set()
        reached_target = False
        while not reached_target:
            if current_pos in visited:
                raise RuntimeError(
                    f"Greedy policy is stuck in a loop at {current_pos}; "
                    "the agent needs (more) training."
                )
            visited.add(current_pos)
            print("Current position:", current_pos)
            move = int(np.argmax(self.value_table[current_pos]))  # Always exploit
            print("Selected move:", move)
            current_pos, _, reached_target = self.take_action(move)
            if delay > 0:
                time.sleep(delay)
        print("Target reached at position:", current_pos)


# Create, train and demonstrate the agent.
# The guard keeps an `import` of this file from triggering the full training
# run and the demo; run as a script or notebook cell, it behaves as before.
if __name__ == "__main__":
    navigator = GridAgent()

    print("Training the agent on the grid...")
    navigator.learn()
    print("Training completed.")

    # Replay the learned greedy policy from the start cell to the target.
    print("Starting demonstration...")
    navigator.showcase()


Training the agent on the grid...
Training completed.
Starting demonstration...
Current position: (1, 1)
Selected move: 1
Current position: (2, 1)
Selected move: 1
Current position: (3, 1)
Selected move: 1
Current position: (4, 1)
Selected move: 1
Current position: (5, 1)
Selected move: 1
Current position: (6, 1)
Selected move: 1
Current position: (7, 1)
Selected move: 1
Current position: (8, 1)
Selected move: 3
Current position: (8, 2)
Selected move: 3
Current position: (8, 3)
Selected move: 3
Current position: (8, 4)
Selected move: 3
Current position: (8, 5)
Selected move: 3
Current position: (8, 6)
Selected move: 1
Current position: (9, 6)
Selected move: 3
Current position: (9, 7)
Selected move: 3
Current position: (9, 8)
Selected move: 3
Target reached at position: (9, 9)
